
Commit 9ada9fd5 authored by Linus Torvalds
Pull EDAC fixes from Borislav Petkov:

 - EDAC core error path fix, from Denis Kirjanov.

 - Generalization of AMD MCE bank names and some minor error reporting
   improvements.

 - EDAC core cleanups and simplifications, from Wei Yongjun.

 - amd64_edac fixes for sysfs-reported values, from Josh Hunt.

 - some heavy amd64_edac error reporting path shaving, leading to the
   removal of a bunch of code.

 - amd64_edac error injection method improvements.

 - EDAC core cleanups and fixes.

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp: (24 commits)
  EDAC, pci_sysfs: Use for_each_pci_dev to simplify the code
  EDAC: Handle error path in edac_mc_sysfs_init() properly
  MCE, AMD: Dump error status
  MCE, AMD: Report decoded error type first
  MCE, AMD: Dump CPU f/m/s triple with the error
  MCE, AMD: Remove functional unit references
  EDAC: Convert to use simple_open()
  EDAC, Calxeda highbank: Convert to use simple_open()
  EDAC: Fix mc size reported in sysfs
  EDAC: Fix csrow size reported in sysfs
  EDAC: Pass mci parent
  EDAC: Add memory controller flags
  amd64_edac: Fix csrows size and pages computation
  amd64_edac: Use DBAM_DIMM macro
  amd64_edac: Fix K8 chip select reporting
  amd64_edac: Reorganize error reporting path
  amd64_edac: Do not check whether error address is valid
  amd64_edac: Improve error injection
  amd64_edac: Cleanup error injection code
  amd64_edac: Small fixlets and cleanups
  ...
parents c45564e9 3bfe5aae
drivers/edac/Kconfig  +4 −4
@@ -42,10 +42,10 @@ config EDAC_LEGACY_SYSFS
 config EDAC_DEBUG
 	bool "Debugging"
 	help
-	  This turns on debugging information for the entire EDAC
-	  sub-system. You can insert module with "debug_level=x", current
-	  there're four debug levels (x=0,1,2,3 from low to high).
-	  Usually you should select 'N'.
+	  This turns on debugging information for the entire EDAC subsystem.
+	  You do so by inserting edac_module with "edac_debug_level=x." Valid
+	  levels are 0-4 (from low to high) and by default it is set to 2.
+	  Usually you should select 'N' here.

 config EDAC_DECODE_MCE
 	tristate "Decode MCEs in human-readable form (only on AMD for now)"
drivers/edac/amd64_edac.c  +113 −184
@@ -60,7 +60,7 @@ struct scrubrate {
 	{ 0x00, 0UL},        /* scrubbing off */
 };

-static int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset,
+int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset,
 			       u32 *val, const char *func)
 {
 	int err = 0;
@@ -423,7 +423,6 @@ int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
 			     u64 *hole_offset, u64 *hole_size)
 {
 	struct amd64_pvt *pvt = mci->pvt_info;
-	u64 base;

 	/* only revE and later have the DRAM Hole Address Register */
 	if (boot_cpu_data.x86 == 0xf && pvt->ext_model < K8_REV_E) {
@@ -462,10 +461,8 @@ int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
 	 * addresses in the hole so that they start at 0x100000000.
 	 */

-	base = dhar_base(pvt);
-
-	*hole_base = base;
-	*hole_size = (0x1ull << 32) - base;
+	*hole_base = dhar_base(pvt);
+	*hole_size = (1ULL << 32) - *hole_base;

 	if (boot_cpu_data.x86 > 0xf)
 		*hole_offset = f10_dhar_offset(pvt);
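
A worked example of the simplified hole computation (values illustrative, not
from the commit): if dhar_base(pvt) returns 0xC0000000 (3 GB), then

	*hole_base = 0xC0000000;
	*hole_size = (1ULL << 32) - 0xC0000000;	/* 0x40000000, i.e. 1 GB */

so the hole covers the last gigabyte below the 4 GB boundary — exactly the
region the comment above says gets remapped to start at 0x100000000.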
@@ -513,15 +510,15 @@ static u64 sys_addr_to_dram_addr(struct mem_ctl_info *mci, u64 sys_addr)
 {
 	struct amd64_pvt *pvt = mci->pvt_info;
 	u64 dram_base, hole_base, hole_offset, hole_size, dram_addr;
-	int ret = 0;
+	int ret;

 	dram_base = get_dram_base(pvt, pvt->mc_node_id);

 	ret = amd64_get_dram_hole_info(mci, &hole_base, &hole_offset,
 				      &hole_size);
 	if (!ret) {
-		if ((sys_addr >= (1ull << 32)) &&
-		    (sys_addr < ((1ull << 32) + hole_size))) {
+		if ((sys_addr >= (1ULL << 32)) &&
+		    (sys_addr < ((1ULL << 32) + hole_size))) {
 			/* use DHAR to translate SysAddr to DramAddr */
 			dram_addr = sys_addr - hole_offset;
@@ -712,10 +709,10 @@ static inline u64 input_addr_to_sys_addr(struct mem_ctl_info *mci,

 /* Map the Error address to a PAGE and PAGE OFFSET. */
 static inline void error_address_to_page_and_offset(u64 error_address,
-						    u32 *page, u32 *offset)
+						    struct err_info *err)
 {
-	*page = (u32) (error_address >> PAGE_SHIFT);
-	*offset = ((u32) error_address) & ~PAGE_MASK;
+	err->page = (u32) (error_address >> PAGE_SHIFT);
+	err->offset = ((u32) error_address) & ~PAGE_MASK;
 }

 /*
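The helper above is plain shift-and-mask. With 4 KiB pages (PAGE_SHIFT == 12,
~PAGE_MASK == 0xfff) and an illustrative error address:

	u64 error_address = 0x12345678ULL;		/* example value only */
	u32 page   = (u32)(error_address >> 12);	/* 0x12345 */
	u32 offset = (u32)error_address & 0xfff;	/* 0x678 */
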
@@ -1026,59 +1023,44 @@ static void read_dram_base_limit_regs(struct amd64_pvt *pvt, unsigned range)
 }

 static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
-				    u16 syndrome)
+				    struct err_info *err)
 {
-	struct mem_ctl_info *src_mci;
 	struct amd64_pvt *pvt = mci->pvt_info;
-	int channel, csrow;
-	u32 page, offset;

-	error_address_to_page_and_offset(sys_addr, &page, &offset);
+	error_address_to_page_and_offset(sys_addr, err);

 	/*
 	 * Find out which node the error address belongs to. This may be
 	 * different from the node that detected the error.
 	 */
-	src_mci = find_mc_by_sys_addr(mci, sys_addr);
-	if (!src_mci) {
+	err->src_mci = find_mc_by_sys_addr(mci, sys_addr);
+	if (!err->src_mci) {
 		amd64_mc_err(mci, "failed to map error addr 0x%lx to a node\n",
 			     (unsigned long)sys_addr);
-		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-				     page, offset, syndrome,
-				     -1, -1, -1,
-				     "failed to map error addr to a node",
-				     "");
+		err->err_code = ERR_NODE;
 		return;
 	}

 	/* Now map the sys_addr to a CSROW */
-	csrow = sys_addr_to_csrow(src_mci, sys_addr);
-	if (csrow < 0) {
-		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-				     page, offset, syndrome,
-				     -1, -1, -1,
-				     "failed to map error addr to a csrow",
-				     "");
+	err->csrow = sys_addr_to_csrow(err->src_mci, sys_addr);
+	if (err->csrow < 0) {
+		err->err_code = ERR_CSROW;
 		return;
 	}

 	/* CHIPKILL enabled */
 	if (pvt->nbcfg & NBCFG_CHIPKILL) {
-		channel = get_channel_from_ecc_syndrome(mci, syndrome);
-		if (channel < 0) {
+		err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome);
+		if (err->channel < 0) {
 			/*
 			 * Syndrome didn't map, so we don't know which of the
 			 * 2 DIMMs is in error. So we need to ID 'both' of them
 			 * as suspect.
 			 */
-			amd64_mc_warn(src_mci, "unknown syndrome 0x%04x - "
+			amd64_mc_warn(err->src_mci, "unknown syndrome 0x%04x - "
 				      "possible error reporting race\n",
-				      syndrome);
-			edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-					     page, offset, syndrome,
-					     csrow, -1, -1,
-					     "unknown syndrome - possible error reporting race",
-					     "");
+				      err->syndrome);
+			err->err_code = ERR_CHANNEL;
 			return;
 		}
 	} else {
@@ -1090,13 +1072,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
 		 * was obtained from email communication with someone at AMD.
 		 * (Wish the email was placed in this comment - norsk)
 		 */
-		channel = ((sys_addr & BIT(3)) != 0);
+		err->channel = ((sys_addr & BIT(3)) != 0);
 	}
-
-	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, src_mci, 1,
-			     page, offset, syndrome,
-			     csrow, channel, -1,
-			     "", "");
 }

 static int ddr2_cs_size(unsigned i, bool dct_width)
@@ -1482,7 +1459,7 @@ static u64 f1x_swap_interleaved_region(struct amd64_pvt *pvt, u64 sys_addr)

 /* For a given @dram_range, check if @sys_addr falls within it. */
 static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range,
-				  u64 sys_addr, int *nid, int *chan_sel)
+				  u64 sys_addr, int *chan_sel)
 {
 	int cs_found = -EINVAL;
 	u64 chan_addr;
@@ -1555,15 +1532,14 @@ static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range,

 	cs_found = f1x_lookup_addr_in_dct(chan_addr, node_id, channel);

-	if (cs_found >= 0) {
-		*nid = node_id;
+	if (cs_found >= 0)
 		*chan_sel = channel;
-	}

 	return cs_found;
 }

 static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
-				       int *node, int *chan_sel)
+				       int *chan_sel)
 {
 	int cs_found = -EINVAL;
 	unsigned range;
@@ -1577,8 +1553,7 @@ static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
 		    (get_dram_limit(pvt, range) >= sys_addr)) {

 			cs_found = f1x_match_to_this_node(pvt, range,
-							  sys_addr, node,
-							  chan_sel);
+							  sys_addr, chan_sel);
 			if (cs_found >= 0)
 				break;
 		}
@@ -1594,22 +1569,15 @@ static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
  * (MCX_ADDR).
  */
 static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
-				     u16 syndrome)
+				     struct err_info *err)
 {
 	struct amd64_pvt *pvt = mci->pvt_info;
-	u32 page, offset;
-	int nid, csrow, chan = 0;

-	error_address_to_page_and_offset(sys_addr, &page, &offset);
-
-	csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &nid, &chan);
+	error_address_to_page_and_offset(sys_addr, err);

-	if (csrow < 0) {
-		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-				     page, offset, syndrome,
-				     -1, -1, -1,
-				     "failed to map error addr to a csrow",
-				     "");
+	err->csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &err->channel);
+	if (err->csrow < 0) {
+		err->err_code = ERR_CSROW;
 		return;
 	}

@@ -1619,12 +1587,7 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
 	 * this point.
 	 */
 	if (dct_ganging_enabled(pvt))
-		chan = get_channel_from_ecc_syndrome(mci, syndrome);
-
-	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-			     page, offset, syndrome,
-			     csrow, chan, -1,
-			     "", "");
+		err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome);
 }

 /*
@@ -1633,14 +1596,11 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
  */
 static void amd64_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl)
 {
-	int dimm, size0, size1, factor = 0;
+	int dimm, size0, size1;
 	u32 *dcsb = ctrl ? pvt->csels[1].csbases : pvt->csels[0].csbases;
 	u32 dbam  = ctrl ? pvt->dbam1 : pvt->dbam0;

 	if (boot_cpu_data.x86 == 0xf) {
-		if (pvt->dclr0 & WIDTH_128)
-			factor = 1;
-
 		/* K8 families < revF not supported yet */
 	       if (pvt->ext_model < K8_REV_F)
 			return;
@@ -1671,8 +1631,8 @@ static void amd64_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl)
 						     DBAM_DIMM(dimm, dbam));

 		amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n",
-				dimm * 2,     size0 << factor,
-				dimm * 2 + 1, size1 << factor);
+				dimm * 2,     size0,
+				dimm * 2 + 1, size1);
 	}
 }

@@ -1893,101 +1853,56 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome)
 	return map_err_sym_to_channel(err_sym, pvt->ecc_sym_sz);
 }

-/*
- * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
- * ADDRESS and process.
- */
-static void amd64_handle_ce(struct mem_ctl_info *mci, struct mce *m)
-{
-	struct amd64_pvt *pvt = mci->pvt_info;
-	u64 sys_addr;
-	u16 syndrome;
-
-	/* Ensure that the Error Address is VALID */
-	if (!(m->status & MCI_STATUS_ADDRV)) {
-		amd64_mc_err(mci, "HW has no ERROR_ADDRESS available\n");
-		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-				     0, 0, 0,
-				     -1, -1, -1,
-				     "HW has no ERROR_ADDRESS available",
-				     "");
-		return;
-	}
-
-	sys_addr = get_error_address(m);
-	syndrome = extract_syndrome(m->status);
-
-	amd64_mc_err(mci, "CE ERROR_ADDRESS= 0x%llx\n", sys_addr);
-
-	pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, syndrome);
-}
-
-/* Handle any Un-correctable Errors (UEs) */
-static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m)
-{
-	struct mem_ctl_info *log_mci, *src_mci = NULL;
-	int csrow;
-	u64 sys_addr;
-	u32 page, offset;
-
-	log_mci = mci;
-
-	if (!(m->status & MCI_STATUS_ADDRV)) {
-		amd64_mc_err(mci, "HW has no ERROR_ADDRESS available\n");
-		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
-				     0, 0, 0,
-				     -1, -1, -1,
-				     "HW has no ERROR_ADDRESS available",
-				     "");
-		return;
-	}
-
-	sys_addr = get_error_address(m);
-	error_address_to_page_and_offset(sys_addr, &page, &offset);
-
-	/*
-	 * Find out which node the error address belongs to. This may be
-	 * different from the node that detected the error.
-	 */
-	src_mci = find_mc_by_sys_addr(mci, sys_addr);
-	if (!src_mci) {
-		amd64_mc_err(mci, "ERROR ADDRESS (0x%lx) NOT mapped to a MC\n",
-				  (unsigned long)sys_addr);
-		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
-				     page, offset, 0,
-				     -1, -1, -1,
-				     "ERROR ADDRESS NOT mapped to a MC",
-				     "");
-		return;
-	}
-
-	log_mci = src_mci;
-
-	csrow = sys_addr_to_csrow(log_mci, sys_addr);
-	if (csrow < 0) {
-		amd64_mc_err(mci, "ERROR_ADDRESS (0x%lx) NOT mapped to CS\n",
-				  (unsigned long)sys_addr);
-		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
-				     page, offset, 0,
-				     -1, -1, -1,
-				     "ERROR ADDRESS NOT mapped to CS",
-				     "");
-	} else {
-		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
-				     page, offset, 0,
-				     csrow, -1, -1,
-				     "", "");
-	}
-}
+static void __log_bus_error(struct mem_ctl_info *mci, struct err_info *err,
+			    u8 ecc_type)
+{
+	enum hw_event_mc_err_type err_type;
+	const char *string;
+
+	if (ecc_type == 2)
+		err_type = HW_EVENT_ERR_CORRECTED;
+	else if (ecc_type == 1)
+		err_type = HW_EVENT_ERR_UNCORRECTED;
+	else {
+		WARN(1, "Something is rotten in the state of Denmark.\n");
+		return;
+	}
+
+	switch (err->err_code) {
+	case DECODE_OK:
+		string = "";
+		break;
+	case ERR_NODE:
+		string = "Failed to map error addr to a node";
+		break;
+	case ERR_CSROW:
+		string = "Failed to map error addr to a csrow";
+		break;
+	case ERR_CHANNEL:
+		string = "unknown syndrome - possible error reporting race";
+		break;
+	default:
+		string = "WTF error";
+		break;
+	}
+
+	edac_mc_handle_error(err_type, mci, 1,
+			     err->page, err->offset, err->syndrome,
+			     err->csrow, err->channel, -1,
+			     string, "");
+}

 static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
 					    struct mce *m)
 {
-	u16 ec = EC(m->status);
-	u8 xec = XEC(m->status, 0x1f);
 	struct amd64_pvt *pvt = mci->pvt_info;
 	u8 ecc_type = (m->status >> 45) & 0x3;
+	u8 xec = XEC(m->status, 0x1f);
+	u16 ec = EC(m->status);
+	u64 sys_addr;
+	struct err_info err;

-	/* Bail early out if this was an 'observed' error */
+	/* Bail out early if this was an 'observed' error */
 	if (PP(ec) == NBSL_PP_OBS)
 		return;

@@ -1995,10 +1910,16 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
 	if (xec && xec != F10_NBSL_EXT_ERR_ECC)
 		return;

+	memset(&err, 0, sizeof(err));
+
+	sys_addr = get_error_address(m);
+
 	if (ecc_type == 2)
-		amd64_handle_ce(mci, m);
-	else if (ecc_type == 1)
-		amd64_handle_ue(mci, m);
+		err.syndrome = extract_syndrome(m->status);
+
+	pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, &err);
+
+	__log_bus_error(mci, &err, ecc_type);
 }

 void amd64_decode_bus_error(int node_id, struct mce *m)
@@ -2166,6 +2087,7 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr)
 	u32 cs_mode, nr_pages;
 	u32 dbam = dct ? pvt->dbam1 : pvt->dbam0;

+
 	/*
 	 * The math on this doesn't look right on the surface because x/2*4 can
 	 * be simplified to x*2 but this expression makes use of the fact that
@@ -2173,13 +2095,13 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr)
 	 * number of bits to shift the DBAM register to extract the proper CSROW
 	 * field.
 	 */
-	cs_mode =  (dbam >> ((csrow_nr / 2) * 4)) & 0xF;
+	cs_mode = DBAM_DIMM(csrow_nr / 2, dbam);

 	nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode) << (20 - PAGE_SHIFT);

-	edac_dbg(0, "  (csrow=%d) DBAM map index= %d\n", csrow_nr, cs_mode);
-	edac_dbg(0, "    nr_pages/channel= %u  channel-count = %d\n",
-		 nr_pages, pvt->channel_count);
+	edac_dbg(0, "csrow: %d, channel: %d, DBAM idx: %d\n",
+		    csrow_nr, dct,  cs_mode);
+	edac_dbg(0, "nr_pages/channel: %u\n", nr_pages);

 	return nr_pages;
 }
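
The "<< (20 - PAGE_SHIFT)" above converts the megabyte count returned by
dbam_to_cs() into pages. With illustrative numbers — a 2048 MB chip select
and 4 KiB pages (PAGE_SHIFT == 12):

	nr_pages = 2048 << (20 - 12);	/* 2048 << 8 = 524288 pages of 4 KiB */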
@@ -2190,15 +2112,14 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr)
  */
 static int init_csrows(struct mem_ctl_info *mci)
 {
-	struct amd64_pvt *pvt = mci->pvt_info;
 	struct csrow_info *csrow;
 	struct dimm_info *dimm;
+	struct amd64_pvt *pvt = mci->pvt_info;
 	u64 base, mask;
-	u32 val;
-	int i, j, empty = 1;
-	enum mem_type mtype;
 	enum edac_type edac_mode;
+	enum mem_type mtype;
+	int i, j, empty = 1;
+	int nr_pages = 0;
+	u32 val;

 	amd64_read_pci_cfg(pvt->F3, NBCFG, &val);

@@ -2208,29 +2129,35 @@ static int init_csrows(struct mem_ctl_info *mci)
 		 pvt->mc_node_id, val,
 		 !!(val & NBCFG_CHIPKILL), !!(val & NBCFG_ECC_ENABLE));

+	/*
+	 * We iterate over DCT0 here but we look at DCT1 in parallel, if needed.
+	 */
 	for_each_chip_select(i, 0, pvt) {
-		csrow = mci->csrows[i];
+		bool row_dct0 = !!csrow_enabled(i, 0, pvt);
+		bool row_dct1 = false;
+
+		if (boot_cpu_data.x86 != 0xf)
+			row_dct1 = !!csrow_enabled(i, 1, pvt);

-		if (!csrow_enabled(i, 0, pvt) && !csrow_enabled(i, 1, pvt)) {
-			edac_dbg(1, "----CSROW %d VALID for MC node %d\n",
-				 i, pvt->mc_node_id);
+		if (!row_dct0 && !row_dct1)
 			continue;
-		}

+		csrow = mci->csrows[i];
 		empty = 0;
-		if (csrow_enabled(i, 0, pvt))
+
+		edac_dbg(1, "MC node: %d, csrow: %d\n",
+			    pvt->mc_node_id, i);
+
+		if (row_dct0)
 			nr_pages = amd64_csrow_nr_pages(pvt, 0, i);
-		if (csrow_enabled(i, 1, pvt))
-			nr_pages += amd64_csrow_nr_pages(pvt, 1, i);

-		get_cs_base_and_mask(pvt, i, 0, &base, &mask);
-		/* 8 bytes of resolution */
+		/* K8 has only one DCT */
+		if (boot_cpu_data.x86 != 0xf && row_dct1)
+			nr_pages += amd64_csrow_nr_pages(pvt, 1, i);

 		mtype = amd64_determine_memory_type(pvt, i);

-		edac_dbg(1, "  for MC node %d csrow %d:\n", pvt->mc_node_id, i);
-		edac_dbg(1, "    nr_pages: %u\n",
-			 nr_pages * pvt->channel_count);
+		edac_dbg(1, "Total csrow%d pages: %u\n", i, nr_pages);

 		/*
 		 * determine whether CHIPKILL or JUST ECC or NO ECC is operating
@@ -2247,6 +2174,7 @@ static int init_csrows(struct mem_ctl_info *mci)
 			dimm->edac_mode = edac_mode;
 			dimm->nr_pages = nr_pages;
 		}
+		csrow->nr_pages = nr_pages;
 	}

 	return empty;
@@ -2591,6 +2519,7 @@ static int amd64_init_one_instance(struct pci_dev *F2)

 	mci->pvt_info = pvt;
 	mci->pdev = &pvt->F2->dev;
+	mci->csbased = 1;

 	setup_mci_misc_attrs(mci, fam_type);

drivers/edac/amd64_edac.h  +46 −13
@@ -219,7 +219,7 @@
 #define DBAM1				0x180

 /* Extract the DIMM 'type' on the i'th DIMM from the DBAM reg value passed */
-#define DBAM_DIMM(i, reg)		((((reg) >> (4*i))) & 0xF)
+#define DBAM_DIMM(i, reg)		((((reg) >> (4*(i)))) & 0xF)

 #define DBAM_MAX_VALUE			11
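
The added parentheses around i matter because the macro is now also called
with an expression, as in amd64_csrow_nr_pages() above. Hand-expanding
DBAM_DIMM(csrow_nr / 2, dbam) shows the bug the old form would have had:

	/* old, 4*i:   ((dbam >> (4*csrow_nr / 2)) & 0xF)   -- shift by 2*csrow_nr
	 * new, 4*(i): ((dbam >> (4*(csrow_nr / 2))) & 0xF) -- shift by 4*(csrow_nr/2)
	 * e.g. csrow_nr == 3: old would shift by 6 bits, intended shift is 4.
	 */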

@@ -267,18 +267,20 @@
 #define online_spare_bad_dramcs(pvt, c)	(((pvt)->online_spare >> (4 + 4 * (c))) & 0x7)

 #define F10_NB_ARRAY_ADDR		0xB8
-#define F10_NB_ARRAY_DRAM_ECC		BIT(31)
+#define F10_NB_ARRAY_DRAM		BIT(31)

 /* Bits [2:1] are used to select 16-byte section within a 64-byte cacheline  */
-#define SET_NB_ARRAY_ADDRESS(section)	(((section) & 0x3) << 1)
+#define SET_NB_ARRAY_ADDR(section)	(((section) & 0x3) << 1)

 #define F10_NB_ARRAY_DATA		0xBC
-#define SET_NB_DRAM_INJECTION_WRITE(word, bits)  \
-					(BIT(((word) & 0xF) + 20) | \
-					BIT(17) | bits)
-#define SET_NB_DRAM_INJECTION_READ(word, bits)  \
-					(BIT(((word) & 0xF) + 20) | \
-					BIT(16) |  bits)
+#define F10_NB_ARR_ECC_WR_REQ		BIT(17)
+#define SET_NB_DRAM_INJECTION_WRITE(inj)  \
+					(BIT(((inj.word) & 0xF) + 20) | \
+					F10_NB_ARR_ECC_WR_REQ | inj.bit_map)
+#define SET_NB_DRAM_INJECTION_READ(inj)  \
+					(BIT(((inj.word) & 0xF) + 20) | \
+					BIT(16) |  inj.bit_map)
+

 #define NBCAP				0xE8
 #define NBCAP_CHIPKILL			BIT(4)
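
Hand-expanding the reworked write macro with illustrative staged values
(inj.word == 2, inj.bit_map == 0x5) shows what lands in F10_NB_ARRAY_DATA:

	/* BIT((2 & 0xF) + 20)   == BIT(22) == 0x00400000
	 * F10_NB_ARR_ECC_WR_REQ == BIT(17) == 0x00020000
	 * inj.bit_map           ==            0x00000005
	 * word_bits             ==            0x00420005
	 */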
@@ -374,6 +376,23 @@ struct amd64_pvt {
 	struct error_injection injection;
 };

+enum err_codes {
+	DECODE_OK	=  0,
+	ERR_NODE	= -1,
+	ERR_CSROW	= -2,
+	ERR_CHANNEL	= -3,
+};
+
+struct err_info {
+	int err_code;
+	struct mem_ctl_info *src_mci;
+	int csrow;
+	int channel;
+	u16 syndrome;
+	u32 page;
+	u32 offset;
+};
+
 static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i)
 {
 	u64 addr = ((u64)pvt->ranges[i].base.lo & 0xffff0000) << 8;
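
Taken together with the amd64_edac.c hunks above, the decode flow around the
new struct is (a condensed restatement of the commit's own code, not new
logic):

	struct err_info err;

	memset(&err, 0, sizeof(err));			/* err_code starts at DECODE_OK */
	err.syndrome = extract_syndrome(m->status);	/* CEs only */
	pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, &err);	/* may set ERR_* */
	__log_bus_error(mci, &err, ecc_type);		/* err_code -> message string */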
@@ -447,7 +466,7 @@ static inline void amd64_remove_sysfs_inject_files(struct mem_ctl_info *mci)
 struct low_ops {
 	int (*early_channel_count)	(struct amd64_pvt *pvt);
 	void (*map_sysaddr_to_csrow)	(struct mem_ctl_info *mci, u64 sys_addr,
-					 u16 syndrome);
+					 struct err_info *);
 	int (*dbam_to_cs)		(struct amd64_pvt *pvt, u8 dct, unsigned cs_mode);
 	int (*read_dct_pci_cfg)		(struct amd64_pvt *pvt, int offset,
 					 u32 *val, const char *func);
@@ -459,6 +478,8 @@ struct amd64_family_type {
 	struct low_ops ops;
 };

+int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset,
+			       u32 *val, const char *func);
 int __amd64_write_pci_cfg_dword(struct pci_dev *pdev, int offset,
 				u32 val, const char *func);

@@ -475,3 +496,15 @@ int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
 			     u64 *hole_offset, u64 *hole_size);

 #define to_mci(k) container_of(k, struct mem_ctl_info, dev)
+
+/* Injection helpers */
+static inline void disable_caches(void *dummy)
+{
+	write_cr0(read_cr0() | X86_CR0_CD);
+	wbinvd();
+}
+
+static inline void enable_caches(void *dummy)
+{
+	write_cr0(read_cr0() & ~X86_CR0_CD);
+}
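
These helpers exist because an injected ECC write must actually reach DRAM:
setting CR0.CD stops the caches from absorbing it, and WBINVD flushes what is
already cached. They run on every CPU, bracketing the injection write exactly
as the amd64_edac_inj.c hunk below does:

	on_each_cpu(disable_caches, NULL, 1);	/* CR0.CD=1 + WBINVD on all CPUs */
	amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, word_bits);
	/* ... poll F10_NB_ARR_ECC_WR_REQ until the hardware completes ... */
	on_each_cpu(enable_caches, NULL, 1);	/* restore caching */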
drivers/edac/amd64_edac_inj.c  +67 −61
@@ -22,10 +22,11 @@ static ssize_t amd64_inject_section_store(struct device *dev,
 	struct mem_ctl_info *mci = to_mci(dev);
 	struct amd64_pvt *pvt = mci->pvt_info;
 	unsigned long value;
-	int ret = 0;
+	int ret;

 	ret = strict_strtoul(data, 10, &value);
-	if (ret != -EINVAL) {
+	if (ret < 0)
+		return ret;

 	if (value > 3) {
 		amd64_warn("%s: invalid section 0x%lx\n", __func__, value);
@@ -35,8 +36,6 @@ static ssize_t amd64_inject_section_store(struct device *dev,
 	pvt->injection.section = (u32) value;
 	return count;
 }
-	return ret;
-}

 static ssize_t amd64_inject_word_show(struct device *dev,
 					struct device_attribute *mattr,
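
The same reshuffle is applied to every store handler in this file: parse
first, propagate any negative return, then range-check. The resulting shape
(a sketch; strict_strtoul() is the pre-kstrtoul parsing helper of this era):

	ret = strict_strtoul(data, 10, &value);
	if (ret < 0)			/* parse failure: propagate it */
		return ret;

	if (value > 3)			/* knob-specific range check */
		return -EINVAL;

	pvt->injection.section = (u32) value;
	return count;			/* sysfs convention: input consumed */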
@@ -60,10 +59,11 @@ static ssize_t amd64_inject_word_store(struct device *dev,
 	struct mem_ctl_info *mci = to_mci(dev);
 	struct amd64_pvt *pvt = mci->pvt_info;
 	unsigned long value;
-	int ret = 0;
+	int ret;

 	ret = strict_strtoul(data, 10, &value);
-	if (ret != -EINVAL) {
+	if (ret < 0)
+		return ret;

 	if (value > 8) {
 		amd64_warn("%s: invalid word 0x%lx\n", __func__, value);
@@ -73,8 +73,6 @@ static ssize_t amd64_inject_word_store(struct device *dev,
 	pvt->injection.word = (u32) value;
 	return count;
 }
-	return ret;
-}

 static ssize_t amd64_inject_ecc_vector_show(struct device *dev,
 					    struct device_attribute *mattr,
@@ -97,22 +95,20 @@ static ssize_t amd64_inject_ecc_vector_store(struct device *dev,
 	struct mem_ctl_info *mci = to_mci(dev);
 	struct amd64_pvt *pvt = mci->pvt_info;
 	unsigned long value;
-	int ret = 0;
+	int ret;

 	ret = strict_strtoul(data, 16, &value);
-	if (ret != -EINVAL) {
+	if (ret < 0)
+		return ret;

 	if (value & 0xFFFF0000) {
-			amd64_warn("%s: invalid EccVector: 0x%lx\n",
-				   __func__, value);
+		amd64_warn("%s: invalid EccVector: 0x%lx\n", __func__, value);
 		return -EINVAL;
 	}

 	pvt->injection.bit_map = (u32) value;
 	return count;
 }
-	return ret;
-}

 /*
  * Do a DRAM ECC read. Assemble staged values in the pvt area, format into
@@ -126,29 +122,26 @@ static ssize_t amd64_inject_read_store(struct device *dev,
 	struct amd64_pvt *pvt = mci->pvt_info;
 	unsigned long value;
 	u32 section, word_bits;
-	int ret = 0;
+	int ret;

 	ret = strict_strtoul(data, 10, &value);
-	if (ret != -EINVAL) {
+	if (ret < 0)
+		return ret;

 	/* Form value to choose 16-byte section of cacheline */
-		section = F10_NB_ARRAY_DRAM_ECC |
-				SET_NB_ARRAY_ADDRESS(pvt->injection.section);
+	section = F10_NB_ARRAY_DRAM | SET_NB_ARRAY_ADDR(pvt->injection.section);

 	amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_ADDR, section);

-		word_bits = SET_NB_DRAM_INJECTION_READ(pvt->injection.word,
-						pvt->injection.bit_map);
+	word_bits = SET_NB_DRAM_INJECTION_READ(pvt->injection);

 	/* Issue 'word' and 'bit' along with the READ request */
 	amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, word_bits);

-		edac_dbg(0, "section=0x%x word_bits=0x%x\n",
-			 section, word_bits);
+	edac_dbg(0, "section=0x%x word_bits=0x%x\n", section, word_bits);

 	return count;
 }
-	return ret;
-}

 /*
  * Do a DRAM ECC write. Assemble staged values in the pvt area and format into
@@ -160,31 +153,44 @@ static ssize_t amd64_inject_write_store(struct device *dev,
 {
 	struct mem_ctl_info *mci = to_mci(dev);
 	struct amd64_pvt *pvt = mci->pvt_info;
+	u32 section, word_bits, tmp;
 	unsigned long value;
-	u32 section, word_bits;
-	int ret = 0;
+	int ret;

 	ret = strict_strtoul(data, 10, &value);
-	if (ret != -EINVAL) {
+	if (ret < 0)
+		return ret;

 	/* Form value to choose 16-byte section of cacheline */
-		section = F10_NB_ARRAY_DRAM_ECC |
-				SET_NB_ARRAY_ADDRESS(pvt->injection.section);
+	section = F10_NB_ARRAY_DRAM | SET_NB_ARRAY_ADDR(pvt->injection.section);

 	amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_ADDR, section);

-		word_bits = SET_NB_DRAM_INJECTION_WRITE(pvt->injection.word,
-						pvt->injection.bit_map);
+	word_bits = SET_NB_DRAM_INJECTION_WRITE(pvt->injection);
+
+	pr_notice_once("Don't forget to decrease MCE polling interval in\n"
+			"/sys/bus/machinecheck/devices/machinecheck<CPUNUM>/check_interval\n"
+			"so that you can get the error report faster.\n");
+
+	on_each_cpu(disable_caches, NULL, 1);

 	/* Issue 'word' and 'bit' along with the READ request */
 	amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, word_bits);

-		edac_dbg(0, "section=0x%x word_bits=0x%x\n",
-			 section, word_bits);
+ retry:
+	/* wait until injection happens */
+	amd64_read_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, &tmp);
+	if (tmp & F10_NB_ARR_ECC_WR_REQ) {
+		cpu_relax();
+		goto retry;
+	}
+
+	on_each_cpu(enable_caches, NULL, 1);
+
+	edac_dbg(0, "section=0x%x word_bits=0x%x\n", section, word_bits);

 	return count;
 }
-	return ret;
-}

 /*
  * update NUM_INJ_ATTRS in case you add new members
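
Besides the cleanup, two behavioral changes ride along in the write path
above: caches are disabled across the injection (see the helpers added in
amd64_edac.h) and the handler now polls until the hardware clears the
write-request bit. The goto-based wait is equivalent to this loop form
(sketch only):

	amd64_read_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, &tmp);
	while (tmp & F10_NB_ARR_ECC_WR_REQ) {
		cpu_relax();		/* injection still pending */
		amd64_read_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, &tmp);
	}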
drivers/edac/edac_mc.c  +27 −24
@@ -974,20 +974,22 @@ static void edac_ce_error(struct mem_ctl_info *mci,
 			  long grain)
 {
 	unsigned long remapped_page;
+	char *msg_aux = "";
+
+	if (*msg)
+		msg_aux = " ";

 	if (edac_mc_get_log_ce()) {
 		if (other_detail && *other_detail)
 			edac_mc_printk(mci, KERN_WARNING,
-				       "%d CE %s on %s (%s %s - %s)\n",
-				       error_count,
-				       msg, label, location,
-				       detail, other_detail);
+				       "%d CE %s%son %s (%s %s - %s)\n",
+				       error_count, msg, msg_aux, label,
+				       location, detail, other_detail);
 		else
 			edac_mc_printk(mci, KERN_WARNING,
-				       "%d CE %s on %s (%s %s)\n",
-				       error_count,
-				       msg, label, location,
-				       detail);
+				       "%d CE %s%son %s (%s %s)\n",
+				       error_count, msg, msg_aux, label,
+				       location, detail);
 	}
 	edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count);

@@ -1022,27 +1024,31 @@ static void edac_ue_error(struct mem_ctl_info *mci,
 			  const char *other_detail,
 			  const bool enable_per_layer_report)
 {
+	char *msg_aux = "";
+
+	if (*msg)
+		msg_aux = " ";
+
 	if (edac_mc_get_log_ue()) {
 		if (other_detail && *other_detail)
 			edac_mc_printk(mci, KERN_WARNING,
-				       "%d UE %s on %s (%s %s - %s)\n",
-				       error_count,
-			               msg, label, location, detail,
-				       other_detail);
+				       "%d UE %s%son %s (%s %s - %s)\n",
+				       error_count, msg, msg_aux, label,
+				       location, detail, other_detail);
 		else
 			edac_mc_printk(mci, KERN_WARNING,
-				       "%d UE %s on %s (%s %s)\n",
-				       error_count,
-			               msg, label, location, detail);
+				       "%d UE %s%son %s (%s %s)\n",
+				       error_count, msg, msg_aux, label,
+				       location, detail);
 	}

 	if (edac_mc_get_panic_on_ue()) {
 		if (other_detail && *other_detail)
-			panic("UE %s on %s (%s%s - %s)\n",
-			      msg, label, location, detail, other_detail);
+			panic("UE %s%son %s (%s%s - %s)\n",
+			      msg, msg_aux, label, location, detail, other_detail);
 		else
-			panic("UE %s on %s (%s%s)\n",
-			      msg, label, location, detail);
+			panic("UE %s%son %s (%s%s)\n",
+			      msg, msg_aux, label, location, detail);
 	}

 	edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
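
The msg_aux change avoids a double space when the decoder supplies no
message: the format strings now splice msg and msg_aux directly before "on".
A standalone illustration (userspace C, values invented):

	#include <stdio.h>

	int main(void)
	{
		const char *msg = "";			/* no decoder-supplied text */
		const char *msg_aux = *msg ? " " : "";

		printf("%d UE %s%son %s\n", 1, msg, msg_aux, "DIMM_A");
		/* prints "1 UE on DIMM_A" with a single space; with
		 * msg = "read error" and msg_aux = " " it prints
		 * "1 UE read error on DIMM_A".
		 */
		return 0;
	}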
@@ -1101,10 +1107,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 	 */
 	for (i = 0; i < mci->n_layers; i++) {
 		if (pos[i] >= (int)mci->layers[i].size) {
-			if (type == HW_EVENT_ERR_CORRECTED)
-				p = "CE";
-			else
-				p = "UE";

 			edac_mc_printk(mci, KERN_ERR,
 				       "INTERNAL ERROR: %s value is out of range (%d >= %d)\n",
@@ -1136,6 +1138,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 	grain = 0;
 	p = label;
 	*p = '\0';
+
 	for (i = 0; i < mci->tot_dimms; i++) {
 		struct dimm_info *dimm = mci->dimms[i];

@@ -1203,6 +1206,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,

 	/* Fill the RAM location data */
 	p = location;
+
 	for (i = 0; i < mci->n_layers; i++) {
 		if (pos[i] < 0)
 			continue;
@@ -1215,7 +1219,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 		*(p - 1) = '\0';

 	/* Report the error via the trace interface */
-
 	grain_bits = fls_long(grain) + 1;
 	trace_mc_event(type, msg, label, error_count,
 		       mci->mc_idx, top_layer, mid_layer, low_layer,