Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 871d5f8d authored by Yinghai Lu, committed by Ingo Molnar
Browse files

x86: get mp_bus_to_node early



Currently, on an amd k8 system with multi ht chains, the numa_node of
pci devices under /sys/devices/pci0000:80/* is always 0, even if that
chain is on node 1 or 2 or 3.

Workaround: pcibus_to_node(bus) is used when we want to get the node that
pci_device is on.

In struct device, we already have numa_node member, and we could use
dev_to_node()/set_dev_node() to get and set numa_node in the device.
set_dev_node is called in pci_device_add() with pcibus_to_node(bus),
and pcibus_to_node uses bus->sysdata for nodeid.

The problem is that when pci_device_add() is called, bus->sysdata has not
yet been assigned the correct nodeid. The result is that numa_node will
always be 0.

pcibios_scan_root and pci_scan_root could take sysdata. So we need to get
mp_bus_to_node mapping before these two are called, and thus
get_mp_bus_to_node could get correct node for sysdata in root bus.

In scanning of the root bus, all child busses will take parent bus sysdata.
So all pci_device->dev.numa_node will be assigned correctly and automatically.

Later we could use dev_to_node(&pci_dev->dev) to get numa_node, and we
could also make other bus-specific devices get the correct numa_node
too.

This is an updated version of pci_sysdata and Jeff's pci_domain patch.

[ mingo@elte.hu: build fix ]

Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
parent bb63b421
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -10,5 +10,6 @@ pci-y += legacy.o irq.o

pci-$(CONFIG_X86_VISWS)		:= visws.o fixup.o
pci-$(CONFIG_X86_NUMAQ)		:= numa.o irq.o
pci-$(CONFIG_NUMA)		+= mp_bus_to_node.o

obj-y				+= $(pci-y) common.o early.o
+17 −10
Original line number Diff line number Diff line
@@ -191,7 +191,10 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
{
	struct pci_bus *bus;
	struct pci_sysdata *sd;
	int node;
#ifdef CONFIG_ACPI_NUMA
	int pxm;
#endif

	dmi_check_system(acpi_pciprobe_dmi_table);

@@ -201,6 +204,17 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
		return NULL;
	}

	node = -1;
#ifdef CONFIG_ACPI_NUMA
	pxm = acpi_get_pxm(device->handle);
	if (pxm >= 0)
		node = pxm_to_node(pxm);
	if (node != -1)
		set_mp_bus_to_node(busnum, node);
	else
		node = get_mp_bus_to_node(busnum);
#endif

	/* Allocate per-root-bus (not per bus) arch-specific data.
	 * TODO: leak; this memory is never freed.
	 * It's arguable whether it's worth the trouble to care.
@@ -212,13 +226,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
	}

	sd->domain = domain;
	sd->node = -1;

	pxm = acpi_get_pxm(device->handle);
#ifdef CONFIG_ACPI_NUMA
	if (pxm >= 0)
		sd->node = pxm_to_node(pxm);
#endif
	sd->node = node;
	/*
	 * Maybe the desired pci bus has been already scanned. In such case
	 * it is unnecessary to scan the pci bus with the given domain,busnum.
@@ -238,9 +246,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
		kfree(sd);

#ifdef CONFIG_ACPI_NUMA
	if (bus != NULL) {
	if (bus) {
		if (pxm >= 0) {
			printk("bus %d -> pxm %d -> node %d\n",
			printk(KERN_DEBUG "bus %02x -> pxm %d -> node %d\n",
				busnum, pxm, pxm_to_node(pxm));
		}
	}
@@ -248,7 +256,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do

	if (bus && (pci_probe & PCI_USE__CRS))
		get_current_resources(device, busnum, domain, bus);
	
	return bus;
}

+14 −4
Original line number Diff line number Diff line
@@ -342,9 +342,14 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
		return NULL;
	}

	sd->node = get_mp_bus_to_node(busnum);

	printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
	bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
	if (!bus)
		kfree(sd);

	return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
	return bus;
}

extern u8 pci_cache_line_size;
@@ -480,7 +485,7 @@ void pcibios_disable_device (struct pci_dev *dev)
		pcibios_disable_irq(dev);
}

struct pci_bus *__devinit pci_scan_bus_with_sysdata(int busno)
struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
{
	struct pci_bus *bus = NULL;
	struct pci_sysdata *sd;
@@ -495,10 +500,15 @@ struct pci_bus *__devinit pci_scan_bus_with_sysdata(int busno)
		printk(KERN_ERR "PCI: OOM, skipping PCI bus %02x\n", busno);
		return NULL;
	}
	sd->node = -1;
	bus = pci_scan_bus(busno, &pci_root_ops, sd);
	sd->node = node;
	bus = pci_scan_bus(busno, ops, sd);
	if (!bus)
		kfree(sd);

	return bus;
}

/*
 * Scan bus @busno through pci_root_ops without naming a node:
 * -1 is handed to pci_scan_bus_on_node() as the node argument.
 * Returns whatever pci_scan_bus_on_node() returns.
 */
struct pci_bus *pci_scan_bus_with_sysdata(int busno)
{
	struct pci_bus *scanned;

	scanned = pci_scan_bus_on_node(busno, &pci_root_ops, -1);

	return scanned;
}
+3 −1
Original line number Diff line number Diff line
@@ -136,9 +136,11 @@ static void __init pirq_peer_trick(void)
		busmap[e->bus] = 1;
	}
	for(i = 1; i < 256; i++) {
		int node;
		if (!busmap[i] || pci_find_bus(0, i))
			continue;
		if (pci_scan_bus_with_sysdata(i))
		node = get_mp_bus_to_node(i);
		if (pci_scan_bus_on_node(i, &pci_root_ops, node))
			printk(KERN_INFO "PCI: Discovered primary peer "
			       "bus %02x [IRQ]\n", i);
	}
+66 −26
Original line number Diff line number Diff line
#include <linux/init.h>
#include <linux/pci.h>
#include <asm/pci-direct.h>
#include <asm/mpspec.h>
#include <linux/cpumask.h>
#include <linux/topology.h>

/*
 * This discovers the pcibus <-> node mapping on AMD K8.
@@ -20,29 +22,73 @@
#define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF)
#define PCI_DEVICE_ID_K8HTCONFIG 0x1100

#ifdef CONFIG_NUMA

/* Number of bus numbers covered by the bus -> node table. */
#define BUS_NR 256

/* Node recorded for each bus, indexed by bus number. */
static int mp_bus_to_node[BUS_NR];

/*
 * Record @node as the node of bus @busnum.
 * Bus numbers outside [0, BUS_NR) are silently ignored.
 */
void set_mp_bus_to_node(int busnum, int node)
{
	if (busnum < 0 || busnum >= BUS_NR)
		return;

	mp_bus_to_node[busnum] = node;
}

/*
 * Look up the node recorded for bus @busnum.
 *
 * Returns -1 when @busnum lies outside [0, BUS_NR), when the table
 * entry is -1, or when the recorded node is not online; otherwise
 * returns the recorded node.
 */
int get_mp_bus_to_node(int busnum)
{
	int nid;

	if (busnum < 0 || busnum >= BUS_NR)
		return -1;

	nid = mp_bus_to_node[busnum];
	if (nid == -1)
		return -1;

	/*
	 * A node that is not online (e.g. one without RAM) is reported
	 * as -1, leaving the choice to numa_node_id() later on in
	 * dma_alloc_pages.
	 */
	if (!node_online(nid))
		return -1;

	return nid;
}

#endif

/**
 * fill_mp_bus_to_cpumask()
 * early_fill_mp_bus_to_node()
 * called before pcibios_scan_root and pci_scan_bus
 * fills the mp_bus_to_node array according to the LDT Bus Number
 * Registers found in the K8 northbridge
 */
__init static int
fill_mp_bus_to_cpumask(void)
early_fill_mp_bus_to_node(void)
{
	struct pci_dev *nb_dev = NULL;
#ifdef CONFIG_NUMA
	int i, j;
	unsigned slot;
	u32 ldtbus, nid;
	u32 id;
	static int lbnr[3] = {
		LDT_BUS_NUMBER_REGISTER_0,
		LDT_BUS_NUMBER_REGISTER_1,
		LDT_BUS_NUMBER_REGISTER_2
	};

	while ((nb_dev = pci_get_device(PCI_VENDOR_ID_AMD,
			PCI_DEVICE_ID_K8HTCONFIG, nb_dev))) {
		pci_read_config_dword(nb_dev, NODE_ID_REGISTER, &nid);
	for (i = 0; i < BUS_NR; i++)
		mp_bus_to_node[i] = -1;

	if (!early_pci_allowed())
		return -1;

	for (slot = 0x18; slot < 0x20; slot++) {
		id = read_pci_config(0, slot, 0, PCI_VENDOR_ID);
		if (id != (PCI_VENDOR_ID_AMD | (PCI_DEVICE_ID_K8HTCONFIG<<16)))
			break;
		nid = read_pci_config(0, slot, 0, NODE_ID_REGISTER);

		for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) {
			pci_read_config_dword(nb_dev, lbnr[i], &ldtbus);
			ldtbus = read_pci_config(0, slot, 0, lbnr[i]);
			/*
			 * if there are no busses hanging off of the current
			 * ldt link then both the secondary and subordinate
@@ -58,26 +104,20 @@ fill_mp_bus_to_cpumask(void)
				for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus);
				     j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus);
				     j++) {
					struct pci_bus *bus;
					struct pci_sysdata *sd;

					long node = NODE_ID(nid);
					/* Algorithm a bit dumb, but
 					   it shouldn't matter here */
					bus = pci_find_bus(0, j);
					if (!bus)
						continue;
					if (!node_online(node))
						node = 0;

					sd = bus->sysdata;
					sd->node = node;
					int node = NODE_ID(nid);
					mp_bus_to_node[j] = (unsigned char)node;
				}
			}
		}
	}

	for (i = 0; i < BUS_NR; i++) {
		int node = mp_bus_to_node[i];
		if (node >= 0)
			printk(KERN_DEBUG "bus: %02x to node: %02x\n", i, node);
	}
#endif
	return 0;
}

fs_initcall(fill_mp_bus_to_cpumask);
postcore_initcall(early_fill_mp_bus_to_node);
Loading