
Commit 4037d452 authored by Christoph Lameter, committed by Linus Torvalds

Move remote node draining out of slab allocators



Currently the slab allocators contain callbacks into the page allocator to
perform the draining of pagesets on remote nodes.  This requires SLUB to have
a whole subsystem in order to be compatible with SLAB.  Moving node draining
out of the slab allocators avoids a section of code in SLUB.

Move the node draining so that it is done when the vm statistics are updated.
At that point we are already touching all the cachelines with the pagesets of
a processor.

Add an expire counter there.  If we have to update per zone or global vm
statistics then assume that the pageset will require subsequent draining.

The expire counter will be decremented on each vm stats update pass until it
reaches zero.  Then we will drain one batch from the pageset.  The draining
will cause vm counter updates which will then cause another expiration until
the pcp is empty.  So we will drain a batch every 3 seconds.
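
To illustrate the cadence described above, here is a small standalone C sketch that models one per-cpu pageset going through repeated vm statistics passes.  It is illustrative only: struct pcp_sim and vmstat_pass() are invented stand-ins, not kernel code; in the kernel the check sits in the per-cpu vm statistics update path and the actual freeing is done by drain_zone_pages().

#include <stdio.h>

/* Simplified stand-ins for the kernel's per-cpu pageset bookkeeping. */
struct pcp_sim {
	int count;	/* pages currently held in the per-cpu list */
	int batch;	/* pages released by one drain */
	int expire;	/* vm stats passes left before the next drain */
};

/*
 * One vm statistics update pass (roughly once per second).  Returns 1 if
 * the pass drained pages, i.e. dirtied the counters itself.
 */
static int vmstat_pass(struct pcp_sim *p, int stats_dirty)
{
	if (stats_dirty)
		p->expire = 3;	/* counters were touched: (re)arm, ~3s until flush */

	if (!p->expire || !p->count)
		return 0;	/* nothing armed, or nothing left to drain */

	if (--p->expire)
		return 0;	/* not expired yet */

	/*
	 * Expired: drain one batch.  Freeing the pages updates the vm
	 * counters again, which re-arms the timer on the next pass, so one
	 * batch goes out roughly every three passes until the pcp is empty.
	 */
	p->count -= (p->count < p->batch) ? p->count : p->batch;
	printf("drained a batch, %d pages left\n", p->count);
	return 1;
}

int main(void)
{
	struct pcp_sim p = { .count = 40, .batch = 16, .expire = 0 };
	int dirty = 1;	/* pretend the allocator just touched this pageset */
	int pass;

	for (pass = 0; pass < 12; pass++)
		dirty = vmstat_pass(&p, dirty);
	return 0;
}

With these numbers a 16-page batch is released on roughly every third pass until the 40 queued pages are gone, matching the "one batch every 3 seconds" behaviour described above.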

Note that remote node draining is a somewhat esoteric feature that is required
on large NUMA systems because otherwise significant portions of system memory
can become trapped in pcp queues.  The number of pcps is determined by the
number of processors and nodes in a system.  A system with 4 processors and 2
nodes has 8 pcps which is okay.  But a system with 1024 processors and 512
nodes has 512k pcps with a high potential for large amounts of memory being
caught in them.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 77461ab3
+1 −5
@@ -176,10 +176,6 @@ extern void FASTCALL(free_cold_page(struct page *page));
 #define free_page(addr) free_pages((addr),0)
 
 void page_alloc_init(void);
-#ifdef CONFIG_NUMA
-void drain_node_pages(int node);
-#else
-static inline void drain_node_pages(int node) { };
-#endif
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 
 #endif /* __LINUX_GFP_H */
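
For context, the removed drain_node_pages() walked every zone of a node itself; with the new prototype the caller picks the zone and the per-cpu list.  The following hedged sketch shows what an equivalent caller-side loop would look like.  It is illustrative only: drain_remote_node() is an invented name and not part of this commit; the helpers it uses are the same ones that appear in the removed drain_node_pages() shown further down.

/*
 * Illustrative sketch only (drain_remote_node() is not part of this
 * commit): drain up to one batch from the hot and cold per-cpu lists of
 * every populated zone on a remote node, using the new, narrower
 * drain_zone_pages() interface.  This mirrors what the removed
 * drain_node_pages() used to do internally.
 */
static void drain_remote_node(int nodeid)
{
	enum zone_type z;
	int i;

	for (z = 0; z < MAX_NR_ZONES; z++) {
		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
		struct per_cpu_pageset *pset;

		if (!populated_zone(zone))
			continue;

		pset = zone_pcp(zone, smp_processor_id());
		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++)
			if (pset->pcp[i].count)
				drain_zone_pages(zone, &pset->pcp[i]);
	}
}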
+3 −0
@@ -83,6 +83,9 @@ struct per_cpu_pages {
 
 struct per_cpu_pageset {
 	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
+#ifdef CONFIG_NUMA
+	s8 expire;
+#endif
 #ifdef CONFIG_SMP
 	s8 stat_threshold;
 	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
+14 −31
@@ -691,30 +691,16 @@ static void __init setup_nr_node_ids(void) {}
 #ifdef CONFIG_NUMA
 /*
- * Called from the slab reaper to drain pagesets on a particular node that
- * belongs to the currently executing processor.
+ * Called from the vmstat counter updater to drain pagesets of this
+ * currently executing processor on remote nodes after they have
+ * expired.
  *
  * Note that this function must be called with the thread pinned to
  * a single processor.
  */
-void drain_node_pages(int nodeid)
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
-	int i;
-	enum zone_type z;
 	unsigned long flags;
 
-	for (z = 0; z < MAX_NR_ZONES; z++) {
-		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
-		struct per_cpu_pageset *pset;
-
-		if (!populated_zone(zone))
-			continue;
-
-		pset = zone_pcp(zone, smp_processor_id());
-		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
-			struct per_cpu_pages *pcp;
-
-			pcp = &pset->pcp[i];
-			if (pcp->count) {
 	int to_drain;
 
 	local_irq_save(flags);
@@ -726,9 +712,6 @@ void drain_node_pages(int nodeid)
 	pcp->count -= to_drain;
 	local_irq_restore(flags);
 }
-		}
-	}
-}
 #endif
 
 static void __drain_pages(unsigned int cpu)
+0 −6
@@ -928,12 +928,6 @@ static void next_reap_node(void)
 {
 	int node = __get_cpu_var(reap_node);
 
-	/*
-	 * Also drain per cpu pages on remote zones
-	 */
-	if (node != numa_node_id())
-		drain_node_pages(node);
-
 	node = next_node(node, node_online_map);
 	if (unlikely(node >= MAX_NUMNODES))
 		node = first_node(node_online_map);
+0 −84
@@ -2530,90 +2530,6 @@ static struct notifier_block __cpuinitdata slab_notifier =
 
 #endif
 
-#ifdef CONFIG_NUMA
-
-/*****************************************************************
- * Generic reaper used to support the page allocator
- * (the cpu slabs are reaped by a per slab workqueue).
- *
- * Maybe move this to the page allocator?
- ****************************************************************/
-
-static DEFINE_PER_CPU(unsigned long, reap_node);
-
-static void init_reap_node(int cpu)
-{
-	int node;
-
-	node = next_node(cpu_to_node(cpu), node_online_map);
-	if (node == MAX_NUMNODES)
-		node = first_node(node_online_map);
-
-	__get_cpu_var(reap_node) = node;
-}
-
-static void next_reap_node(void)
-{
-	int node = __get_cpu_var(reap_node);
-
-	/*
-	 * Also drain per cpu pages on remote zones
-	 */
-	if (node != numa_node_id())
-		drain_node_pages(node);
-
-	node = next_node(node, node_online_map);
-	if (unlikely(node >= MAX_NUMNODES))
-		node = first_node(node_online_map);
-	__get_cpu_var(reap_node) = node;
-}
-#else
-#define init_reap_node(cpu) do { } while (0)
-#define next_reap_node(void) do { } while (0)
-#endif
-
-#define REAPTIMEOUT_CPUC	(2*HZ)
-
-#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct delayed_work, reap_work);
-
-static void cache_reap(struct work_struct *unused)
-{
-	next_reap_node();
-	schedule_delayed_work(&__get_cpu_var(reap_work),
-				      REAPTIMEOUT_CPUC);
-}
-
-static void __devinit start_cpu_timer(int cpu)
-{
-	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
-
-	/*
-	 * When this gets called from do_initcalls via cpucache_init(),
-	 * init_workqueues() has already run, so keventd will be setup
-	 * at that time.
-	 */
-	if (keventd_up() && reap_work->work.func == NULL) {
-		init_reap_node(cpu);
-		INIT_DELAYED_WORK(reap_work, cache_reap);
-		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
-	}
-}
-
-static int __init cpucache_init(void)
-{
-	int cpu;
-
-	/*
-	 * Register the timers that drain pcp pages and update vm statistics
-	 */
-	for_each_online_cpu(cpu)
-		start_cpu_timer(cpu);
-	return 0;
-}
-__initcall(cpucache_init);
-#endif
-
 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
 {
 	struct kmem_cache *s = get_slab(size, gfpflags);