Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 4484ebf1 authored by Ravikiran G Thirumalai's avatar Ravikiran G Thirumalai Committed by Linus Torvalds
Browse files

[PATCH] NUMA slab locking fixes: fix cpu down and up locking



This fixes locking and bugs in cpu_down and cpu_up paths of the NUMA slab
allocator.  Sonny Rao <sonny@burdell.org> reported problems sometime back on
POWER5 boxes, when the last cpu on the nodes were being offlined.  We could
not reproduce the same on x86_64 because the cpumask (node_to_cpumask) was not
being updated on cpu down.  Since that issue is now fixed, we can reproduce
Sonny's problems on x86_64 NUMA, and here is the fix.

The problem earlier was on CPU_DOWN, if it was the last cpu on the node to go
down, the array_caches (shared, alien) and the kmem_list3 of the node were
being freed (kfree) with the kmem_list3 lock held.  If the l3 or the
array_caches were to come from the same cache being cleared, we hit on
badness.

This patch cleans up the locking in cpu_up and cpu_down path.  We cannot
really free l3 on cpu down because, there is no node offlining yet and even
though a cpu is not yet up, node local memory can be allocated for it.  So l3s
are usually allocated at keme_cache_create and destroyed at
kmem_cache_destroy.  Hence, we don't need cachep->spinlock protection to get
to the cachep->nodelist[nodeid] either.

Patch survived onlining and offlining on a 4 core 2 node Tyan box with a 4
dbench process running all the time.

Signed-off-by: default avatarAlok N Kataria <alokk@calsoftinc.com>
Signed-off-by: default avatarRavikiran Thirumalai <kiran@scalex86.org>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent ca3b9b91
Loading
Loading
Loading
Loading
+85 −38
Original line number Diff line number Diff line
@@ -884,14 +884,14 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
	}
}

static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
{
	int i = 0;
	struct array_cache *ac;
	unsigned long flags;

	for_each_online_node(i) {
		ac = l3->alien[i];
		ac = alien[i];
		if (ac) {
			spin_lock_irqsave(&ac->lock, flags);
			__drain_alien_cache(cachep, ac, i);
@@ -901,8 +901,11 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
}
#else
#define alloc_alien_cache(node, limit) do { } while (0)
#define free_alien_cache(ac_ptr) do { } while (0)
#define drain_alien_cache(cachep, l3) do { } while (0)
#define drain_alien_cache(cachep, alien) do { } while (0)

static inline void free_alien_cache(struct array_cache **ac_ptr)
{
}
#endif

static int __devinit cpuup_callback(struct notifier_block *nfb,
@@ -936,6 +939,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
				    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;

				/*
				 * The l3s don't come and go as CPUs come and
				 * go.  cache_chain_mutex is sufficient
				 * protection here.
				 */
				cachep->nodelists[node] = l3;
			}

@@ -950,26 +958,47 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
		   & array cache's */
		list_for_each_entry(cachep, &cache_chain, next) {
			struct array_cache *nc;
			struct array_cache *shared;
			struct array_cache **alien;

			nc = alloc_arraycache(node, cachep->limit,
						cachep->batchcount);
			if (!nc)
				goto bad;
			shared = alloc_arraycache(node,
					cachep->shared * cachep->batchcount,
					0xbaadf00d);
			if (!shared)
				goto bad;
#ifdef CONFIG_NUMA
			alien = alloc_alien_cache(node, cachep->limit);
			if (!alien)
				goto bad;
#endif
			cachep->array[cpu] = nc;

			l3 = cachep->nodelists[node];
			BUG_ON(!l3);
			if (!l3->shared) {
				if (!(nc = alloc_arraycache(node,
							    cachep->shared *
							    cachep->batchcount,
							    0xbaadf00d)))
					goto bad;

				/* we are serialised from CPU_DEAD or
				   CPU_UP_CANCELLED by the cpucontrol lock */
				l3->shared = nc;
			spin_lock_irq(&l3->list_lock);
			if (!l3->shared) {
				/*
				 * We are serialised from CPU_DEAD or
				 * CPU_UP_CANCELLED by the cpucontrol lock
				 */
				l3->shared = shared;
				shared = NULL;
			}
#ifdef CONFIG_NUMA
			if (!l3->alien) {
				l3->alien = alien;
				alien = NULL;
			}
#endif
			spin_unlock_irq(&l3->list_lock);

			kfree(shared);
			free_alien_cache(alien);
		}
		mutex_unlock(&cache_chain_mutex);
		break;
@@ -978,23 +1007,32 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		/*
		 * Even if all the cpus of a node are down, we don't free the
		 * kmem_list3 of any cache. This to avoid a race between
		 * cpu_down, and a kmalloc allocation from another cpu for
		 * memory from the node of the cpu going down.  The list3
		 * structure is usually allocated from kmem_cache_create() and
		 * gets destroyed at kmem_cache_destroy().
		 */
		/* fall thru */
	case CPU_UP_CANCELED:
		mutex_lock(&cache_chain_mutex);

		list_for_each_entry(cachep, &cache_chain, next) {
			struct array_cache *nc;
			struct array_cache *shared;
			struct array_cache **alien;
			cpumask_t mask;

			mask = node_to_cpumask(node);
			spin_lock(&cachep->spinlock);
			/* cpu is dead; no one can alloc from it. */
			nc = cachep->array[cpu];
			cachep->array[cpu] = NULL;
			l3 = cachep->nodelists[node];

			if (!l3)
				goto unlock_cache;
				goto free_array_cache;

			spin_lock_irq(&l3->list_lock);

@@ -1005,33 +1043,43 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,

			if (!cpus_empty(mask)) {
				spin_unlock_irq(&l3->list_lock);
				goto unlock_cache;
				goto free_array_cache;
			}

			if (l3->shared) {
			shared = l3->shared;
			if (shared) {
				free_block(cachep, l3->shared->entry,
					   l3->shared->avail, node);
				kfree(l3->shared);
				l3->shared = NULL;
			}
			if (l3->alien) {
				drain_alien_cache(cachep, l3);
				free_alien_cache(l3->alien);

			alien = l3->alien;
			l3->alien = NULL;
			}

			/* free slabs belonging to this node */
			if (__node_shrink(cachep, node)) {
				cachep->nodelists[node] = NULL;
				spin_unlock_irq(&l3->list_lock);
				kfree(l3);
			} else {
			spin_unlock_irq(&l3->list_lock);

			kfree(shared);
			if (alien) {
				drain_alien_cache(cachep, alien);
				free_alien_cache(alien);
			}
		      unlock_cache:
			spin_unlock(&cachep->spinlock);
free_array_cache:
			kfree(nc);
		}
		/*
		 * In the previous loop, all the objects were freed to
		 * the respective cache's slabs,  now we can go ahead and
		 * shrink each nodelist to its limit.
		 */
		list_for_each_entry(cachep, &cache_chain, next) {
			l3 = cachep->nodelists[node];
			if (!l3)
				continue;
			spin_lock_irq(&l3->list_lock);
			/* free slabs belonging to this node */
			__node_shrink(cachep, node);
			spin_unlock_irq(&l3->list_lock);
		}
		mutex_unlock(&cache_chain_mutex);
		break;
#endif
@@ -2011,7 +2059,6 @@ static void drain_cpu_caches(struct kmem_cache *cachep)

	smp_call_function_all_cpus(do_drain, cachep);
	check_irq_on();
	spin_lock(&cachep->spinlock);
	for_each_online_node(node) {
		l3 = cachep->nodelists[node];
		if (l3) {
@@ -2019,10 +2066,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
			drain_array_locked(cachep, l3->shared, 1, node);
			spin_unlock_irq(&l3->list_lock);
			if (l3->alien)
				drain_alien_cache(cachep, l3);
				drain_alien_cache(cachep, l3->alien);
		}
	}
	spin_unlock(&cachep->spinlock);
}

static int __node_shrink(struct kmem_cache *cachep, int node)
@@ -3440,7 +3486,7 @@ static void cache_reap(void *unused)

		l3 = searchp->nodelists[numa_node_id()];
		if (l3->alien)
			drain_alien_cache(searchp, l3);
			drain_alien_cache(searchp, l3->alien);
		spin_lock_irq(&l3->list_lock);

		drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3598,6 +3644,7 @@ static int s_show(struct seq_file *m, void *p)
			num_slabs++;
		}
		free_objects += l3->free_objects;
		if (l3->shared)
			shared_avail += l3->shared->avail;

		spin_unlock_irq(&l3->list_lock);