Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 711d3d2c authored by KAMEZAWA Hiroyuki's avatar KAMEZAWA Hiroyuki Committed by Linus Torvalds
Browse files

memcg: cpu hotplug aware percpu count updates



Now, memcgroup's per cpu coutner uses for_each_possible_cpu() to get the
value.  It's better to use for_each_online_cpu() and a cpu hotplug
handler.

This patch only handles statistics counter.  MEM_CGROUP_ON_MOVE will be
handled in another patch.

Signed-off-by: default avatarKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 7d74b06f
Loading
Loading
Loading
Loading
+93 −9
Original line number Diff line number Diff line
@@ -89,7 +89,9 @@ enum mem_cgroup_stat_index {
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
	MEM_CGROUP_EVENTS,	/* incremented at every  pagein/pageout */
	MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
	/* incremented at every  pagein/pageout */
	MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
	MEM_CGROUP_ON_MOVE,	/* someone is moving account between groups */

	MEM_CGROUP_STAT_NSTATS,
@@ -255,6 +257,12 @@ struct mem_cgroup {
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu *stat;
	/*
	 * used when a cpu is offlined or other synchronizations
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;
};

/* Stuffs for move charges at task migration. */
@@ -531,14 +539,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
	return mz;
}

/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both of vmstat[] and percpu_counter has threshold and do periodic
 * synchronization to implement "quick" read. There are trade-off between
 * reading cost and precision of value. Then, we may have a chance to implement
 * a periodic synchronizion of counter in memcg's counter.
 *
 * But this _read() function is used for user interface now. The user accounts
 * memory usage by memory cgroup and he _always_ requires exact value because
 * he accounts memory. Even if we provide quick-and-fuzzy read, we always
 * have to visit all online cpus and make sum. So, for now, unnecessary
 * synchronization is not implemented. (just implemented for cpu hotplug)
 *
 * If there are kernel internal actions which can make use of some not-exact
 * value, and reading all cpu value can be performance bottleneck in some
 * common workload, threashold and synchonization as vmstat[] should be
 * implemented.
 */
static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 val = 0;

	for_each_possible_cpu(cpu)
	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(mem->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&mem->pcp_counter_lock);
	val += mem->nocpu_base.count[idx];
	spin_unlock(&mem->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

@@ -663,10 +697,29 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
/* The caller has to guarantee "mem" exists before calling this */
static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
{
	if (mem && css_tryget(&mem->css))
	struct cgroup_subsys_state *css;
	int found;

	if (!mem) /* ROOT cgroup has the smallest ID */
		return root_mem_cgroup; /*css_put/get against root is ignored*/
	if (!mem->use_hierarchy) {
		if (css_tryget(&mem->css))
			return mem;
		return NULL;
	}
	rcu_read_lock();
	/*
	 * searching a memory cgroup which has the smallest ID under given
	 * ROOT cgroup. (ID >= 1)
	 */
	css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
	if (css && css_tryget(css))
		mem = container_of(css, struct mem_cgroup, css);
	else
		mem = NULL;
	rcu_read_unlock();
	return mem;
}

static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
					struct mem_cgroup *root,
@@ -680,9 +733,13 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
	hierarchy_used = iter->use_hierarchy;

	css_put(&iter->css);
	if (!cond || !hierarchy_used)
	/* If no ROOT, walk all, ignore hierarchy */
	if (!cond || (root && !hierarchy_used))
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	do {
		iter = NULL;
		rcu_read_lock();
@@ -711,6 +768,9 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
#define for_each_mem_cgroup_tree(iter, root) \
	for_each_mem_cgroup_tree_cond(iter, root, true)

#define for_each_mem_cgroup_all(iter) \
	for_each_mem_cgroup_tree_cond(iter, NULL, true)


static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
{
@@ -1676,15 +1736,38 @@ static void drain_all_stock_sync(void)
	atomic_dec(&memcg_drain_count);
}

static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
/*
 * This function drains percpu counter value from DEAD cpu and
 * move it to local cpu. Note that this function can be preempted.
 */
static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
{
	int i;

	spin_lock(&mem->pcp_counter_lock);
	for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
		s64 x = per_cpu(mem->stat->count[i], cpu);

		per_cpu(mem->stat->count[i], cpu) = 0;
		mem->nocpu_base.count[i] += x;
	}
	spin_unlock(&mem->pcp_counter_lock);
}

static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;
	struct mem_cgroup *iter;

	if (action != CPU_DEAD)
	if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	for_each_mem_cgroup_all(iter)
		mem_cgroup_drain_pcp_counter(iter, cpu);

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return NOTIFY_OK;
@@ -4098,6 +4181,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
			vfree(mem);
		mem = NULL;
	}
	spin_lock_init(&mem->pcp_counter_lock);
	return mem;
}

@@ -4224,7 +4308,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
						&per_cpu(memcg_stock, cpu);
			INIT_WORK(&stock->work, drain_local_stock);
		}
		hotcpu_notifier(memcg_stock_cpu_callback, 0);
		hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
	} else {
		parent = mem_cgroup_from_cont(cont->parent);
		mem->use_hierarchy = parent->use_hierarchy;