
Commit ce00a967 authored by Johannes Weiner, committed by Linus Torvalds

mm: memcontrol: revert use of root_mem_cgroup res_counter



Dave Hansen reports a massive scalability regression in an uncontained
page fault benchmark with more than 30 concurrent threads, which he
bisected down to commit 05b84301 ("mm: memcontrol: use root_mem_cgroup
res_counter") and pinpointed to res_counter spinlock contention.

That change relied on the per-cpu charge caches to mostly swallow the
res_counter costs, but it's apparent that the caches don't scale yet.
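
[Editor's note: the cache in question is the per-cpu charge "stock" that
consume_stock() checks in the first hunk below. Each CPU pre-charges a
batch against the shared res_counter once, then serves subsequent charges
locally. A minimal userspace sketch of that idea follows; the names and
batch size are hypothetical, not the kernel implementation.]

	/*
	 * Per-cpu charge cache sketch: touch the shared counter once per
	 * batch, then charge from the local cache with no shared state.
	 */
	#define CHARGE_BATCH	32UL		/* illustrative batch size */

	static __thread unsigned long cached_pages;	/* stand-in for a per-CPU stock */

	static _Bool consume_stock_sketch(unsigned long nr_pages)
	{
		if (cached_pages < nr_pages)
			return 0;		/* miss: caller charges the shared counter */
		cached_pages -= nr_pages;	/* hit: no contended cacheline touched */
		return 1;
	}

	static void refill_stock_sketch(void)
	{
		/* called after one successful batched charge of CHARGE_BATCH */
		cached_pages += CHARGE_BATCH;
	}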

Revert memcg back to bypassing res_counters on the root level in order
to restore performance for uncontained workloads.
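
[Editor's note: a rough userspace illustration of the contended path and
of the bypass this revert restores. The names here are hypothetical and
this is not the kernel code; it only shows why a single spinlock-protected
counter serializes concurrent page faults, and why the root level, which
enforces no limit, can skip it entirely.]

	#include <errno.h>
	#include <pthread.h>
	#include <stdbool.h>

	struct counter {
		pthread_spinlock_t lock;	/* one lock shared by all CPUs */
		unsigned long usage, limit;
	};

	static int charge(struct counter *c, unsigned long nr_pages, bool is_root)
	{
		int ret = 0;

		if (is_root)			/* the revert: root has no limit to */
			return 0;		/* enforce, skip the shared counter */

		pthread_spin_lock(&c->lock);	/* 30+ faulting threads serialize here */
		if (c->usage + nr_pages > c->limit)
			ret = -ENOMEM;
		else
			c->usage += nr_pages;
		pthread_spin_unlock(&c->lock);
		return ret;
	}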

Reported-by: Dave Hansen <dave@sr71.net>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 10096fb1
mm/memcontrol.c: +78 −25
@@ -2534,6 +2534,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	unsigned long long size;
 	int ret = 0;
 
+	if (mem_cgroup_is_root(memcg))
+		goto done;
 retry:
 	if (consume_stock(memcg, nr_pages))
 		goto done;
@@ -2611,9 +2613,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (!(gfp_mask & __GFP_NOFAIL))
 		return -ENOMEM;
 bypass:
-	memcg = root_mem_cgroup;
-	ret = -EINTR;
-	goto retry;
+	return -EINTR;
 
 done_restock:
 	if (batch > nr_pages)
@@ -2626,6 +2626,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	unsigned long bytes = nr_pages * PAGE_SIZE;
 
+	if (mem_cgroup_is_root(memcg))
+		return;
+
 	res_counter_uncharge(&memcg->res, bytes);
 	if (do_swap_account)
 		res_counter_uncharge(&memcg->memsw, bytes);
@@ -2640,6 +2643,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
 {
 	unsigned long bytes = nr_pages * PAGE_SIZE;
 
+	if (mem_cgroup_is_root(memcg))
+		return;
+
 	res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
 	if (do_swap_account)
 		res_counter_uncharge_until(&memcg->memsw,
@@ -4093,6 +4099,46 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
 	return retval;
 }
 
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
+					       enum mem_cgroup_stat_index idx)
+{
+	struct mem_cgroup *iter;
+	long val = 0;
+
+	/* Per-cpu values can be negative, use a signed accumulator */
+	for_each_mem_cgroup_tree(iter, memcg)
+		val += mem_cgroup_read_stat(iter, idx);
+
+	if (val < 0) /* race ? */
+		val = 0;
+	return val;
+}
+
+static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+{
+	u64 val;
+
+	if (!mem_cgroup_is_root(memcg)) {
+		if (!swap)
+			return res_counter_read_u64(&memcg->res, RES_USAGE);
+		else
+			return res_counter_read_u64(&memcg->memsw, RES_USAGE);
+	}
+
+	/*
+	 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
+	 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
+	 */
+	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
+
+	if (swap)
+		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
+
+	return val << PAGE_SHIFT;
+}
+
+
 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
 			       struct cftype *cft)
 {
@@ -4102,8 +4148,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
 
 	switch (type) {
 	case _MEM:
+		if (name == RES_USAGE)
+			return mem_cgroup_usage(memcg, false);
 		return res_counter_read_u64(&memcg->res, name);
 	case _MEMSWAP:
+		if (name == RES_USAGE)
+			return mem_cgroup_usage(memcg, true);
 		return res_counter_read_u64(&memcg->memsw, name);
 	case _KMEM:
 		return res_counter_read_u64(&memcg->kmem, name);
@@ -4572,10 +4622,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
 	if (!t)
 		goto unlock;
 
-	if (!swap)
-		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
-	else
-		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+	usage = mem_cgroup_usage(memcg, swap);
 
 	/*
 	 * current_threshold points to threshold just below or equal to usage.
@@ -4673,10 +4720,10 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
 
 	if (type == _MEM) {
 		thresholds = &memcg->thresholds;
-		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+		usage = mem_cgroup_usage(memcg, false);
 	} else if (type == _MEMSWAP) {
 		thresholds = &memcg->memsw_thresholds;
-		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+		usage = mem_cgroup_usage(memcg, true);
 	} else
 		BUG();
 
@@ -4762,10 +4809,10 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
 
 	if (type == _MEM) {
 		thresholds = &memcg->thresholds;
-		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+		usage = mem_cgroup_usage(memcg, false);
 	} else if (type == _MEMSWAP) {
 		thresholds = &memcg->memsw_thresholds;
-		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+		usage = mem_cgroup_usage(memcg, true);
 	} else
 		BUG();
 
@@ -5525,9 +5572,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
 		 * core guarantees its existence.
 		 */
 	} else {
-		res_counter_init(&memcg->res, &root_mem_cgroup->res);
-		res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw);
-		res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
+		res_counter_init(&memcg->res, NULL);
+		res_counter_init(&memcg->memsw, NULL);
+		res_counter_init(&memcg->kmem, NULL);
 		/*
 		 * Deeper hierachy with use_hierarchy == false doesn't make
 		 * much sense so let cgroup subsystem know about this
@@ -5969,6 +6016,7 @@ static void __mem_cgroup_clear_mc(void)
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {
 		/* uncharge swap account from the old cgroup */
-		res_counter_uncharge(&mc.from->memsw,
-				     PAGE_SIZE * mc.moved_swap);
+		if (!mem_cgroup_is_root(mc.from))
+			res_counter_uncharge(&mc.from->memsw,
+					     PAGE_SIZE * mc.moved_swap);
 
@@ -5979,6 +6027,7 @@ static void __mem_cgroup_clear_mc(void)
 		 * we charged both to->res and to->memsw, so we should
 		 * uncharge to->res.
 		 */
-		res_counter_uncharge(&mc.to->res,
-				     PAGE_SIZE * mc.moved_swap);
+		if (!mem_cgroup_is_root(mc.to))
+			res_counter_uncharge(&mc.to->res,
+					     PAGE_SIZE * mc.moved_swap);
 		/* we've already done css_get(mc.to) */
@@ -6345,6 +6394,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
 	rcu_read_lock();
 	memcg = mem_cgroup_lookup(id);
 	if (memcg) {
-		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+		if (!mem_cgroup_is_root(memcg))
+			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 		mem_cgroup_swap_statistics(memcg, false);
 		css_put(&memcg->css);
@@ -6509,12 +6559,15 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 {
 	unsigned long flags;
 
-	if (nr_mem)
-		res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE);
-	if (nr_memsw)
-		res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE);
-
-	memcg_oom_recover(memcg);
+	if (!mem_cgroup_is_root(memcg)) {
+		if (nr_mem)
+			res_counter_uncharge(&memcg->res,
+					     nr_mem * PAGE_SIZE);
+		if (nr_memsw)
+			res_counter_uncharge(&memcg->memsw,
+					     nr_memsw * PAGE_SIZE);
+		memcg_oom_recover(memcg);
+	}
 
 	local_irq_save(flags);
 	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
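
[Editor's note: a consequence of the revert, visible in the
mem_cgroup_usage() hunk above, is that root-level usage is no longer read
from a res_counter but summed from per-cpu statistics, so the result is
only approximately consistent and the racy sum can transiently go negative,
hence the clamp in mem_cgroup_recursive_stat(). A minimal C11 sketch of
that aggregation pattern, illustrative only and not the kernel code:]

	#include <stdatomic.h>

	#define NR_CPUS 64	/* illustrative */

	/* signed per-cpu deltas: uncharges can transiently outrun charges */
	static _Atomic long percpu_pages[NR_CPUS];

	static void stat_mod(int cpu, long pages)	/* hot path: local, lock-free */
	{
		atomic_fetch_add_explicit(&percpu_pages[cpu], pages,
					  memory_order_relaxed);
	}

	static unsigned long stat_read(void)		/* slow path: sum all cpus */
	{
		long val = 0;

		for (int cpu = 0; cpu < NR_CPUS; cpu++)
			val += atomic_load_explicit(&percpu_pages[cpu],
						    memory_order_relaxed);

		return val < 0 ? 0 : val;	/* racy sum may dip below zero */
	}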