Commit 32047e2a authored by KAMEZAWA Hiroyuki, committed by Linus Torvalds

memcg: avoid lock in updating file_mapped (was: "fix race in file_mapped accounting flag management")

When accounting file events per memory cgroup, we need to find the memory
cgroup via page_cgroup->mem_cgroup.  Currently, we use lock_page_cgroup() to
guarantee that pc->mem_cgroup is not overwritten while we make use of it.
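
For reference, lock_page_cgroup() is a bit spinlock embedded in the
page_cgroup flags word, roughly as below in kernels of this era (quoted
approximately), which is why every file-stat update pays for an atomic
read-modify-write on the flags:

	/* include/linux/page_cgroup.h, approximately */
	static inline void lock_page_cgroup(struct page_cgroup *pc)
	{
		bit_spin_lock(PCG_LOCK, &pc->flags);
	}

	static inline void unlock_page_cgroup(struct page_cgroup *pc)
	{
		bit_spin_unlock(PCG_LOCK, &pc->flags);
	}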

But, considering the contexts in which the page_cgroup of a file page is
accessed, we can use an alternative lightweight form of mutual exclusion in
most cases.

When handling file caches, the only race we have to take care of is account
"moving", IOW, overwriting of page_cgroup->mem_cgroup.  (See the comment in
the patch.)

Unlike charge/uncharge, a "move" happens infrequently: only at rmdir() and at
task moving (and then only with special settings).  This patch adds a race
checker for file-cache-status accounting vs. account moving: a new
per-cpu, per-memcg counter, MEM_CGROUP_ON_MOVE.  The account-move routine:
  1. increments it before starting the move,
  2. calls synchronize_rcu(),
  3. decrements it after the move has finished.
The synchronize_rcu() in step 2 guarantees that every reader that may have
sampled the counter as zero has left its RCU read-side critical section before
the move actually rewrites pc->mem_cgroup.  With this, the
file-status-counting routine can check whether it needs to call
lock_page_cgroup(); in most cases it doesn't, as sketched below.
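
A minimal, self-contained sketch of the idiom (hypothetical names; a single
global counter stands in for the patch's per-cpu, per-memcg
MEM_CGROUP_ON_MOVE counter):

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/spinlock.h>

	static int move_in_progress;		/* MEM_CGROUP_ON_MOVE stand-in */
	static DEFINE_SPINLOCK(move_lock);	/* stand-in for mc.lock */

	static void start_move(void)
	{
		spin_lock(&move_lock);
		move_in_progress++;
		spin_unlock(&move_lock);
		/*
		 * After this grace period, every reader that may have seen
		 * the counter as zero has left its read-side critical
		 * section; all new readers will take the slow path.
		 */
		synchronize_rcu();
	}

	static void end_move(void)
	{
		spin_lock(&move_lock);
		move_in_progress--;
		spin_unlock(&move_lock);
	}

	static void update_file_stat(void)
	{
		rcu_read_lock();
		if (unlikely(move_in_progress)) {
			/* slow path: the owner may be rewritten under us,
			 * so fall back to the heavyweight lock */
		} else {
			/* fast path: the owner stays valid until
			 * rcu_read_unlock(); no extra lock needed */
		}
		rcu_read_unlock();
	}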

The following is perf data for a process that mmap()s and munmap()s 32MB of
file cache over the course of a minute.
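
The benchmark source is not part of the commit; a user-space program along
these lines (a hypothetical reconstruction: file name, page size and loop
details are assumed) reproduces the described workload:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <time.h>
	#include <unistd.h>

	#define SIZE (32UL * 1024 * 1024)

	int main(void)
	{
		int fd = open("testfile", O_RDWR);  /* pre-created, >= 32MB */
		time_t end = time(NULL) + 60;

		if (fd < 0) {
			perror("open");
			return 1;
		}
		while (time(NULL) < end) {
			char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
				       MAP_SHARED, fd, 0);
			if (p == MAP_FAILED) {
				perror("mmap");
				return 1;
			}
			/* touch every page so each map/unmap goes through
			 * mem_cgroup_update_file_mapped() */
			for (size_t off = 0; off < SIZE; off += 4096)
				p[off]++;
			munmap(p, SIZE);
		}
		close(fd);
		return 0;
	}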

Before patch:
    28.25%     mmap  mmap               [.] main
    22.64%     mmap  [kernel.kallsyms]  [k] page_fault
     9.96%     mmap  [kernel.kallsyms]  [k] mem_cgroup_update_file_mapped
     3.67%     mmap  [kernel.kallsyms]  [k] filemap_fault
     3.50%     mmap  [kernel.kallsyms]  [k] unmap_vmas
     2.99%     mmap  [kernel.kallsyms]  [k] __do_fault
     2.76%     mmap  [kernel.kallsyms]  [k] find_get_page

After patch:
    30.00%     mmap  mmap               [.] main
    23.78%     mmap  [kernel.kallsyms]  [k] page_fault
     5.52%     mmap  [kernel.kallsyms]  [k] mem_cgroup_update_file_mapped
     3.81%     mmap  [kernel.kallsyms]  [k] unmap_vmas
     3.26%     mmap  [kernel.kallsyms]  [k] find_get_page
     3.18%     mmap  [kernel.kallsyms]  [k] __do_fault
     3.03%     mmap  [kernel.kallsyms]  [k] filemap_fault
     2.40%     mmap  [kernel.kallsyms]  [k] handle_mm_fault
     2.40%     mmap  [kernel.kallsyms]  [k] do_page_fault

This patch reduces memcg's overhead to some extent.
(mem_cgroup_update_file_mapped() is called on both map and unmap.)
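
For context, the two call sites live in mm/rmap.c; quoted approximately from
a contemporary kernel (page_remove_rmap(), the -1 side, is elided down to
its relevant line):

	void page_add_file_rmap(struct page *page)
	{
		if (atomic_inc_and_test(&page->_mapcount)) {
			__inc_zone_page_state(page, NR_FILE_MAPPED);
			mem_cgroup_update_file_mapped(page, 1);
		}
	}

	/* and, when the last mapping of a file page goes away: */
	void page_remove_rmap(struct page *page)
	{
		...
		mem_cgroup_update_file_mapped(page, -1);
		...
	}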

Note: It seems some further improvements are possible, but it isn't obvious
      what; maybe the flag set/unset handling needs to be removed.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 0c270f8f
+85 −14
@@ -90,6 +90,7 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 	MEM_CGROUP_EVENTS,	/* incremented at every pagein/pageout */
+	MEM_CGROUP_ON_MOVE,	/* someone is moving account between groups */
 
 	MEM_CGROUP_STAT_NSTATS,
 };
@@ -1051,7 +1052,46 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
 	return swappiness;
 }
 
+/* Routines for marking the start and end of account moving on "mem" */
+static void mem_cgroup_start_move(struct mem_cgroup *mem)
+{
+	int cpu;
+	/* Because this is for moving account, reuse mc.lock */
+	spin_lock(&mc.lock);
+	for_each_possible_cpu(cpu)
+		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+	spin_unlock(&mc.lock);
+
+	synchronize_rcu();
+}
+
+static void mem_cgroup_end_move(struct mem_cgroup *mem)
+{
+	int cpu;
+
+	if (!mem)
+		return;
+	spin_lock(&mc.lock);
+	for_each_possible_cpu(cpu)
+		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+	spin_unlock(&mc.lock);
+}
+
+/*
+ * Two routines for checking whether "mem" is under move_account():
+ *
+ * mem_cgroup_stealed() - checks whether a cgroup is mc.from. This is
+ *			  used to avoid races in accounting: if true,
+ *			  pc->mem_cgroup may be overwritten.
+ *
+ * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to,
+ *			  or under the hierarchy of the moving cgroups.
+ *			  This is used for waiting out the high memory
+ *			  pressure caused by a "move".
+ */
+
+static bool mem_cgroup_stealed(struct mem_cgroup *mem)
+{
+	VM_BUG_ON(!rcu_read_lock_held());
+	return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+}
+
 static bool mem_cgroup_under_move(struct mem_cgroup *mem)
 {
@@ -1462,35 +1502,62 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 /*
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
+ *
+ * Notes: race conditions
+ *
+ * We usually use page_cgroup_lock() for accessing page_cgroup members, but
+ * it tends to be costly. Considering some conditions, however, we don't
+ * need to do so _always_.
+ *
+ * Considering "charge": lock_page_cgroup() is not required because all
+ * file-stat operations happen after a page is attached to the radix tree.
+ * There is no race with "charge".
+ *
+ * Considering "uncharge": we know that memcg intentionally doesn't clear
+ * pc->mem_cgroup at "uncharge", so we always see a valid pc->mem_cgroup
+ * even if we race with "uncharge". The statistics themselves are kept
+ * consistent via flags.
+ *
+ * Considering "move": this is the only case where we see a race. To keep
+ * the race window small, we check the MEM_CGROUP_ON_MOVE percpu value to
+ * detect a possible race, and take the lock only if there is one.
  */
 void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
 	struct mem_cgroup *mem;
-	struct page_cgroup *pc;
+	struct page_cgroup *pc = lookup_page_cgroup(page);
+	bool need_unlock = false;
 
-	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc))
 		return;
 
-	lock_page_cgroup(pc);
+	rcu_read_lock();
 	mem = pc->mem_cgroup;
-	if (!mem || !PageCgroupUsed(pc))
-		goto done;
+	if (unlikely(!mem || !PageCgroupUsed(pc)))
+		goto out;
+	/* is pc->mem_cgroup unstable? */
+	if (unlikely(mem_cgroup_stealed(mem))) {
+		/* take the lock to safely access pc->mem_cgroup */
+		lock_page_cgroup(pc);
+		need_unlock = true;
+		mem = pc->mem_cgroup;
+		if (!mem || !PageCgroupUsed(pc))
+			goto out;
+	}
 
-	/*
-	 * Preemption is already disabled. We can use __this_cpu_xxx
-	 */
 	if (val > 0) {
-		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		SetPageCgroupFileMapped(pc);
 	} else {
-		__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		if (!page_mapped(page)) /* for race between dec->inc counter */
 			ClearPageCgroupFileMapped(pc);
 	}
 
-done:
-	unlock_page_cgroup(pc);
+out:
+	if (unlikely(need_unlock))
+		unlock_page_cgroup(pc);
+	rcu_read_unlock();
+	return;
 }
 
 /*
@@ -3039,6 +3106,7 @@ move_account:
 		lru_add_drain_all();
 		drain_all_stock_sync();
 		ret = 0;
+		mem_cgroup_start_move(mem);
 		for_each_node_state(node, N_HIGH_MEMORY) {
 			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
 				enum lru_list l;
@@ -3052,6 +3120,7 @@ move_account:
 			if (ret)
 				break;
 		}
+		mem_cgroup_end_move(mem);
 		memcg_oom_recover(mem);
 		/* it seems parent cgroup doesn't have enough mem */
 		if (ret == -ENOMEM)
@@ -4514,6 +4583,7 @@ static void mem_cgroup_clear_mc(void)
 	mc.to = NULL;
 	mc.moving_task = NULL;
 	spin_unlock(&mc.lock);
+	mem_cgroup_end_move(from);
 	memcg_oom_recover(from);
 	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
@@ -4544,6 +4614,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
 			VM_BUG_ON(mc.moving_task);
+			mem_cgroup_start_move(from);
 			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = mem;