Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 66e1707b authored by Balbir Singh, committed by Linus Torvalds
Browse files

Memory controller: add per cgroup LRU and reclaim



Add the page_cgroup to the per-cgroup LRU.  The reclaim algorithm has
been modified to make isolate_lru_pages() a pluggable component.  The
scan_control data structure now accepts the cgroup on whose behalf
reclaim is carried out.  try_to_free_pages() has been extended to become
cgroup-aware.

[akpm@linux-foundation.org: fix warning]
[Lee.Schermerhorn@hp.com: initialize all scan_control's isolate_pages member]
[bunk@kernel.org: make do_try_to_free_pages() static]
[hugh@veritas.com: memcgroup: fix try_to_free order]
[kamezawa.hiroyu@jp.fujitsu.com: this unlock_page_cgroup() is unnecessary]
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 67e465a7
Loading
Loading
Loading
Loading
+12 −0
Original line number Diff line number Diff line
@@ -32,6 +32,13 @@ extern void page_assign_page_cgroup(struct page *page,
extern struct page_cgroup *page_get_page_cgroup(struct page *page);
extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm);
extern void mem_cgroup_uncharge(struct page_cgroup *pc);
extern void mem_cgroup_move_lists(struct page_cgroup *pc, bool active);
/*
 * Pull pages off one of @mem_cont's LRU lists for reclaim.
 *
 * @nr_to_scan: upper bound on the number of list entries to examine
 * @dst:        list that receives the pages that were isolated
 * @scanned:    out-parameter, set to the number of entries examined
 * @order:      allocation order (accepted for interface compatibility)
 * @mode:       isolation mode, passed through to __isolate_lru_page()
 * @z:          only pages belonging to this zone are isolated
 * @mem_cont:   cgroup whose LRU is scanned
 * @active:     non-zero scans the active list, zero the inactive list
 *
 * Returns the number of pages moved onto @dst.
 */
extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active);

static inline void mem_cgroup_uncharge_page(struct page *page)
{
@@ -71,6 +78,11 @@ static inline void mem_cgroup_uncharge_page(struct page *page)
{
}

/*
 * Empty stand-in for mem_cgroup_move_lists(): there is no per-cgroup LRU
 * to update here, so the call compiles away to nothing.
 * NOTE(review): presumably the !CONFIG_CGROUP_MEM_CONT variant -- confirm
 * against the surrounding #ifdef.
 */
static inline void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
}

#endif /* CONFIG_CGROUP_MEM_CONT */

#endif /* _LINUX_MEMCONTROL_H */
+23 −0
Original line number Diff line number Diff line
@@ -99,4 +99,27 @@ int res_counter_charge(struct res_counter *counter, unsigned long val);
void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
void res_counter_uncharge(struct res_counter *counter, unsigned long val);

/*
 * Report whether @cnt is strictly below its limit.
 *
 * Reads cnt->usage and cnt->limit without taking any lock itself; the
 * "_locked" suffix means the caller is expected to hold cnt->lock.
 *
 * Returns true when usage < limit, false otherwise.
 */
static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
{
	return cnt->usage < cnt->limit;
}

/*
 * Helper to detect whether the counter is still within its limit.
 *
 * Takes cnt->lock with interrupts saved and disabled, samples the
 * usage/limit comparison via res_counter_limit_check_locked(), and
 * releases the lock again.
 *
 * Returns true when usage is below the limit, false otherwise.
 */
static inline bool res_counter_check_under_limit(struct res_counter *cnt)
{
	unsigned long irq_flags;
	bool under_limit;

	spin_lock_irqsave(&cnt->lock, irq_flags);
	under_limit = res_counter_limit_check_locked(cnt);
	spin_unlock_irqrestore(&cnt->lock, irq_flags);

	return under_limit;
}

#endif
+3 −0
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@
#include <linux/linkage.h>
#include <linux/mmzone.h>
#include <linux/list.h>
#include <linux/memcontrol.h>
#include <linux/sched.h>

#include <asm/atomic.h>
@@ -182,6 +183,8 @@ extern void swap_setup(void);
/* linux/mm/vmscan.c */
extern unsigned long try_to_free_pages(struct zone **zones, int order,
					gfp_t gfp_mask);
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem);
extern int __isolate_lru_page(struct page *page, int mode);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
extern int remove_mapping(struct address_space *mapping, struct page *page);
+143 −5
Original line number Diff line number Diff line
@@ -22,10 +22,15 @@
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>

struct cgroup_subsys mem_cgroup_subsys;
/*
 * Initial value of the retry budget in mem_cgroup_charge(): how many
 * times a failing charge waits (congestion_wait) and tries again after
 * reclaim made no progress and the cgroup is still at its limit, before
 * the charge is failed.
 */
static const int MEM_CGROUP_RECLAIM_RETRIES = 5;

/*
 * The memory controller data structure. The memory controller controls both
@@ -51,6 +56,10 @@ struct mem_cgroup {
	 */
	struct list_head active_list;
	struct list_head inactive_list;
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	spinlock_t lru_lock;
};

/*
@@ -141,6 +150,94 @@ void __always_inline unlock_page_cgroup(struct page *page)
	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	if (active)
		list_move(&pc->lru, &pc->mem_cgroup->active_list);
	else
		list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
}

/*
 * This routine assumes that the appropriate zone's lru lock is already held
 */
void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	struct mem_cgroup *mem;
	if (!pc)
		return;

	mem = pc->mem_cgroup;

	spin_lock(&mem->lru_lock);
	__mem_cgroup_move_lists(pc, active);
	spin_unlock(&mem->lru_lock);
}

/*
 * Isolate pages of zone @z from one of @mem_cont's LRU lists.
 *
 * Walks the selected list from its tail, examining at most @nr_to_scan
 * entries.  Entries whose page's Active flag disagrees with the list
 * being walked are moved to the other list and do not count against the
 * scan budget.  Every other entry counts as scanned; if its page belongs
 * to @z and __isolate_lru_page() accepts it, the page is moved onto @dst.
 *
 * @nr_to_scan: upper bound on the number of entries to examine
 * @dst:        list that receives the isolated pages
 * @scanned:    out-parameter, set to the number of entries examined
 * @order:      unused here; kept for interface compatibility
 * @mode:       isolation mode, passed through to __isolate_lru_page()
 * @z:          only pages belonging to this zone are isolated
 * @mem_cont:   cgroup whose LRU is scanned
 * @active:     non-zero scans the active list, zero the inactive list
 *
 * Takes mem_cont->lru_lock with a plain spin_lock().
 * NOTE(review): other users take this lock with the irqsave variant, so
 * callers presumably arrive with interrupts already disabled -- confirm.
 *
 * Returns the number of pages moved onto @dst.
 */
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active)
{
	unsigned long nr_taken = 0;
	unsigned long scan = 0;
	struct page *page;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc;

	if (active)
		src = &mem_cont->active_list;
	else
		src = &mem_cont->inactive_list;

	spin_lock(&mem_cont->lru_lock);
	while (scan < nr_to_scan && !list_empty(src)) {
		/* src is non-empty, so its tail is a real page_cgroup. */
		pc = list_entry(src->prev, struct page_cgroup, lru);
		page = pc->page;

		/*
		 * Entry sits on the wrong list for its page's state: re-file
		 * it.  This removes it from src, so the walk still advances,
		 * and it is deliberately not charged to the scan budget.
		 */
		if (PageActive(page) && !active) {
			__mem_cgroup_move_lists(pc, true);
			continue;
		}
		if (!PageActive(page) && active) {
			__mem_cgroup_move_lists(pc, false);
			continue;
		}

		scan++;

		/*
		 * Park the entry on a private list before any further
		 * filtering.  Leaving a skipped entry at the tail of src
		 * would make every following iteration pick the very same
		 * entry again and burn the whole scan budget on it.
		 */
		list_move(&pc->lru, &pc_list);

		/*
		 * Reclaim, per zone
		 * TODO: make the active/inactive lists per zone
		 */
		if (page_zone(page) != z)
			continue;

		if (__isolate_lru_page(page, mode) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	/* Hand every examined entry back to the list it came from. */
	list_splice(&pc_list, src);
	spin_unlock(&mem_cont->lru_lock);

	*scanned = scan;
	return nr_taken;
}

/*
 * Charge the memory controller for page usage.
 * Return
@@ -151,6 +248,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc, *race_pc;
	unsigned long flags;
	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

	/*
	 * Should page_cgroup's go to their own slab?
@@ -159,13 +258,19 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
	 * to see if the cgroup page already has a page_cgroup associated
	 * with it
	 */
retry:
	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	/*
	 * The page_cgroup exists and the page has already been accounted
	 */
	if (pc) {
		atomic_inc(&pc->ref_cnt);
		if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
			/* this page is under being uncharged ? */
			unlock_page_cgroup(page);
			cpu_relax();
			goto retry;
		} else
			goto done;
	}

@@ -197,7 +302,32 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
	 * If we created the page_cgroup, we should free it on exceeding
	 * the cgroup limit.
	 */
	if (res_counter_charge(&mem->res, 1)) {
	while (res_counter_charge(&mem->res, 1)) {
		if (try_to_free_mem_cgroup_pages(mem))
			continue;

		/*
 		 * try_to_free_mem_cgroup_pages() might not give us a full
 		 * picture of reclaim. Some pages are reclaimed and might be
 		 * moved to swap cache or just unmapped from the cgroup.
 		 * Check the limit again to see if the reclaim reduced the
 		 * current usage of the cgroup before giving up
 		 */
		if (res_counter_check_under_limit(&mem->res))
			continue;
			/*
			 * Since we control both RSS and cache, we end up with a
			 * very interesting scenario where we end up reclaiming
			 * memory (essentially RSS), since the memory is pushed
			 * to swap cache, we eventually end up adding those
			 * pages back to our list. Hence we give ourselves a
			 * few chances before we fail
			 */
		else if (nr_retries--) {
			congestion_wait(WRITE, HZ/10);
			continue;
		}

		css_put(&mem->css);
		goto free_pc;
	}
@@ -221,14 +351,16 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
	pc->page = page;
	page_assign_page_cgroup(page, pc);

	spin_lock_irqsave(&mem->lru_lock, flags);
	list_add(&pc->lru, &mem->active_list);
	spin_unlock_irqrestore(&mem->lru_lock, flags);

done:
	unlock_page_cgroup(page);
	return 0;
free_pc:
	kfree(pc);
	return -ENOMEM;
err:
	unlock_page_cgroup(page);
	return -ENOMEM;
}

@@ -240,6 +372,7 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
{
	struct mem_cgroup *mem;
	struct page *page;
	unsigned long flags;

	if (!pc)
		return;
@@ -252,6 +385,10 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
		page_assign_page_cgroup(page, NULL);
		unlock_page_cgroup(page);
		res_counter_uncharge(&mem->res, 1);

 		spin_lock_irqsave(&mem->lru_lock, flags);
 		list_del_init(&pc->lru);
 		spin_unlock_irqrestore(&mem->lru_lock, flags);
		kfree(pc);
	}
}
@@ -310,6 +447,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
	res_counter_init(&mem->res);
	INIT_LIST_HEAD(&mem->active_list);
	INIT_LIST_HEAD(&mem->inactive_list);
	spin_lock_init(&mem->lru_lock);
	return &mem->css;
}

+2 −0
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;
@@ -175,6 +176,7 @@ void activate_page(struct page *page)
		SetPageActive(page);
		add_page_to_active_list(zone, page);
		__count_vm_event(PGACTIVATE);
		mem_cgroup_move_lists(page_get_page_cgroup(page), true);
	}
	spin_unlock_irq(&zone->lru_lock);
}
Loading