
Commit 6ae11b27 authored by Lee Schermerhorn, committed by Linus Torvalds

hugetlb: add nodemask arg to huge page alloc, free and surplus adjust functions



In preparation for constraining huge page allocation and freeing by the
controlling task's numa mempolicy, add a "nodes_allowed" nodemask pointer
to the allocate, free and surplus adjustment functions.  For now, callers
pass &node_online_map for the default behavior--i.e., round-robin over
all online nodes.  A subsequent patch will derive a non-default mask from
the controlling task's numa mempolicy.

Note that this method of updating the global hstate nr_hugepages under the
constraint of a nodemask simplifies keeping the global state
consistent--especially the number of persistent and surplus pages relative
to reservations and overcommit limits.  There are undoubtedly other ways
to do this, but this works for both interfaces: mempolicy and per node
attributes.
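
To make the round-robin pattern concrete, the following minimal userspace
sketch mimics the two helpers this patch introduces: get_valid_node_allowed()
pulls a possibly stale next-node hint back into the allowed mask, and
next_node_allowed() advances it with wrap.  The 8-node limit, the plain
bitmask standing in for nodemask_t, and the main() driver are illustrative
assumptions only--this is not the kernel implementation.

#include <stdio.h>

#define MAX_NUMNODES 8			/* assumed toy system size */

typedef unsigned int nodemask_t;	/* stand-in: one bit per node */

static int node_isset(int nid, nodemask_t mask)
{
	return (mask >> nid) & 1;
}

/* next set bit after nid, wrapping at MAX_NUMNODES; mask must be non-empty */
static int next_node_allowed(int nid, nodemask_t mask)
{
	do {
		nid = (nid + 1) % MAX_NUMNODES;
	} while (!node_isset(nid, mask));
	return nid;
}

/* if the cached hint fell outside the mask, pull it back in */
static int get_valid_node_allowed(int nid, nodemask_t mask)
{
	if (!node_isset(nid, mask))
		nid = next_node_allowed(nid, mask);
	return nid;
}

int main(void)
{
	nodemask_t nodes_allowed = 0x2a;	/* nodes 1, 3 and 5 */
	int next_nid_to_alloc = 0;		/* stale hint, outside mask */
	int i;

	/* six "allocations" interleave over the allowed nodes: 1 3 5 1 3 5 */
	for (i = 0; i < 6; i++) {
		int nid = get_valid_node_allowed(next_nid_to_alloc,
						 nodes_allowed);
		next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
		printf("allocation %d -> node %d\n", i, nid);
	}
	return 0;
}

The kernel versions differ in detail (they take nodemask_t * and use
next_node()/first_node() from nodemask.h), but the interlace behavior
shown here is the same.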

[rientjes@google.com: fix HIGHMEM compile error]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Reviewed-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Andi Kleen <andi@firstfloor.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Eric Whitney <eric.whitney@hp.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 9a76db09
+72 −53
@@ -622,48 +622,56 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 }
 
 /*
- * common helper function for hstate_next_node_to_{alloc|free}.
- * return next node in node_online_map, wrapping at end.
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed.  Ensure that we use an allowed
+ * node for alloc or free.
  */
-static int next_node_allowed(int nid)
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
 {
-	nid = next_node(nid, node_online_map);
+	nid = next_node(nid, *nodes_allowed);
 	if (nid == MAX_NUMNODES)
-		nid = first_node(node_online_map);
+		nid = first_node(*nodes_allowed);
 	VM_BUG_ON(nid >= MAX_NUMNODES);
 
 	return nid;
 }
 
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+	if (!node_isset(nid, *nodes_allowed))
+		nid = next_node_allowed(nid, nodes_allowed);
+	return nid;
+}
+
 /*
- * Use a helper variable to find the next node and then
- * copy it back to next_nid_to_alloc afterwards:
- * otherwise there's a window in which a racer might
- * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
- * But we don't need to use a spin_lock here: it really
- * doesn't matter if occasionally a racer chooses the
- * same nid as we do.  Move nid forward in the mask even
- * if we just successfully allocated a hugepage so that
- * the next caller gets hugepages on the next node.
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
  */
-static int hstate_next_node_to_alloc(struct hstate *h)
+static int hstate_next_node_to_alloc(struct hstate *h,
+					nodemask_t *nodes_allowed)
 {
-	int nid, next_nid;
+	int nid;
+
+	VM_BUG_ON(!nodes_allowed);
+
+	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
 
-	nid = h->next_nid_to_alloc;
-	next_nid = next_node_allowed(nid);
-	h->next_nid_to_alloc = next_nid;
 	return nid;
 }
 
-static int alloc_fresh_huge_page(struct hstate *h)
+static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
 {
 	struct page *page;
 	int start_nid;
 	int next_nid;
 	int ret = 0;
 
-	start_nid = hstate_next_node_to_alloc(h);
+	start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
 	next_nid = start_nid;
 
 	do {
@@ -672,7 +680,7 @@ static int alloc_fresh_huge_page(struct hstate *h)
 			ret = 1;
 			break;
 		}
-		next_nid = hstate_next_node_to_alloc(h);
+		next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
 	} while (next_nid != start_nid);
 
 	if (ret)
@@ -684,18 +692,20 @@ static int alloc_fresh_huge_page(struct hstate *h)
 }
 
 /*
- * helper for free_pool_huge_page() - return the next node
- * from which to free a huge page.  Advance the next node id
- * whether or not we find a free huge page to free so that the
- * next attempt to free addresses the next node.
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page.  Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
  */
-static int hstate_next_node_to_free(struct hstate *h)
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
 {
-	int nid, next_nid;
+	int nid;
+
+	VM_BUG_ON(!nodes_allowed);
+
+	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
 
-	nid = h->next_nid_to_free;
-	next_nid = next_node_allowed(nid);
-	h->next_nid_to_free = next_nid;
 	return nid;
 }
 
@@ -705,13 +715,14 @@ static int hstate_next_node_to_free(struct hstate *h)
  * balanced over allowed nodes.
  * Called with hugetlb_lock locked.
  */
-static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
+static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+						 bool acct_surplus)
 {
 	int start_nid;
 	int next_nid;
 	int ret = 0;
 
-	start_nid = hstate_next_node_to_free(h);
+	start_nid = hstate_next_node_to_free(h, nodes_allowed);
 	next_nid = start_nid;
 
 	do {
@@ -735,7 +746,7 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
 			ret = 1;
 			break;
 		}
-		next_nid = hstate_next_node_to_free(h);
+		next_nid = hstate_next_node_to_free(h, nodes_allowed);
 	} while (next_nid != start_nid);
 
 	return ret;
@@ -937,7 +948,7 @@ static void return_unused_surplus_pages(struct hstate *h,
 	 * on-line nodes for us and will handle the hstate accounting.
 	 */
 	while (nr_pages--) {
-		if (!free_pool_huge_page(h, 1))
+		if (!free_pool_huge_page(h, &node_online_map, 1))
 			break;
 	}
 }
@@ -1047,7 +1058,8 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 		void *addr;
 
 		addr = __alloc_bootmem_node_nopanic(
-				NODE_DATA(hstate_next_node_to_alloc(h)),
+				NODE_DATA(hstate_next_node_to_alloc(h,
+							&node_online_map)),
 				huge_page_size(h), huge_page_size(h), 0);
 
 		if (addr) {
@@ -1102,7 +1114,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 		if (h->order >= MAX_ORDER) {
 			if (!alloc_bootmem_huge_page(h))
 				break;
-		} else if (!alloc_fresh_huge_page(h))
+		} else if (!alloc_fresh_huge_page(h, &node_online_map))
 			break;
 	}
 	h->max_huge_pages = i;
@@ -1144,14 +1156,15 @@ static void __init report_hugepages(void)
 }
 
 #ifdef CONFIG_HIGHMEM
-static void try_to_free_low(struct hstate *h, unsigned long count)
+static void try_to_free_low(struct hstate *h, unsigned long count,
+						nodemask_t *nodes_allowed)
 {
 	int i;
 
 	if (h->order >= MAX_ORDER)
 		return;
 
-	for (i = 0; i < MAX_NUMNODES; ++i) {
+	for_each_node_mask(i, *nodes_allowed) {
 		struct page *page, *next;
 		struct list_head *freel = &h->hugepage_freelists[i];
 		list_for_each_entry_safe(page, next, freel, lru) {
@@ -1167,7 +1180,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
 	}
 }
 #else
-static inline void try_to_free_low(struct hstate *h, unsigned long count)
+static inline void try_to_free_low(struct hstate *h, unsigned long count,
+						nodemask_t *nodes_allowed)
 {
 }
 #endif
@@ -1177,7 +1191,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
  * balanced by operating on them in a round-robin fashion.
  * Returns 1 if an adjustment was made.
  */
-static int adjust_pool_surplus(struct hstate *h, int delta)
+static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
+				int delta)
 {
 	int start_nid, next_nid;
 	int ret = 0;
@@ -1185,9 +1200,9 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
 	VM_BUG_ON(delta != -1 && delta != 1);
 
 	if (delta < 0)
-		start_nid = hstate_next_node_to_alloc(h);
+		start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
 	else
-		start_nid = hstate_next_node_to_free(h);
+		start_nid = hstate_next_node_to_free(h, nodes_allowed);
 	next_nid = start_nid;
 
 	do {
@@ -1197,7 +1212,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
 			 * To shrink on this node, there must be a surplus page
 			 */
 			if (!h->surplus_huge_pages_node[nid]) {
-				next_nid = hstate_next_node_to_alloc(h);
+				next_nid = hstate_next_node_to_alloc(h,
+								nodes_allowed);
 				continue;
 			}
 		}
@@ -1207,7 +1223,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
 			 */
 			if (h->surplus_huge_pages_node[nid] >=
 						h->nr_huge_pages_node[nid]) {
-				next_nid = hstate_next_node_to_free(h);
+				next_nid = hstate_next_node_to_free(h,
+								nodes_allowed);
 				continue;
 			}
 		}
@@ -1222,7 +1239,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
 }
 
 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
+static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
+						nodemask_t *nodes_allowed)
 {
 	unsigned long min_count, ret;
 
@@ -1242,7 +1260,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 	 */
 	spin_lock(&hugetlb_lock);
 	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
-		if (!adjust_pool_surplus(h, -1))
+		if (!adjust_pool_surplus(h, nodes_allowed, -1))
 			break;
 	}
 
@@ -1253,7 +1271,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 		 * and reducing the surplus.
 		 */
 		spin_unlock(&hugetlb_lock);
-		ret = alloc_fresh_huge_page(h);
+		ret = alloc_fresh_huge_page(h, nodes_allowed);
 		spin_lock(&hugetlb_lock);
 		if (!ret)
 			goto out;
@@ -1277,13 +1295,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 	 */
 	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
 	min_count = max(count, min_count);
-	try_to_free_low(h, min_count);
+	try_to_free_low(h, min_count, nodes_allowed);
 	while (min_count < persistent_huge_pages(h)) {
-		if (!free_pool_huge_page(h, 0))
+		if (!free_pool_huge_page(h, nodes_allowed, 0))
 			break;
 	}
 	while (count < persistent_huge_pages(h)) {
-		if (!adjust_pool_surplus(h, 1))
+		if (!adjust_pool_surplus(h, nodes_allowed, 1))
 			break;
 	}
 out:
@@ -1329,7 +1347,7 @@ static ssize_t nr_hugepages_store(struct kobject *kobj,
 	if (err)
 		return 0;
 
-	h->max_huge_pages = set_max_huge_pages(h, input);
+	h->max_huge_pages = set_max_huge_pages(h, input, &node_online_map);
 
 	return count;
 }
@@ -1571,7 +1589,8 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 	proc_doulongvec_minmax(table, write, buffer, length, ppos);
 
 	if (write)
-		h->max_huge_pages = set_max_huge_pages(h, tmp);
+		h->max_huge_pages = set_max_huge_pages(h, tmp,
+							&node_online_map);
 
 	return 0;
 }