Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d9713679 authored by Lai Jiangshan's avatar Lai Jiangshan Committed by Linus Torvalds
Browse files

memory_hotplug: fix possible incorrect node_states[N_NORMAL_MEMORY]



Currently memory_hotplug only manages the node_states[N_HIGH_MEMORY], it
forgets to manage node_states[N_NORMAL_MEMORY].  This may cause
node_states[N_NORMAL_MEMORY] to become incorrect.

For example, if a node is empty before online and we online memory which is
in ZONE_NORMAL, then after online node_states[N_HIGH_MEMORY] is correct
but node_states[N_NORMAL_MEMORY] is incorrect: the online code doesn't add
the newly onlined node to node_states[N_NORMAL_MEMORY].

The same thing will happen when offlining (the offline code doesn't clear
the node from node_states[N_NORMAL_MEMORY] when needed).  Some memory
management code depends on node_states[N_NORMAL_MEMORY], so we have to fix
up node_states[N_NORMAL_MEMORY].

We add node_states_check_changes_online() and
node_states_check_changes_offline() to detect whether
node_states[N_HIGH_MEMORY] and node_states[N_NORMAL_MEMORY] are changed
while hotplugging.

Also add @status_change_nid_normal to struct memory_notify, so that the
memory hotplug callbacks know whether node_states[N_NORMAL_MEMORY] is
changed.  (We could add a @flags and reuse @status_change_nid instead of
introducing @status_change_nid_normal, but it would add much more
complexity to the memory hotplug callback in every subsystem.  So introducing
@status_change_nid_normal is better and it doesn't change the semantics of
@status_change_nid.)

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Rob Landley <rob@landley.net>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 6dcd73d7
Loading
Loading
Loading
Loading
+4 −1
Original line number Original line Diff line number Diff line
@@ -377,15 +377,18 @@ The third argument is passed by pointer of struct memory_notify.
struct memory_notify {
struct memory_notify {
       unsigned long start_pfn;
       unsigned long start_pfn;
       unsigned long nr_pages;
       unsigned long nr_pages;
       int status_change_nid_normal;
       int status_change_nid;
       int status_change_nid;
}
}


start_pfn is start_pfn of online/offline memory.
start_pfn is start_pfn of online/offline memory.
nr_pages is # of pages of online/offline memory.
nr_pages is # of pages of online/offline memory.
status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
is (will be) set/clear, if this is -1, then nodemask status is not changed.
status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be)
status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be)
set/clear. It means a new(memoryless) node gets new memory by online and a
set/clear. It means a new(memoryless) node gets new memory by online and a
node loses all memory. If this is -1, then nodemask status is not changed.
node loses all memory. If this is -1, then nodemask status is not changed.
If status_changed_nid >= 0, callback should create/discard structures for the
If status_change_nid* >= 0, callback should create/discard structures for the
node if necessary.
node if necessary.


--------------
--------------
+1 −0
Original line number Original line Diff line number Diff line
@@ -53,6 +53,7 @@ int arch_get_memory_phys_device(unsigned long start_pfn);
struct memory_notify {
struct memory_notify {
	unsigned long start_pfn;
	unsigned long start_pfn;
	unsigned long nr_pages;
	unsigned long nr_pages;
	int status_change_nid_normal;
	int status_change_nid;
	int status_change_nid;
};
};


+120 −16
Original line number Original line Diff line number Diff line
@@ -460,6 +460,53 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
	return 0;
	return 0;
}
}


/* check which state of node_states will be changed when online memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
	 * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
	 * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * if the memory to be online is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before online, we will
	 * need to set the node to node_states[N_NORMAL_MEMORY] after
	 * the memory is online.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

	/*
	 * if the node don't have memory befor online, we will need to
	 * set the node to node_states[N_HIGH_MEMORY] after the memory
	 * is online.
	 */
	if (!node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

/*
 * Apply the node state changes for @node after memory has been onlined.
 *
 * N_NORMAL_MEMORY is set only when @arg recorded a pending change for it
 * (status_change_nid_normal >= 0); N_HIGH_MEMORY is set unconditionally.
 */
static void node_states_set_node(int node, struct memory_notify *arg)
{
	const int normal_nid = arg->status_change_nid_normal;

	if (normal_nid >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	node_set_state(node, N_HIGH_MEMORY);
}



int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
{
{
@@ -471,13 +518,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
	struct memory_notify arg;
	struct memory_notify arg;


	lock_memory_hotplug();
	lock_memory_hotplug();
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	arg.start_pfn = pfn;
	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	arg.nr_pages = nr_pages;
	arg.status_change_nid = -1;
	node_states_check_changes_online(nr_pages, zone, &arg);


	nid = page_to_nid(pfn_to_page(pfn));
	nid = page_to_nid(pfn_to_page(pfn));
	if (node_present_pages(nid) == 0)
		arg.status_change_nid = nid;


	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	ret = notifier_to_errno(ret);
@@ -486,12 +538,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
		unlock_memory_hotplug();
		unlock_memory_hotplug();
		return ret;
		return ret;
	}
	}
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));
	/*
	/*
	 * If this zone is not populated, then it is not in zonelist.
	 * If this zone is not populated, then it is not in zonelist.
	 * This means the page allocator ignores this zone.
	 * This means the page allocator ignores this zone.
@@ -521,7 +567,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
	zone->present_pages += onlined_pages;
	zone->present_pages += onlined_pages;
	zone->zone_pgdat->node_present_pages += onlined_pages;
	zone->zone_pgdat->node_present_pages += onlined_pages;
	if (onlined_pages) {
	if (onlined_pages) {
		node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
		node_states_set_node(zone_to_nid(zone), &arg);
		if (need_zonelists_rebuild)
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
			build_all_zonelists(NULL, NULL);
		else
		else
@@ -871,6 +917,67 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
	return offlined;
	return offlined;
}
}


/* check which state of node_states will be changed when offline memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt, zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
	 * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
	 * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offline is in a zone of 0...zone_last,
	 * and it is the last present memory, 0...zone_last will
	 * become empty after offline , thus we can determind we will
	 * need to clear the node from node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);
	else
		arg->status_change_nid_normal = -1;

	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
	 */
	zone_last = ZONE_MOVABLE;

	/*
	 * check whether node_states[N_HIGH_MEMORY] will be changed
	 * If we try to offline the last present @nr_pages from the node,
	 * we can determind we will need to clear the node from
	 * node_states[N_HIGH_MEMORY].
	 */
	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
	else
		arg->status_change_nid = -1;
}

/*
 * Apply the node state changes for @node after memory has been offlined.
 *
 * Each state is cleared only when @arg recorded a pending change for it.
 * When N_HIGH_MEMORY and N_NORMAL_MEMORY are the same state, the first
 * clear is the only one performed.
 */
static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	/* Nothing separate to clear when both states are the same one. */
	if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
		return;

	if (arg->status_change_nid < 0)
		return;

	node_clear_state(node, N_HIGH_MEMORY);
}

static int __ref __offline_pages(unsigned long start_pfn,
static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
		  unsigned long end_pfn, unsigned long timeout)
{
{
@@ -905,9 +1012,7 @@ static int __ref __offline_pages(unsigned long start_pfn,


	arg.start_pfn = start_pfn;
	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	arg.nr_pages = nr_pages;
	arg.status_change_nid = -1;
	node_states_check_changes_offline(nr_pages, zone, &arg);
	if (nr_pages >= node_present_pages(node))
		arg.status_change_nid = node;


	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	ret = notifier_to_errno(ret);
@@ -980,10 +1085,9 @@ repeat:
	} else
	} else
		zone_pcp_update(zone);
		zone_pcp_update(zone);


	if (!node_present_pages(node)) {
	node_states_clear_node(node, &arg);
		node_clear_state(node, N_HIGH_MEMORY);
	if (arg.status_change_nid >= 0)
		kswapd_stop(node);
		kswapd_stop(node);
	}


	vm_total_pages = nr_free_pagecache_pages();
	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();
	writeback_set_ratelimit();