
Commit 1d82de61 authored by Mel Gorman, committed by Linus Torvalds

mm, vmscan: make kswapd reclaim in terms of nodes

Patch "mm: vmscan: Begin reclaiming pages on a per-node basis" started
thinking of reclaim in terms of nodes but kswapd is still zone-centric.
This patch gets rid of many of the node-based versus zone-based
decisions.

o A node is considered balanced when any eligible lower zone is balanced.
  This eliminates one class of age-inversion problem because we avoid
  reclaiming a newer page just because it's in the wrong zone
o pgdat_balanced disappears because we now only care about one zone being
  balanced.
o Some anomalies related to writeback and congestion tracking being based on
  zones disappear.
o kswapd no longer has to take care to reclaim zones in the reverse order
  that the page allocator uses.
o Most importantly of all, reclaim from node 0 with multiple zones will
  have similar aging and reclaiming characteristics as every
  other node.
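
To make the first bullet concrete, here is a minimal standalone C sketch (an editorial illustration, not part of the patch): it mirrors the "a node is balanced as soon as any eligible zone at or below classzone_idx is balanced" loop that this patch adds to prepare_kswapd_sleep() and balance_pgdat(), but it uses made-up fake_zone/fake_node types and a plain free-pages-versus-high-watermark comparison in place of the kernel's zone_watermark_ok_safe().

/*
 * Userspace sketch of the node-level balance decision described above.
 * Types, field names and the watermark test are simplified stand-ins,
 * not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_ZONES 4

struct fake_zone {
	bool populated;
	unsigned long free_pages;
	unsigned long high_wmark;	/* stand-in for high_wmark_pages(zone) */
};

struct fake_node {
	struct fake_zone zones[MAX_ZONES];	/* index 0 = lowest zone (DMA) */
};

/* Simplified stand-in for zone_balanced(): free pages meet the high mark. */
static bool zone_balanced(const struct fake_zone *zone)
{
	return zone->free_pages >= zone->high_wmark;
}

/* Node decision: any populated zone eligible for the caller will do. */
static bool node_balanced(const struct fake_node *node, int classzone_idx)
{
	int i;

	for (i = 0; i <= classzone_idx; i++) {
		const struct fake_zone *zone = &node->zones[i];

		if (!zone->populated)
			continue;
		if (zone_balanced(zone))
			return true;
	}
	return false;
}

int main(void)
{
	/* DMA zone short of its watermark, Normal zone comfortably above it. */
	struct fake_node node = {
		.zones = {
			{ .populated = true,  .free_pages = 10,   .high_wmark = 64  },
			{ .populated = true,  .free_pages = 4096, .high_wmark = 512 },
			{ .populated = false },
			{ .populated = false },
		},
	};

	printf("classzone_idx=1: %s\n", node_balanced(&node, 1) ? "balanced" : "unbalanced");
	printf("classzone_idx=0: %s\n", node_balanced(&node, 0) ? "balanced" : "unbalanced");
	return 0;
}

With these sample numbers the node reports balanced for classzone_idx = 1 (the Normal zone clears its mark) but unbalanced for a DMA-constrained caller with classzone_idx = 0, which is the per-caller eligibility the bullets describe.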

Link: http://lkml.kernel.org/r/1467970510-21195-8-git-send-email-mgorman@techsingularity.net


Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent f7b60926
+101 −191
@@ -2980,7 +2980,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 }
 #endif
 
-static void age_active_anon(struct zone *zone, struct scan_control *sc)
+static void age_active_anon(struct pglist_data *pgdat,
+				struct zone *zone, struct scan_control *sc)
 {
 	struct mem_cgroup *memcg;
 
@@ -2999,84 +3000,14 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
 	} while (memcg);
 }
 
-static bool zone_balanced(struct zone *zone, int order, bool highorder,
+static bool zone_balanced(struct zone *zone, int order,
 			unsigned long balance_gap, int classzone_idx)
 {
 	unsigned long mark = high_wmark_pages(zone) + balance_gap;
 
-	/*
-	 * When checking from pgdat_balanced(), kswapd should stop and sleep
-	 * when it reaches the high order-0 watermark and let kcompactd take
-	 * over. Other callers such as wakeup_kswapd() want to determine the
-	 * true high-order watermark.
-	 */
-	if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
-		mark += (1UL << order);
-		order = 0;
-	}
-
 	return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
 }
 
-/*
- * pgdat_balanced() is used when checking if a node is balanced.
- *
- * For order-0, all zones must be balanced!
- *
- * For high-order allocations only zones that meet watermarks and are in a
- * zone allowed by the callers classzone_idx are added to balanced_pages. The
- * total of balanced pages must be at least 25% of the zones allowed by
- * classzone_idx for the node to be considered balanced. Forcing all zones to
- * be balanced for high orders can cause excessive reclaim when there are
- * imbalanced zones.
- * The choice of 25% is due to
- *   o a 16M DMA zone that is balanced will not balance a zone on any
- *     reasonable sized machine
- *   o On all other machines, the top zone must be at least a reasonable
- *     percentage of the middle zones. For example, on 32-bit x86, highmem
- *     would need to be at least 256M for it to be balance a whole node.
- *     Similarly, on x86-64 the Normal zone would need to be at least 1G
- *     to balance a node on its own. These seemed like reasonable ratios.
- */
-static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
-{
-	unsigned long managed_pages = 0;
-	unsigned long balanced_pages = 0;
-	int i;
-
-	/* Check the watermark levels */
-	for (i = 0; i <= classzone_idx; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		if (!populated_zone(zone))
-			continue;
-
-		managed_pages += zone->managed_pages;
-
-		/*
-		 * A special case here:
-		 *
-		 * balance_pgdat() skips over all_unreclaimable after
-		 * DEF_PRIORITY. Effectively, it considers them balanced so
-		 * they must be considered balanced here as well!
-		 */
-		if (!pgdat_reclaimable(zone->zone_pgdat)) {
-			balanced_pages += zone->managed_pages;
-			continue;
-		}
-
-		if (zone_balanced(zone, order, false, 0, i))
-			balanced_pages += zone->managed_pages;
-		else if (!order)
-			return false;
-	}
-
-	if (order)
-		return balanced_pages >= (managed_pages >> 2);
-	else
-		return true;
-}
-
 /*
  * Prepare kswapd for sleeping. This verifies that there are no processes
  * waiting in throttle_direct_reclaim() and that watermarks have been met.
@@ -3086,6 +3017,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 					int classzone_idx)
 {
+	int i;
+
 	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
 	if (remaining)
 		return false;
@@ -3106,101 +3039,90 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 	if (waitqueue_active(&pgdat->pfmemalloc_wait))
 		wake_up_all(&pgdat->pfmemalloc_wait);
 
-	return pgdat_balanced(pgdat, order, classzone_idx);
+	for (i = 0; i <= classzone_idx; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (zone_balanced(zone, order, 0, classzone_idx))
+			return true;
+	}
+
+	return false;
 }
 
 /*
- * kswapd shrinks the zone by the number of pages required to reach
- * the high watermark.
+ * kswapd shrinks a node of pages that are at or below the highest usable
+ * zone that is currently unbalanced.
  *
  * Returns true if kswapd scanned at least the requested number of pages to
  * reclaim or if the lack of progress was due to pages under writeback.
  * This is used to determine if the scanning priority needs to be raised.
  */
-static bool kswapd_shrink_zone(struct zone *zone,
+static bool kswapd_shrink_node(pg_data_t *pgdat,
 			       int classzone_idx,
 			       struct scan_control *sc)
 {
-	unsigned long balance_gap;
-	bool lowmem_pressure;
-	struct pglist_data *pgdat = zone->zone_pgdat;
+	struct zone *zone;
+	int z;
 
-	/* Reclaim above the high watermark. */
-	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
+	/* Reclaim a number of pages proportional to the number of zones */
+	sc->nr_to_reclaim = 0;
+	for (z = 0; z <= classzone_idx; z++) {
+		zone = pgdat->node_zones + z;
+		if (!populated_zone(zone))
+			continue;
 
-	/*
-	 * We put equal pressure on every zone, unless one zone has way too
-	 * many pages free already. The "too many pages" is defined as the
-	 * high wmark plus a "gap" where the gap is either the low
-	 * watermark or 1% of the zone, whichever is smaller.
-	 */
-	balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
-			zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
+		sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
+	}
 
 	/*
-	 * If there is no low memory pressure or the zone is balanced then no
-	 * reclaim is necessary
+	 * Historically care was taken to put equal pressure on all zones but
+	 * now pressure is applied based on node LRU order.
 	 */
-	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
-	if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
-						balance_gap, classzone_idx))
-		return true;
-
-	shrink_node(zone->zone_pgdat, sc, classzone_idx);
-
-	/* TODO: ANOMALY */
-	clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
+	shrink_node(pgdat, sc, classzone_idx);
 
 	/*
-	 * If a zone reaches its high watermark, consider it to be no longer
-	 * congested. It's possible there are dirty pages backed by congested
-	 * BDIs but as pressure is relieved, speculatively avoid congestion
-	 * waits.
+	 * Fragmentation may mean that the system cannot be rebalanced for
+	 * high-order allocations. If twice the allocation size has been
+	 * reclaimed then recheck watermarks only at order-0 to prevent
+	 * excessive reclaim. Assume that a process requested a high-order
+	 * can direct reclaim/compact.
 	 */
-	if (pgdat_reclaimable(zone->zone_pgdat) &&
-	    zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
-		clear_bit(PGDAT_CONGESTED, &pgdat->flags);
-		clear_bit(PGDAT_DIRTY, &pgdat->flags);
-	}
+	if (sc->order && sc->nr_reclaimed >= 2UL << sc->order)
+		sc->order = 0;
 
 	return sc->nr_scanned >= sc->nr_to_reclaim;
 }
 
 /*
- * For kswapd, balance_pgdat() will work across all this node's zones until
- * they are all at high_wmark_pages(zone).
- *
- * Returns the highest zone idx kswapd was reclaiming at
+ * For kswapd, balance_pgdat() will reclaim pages across a node from zones
+ * that are eligible for use by the caller until at least one zone is
+ * balanced.
  *
- * There is special handling here for zones which are full of pinned pages.
- * This can happen if the pages are all mlocked, or if they are all used by
- * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
- * What we do is to detect the case where all pages in the zone have been
- * scanned twice and there has been zero successful reclaim.  Mark the zone as
- * dead and from now on, only perform a short scan.  Basically we're polling
- * the zone for when the problem goes away.
+ * Returns the order kswapd finished reclaiming at.
  *
  * kswapd scans the zones in the highmem->normal->dma direction.  It skips
  * zones which have free_pages > high_wmark_pages(zone), but once a zone is
- * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
- * lower zones regardless of the number of free pages in the lower zones. This
- * interoperates with the page allocator fallback scheme to ensure that aging
- * of pages is balanced across the zones.
+ * found to have free_pages <= high_wmark_pages(zone), any page is that zone
+ * or lower is eligible for reclaim until at least one usable zone is
+ * balanced.
  */
 static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 {
 	int i;
-	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
+	struct zone *zone;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
-		.reclaim_idx = MAX_NR_ZONES - 1,
 		.order = order,
 		.priority = DEF_PRIORITY,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.reclaim_idx = classzone_idx,
 	};
 	count_vm_event(PAGEOUTRUN);
 
@@ -3211,21 +3133,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 
 		/* Scan from the highest requested zone to dma */
 		for (i = classzone_idx; i >= 0; i--) {
-			struct zone *zone = pgdat->node_zones + i;
-
+			zone = pgdat->node_zones + i;
 			if (!populated_zone(zone))
 				continue;
 
-			if (sc.priority != DEF_PRIORITY &&
-			    !pgdat_reclaimable(zone->zone_pgdat))
-				continue;
-
-			/*
-			 * Do some background aging of the anon list, to give
-			 * pages a chance to be referenced before reclaiming.
-			 */
-			age_active_anon(zone, &sc);
-
 			/*
 			 * If the number of buffer_heads in the machine
 			 * exceeds the maximum allowed level and this node
@@ -3233,19 +3144,17 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 			 * it to relieve lowmem pressure.
 			 */
 			if (buffer_heads_over_limit && is_highmem_idx(i)) {
-				end_zone = i;
+				classzone_idx = i;
 				break;
 			}
 
-			if (!zone_balanced(zone, order, false, 0, 0)) {
-				end_zone = i;
+			if (!zone_balanced(zone, order, 0, 0)) {
+				classzone_idx = i;
 				break;
 			} else {
 				/*
-				 * If balanced, clear the dirty and congested
-				 * flags
-				 *
-				 * TODO: ANOMALY
+				 * If any eligible zone is balanced then the
+				 * node is not considered congested or dirty.
 				 */
 				clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
 				clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
@@ -3256,51 +3165,34 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 			goto out;
 
 		/*
-		 * If we're getting trouble reclaiming, start doing writepage
-		 * even in laptop mode.
+		 * Do some background aging of the anon list, to give
+		 * pages a chance to be referenced before reclaiming. All
+		 * pages are rotated regardless of classzone as this is
+		 * about consistent aging.
 		 */
-		if (sc.priority < DEF_PRIORITY - 2)
-			sc.may_writepage = 1;
+		age_active_anon(pgdat, &pgdat->node_zones[MAX_NR_ZONES - 1], &sc);
 
 		/*
-		 * Continue scanning in the highmem->dma direction stopping at
-		 * the last zone which needs scanning. This may reclaim lowmem
-		 * pages that are not necessary for zone balancing but it
-		 * preserves LRU ordering. It is assumed that the bulk of
-		 * allocation requests can use arbitrary zones with the
-		 * possible exception of big highmem:lowmem configurations.
+		 * If we're getting trouble reclaiming, start doing writepage
+		 * even in laptop mode.
 		 */
-		for (i = end_zone; i >= 0; i--) {
-			struct zone *zone = pgdat->node_zones + i;
-
-			if (!populated_zone(zone))
-				continue;
-
-			if (sc.priority != DEF_PRIORITY &&
-			    !pgdat_reclaimable(zone->zone_pgdat))
-				continue;
+		if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
+			sc.may_writepage = 1;
 
-			sc.nr_scanned = 0;
-			sc.reclaim_idx = i;
-
-			nr_soft_scanned = 0;
-			/*
-			 * Call soft limit reclaim before calling shrink_zone.
-			 */
-			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
-							order, sc.gfp_mask,
-							&nr_soft_scanned);
-			sc.nr_reclaimed += nr_soft_reclaimed;
+		/* Call soft limit reclaim before calling shrink_node. */
+		sc.nr_scanned = 0;
+		nr_soft_scanned = 0;
+		nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, sc.order,
+						sc.gfp_mask, &nr_soft_scanned);
+		sc.nr_reclaimed += nr_soft_reclaimed;
 
-			/*
-			 * There should be no need to raise the scanning
-			 * priority if enough pages are already being scanned
-			 * that that high watermark would be met at 100%
-			 * efficiency.
-			 */
-			if (kswapd_shrink_zone(zone, end_zone, &sc))
-				raise_priority = false;
-		}
+		/*
+		 * There should be no need to raise the scanning priority if
+		 * enough pages are already being scanned that that high
+		 * watermark would be met at 100% efficiency.
+		 */
+		if (kswapd_shrink_node(pgdat, classzone_idx, &sc))
+			raise_priority = false;
 
 		/*
 		 * If the low watermark is met there is no need for processes
@@ -3315,21 +3207,38 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		if (try_to_freeze() || kthread_should_stop())
 			break;
 
+		/*
+		 * Stop reclaiming if any eligible zone is balanced and clear
+		 * node writeback or congested.
+		 */
+		for (i = 0; i <= classzone_idx; i++) {
+			zone = pgdat->node_zones + i;
+			if (!populated_zone(zone))
+				continue;
+
+			if (zone_balanced(zone, sc.order, 0, classzone_idx)) {
+				clear_bit(PGDAT_CONGESTED, &pgdat->flags);
+				clear_bit(PGDAT_DIRTY, &pgdat->flags);
+				goto out;
+			}
+		}
+
 		/*
 		 * Raise priority if scanning rate is too low or there was no
 		 * progress in reclaiming pages
 		 */
 		if (raise_priority || !sc.nr_reclaimed)
 			sc.priority--;
-	} while (sc.priority >= 1 &&
-			!pgdat_balanced(pgdat, order, classzone_idx));
+	} while (sc.priority >= 1);
 
 out:
 	/*
-	 * Return the highest zone idx we were reclaiming at so
-	 * prepare_kswapd_sleep() makes the same decisions as here.
+	 * Return the order kswapd stopped reclaiming at as
+	 * prepare_kswapd_sleep() takes it into account. If another caller
+	 * entered the allocator slow path while kswapd was awake, order will
+	 * remain at the higher level.
 	 */
-	return end_zone;
+	return sc.order;
 }
 
 static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
@@ -3486,8 +3395,9 @@ static int kswapd(void *p)
 		 */
 		if (!ret) {
 			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-			balanced_classzone_idx = balance_pgdat(pgdat, order,
-								classzone_idx);
+
+			/* return value ignored until next patch */
+			balance_pgdat(pgdat, order, classzone_idx);
 		}
 	}
 
@@ -3517,7 +3427,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	}
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
-	if (zone_balanced(zone, order, true, 0, 0))
+	if (zone_balanced(zone, order, 0, 0))
 		return;
 
 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);