Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 28a6deec authored by qctecmdr's avatar qctecmdr Committed by Gerrit - the friendly Code Review server
Browse files

Merge "mm: do not shrink pages marked for reclaim by MADV_FREE"

parents 94b3cc1b 482f5ddd
Loading
Loading
Loading
Loading
+76 −7
Original line number Diff line number Diff line
@@ -1638,15 +1638,17 @@ const struct file_operations proc_pagemap_operations = {
static int reclaim_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	struct reclaim_param *rp = walk->private;
	struct vm_area_struct *vma = rp->vma;
	pte_t *pte, ptent;
	spinlock_t *ptl;
	struct page *page;
	LIST_HEAD(page_list);
	int isolated;
	int reclaimed;

	split_huge_pmd(vma, addr, pmd);
	if (pmd_trans_unstable(pmd))
	if (pmd_trans_unstable(pmd) || !rp->nr_to_reclaim)
		return 0;
cont:
	isolated = 0;
@@ -1663,16 +1665,34 @@ static int reclaim_pte_range(pmd_t *pmd, unsigned long addr,
		if (isolate_lru_page(page))
			continue;

		/* MADV_FREE clears pte dirty bit and then marks the page
		 * lazyfree (clear SwapBacked). Inbetween if this lazyfreed page
		 * is touched by user then it becomes dirty.  PPR in
		 * shrink_page_list in try_to_unmap finds the page dirty, marks
		 * it back as PageSwapBacked and skips reclaim. This can cause
		 * isolated count mismatch.
		 */
		if (PageAnon(page) && !PageSwapBacked(page)) {
			putback_lru_page(page);
			continue;
		}

		list_add(&page->lru, &page_list);
		inc_node_page_state(page, NR_ISOLATED_ANON +
				page_is_file_cache(page));
		isolated++;
		if (isolated >= SWAP_CLUSTER_MAX)
		rp->nr_scanned++;
		if ((isolated >= SWAP_CLUSTER_MAX) || !rp->nr_to_reclaim)
			break;
	}
	pte_unmap_unlock(pte - 1, ptl);
	reclaim_pages_from_list(&page_list, vma);
	if (addr != end)
	reclaimed = reclaim_pages_from_list(&page_list, vma);
	rp->nr_reclaimed += reclaimed;
	rp->nr_to_reclaim -= reclaimed;
	if (rp->nr_to_reclaim < 0)
		rp->nr_to_reclaim = 0;

	if (rp->nr_to_reclaim && (addr != end))
		goto cont;

	cond_resched();
@@ -1686,6 +1706,50 @@ enum reclaim_type {
	RECLAIM_RANGE,
};

struct reclaim_param reclaim_task_anon(struct task_struct *task,
		int nr_to_reclaim)
{
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	struct mm_walk reclaim_walk = {};
	struct reclaim_param rp = {
		.nr_to_reclaim = nr_to_reclaim,
	};

	get_task_struct(task);
	mm = get_task_mm(task);
	if (!mm)
		goto out;

	reclaim_walk.mm = mm;
	reclaim_walk.pmd_entry = reclaim_pte_range;

	reclaim_walk.private = &rp;

	down_read(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (is_vm_hugetlb_page(vma))
			continue;

		if (vma->vm_file)
			continue;

		if (!rp.nr_to_reclaim)
			break;

		rp.vma = vma;
		walk_page_range(vma->vm_start, vma->vm_end,
			&reclaim_walk);
	}

	flush_tlb_mm(mm);
	up_read(&mm->mmap_sem);
	mmput(mm);
out:
	put_task_struct(task);
	return rp;
}

static ssize_t reclaim_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppos)
{
@@ -1698,6 +1762,7 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
	struct mm_walk reclaim_walk = {};
	unsigned long start = 0;
	unsigned long end = 0;
	struct reclaim_param rp;

	memset(buffer, 0, sizeof(buffer));
	if (count > sizeof(buffer) - 1)
@@ -1760,6 +1825,10 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
	reclaim_walk.mm = mm;
	reclaim_walk.pmd_entry = reclaim_pte_range;

	rp.nr_to_reclaim = INT_MAX;
	rp.nr_reclaimed = 0;
	reclaim_walk.private = &rp;

	down_read(&mm->mmap_sem);
	if (type == RECLAIM_RANGE) {
		vma = find_vma(mm, start);
@@ -1769,7 +1838,7 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
			if (is_vm_hugetlb_page(vma))
				continue;

			reclaim_walk.private = vma;
			rp.vma = vma;
			walk_page_range(max(vma->vm_start, start),
					min(vma->vm_end, end),
					&reclaim_walk);
@@ -1786,7 +1855,7 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
			if (type == RECLAIM_FILE && !vma->vm_file)
				continue;

			reclaim_walk.private = vma;
			rp.vma = vma;
			walk_page_range(vma->vm_start, vma->vm_end,
				&reclaim_walk);
		}
+14 −0
Original line number Diff line number Diff line
@@ -2932,5 +2932,19 @@ static inline void setup_nr_node_ids(void) {}

extern int want_old_faultaround_pte;

#ifdef CONFIG_PROCESS_RECLAIM
struct reclaim_param {
	struct vm_area_struct *vma;
	/* Number of pages scanned */
	int nr_scanned;
	/* max pages to reclaim */
	int nr_to_reclaim;
	/* pages reclaimed */
	int nr_reclaimed;
};
extern struct reclaim_param reclaim_task_anon(struct task_struct *task,
		int nr_to_reclaim);
#endif

#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
+77 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2015-2019, The Linux Foundation. All rights reserved.
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM process_reclaim

#if !defined(_TRACE_EVENT_PROCESSRECLAIM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EVENT_PROCESSRECLAIM_H

#include <linux/tracepoint.h>
#include <linux/types.h>
#include <linux/sched.h>

TRACE_EVENT(process_reclaim,

	TP_PROTO(int tasksize,
		short oom_score_adj,
		int nr_scanned, int nr_reclaimed,
		int per_swap_size, int total_sz,
		int nr_to_reclaim),

	TP_ARGS(tasksize, oom_score_adj, nr_scanned,
			nr_reclaimed, per_swap_size,
			total_sz, nr_to_reclaim),

	TP_STRUCT__entry(
		__field(int, tasksize)
		__field(short, oom_score_adj)
		__field(int, nr_scanned)
		__field(int, nr_reclaimed)
		__field(int, per_swap_size)
		__field(int, total_sz)
		__field(int, nr_to_reclaim)
	),

	TP_fast_assign(
		__entry->tasksize	= tasksize;
		__entry->oom_score_adj	= oom_score_adj;
		__entry->nr_scanned	= nr_scanned;
		__entry->nr_reclaimed	= nr_reclaimed;
		__entry->per_swap_size	= per_swap_size;
		__entry->total_sz	= total_sz;
		__entry->nr_to_reclaim	= nr_to_reclaim;
	),

	TP_printk("%d, %hd, %d, %d, %d, %d, %d",
			__entry->tasksize, __entry->oom_score_adj,
			__entry->nr_scanned, __entry->nr_reclaimed,
			__entry->per_swap_size, __entry->total_sz,
			__entry->nr_to_reclaim)
);

TRACE_EVENT(process_reclaim_eff,

	TP_PROTO(int efficiency, int reclaim_avg_efficiency),

	TP_ARGS(efficiency, reclaim_avg_efficiency),

	TP_STRUCT__entry(
		__field(int, efficiency)
		__field(int, reclaim_avg_efficiency)
	),

	TP_fast_assign(
		__entry->efficiency	= efficiency;
		__entry->reclaim_avg_efficiency	= reclaim_avg_efficiency;
	),

	TP_printk("%d, %d", __entry->efficiency,
		__entry->reclaim_avg_efficiency)
);

#endif

#include <trace/define_trace.h>
+1 −0
Original line number Diff line number Diff line
@@ -105,3 +105,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
obj-$(CONFIG_HMM) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_PROCESS_RECLAIM)	+= process_reclaim.o

mm/process_reclaim.c

0 → 100644
+243 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2015-2019, The Linux Foundation. All rights reserved.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/notifier.h>
#include <linux/vmpressure.h>

#define CREATE_TRACE_POINTS
#include <trace/events/process_reclaim.h>

#define MAX_SWAP_TASKS SWAP_CLUSTER_MAX

static void swap_fn(struct work_struct *work);
DECLARE_WORK(swap_work, swap_fn);

/* User knob to enable/disable process reclaim feature */
static int enable_process_reclaim;
module_param_named(enable_process_reclaim, enable_process_reclaim, int, 0644);

/* The max number of pages tried to be reclaimed in a single run */
int per_swap_size = SWAP_CLUSTER_MAX * 32;
module_param_named(per_swap_size, per_swap_size, int, 0644);

int reclaim_avg_efficiency;
module_param_named(reclaim_avg_efficiency, reclaim_avg_efficiency, int, 0444);

/* The vmpressure region where process reclaim operates */
static unsigned long pressure_min = 50;
static unsigned long pressure_max = 90;
module_param_named(pressure_min, pressure_min, ulong, 0644);
module_param_named(pressure_max, pressure_max, ulong, 0644);

/*
 * Scheduling process reclaim workqueue unecessarily
 * when the reclaim efficiency is low does not make
 * sense. We try to detect a drop in efficiency and
 * disable reclaim for a time period. This period and the
 * period for which we monitor a drop in efficiency is
 * defined by swap_eff_win. swap_opt_eff is the optimal
 * efficincy used as theshold for this.
 */
static int swap_eff_win = 2;
module_param_named(swap_eff_win, swap_eff_win, int, 0644);

static int swap_opt_eff = 50;
module_param_named(swap_opt_eff, swap_opt_eff, int, 0644);

static atomic_t skip_reclaim = ATOMIC_INIT(0);
/* Not atomic since only a single instance of swap_fn run at a time */
static int monitor_eff;

struct selected_task {
	struct task_struct *p;
	int tasksize;
	short oom_score_adj;
};

int selected_cmp(const void *a, const void *b)
{
	const struct selected_task *x = a;
	const struct selected_task *y = b;
	int ret;

	ret = x->tasksize < y->tasksize ? -1 : 1;

	return ret;
}

static int test_task_flag(struct task_struct *p, int flag)
{
	struct task_struct *t = p;

	rcu_read_lock();
	for_each_thread(p, t) {
		task_lock(t);
		if (test_tsk_thread_flag(t, flag)) {
			task_unlock(t);
			rcu_read_unlock();
			return 1;
		}
		task_unlock(t);
	}
	rcu_read_unlock();

	return 0;
}

static void swap_fn(struct work_struct *work)
{
	struct task_struct *tsk;
	struct reclaim_param rp;

	/* Pick the best MAX_SWAP_TASKS tasks in terms of anon size */
	struct selected_task selected[MAX_SWAP_TASKS] = {{0, 0, 0},};
	int si = 0;
	int i;
	int tasksize;
	int total_sz = 0;
	short min_score_adj = 360;
	int total_scan = 0;
	int total_reclaimed = 0;
	int nr_to_reclaim;
	int efficiency;

	rcu_read_lock();
	for_each_process(tsk) {
		struct task_struct *p;
		short oom_score_adj;

		if (tsk->flags & PF_KTHREAD)
			continue;

		if (test_task_flag(tsk, TIF_MEMDIE))
			continue;

		p = find_lock_task_mm(tsk);
		if (!p)
			continue;

		oom_score_adj = p->signal->oom_score_adj;
		if (oom_score_adj < min_score_adj) {
			task_unlock(p);
			continue;
		}

		tasksize = get_mm_counter(p->mm, MM_ANONPAGES);
		task_unlock(p);

		if (tasksize <= 0)
			continue;

		if (si == MAX_SWAP_TASKS) {
			sort(&selected[0], MAX_SWAP_TASKS,
					sizeof(struct selected_task),
					&selected_cmp, NULL);
			if (tasksize < selected[0].tasksize)
				continue;
			selected[0].p = p;
			selected[0].oom_score_adj = oom_score_adj;
			selected[0].tasksize = tasksize;
		} else {
			selected[si].p = p;
			selected[si].oom_score_adj = oom_score_adj;
			selected[si].tasksize = tasksize;
			si++;
		}
	}

	for (i = 0; i < si; i++)
		total_sz += selected[i].tasksize;

	/* Skip reclaim if total size is too less */
	if (total_sz < SWAP_CLUSTER_MAX) {
		rcu_read_unlock();
		return;
	}

	for (i = 0; i < si; i++)
		get_task_struct(selected[i].p);

	rcu_read_unlock();

	while (si--) {
		nr_to_reclaim =
			(selected[si].tasksize * per_swap_size) / total_sz;
		/* scan atleast a page */
		if (!nr_to_reclaim)
			nr_to_reclaim = 1;

		rp = reclaim_task_anon(selected[si].p, nr_to_reclaim);

		trace_process_reclaim(selected[si].tasksize,
				selected[si].oom_score_adj, rp.nr_scanned,
				rp.nr_reclaimed, per_swap_size, total_sz,
				nr_to_reclaim);
		total_scan += rp.nr_scanned;
		total_reclaimed += rp.nr_reclaimed;
		put_task_struct(selected[si].p);
	}

	if (total_scan) {
		efficiency = (total_reclaimed * 100) / total_scan;

		if (efficiency < swap_opt_eff) {
			if (++monitor_eff == swap_eff_win) {
				atomic_set(&skip_reclaim, swap_eff_win);
				monitor_eff = 0;
			}
		} else {
			monitor_eff = 0;
		}

		reclaim_avg_efficiency =
			(efficiency + reclaim_avg_efficiency) / 2;
		trace_process_reclaim_eff(efficiency, reclaim_avg_efficiency);
	}
}

static int vmpressure_notifier(struct notifier_block *nb,
			unsigned long action, void *data)
{
	unsigned long pressure = action;

	if (!enable_process_reclaim)
		return 0;

	if (!current_is_kswapd())
		return 0;

	if (atomic_dec_if_positive(&skip_reclaim) >= 0)
		return 0;

	if ((pressure >= pressure_min) && (pressure < pressure_max))
		if (!work_pending(&swap_work))
			queue_work(system_unbound_wq, &swap_work);
	return 0;
}

static struct notifier_block vmpr_nb = {
	.notifier_call = vmpressure_notifier,
};

static int __init process_reclaim_init(void)
{
	vmpressure_notifier_register(&vmpr_nb);
	return 0;
}

static void __exit process_reclaim_exit(void)
{
	vmpressure_notifier_unregister(&vmpr_nb);
}

module_init(process_reclaim_init);
module_exit(process_reclaim_exit);