sched/numa: Scale scan period with tasks in group and shared/private (b5dd77c8) · Commits · e / devices / android_kernel_oneplus_sm8150

kernel/sched/fair.c

+86 −25

Original line number	Diff line number	Diff line
		@@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
		/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
		unsigned int sysctl_numa_balancing_scan_delay = 1000;

		/*
		 * Per-group NUMA fault statistics and bookkeeping.
		 *
		 * @refcount is also read by task_scan_start() and task_scan_max() as a
		 * multiplier when scaling the scan period. @faults is summed over the
		 * online nodes by group_faults_priv() and group_faults_shared().
		 */
		struct numa_group {
		atomic_t refcount;

		spinlock_t lock; /* nr_tasks, tasks */
		int nr_tasks;
		pid_t gid;
		int active_nodes;

		struct rcu_head rcu;
		unsigned long total_faults;
		unsigned long max_faults_cpu;
		/*
		* Faults_cpu is used to decide whether memory should move
		* towards the CPU. As a consequence, these stats are weighted
		* more by CPU use than by memory faults.
		*/
		unsigned long *faults_cpu;
		/* Trailing array; slots are selected with task_faults_idx(NUMA_MEM, nid, priv). */
		unsigned long faults[0];
		};

		/*
		 * Forward declarations: task_scan_start() and task_scan_max() call these
		 * helpers before their definitions appear further down.
		 */
		static inline unsigned long group_faults_priv(struct numa_group *ng);
		static inline unsigned long group_faults_shared(struct numa_group *ng);

		static unsigned int task_nr_scan_windows(struct task_struct *p)
		{
		unsigned long rss = 0;
		@@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p)
		return max_t(unsigned int, floor, scan);
		}

		/*
		 * Pick the scan period a task starts out with.
		 *
		 * The base value is task_scan_min(p). When the task belongs to a
		 * numa_group, the base is multiplied by the group's reference count and
		 * by (shared + 1) / (private + shared + 1), so a group dominated by
		 * private faults gets a smaller factor. The +1 terms keep the divisor
		 * non-zero. The result is never below task_scan_min(p).
		 */
		static unsigned int task_scan_start(struct task_struct *p)
		{
			unsigned long base = task_scan_min(p);
			unsigned long scaled = base;
			struct numa_group *ng = p->numa_group;

			if (ng) {
				unsigned long nr_shared = group_faults_shared(ng);
				unsigned long nr_private = group_faults_priv(ng);

				/* Evaluated left to right: multiply first, divide last. */
				scaled = base * atomic_read(&ng->refcount) * (nr_shared + 1) /
					 (nr_private + nr_shared + 1);
			}

			return max(base, scaled);
		}

		/*
		 * Compute the upper bound for a task's NUMA scan period.
		 *
		 * The starting point is sysctl_numa_balancing_scan_period_max divided by
		 * the task's number of scan windows. When the task belongs to a
		 * numa_group, that value is additionally scaled by the group's reference
		 * count and by (shared + 1) / (private + shared + 1), and the larger of
		 * the scaled and unscaled values is kept. The result is never below
		 * task_scan_min(p).
		 */
		static unsigned int task_scan_max(struct task_struct *p)
		{
			/*
			 * Declared once, as unsigned long, so both match the type of
			 * 'period' in the max() comparisons below.
			 */
			unsigned long smin = task_scan_min(p);
			unsigned long smax;

			/* Floor calculations can leave smax below smin; clamped on return. */
			smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);

			/* Scale the maximum scan period with the amount of shared memory. */
			if (p->numa_group) {
				struct numa_group *ng = p->numa_group;
				unsigned long shared = group_faults_shared(ng);
				unsigned long private = group_faults_priv(ng);
				unsigned long period = smax;

				period *= atomic_read(&ng->refcount);
				period *= shared + 1;
				/* The +1 keeps the divisor non-zero when no faults are recorded. */
				period /= private + shared + 1;

				smax = max(smax, period);
			}

			return max(smin, smax);
		}

		@@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq rq, struct task_struct p)
		rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
		}

		/*
		 * NOTE(review): this definition is textually identical to the
		 * struct numa_group that appears earlier in this view, ahead of
		 * task_nr_scan_windows(). Two definitions of the same struct tag in one
		 * translation unit will not compile. This looks like the pre-move copy
		 * shown by the diff; confirm and keep only one.
		 */
		struct numa_group {
		atomic_t refcount;

		spinlock_t lock; /* nr_tasks, tasks */
		int nr_tasks;
		pid_t gid;
		int active_nodes;

		struct rcu_head rcu;
		unsigned long total_faults;
		unsigned long max_faults_cpu;
		/*
		* Faults_cpu is used to decide whether memory should move
		* towards the CPU. As a consequence, these stats are weighted
		* more by CPU use than by memory faults.
		*/
		unsigned long *faults_cpu;
		unsigned long faults[0];
		};

		/* Number of NUMA hinting fault types tracked: shared and private. */
		#define NR_NUMA_HINT_FAULT_TYPES 2

		@@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
		}

		static inline unsigned long group_faults_priv(struct numa_group *ng)
		{
		unsigned long faults = 0;
		int node;

		for_each_online_node(node) {
		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
		}

		return faults;
		}

		static inline unsigned long group_faults_shared(struct numa_group *ng)
		{
		unsigned long faults = 0;
		int node;

		for_each_online_node(node) {
		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
		}

		return faults;
		}

		/*
		* A node triggering more than 1/3 as many NUMA faults as the maximum is
		* considered part of a numa group's pseudo-interleaving set. Migrations
		@@ -1808,7 +1869,7 @@ static int task_numa_migrate(struct task_struct *p)
		* Reset the scan period if the task is being rescheduled on an
		* alternative node to recheck if the tasks is now properly placed.
		*/
		p->numa_scan_period = task_scan_min(p);
		p->numa_scan_period = task_scan_start(p);

		if (env.best_task == NULL) {
		ret = migrate_task_to(p, env.best_cpu);
		@@ -2459,7 +2520,7 @@ void task_numa_work(struct callback_head *work)

		if (p->numa_scan_period == 0) {
		p->numa_scan_period_max = task_scan_max(p);
		p->numa_scan_period = task_scan_min(p);
		p->numa_scan_period = task_scan_start(p);
		}

		next_scan = now + msecs_to_jiffies(p->numa_scan_period);
		@@ -2587,7 +2648,7 @@ void task_tick_numa(struct rq rq, struct task_struct curr)

		if (now > curr->node_stamp + period) {
		if (!curr->node_stamp)
		curr->numa_scan_period = task_scan_min(curr);
		curr->numa_scan_period = task_scan_start(curr);
		curr->node_stamp += period;

		if (!time_before(jiffies, curr->mm->numa_next_scan)) {