sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler (e33a9bba) · Commits · e / devices / android_kernel_fairphone_FP5

kernel/sched/core.c

+61 −7

Original line number	Diff line number	Diff line
		@@ -2089,11 +2089,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
		p->sched_contributes_to_load = !!task_contributes_to_load(p);
		p->state = TASK_WAKING;

		if (p->in_iowait) {
		delayacct_blkio_end();
		atomic_dec(&task_rq(p)->nr_iowait);
		}

		cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
		if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		set_task_cpu(p, cpu);
		}

		#else /* CONFIG_SMP */

		if (p->in_iowait) {
		delayacct_blkio_end();
		atomic_dec(&task_rq(p)->nr_iowait);
		}

		#endif /* CONFIG_SMP */

		ttwu_queue(p, cpu, wake_flags);
		@@ -2143,8 +2156,13 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)

		trace_sched_waking(p);

		if (!task_on_rq_queued(p))
		if (!task_on_rq_queued(p)) {
		if (p->in_iowait) {
		delayacct_blkio_end();
		atomic_dec(&rq->nr_iowait);
		}
		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
		}

		ttwu_do_wakeup(rq, p, 0, rf);
		ttwu_stat(p, smp_processor_id(), 0);
		@@ -2956,6 +2974,36 @@ unsigned long long nr_context_switches(void)
		return sum;
		}

		/*
		* IO-wait accounting, and how it's mostly bollocks (on SMP).
		*
		* The idea behind IO-wait accounting is to account the idle time that we
		* could have spent running if it were not for IO. That is, if we were to
		* improve the storage performance, we'd have a proportional reduction in
		* IO-wait time.
		*
		* This all works nicely on UP, where, when a task blocks on IO, we account
		* idle time as IO-wait, because if the storage were faster, it could've been
		* running and we'd not be idle.
		*
		* This has been extended to SMP, by doing the same for each CPU. This however
		* is broken.
		*
		* Imagine for instance the case where two tasks block on one CPU: only that
		* one CPU will have IO-wait accounted, while the other has regular idle,
		* even though, if the storage were faster, both could've run at the same
		* time, utilising both CPUs.
		*
		* This means, that when looking globally, the current IO-wait accounting on
		* SMP is a lower bound, by reason of under accounting.
		*
		* Worse, since the numbers are provided per CPU, they are sometimes
		* interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
		* associated with any one particular CPU; it can wake on a different CPU than
		* the one it blocked on. This means the per CPU IO-wait number is meaningless.
		*
		* Task CPU affinities can make all that even more 'interesting'.
		*/

		unsigned long nr_iowait(void)
		{
		unsigned long i, sum = 0;
		@@ -2966,6 +3014,13 @@ unsigned long nr_iowait(void)
		return sum;
		}

		/*
		* Consumers of these two interfaces, like for example the cpufreq menu
		* governor, are using nonsensical data: they boost frequency for a CPU that
		* has IO-wait, but that CPU might not even end up running the task when it
		* does become runnable.
		*/

		unsigned long nr_iowait_cpu(int cpu)
		{
		struct rq *this = cpu_rq(cpu);
		@@ -3377,6 +3432,11 @@ static void __sched notrace __schedule(bool preempt)
		deactivate_task(rq, prev, DEQUEUE_SLEEP);
		prev->on_rq = 0;

		if (prev->in_iowait) {
		atomic_inc(&rq->nr_iowait);
		delayacct_blkio_start();
		}

		/*
		* If a worker went to sleep, notify and ask workqueue
		* whether it wants to wake up a task to maintain
		@@ -5075,19 +5135,13 @@ EXPORT_SYMBOL_GPL(yield_to);
		/*
		 * Call schedule_timeout(@timeout) with the current task flagged as
		 * waiting on IO.
		 *
		 * current->in_iowait is set to 1 for the duration of the sleep and
		 * restored to its previous value afterwards.
		 *
		 * The nr_iowait counter and the delayacct blkio start/end calls are
		 * driven off ->in_iowait by __schedule() (when the task is
		 * deactivated) and by the wakeup paths. They must not be repeated
		 * here, otherwise every IO sleep is accounted twice.
		 *
		 * Returns the value returned by schedule_timeout().
		 */
		long __sched io_schedule_timeout(long timeout)
		{
			int old_iowait = current->in_iowait;
			long ret;

			current->in_iowait = 1;
			/* Flush the task's block plug (going by the helper's name) before sleeping. */
			blk_schedule_flush_plug(current);

			ret = schedule_timeout(timeout);
			current->in_iowait = old_iowait;

			return ret;
		}