Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 3e51f33f authored by Peter Zijlstra, committed by Ingo Molnar
Browse files

sched: add optional support for CONFIG_HAVE_UNSTABLE_SCHED_CLOCK



this replaces the rq->clock stuff (and possibly cpu_clock()).

 - architectures that have an 'imperfect' hardware clock can set
   CONFIG_HAVE_UNSTABLE_SCHED_CLOCK

 - the 'jiffy' window might be superfluous when we update tick_gtod
   before the __update_sched_clock() call in sched_clock_tick()

 - cpu_clock() might be implemented as:

     sched_clock_cpu(smp_processor_id())

   if the accuracy proves good enough - how far can TSC drift in a
   single jiffy when considering the filtering and idle hooks?

[ mingo@elte.hu: various fixes and cleanups ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent a5574cf6
Loading
Loading
Loading
Loading
+29 −0
Original line number Original line Diff line number Diff line
@@ -1553,6 +1553,35 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)


extern unsigned long long sched_clock(void);
extern unsigned long long sched_clock(void);


/*
 * sched_clock_*() interface.
 *
 * Without CONFIG_HAVE_UNSTABLE_SCHED_CLOCK every hook below is an empty
 * inline stub and sched_clock_cpu() just returns sched_clock().  With the
 * option set only the prototypes are declared here; the definitions are
 * built out of line (kernel/sched_clock.c).
 */
#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
/* No per-cpu state to set up when sched_clock() is used directly. */
static inline void sched_clock_init(void)
{
}

/* @cpu is ignored: the raw sched_clock() value is returned as-is. */
static inline u64 sched_clock_cpu(int cpu)
{
	return sched_clock();
}

/* Tick hook: no-op in this configuration. */
static inline void sched_clock_tick(void)
{
}

/* Idle-entry hook: no-op in this configuration. */
static inline void sched_clock_idle_sleep_event(void)
{
}

/* Idle-exit hook: @delta_ns is ignored in this configuration. */
static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
{
}
#else
extern void sched_clock_init(void);
extern u64 sched_clock_cpu(int cpu);
extern void sched_clock_tick(void);
extern void sched_clock_idle_sleep_event(void);
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
#endif

/*
/*
 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
 * clock constructed from sched_clock():
 * clock constructed from sched_clock():
+1 −0
Original line number Original line Diff line number Diff line
@@ -602,6 +602,7 @@ asmlinkage void __init start_kernel(void)
	softirq_init();
	softirq_init();
	timekeeping_init();
	timekeeping_init();
	time_init();
	time_init();
	sched_clock_init();
	profile_init();
	profile_init();
	if (!irqs_disabled())
	if (!irqs_disabled())
		printk("start_kernel(): bug: interrupts were enabled early\n");
		printk("start_kernel(): bug: interrupts were enabled early\n");
+1 −1
Original line number Original line Diff line number Diff line
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
	    rcupdate.o extable.o params.o posix-timers.o \
	    rcupdate.o extable.o params.o posix-timers.o \
	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
	    notifier.o ksysfs.o pm_qos_params.o
	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o


obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
+13 −152
Original line number Original line Diff line number Diff line
@@ -74,16 +74,6 @@
#include <asm/tlb.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/irq_regs.h>


/*
 * Default scheduler clock: the jiffies counter scaled to nanoseconds.
 * Declared weak so that an architecture or sub-architecture can supply
 * its own sched_clock().
 */
unsigned long long __attribute__((weak)) sched_clock(void)
{
	unsigned long long ticks = jiffies;

	return ticks * (NSEC_PER_SEC / HZ);
}

/*
/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -557,13 +547,7 @@ struct rq {
	unsigned long next_balance;
	unsigned long next_balance;
	struct mm_struct *prev_mm;
	struct mm_struct *prev_mm;


	u64 clock, prev_clock_raw;
	u64 clock;
	s64 clock_max_delta;

	unsigned int clock_warps, clock_overflows, clock_underflows;
	u64 idle_clock;
	unsigned int clock_deep_idle_events;
	u64 tick_timestamp;


	atomic_t nr_iowait;
	atomic_t nr_iowait;


@@ -628,82 +612,6 @@ static inline int cpu_of(struct rq *rq)
#endif
#endif
}
}


#ifdef CONFIG_NO_HZ
/* True unless the tick state of @cpu reports NOHZ_MODE_INACTIVE. */
static inline bool nohz_on(int cpu)
{
	return !(tick_get_tick_sched(cpu)->nohz_mode == NOHZ_MODE_INACTIVE);
}

/*
 * Upper bound, in ticks, used when clamping forward clock jumps:
 * one tick normally, or the jiffies elapsed since the last seen tick
 * plus two when nohz is on for this runqueue's cpu.
 */
static inline u64 max_skipped_ticks(struct rq *rq)
{
	if (!nohz_on(cpu_of(rq)))
		return 1;

	return jiffies - rq->last_tick_seen + 2;
}

/* Remember the jiffies value at which this runqueue last saw a tick. */
static inline void update_last_tick_seen(struct rq *rq)
{
	rq->last_tick_seen = jiffies;
}
#else
/* Without CONFIG_NO_HZ the bound is always a single tick. */
static inline u64 max_skipped_ticks(struct rq *rq)
{
	return 1;
}

/* Nothing to record without CONFIG_NO_HZ. */
static inline void update_last_tick_seen(struct rq *rq)
{
}
#endif

/*
 * Update the per-runqueue clock, as finegrained as the platform can give
 * us, but without assuming monotonicity, etc.:
 *
 * Samples sched_clock(), compares it with the previous raw sample stored in
 * rq->prev_clock_raw and advances rq->clock by the filtered difference:
 *  - a negative delta advances the clock by 1 and counts a warp;
 *  - a delta that would move the clock past
 *    rq->tick_timestamp + max_skipped_ticks(rq) * TICK_NSEC is clamped to
 *    that limit (or advances by 1 if already there) and counts an overflow;
 *  - otherwise the delta is applied and the largest delta seen is recorded.
 */
static void __update_rq_clock(struct rq *rq)
{
	u64 prev_raw = rq->prev_clock_raw;
	u64 now = sched_clock();
	s64 delta = now - prev_raw;
	u64 clock = rq->clock;

#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
#endif
	/*
	 * Protect against sched_clock() occasionally going backwards:
	 */
	if (unlikely(delta < 0)) {
		clock++;
		rq->clock_warps++;
	} else {
		/*
		 * Catch too large forward jumps too:
		 */
		u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
		u64 max_time = rq->tick_timestamp + max_jump;

		if (unlikely(clock + delta > max_time)) {
			/* clamp to the limit, but never stand still */
			if (clock < max_time)
				clock = max_time;
			else
				clock++;
			rq->clock_overflows++;
		} else {
			if (unlikely(delta > rq->clock_max_delta))
				rq->clock_max_delta = delta;
			clock += delta;
		}
	}

	/* the raw sample is always recorded, even when the delta was rejected */
	rq->prev_clock_raw = now;
	rq->clock = clock;
}

/*
 * Refresh rq->clock, but only when @rq belongs to the cpu we are running
 * on; a remote runqueue is left untouched.
 */
static void update_rq_clock(struct rq *rq)
{
	if (unlikely(smp_processor_id() != cpu_of(rq)))
		return;

	__update_rq_clock(rq);
}

/*
/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 * See detach_destroy_domains: synchronize_sched for details.
@@ -719,6 +627,11 @@ static void update_rq_clock(struct rq *rq)
#define task_rq(p)		cpu_rq(task_cpu(p))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)


/* Set rq->clock from sched_clock_cpu() for the cpu that owns @rq. */
static inline void update_rq_clock(struct rq *rq)
{
	int cpu = cpu_of(rq);

	rq->clock = sched_clock_cpu(cpu);
}

/*
/*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 */
 */
@@ -935,7 +848,6 @@ static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
static unsigned long long __cpu_clock(int cpu)
static unsigned long long __cpu_clock(int cpu)
{
{
	unsigned long long now;
	unsigned long long now;
	struct rq *rq;


	/*
	/*
	 * Only call sched_clock() if the scheduler has already been
	 * Only call sched_clock() if the scheduler has already been
@@ -944,9 +856,7 @@ static unsigned long long __cpu_clock(int cpu)
	if (unlikely(!scheduler_running))
	if (unlikely(!scheduler_running))
		return 0;
		return 0;


	rq = cpu_rq(cpu);
	now = sched_clock_cpu(cpu);
	update_rq_clock(rq);
	now = rq->clock;


	return now;
	return now;
}
}
@@ -1120,45 +1030,6 @@ static struct rq *this_rq_lock(void)
	return rq;
	return rq;
}
}


/*
 * We are going deep-idle (irqs are disabled):
 *
 * Brings this cpu's rq->clock up to date under rq->lock and then counts the
 * event in rq->clock_deep_idle_events.  Warns if interrupts are enabled.
 */
void sched_clock_idle_sleep_event(void)
{
	struct rq *rq = cpu_rq(smp_processor_id());

	WARN_ON(!irqs_disabled());
	spin_lock(&rq->lock);
	__update_rq_clock(rq);
	spin_unlock(&rq->lock);
	/* NOTE(review): the counter is incremented after rq->lock is dropped */
	rq->clock_deep_idle_events++;
}
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);

/*
 * We just idled delta nanoseconds (called with irqs disabled):
 *
 * @delta_ns: idle duration to add to rq->idle_clock and rq->clock.
 *
 * Warns if interrupts are enabled.
 */
void sched_clock_idle_wakeup_event(u64 delta_ns)
{
	struct rq *rq = cpu_rq(smp_processor_id());
	u64 now = sched_clock();

	WARN_ON(!irqs_disabled());
	/* NOTE(review): idle_clock is updated before rq->lock is taken */
	rq->idle_clock += delta_ns;
	/*
	 * Override the previous timestamp and ignore all
	 * sched_clock() deltas that occurred while we idled,
	 * and use the PM-provided delta_ns to advance the
	 * rq clock:
	 */
	spin_lock(&rq->lock);
	rq->prev_clock_raw = now;
	rq->clock += delta_ns;
	spin_unlock(&rq->lock);
	touch_softlockup_watchdog();
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);

static void __resched_task(struct task_struct *p, int tif_bit);
static void __resched_task(struct task_struct *p, int tif_bit);


static inline void resched_task(struct task_struct *p)
static inline void resched_task(struct task_struct *p)
@@ -1283,7 +1154,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());


	spin_lock(&rq->lock);
	spin_lock(&rq->lock);
	__update_rq_clock(rq);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	spin_unlock(&rq->lock);
	spin_unlock(&rq->lock);


@@ -4476,19 +4347,11 @@ void scheduler_tick(void)
	int cpu = smp_processor_id();
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;
	struct task_struct *curr = rq->curr;
	u64 next_tick = rq->tick_timestamp + TICK_NSEC;

	sched_clock_tick();


	spin_lock(&rq->lock);
	spin_lock(&rq->lock);
	__update_rq_clock(rq);
	update_rq_clock(rq);
	/*
	 * Let rq->clock advance by at least TICK_NSEC:
	 */
	if (unlikely(rq->clock < next_tick)) {
		rq->clock = next_tick;
		rq->clock_underflows++;
	}
	rq->tick_timestamp = rq->clock;
	update_last_tick_seen(rq);
	update_cpu_load(rq);
	update_cpu_load(rq);
	curr->sched_class->task_tick(rq, curr, 0);
	curr->sched_class->task_tick(rq, curr, 0);
	spin_unlock(&rq->lock);
	spin_unlock(&rq->lock);
@@ -4642,7 +4505,7 @@ need_resched_nonpreemptible:
	 * Do the rq-clock update outside the rq lock:
	 * Do the rq-clock update outside the rq lock:
	 */
	 */
	local_irq_disable();
	local_irq_disable();
	__update_rq_clock(rq);
	update_rq_clock(rq);
	spin_lock(&rq->lock);
	spin_lock(&rq->lock);
	clear_tsk_need_resched(prev);
	clear_tsk_need_resched(prev);


@@ -8226,8 +8089,6 @@ void __init sched_init(void)
		spin_lock_init(&rq->lock);
		spin_lock_init(&rq->lock);
		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
		rq->nr_running = 0;
		rq->nr_running = 0;
		rq->clock = 1;
		update_last_tick_seen(rq);
		init_cfs_rq(&rq->cfs, rq);
		init_cfs_rq(&rq->cfs, rq);
		init_rt_rq(&rq->rt, rq);
		init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8371,6 +8232,7 @@ EXPORT_SYMBOL(__might_sleep);
static void normalize_task(struct rq *rq, struct task_struct *p)
static void normalize_task(struct rq *rq, struct task_struct *p)
{
{
	int on_rq;
	int on_rq;

	update_rq_clock(rq);
	update_rq_clock(rq);
	on_rq = p->se.on_rq;
	on_rq = p->se.on_rq;
	if (on_rq)
	if (on_rq)
@@ -8402,7 +8264,6 @@ void normalize_rt_tasks(void)
		p->se.sleep_start		= 0;
		p->se.sleep_start		= 0;
		p->se.block_start		= 0;
		p->se.block_start		= 0;
#endif
#endif
		task_rq(p)->clock		= 0;


		if (!rt_task(p)) {
		if (!rt_task(p)) {
			/*
			/*

kernel/sched_clock.c

0 → 100644
+236 −0
Original line number Original line Diff line number Diff line
/*
 * sched_clock for unstable cpu clocks
 *
 *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *
 * Based on code by:
 *   Ingo Molnar <mingo@redhat.com>
 *   Guillaume Chazarain <guichaz@gmail.com>
 *
 * Create a semi stable clock from a mixture of other events, including:
 *  - gtod
 *  - jiffies
 *  - sched_clock()
 *  - explicit idle events
 *
 * We use gtod as base and the unstable clock deltas. The deltas are filtered,
 * making it monotonic and keeping it within an expected window.  This window
 * is set up using jiffies.
 *
 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
 * that is otherwise invisible (TSC gets stopped).
 *
 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
 * consistent between cpus (never more than 1 jiffy difference).
 */
#include <linux/sched.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/ktime.h>
#include <linux/module.h>


#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK

/*
 * Per-cpu state of the filtered scheduler clock.  All fields other than
 * the lock itself are read and written with @lock held by the functions
 * in this file.
 */
struct sched_clock_data {
	/*
	 * Raw spinlock - this is a special case: this might be called
	 * from within instrumentation code so we dont want to do any
	 * instrumentation ourselves.
	 */
	raw_spinlock_t		lock;

	/* jiffies value seen by the last __update_sched_clock() */
	unsigned long		prev_jiffies;
	/* raw sample handed to the last update (reset on idle wakeup) */
	u64			prev_raw;
	/* sched_clock() sample taken at the last sched_clock_tick() */
	u64			tick_raw;
	/* ktime_get() in ns taken at the last sched_clock_tick() */
	u64			tick_gtod;
	/* the filtered clock value returned by sched_clock_cpu() */
	u64			clock;
};

/* One instance per cpu. */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);

/* Clock state of the cpu we are currently running on. */
static inline struct sched_clock_data *this_scd(void)
{
	struct sched_clock_data *scd = &__get_cpu_var(sched_clock_data);

	return scd;
}

/*
 * Clock state of an arbitrary @cpu.
 * NOTE(review): the name spells "sdc" while the rest of the file uses "scd".
 */
static inline struct sched_clock_data *cpu_sdc(int cpu)
{
	struct sched_clock_data *scd = &per_cpu(sched_clock_data, cpu);

	return scd;
}

/*
 * Initialise the clock state of every possible cpu: unlocked lock, raw
 * samples at zero, and both the gtod base and the clock itself set to the
 * same ktime_get() reading taken once up front.
 */
void sched_clock_init(void)
{
	const u64 start_ns = ktime_to_ns(ktime_get());
	int cpu;

	for_each_possible_cpu(cpu) {
		struct sched_clock_data *scd = cpu_sdc(cpu);

		scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
		scd->prev_jiffies = jiffies;

		/* no raw sample has been taken yet */
		scd->prev_raw = 0;
		scd->tick_raw = 0;

		/* every cpu starts from the same wall-clock base */
		scd->tick_gtod = start_ns;
		scd->clock = start_ns;
	}
}

/*
 * update the percpu scd from the raw @now value
 *
 *  - filter out backward motion
 *  - use jiffies to generate a min,max window to clip the raw values
 *
 * @scd: clock state to update; the callers in this file hold scd->lock
 * @now: raw clock sample to fold into scd->clock
 *
 * The window is [min_clock, min_clock + TICK_NSEC], where min_clock is
 * scd->tick_gtod advanced by the jiffies elapsed since the previous update.
 * Warns once if called with interrupts enabled.
 */
static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
{
	unsigned long now_jiffies = jiffies;
	long delta_jiffies = now_jiffies - scd->prev_jiffies;
	u64 clock = scd->clock;
	u64 min_clock, max_clock;
	s64 delta = now - scd->prev_raw;

	WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Widen before multiplying: a product computed at the width of
	 * 'long' can wrap on 32-bit once a few seconds worth of jiffies
	 * have elapsed, which would corrupt the 64-bit window base.
	 */
	min_clock = scd->tick_gtod + (u64)delta_jiffies * TICK_NSEC;

	/* raw clock went backwards: creep forward by one instead */
	if (unlikely(delta < 0)) {
		clock++;
		goto out;
	}

	max_clock = min_clock + TICK_NSEC;

	if (unlikely(clock + delta > max_clock)) {
		/* clamp to the top of the window, but never stand still */
		if (clock < max_clock)
			clock = max_clock;
		else
			clock++;
	} else {
		clock += delta;
	}

 out:
	/* never report less than the bottom of the window */
	if (unlikely(clock < min_clock))
		clock = min_clock;

	scd->prev_raw = now;
	scd->prev_jiffies = now_jiffies;
	scd->clock = clock;
}

/*
 * Take the locks of two clock-state objects, always acquiring the one at
 * the lower address first so that concurrent callers agree on the order.
 */
static void lock_double_clock(struct sched_clock_data *data1,
				struct sched_clock_data *data2)
{
	struct sched_clock_data *first = data1;
	struct sched_clock_data *second = data2;

	if (!(data1 < data2)) {
		first = data2;
		second = data1;
	}

	__raw_spin_lock(&first->lock);
	__raw_spin_lock(&second->lock);
}

/*
 * Update and return the filtered clock of @cpu.
 *
 * The raw sample is always taken with sched_clock() on the calling cpu.
 * For a remote @cpu it is first rebased using both cpus' tick_raw and
 * tick_gtod values, which requires holding both locks for a moment.
 * Warns once if called with interrupts enabled.
 *
 * Returns the updated scd->clock of @cpu.
 */
u64 sched_clock_cpu(int cpu)
{
	struct sched_clock_data *scd = cpu_sdc(cpu);
	u64 now, clock;

	WARN_ON_ONCE(!irqs_disabled());
	now = sched_clock();

	if (cpu != raw_smp_processor_id()) {
		/*
		 * in order to update a remote cpu's clock based on our
		 * unstable raw time rebase it against:
		 *   tick_raw		(offset between raw counters)
		 *   tick_gtod          (tick offset between cpus)
		 */
		struct sched_clock_data *my_scd = this_scd();

		lock_double_clock(scd, my_scd);

		now -= my_scd->tick_raw;
		now += scd->tick_raw;

		now -= my_scd->tick_gtod;
		now += scd->tick_gtod;

		/* our own state is no longer needed; keep only the target locked */
		__raw_spin_unlock(&my_scd->lock);
	} else {
		__raw_spin_lock(&scd->lock);
	}

	/* scd->lock is held on both paths at this point */
	__update_sched_clock(scd, now);
	clock = scd->clock;

	__raw_spin_unlock(&scd->lock);

	return clock;
}

/*
 * Tick hook for the current cpu: fold a fresh sched_clock() sample into the
 * filtered clock, then record that sample and the current ktime_get() time
 * as the new tick_raw / tick_gtod reference points.
 * Warns once if called with interrupts enabled.
 */
void sched_clock_tick(void)
{
	struct sched_clock_data *scd = this_scd();
	u64 now, now_gtod;

	WARN_ON_ONCE(!irqs_disabled());

	/* both samples are taken before the lock is acquired */
	now = sched_clock();
	now_gtod = ktime_to_ns(ktime_get());

	__raw_spin_lock(&scd->lock);
	__update_sched_clock(scd, now);
	/*
	 * update tick_gtod after __update_sched_clock() because that will
	 * already observe 1 new jiffy; adding a new tick_gtod to that would
	 * increase the clock 2 jiffies.
	 */
	scd->tick_raw = now;
	scd->tick_gtod = now_gtod;
	__raw_spin_unlock(&scd->lock);
}

/*
 * Deep-idle entry hook (irqs are disabled): bring this cpu's filtered
 * clock up to date before it goes idle.  The returned value is not used.
 */
void sched_clock_idle_sleep_event(void)
{
	int cpu = smp_processor_id();

	sched_clock_cpu(cpu);
}
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);

/*
 * We just idled delta nanoseconds (called with irqs disabled):
 *
 * @delta_ns: idle duration to add to this cpu's filtered clock.
 *
 * NOTE(review): unlike the other entry points in this file, this one does
 * not check irqs_disabled() itself.
 */
void sched_clock_idle_wakeup_event(u64 delta_ns)
{
	struct sched_clock_data *scd = this_scd();
	u64 now = sched_clock();

	/*
	 * Override the previous timestamp and ignore all
	 * sched_clock() deltas that occurred while we idled,
	 * and use the PM-provided delta_ns to advance the
	 * per-cpu clock:
	 */
	__raw_spin_lock(&scd->lock);
	scd->prev_raw = now;
	scd->clock += delta_ns;
	__raw_spin_unlock(&scd->lock);

	touch_softlockup_watchdog();
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);

#endif

/*
 * Default scheduler clock: the jiffies counter scaled to nanoseconds.
 * Declared weak so that an architecture or sub-architecture can supply
 * its own sched_clock().
 */
unsigned long long __attribute__((weak)) sched_clock(void)
{
	unsigned long long ticks = jiffies;

	return ticks * (NSEC_PER_SEC / HZ);
}
Loading