Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a40f7521 authored by Joonwoo Park's avatar Joonwoo Park
Browse files

timer: make deferrable cpu unbound timers really not bound to a cpu



When deferrable work (INIT_DEFERRABLE_WORK, etc.) is queued via
queue_delayed_work(), it is probably intended to run the work item on any
CPU that isn't idle. However, we queue the work to run at a later time
by starting a deferrable timer that binds to whatever CPU the work is
queued on, which is effectively the same as
queue_delayed_work_on(smp_processor_id()).

As a result, WORK_CPU_UNBOUND work items aren't really CPU-unbound now.
This is perfectly fine on a UP kernel, and it also won't affect an SMP
kernel without dynticks much, since every CPU runs timers periodically.
But on SMP systems with dynticks, the current implementation makes
deferrable timers poorly scalable: the timer base on which a deferrable
timer was queued won't wake up until its next non-deferrable timer
expires, even though other non-idle CPUs may be running that could
service the expired deferrable timers.

Deferrable work is a good example of a victim of the current
implementation, as shown below.

INIT_DEFERRABLE_WORK(&dwork, fn);
CPU 0                                 CPU 1
queue_delayed_work(wq, &dwork, HZ);
    queue_delayed_work_on(WORK_CPU_UNBOUND);
        ...
	__mod_timer() -> queues timer to the
			 current cpu's timer
			 base.
	...
tick_nohz_idle_enter() -> cpu enters idle.
A second later
cpu 0 is now in idle.                 cpu 1 exits idle or wasn't in idle so
                                      now it's in active but won't
cpu 0 won't wake up till next         handle cpu unbound deferrable timer
non-deferrable timer expires.         as it's in cpu 0's timer base.

To make all CPU-unbound deferrable timers scalable, introduce a common
timer base used only for CPU-unbound deferrable timers, so that they are
indeed CPU-unbound and can be serviced by any non-idle CPU. This common
timer base fixes the scalability issue for delayed work and for all other
users of CPU-unbound deferrable timers.

CRs-fixed: 708770
Change-Id: I83b16fa9e1e3f42808d68fc626dc276bfd556012
Signed-off-by: default avatarJoonwoo Park <joonwoop@codeaurora.org>
parent cc9ba103
Loading
Loading
Loading
Loading
+84 −29
Original line number Diff line number Diff line
@@ -91,6 +91,9 @@ struct tvec_base {
struct tvec_base boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
#ifdef CONFIG_SMP
static struct tvec_base *tvec_base_deferral = &boot_tvec_bases;
#endif

/* Functions below help us manage 'deferrable' flag */
static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
@@ -622,7 +625,14 @@ static inline void debug_assert_init(struct timer_list *timer)
static void do_init_timer(struct timer_list *timer, unsigned int flags,
			  const char *name, struct lock_class_key *key)
{
	struct tvec_base *base = __raw_get_cpu_var(tvec_bases);
	struct tvec_base *base;

#ifdef CONFIG_SMP
	if (flags & TIMER_DEFERRABLE)
		base = tvec_base_deferral;
	else
#endif
		base = __raw_get_cpu_var(tvec_bases);

	timer->entry.next = NULL;
	timer->base = (void *)((unsigned long)base | flags);
@@ -740,6 +750,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,

	debug_activate(timer, expires);

#ifdef CONFIG_SMP
	if (base != tvec_base_deferral) {
#endif
		 cpu = smp_processor_id();

#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
@@ -751,10 +764,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
		if (base != new_base) {
			/*
			 * We are trying to schedule the timer on the local CPU.
		 * However we can't change timer's base while it is running,
		 * otherwise del_timer_sync() can't detect that the timer's
		 * handler yet has not finished. This also guarantees that
		 * the timer is serialized wrt itself.
			 * However we can't change timer's base while it is
			 * running, otherwise del_timer_sync() can't detect that
			 * the timer's handler yet has not finished. This also
			 * guarantees that the timer is serialized wrt itself.
			 */
			if (likely(base->running_timer != timer)) {
				/* See the comment in lock_timer_base() */
@@ -765,6 +778,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
				timer_set_base(timer, base);
			}
		}
#ifdef CONFIG_SMP
	}
#endif

	timer->expires = expires;
	internal_add_timer(base, timer);
@@ -1138,15 +1154,20 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
/**
 * __run_timers - run all expired timers (if any) on this CPU.
 * @base: the timer vector to be processed.
 * @try: try and just return if base's lock already acquired.
 *
 * This function cascades all vectors and executes all expired timer
 * vectors.
 */
static inline void __run_timers(struct tvec_base *base)
static inline void __run_timers(struct tvec_base *base, bool try)
{
	struct timer_list *timer;

	if (!try)
		spin_lock_irq(&base->lock);
	else if (!spin_trylock_irq(&base->lock))
		return;

	while (time_after_eq(jiffies, base->timer_jiffies)) {
		struct list_head work_list;
		struct list_head *head = &work_list;
@@ -1373,8 +1394,17 @@ static void run_timer_softirq(struct softirq_action *h)

	hrtimer_run_pending();

#ifdef CONFIG_SMP
	if (time_after_eq(jiffies, tvec_base_deferral->timer_jiffies))
		/*
		 * if other cpu is handling cpu unbound deferrable timer base,
		 * current cpu doesn't need to handle it so pass try=true.
		 */
		__run_timers(tvec_base_deferral, true);
#endif

	if (time_after_eq(jiffies, base->timer_jiffies))
		__run_timers(base);
		__run_timers(base, false);
}

/*
@@ -1510,7 +1540,7 @@ static int __cpuinit init_timers_cpu(int cpu)
{
	int j;
	struct tvec_base *base;
	static char __cpuinitdata tvec_base_done[NR_CPUS];
	static char __cpuinitdata tvec_base_done[NR_CPUS + 1];

	if (!tvec_base_done[cpu]) {
		static char boot_done;
@@ -1519,9 +1549,14 @@ static int __cpuinit init_timers_cpu(int cpu)
			/*
			 * The APs use this path later in boot
			 */
			if (cpu != NR_CPUS)
				base = kmalloc_node(sizeof(*base),
						    GFP_KERNEL | __GFP_ZERO,
						    cpu_to_node(cpu));
			else
				base = kmalloc(sizeof(*base),
					       GFP_KERNEL | __GFP_ZERO);

			if (!base)
				return -ENOMEM;

@@ -1531,7 +1566,12 @@ static int __cpuinit init_timers_cpu(int cpu)
				kfree(base);
				return -ENOMEM;
			}
			if (cpu != NR_CPUS)
				per_cpu(tvec_bases, cpu) = base;
#ifdef CONFIG_SMP
			else
				tvec_base_deferral = base;
#endif
		} else {
			/*
			 * This is for the boot CPU - we use compile-time
@@ -1545,7 +1585,12 @@ static int __cpuinit init_timers_cpu(int cpu)
		spin_lock_init(&base->lock);
		tvec_base_done[cpu] = 1;
	} else {
		if (cpu != NR_CPUS)
			base = per_cpu(tvec_bases, cpu);
#ifdef CONFIG_SMP
		else
			base = tvec_base_deferral;
#endif
	}


@@ -1653,6 +1698,16 @@ void __init init_timers(void)
	init_timer_stats();

	BUG_ON(err != NOTIFY_OK);

#ifdef CONFIG_SMP
	/*
	 * initialize cpu unbound deferrable timer base only when CONFIG_SMP.
	 * UP kernel handles the timers with cpu 0 timer base.
	 */
	err = init_timers_cpu(NR_CPUS);
	BUG_ON(err);
#endif

	register_cpu_notifier(&timers_nb);
	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}