Commit 2133b5d7 authored by Paul E. McKenney, committed by Ingo Molnar

rcu: RCU-based detection of stalled CPUs for Classic RCU



This patch adds stalled-CPU detection to Classic RCU.  This capability
is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which
is disabled by default.
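
As a configuration sketch (not part of the patch text): the new Kconfig
entry at the end of this diff depends on CLASSIC_RCU and defaults to n,
so enabling the detector in a Classic RCU build would look roughly like
this .config fragment:

	CONFIG_CLASSIC_RCU=y
	CONFIG_RCU_CPU_STALL_DETECTOR=y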

This is a debugging feature to detect infinite loops in kernel code, not
something that non-kernel-hackers would be expected to care about.

This feature can detect looping CPUs in !PREEMPT builds and looping CPUs
with preemption disabled in PREEMPT builds.  This is essentially a port of
this functionality from the treercu patch, replacing the stall debug patch
that is already in tip/core/rcu (commit 67182ae1).
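
As a hedged illustration (hypothetical code, not from this patch), the
kind of bug the detector catches is a CPU spinning indefinitely without
passing through a quiescent state, for example:

	/*
	 * Hypothetical example: this CPU loops forever with preemption
	 * disabled, so it never reaches a quiescent state and the detector
	 * reports a stall once RCU_SECONDS_TILL_STALL_CHECK has elapsed.
	 * Assumes <linux/preempt.h> for preempt_disable().
	 */
	static void buggy_spin(void)
	{
		preempt_disable();
		while (1)
			cpu_relax();	/* spins; never yields the CPU */
	}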

The changes from the patch in tip/core/rcu include making the config
variable name match that in treercu, changing from seconds to jiffies to
avoid spurious warnings, and printing a boot message when this feature
is enabled.
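
To make the jiffies-based timing concrete, here is a minimal sketch of
the deadline check the patch adds (the helper name is hypothetical; the
real code below open-codes the comparison):

	/*
	 * Sketch only.  The signed difference is well defined across
	 * jiffies wraparound, and jiffies, unlike get_seconds(), do not
	 * jump when wall-clock time is adjusted -- presumably the source
	 * of the spurious warnings mentioned above.  With HZ=250, the
	 * 3 * HZ initial timeout is 750 jiffies, i.e. about three seconds.
	 */
	static int stall_deadline_passed(struct rcu_ctrlblk *rcp)
	{
		return (long)(jiffies - rcp->jiffies_stall) >= 0;
	}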

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent b5259d94
+9 −3
@@ -40,15 +40,21 @@
#include <linux/cpumask.h>
#include <linux/seqlock.h>

#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
#define RCU_SECONDS_TILL_STALL_CHECK	( 3 * HZ) /* for rcp->jiffies_stall */
#define RCU_SECONDS_TILL_STALL_RECHECK	(30 * HZ) /* for rcp->jiffies_stall */
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */

/* Global control variables for rcupdate callback mechanism. */
struct rcu_ctrlblk {
	long	cur;		/* Current batch number.                      */
	long	completed;	/* Number of the last completed batch         */
	long	pending;	/* Number of the last pending batch           */
#ifdef CONFIG_DEBUG_RCU_STALL
	unsigned long gp_check;	/* Time grace period should end, in seconds.  */
#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
	unsigned long gp_start;	/* Time at which GP started in jiffies. */
	unsigned long jiffies_stall;
				/* Time at which to check for CPU stalls. */
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */

	int	signaled;

+86 −80
@@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
	}
}

#ifdef CONFIG_RCU_CPU_STALL_DETECTOR

static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
{
	rcp->gp_start = jiffies;
	rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
}

static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
{
	int cpu;
	long delta;
	unsigned long flags;

	/* Only let one CPU complain about others per time interval. */

	spin_lock_irqsave(&rcp->lock, flags);
	delta = jiffies - rcp->jiffies_stall;
	if (delta < 2 || rcp->cur != rcp->completed) {
		spin_unlock_irqrestore(&rcp->lock, flags);
		return;
	}
	rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
	spin_unlock_irqrestore(&rcp->lock, flags);

	/* OK, time to rat on our buddy... */

	printk(KERN_ERR "RCU detected CPU stalls:");
	for_each_possible_cpu(cpu) {
		if (cpu_isset(cpu, rcp->cpumask))
			printk(" %d", cpu);
	}
	printk(" (detected by %d, t=%ld jiffies)\n",
	       smp_processor_id(), (long)(jiffies - rcp->gp_start));
}

static void print_cpu_stall(struct rcu_ctrlblk *rcp)
{
	unsigned long flags;

	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
			smp_processor_id(), jiffies,
			jiffies - rcp->gp_start);
	dump_stack();
	spin_lock_irqsave(&rcp->lock, flags);
	if ((long)(jiffies - rcp->jiffies_stall) >= 0)
		rcp->jiffies_stall =
			jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
	spin_unlock_irqrestore(&rcp->lock, flags);
	set_need_resched();  /* kick ourselves to get things going. */
}

static void check_cpu_stall(struct rcu_ctrlblk *rcp)
{
	long delta;

	delta = jiffies - rcp->jiffies_stall;
	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {

		/* We haven't checked in, so go dump stack. */
		print_cpu_stall(rcp);

	} else if (rcp->cur != rcp->completed && delta >= 2) {

		/* They had two seconds to dump stack, so complain. */
		print_other_cpu_stall(rcp);
	}
}

#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */

static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
{
}

static void check_cpu_stall(struct rcu_ctrlblk *rcp)
{
}

#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */

/**
 * call_rcu - Queue an RCU callback for invocation after a grace period.
 * @head: structure to be used for queueing the RCU updates.
@@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
 *   period (if necessary).
 */

#ifdef CONFIG_DEBUG_RCU_STALL

static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
{
	rcp->gp_check = get_seconds() + 3;
}

static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
{
	int cpu;
	long delta;
	unsigned long flags;

	/* Only let one CPU complain about others per time interval. */

	spin_lock_irqsave(&rcp->lock, flags);
	delta = get_seconds() - rcp->gp_check;
	if (delta < 2L || cpus_empty(rcp->cpumask)) {
		spin_unlock(&rcp->lock);
		return;
	}
	rcp->gp_check = get_seconds() + 30;
	spin_unlock_irqrestore(&rcp->lock, flags);

	/* OK, time to rat on our buddy... */

	printk(KERN_ERR "RCU detected CPU stalls:");
	for_each_cpu_mask(cpu, rcp->cpumask)
		printk(" %d", cpu);
	printk(" (detected by %d, t=%lu/%lu)\n",
	       smp_processor_id(), get_seconds(), rcp->gp_check);
}

static void print_cpu_stall(struct rcu_ctrlblk *rcp)
{
	unsigned long flags;

	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
			smp_processor_id(), get_seconds(), rcp->gp_check);
	dump_stack();
	spin_lock_irqsave(&rcp->lock, flags);
	if ((long)(get_seconds() - rcp->gp_check) >= 0L)
		rcp->gp_check = get_seconds() + 30;
	spin_unlock_irqrestore(&rcp->lock, flags);
}

static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
	long delta;

	delta = get_seconds() - rcp->gp_check;
	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) {

		/* We haven't checked in, so go dump stack. */

		print_cpu_stall(rcp);

	} else {
		if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
			/* They had two seconds to dump stack, so complain. */
			print_other_cpu_stall(rcp);
		}
	}
}

#else /* #ifdef CONFIG_DEBUG_RCU_STALL */

static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
{
}

static inline void
check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
}

#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */

/*
 * Register a new batch of callbacks, and start it up if there is currently no
 * active batch and the batch to be registered has not already occurred.
@@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
	if (rcp->cur != rcp->pending &&
			rcp->completed == rcp->cur) {
		rcp->cur++;
		record_gp_check_time(rcp);
		record_gp_stall_check_time(rcp);

		/*
		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
	/* Check for CPU stalls, if enabled. */
	check_cpu_stall(rcp, rdp);
	check_cpu_stall(rcp);

	if (rdp->nxtlist) {
		long completed_snap = ACCESS_ONCE(rcp->completed);
@@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
 */
void __init __rcu_init(void)
{
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	/* Register notifier for non-boot CPUs */
+1 −1
@@ -597,7 +597,7 @@ config RCU_TORTURE_TEST_RUNNABLE
	  Say N here if you want the RCU torture tests to start only
	  after being manually enabled via /proc.

config RCU_CPU_STALL
config RCU_CPU_STALL_DETECTOR
	bool "Check for stalled CPUs delaying RCU grace periods"
	depends on CLASSIC_RCU
	default n