
Commit 385b73c0 authored by Paul E. McKenney

rcu: Get rid of synchronize_sched_expedited()'s polling loop



This commit gets rid of synchronize_sched_expedited()'s mutex_trylock()
polling loop in favor of a funnel-locking scheme based on the rcu_node
tree.  The work-done check is done at each level of the tree, allowing
high-contention situations to be resolved quickly with reasonable levels
of mutex contention.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
parent d6ada2cf
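
The funnel-locking scheme described in the commit message can be illustrated with a small, self-contained user-space sketch: each waiter walks from its leaf node toward the root, taking each node's mutex and dropping the previous one, and bails out at every level if a shared completion counter shows that someone else has already done the work.  Pthread mutexes stand in for ->exp_funnel_mutex and a plain counter for ->expedited_sequence; the node layout and names below are illustrative, not the kernel code.

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct funnel_node {
	struct funnel_node *parent;		/* NULL at the root */
	pthread_mutex_t lock;			/* per-level funnel mutex */
};

static unsigned long completed_seq;		/* grace periods already finished */

/* True if a grace period at least as new as @snap has already completed. */
static bool work_already_done(unsigned long snap)
{
	unsigned long seq = __atomic_load_n(&completed_seq, __ATOMIC_ACQUIRE);

	return (long)(seq - snap) >= 0;		/* wrap-safe, like ULONG_CMP_GE() */
}

/*
 * Walk from @leaf toward the root.  Return the locked root if this caller
 * must do the work itself, or NULL (with no lock held) if someone else
 * already did it.
 */
static struct funnel_node *funnel_lock(struct funnel_node *leaf, unsigned long snap)
{
	struct funnel_node *node, *held = NULL;

	for (node = leaf; node; node = node->parent) {
		/* Check for work done before contending for the next level. */
		if (work_already_done(snap)) {
			if (held)
				pthread_mutex_unlock(&held->lock);
			return NULL;
		}
		pthread_mutex_lock(&node->lock);
		if (held)
			pthread_mutex_unlock(&held->lock);
		held = node;
	}
	return held;				/* caller now holds the root's lock */
}

Contention on a lower-level mutex is itself a strong hint that another waiter is already pushing a grace period through, so by the time that mutex is finally acquired the work-done check usually succeeds and the later waiter drops out early instead of piling up on the root.
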
+40 −55
@@ -70,6 +70,7 @@ MODULE_ALIAS("rcutree");

static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];

/*
 * In order to export the rcu_state name to the tracing tools, it
@@ -103,7 +104,6 @@ struct rcu_state sname##_state = { \
	.orphan_nxttail = &sname##_state.orphan_nxtlist, \
	.orphan_donetail = &sname##_state.orphan_donelist, \
	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
	.expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
	.name = RCU_STATE_NAME(sname), \
	.abbr = sabbr, \
}
@@ -3272,6 +3272,22 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
	return 0;
}

/* Common code for synchronize_sched_expedited() work-done checking. */
static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp,
			      atomic_long_t *stat, unsigned long s)
{
	if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) {
		if (rnp)
			mutex_unlock(&rnp->exp_funnel_mutex);
		/* Ensure test happens before caller kfree(). */
		smp_mb__before_atomic(); /* ^^^ */
		atomic_long_inc(stat);
		put_online_cpus();
		return true;
	}
	return false;
}

/**
 * synchronize_sched_expedited - Brute-force RCU-sched grace period
 *
@@ -3286,15 +3302,15 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
 * This implementation can be thought of as an application of sequence
 * locking to expedited grace periods, but using the sequence counter to
 * determine when someone else has already done the work instead of for
 * retrying readers.  We do a mutex_trylock() polling loop, but if we fail
 * too many times in a row, we fall back to synchronize_sched().
 * retrying readers.
 */
void synchronize_sched_expedited(void)
{
	int cpu;
	long s;
	int trycount = 0;
	struct rcu_state *rsp = &rcu_sched_state;
	struct rcu_node *rnp0;
	struct rcu_node *rnp1 = NULL;

	/* Take a snapshot of the sequence number.  */
	smp_mb(); /* Caller's modifications seen first by other CPUs. */
@@ -3310,60 +3326,25 @@ void synchronize_sched_expedited(void)
	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));

	/*
	 * Each pass through the following loop attempts to acquire
	 * ->expedited_mutex, checking for others doing our work each time.
	 */
	while (!mutex_trylock(&rsp->expedited_mutex)) {
		put_online_cpus();
		atomic_long_inc(&rsp->expedited_tryfail);

		/* Check to see if someone else did our work for us. */
		if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) {
			/* ensure test happens before caller kfree */
			smp_mb__before_atomic(); /* ^^^ */
			atomic_long_inc(&rsp->expedited_workdone1);
			return;
		}

		/* No joy, try again later.  Or just synchronize_sched(). */
		if (trycount++ < 10) {
			udelay(trycount * num_online_cpus());
		} else {
			wait_rcu_gp(call_rcu_sched);
			atomic_long_inc(&rsp->expedited_normal);
			return;
		}

		/* Recheck to see if someone else did our work for us. */
		if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) {
			/* ensure test happens before caller kfree */
			smp_mb__before_atomic(); /* ^^^ */
			atomic_long_inc(&rsp->expedited_workdone2);
			return;
		}

		/*
		 * Refetching sync_sched_expedited_started allows later
		 * callers to piggyback on our grace period.  We retry
		 * after they started, so our grace period works for them,
		 * and they started after our first try, so their grace
		 * period works for us.
	 * Each pass through the following loop works its way
	 * up the rcu_node tree, returning if others have done the
	 * work or otherwise falls through holding the root rnp's
	 * ->exp_funnel_mutex.  The mapping from CPU to rcu_node structure
	 * can be inexact, as it is just promoting locality and is not
	 * strictly needed for correctness.
	 */
		if (!try_get_online_cpus()) {
			/* CPU hotplug operation in flight, use normal GP. */
			wait_rcu_gp(call_rcu_sched);
			atomic_long_inc(&rsp->expedited_normal);
	rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
	for (; rnp0 != NULL; rnp0 = rnp0->parent) {
		if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s))
			return;
		mutex_lock(&rnp0->exp_funnel_mutex);
		if (rnp1)
			mutex_unlock(&rnp1->exp_funnel_mutex);
		rnp1 = rnp0;
	}
	}

	/* Recheck yet again to see if someone else did our work for us. */
	if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) {
		rsp->expedited_workdone3++;
		mutex_unlock(&rsp->expedited_mutex);
		smp_mb(); /* ensure test happens before caller kfree */
	rnp0 = rnp1;  /* rcu_get_root(rsp), AKA root rcu_node structure. */
	if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s))
		return;
	}

	WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1);
	smp_mb(); /* Ensure expedited GP seen after counter increment. */
@@ -3383,7 +3364,7 @@ void synchronize_sched_expedited(void)
	smp_mb(); /* Ensure expedited GP seen before counter increment. */
	WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1);
	WARN_ON_ONCE(rsp->expedited_sequence & 0x1);
	mutex_unlock(&rsp->expedited_mutex);
	mutex_unlock(&rnp0->exp_funnel_mutex);
	smp_mb(); /* ensure subsequent action seen after grace period. */

	put_online_cpus();
@@ -3940,6 +3921,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
{
	static const char * const buf[] = RCU_NODE_NAME_INIT;
	static const char * const fqs[] = RCU_FQS_NAME_INIT;
	static const char * const exp[] = RCU_EXP_NAME_INIT;
	static u8 fl_mask = 0x1;

	int levelcnt[RCU_NUM_LVLS];		/* # nodes in each level. */
@@ -3998,6 +3980,9 @@ static void __init rcu_init_one(struct rcu_state *rsp,
			rnp->level = i;
			INIT_LIST_HEAD(&rnp->blkd_tasks);
			rcu_init_one_nocb(rnp);
			mutex_init(&rnp->exp_funnel_mutex);
			lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
						   &rcu_exp_class[i], exp[i]);
		}
	}
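
The ->expedited_sequence counter used above follows an even/odd convention: it is bumped to an odd value when an expedited grace period starts and back to an even value when it ends (hence the WARN_ON_ONCE(... & 0x1) above and the division by two in the tracing output below).  A minimal sketch of that bookkeeping, with an assumed snapshot rule, might look like this; it is not the kernel code itself.

#include <stdbool.h>

/* Even: no expedited grace period in flight; odd: one in progress. */
static unsigned long exp_sequence;

/*
 * Value that the sequence counter must reach before the caller's grace
 * period is guaranteed complete.  The "+ 3, then round down to even"
 * rule is an assumption of this sketch: it skips past any grace period
 * already in flight, which cannot cover the caller's updates.
 */
static unsigned long exp_snapshot(void)
{
	return (exp_sequence + 3) & ~0x1UL;
}

/* Has a grace period covering @snap completed?  (Wrap-safe comparison.) */
static bool exp_done(unsigned long snap)
{
	return (long)(exp_sequence - snap) >= 0;
}

static void exp_gp_start(void)	{ exp_sequence++; /* even -> odd */ }
static void exp_gp_end(void)	{ exp_sequence++; /* odd -> even */ }
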

+6 −2
@@ -68,6 +68,7 @@
#  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0 }
#  define RCU_NODE_NAME_INIT  { "rcu_node_0" }
#  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0" }
#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0" }
#elif NR_CPUS <= RCU_FANOUT_2
#  define RCU_NUM_LVLS	      2
#  define NUM_RCU_LVL_0	      1
@@ -76,6 +77,7 @@
#  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
#  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1" }
#  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1" }
#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1" }
#elif NR_CPUS <= RCU_FANOUT_3
#  define RCU_NUM_LVLS	      3
#  define NUM_RCU_LVL_0	      1
@@ -85,6 +87,7 @@
#  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
#  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
#  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
#elif NR_CPUS <= RCU_FANOUT_4
#  define RCU_NUM_LVLS	      4
#  define NUM_RCU_LVL_0	      1
@@ -95,6 +98,7 @@
#  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
#  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
#  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
@@ -237,6 +241,8 @@ struct rcu_node {
	int need_future_gp[2];
				/* Counts of upcoming no-CB GP requests. */
	raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;

	struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
} ____cacheline_internodealigned_in_smp;

/*
@@ -480,12 +486,10 @@ struct rcu_state {
						/*  _rcu_barrier(). */
	/* End of fields guarded by barrier_mutex. */

	struct mutex  expedited_mutex;		/* Serializes expediting. */
	unsigned long expedited_sequence;	/* Take a ticket. */
	atomic_long_t expedited_tryfail;	/* # acquisition failures. */
	atomic_long_t expedited_workdone1;	/* # done by others #1. */
	atomic_long_t expedited_workdone2;	/* # done by others #2. */
	unsigned long expedited_workdone3;	/* # done by others #3. */
	atomic_long_t expedited_normal;		/* # fallbacks to normal. */

	unsigned long jiffies_force_qs;		/* Time at which to invoke */
+1 −2
@@ -185,12 +185,11 @@ static int show_rcuexp(struct seq_file *m, void *v)
{
	struct rcu_state *rsp = (struct rcu_state *)m->private;

	seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu sc=%lu\n",
	seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu\n",
		   rsp->expedited_sequence,
		   atomic_long_read(&rsp->expedited_tryfail),
		   atomic_long_read(&rsp->expedited_workdone1),
		   atomic_long_read(&rsp->expedited_workdone2),
		   rsp->expedited_workdone3,
		   atomic_long_read(&rsp->expedited_normal),
		   rsp->expedited_sequence / 2);
	return 0;