Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 12f5f524 authored by Paul E. McKenney's avatar Paul E. McKenney Committed by Paul E. McKenney
Browse files

rcu: merge TREE_PREEPT_RCU blocked_tasks[] lists



Combine the current TREE_PREEMPT_RCU ->blocked_tasks[] lists in the
rcu_node structure into a single ->blkd_tasks list with ->gp_tasks
and ->exp_tasks tail pointers.  This is in preparation for RCU priority
boosting, which will add a third dimension to the combinatorial explosion
in the ->blocked_tasks[] case, but simply a third pointer in the new
->blkd_tasks case.

Also update documentation to reflect blocked_tasks[] merge

Signed-off-by: default avatarPaul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: default avatarPaul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: default avatarJosh Triplett <josh@joshtriplett.org>
parent e59fb312
Loading
Loading
Loading
Loading
+18 −11
Original line number Diff line number Diff line
@@ -166,14 +166,14 @@ o "gpnum" is the number of grace periods that have started. It is
The output of "cat rcu/rcuhier" looks as follows, with very long lines:

c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
1/1 .>. 0:127 ^0    
3/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3    
3/3f .>. 0:5 ^0    2/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3    
1/1 ..>. 0:127 ^0
3/3 ..>. 0:35 ^0    0/0 ..>. 36:71 ^1    0/0 ..>. 72:107 ^2    0/0 ..>. 108:127 ^3
3/3f ..>. 0:5 ^0    2/3 ..>. 6:11 ^1    0/0 ..>. 12:17 ^2    0/0 ..>. 18:23 ^3    0/0 ..>. 24:29 ^4    0/0 ..>. 30:35 ^5    0/0 ..>. 36:41 ^0    0/0 ..>. 42:47 ^1    0/0 ..>. 48:53 ^2    0/0 ..>. 54:59 ^3    0/0 ..>. 60:65 ^4    0/0 ..>. 66:71 ^5    0/0 ..>. 72:77 ^0    0/0 ..>. 78:83 ^1    0/0 ..>. 84:89 ^2    0/0 ..>. 90:95 ^3    0/0 ..>. 96:101 ^4    0/0 ..>. 102:107 ^5    0/0 ..>. 108:113 ^0    0/0 ..>. 114:119 ^1    0/0 ..>. 120:125 ^2    0/0 ..>. 126:127 ^3
rcu_bh:
c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
0/1 .>. 0:127 ^0    
0/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3    
0/3f .>. 0:5 ^0    0/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3
0/1 ..>. 0:127 ^0
0/3 ..>. 0:35 ^0    0/0 ..>. 36:71 ^1    0/0 ..>. 72:107 ^2    0/0 ..>. 108:127 ^3
0/3f ..>. 0:5 ^0    0/3 ..>. 6:11 ^1    0/0 ..>. 12:17 ^2    0/0 ..>. 18:23 ^3    0/0 ..>. 24:29 ^4    0/0 ..>. 30:35 ^5    0/0 ..>. 36:41 ^0    0/0 ..>. 42:47 ^1    0/0 ..>. 48:53 ^2    0/0 ..>. 54:59 ^3    0/0 ..>. 60:65 ^4    0/0 ..>. 66:71 ^5    0/0 ..>. 72:77 ^0    0/0 ..>. 78:83 ^1    0/0 ..>. 84:89 ^2    0/0 ..>. 90:95 ^3    0/0 ..>. 96:101 ^4    0/0 ..>. 102:107 ^5    0/0 ..>. 108:113 ^0    0/0 ..>. 114:119 ^1    0/0 ..>. 120:125 ^2    0/0 ..>. 126:127 ^3

This is once again split into "rcu_sched" and "rcu_bh" portions,
and CONFIG_TREE_PREEMPT_RCU kernels will again have an additional
@@ -232,13 +232,20 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
		current grace period.

	o	The characters separated by the ">" indicate the state
		of the blocked-tasks lists.  A "T" preceding the ">"
		of the blocked-tasks lists.  A "G" preceding the ">"
		indicates that at least one task blocked in an RCU
		read-side critical section blocks the current grace
		period, while a "." preceding the ">" indicates otherwise.
		The character following the ">" indicates similarly for
		the next grace period.  A "T" should appear in this
		field only for rcu-preempt.
		period, while a "E" preceding the ">" indicates that
		at least one task blocked in an RCU read-side critical
		section blocks the current expedited grace period.
		A "T" character following the ">" indicates that at
		least one task is blocked within an RCU read-side
		critical section, regardless of whether any current
		grace period (expedited or normal) is inconvenienced.
		A "." character appears if the corresponding condition
		does not hold, so that "..>." indicates that no tasks
		are blocked.  In contrast, "GE>T" indicates maximal
		inconvenience from blocked tasks.

	o	The numbers separated by the ":" are the range of CPUs
		served by this struct rcu_node.  This can be helpful
+1 −4
Original line number Diff line number Diff line
@@ -1901,10 +1901,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
					      j / rsp->levelspread[i - 1];
			}
			rnp->level = i;
			INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
			INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
			INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
			INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
			INIT_LIST_HEAD(&rnp->blkd_tasks);
		}
	}

+15 −6
Original line number Diff line number Diff line
@@ -107,7 +107,7 @@ struct rcu_node {
				/*  an rcu_data structure, otherwise, each */
				/*  bit corresponds to a child rcu_node */
				/*  structure. */
	unsigned long expmask;	/* Groups that have ->blocked_tasks[] */
	unsigned long expmask;	/* Groups that have ->blkd_tasks */
				/*  elements that need to drain to allow the */
				/*  current expedited grace period to */
				/*  complete (only for TREE_PREEMPT_RCU). */
@@ -120,11 +120,20 @@ struct rcu_node {
	u8	grpnum;		/* CPU/group number for next level up. */
	u8	level;		/* root is at level 0. */
	struct rcu_node *parent;
	struct list_head blocked_tasks[4];
				/* Tasks blocked in RCU read-side critsect. */
				/*  Grace period number (->gpnum) x blocked */
				/*  by tasks on the (x & 0x1) element of the */
				/*  blocked_tasks[] array. */
	struct list_head blkd_tasks;
				/* Tasks blocked in RCU read-side critical */
				/*  section.  Tasks are placed at the head */
				/*  of this list and age towards the tail. */
	struct list_head *gp_tasks;
				/* Pointer to the first task blocking the */
				/*  current grace period, or NULL if there */
				/*  is no such task. */
	struct list_head *exp_tasks;
				/* Pointer to the first task blocking the */
				/*  current expedited grace period, or NULL */
				/*  if there is no such task.  If there */
				/*  is no current expedited grace period, */
				/*  then there can cannot be any such task. */
} ____cacheline_internodealigned_in_smp;

/*
+97 −66
Original line number Diff line number Diff line
@@ -130,12 +130,12 @@ static void rcu_preempt_qs(int cpu)
 * We have entered the scheduler, and the current task might soon be
 * context-switched away from.  If this task is in an RCU read-side
 * critical section, we will no longer be able to rely on the CPU to
 * record that fact, so we enqueue the task on the appropriate entry
 * of the blocked_tasks[] array.  The task will dequeue itself when
 * it exits the outermost enclosing RCU read-side critical section.
 * Therefore, the current grace period cannot be permitted to complete
 * until the blocked_tasks[] entry indexed by the low-order bit of
 * rnp->gpnum empties.
 * record that fact, so we enqueue the task on the blkd_tasks list.
 * The task will dequeue itself when it exits the outermost enclosing
 * RCU read-side critical section.  Therefore, the current grace period
 * cannot be permitted to complete until the blkd_tasks list entries
 * predating the current grace period drain, in other words, until
 * rnp->gp_tasks becomes NULL.
 *
 * Caller must disable preemption.
 */
@@ -143,7 +143,6 @@ static void rcu_preempt_note_context_switch(int cpu)
{
	struct task_struct *t = current;
	unsigned long flags;
	int phase;
	struct rcu_data *rdp;
	struct rcu_node *rnp;

@@ -165,15 +164,26 @@ static void rcu_preempt_note_context_switch(int cpu)
		 * (i.e., this CPU has not yet passed through a quiescent
		 * state for the current grace period), then as long
		 * as that task remains queued, the current grace period
		 * cannot end.
		 * cannot end.  Note that there is some uncertainty as
		 * to exactly when the current grace period started.
		 * We take a conservative approach, which can result
		 * in unnecessarily waiting on tasks that started very
		 * slightly after the current grace period began.  C'est
		 * la vie!!!
		 *
		 * But first, note that the current CPU must still be
		 * on line!
		 */
		WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
		WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
		phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
		list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
		if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
			list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
			rnp->gp_tasks = &t->rcu_node_entry;
		} else {
			list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
			if (rnp->qsmask & rdp->grpmask)
				rnp->gp_tasks = &t->rcu_node_entry;
		}
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
	}

@@ -210,10 +220,7 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
 */
static int rcu_preempted_readers(struct rcu_node *rnp)
{
	int phase = rnp->gpnum & 0x1;

	return !list_empty(&rnp->blocked_tasks[phase]) ||
	       !list_empty(&rnp->blocked_tasks[phase + 2]);
	return rnp->gp_tasks != NULL;
}

/*
@@ -252,6 +259,21 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
	rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
}

/*
 * Advance a ->blkd_tasks-list pointer to the next entry, instead
 * returning NULL if at the end of the list.
 */
static struct list_head *rcu_next_node_entry(struct task_struct *t,
					     struct rcu_node *rnp)
{
	struct list_head *np;

	np = t->rcu_node_entry.next;
	if (np == &rnp->blkd_tasks)
		np = NULL;
	return np;
}

/*
 * Handle special cases during rcu_read_unlock(), such as needing to
 * notify RCU core processing or task having blocked during the RCU
@@ -262,6 +284,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
	int empty;
	int empty_exp;
	unsigned long flags;
	struct list_head *np;
	struct rcu_node *rnp;
	int special;

@@ -305,7 +328,12 @@ static void rcu_read_unlock_special(struct task_struct *t)
		empty = !rcu_preempted_readers(rnp);
		empty_exp = !rcu_preempted_readers_exp(rnp);
		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
		np = rcu_next_node_entry(t, rnp);
		list_del_init(&t->rcu_node_entry);
		if (&t->rcu_node_entry == rnp->gp_tasks)
			rnp->gp_tasks = np;
		if (&t->rcu_node_entry == rnp->exp_tasks)
			rnp->exp_tasks = np;
		t->rcu_blocked_node = NULL;

		/*
@@ -361,19 +389,17 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock);
static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
{
	unsigned long flags;
	struct list_head *lp;
	int phase;
	struct task_struct *t;

	if (rcu_preempted_readers(rnp)) {
	if (!rcu_preempted_readers(rnp))
		return;
	raw_spin_lock_irqsave(&rnp->lock, flags);
		phase = rnp->gpnum & 0x1;
		lp = &rnp->blocked_tasks[phase];
		list_for_each_entry(t, lp, rcu_node_entry)
	t = list_entry(rnp->gp_tasks,
		       struct task_struct, rcu_node_entry);
	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
		sched_show_task(t);
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
}

/*
 * Dump detailed information for all tasks blocking the current RCU
@@ -402,17 +428,15 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
 */
static void rcu_print_task_stall(struct rcu_node *rnp)
{
	struct list_head *lp;
	int phase;
	struct task_struct *t;

	if (rcu_preempted_readers(rnp)) {
		phase = rnp->gpnum & 0x1;
		lp = &rnp->blocked_tasks[phase];
		list_for_each_entry(t, lp, rcu_node_entry)
	if (!rcu_preempted_readers(rnp))
		return;
	t = list_entry(rnp->gp_tasks,
		       struct task_struct, rcu_node_entry);
	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
		printk(" P%d", t->pid);
}
}

/*
 * Suppress preemptible RCU's CPU stall warnings by pushing the
@@ -430,10 +454,15 @@ static void rcu_preempt_stall_reset(void)
 * period that still has RCU readers blocked!  This function must be
 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
 * must be held by the caller.
 *
 * Also, if there are blocked tasks on the list, they automatically
 * block the newly created grace period, so set up ->gp_tasks accordingly.
 */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
	WARN_ON_ONCE(rcu_preempted_readers(rnp));
	if (!list_empty(&rnp->blkd_tasks))
		rnp->gp_tasks = rnp->blkd_tasks.next;
	WARN_ON_ONCE(rnp->qsmask);
}

@@ -457,45 +486,49 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
				     struct rcu_node *rnp,
				     struct rcu_data *rdp)
{
	int i;
	struct list_head *lp;
	struct list_head *lp_root;
	int retval = 0;
	struct rcu_node *rnp_root = rcu_get_root(rsp);
	struct task_struct *tp;
	struct task_struct *t;

	if (rnp == rnp_root) {
		WARN_ONCE(1, "Last CPU thought to be offlined?");
		return 0;  /* Shouldn't happen: at least one CPU online. */
	}
	WARN_ON_ONCE(rnp != rdp->mynode &&
		     (!list_empty(&rnp->blocked_tasks[0]) ||
		      !list_empty(&rnp->blocked_tasks[1]) ||
		      !list_empty(&rnp->blocked_tasks[2]) ||
		      !list_empty(&rnp->blocked_tasks[3])));

	/* If we are on an internal node, complain bitterly. */
	WARN_ON_ONCE(rnp != rdp->mynode);

	/*
	 * Move tasks up to root rcu_node.  Rely on the fact that the
	 * root rcu_node can be at most one ahead of the rest of the
	 * rcu_nodes in terms of gp_num value.  This fact allows us to
	 * move the blocked_tasks[] array directly, element by element.
	 * Move tasks up to root rcu_node.  Don't try to get fancy for
	 * this corner-case operation -- just put this node's tasks
	 * at the head of the root node's list, and update the root node's
	 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
	 * if non-NULL.  This might result in waiting for more tasks than
	 * absolutely necessary, but this is a good performance/complexity
	 * tradeoff.
	 */
	if (rcu_preempted_readers(rnp))
		retval |= RCU_OFL_TASKS_NORM_GP;
	if (rcu_preempted_readers_exp(rnp))
		retval |= RCU_OFL_TASKS_EXP_GP;
	for (i = 0; i < 4; i++) {
		lp = &rnp->blocked_tasks[i];
		lp_root = &rnp_root->blocked_tasks[i];
	lp = &rnp->blkd_tasks;
	lp_root = &rnp_root->blkd_tasks;
	while (!list_empty(lp)) {
			tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
		t = list_entry(lp->next, typeof(*t), rcu_node_entry);
		raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
			list_del(&tp->rcu_node_entry);
			tp->rcu_blocked_node = rnp_root;
			list_add(&tp->rcu_node_entry, lp_root);
			raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */
		}
	}
		list_del(&t->rcu_node_entry);
		t->rcu_blocked_node = rnp_root;
		list_add(&t->rcu_node_entry, lp_root);
		if (&t->rcu_node_entry == rnp->gp_tasks)
			rnp_root->gp_tasks = rnp->gp_tasks;
		if (&t->rcu_node_entry == rnp->exp_tasks)
			rnp_root->exp_tasks = rnp->exp_tasks;
		raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
	}
	rnp->gp_tasks = NULL;
	rnp->exp_tasks = NULL;
	return retval;
}

@@ -586,8 +619,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
 */
static int rcu_preempted_readers_exp(struct rcu_node *rnp)
{
	return !list_empty(&rnp->blocked_tasks[2]) ||
	       !list_empty(&rnp->blocked_tasks[3]);
	return rnp->exp_tasks != NULL;
}

/*
@@ -647,12 +679,13 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
static void
sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
{
	int must_wait;
	int must_wait = 0;

	raw_spin_lock(&rnp->lock); /* irqs already disabled */
	list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
	list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
	must_wait = rcu_preempted_readers_exp(rnp);
	if (!list_empty(&rnp->blkd_tasks)) {
		rnp->exp_tasks = rnp->blkd_tasks.next;
		must_wait = 1;
	}
	raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
	if (!must_wait)
		rcu_report_exp_rnp(rsp, rnp);
@@ -661,9 +694,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
/*
 * Wait for an rcu-preempt grace period, but expedite it.  The basic idea
 * is to invoke synchronize_sched_expedited() to push all the tasks to
 * the ->blocked_tasks[] lists, move all entries from the first set of
 * ->blocked_tasks[] lists to the second set, and finally wait for this
 * second set to drain.
 * the ->blkd_tasks lists and wait for this list to drain.
 */
void synchronize_rcu_expedited(void)
{
@@ -695,7 +726,7 @@ void synchronize_rcu_expedited(void)
	if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
		goto unlock_mb_ret; /* Others did our work for us. */

	/* force all RCU readers onto blocked_tasks[]. */
	/* force all RCU readers onto ->blkd_tasks lists. */
	synchronize_sched_expedited();

	raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -707,7 +738,7 @@ void synchronize_rcu_expedited(void)
		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
	}

	/* Snapshot current state of ->blocked_tasks[] lists. */
	/* Snapshot current state of ->blkd_tasks lists. */
	rcu_for_each_leaf_node(rsp, rnp)
		sync_rcu_preempt_exp_init(rsp, rnp);
	if (NUM_RCU_NODES > 1)
@@ -715,7 +746,7 @@ void synchronize_rcu_expedited(void)

	raw_spin_unlock_irqrestore(&rsp->onofflock, flags);

	/* Wait for snapshotted ->blocked_tasks[] lists to drain. */
	/* Wait for snapshotted ->blkd_tasks lists to drain. */
	rnp = rcu_get_root(rsp);
	wait_event(sync_rcu_preempt_exp_wq,
		   sync_rcu_preempt_exp_done(rnp));
+4 −7
Original line number Diff line number Diff line
@@ -161,7 +161,6 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
{
	unsigned long gpnum;
	int level = 0;
	int phase;
	struct rcu_node *rnp;

	gpnum = rsp->gpnum;
@@ -178,13 +177,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
			seq_puts(m, "\n");
			level = rnp->level;
		}
		phase = gpnum & 0x1;
		seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d    ",
		seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d    ",
			   rnp->qsmask, rnp->qsmaskinit,
			   "T."[list_empty(&rnp->blocked_tasks[phase])],
			   "E."[list_empty(&rnp->blocked_tasks[phase + 2])],
			   "T."[list_empty(&rnp->blocked_tasks[!phase])],
			   "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
			   ".G"[rnp->gp_tasks != NULL],
			   ".E"[rnp->exp_tasks != NULL],
			   ".T"[!list_empty(&rnp->blkd_tasks)],
			   rnp->grplo, rnp->grphi, rnp->grpnum);
	}
	seq_puts(m, "\n");