Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1f7dd3e5 authored by Tejun Heo
Browse files

cgroup: fix handling of multi-destination migration from subtree_control enabling



Consider the following v2 hierarchy.

  P0 (+memory) --- P1 (-memory) --- A
                                 \- B
       
P0 has memory enabled in its subtree_control while P1 doesn't.  If
both A and B contain processes, they would belong to the memory css of
P1.  Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes to the latter.  IOW, enabling controllers
can cause atomic migrations into different csses.

The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses.  pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.

 WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
 Modules linked in:
 CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
 ...
  ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
  ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
  ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
 Call Trace:
  [<ffffffff81551ffc>] dump_stack+0x4e/0x82
  [<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
  [<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
  [<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
  [<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
  [<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
  [<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
  [<ffffffff81189016>] cgroup_attach_task+0x176/0x200
  [<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
  [<ffffffff81189684>] cgroup_procs_write+0x14/0x20
  [<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
  [<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
  [<ffffffff81265f88>] __vfs_write+0x28/0xe0
  [<ffffffff812666fc>] vfs_write+0xac/0x1a0
  [<ffffffff81267019>] SyS_write+0x49/0xb0
  [<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76

This patch fixes the bug by removing the @css parameter from the three
migration methods, ->can_attach(), ->cancel_attach() and ->attach(), and
updating the cgroup_taskset iteration helpers to also return the
destination css in addition to the task being migrated.  All controllers
are updated accordingly.

* Controllers which don't care whether there are one or multiple
  target csses can be converted trivially.  cpu, io, freezer, perf,
  netclassid and netprio fall in this category.

* cpuset's current implementation assumes that there's a single source
  and destination and thus already doesn't support the v2 hierarchy.
  The only change made by this patchset is how that single destination
  css is obtained.

* memory migration path already doesn't do anything on v2.  How the
  single destination css is obtained is updated and the prep stage of
  mem_cgroup_can_attach() is reordered to accommodate the change.

* pids is the only controller which was affected by this bug.  It now
  correctly handles multi-destination migrations and no longer causes
  counter underflow from incorrect accounting.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
parent 599c963a
Loading
Loading
Loading
Loading
+3 −3
Original line number Original line Diff line number Diff line
@@ -1127,15 +1127,15 @@ void blkcg_exit_queue(struct request_queue *q)
 * of the main cic data structures.  For now we allow a task to change
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 * its cgroup only if it's the only owner of its ioc.
 */
 */
static int blkcg_can_attach(struct cgroup_subsys_state *css,
static int blkcg_can_attach(struct cgroup_taskset *tset)
			    struct cgroup_taskset *tset)
{
{
	struct task_struct *task;
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;
	struct io_context *ioc;
	struct io_context *ioc;
	int ret = 0;
	int ret = 0;


	/* task_lock() is needed to avoid races with exit_io_context() */
	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, tset) {
	cgroup_taskset_for_each(task, dst_css, tset) {
		task_lock(task);
		task_lock(task);
		ioc = task->io_context;
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
+3 −6
Original line number Original line Diff line number Diff line
@@ -422,12 +422,9 @@ struct cgroup_subsys {
	void (*css_reset)(struct cgroup_subsys_state *css);
	void (*css_reset)(struct cgroup_subsys_state *css);
	void (*css_e_css_changed)(struct cgroup_subsys_state *css);
	void (*css_e_css_changed)(struct cgroup_subsys_state *css);


	int (*can_attach)(struct cgroup_subsys_state *css,
	int (*can_attach)(struct cgroup_taskset *tset);
			  struct cgroup_taskset *tset);
	void (*cancel_attach)(struct cgroup_taskset *tset);
	void (*cancel_attach)(struct cgroup_subsys_state *css,
	void (*attach)(struct cgroup_taskset *tset);
			      struct cgroup_taskset *tset);
	void (*attach)(struct cgroup_subsys_state *css,
		       struct cgroup_taskset *tset);
	int (*can_fork)(struct task_struct *task, void **priv_p);
	int (*can_fork)(struct task_struct *task, void **priv_p);
	void (*cancel_fork)(struct task_struct *task, void *priv);
	void (*cancel_fork)(struct task_struct *task, void *priv);
	void (*fork)(struct task_struct *task, void *priv);
	void (*fork)(struct task_struct *task, void *priv);
+22 −11
Original line number Original line Diff line number Diff line
@@ -120,8 +120,10 @@ struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state
struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
						     struct cgroup_subsys_state *css);
						     struct cgroup_subsys_state *css);


struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
					 struct cgroup_subsys_state **dst_cssp);
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
					struct cgroup_subsys_state **dst_cssp);


void css_task_iter_start(struct cgroup_subsys_state *css,
void css_task_iter_start(struct cgroup_subsys_state *css,
			 struct css_task_iter *it);
			 struct css_task_iter *it);
@@ -236,30 +238,39 @@ void css_task_iter_end(struct css_task_iter *it);
/**
/**
 * cgroup_taskset_for_each - iterate cgroup_taskset
 * cgroup_taskset_for_each - iterate cgroup_taskset
 * @task: the loop cursor
 * @task: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 * @tset: taskset to iterate
 *
 *
 * @tset may contain multiple tasks and they may belong to multiple
 * @tset may contain multiple tasks and they may belong to multiple
 * processes.  When there are multiple tasks in @tset, if a task of a
 * processes.
 * process is in @tset, all tasks of the process are in @tset.  Also, all
 *
 * are guaranteed to share the same source and destination csses.
 * On the v2 hierarchy, there may be tasks from multiple processes and they
 * may not share the source or destination csses.
 *
 * On traditional hierarchies, when there are multiple tasks in @tset, if a
 * task of a process is in @tset, all tasks of the process are in @tset.
 * Also, all are guaranteed to share the same source and destination csses.
 *
 *
 * Iteration is not in any specific order.
 * Iteration is not in any specific order.
 */
 */
#define cgroup_taskset_for_each(task, tset)				\
#define cgroup_taskset_for_each(task, dst_css, tset)			\
	for ((task) = cgroup_taskset_first((tset)); (task);		\
	for ((task) = cgroup_taskset_first((tset), &(dst_css));		\
	     (task) = cgroup_taskset_next((tset)))
	     (task);							\
	     (task) = cgroup_taskset_next((tset), &(dst_css)))


/**
/**
 * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
 * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
 * @leader: the loop cursor
 * @leader: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 * @tset: taskset to iterate
 *
 *
 * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
 * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
 * may not contain any.
 * may not contain any.
 */
 */
#define cgroup_taskset_for_each_leader(leader, tset)			\
#define cgroup_taskset_for_each_leader(leader, dst_css, tset)		\
	for ((leader) = cgroup_taskset_first((tset)); (leader);		\
	for ((leader) = cgroup_taskset_first((tset), &(dst_css));	\
	     (leader) = cgroup_taskset_next((tset)))			\
	     (leader);							\
	     (leader) = cgroup_taskset_next((tset), &(dst_css)))	\
		if ((leader) != (leader)->group_leader)			\
		if ((leader) != (leader)->group_leader)			\
			;						\
			;						\
		else
		else
+34 −9
Original line number Original line Diff line number Diff line
@@ -2237,6 +2237,9 @@ struct cgroup_taskset {
	struct list_head	src_csets;
	struct list_head	src_csets;
	struct list_head	dst_csets;
	struct list_head	dst_csets;


	/* the subsys currently being processed */
	int			ssid;

	/*
	/*
	 * Fields for cgroup_taskset_*() iteration.
	 * Fields for cgroup_taskset_*() iteration.
	 *
	 *
@@ -2299,25 +2302,29 @@ static void cgroup_taskset_add(struct task_struct *task,
/**
/**
 * cgroup_taskset_first - reset taskset and return the first task
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 *
 * @tset iteration is initialized and the first task is returned.
 * @tset iteration is initialized and the first task is returned.
 */
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
					 struct cgroup_subsys_state **dst_cssp)
{
{
	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
	tset->cur_task = NULL;
	tset->cur_task = NULL;


	return cgroup_taskset_next(tset);
	return cgroup_taskset_next(tset, dst_cssp);
}
}


/**
/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 *
 * Return the next task in @tset.  Iteration must have been initialized
 * Return the next task in @tset.  Iteration must have been initialized
 * with cgroup_taskset_first().
 * with cgroup_taskset_first().
 */
 */
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
					struct cgroup_subsys_state **dst_cssp)
{
{
	struct css_set *cset = tset->cur_cset;
	struct css_set *cset = tset->cur_cset;
	struct task_struct *task = tset->cur_task;
	struct task_struct *task = tset->cur_task;
@@ -2332,6 +2339,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
		if (&task->cg_list != &cset->mg_tasks) {
		if (&task->cg_list != &cset->mg_tasks) {
			tset->cur_cset = cset;
			tset->cur_cset = cset;
			tset->cur_task = task;
			tset->cur_task = task;

			/*
			 * This function may be called both before and
			 * after cgroup_taskset_migrate().  The two cases
			 * can be distinguished by looking at whether @cset
			 * has its ->mg_dst_cset set.
			 */
			if (cset->mg_dst_cset)
				*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
			else
				*dst_cssp = cset->subsys[tset->ssid];

			return task;
			return task;
		}
		}


@@ -2367,7 +2386,8 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
	/* check that we can legitimately attach to the cgroup */
	/* check that we can legitimately attach to the cgroup */
	for_each_e_css(css, i, dst_cgrp) {
	for_each_e_css(css, i, dst_cgrp) {
		if (css->ss->can_attach) {
		if (css->ss->can_attach) {
			ret = css->ss->can_attach(css, tset);
			tset->ssid = i;
			ret = css->ss->can_attach(tset);
			if (ret) {
			if (ret) {
				failed_css = css;
				failed_css = css;
				goto out_cancel_attach;
				goto out_cancel_attach;
@@ -2400,9 +2420,12 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
	 */
	 */
	tset->csets = &tset->dst_csets;
	tset->csets = &tset->dst_csets;


	for_each_e_css(css, i, dst_cgrp)
	for_each_e_css(css, i, dst_cgrp) {
		if (css->ss->attach)
		if (css->ss->attach) {
			css->ss->attach(css, tset);
			tset->ssid = i;
			css->ss->attach(tset);
		}
	}


	ret = 0;
	ret = 0;
	goto out_release_tset;
	goto out_release_tset;
@@ -2411,8 +2434,10 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
	for_each_e_css(css, i, dst_cgrp) {
	for_each_e_css(css, i, dst_cgrp) {
		if (css == failed_css)
		if (css == failed_css)
			break;
			break;
		if (css->ss->cancel_attach)
		if (css->ss->cancel_attach) {
			css->ss->cancel_attach(css, tset);
			tset->ssid = i;
			css->ss->cancel_attach(tset);
		}
	}
	}
out_release_tset:
out_release_tset:
	spin_lock_bh(&css_set_lock);
	spin_lock_bh(&css_set_lock);
+3 −3
Original line number Original line Diff line number Diff line
@@ -155,10 +155,10 @@ static void freezer_css_free(struct cgroup_subsys_state *css)
 * @freezer->lock.  freezer_attach() makes the new tasks conform to the
 * @freezer->lock.  freezer_attach() makes the new tasks conform to the
 * current state and all following state changes can see the new tasks.
 * current state and all following state changes can see the new tasks.
 */
 */
static void freezer_attach(struct cgroup_subsys_state *new_css,
static void freezer_attach(struct cgroup_taskset *tset)
			   struct cgroup_taskset *tset)
{
{
	struct task_struct *task;
	struct task_struct *task;
	struct cgroup_subsys_state *new_css;


	mutex_lock(&freezer_mutex);
	mutex_lock(&freezer_mutex);


@@ -172,7 +172,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
	 * current state before executing the following - !frozen tasks may
	 * current state before executing the following - !frozen tasks may
	 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
	 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
	 */
	 */
	cgroup_taskset_for_each(task, tset) {
	cgroup_taskset_for_each(task, new_css, tset) {
		struct freezer *freezer = css_freezer(new_css);
		struct freezer *freezer = css_freezer(new_css);


		if (!(freezer->state & CGROUP_FREEZING)) {
		if (!(freezer->state & CGROUP_FREEZING)) {
Loading