Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1a90dd50 authored by Tejun Heo's avatar Tejun Heo
Browse files

cgroup: deactivate CSS's and mark cgroup dead before invoking ->pre_destroy()



Because ->pre_destroy() could fail and can't be called under
cgroup_mutex, cgroup destruction did something very ugly.

  1. Grab cgroup_mutex and verify it can be destroyed; fail otherwise.

  2. Release cgroup_mutex and call ->pre_destroy().

  3. Re-grab cgroup_mutex and verify it can still be destroyed; fail
     otherwise.

  4. Continue destroying.

In addition to being ugly, it has been always broken in various ways.
For example, memcg ->pre_destroy() expects the cgroup to be inactive
after it's done but tasks can be attached and detached between #2 and
#3 and the conditions that memcg verified in ->pre_destroy() might no
longer hold by the time control reaches #3.

Now that ->pre_destroy() is no longer allowed to fail, we can switch
to the following.

  1. Grab cgroup_mutex and verify it can be destroyed; fail otherwise.

  2. Deactivate CSS's and mark the cgroup removed thus preventing any
     further operations which can invalidate the verification from #1.

  3. Release cgroup_mutex and call ->pre_destroy().

  4. Re-grab cgroup_mutex and continue destroying.

After this change, controllers can safely assume that ->pre_destroy()
will be called only once for a given cgroup and, once
->pre_destroy() is called, the cgroup will stay dormant till it's
destroyed.

This removes the only reason ->pre_destroy() can fail - new task being
attached or a child cgroup being created in between.  The error-out path is
removed and ->pre_destroy() invocation is open coded in
cgroup_rmdir().

v2: cgroup_call_pre_destroy() removal moved to this patch per Michal.
    Commit message updated per Glauber.

Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Reviewed-by: default avatarMichal Hocko <mhocko@suse.cz>
Reviewed-by: default avatarKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: default avatarLi Zefan <lizefan@huawei.com>
Cc: Glauber Costa <glommer@parallels.com>
parent 976c06bc
Loading
Loading
Loading
Loading
+19 −46
Original line number Original line Diff line number Diff line
@@ -851,27 +851,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
	return inode;
	return inode;
}
}


/*
 * Call each subsystem's ->pre_destroy() handler on @cgrp, in subsystem
 * iteration order.  This is called before the css refcnt check during
 * cgroup destruction.  ->pre_destroy() is not expected to fail; a
 * non-zero return triggers a one-time warning and aborts the remaining
 * callbacks.
 *
 * Returns 0 on success, or the first handler's error code on failure.
 */
static int cgroup_call_pre_destroy(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	int ret = 0;	/* stays 0 unless a handler reports failure */

	for_each_subsys(cgrp->root, ss) {
		/* the hook is optional; skip subsystems without one */
		if (!ss->pre_destroy)
			continue;

		ret = ss->pre_destroy(cgrp);
		/* failure here indicates a subsystem bug; warn once */
		if (WARN_ON_ONCE(ret))
			break;
	}

	return ret;
}

static void cgroup_diput(struct dentry *dentry, struct inode *inode)
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
{
	/* is dentry a directory ? if so, kfree() associated cgroup */
	/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -4078,19 +4057,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
	DEFINE_WAIT(wait);
	DEFINE_WAIT(wait);
	struct cgroup_event *event, *tmp;
	struct cgroup_event *event, *tmp;
	struct cgroup_subsys *ss;
	struct cgroup_subsys *ss;
	int ret;

	/* the vfs holds both inode->i_mutex already */
	mutex_lock(&cgroup_mutex);
	if (atomic_read(&cgrp->count) != 0) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	if (!list_empty(&cgrp->children)) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	mutex_unlock(&cgroup_mutex);


	/*
	/*
	 * In general, subsystem has no css->refcnt after pre_destroy(). But
	 * In general, subsystem has no css->refcnt after pre_destroy(). But
@@ -4103,16 +4069,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
	 */
	 */
	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);


	/*
	/* the vfs holds both inode->i_mutex already */
	 * Call pre_destroy handlers of subsys. Notify subsystems
	 * that rmdir() request comes.
	 */
	ret = cgroup_call_pre_destroy(cgrp);
	if (ret) {
		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
		return ret;
	}

	mutex_lock(&cgroup_mutex);
	mutex_lock(&cgroup_mutex);
	parent = cgrp->parent;
	parent = cgrp->parent;
	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
@@ -4122,13 +4079,30 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
	}
	}
	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);


	/* block new css_tryget() by deactivating refcnt */
	/*
	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
	 * removed.  This makes future css_tryget() and child creation
	 * attempts fail thus maintaining the removal conditions verified
	 * above.
	 */
	for_each_subsys(cgrp->root, ss) {
	for_each_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];


		WARN_ON(atomic_read(&css->refcnt) < 0);
		WARN_ON(atomic_read(&css->refcnt) < 0);
		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
	}
	}
	set_bit(CGRP_REMOVED, &cgrp->flags);

	/*
	 * Tell subsystems to initate destruction.  pre_destroy() should be
	 * called with cgroup_mutex unlocked.  See 3fa59dfbc3 ("cgroup: fix
	 * potential deadlock in pre_destroy") for details.
	 */
	mutex_unlock(&cgroup_mutex);
	for_each_subsys(cgrp->root, ss)
		if (ss->pre_destroy)
			WARN_ON_ONCE(ss->pre_destroy(cgrp));
	mutex_lock(&cgroup_mutex);


	/*
	/*
	 * Put all the base refs.  Each css holds an extra reference to the
	 * Put all the base refs.  Each css holds an extra reference to the
@@ -4144,7 +4118,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);


	raw_spin_lock(&release_list_lock);
	raw_spin_lock(&release_list_lock);
	set_bit(CGRP_REMOVED, &cgrp->flags);
	if (!list_empty(&cgrp->release_list))
	if (!list_empty(&cgrp->release_list))
		list_del_init(&cgrp->release_list);
		list_del_init(&cgrp->release_list);
	raw_spin_unlock(&release_list_lock);
	raw_spin_unlock(&release_list_lock);