Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 5406812e authored by Linus Torvalds
Browse files
Pull cgroup fixes from Tejun Heo:
 "More change than I'd have liked at this stage.  The pids controller
  and the changes made to cgroup core to support it introduced and
  revealed several important issues.

   - Assigning membership to a newly created task and migrating it can
     race leading to incorrect accounting.  Oleg fixed it by widening
     threadgroup synchronization.  It looks like we'll be able to merge
     it with a different percpu rwsem which is used in fork path making
     things simpler and cheaper.

   - The recent change to extend cgroup membership to zombies (so that
     pid accounting can extend till the pid is actually released) missed
     pinning the underlying data structures leading to use-after-free.
     Fixed.

   - v2 hierarchy was calling subsystem callbacks with the wrong target
     cgroup_subsys_state based on the incorrect assumption that they
     share the same target.  pids is the first controller affected by
     this.  Subsys callbacks updated so that they can deal with
     multi-target migrations"

* 'for-4.4-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup_pids: don't account for the root cgroup
  cgroup: fix handling of multi-destination migration from subtree_control enabling
  cgroup_freezer: simplify propagation of CGROUP_FROZEN clearing in freezer_attach()
  cgroup: pids: kill pids_fork(), simplify pids_can_fork() and pids_cancel_fork()
  cgroup: pids: fix race between cgroup_post_fork() and cgroup_migrate()
  cgroup: make css_set pin its css's to avoid use-after-free
  cgroup: fix cftype->file_offset handling
parents 633bb738 0b98f0c0
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -1127,15 +1127,15 @@ void blkcg_exit_queue(struct request_queue *q)
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkcg_can_attach(struct cgroup_subsys_state *css,
			    struct cgroup_taskset *tset)
static int blkcg_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, tset) {
	cgroup_taskset_for_each(task, dst_css, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
+3 −10
Original line number Diff line number Diff line
@@ -90,7 +90,6 @@ enum {
 */
struct cgroup_file {
	/* do not access any fields from outside cgroup core */
	struct list_head node;			/* anchored at css->files */
	struct kernfs_node *kn;
};

@@ -134,9 +133,6 @@ struct cgroup_subsys_state {
	 */
	u64 serial_nr;

	/* all cgroup_files associated with this css */
	struct list_head files;

	/* percpu_ref killing and RCU release */
	struct rcu_head rcu_head;
	struct work_struct destroy_work;
@@ -426,12 +422,9 @@ struct cgroup_subsys {
	void (*css_reset)(struct cgroup_subsys_state *css);
	void (*css_e_css_changed)(struct cgroup_subsys_state *css);

	int (*can_attach)(struct cgroup_subsys_state *css,
			  struct cgroup_taskset *tset);
	void (*cancel_attach)(struct cgroup_subsys_state *css,
			      struct cgroup_taskset *tset);
	void (*attach)(struct cgroup_subsys_state *css,
		       struct cgroup_taskset *tset);
	int (*can_attach)(struct cgroup_taskset *tset);
	void (*cancel_attach)(struct cgroup_taskset *tset);
	void (*attach)(struct cgroup_taskset *tset);
	int (*can_fork)(struct task_struct *task, void **priv_p);
	void (*cancel_fork)(struct task_struct *task, void *priv);
	void (*fork)(struct task_struct *task, void *priv);
+23 −24
Original line number Diff line number Diff line
@@ -88,6 +88,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
void cgroup_file_notify(struct cgroup_file *cfile);

char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
@@ -119,8 +120,10 @@ struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state
struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
						     struct cgroup_subsys_state *css);

struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
					 struct cgroup_subsys_state **dst_cssp);
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
					struct cgroup_subsys_state **dst_cssp);

void css_task_iter_start(struct cgroup_subsys_state *css,
			 struct css_task_iter *it);
@@ -235,30 +238,39 @@ void css_task_iter_end(struct css_task_iter *it);
/**
 * cgroup_taskset_for_each - iterate cgroup_taskset
 * @task: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * @tset may contain multiple tasks and they may belong to multiple
 * processes.  When there are multiple tasks in @tset, if a task of a
 * process is in @tset, all tasks of the process are in @tset.  Also, all
 * are guaranteed to share the same source and destination csses.
 * processes.
 *
 * On the v2 hierarchy, there may be tasks from multiple processes and they
 * may not share the source or destination csses.
 *
 * On traditional hierarchies, when there are multiple tasks in @tset, if a
 * task of a process is in @tset, all tasks of the process are in @tset.
 * Also, all are guaranteed to share the same source and destination csses.
 *
 * Iteration is not in any specific order.
 */
#define cgroup_taskset_for_each(task, tset)				\
	for ((task) = cgroup_taskset_first((tset)); (task);		\
	     (task) = cgroup_taskset_next((tset)))
#define cgroup_taskset_for_each(task, dst_css, tset)			\
	for ((task) = cgroup_taskset_first((tset), &(dst_css));		\
	     (task);							\
	     (task) = cgroup_taskset_next((tset), &(dst_css)))

/**
 * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
 * @leader: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
 * may not contain any.
 */
#define cgroup_taskset_for_each_leader(leader, tset)			\
	for ((leader) = cgroup_taskset_first((tset)); (leader);		\
	     (leader) = cgroup_taskset_next((tset)))			\
#define cgroup_taskset_for_each_leader(leader, dst_css, tset)		\
	for ((leader) = cgroup_taskset_first((tset), &(dst_css));	\
	     (leader);							\
	     (leader) = cgroup_taskset_next((tset), &(dst_css)))	\
		if ((leader) != (leader)->group_leader)			\
			;						\
		else
@@ -516,19 +528,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
	pr_cont_kernfs_path(cgrp->kn);
}

/**
 * cgroup_file_notify - generate a file modified event for a cgroup_file
 * @cfile: target cgroup_file
 *
 * @cfile must have been obtained by setting cftype->file_offset.
 */
static inline void cgroup_file_notify(struct cgroup_file *cfile)
{
	/* might not have been created due to one of the CFTYPE selector flags */
	if (cfile->kn)
		kernfs_notify(cfile->kn);
}

#else /* !CONFIG_CGROUPS */

struct cgroup_subsys_state;
+78 −21
Original line number Diff line number Diff line
@@ -97,6 +97,12 @@ static DEFINE_SPINLOCK(css_set_lock);
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 * against file removal/re-creation across css hiding.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

/*
 * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
 * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
@@ -754,9 +760,11 @@ static void put_css_set_locked(struct css_set *cset)
	if (!atomic_dec_and_test(&cset->refcount))
		return;

	/* This css_set is dead. unlink it and release cgroup refcounts */
	for_each_subsys(ss, ssid)
	/* This css_set is dead. unlink it and release cgroup and css refs */
	for_each_subsys(ss, ssid) {
		list_del(&cset->e_cset_node[ssid]);
		css_put(cset->subsys[ssid]);
	}
	hash_del(&cset->hlist);
	css_set_count--;

@@ -1056,9 +1064,13 @@ static struct css_set *find_css_set(struct css_set *old_cset,
	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);

	for_each_subsys(ss, ssid)
	for_each_subsys(ss, ssid) {
		struct cgroup_subsys_state *css = cset->subsys[ssid];

		list_add_tail(&cset->e_cset_node[ssid],
			      &cset->subsys[ssid]->cgroup->e_csets[ssid]);
			      &css->cgroup->e_csets[ssid]);
		css_get(css);
	}

	spin_unlock_bh(&css_set_lock);

@@ -1393,6 +1405,16 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
	char name[CGROUP_FILE_NAME_MAX];

	lockdep_assert_held(&cgroup_mutex);

	if (cft->file_offset) {
		struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		spin_lock_irq(&cgroup_file_kn_lock);
		cfile->kn = NULL;
		spin_unlock_irq(&cgroup_file_kn_lock);
	}

	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}

@@ -1856,7 +1878,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)

	INIT_LIST_HEAD(&cgrp->self.sibling);
	INIT_LIST_HEAD(&cgrp->self.children);
	INIT_LIST_HEAD(&cgrp->self.files);
	INIT_LIST_HEAD(&cgrp->cset_links);
	INIT_LIST_HEAD(&cgrp->pidlists);
	mutex_init(&cgrp->pidlist_mutex);
@@ -2216,6 +2237,9 @@ struct cgroup_taskset {
	struct list_head	src_csets;
	struct list_head	dst_csets;

	/* the subsys currently being processed */
	int			ssid;

	/*
	 * Fields for cgroup_taskset_*() iteration.
	 *
@@ -2278,25 +2302,29 @@ static void cgroup_taskset_add(struct task_struct *task,
/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * @tset iteration is initialized and the first task is returned.
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
					 struct cgroup_subsys_state **dst_cssp)
{
	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
	tset->cur_task = NULL;

	return cgroup_taskset_next(tset);
	return cgroup_taskset_next(tset, dst_cssp);
}

/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * Return the next task in @tset.  Iteration must have been initialized
 * with cgroup_taskset_first().
 */
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
					struct cgroup_subsys_state **dst_cssp)
{
	struct css_set *cset = tset->cur_cset;
	struct task_struct *task = tset->cur_task;
@@ -2311,6 +2339,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
		if (&task->cg_list != &cset->mg_tasks) {
			tset->cur_cset = cset;
			tset->cur_task = task;

			/*
			 * This function may be called both before and
			 * after cgroup_taskset_migrate().  The two cases
			 * can be distinguished by looking at whether @cset
			 * has its ->mg_dst_cset set.
			 */
			if (cset->mg_dst_cset)
				*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
			else
				*dst_cssp = cset->subsys[tset->ssid];

			return task;
		}

@@ -2346,7 +2386,8 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
	/* check that we can legitimately attach to the cgroup */
	for_each_e_css(css, i, dst_cgrp) {
		if (css->ss->can_attach) {
			ret = css->ss->can_attach(css, tset);
			tset->ssid = i;
			ret = css->ss->can_attach(tset);
			if (ret) {
				failed_css = css;
				goto out_cancel_attach;
@@ -2379,9 +2420,12 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
	 */
	tset->csets = &tset->dst_csets;

	for_each_e_css(css, i, dst_cgrp)
		if (css->ss->attach)
			css->ss->attach(css, tset);
	for_each_e_css(css, i, dst_cgrp) {
		if (css->ss->attach) {
			tset->ssid = i;
			css->ss->attach(tset);
		}
	}

	ret = 0;
	goto out_release_tset;
@@ -2390,8 +2434,10 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
	for_each_e_css(css, i, dst_cgrp) {
		if (css == failed_css)
			break;
		if (css->ss->cancel_attach)
			css->ss->cancel_attach(css, tset);
		if (css->ss->cancel_attach) {
			tset->ssid = i;
			css->ss->cancel_attach(tset);
		}
	}
out_release_tset:
	spin_lock_bh(&css_set_lock);
@@ -3313,9 +3359,9 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
	if (cft->file_offset) {
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		kernfs_get(kn);
		spin_lock_irq(&cgroup_file_kn_lock);
		cfile->kn = kn;
		list_add(&cfile->node, &css->files);
		spin_unlock_irq(&cgroup_file_kn_lock);
	}

	return 0;
@@ -3552,6 +3598,22 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
	return cgroup_add_cftypes(ss, cfts);
}

/**
 * cgroup_file_notify - signal that a cgroup_file has been modified
 * @cfile: the cgroup_file to generate the event for
 *
 * @cfile must have been obtained by setting cftype->file_offset.
 *
 * The backing kernfs node is sampled and notified while holding
 * cgroup_file_kn_lock with the caller's interrupt state saved and
 * restored.  Nothing is done when @cfile->kn is NULL.
 */
void cgroup_file_notify(struct cgroup_file *cfile)
{
	unsigned long irq_flags;
	struct kernfs_node *kn;

	spin_lock_irqsave(&cgroup_file_kn_lock, irq_flags);

	/* ->kn is only looked at while cgroup_file_kn_lock is held */
	kn = cfile->kn;
	if (kn)
		kernfs_notify(kn);

	spin_unlock_irqrestore(&cgroup_file_kn_lock, irq_flags);
}

/**
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
@@ -4613,13 +4675,9 @@ static void css_free_work_fn(struct work_struct *work)
		container_of(work, struct cgroup_subsys_state, destroy_work);
	struct cgroup_subsys *ss = css->ss;
	struct cgroup *cgrp = css->cgroup;
	struct cgroup_file *cfile;

	percpu_ref_exit(&css->refcnt);

	list_for_each_entry(cfile, &css->files, node)
		kernfs_put(cfile->kn);

	if (ss) {
		/* css free path */
		int id = css->id;
@@ -4724,7 +4782,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
	css->ss = ss;
	INIT_LIST_HEAD(&css->sibling);
	INIT_LIST_HEAD(&css->children);
	INIT_LIST_HEAD(&css->files);
	css->serial_nr = css_serial_nr_next++;

	if (cgroup_parent(cgrp)) {
+10 −13
Original line number Diff line number Diff line
@@ -155,12 +155,10 @@ static void freezer_css_free(struct cgroup_subsys_state *css)
 * @freezer->lock.  freezer_attach() makes the new tasks conform to the
 * current state and all following state changes can see the new tasks.
 */
static void freezer_attach(struct cgroup_subsys_state *new_css,
			   struct cgroup_taskset *tset)
static void freezer_attach(struct cgroup_taskset *tset)
{
	struct freezer *freezer = css_freezer(new_css);
	struct task_struct *task;
	bool clear_frozen = false;
	struct cgroup_subsys_state *new_css;

	mutex_lock(&freezer_mutex);

@@ -174,20 +172,19 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
	 * current state before executing the following - !frozen tasks may
	 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
	 */
	cgroup_taskset_for_each(task, tset) {
	cgroup_taskset_for_each(task, new_css, tset) {
		struct freezer *freezer = css_freezer(new_css);

		if (!(freezer->state & CGROUP_FREEZING)) {
			__thaw_task(task);
		} else {
			freeze_task(task);
			/* clear FROZEN and propagate upwards */
			while (freezer && (freezer->state & CGROUP_FROZEN)) {
				freezer->state &= ~CGROUP_FROZEN;
			clear_frozen = true;
				freezer = parent_freezer(freezer);
			}
		}

	/* propagate FROZEN clearing upwards */
	while (clear_frozen && (freezer = parent_freezer(freezer))) {
		freezer->state &= ~CGROUP_FROZEN;
		clear_frozen = freezer->state & CGROUP_FREEZING;
	}

	mutex_unlock(&freezer_mutex);
Loading