Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit fb0dc5f1 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull cgroup fixes from Tejun Heo:

 - The destruction path of cgroup objects are asynchronous and
   multi-staged and some of them ended up destroying parents before
   children leading to failures in cpu and memory controllers.  Ensure
   that parents are always destroyed after children.

 - cpuset mm node migration was performed synchronously while holding
   threadgroup and cgroup mutexes and the recent threadgroup locking
   update resulted in a possible deadlock.  The migration is best effort
   and shouldn't have been performed under those locks to begin with.
   Made asynchronous.

 - Minor documentation fix.

* 'for-4.5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  Documentation: cgroup: Fix 'cgroup-legacy' -> 'cgroup-v1'
  cgroup: make sure a parent css isn't freed before its children
  cgroup: make sure a parent css isn't offlined before its children
  cpuset: make mm migration asynchronous
parents 9aece75c 9a2ddda5
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ This is the authoritative documentation on the design, interface and
conventions of cgroup v2.  It describes all userland-visible aspects
of cgroup including core and specific controller behaviors.  All
future changes must be reflected in this document.  Documentation for
v1 is available under Documentation/cgroup-legacy/.
v1 is available under Documentation/cgroup-v1/.

CONTENTS

+6 −0
Original line number Diff line number Diff line
@@ -127,6 +127,12 @@ struct cgroup_subsys_state {
	 */
	u64 serial_nr;

	/*
	 * Incremented by online self and children.  Used to guarantee that
	 * parents are not offlined before their children.
	 */
	atomic_t online_cnt;

	/* percpu_ref killing and RCU release */
	struct rcu_head rcu_head;
	struct work_struct destroy_work;
+6 −0
Original line number Diff line number Diff line
@@ -137,6 +137,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
	task_unlock(current);
}

extern void cpuset_post_attach_flush(void);

#else /* !CONFIG_CPUSETS */

static inline bool cpusets_enabled(void) { return false; }
@@ -243,6 +245,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
	return false;
}

static inline void cpuset_post_attach_flush(void)
{
}

#endif /* !CONFIG_CPUSETS */

#endif /* _LINUX_CPUSET_H */
+23 −8
Original line number Diff line number Diff line
@@ -58,6 +58,7 @@
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <net/sock.h>

/*
@@ -2739,6 +2740,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
out_unlock_threadgroup:
	percpu_up_write(&cgroup_threadgroup_rwsem);
	cgroup_kn_unlock(of->kn);
	cpuset_post_attach_flush();
	return ret ?: nbytes;
}

@@ -4655,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work)

	if (ss) {
		/* css free path */
		struct cgroup_subsys_state *parent = css->parent;
		int id = css->id;

		if (css->parent)
			css_put(css->parent);

		ss->css_free(css);
		cgroup_idr_remove(&ss->css_idr, id);
		cgroup_put(cgrp);

		if (parent)
			css_put(parent);
	} else {
		/* cgroup free path */
		atomic_dec(&cgrp->root->nr_cgrps);
@@ -4758,6 +4761,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
	INIT_LIST_HEAD(&css->sibling);
	INIT_LIST_HEAD(&css->children);
	css->serial_nr = css_serial_nr_next++;
	atomic_set(&css->online_cnt, 0);

	if (cgroup_parent(cgrp)) {
		css->parent = cgroup_css(cgroup_parent(cgrp), ss);
@@ -4780,6 +4784,10 @@ static int online_css(struct cgroup_subsys_state *css)
	if (!ret) {
		css->flags |= CSS_ONLINE;
		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);

		atomic_inc(&css->online_cnt);
		if (css->parent)
			atomic_inc(&css->parent->online_cnt);
	}
	return ret;
}
@@ -5017,10 +5025,15 @@ static void css_killed_work_fn(struct work_struct *work)
		container_of(work, struct cgroup_subsys_state, destroy_work);

	mutex_lock(&cgroup_mutex);
	offline_css(css);
	mutex_unlock(&cgroup_mutex);

	do {
		offline_css(css);
		css_put(css);
		/* @css can't go away while we're holding cgroup_mutex */
		css = css->parent;
	} while (css && atomic_dec_and_test(&css->online_cnt));

	mutex_unlock(&cgroup_mutex);
}

/* css kill confirmation processing requires process context, bounce */
@@ -5029,9 +5042,11 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);

	if (atomic_dec_and_test(&css->online_cnt)) {
		INIT_WORK(&css->destroy_work, css_killed_work_fn);
		queue_work(cgroup_destroy_wq, &css->destroy_work);
	}
}

/**
 * kill_css - destroy a css
+49 −22
Original line number Diff line number Diff line
@@ -287,6 +287,8 @@ static struct cpuset top_cpuset = {
static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously.
 */
@@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
}

/*
 * cpuset_migrate_mm
 *
 *    Migrate memory region from one set of nodes to another.
 *
 *    Temporarilly set tasks mems_allowed to target nodes of migration,
 *    so that the migration code can allocate pages on these nodes.
 *
 *    While the mm_struct we are migrating is typically from some
 *    other task, the task_struct mems_allowed that we are hacking
 *    is for our current task, which must allocate new pages for that
 *    migrating memory region.
 * Migrate memory region from one set of nodes to another.  This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management.  All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
 */

struct cpuset_migrate_mm_work {
	struct work_struct	work;
	struct mm_struct	*mm;
	nodemask_t		from;
	nodemask_t		to;
};

static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
	struct cpuset_migrate_mm_work *mwork =
		container_of(work, struct cpuset_migrate_mm_work, work);

	/* on a wq worker, no need to worry about %current's mems_allowed */
	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
	mmput(mwork->mm);
	kfree(mwork);
}

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct task_struct *tsk = current;

	tsk->mems_allowed = *to;
	struct cpuset_migrate_mm_work *mwork;

	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
	if (mwork) {
		mwork->mm = mm;
		mwork->from = *from;
		mwork->to = *to;
		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
		queue_work(cpuset_migrate_mm_wq, &mwork->work);
	} else {
		mmput(mm);
	}
}

	rcu_read_lock();
	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
	rcu_read_unlock();
void cpuset_post_attach_flush(void)
{
	flush_workqueue(cpuset_migrate_mm_wq);
}

/*
@@ -1097,6 +1119,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
		else
			mmput(mm);
	}
	css_task_iter_end(&it);
@@ -1545,10 +1568,10 @@ static void cpuset_attach(struct cgroup_taskset *tset)
			 * @old_mems_allowed is the right nodesets that we
			 * migrate mm from.
			 */
			if (is_memory_migrate(cs)) {
			if (is_memory_migrate(cs))
				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
						  &cpuset_attach_nodemask_to);
			}
			else
				mmput(mm);
		}
	}
@@ -1714,6 +1737,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
	mutex_unlock(&cpuset_mutex);
	kernfs_unbreak_active_protection(of->kn);
	css_put(&cs->css);
	flush_workqueue(cpuset_migrate_mm_wq);
	return retval ?: nbytes;
}

@@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void)
	top_cpuset.effective_mems = node_states[N_MEMORY];

	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);

	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
	BUG_ON(!cpuset_migrate_mm_wq);
}

/**