Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 5d21cc2d authored by Tejun Heo
Browse files

cpuset: replace cgroup_mutex locking with cpuset internal locking



Supposedly for historical reasons, cpuset depends on cgroup core for
locking.  It depends on cgroup_mutex in cgroup callbacks and grabs
cgroup_mutex from other places where it wants to be synchronized.
This is majorly messy and highly prone to introducing circular locking
dependency especially because cgroup_mutex is supposed to be one of
the outermost locks.

As previous patches already plugged possible races which may happen by
decoupling from cgroup_mutex, replacing cgroup_mutex with cpuset
specific cpuset_mutex is mostly straight-forward.  Introduce
cpuset_mutex, replace all occurrences of cgroup_mutex with it, and add
cpuset_mutex locking to places which inherited cgroup_mutex from
cgroup core.

The only complication is from cpuset wanting to initiate task
migration when a cpuset loses all cpus or memory nodes.  Task
migration may go through full cgroup and all subsystem locking and
should be initiated without holding any cpuset specific lock; however,
a previous patch already made hotplug handled asynchronously and
moving the task migration part outside other locks is easy.
cpuset_propagate_hotplug_workfn() now invokes
remove_tasks_in_empty_cpuset() without holding any lock.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
parent 02bb5863
Loading
Loading
Loading
Loading
+107 −81
Original line number Original line Diff line number Diff line
@@ -208,23 +208,20 @@ static struct cpuset top_cpuset = {
		if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
		if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))


/*
/*
 * There are two global mutexes guarding cpuset structures.  The first
 * There are two global mutexes guarding cpuset structures - cpuset_mutex
 * is the main control groups cgroup_mutex, accessed via
 * and callback_mutex.  The latter may nest inside the former.  We also
 * cgroup_lock()/cgroup_unlock().  The second is the cpuset-specific
 * require taking task_lock() when dereferencing a task's cpuset pointer.
 * callback_mutex, below. They can nest.  It is ok to first take
 * See "The task_lock() exception", at the end of this comment.
 * cgroup_mutex, then nest callback_mutex.  We also require taking
 *
 * task_lock() when dereferencing a task's cpuset pointer.  See "The
 * A task must hold both mutexes to modify cpusets.  If a task holds
 * task_lock() exception", at the end of this comment.
 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
 *
 * is the only task able to also acquire callback_mutex and be able to
 * A task must hold both mutexes to modify cpusets.  If a task
 * modify cpusets.  It can perform various checks on the cpuset structure
 * holds cgroup_mutex, then it blocks others wanting that mutex,
 * first, knowing nothing will change.  It can also allocate memory while
 * ensuring that it is the only task able to also acquire callback_mutex
 * just holding cpuset_mutex.  While it is performing these checks, various
 * and be able to modify cpusets.  It can perform various checks on
 * callback routines can briefly acquire callback_mutex to query cpusets.
 * the cpuset structure first, knowing nothing will change.  It can
 * Once it is ready to make the changes, it takes callback_mutex, blocking
 * also allocate memory while just holding cgroup_mutex.  While it is
 * everyone else.
 * performing these checks, various callback routines can briefly
 * acquire callback_mutex to query cpusets.  Once it is ready to make
 * the changes, it takes callback_mutex, blocking everyone else.
 *
 *
 * Calls to the kernel memory allocator can not be made while holding
 * Calls to the kernel memory allocator can not be made while holding
 * callback_mutex, as that would risk double tripping on callback_mutex
 * callback_mutex, as that would risk double tripping on callback_mutex
@@ -246,6 +243,7 @@ static struct cpuset top_cpuset = {
 * guidelines for accessing subsystem state in kernel/cgroup.c
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */
 */


static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
static DEFINE_MUTEX(callback_mutex);


/*
/*
@@ -351,7 +349,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
/*
/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 *
 * Called with callback_mutex/cgroup_mutex held
 * Called with callback_mutex/cpuset_mutex held
 */
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
					struct task_struct *tsk)
@@ -371,7 +369,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
 *
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cgroup_mutex.
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */
 */


static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -420,7 +418,7 @@ static void free_trial_cpuset(struct cpuset *trial)
 * If we replaced the flag and mask values of the current cpuset
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * our various subset and exclusive rules still be valid?  Presumes
 * cgroup_mutex held.
 * cpuset_mutex held.
 *
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * such as list traversal that depend on the actual address of the
@@ -555,7 +553,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 * domains when operating in the severe memory shortage situations
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 * that could cause allocation failures below.
 *
 *
 * Must be called with cgroup_lock held.
 * Must be called with cpuset_mutex held.
 *
 *
 * The three key local variables below are:
 * The three key local variables below are:
 *    q  - a linked-list queue of cpuset pointers, used to implement a
 *    q  - a linked-list queue of cpuset pointers, used to implement a
@@ -766,7 +764,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 * 'cpus' is removed, then call this routine to rebuild the
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 * scheduler's dynamic sched domains.
 *
 *
 * Call with cgroup_mutex held.  Takes get_online_cpus().
 * Call with cpuset_mutex held.  Takes get_online_cpus().
 */
 */
static void rebuild_sched_domains_locked(void)
static void rebuild_sched_domains_locked(void)
{
{
@@ -774,7 +772,7 @@ static void rebuild_sched_domains_locked(void)
	cpumask_var_t *doms;
	cpumask_var_t *doms;
	int ndoms;
	int ndoms;


	WARN_ON_ONCE(!cgroup_lock_is_held());
	lockdep_assert_held(&cpuset_mutex);
	get_online_cpus();
	get_online_cpus();


	/* Generate domain masks and attrs */
	/* Generate domain masks and attrs */
@@ -800,9 +798,9 @@ static int generate_sched_domains(cpumask_var_t **domains,


void rebuild_sched_domains(void)
void rebuild_sched_domains(void)
{
{
	cgroup_lock();
	mutex_lock(&cpuset_mutex);
	rebuild_sched_domains_locked();
	rebuild_sched_domains_locked();
	cgroup_unlock();
	mutex_unlock(&cpuset_mutex);
}
}


/**
/**
@@ -810,7 +808,7 @@ void rebuild_sched_domains(void)
 * @tsk: task to test
 * @tsk: task to test
 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
 *
 *
 * Call with cgroup_mutex held.  May take callback_mutex during call.
 * Call with cpuset_mutex held.  May take callback_mutex during call.
 * Called for each task in a cgroup by cgroup_scan_tasks().
 * Called for each task in a cgroup by cgroup_scan_tasks().
 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
 * words, if its mask is not equal to its cpuset's mask).
 * words, if its mask is not equal to its cpuset's mask).
@@ -831,7 +829,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
 * cpus_allowed mask needs to be changed.
 * cpus_allowed mask needs to be changed.
 *
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cgroup_lock() at this point.
 * holding cpuset_mutex at this point.
 */
 */
static void cpuset_change_cpumask(struct task_struct *tsk,
static void cpuset_change_cpumask(struct task_struct *tsk,
				  struct cgroup_scanner *scan)
				  struct cgroup_scanner *scan)
@@ -844,7 +842,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 *
 * Called with cgroup_mutex held
 * Called with cpuset_mutex held
 *
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 * calling callback functions for each.
@@ -934,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 *    Temporarilly set tasks mems_allowed to target nodes of migration,
 *    Temporarilly set tasks mems_allowed to target nodes of migration,
 *    so that the migration code can allocate pages on these nodes.
 *    so that the migration code can allocate pages on these nodes.
 *
 *
 *    Call holding cgroup_mutex, so current's cpuset won't change
 *    Call holding cpuset_mutex, so current's cpuset won't change
 *    during this call, as manage_mutex holds off any cpuset_attach()
 *    during this call, as manage_mutex holds off any cpuset_attach()
 *    calls.  Therefore we don't need to take task_lock around the
 *    calls.  Therefore we don't need to take task_lock around the
 *    call to guarantee_online_mems(), as we know no one is changing
 *    call to guarantee_online_mems(), as we know no one is changing
@@ -1009,7 +1007,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
/*
/*
 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
 * memory_migrate flag is set. Called with cgroup_mutex held.
 * memory_migrate flag is set. Called with cpuset_mutex held.
 */
 */
static void cpuset_change_nodemask(struct task_struct *p,
static void cpuset_change_nodemask(struct task_struct *p,
				   struct cgroup_scanner *scan)
				   struct cgroup_scanner *scan)
@@ -1018,7 +1016,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
	struct cpuset *cs;
	struct cpuset *cs;
	int migrate;
	int migrate;
	const nodemask_t *oldmem = scan->data;
	const nodemask_t *oldmem = scan->data;
	static nodemask_t newmems;	/* protected by cgroup_mutex */
	static nodemask_t newmems;	/* protected by cpuset_mutex */


	cs = cgroup_cs(scan->cg);
	cs = cgroup_cs(scan->cg);
	guarantee_online_mems(cs, &newmems);
	guarantee_online_mems(cs, &newmems);
@@ -1045,7 +1043,7 @@ static void *cpuset_being_rebound;
 * @oldmem: old mems_allowed of cpuset cs
 * @oldmem: old mems_allowed of cpuset cs
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 *
 * Called with cgroup_mutex held
 * Called with cpuset_mutex held
 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
 * if @heap != NULL.
 * if @heap != NULL.
 */
 */
@@ -1067,7 +1065,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
	 * take while holding tasklist_lock.  Forks can happen - the
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.  Because we still hold
	 * and rebind their vma mempolicies too.  Because we still hold
	 * the global cgroup_mutex, we know that no other rebind effort
	 * the global cpuset_mutex, we know that no other rebind effort
	 * will be contending for the global variable cpuset_being_rebound.
	 * will be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 * is idempotent.  Also migrate pages in each mm to new nodes.
@@ -1086,7 +1084,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * migrate the tasks pages to the new memory.
 * migrate the tasks pages to the new memory.
 *
 *
 * Call with cgroup_mutex held.  May take callback_mutex during call.
 * Call with cpuset_mutex held.  May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
 * their mempolicies to the cpusets new mems_allowed.
@@ -1184,7 +1182,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 *
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cgroup_lock() at this point.
 * holding cpuset_mutex at this point.
 */
 */
static void cpuset_change_flag(struct task_struct *tsk,
static void cpuset_change_flag(struct task_struct *tsk,
				struct cgroup_scanner *scan)
				struct cgroup_scanner *scan)
@@ -1197,7 +1195,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
 * @cs: the cpuset in which each task's spread flags needs to be changed
 * @cs: the cpuset in which each task's spread flags needs to be changed
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 *
 * Called with cgroup_mutex held
 * Called with cpuset_mutex held
 *
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 * calling callback functions for each.
@@ -1222,7 +1220,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
 * cs:		the cpuset to update
 * cs:		the cpuset to update
 * turning_on: 	whether the flag is being set or cleared
 * turning_on: 	whether the flag is being set or cleared
 *
 *
 * Call with cgroup_mutex held.
 * Call with cpuset_mutex held.
 */
 */


static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1370,15 +1368,18 @@ static int fmeter_getrate(struct fmeter *fmp)
	return val;
	return val;
}
}


/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
{
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *cs = cgroup_cs(cgrp);
	struct task_struct *task;
	struct task_struct *task;
	int ret;
	int ret;


	mutex_lock(&cpuset_mutex);

	ret = -ENOSPC;
	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
		return -ENOSPC;
		goto out_unlock;


	cgroup_taskset_for_each(task, cgrp, tset) {
	cgroup_taskset_for_each(task, cgrp, tset) {
		/*
		/*
@@ -1390,10 +1391,12 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
		 * set_cpus_allowed_ptr() on all attached tasks before
		 * set_cpus_allowed_ptr() on all attached tasks before
		 * cpus_allowed may be changed.
		 * cpus_allowed may be changed.
		 */
		 */
		ret = -EINVAL;
		if (task->flags & PF_THREAD_BOUND)
		if (task->flags & PF_THREAD_BOUND)
			return -EINVAL;
			goto out_unlock;
		if ((ret = security_task_setscheduler(task)))
		ret = security_task_setscheduler(task);
			return ret;
		if (ret)
			goto out_unlock;
	}
	}


	/*
	/*
@@ -1401,18 +1404,22 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
	 * changes which zero cpus/mems_allowed.
	 * changes which zero cpus/mems_allowed.
	 */
	 */
	cs->attach_in_progress++;
	cs->attach_in_progress++;

	ret = 0;
	return 0;
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return ret;
}
}


static void cpuset_cancel_attach(struct cgroup *cgrp,
static void cpuset_cancel_attach(struct cgroup *cgrp,
				 struct cgroup_taskset *tset)
				 struct cgroup_taskset *tset)
{
{
	mutex_lock(&cpuset_mutex);
	cgroup_cs(cgrp)->attach_in_progress--;
	cgroup_cs(cgrp)->attach_in_progress--;
	mutex_unlock(&cpuset_mutex);
}
}


/*
/*
 * Protected by cgroup_mutex.  cpus_attach is used only by cpuset_attach()
 * Protected by cpuset_mutex.  cpus_attach is used only by cpuset_attach()
 * but we can't allocate it dynamically there.  Define it global and
 * but we can't allocate it dynamically there.  Define it global and
 * allocate from cpuset_init().
 * allocate from cpuset_init().
 */
 */
@@ -1420,7 +1427,7 @@ static cpumask_var_t cpus_attach;


static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
{
	/* static bufs protected by cgroup_mutex */
	/* static bufs protected by cpuset_mutex */
	static nodemask_t cpuset_attach_nodemask_from;
	static nodemask_t cpuset_attach_nodemask_from;
	static nodemask_t cpuset_attach_nodemask_to;
	static nodemask_t cpuset_attach_nodemask_to;
	struct mm_struct *mm;
	struct mm_struct *mm;
@@ -1430,6 +1437,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *oldcs = cgroup_cs(oldcgrp);
	struct cpuset *oldcs = cgroup_cs(oldcgrp);


	mutex_lock(&cpuset_mutex);

	/* prepare for attach */
	/* prepare for attach */
	if (cs == &top_cpuset)
	if (cs == &top_cpuset)
		cpumask_copy(cpus_attach, cpu_possible_mask);
		cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1473,6 +1482,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
	 */
	 */
	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
		schedule_cpuset_propagate_hotplug(cs);
		schedule_cpuset_propagate_hotplug(cs);

	mutex_unlock(&cpuset_mutex);
}
}


/* The various types of files and directories in a cpuset file system */
/* The various types of files and directories in a cpuset file system */
@@ -1494,12 +1505,13 @@ typedef enum {


static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;


	if (!cgroup_lock_live_group(cgrp))
	mutex_lock(&cpuset_mutex);
		return -ENODEV;
	if (!is_cpuset_online(cs))
		goto out_unlock;


	switch (type) {
	switch (type) {
	case FILE_CPU_EXCLUSIVE:
	case FILE_CPU_EXCLUSIVE:
@@ -1533,18 +1545,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
		retval = -EINVAL;
		retval = -EINVAL;
		break;
		break;
	}
	}
	cgroup_unlock();
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return retval;
	return retval;
}
}


static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
{
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;


	if (!cgroup_lock_live_group(cgrp))
	mutex_lock(&cpuset_mutex);
		return -ENODEV;
	if (!is_cpuset_online(cs))
		goto out_unlock;


	switch (type) {
	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1554,7 +1568,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
		retval = -EINVAL;
		retval = -EINVAL;
		break;
		break;
	}
	}
	cgroup_unlock();
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return retval;
	return retval;
}
}


@@ -1564,9 +1579,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
				const char *buf)
				const char *buf)
{
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *trialcs;
	struct cpuset *trialcs;
	int retval = -ENODEV;


	/*
	/*
	 * CPU or memory hotunplug may leave @cs w/o any execution
	 * CPU or memory hotunplug may leave @cs w/o any execution
@@ -1586,13 +1601,14 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
	flush_work(&cpuset_hotplug_work);
	flush_work(&cpuset_hotplug_work);
	flush_workqueue(cpuset_propagate_hotplug_wq);
	flush_workqueue(cpuset_propagate_hotplug_wq);


	if (!cgroup_lock_live_group(cgrp))
	mutex_lock(&cpuset_mutex);
		return -ENODEV;
	if (!is_cpuset_online(cs))
		goto out_unlock;


	trialcs = alloc_trial_cpuset(cs);
	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
	if (!trialcs) {
		retval = -ENOMEM;
		retval = -ENOMEM;
		goto out;
		goto out_unlock;
	}
	}


	switch (cft->private) {
	switch (cft->private) {
@@ -1608,8 +1624,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
	}
	}


	free_trial_cpuset(trialcs);
	free_trial_cpuset(trialcs);
out:
out_unlock:
	cgroup_unlock();
	mutex_unlock(&cpuset_mutex);
	return retval;
	return retval;
}
}


@@ -1867,6 +1883,8 @@ static int cpuset_css_online(struct cgroup *cgrp)
	if (!parent)
	if (!parent)
		return 0;
		return 0;


	mutex_lock(&cpuset_mutex);

	set_bit(CS_ONLINE, &cs->flags);
	set_bit(CS_ONLINE, &cs->flags);
	if (is_spread_page(parent))
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
		set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1876,7 +1894,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
	number_of_cpusets++;
	number_of_cpusets++;


	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
		return 0;
		goto out_unlock;


	/*
	/*
	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
@@ -1895,7 +1913,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
	cpuset_for_each_child(tmp_cs, pos_cg, parent) {
	cpuset_for_each_child(tmp_cs, pos_cg, parent) {
		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
			rcu_read_unlock();
			rcu_read_unlock();
			return 0;
			goto out_unlock;
		}
		}
	}
	}
	rcu_read_unlock();
	rcu_read_unlock();
@@ -1904,7 +1922,8 @@ static int cpuset_css_online(struct cgroup *cgrp)
	cs->mems_allowed = parent->mems_allowed;
	cs->mems_allowed = parent->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
	mutex_unlock(&callback_mutex);
	mutex_unlock(&callback_mutex);

out_unlock:
	mutex_unlock(&cpuset_mutex);
	return 0;
	return 0;
}
}


@@ -1912,8 +1931,7 @@ static void cpuset_css_offline(struct cgroup *cgrp)
{
{
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *cs = cgroup_cs(cgrp);


	/* css_offline is called w/o cgroup_mutex, grab it */
	mutex_lock(&cpuset_mutex);
	cgroup_lock();


	if (is_sched_load_balance(cs))
	if (is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
@@ -1921,7 +1939,7 @@ static void cpuset_css_offline(struct cgroup *cgrp)
	number_of_cpusets--;
	number_of_cpusets--;
	clear_bit(CS_ONLINE, &cs->flags);
	clear_bit(CS_ONLINE, &cs->flags);


	cgroup_unlock();
	mutex_unlock(&cpuset_mutex);
}
}


/*
/*
@@ -1996,7 +2014,9 @@ static void cpuset_do_move_task(struct task_struct *tsk,
{
{
	struct cgroup *new_cgroup = scan->data;
	struct cgroup *new_cgroup = scan->data;


	cgroup_lock();
	cgroup_attach_task(new_cgroup, tsk);
	cgroup_attach_task(new_cgroup, tsk);
	cgroup_unlock();
}
}


/**
/**
@@ -2004,7 +2024,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,
 * @from: cpuset in which the tasks currently reside
 * @from: cpuset in which the tasks currently reside
 * @to: cpuset to which the tasks will be moved
 * @to: cpuset to which the tasks will be moved
 *
 *
 * Called with cgroup_mutex held
 * Called with cpuset_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 * callback_mutex must not be held, as cpuset_attach() will take it.
 *
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
@@ -2031,9 +2051,6 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 * removing that CPU or node from all cpusets.  If this removes the
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 * cpuset to its next-highest non-empty parent.
 *
 * Called with cgroup_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 */
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
{
@@ -2089,8 +2106,9 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
	static cpumask_t off_cpus;
	static cpumask_t off_cpus;
	static nodemask_t off_mems, tmp_mems;
	static nodemask_t off_mems, tmp_mems;
	struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
	struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
	bool is_empty;


	cgroup_lock();
	mutex_lock(&cpuset_mutex);


	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
@@ -2112,10 +2130,18 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
		update_tasks_nodemask(cs, &tmp_mems, NULL);
		update_tasks_nodemask(cs, &tmp_mems, NULL);
	}
	}


	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
	is_empty = cpumask_empty(cs->cpus_allowed) ||
		remove_tasks_in_empty_cpuset(cs);
		nodes_empty(cs->mems_allowed);


	cgroup_unlock();
	mutex_unlock(&cpuset_mutex);

	/*
	 * If @cs became empty, move tasks to the nearest ancestor with
	 * execution resources.  This is full cgroup operation which will
	 * also call back into cpuset.  Should be done outside any lock.
	 */
	if (is_empty)
		remove_tasks_in_empty_cpuset(cs);


	/* the following may free @cs, should be the last operation */
	/* the following may free @cs, should be the last operation */
	css_put(&cs->css);
	css_put(&cs->css);
@@ -2169,7 +2195,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
	bool cpus_updated, mems_updated;
	bool cpus_updated, mems_updated;
	bool cpus_offlined, mems_offlined;
	bool cpus_offlined, mems_offlined;


	cgroup_lock();
	mutex_lock(&cpuset_mutex);


	/* fetch the available cpus/mems and find out which changed how */
	/* fetch the available cpus/mems and find out which changed how */
	cpumask_copy(&new_cpus, cpu_active_mask);
	cpumask_copy(&new_cpus, cpu_active_mask);
@@ -2211,7 +2237,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
				schedule_cpuset_propagate_hotplug(cs);
				schedule_cpuset_propagate_hotplug(cs);
	}
	}


	cgroup_unlock();
	mutex_unlock(&cpuset_mutex);


	/* wait for propagations to finish */
	/* wait for propagations to finish */
	flush_workqueue(cpuset_propagate_hotplug_wq);
	flush_workqueue(cpuset_propagate_hotplug_wq);
@@ -2222,9 +2248,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
		cpumask_var_t *doms;
		cpumask_var_t *doms;
		int ndoms;
		int ndoms;


		cgroup_lock();
		mutex_lock(&cpuset_mutex);
		ndoms = generate_sched_domains(&doms, &attr);
		ndoms = generate_sched_domains(&doms, &attr);
		cgroup_unlock();
		mutex_unlock(&cpuset_mutex);


		partition_sched_domains(ndoms, doms, attr);
		partition_sched_domains(ndoms, doms, attr);
	}
	}
@@ -2650,7 +2676,7 @@ void __cpuset_memory_pressure_bump(void)
 *  - Used for /proc/<pid>/cpuset.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take cgroup_mutex, keeping cpuset_attach() from changing it
 *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
 *    anyway.
 *    anyway.
 */
 */
static int proc_cpuset_show(struct seq_file *m, void *unused_v)
static int proc_cpuset_show(struct seq_file *m, void *unused_v)
@@ -2673,7 +2699,7 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
		goto out_free;
		goto out_free;


	retval = -EINVAL;
	retval = -EINVAL;
	cgroup_lock();
	mutex_lock(&cpuset_mutex);
	css = task_subsys_state(tsk, cpuset_subsys_id);
	css = task_subsys_state(tsk, cpuset_subsys_id);
	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
	if (retval < 0)
	if (retval < 0)
@@ -2681,7 +2707,7 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
	seq_puts(m, buf);
	seq_puts(m, buf);
	seq_putc(m, '\n');
	seq_putc(m, '\n');
out_unlock:
out_unlock:
	cgroup_unlock();
	mutex_unlock(&cpuset_mutex);
	put_task_struct(tsk);
	put_task_struct(tsk);
out_free:
out_free:
	kfree(buf);
	kfree(buf);