
Commit 81a6a5cd authored by Paul Menage, committed by Linus Torvalds

Task Control Groups: automatic userspace notification of idle cgroups



Add the following files to the cgroup filesystem:

notify_on_release - configures/reports whether the cgroup subsystem should
attempt to run a release script when this cgroup becomes unused

release_agent - configures/reports the release agent to be used for this
hierarchy (top level in each hierarchy only)

releasable - reports whether this cgroup would have been auto-released if
notify_on_release was true and a release agent was configured (mainly useful
for debugging)
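
As a rough usage sketch (illustrative only, not part of this change): once a
hierarchy is mounted, these control files are driven with ordinary writes. The
mount point /cgroup, the child cgroup "foo" and the agent path
/sbin/cgroup_release_agent below are assumptions made for the example.

/* Illustrative only: enable release notifications for an existing cgroup. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror(path);
	return close(fd);
}

int main(void)
{
	/* release_agent exists only at the top level of the hierarchy */
	write_str("/cgroup/release_agent", "/sbin/cgroup_release_agent");
	/* per-cgroup flag: notify when "foo" has no tasks or children left */
	write_str("/cgroup/foo/notify_on_release", "1");
	return 0;
}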

To avoid locking issues, invoking the userspace release agent is done via a
workqueue task; cgroups that need to have their release agents invoked by
the workqueue task are linked on to a list.
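
The release agent itself is an ordinary userspace program: the kernel invokes
it with the path of the released cgroup (relative to the hierarchy root) as its
single argument, and the usual response is to rmdir the now-unused directory.
A minimal hypothetical agent, again assuming a /cgroup mount point, might look
like this sketch:

/* Hypothetical minimal release agent, for illustration only. */
#include <stdio.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	char path[4096];

	if (argc < 2)
		return 1;
	snprintf(path, sizeof(path), "/cgroup/%s", argv[1]);
	if (rmdir(path) < 0) {
		/* Racy by design: a task may have been attached or a child
		 * created since the notification was queued; the rmdir
		 * simply fails in that case. */
		perror(path);
		return 1;
	}
	return 0;
}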

[pj@sgi.com: Need to include kmod.h]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 817929ec
+9 −2
@@ -77,10 +77,11 @@ static inline void css_get(struct cgroup_subsys_state *css)
 * css_get()
 */

extern void __css_put(struct cgroup_subsys_state *css);
static inline void css_put(struct cgroup_subsys_state *css)
{
	if (!test_bit(CSS_ROOT, &css->flags))
		atomic_dec(&css->refcnt);
		__css_put(css);
}

struct cgroup {
@@ -112,6 +113,13 @@ struct cgroup {
	 * tasks in this cgroup. Protected by css_set_lock
	 */
	struct list_head css_sets;

	/*
	 * Linked list running through all cgroups that can
	 * potentially be reaped by the release agent. Protected by
	 * release_list_lock
	 */
	struct list_head release_list;
};

/* A css_set is a structure holding pointers to a set of
@@ -293,7 +301,6 @@ struct task_struct *cgroup_iter_next(struct cgroup *cont,
					struct cgroup_iter *it);
void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);


#else /* !CONFIG_CGROUPS */

static inline int cgroup_init_early(void) { return 0; }
+386 −42
@@ -43,8 +43,11 @@
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
#include <asm/atomic.h>

static DEFINE_MUTEX(cgroup_mutex);

/* Generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) &_x ## _subsys,

@@ -83,6 +86,13 @@ struct cgroupfs_root {

	/* Hierarchy-specific flags */
	unsigned long flags;

	/* The path to use for release notifications. No locking
	 * between setting and use - so if userspace updates this
	 * while child cgroups exist, you could miss a
	 * notification. We ensure that it's always a valid
	 * NUL-terminated string */
	char release_agent_path[PATH_MAX];
};


@@ -110,7 +120,13 @@ static int need_forkexit_callback;

/* bits in struct cgroup flags field */
enum {
	/* Control Group is dead */
	CONT_REMOVED,
	/* Control Group has previously had a child cgroup or a task,
	 * but no longer (only if CONT_NOTIFY_ON_RELEASE is set) */
	CONT_RELEASABLE,
	/* Control Group requires release notifications to userspace */
	CONT_NOTIFY_ON_RELEASE,
};

/* convenient tests for these bits */
@@ -124,6 +140,19 @@ enum {
	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
};

inline int cgroup_is_releasable(const struct cgroup *cont)
{
	const int bits =
		(1 << CONT_RELEASABLE) |
		(1 << CONT_NOTIFY_ON_RELEASE);
	return (cont->flags & bits) == bits;
}

inline int notify_on_release(const struct cgroup *cont)
{
	return test_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
}

/*
 * for_each_subsys() allows you to iterate on each subsystem attached to
 * an active hierarchy
@@ -135,6 +164,14 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
#define for_each_root(_root) \
list_for_each_entry(_root, &roots, root_list)

/* the list of cgroups eligible for automatic release. Protected by
 * release_list_lock */
static LIST_HEAD(release_list);
static DEFINE_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cont);

/* Link structure for associating css_set objects with cgroups */
struct cg_cgroup_link {
	/*
@@ -189,11 +226,8 @@ static int use_task_css_set_links;
/*
 * unlink a css_set from the list and free it
 */
static void release_css_set(struct kref *k)
static void unlink_css_set(struct css_set *cg)
{
	struct css_set *cg = container_of(k, struct css_set, ref);
	int i;

	write_lock(&css_set_lock);
	list_del(&cg->list);
	css_set_count--;
@@ -206,11 +240,39 @@ static void release_css_set(struct kref *k)
		kfree(link);
	}
	write_unlock(&css_set_lock);
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
		atomic_dec(&cg->subsys[i]->cgroup->count);
}

static void __release_css_set(struct kref *k, int taskexit)
{
	int i;
	struct css_set *cg = container_of(k, struct css_set, ref);

	unlink_css_set(cg);

	rcu_read_lock();
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup *cont = cg->subsys[i]->cgroup;
		if (atomic_dec_and_test(&cont->count) &&
		    notify_on_release(cont)) {
			if (taskexit)
				set_bit(CONT_RELEASABLE, &cont->flags);
			check_for_release(cont);
		}
	}
	rcu_read_unlock();
	kfree(cg);
}

static void release_css_set(struct kref *k)
{
	__release_css_set(k, 0);
}

static void release_css_set_taskexit(struct kref *k)
{
	__release_css_set(k, 1);
}

/*
 * refcounted get/put for css_set objects
 */
@@ -224,6 +286,11 @@ static inline void put_css_set(struct css_set *cg)
	kref_put(&cg->ref, release_css_set);
}

static inline void put_css_set_taskexit(struct css_set *cg)
{
	kref_put(&cg->ref, release_css_set_taskexit);
}

/*
 * find_existing_css_set() is a helper for
 * find_css_set(), and checks to see whether an existing
@@ -465,8 +532,6 @@ static struct css_set *find_css_set(
 * update of a tasks cgroup pointer by attach_task()
 */

static DEFINE_MUTEX(cgroup_mutex);

/**
 * cgroup_lock - lock out any changes to cgroup structures
 *
@@ -526,6 +591,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
	if (S_ISDIR(inode->i_mode)) {
		struct cgroup *cont = dentry->d_fsdata;
		BUG_ON(!(cgroup_is_removed(cont)));
		/* It's possible for external users to be holding css
		 * reference counts on a cgroup; css_put() needs to
		 * be able to access the cgroup after decrementing
		 * the reference count in order to know if it needs to
		 * queue the cgroup to be handled by the release
		 * agent */
		synchronize_rcu();
		kfree(cont);
	}
	iput(inode);
@@ -657,6 +729,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
		seq_printf(seq, ",%s", ss->name);
	if (test_bit(ROOT_NOPREFIX, &root->flags))
		seq_puts(seq, ",noprefix");
	if (strlen(root->release_agent_path))
		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
	mutex_unlock(&cgroup_mutex);
	return 0;
}
@@ -664,6 +738,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
struct cgroup_sb_opts {
	unsigned long subsys_bits;
	unsigned long flags;
	char *release_agent;
};

/* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -675,6 +750,7 @@ static int parse_cgroupfs_options(char *data,

	opts->subsys_bits = 0;
	opts->flags = 0;
	opts->release_agent = NULL;

	while ((token = strsep(&o, ",")) != NULL) {
		if (!*token)
@@ -683,6 +759,15 @@ static int parse_cgroupfs_options(char *data,
			opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
		} else if (!strcmp(token, "noprefix")) {
			set_bit(ROOT_NOPREFIX, &opts->flags);
		} else if (!strncmp(token, "release_agent=", 14)) {
			/* Specifying two release agents is forbidden */
			if (opts->release_agent)
				return -EINVAL;
			opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
			if (!opts->release_agent)
				return -ENOMEM;
			strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
			opts->release_agent[PATH_MAX - 1] = 0;
		} else {
			struct cgroup_subsys *ss;
			int i;
@@ -732,7 +817,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
	if (!ret)
		cgroup_populate_dir(cont);

	if (opts.release_agent)
		strcpy(root->release_agent_path, opts.release_agent);
 out_unlock:
	if (opts.release_agent)
		kfree(opts.release_agent);
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cont->dentry->d_inode->i_mutex);
	return ret;
@@ -756,6 +845,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
	INIT_LIST_HEAD(&cont->sibling);
	INIT_LIST_HEAD(&cont->children);
	INIT_LIST_HEAD(&cont->css_sets);
	INIT_LIST_HEAD(&cont->release_list);
}

static int cgroup_test_super(struct super_block *sb, void *data)
@@ -830,8 +920,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,

	/* First find the desired set of subsystems */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
	if (ret) {
		if (opts.release_agent)
			kfree(opts.release_agent);
		return ret;
	}

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root)
@@ -840,6 +933,10 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
	init_cgroup_root(root);
	root->subsys_bits = opts.subsys_bits;
	root->flags = opts.flags;
	if (opts.release_agent) {
		strcpy(root->release_agent_path, opts.release_agent);
		kfree(opts.release_agent);
	}

	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);

@@ -1120,7 +1217,7 @@ static int attach_task(struct cgroup *cont, struct task_struct *tsk)
			ss->attach(ss, cont, oldcont, tsk);
		}
	}

	set_bit(CONT_RELEASABLE, &oldcont->flags);
	synchronize_rcu();
	put_css_set(cg);
	return 0;
@@ -1170,6 +1267,9 @@ enum cgroup_filetype {
	FILE_ROOT,
	FILE_DIR,
	FILE_TASKLIST,
	FILE_NOTIFY_ON_RELEASE,
	FILE_RELEASABLE,
	FILE_RELEASE_AGENT,
};

static ssize_t cgroup_write_uint(struct cgroup *cont, struct cftype *cft,
@@ -1240,6 +1340,32 @@ static ssize_t cgroup_common_file_write(struct cgroup *cont,
	case FILE_TASKLIST:
		retval = attach_task_by_pid(cont, buffer);
		break;
	case FILE_NOTIFY_ON_RELEASE:
		clear_bit(CONT_RELEASABLE, &cont->flags);
		if (simple_strtoul(buffer, NULL, 10) != 0)
			set_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
		else
			clear_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
		break;
	case FILE_RELEASE_AGENT:
	{
		struct cgroupfs_root *root = cont->root;
		/* Strip trailing newline */
		if (nbytes && (buffer[nbytes-1] == '\n')) {
			buffer[nbytes-1] = 0;
		}
		if (nbytes < sizeof(root->release_agent_path)) {
			/* We never write anything other than '\0'
			 * into the last char of release_agent_path,
			 * so it always remains a NUL-terminated
			 * string */
			strncpy(root->release_agent_path, buffer, nbytes);
			root->release_agent_path[nbytes] = 0;
		} else {
			retval = -ENOSPC;
		}
		break;
	}
	default:
		retval = -EINVAL;
		goto out2;
@@ -1281,6 +1407,49 @@ static ssize_t cgroup_read_uint(struct cgroup *cont, struct cftype *cft,
	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}

static ssize_t cgroup_common_file_read(struct cgroup *cont,
					  struct cftype *cft,
					  struct file *file,
					  char __user *buf,
					  size_t nbytes, loff_t *ppos)
{
	enum cgroup_filetype type = cft->private;
	char *page;
	ssize_t retval = 0;
	char *s;

	if (!(page = (char *)__get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	s = page;

	switch (type) {
	case FILE_RELEASE_AGENT:
	{
		struct cgroupfs_root *root;
		size_t n;
		mutex_lock(&cgroup_mutex);
		root = cont->root;
		n = strnlen(root->release_agent_path,
			    sizeof(root->release_agent_path));
		n = min(n, (size_t) PAGE_SIZE);
		strncpy(s, root->release_agent_path, n);
		mutex_unlock(&cgroup_mutex);
		s += n;
		break;
	}
	default:
		retval = -EINVAL;
		goto out;
	}
	*s++ = '\n';

	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
out:
	free_page((unsigned long)page);
	return retval;
}

static ssize_t cgroup_file_read(struct file *file, char __user *buf,
				   size_t nbytes, loff_t *ppos)
{
@@ -1699,16 +1868,49 @@ static int cgroup_tasks_release(struct inode *unused_inode,
	return 0;
}

static u64 cgroup_read_notify_on_release(struct cgroup *cont,
					    struct cftype *cft)
{
	return notify_on_release(cont);
}

static u64 cgroup_read_releasable(struct cgroup *cont, struct cftype *cft)
{
	return test_bit(CONT_RELEASABLE, &cont->flags);
}

/*
 * for the common functions, 'private' gives the type of file
 */
static struct cftype cft_tasks = {
static struct cftype files[] = {
	{
		.name = "tasks",
		.open = cgroup_tasks_open,
		.read = cgroup_tasks_read,
		.write = cgroup_common_file_write,
		.release = cgroup_tasks_release,
		.private = FILE_TASKLIST,
	},

	{
		.name = "notify_on_release",
		.read_uint = cgroup_read_notify_on_release,
		.write = cgroup_common_file_write,
		.private = FILE_NOTIFY_ON_RELEASE,
	},

	{
		.name = "releasable",
		.read_uint = cgroup_read_releasable,
		.private = FILE_RELEASABLE,
	}
};

static struct cftype cft_release_agent = {
	.name = "release_agent",
	.read = cgroup_common_file_read,
	.write = cgroup_common_file_write,
	.private = FILE_RELEASE_AGENT,
};

static int cgroup_populate_dir(struct cgroup *cont)
@@ -1719,10 +1921,15 @@ static int cgroup_populate_dir(struct cgroup *cont)
	/* First clear out any existing files */
	cgroup_clear_directory(cont->dentry);

	err = cgroup_add_file(cont, NULL, &cft_tasks);
	err = cgroup_add_files(cont, NULL, files, ARRAY_SIZE(files));
	if (err < 0)
		return err;

	if (cont == cont->top_cgroup) {
		if ((err = cgroup_add_file(cont, NULL, &cft_release_agent)) < 0)
			return err;
	}

	for_each_subsys(cont->root, ss) {
		if (ss->populate && (err = ss->populate(ss, cont)) < 0)
			return err;
@@ -1779,6 +1986,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
	INIT_LIST_HEAD(&cont->sibling);
	INIT_LIST_HEAD(&cont->children);
	INIT_LIST_HEAD(&cont->css_sets);
	INIT_LIST_HEAD(&cont->release_list);

	cont->parent = parent;
	cont->root = parent->root;
@@ -1840,6 +2048,38 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}

static inline int cgroup_has_css_refs(struct cgroup *cont)
{
	/* Check the reference count on each subsystem. Since we
	 * already established that there are no tasks in the
	 * cgroup, if the css refcount is also 0, then there should
	 * be no outstanding references, so the subsystem is safe to
	 * destroy. We scan across all subsystems rather than using
	 * the per-hierarchy linked list of mounted subsystems since
	 * we can be called via check_for_release() with no
	 * synchronization other than RCU, and the subsystem linked
	 * list isn't RCU-safe */
	int i;
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		struct cgroup_subsys_state *css;
		/* Skip subsystems not in this hierarchy */
		if (ss->root != cont->root)
			continue;
		css = cont->subsys[ss->subsys_id];
		/* When called from check_for_release() it's possible
		 * that by this point the cgroup has been removed
		 * and the css deleted. But a false-positive doesn't
		 * matter, since it can only happen if the cgroup
		 * has been deleted and hence no longer needs the
		 * release agent to be called anyway. */
		if (css && atomic_read(&css->refcnt)) {
			return 1;
		}
	}
	return 0;
}

static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
	struct cgroup *cont = dentry->d_fsdata;
@@ -1848,7 +2088,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
	struct cgroup_subsys *ss;
	struct super_block *sb;
	struct cgroupfs_root *root;
	int css_busy = 0;

	/* the vfs holds both inode->i_mutex already */

@@ -1866,20 +2105,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
	root = cont->root;
	sb = root->sb;

	/* Check the reference count on each subsystem. Since we
	 * already established that there are no tasks in the
	 * cgroup, if the css refcount is also 0, then there should
	 * be no outstanding references, so the subsystem is safe to
	 * destroy */
	for_each_subsys(root, ss) {
		struct cgroup_subsys_state *css;
		css = cont->subsys[ss->subsys_id];
		if (atomic_read(&css->refcnt)) {
			css_busy = 1;
			break;
		}
	}
	if (css_busy) {
	if (cgroup_has_css_refs(cont)) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
@@ -1889,7 +2115,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
			ss->destroy(ss, cont);
	}

	spin_lock(&release_list_lock);
	set_bit(CONT_REMOVED, &cont->flags);
	if (!list_empty(&cont->release_list))
		list_del(&cont->release_list);
	spin_unlock(&release_list_lock);
	/* delete my sibling from parent->children */
	list_del(&cont->sibling);
	spin_lock(&cont->dentry->d_lock);
@@ -1901,6 +2131,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
	dput(d);
	root->number_of_cgroups--;

	set_bit(CONT_RELEASABLE, &parent->flags);
	check_for_release(parent);

	mutex_unlock(&cgroup_mutex);
	/* Drop the active superblock reference that we took when we
	 * created the cgroup */
@@ -2263,7 +2496,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
	tsk->cgroups = &init_css_set;
	task_unlock(tsk);
	if (cg)
		put_css_set(cg);
		put_css_set_taskexit(cg);
}

/**
@@ -2374,7 +2607,10 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)

 out_release:
	mutex_unlock(&inode->i_mutex);

	mutex_lock(&cgroup_mutex);
	put_css_set(cg);
	mutex_unlock(&cgroup_mutex);
	deactivate_super(parent->root->sb);
	return ret;
}
@@ -2404,3 +2640,111 @@ int cgroup_is_descendant(const struct cgroup *cont)
	ret = (cont == target);
	return ret;
}

static void check_for_release(struct cgroup *cont)
{
	/* All of these checks rely on RCU to keep the cgroup
	 * structure alive */
	if (cgroup_is_releasable(cont) && !atomic_read(&cont->count)
	    && list_empty(&cont->children) && !cgroup_has_css_refs(cont)) {
		/* Control Group is currently removeable. If it's not
		 * already queued for a userspace notification, queue
		 * it now */
		int need_schedule_work = 0;
		spin_lock(&release_list_lock);
		if (!cgroup_is_removed(cont) &&
		    list_empty(&cont->release_list)) {
			list_add(&cont->release_list, &release_list);
			need_schedule_work = 1;
		}
		spin_unlock(&release_list_lock);
		if (need_schedule_work)
			schedule_work(&release_agent_work);
	}
}

void __css_put(struct cgroup_subsys_state *css)
{
	struct cgroup *cont = css->cgroup;
	rcu_read_lock();
	if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cont)) {
		set_bit(CONT_RELEASABLE, &cont->flags);
		check_for_release(cont);
	}
	rcu_read_unlock();
}

/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
 * unused, and this cgroup will be reprieved from its death sentence,
 * to continue to serve a useful existence.  Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d.  The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task.  We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 *
 */

static void cgroup_release_agent(struct work_struct *work)
{
	BUG_ON(work != &release_agent_work);
	mutex_lock(&cgroup_mutex);
	spin_lock(&release_list_lock);
	while (!list_empty(&release_list)) {
		char *argv[3], *envp[3];
		int i;
		char *pathbuf;
		struct cgroup *cont = list_entry(release_list.next,
						    struct cgroup,
						    release_list);
		list_del_init(&cont->release_list);
		spin_unlock(&release_list_lock);
		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!pathbuf) {
			spin_lock(&release_list_lock);
			continue;
		}

		if (cgroup_path(cont, pathbuf, PAGE_SIZE) < 0) {
			kfree(pathbuf);
			spin_lock(&release_list_lock);
			continue;
		}

		i = 0;
		argv[i++] = cont->root->release_agent_path;
		argv[i++] = (char *)pathbuf;
		argv[i] = NULL;

		i = 0;
		/* minimal command environment */
		envp[i++] = "HOME=/";
		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
		envp[i] = NULL;

		/* Drop the lock while we invoke the usermode helper,
		 * since the exec could involve hitting disk and hence
		 * be a slow process */
		mutex_unlock(&cgroup_mutex);
		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
		kfree(pathbuf);
		mutex_lock(&cgroup_mutex);
		spin_lock(&release_list_lock);
	}
	spin_unlock(&release_list_lock);
	mutex_unlock(&cgroup_mutex);
}