From 13af07df9b7e49f1987cf36aa048dc6c49d0f93d Mon Sep 17 00:00:00 2001
From: Aristeu Rozanski <aris@redhat.com>
Date: Thu, 23 Aug 2012 16:53:29 -0400
Subject: cgroup: revise how we re-populate root directory

When remounting cgroupfs with some subsystems added to it and some
removed, cgroup will remove all the files in root directory and then
re-popluate it.

What I'm doing here is, only remove files which belong to subsystems that
are to be unbinded, and only create files for newly-added subsystems.
The purpose is to have all other files untouched.

This is a preparation for cgroup xattr support.

v7:
- checkpatch warnings fixed
v6:
- no changes
v5:
- no changes
v4:
- refactored cgroup_clear_directory() to not use cgroup_rm_file()
- instead of going thru the list of files, get the file list using the
  subsystems
- use 'subsys_mask' instead of {added,removed}_bits and made
  cgroup_populate_dir() to match the parameters with cgroup_clear_directory()
v3:
- refresh patches after recent refactoring

Original-patch-by: Li Zefan <lizefan@huawei.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Lennart Poettering <lpoetter@redhat.com>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 61 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 48 insertions(+), 13 deletions(-)

(limited to 'kernel/cgroup.c')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 79818507e444..875a7130647c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -824,7 +824,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
 static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
-static int cgroup_populate_dir(struct cgroup *cgrp);
+static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
+			       unsigned long subsys_mask);
 static const struct inode_operations cgroup_dir_inode_operations;
 static const struct file_operations proc_cgroupstats_operations;
 
@@ -963,12 +964,29 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 	return -ENOENT;
 }
 
-static void cgroup_clear_directory(struct dentry *dir)
+/**
+ * cgroup_clear_directory - selective removal of base and subsystem files
+ * @dir: directory containing the files
+ * @base_files: true if the base files should be removed
+ * @subsys_mask: mask of the subsystem ids whose files should be removed
+ */
+static void cgroup_clear_directory(struct dentry *dir, bool base_files,
+				   unsigned long subsys_mask)
 {
 	struct cgroup *cgrp = __d_cgrp(dir);
+	struct cgroup_subsys *ss;
 
-	while (!list_empty(&cgrp->files))
-		cgroup_rm_file(cgrp, NULL);
+	for_each_subsys(cgrp->root, ss) {
+		struct cftype_set *set;
+		if (!test_bit(ss->subsys_id, &subsys_mask))
+			continue;
+		list_for_each_entry(set, &ss->cftsets, node)
+			cgroup_rm_file(cgrp, set->cfts);
+	}
+	if (base_files) {
+		while (!list_empty(&cgrp->files))
+			cgroup_rm_file(cgrp, NULL);
+	}
 }
 
 /*
@@ -977,8 +995,9 @@ static void cgroup_clear_directory(struct dentry *dir)
 static void cgroup_d_remove_dir(struct dentry *dentry)
 {
 	struct dentry *parent;
+	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
 
-	cgroup_clear_directory(dentry);
+	cgroup_clear_directory(dentry, true, root->subsys_bits);
 
 	parent = dentry->d_parent;
 	spin_lock(&parent->d_lock);
@@ -1339,6 +1358,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	struct cgroupfs_root *root = sb->s_fs_info;
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_sb_opts opts;
+	unsigned long added_bits, removed_bits;
 
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
@@ -1354,6 +1374,9 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
 			   task_tgid_nr(current), current->comm);
 
+	added_bits = opts.subsys_bits & ~root->subsys_bits;
+	removed_bits = root->subsys_bits & ~opts.subsys_bits;
+
 	/* Don't allow flags or name to change at remount */
 	if (opts.flags != root->flags ||
 	    (opts.name && strcmp(opts.name, root->name))) {
@@ -1369,8 +1392,9 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	}
 
 	/* clear out any existing files and repopulate subsystem files */
-	cgroup_clear_directory(cgrp->dentry);
-	cgroup_populate_dir(cgrp);
+	cgroup_clear_directory(cgrp->dentry, false, removed_bits);
+	/* re-populate subsystem files */
+	cgroup_populate_dir(cgrp, false, added_bits);
 
 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
@@ -1669,7 +1693,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		BUG_ON(root->number_of_cgroups != 1);
 
 		cred = override_creds(&init_cred);
-		cgroup_populate_dir(root_cgrp);
+		cgroup_populate_dir(root_cgrp, true, root->subsys_bits);
 		revert_creds(cred);
 		mutex_unlock(&cgroup_root_mutex);
 		mutex_unlock(&cgroup_mutex);
@@ -3843,18 +3867,29 @@ static struct cftype files[] = {
 	{ }	/* terminate */
 };
 
-static int cgroup_populate_dir(struct cgroup *cgrp)
+/**
+ * cgroup_populate_dir - selectively creation of files in a directory
+ * @cgrp: target cgroup
+ * @base_files: true if the base files should be added
+ * @subsys_mask: mask of the subsystem ids whose files should be added
+ */
+static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
+			       unsigned long subsys_mask)
 {
 	int err;
 	struct cgroup_subsys *ss;
 
-	err = cgroup_addrm_files(cgrp, NULL, files, true);
-	if (err < 0)
-		return err;
+	if (base_files) {
+		err = cgroup_addrm_files(cgrp, NULL, files, true);
+		if (err < 0)
+			return err;
+	}
 
 	/* process cftsets of each subsystem */
 	for_each_subsys(cgrp->root, ss) {
 		struct cftype_set *set;
+		if (!test_bit(ss->subsys_id, &subsys_mask))
+			continue;
 
 		list_for_each_entry(set, &ss->cftsets, node)
 			cgroup_addrm_files(cgrp, ss, set->cfts, true);
@@ -3988,7 +4023,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 
-	err = cgroup_populate_dir(cgrp);
+	err = cgroup_populate_dir(cgrp, true, root->subsys_bits);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 
 	mutex_unlock(&cgroup_mutex);
-- 
cgit v1.2.3


From 03b1cde6b22f625ae832b939bc7379ec1466aec5 Mon Sep 17 00:00:00 2001
From: Aristeu Rozanski <aris@redhat.com>
Date: Thu, 23 Aug 2012 16:53:30 -0400
Subject: cgroup: add xattr support

This is one of the items in the plumber's wish list.

For use cases:

>> What would the use case be for this?
>
> Attaching meta information to services, in an easily discoverable
> way. For example, in systemd we create one cgroup for each service, and
> could then store data like the main pid of the specific service as an
> xattr on the cgroup itself. That way we'd have almost all service state
> in the cgroupfs, which would make it possible to terminate systemd and
> later restart it without losing any state information. But there's more:
> for example, some very peculiar services cannot be terminated on
> shutdown (i.e. fakeraid DM stuff) and it would be really nice if the
> services in question could just mark that on their cgroup, by setting an
> xattr. On the more desktopy side of things there are other
> possibilities: for example there are plans defining what an application
> is along the lines of a cgroup (i.e. an app being a collection of
> processes). With xattrs one could then attach an icon or human readable
> program name on the cgroup.
>
> The key idea is that this would allow attaching runtime meta information
> to cgroups and everything they model (services, apps, vms), that doesn't
> need any complex userspace infrastructure, has good access control
> (i.e. because the file system enforces that anyway, and there's the
> "trusted." xattr namespace), notifications (inotify), and can easily be
> shared among applications.
>
> Lennart

v7:
- no changes
v6:
- remove user xattr namespace, only allow trusted and security
v5:
- check for capabilities before setting/removing xattrs
v4:
- no changes
v3:
- instead of config option, use mount option to enable xattr support

Original-patch-by: Li Zefan <lizefan@huawei.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Lennart Poettering <lpoetter@redhat.com>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 93 insertions(+), 7 deletions(-)

(limited to 'kernel/cgroup.c')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 875a7130647c..508b4a97ab19 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -276,7 +276,8 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
 
 /* bits in struct cgroupfs_root flags field */
 enum {
-	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
+	ROOT_NOPREFIX,	/* mounted subsystems have no named prefix */
+	ROOT_XATTR,	/* supports extended attributes */
 };
 
 static int cgroup_is_releasable(const struct cgroup *cgrp)
@@ -913,15 +914,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 */
 		BUG_ON(!list_empty(&cgrp->pidlists));
 
+		simple_xattrs_free(&cgrp->xattrs);
+
 		kfree_rcu(cgrp, rcu_head);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
 		struct cgroup *cgrp = dentry->d_parent->d_fsdata;
+		struct cftype *cft = cfe->type;
 
 		WARN_ONCE(!list_empty(&cfe->node) &&
 			  cgrp != &cgrp->root->top_cgroup,
 			  "cfe still linked for %s\n", cfe->type->name);
 		kfree(cfe);
+		simple_xattrs_free(&cft->xattrs);
 	}
 	iput(inode);
 }
@@ -1140,6 +1145,8 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_printf(seq, ",%s", ss->name);
 	if (test_bit(ROOT_NOPREFIX, &root->flags))
 		seq_puts(seq, ",noprefix");
+	if (test_bit(ROOT_XATTR, &root->flags))
+		seq_puts(seq, ",xattr");
 	if (strlen(root->release_agent_path))
 		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
 	if (clone_children(&root->top_cgroup))
@@ -1208,6 +1215,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			opts->clone_children = true;
 			continue;
 		}
+		if (!strcmp(token, "xattr")) {
+			set_bit(ROOT_XATTR, &opts->flags);
+			continue;
+		}
 		if (!strncmp(token, "release_agent=", 14)) {
 			/* Specifying two release agents is forbidden */
 			if (opts->release_agent)
@@ -1425,6 +1436,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	mutex_init(&cgrp->pidlist_mutex);
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
+	simple_xattrs_init(&cgrp->xattrs);
 }
 
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1769,6 +1781,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 
+	simple_xattrs_free(&cgrp->xattrs);
+
 	kill_litter_super(sb);
 	cgroup_drop_root(root);
 }
@@ -2575,6 +2589,64 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
 }
 
+static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
+{
+	if (S_ISDIR(dentry->d_inode->i_mode))
+		return &__d_cgrp(dentry)->xattrs;
+	else
+		return &__d_cft(dentry)->xattrs;
+}
+
+static inline int xattr_enabled(struct dentry *dentry)
+{
+	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
+	return test_bit(ROOT_XATTR, &root->flags);
+}
+
+static bool is_valid_xattr(const char *name)
+{
+	if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+	    !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
+		return true;
+	return false;
+}
+
+static int cgroup_setxattr(struct dentry *dentry, const char *name,
+			   const void *val, size_t size, int flags)
+{
+	if (!xattr_enabled(dentry))
+		return -EOPNOTSUPP;
+	if (!is_valid_xattr(name))
+		return -EINVAL;
+	return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
+}
+
+static int cgroup_removexattr(struct dentry *dentry, const char *name)
+{
+	if (!xattr_enabled(dentry))
+		return -EOPNOTSUPP;
+	if (!is_valid_xattr(name))
+		return -EINVAL;
+	return simple_xattr_remove(__d_xattrs(dentry), name);
+}
+
+static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
+			       void *buf, size_t size)
+{
+	if (!xattr_enabled(dentry))
+		return -EOPNOTSUPP;
+	if (!is_valid_xattr(name))
+		return -EINVAL;
+	return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
+}
+
+static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
+{
+	if (!xattr_enabled(dentry))
+		return -EOPNOTSUPP;
+	return simple_xattr_list(__d_xattrs(dentry), buf, size);
+}
+
 static const struct file_operations cgroup_file_operations = {
 	.read = cgroup_file_read,
 	.write = cgroup_file_write,
@@ -2583,11 +2655,22 @@ static const struct file_operations cgroup_file_operations = {
 	.release = cgroup_file_release,
 };
 
+static const struct inode_operations cgroup_file_inode_operations = {
+	.setxattr = cgroup_setxattr,
+	.getxattr = cgroup_getxattr,
+	.listxattr = cgroup_listxattr,
+	.removexattr = cgroup_removexattr,
+};
+
 static const struct inode_operations cgroup_dir_inode_operations = {
 	.lookup = cgroup_lookup,
 	.mkdir = cgroup_mkdir,
 	.rmdir = cgroup_rmdir,
 	.rename = cgroup_rename,
+	.setxattr = cgroup_setxattr,
+	.getxattr = cgroup_getxattr,
+	.listxattr = cgroup_listxattr,
+	.removexattr = cgroup_removexattr,
 };
 
 static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -2635,6 +2718,7 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 	} else if (S_ISREG(mode)) {
 		inode->i_size = 0;
 		inode->i_fop = &cgroup_file_operations;
+		inode->i_op = &cgroup_file_inode_operations;
 	}
 	d_instantiate(dentry, inode);
 	dget(dentry);	/* Extra count - pin the dentry in core */
@@ -2695,7 +2779,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 }
 
 static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-			   const struct cftype *cft)
+			   struct cftype *cft)
 {
 	struct dentry *dir = cgrp->dentry;
 	struct cgroup *parent = __d_cgrp(dir);
@@ -2705,6 +2789,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 	umode_t mode;
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
 
+	simple_xattrs_init(&cft->xattrs);
+
 	/* does @cft->flags tell us to skip creation on @cgrp? */
 	if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
 		return 0;
@@ -2745,9 +2831,9 @@ out:
 }
 
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-			      const struct cftype cfts[], bool is_add)
+			      struct cftype cfts[], bool is_add)
 {
-	const struct cftype *cft;
+	struct cftype *cft;
 	int err, ret = 0;
 
 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
@@ -2781,7 +2867,7 @@ static void cgroup_cfts_prepare(void)
 }
 
 static void cgroup_cfts_commit(struct cgroup_subsys *ss,
-			       const struct cftype *cfts, bool is_add)
+			       struct cftype *cfts, bool is_add)
 	__releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
 {
 	LIST_HEAD(pending);
@@ -2832,7 +2918,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
  * function currently returns 0 as long as @cfts registration is successful
  * even if some file creation attempts on existing cgroups fail.
  */
-int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
 	struct cftype_set *set;
 
@@ -2862,7 +2948,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
  * Returns 0 on successful unregistration, -ENOENT if @cfts is not
  * registered with @ss.
  */
-int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
 	struct cftype_set *set;
 
-- 
cgit v1.2.3


From a1a71b45a66fd3c3c453b55fbd180f8fccdd1daa Mon Sep 17 00:00:00 2001
From: Aristeu Rozanski <aris@redhat.com>
Date: Thu, 23 Aug 2012 16:53:31 -0400
Subject: cgroup: rename subsys_bits to subsys_mask

In a previous discussion, Tejun Heo suggested to rename references to
subsys_bits (added_bits, removed_bits, etc) by something more meaningful.

Cc: Li Zefan <lizefan@huawei.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Lennart Poettering <lpoetter@redhat.com>
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 84 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 42 insertions(+), 42 deletions(-)

(limited to 'kernel/cgroup.c')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 508b4a97ab19..ced292d720b9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -111,13 +111,13 @@ struct cgroupfs_root {
 	 * The bitmask of subsystems intended to be attached to this
 	 * hierarchy
 	 */
-	unsigned long subsys_bits;
+	unsigned long subsys_mask;
 
 	/* Unique id for this hierarchy. */
 	int hierarchy_id;
 
 	/* The bitmask of subsystems currently attached to this hierarchy */
-	unsigned long actual_subsys_bits;
+	unsigned long actual_subsys_mask;
 
 	/* A list running through the attached subsystems */
 	struct list_head subsys_list;
@@ -557,7 +557,7 @@ static struct css_set *find_existing_css_set(
 	 * won't change, so no need for locking.
 	 */
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		if (root->subsys_bits & (1UL << i)) {
+		if (root->subsys_mask & (1UL << i)) {
 			/* Subsystem is in this hierarchy. So we want
 			 * the subsystem state from the new
 			 * cgroup */
@@ -1002,7 +1002,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 	struct dentry *parent;
 	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
 
-	cgroup_clear_directory(dentry, true, root->subsys_bits);
+	cgroup_clear_directory(dentry, true, root->subsys_mask);
 
 	parent = dentry->d_parent;
 	spin_lock(&parent->d_lock);
@@ -1046,22 +1046,22 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
  * returns an error, no reference counts are touched.
  */
 static int rebind_subsystems(struct cgroupfs_root *root,
-			      unsigned long final_bits)
+			      unsigned long final_subsys_mask)
 {
-	unsigned long added_bits, removed_bits;
+	unsigned long added_mask, removed_mask;
 	struct cgroup *cgrp = &root->top_cgroup;
 	int i;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
 
-	removed_bits = root->actual_subsys_bits & ~final_bits;
-	added_bits = final_bits & ~root->actual_subsys_bits;
+	removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
+	added_mask = final_subsys_mask & ~root->actual_subsys_mask;
 	/* Check that any added subsystems are currently free */
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		unsigned long bit = 1UL << i;
 		struct cgroup_subsys *ss = subsys[i];
-		if (!(bit & added_bits))
+		if (!(bit & added_mask))
 			continue;
 		/*
 		 * Nobody should tell us to do a subsys that doesn't exist:
@@ -1086,7 +1086,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 		unsigned long bit = 1UL << i;
-		if (bit & added_bits) {
+		if (bit & added_mask) {
 			/* We're binding this subsystem to this hierarchy */
 			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i]);
@@ -1099,7 +1099,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			if (ss->bind)
 				ss->bind(cgrp);
 			/* refcount was already taken, and we're keeping it */
-		} else if (bit & removed_bits) {
+		} else if (bit & removed_mask) {
 			/* We're removing this subsystem */
 			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
@@ -1112,7 +1112,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			list_move(&ss->sibling, &rootnode.subsys_list);
 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
-		} else if (bit & final_bits) {
+		} else if (bit & final_subsys_mask) {
 			/* Subsystem state should already exist */
 			BUG_ON(ss == NULL);
 			BUG_ON(!cgrp->subsys[i]);
@@ -1129,7 +1129,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			BUG_ON(cgrp->subsys[i]);
 		}
 	}
-	root->subsys_bits = root->actual_subsys_bits = final_bits;
+	root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
 	synchronize_rcu();
 
 	return 0;
@@ -1158,7 +1158,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 }
 
 struct cgroup_sb_opts {
-	unsigned long subsys_bits;
+	unsigned long subsys_mask;
 	unsigned long flags;
 	char *release_agent;
 	bool clone_children;
@@ -1267,7 +1267,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			/* Mutually exclusive option 'all' + subsystem name */
 			if (all_ss)
 				return -EINVAL;
-			set_bit(i, &opts->subsys_bits);
+			set_bit(i, &opts->subsys_mask);
 			one_ss = true;
 
 			break;
@@ -1288,7 +1288,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 				continue;
 			if (ss->disabled)
 				continue;
-			set_bit(i, &opts->subsys_bits);
+			set_bit(i, &opts->subsys_mask);
 		}
 	}
 
@@ -1300,19 +1300,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	 * the cpuset subsystem.
 	 */
 	if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
-	    (opts->subsys_bits & mask))
+	    (opts->subsys_mask & mask))
 		return -EINVAL;
 
 
 	/* Can't specify "none" and some subsystems */
-	if (opts->subsys_bits && opts->none)
+	if (opts->subsys_mask && opts->none)
 		return -EINVAL;
 
 	/*
 	 * We either have to specify by name or by subsystems. (So all
 	 * empty hierarchies must have a name).
 	 */
-	if (!opts->subsys_bits && !opts->name)
+	if (!opts->subsys_mask && !opts->name)
 		return -EINVAL;
 
 	/*
@@ -1324,7 +1324,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
 		unsigned long bit = 1UL << i;
 
-		if (!(bit & opts->subsys_bits))
+		if (!(bit & opts->subsys_mask))
 			continue;
 		if (!try_module_get(subsys[i]->module)) {
 			module_pin_failed = true;
@@ -1341,7 +1341,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			/* drop refcounts only on the ones we took */
 			unsigned long bit = 1UL << i;
 
-			if (!(bit & opts->subsys_bits))
+			if (!(bit & opts->subsys_mask))
 				continue;
 			module_put(subsys[i]->module);
 		}
@@ -1351,13 +1351,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	return 0;
 }
 
-static void drop_parsed_module_refcounts(unsigned long subsys_bits)
+static void drop_parsed_module_refcounts(unsigned long subsys_mask)
 {
 	int i;
 	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
 		unsigned long bit = 1UL << i;
 
-		if (!(bit & subsys_bits))
+		if (!(bit & subsys_mask))
 			continue;
 		module_put(subsys[i]->module);
 	}
@@ -1369,7 +1369,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	struct cgroupfs_root *root = sb->s_fs_info;
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_sb_opts opts;
-	unsigned long added_bits, removed_bits;
+	unsigned long added_mask, removed_mask;
 
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
@@ -1381,31 +1381,31 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 
 	/* See feature-removal-schedule.txt */
-	if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
+	if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
 		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
 			   task_tgid_nr(current), current->comm);
 
-	added_bits = opts.subsys_bits & ~root->subsys_bits;
-	removed_bits = root->subsys_bits & ~opts.subsys_bits;
+	added_mask = opts.subsys_mask & ~root->subsys_mask;
+	removed_mask = root->subsys_mask & ~opts.subsys_mask;
 
 	/* Don't allow flags or name to change at remount */
 	if (opts.flags != root->flags ||
 	    (opts.name && strcmp(opts.name, root->name))) {
 		ret = -EINVAL;
-		drop_parsed_module_refcounts(opts.subsys_bits);
+		drop_parsed_module_refcounts(opts.subsys_mask);
 		goto out_unlock;
 	}
 
-	ret = rebind_subsystems(root, opts.subsys_bits);
+	ret = rebind_subsystems(root, opts.subsys_mask);
 	if (ret) {
-		drop_parsed_module_refcounts(opts.subsys_bits);
+		drop_parsed_module_refcounts(opts.subsys_mask);
 		goto out_unlock;
 	}
 
 	/* clear out any existing files and repopulate subsystem files */
-	cgroup_clear_directory(cgrp->dentry, false, removed_bits);
+	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
 	/* re-populate subsystem files */
-	cgroup_populate_dir(cgrp, false, added_bits);
+	cgroup_populate_dir(cgrp, false, added_mask);
 
 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
@@ -1491,8 +1491,8 @@ static int cgroup_test_super(struct super_block *sb, void *data)
 	 * If we asked for subsystems (or explicitly for no
 	 * subsystems) then they must match
 	 */
-	if ((opts->subsys_bits || opts->none)
-	    && (opts->subsys_bits != root->subsys_bits))
+	if ((opts->subsys_mask || opts->none)
+	    && (opts->subsys_mask != root->subsys_mask))
 		return 0;
 
 	return 1;
@@ -1502,7 +1502,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 {
 	struct cgroupfs_root *root;
 
-	if (!opts->subsys_bits && !opts->none)
+	if (!opts->subsys_mask && !opts->none)
 		return NULL;
 
 	root = kzalloc(sizeof(*root), GFP_KERNEL);
@@ -1515,7 +1515,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 	}
 	init_cgroup_root(root);
 
-	root->subsys_bits = opts->subsys_bits;
+	root->subsys_mask = opts->subsys_mask;
 	root->flags = opts->flags;
 	if (opts->release_agent)
 		strcpy(root->release_agent_path, opts->release_agent);
@@ -1547,7 +1547,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)
 	if (!opts->new_root)
 		return -EINVAL;
 
-	BUG_ON(!opts->subsys_bits && !opts->none);
+	BUG_ON(!opts->subsys_mask && !opts->none);
 
 	ret = set_anon_super(sb, NULL);
 	if (ret)
@@ -1665,7 +1665,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		if (ret)
 			goto unlock_drop;
 
-		ret = rebind_subsystems(root, root->subsys_bits);
+		ret = rebind_subsystems(root, root->subsys_mask);
 		if (ret == -EBUSY) {
 			free_cg_links(&tmp_cg_links);
 			goto unlock_drop;
@@ -1705,7 +1705,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		BUG_ON(root->number_of_cgroups != 1);
 
 		cred = override_creds(&init_cred);
-		cgroup_populate_dir(root_cgrp, true, root->subsys_bits);
+		cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
 		revert_creds(cred);
 		mutex_unlock(&cgroup_root_mutex);
 		mutex_unlock(&cgroup_mutex);
@@ -1717,7 +1717,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		 */
 		cgroup_drop_root(opts.new_root);
 		/* no subsys rebinding, so refcounts don't change */
-		drop_parsed_module_refcounts(opts.subsys_bits);
+		drop_parsed_module_refcounts(opts.subsys_mask);
 	}
 
 	kfree(opts.release_agent);
@@ -1731,7 +1731,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
  drop_new_super:
 	deactivate_locked_super(sb);
  drop_modules:
-	drop_parsed_module_refcounts(opts.subsys_bits);
+	drop_parsed_module_refcounts(opts.subsys_mask);
  out_err:
 	kfree(opts.release_agent);
 	kfree(opts.name);
@@ -4109,7 +4109,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 
-	err = cgroup_populate_dir(cgrp, true, root->subsys_bits);
+	err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 
 	mutex_unlock(&cgroup_mutex);
-- 
cgit v1.2.3


From be45c900fdc2c66baad5a7703fb8136991d88aeb Mon Sep 17 00:00:00 2001
From: Daniel Wagner <daniel.wagner@bmw-carit.de>
Date: Thu, 13 Sep 2012 09:50:55 +0200
Subject: cgroup: Remove CGROUP_BUILTIN_SUBSYS_COUNT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CGROUP_BUILTIN_SUBSYS_COUNT is used as start index or stop index when
looping over the subsys array looking either at the builtin or the
module subsystems. Since all the builtin subsystems have an id which
is lower then CGROUP_BUILTIN_SUBSYS_COUNT we know that any module will
have an id larger than CGROUP_BUILTIN_SUBSYS_COUNT. In short the ids
are sorted.

We are about to change id assignment to happen only at compile time
later in this series. That means we can't rely on the above trick
since all ids will always be defined at compile time. Furthermore,
ordering the builtin subsystems and the module subsystems is not
really necessary.

So we need a different way to know which subsystem is a builtin or a
module one. We can use the subsys[]->module pointer for this. Any
place where we need to know if a subsys is module we just check for
the pointer. If it is NULL then the subsystem is a builtin one.

With this we are able to drop the CGROUP_BUILTIN_SUBSYS_COUNT
enum. Though we need to introduce a temporary placeholder so that we
don't get a compilation error when only CONFIG_CGROUP is selected and
no single controller. An empty enum definition is not valid. Later in
this series we are able to remove the placeholder again.

And with this change we get a fix for this:

kernel/cgroup.c: In function ‘cgroup_load_subsys’:
kernel/cgroup.c:4326:38: warning: array subscript is below array bounds [-Warray-bounds]

when CONFIG_CGROUP=y and no built in controller was enabled.

Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Cc: Gao feng <gaofeng@cn.fujitsu.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: John Fastabend <john.r.fastabend@intel.com>
Cc: netdev@vger.kernel.org
Cc: cgroups@vger.kernel.org
---
 kernel/cgroup.c | 68 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 28 deletions(-)

(limited to 'kernel/cgroup.c')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ced292d720b9..1b18090269ad 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -88,7 +88,7 @@ static DEFINE_MUTEX(cgroup_root_mutex);
 
 /*
  * Generate an array of cgroup subsystem pointers. At boot time, this is
- * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
+ * populated with the built in subsystems, and modular subsystems are
  * registered after that. The mutable section of this array is protected by
  * cgroup_mutex.
  */
@@ -1321,7 +1321,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	 * take duplicate reference counts on a subsystem that's already used,
 	 * but rebind_subsystems handles this case.
 	 */
-	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		unsigned long bit = 1UL << i;
 
 		if (!(bit & opts->subsys_mask))
@@ -1337,7 +1337,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 		 * raced with a module_delete call, and to the user this is
 		 * essentially a "subsystem doesn't exist" case.
 		 */
-		for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
+		for (i--; i >= 0; i--) {
 			/* drop refcounts only on the ones we took */
 			unsigned long bit = 1UL << i;
 
@@ -1354,7 +1354,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 static void drop_parsed_module_refcounts(unsigned long subsys_mask)
 {
 	int i;
-	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		unsigned long bit = 1UL << i;
 
 		if (!(bit & subsys_mask))
@@ -4442,8 +4442,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	 * since cgroup_init_subsys will have already taken care of it.
 	 */
 	if (ss->module == NULL) {
-		/* a few sanity checks */
-		BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
+		/* a sanity check */
 		BUG_ON(subsys[ss->subsys_id] != ss);
 		return 0;
 	}
@@ -4457,7 +4456,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	 */
 	mutex_lock(&cgroup_mutex);
 	/* find the first empty slot in the array */
-	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		if (subsys[i] == NULL)
 			break;
 	}
@@ -4560,7 +4559,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 
 	mutex_lock(&cgroup_mutex);
 	/* deassign the subsys_id */
-	BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
 	subsys[ss->subsys_id] = NULL;
 
 	/* remove subsystem from rootnode's list of subsystems */
@@ -4623,10 +4621,13 @@ int __init cgroup_init_early(void)
 	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
 		INIT_HLIST_HEAD(&css_set_table[i]);
 
-	/* at bootup time, we don't worry about modular subsystems */
-	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 
+		/* at bootup time, we don't worry about modular subsystems */
+		if (!ss || ss->module)
+			continue;
+
 		BUG_ON(!ss->name);
 		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
 		BUG_ON(!ss->create);
@@ -4659,9 +4660,12 @@ int __init cgroup_init(void)
 	if (err)
 		return err;
 
-	/* at bootup time, we don't worry about modular subsystems */
-	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
+
+		/* at bootup time, we don't worry about modular subsystems */
+		if (!ss || ss->module)
+			continue;
 		if (!ss->early_init)
 			cgroup_init_subsys(ss);
 		if (ss->use_id)
@@ -4856,13 +4860,16 @@ void cgroup_fork_callbacks(struct task_struct *child)
 {
 	if (need_forkexit_callback) {
 		int i;
-		/*
-		 * forkexit callbacks are only supported for builtin
-		 * subsystems, and the builtin section of the subsys array is
-		 * immutable, so we don't need to lock the subsys array here.
-		 */
-		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
+
+			/*
+			 * forkexit callbacks are only supported for
+			 * builtin subsystems.
+			 */
+			if (!ss || ss->module)
+				continue;
+
 			if (ss->fork)
 				ss->fork(child);
 		}
@@ -4967,12 +4974,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	tsk->cgroups = &init_css_set;
 
 	if (run_callbacks && need_forkexit_callback) {
-		/*
-		 * modular subsystems can't use callbacks, so no need to lock
-		 * the subsys array
-		 */
-		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
+
+			/* modular subsystems can't use callbacks */
+			if (!ss || ss->module)
+				continue;
+
 			if (ss->exit) {
 				struct cgroup *old_cgrp =
 					rcu_dereference_raw(cg->subsys[i])->cgroup;
@@ -5158,13 +5166,17 @@ static int __init cgroup_disable(char *str)
 	while ((token = strsep(&str, ",")) != NULL) {
 		if (!*token)
 			continue;
-		/*
-		 * cgroup_disable, being at boot time, can't know about module
-		 * subsystems, so we don't worry about them.
-		 */
-		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 
+			/*
+			 * cgroup_disable, being at boot time, can't
+			 * know about module subsystems, so we don't
+			 * worry about them.
+			 */
+			if (!ss || ss->module)
+				continue;
+
 			if (!strcmp(token, ss->name)) {
 				ss->disabled = 1;
 				printk(KERN_INFO "Disabling %s control group"
-- 
cgit v1.2.3


From 5fc0b02544b3b9bd3db5a8156b5f3e7350f8e797 Mon Sep 17 00:00:00 2001
From: Daniel Wagner <daniel.wagner@bmw-carit.de>
Date: Wed, 12 Sep 2012 16:12:05 +0200
Subject: cgroup: Wrap subsystem selection macro

Before we are able to define all subsystem ids at compile time we need
a more fine grained control what gets defined when we include
cgroup_subsys.h. For example we define the enums for the subsystems or
to declare for struct cgroup_subsys (builtin subsystem) by including
cgroup_subsys.h and defining SUBSYS accordingly.

Currently, the decision if a subsys is used is defined inside the
header by testing if CONFIG_*=y is true. By moving this test outside
of cgroup_subsys.h we are able to control it on the include level.

This is done by introducing IS_SUBSYS_ENABLED which then is defined
according the task, e.g. is CONFIG_*=y or CONFIG_*=m.

Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Cc: Gao feng <gaofeng@cn.fujitsu.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: John Fastabend <john.r.fastabend@intel.com>
Cc: netdev@vger.kernel.org
Cc: cgroups@vger.kernel.org
---
 kernel/cgroup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/cgroup.c')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1b18090269ad..95e9f3fdb729 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -93,6 +93,7 @@ static DEFINE_MUTEX(cgroup_root_mutex);
  * cgroup_mutex.
  */
 #define SUBSYS(_x) &_x ## _subsys,
+#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
 static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
 };
-- 
cgit v1.2.3


From 80f4c87774721e864d5a5a1f7aca3e95fd90e194 Mon Sep 17 00:00:00 2001
From: Daniel Wagner <daniel.wagner@bmw-carit.de>
Date: Wed, 12 Sep 2012 16:12:06 +0200
Subject: cgroup: Do not depend on a given order when populating the subsys
 array

The *_subsys_id will be used as index to access the subsys. Therefore
we need to care we populate the subsystem at the correct position by
using designated initialization.

With this change we are able to interleave builtin and modules in the subsys
array.

Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Cc: Gao feng <gaofeng@cn.fujitsu.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: John Fastabend <john.r.fastabend@intel.com>
Cc: netdev@vger.kernel.org
Cc: cgroups@vger.kernel.org
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/cgroup.c')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 95e9f3fdb729..2bfc78f531b6 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -92,7 +92,7 @@ static DEFINE_MUTEX(cgroup_root_mutex);
  * registered after that. The mutable section of this array is protected by
  * cgroup_mutex.
  */
-#define SUBSYS(_x) &_x ## _subsys,
+#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
 #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
 static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
-- 
cgit v1.2.3


From 8a8e04df4747661daaee77e98e102d99c9e09b98 Mon Sep 17 00:00:00 2001
From: Daniel Wagner <daniel.wagner@bmw-carit.de>
Date: Wed, 12 Sep 2012 16:12:07 +0200
Subject: cgroup: Assign subsystem IDs during compile time

WARNING: With this change it is impossible to load external built
controllers anymore.

In case where CONFIG_NETPRIO_CGROUP=m and CONFIG_NET_CLS_CGROUP=m is
set, corresponding subsys_id should also be a constant. Up to now,
net_prio_subsys_id and net_cls_subsys_id would be of the type int and
the value would be assigned during runtime.

By switching the macro definition IS_SUBSYS_ENABLED from IS_BUILTIN
to IS_ENABLED, all *_subsys_id will have constant value. That means we
need to remove all the code which assumes a value can be assigned to
net_prio_subsys_id and net_cls_subsys_id.

A close look is necessary on the RCU part which was introduces by
following patch:

  commit f845172531fb7410c7fb7780b1a6e51ee6df7d52
  Author:	Herbert Xu <herbert@gondor.apana.org.au>  Mon May 24 09:12:34 2010
  Committer:	David S. Miller <davem@davemloft.net>  Mon May 24 09:12:34 2010

  cls_cgroup: Store classid in struct sock

  Tis code was added to init_cgroup_cls()

	  /* We can't use rcu_assign_pointer because this is an int. */
	  smp_wmb();
	  net_cls_subsys_id = net_cls_subsys.subsys_id;

  respectively to exit_cgroup_cls()

	  net_cls_subsys_id = -1;
	  synchronize_rcu();

  and in module version of task_cls_classid()

	  rcu_read_lock();
	  id = rcu_dereference(net_cls_subsys_id);
	  if (id >= 0)
		  classid = container_of(task_subsys_state(p, id),
					 struct cgroup_cls_state, css)->classid;
	  rcu_read_unlock();

Without an explicit explaination why the RCU part is needed. (The
rcu_deference was fixed by exchanging it to rcu_derefence_index_check()
in a later commit, but that is a minor detail.)

So here is my pondering why it was introduced and why it safe to
remove it now. Note that this code was copied over to net_prio the
reasoning holds for that subsystem too.

The idea behind the RCU use for net_cls_subsys_id is to make sure we
get a valid pointer back from task_subsys_state(). task_subsys_state()
is just blindly accessing the subsys array and returning the
pointer. Obviously, passing in -1 as id into task_subsys_state()
returns an invalid value (out of lower bound).

So this code makes sure that only after module is loaded and the
subsystem registered, the id is assigned.

Before unregistering the module all old readers must have left the
critical section. This is done by assigning -1 to the id and issuing a
synchronized_rcu(). Any new readers wont call task_subsys_state()
anymore and therefore it is safe to unregister the subsystem.

The new code relies on the same trick, but it looks at the subsys
pointer return by task_subsys_state() (remember the id is constant
and therefore we allways have a valid index into the subsys
array).

No precautions need to be taken during module loading
module. Eventually, all CPUs will get a valid pointer back from
task_subsys_state() because rebind_subsystem() which is called after
the module init() function will assigned subsys[net_cls_subsys_id] the
newly loaded module subsystem pointer.

When the subsystem is about to be removed, rebind_subsystem() will
called before the module exit() function. In this case,
rebind_subsys() will assign subsys[net_cls_subsys_id] a NULL pointer
and then it calls synchronize_rcu(). All old readers have left by then
the critical section. Any new reader wont access the subsystem
anymore.  At this point we are safe to unregister the subsystem. No
synchronize_rcu() call is needed.

Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Gao feng <gaofeng@cn.fujitsu.com>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: John Fastabend <john.r.fastabend@intel.com>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: netdev@vger.kernel.org
Cc: cgroups@vger.kernel.org
---
 kernel/cgroup.c | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

(limited to 'kernel/cgroup.c')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2bfc78f531b6..485cc1487ea2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4451,24 +4451,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	/* init base cftset */
 	cgroup_init_cftsets(ss);
 
-	/*
-	 * need to register a subsys id before anything else - for example,
-	 * init_cgroup_css needs it.
-	 */
 	mutex_lock(&cgroup_mutex);
-	/* find the first empty slot in the array */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		if (subsys[i] == NULL)
-			break;
-	}
-	if (i == CGROUP_SUBSYS_COUNT) {
-		/* maximum number of subsystems already registered! */
-		mutex_unlock(&cgroup_mutex);
-		return -EBUSY;
-	}
-	/* assign ourselves the subsys_id */
-	ss->subsys_id = i;
-	subsys[i] = ss;
+	subsys[ss->subsys_id] = ss;
 
 	/*
 	 * no ss->create seems to need anything important in the ss struct, so
@@ -4477,7 +4461,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	css = ss->create(dummytop);
 	if (IS_ERR(css)) {
 		/* failure case - need to deassign the subsys[] slot. */
-		subsys[i] = NULL;
+		subsys[ss->subsys_id] = NULL;
 		mutex_unlock(&cgroup_mutex);
 		return PTR_ERR(css);
 	}
@@ -4493,7 +4477,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 		if (ret) {
 			dummytop->subsys[ss->subsys_id] = NULL;
 			ss->destroy(dummytop);
-			subsys[i] = NULL;
+			subsys[ss->subsys_id] = NULL;
 			mutex_unlock(&cgroup_mutex);
 			return ret;
 		}
-- 
cgit v1.2.3


From 8c7f6edbda01f1b1a2e60ad61f14fe38023e433b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Sep 2012 12:20:58 -0700
Subject: cgroup: mark subsystems with broken hierarchy support and whine if
 cgroups are nested for them

Currently, cgroup hierarchy support is a mess.  cpu related subsystems
behave correctly - configuration, accounting and control on a parent
properly cover its children.  blkio and freezer completely ignore
hierarchy and treat all cgroups as if they're directly under the root
cgroup.  Others show yet different behaviors.

These differing interpretations of cgroup hierarchy make using cgroup
confusing and it impossible to co-mount controllers into the same
hierarchy and obtain sane behavior.

Eventually, we want full hierarchy support from all subsystems and
probably a unified hierarchy.  Users using separate hierarchies
expecting completely different behaviors depending on the mounted
subsystem is deterimental to making any progress on this front.

This patch adds cgroup_subsys.broken_hierarchy and sets it to %true
for controllers which are lacking in hierarchy support.  The goal of
this patch is two-fold.

* Move users away from using hierarchy on currently non-hierarchical
  subsystems, so that implementing proper hierarchy support on those
  doesn't surprise them.

* Keep track of which controllers are broken how and nudge the
  subsystems to implement proper hierarchy support.

For now, start with a single warning message.  We can whine louder
later on.

v2: Fixed a typo spotted by Michal. Warning message updated.

v3: Updated memcg part so that it doesn't generate warning in the
    cases where .use_hierarchy=false doesn't make the behavior
    different from root.use_hierarchy=true.  Fixed a typo spotted by
    Glauber.

v4: Check ->broken_hierarchy after cgroup creation is complete so that
    ->create() can affect the result per Michal.  Dropped unnecessary
    memcg root handling per Michal.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Li Zefan <lizefan@huawei.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Turner <pjt@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 kernel/cgroup.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel/cgroup.c')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 79818507e444..b7d9606b17d7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3954,8 +3954,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
 
 	for_each_subsys(root, ss) {
-		struct cgroup_subsys_state *css = ss->create(cgrp);
+		struct cgroup_subsys_state *css;
 
+		css = ss->create(cgrp);
 		if (IS_ERR(css)) {
 			err = PTR_ERR(css);
 			goto err_destroy;
@@ -3969,6 +3970,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		/* At error, ->destroy() callback has to free assigned ID. */
 		if (clone_children(parent) && ss->post_clone)
 			ss->post_clone(cgrp);
+
+		if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
+		    parent->parent) {
+			pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
+				   current->comm, current->pid, ss->name);
+			if (!strcmp(ss->name, "memory"))
+				pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
+			ss->warned_broken_hierarchy = true;
+		}
 	}
 
 	list_add(&cgrp->sibling, &cgrp->parent->children);
-- 
cgit v1.2.3