summary | refs | log | tree | commit | diff
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
author Jiri Kosina <jkosina@suse.cz> 2013-12-19 15:08:03 +0100
committer Jiri Kosina <jkosina@suse.cz> 2013-12-19 15:08:32 +0100
commit e23c34bb41da65f354fb7eee04300c56ee48f60c (patch)
tree 549fbe449d55273b81ef104a9755109bf4ae7817 /kernel/cgroup.c
parent b481c2cb3534c85dca625973b33eba15f9af3e4c (diff)
parent 319e2e3f63c348a9b66db4667efa73178e18b17d (diff)
Merge branch 'master' into for-next
Sync with Linus' tree to be able to apply fixes on top of newer things in tree (efi-stub).

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- kernel/cgroup.c 337
1 files changed, 56 insertions, 281 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e0aeb32415ff..8b729c278b64 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,6 +60,7 @@
#include <linux/poll.h>
#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>
+#include <linux/file.h>
#include <linux/atomic.h>
@@ -89,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex);
static DEFINE_MUTEX(cgroup_root_mutex);
/*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
+
+/*
* Generate an array of cgroup subsystem pointers. At boot time, this is
* populated with the built in subsystems, and modular subsystems are
* registered after that. The mutable section of this array is protected by
@@ -124,38 +133,6 @@ struct cfent {
};
/*
- * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
- * cgroup_subsys->use_id != 0.
- */
-#define CSS_ID_MAX (65535)
-struct css_id {
- /*
- * The css to which this ID points. This pointer is set to valid value
- * after cgroup is populated. If cgroup is removed, this will be NULL.
- * This pointer is expected to be RCU-safe because destroy()
- * is called after synchronize_rcu(). But for safe use, css_tryget()
- * should be used for avoiding race.
- */
- struct cgroup_subsys_state __rcu *css;
- /*
- * ID of this css.
- */
- unsigned short id;
- /*
- * Depth in hierarchy which this ID belongs to.
- */
- unsigned short depth;
- /*
- * ID is freed by RCU. (and lookup routine is RCU safe.)
- */
- struct rcu_head rcu_head;
- /*
- * Hierarchy of CSS ID belongs to.
- */
- unsigned short stack[0]; /* Array of Length (depth+1) */
-};
-
-/*
* cgroup_event represents events which userspace want to receive.
*/
struct cgroup_event {
@@ -222,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+static int cgroup_file_release(struct inode *inode, struct file *file);
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -386,9 +364,6 @@ struct cgrp_cset_link {
static struct css_set init_css_set;
static struct cgrp_cset_link init_cgrp_cset_link;
-static int cgroup_init_idr(struct cgroup_subsys *ss,
- struct cgroup_subsys_state *css);
-
/*
* css_set_lock protects the list of css_set objects, and the chain of
* tasks off each css_set. Nests outside task->alloc_lock due to
@@ -840,8 +815,6 @@ static struct backing_dev_info cgroup_backing_dev_info = {
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
-static int alloc_css_id(struct cgroup_subsys_state *child_css);
-
static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
{
struct inode *inode = new_inode(sb);
@@ -907,7 +880,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
- schedule_work(&cgrp->destroy_work);
+ queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
}
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -931,11 +904,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
iput(inode);
}
-static int cgroup_delete(const struct dentry *d)
-{
- return 1;
-}
-
static void remove_dir(struct dentry *d)
{
struct dentry *parent = dget(d->d_parent);
@@ -1522,7 +1490,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
{
static const struct dentry_operations cgroup_dops = {
.d_iput = cgroup_diput,
- .d_delete = cgroup_delete,
+ .d_delete = always_delete_dentry,
};
struct inode *inode =
@@ -2038,7 +2006,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
/* @tsk either already exited or can't exit until the end */
if (tsk->flags & PF_EXITING)
- continue;
+ goto next;
/* as per above, nr_threads may decrease, but not increase. */
BUG_ON(i >= group_size);
@@ -2046,7 +2014,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
ent.cgrp = task_cgroup_from_root(tsk, root);
/* nothing to do if this task is already in the cgroup */
if (ent.cgrp == cgrp)
- continue;
+ goto next;
/*
* saying GFP_ATOMIC has no effect here because we did prealloc
* earlier, but it's good form to communicate our expectations.
@@ -2054,7 +2022,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
BUG_ON(retval != 0);
i++;
-
+ next:
if (!threadgroup)
break;
} while_each_thread(leader, tsk);
@@ -2462,7 +2430,7 @@ static const struct file_operations cgroup_seqfile_operations = {
.read = seq_read,
.write = cgroup_file_write,
.llseek = seq_lseek,
- .release = single_release,
+ .release = cgroup_file_release,
};
static int cgroup_file_open(struct inode *inode, struct file *file)
@@ -2523,6 +2491,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
ret = cft->release(inode, file);
if (css->ss)
css_put(css);
+ if (file->f_op == &cgroup_seqfile_operations)
+ single_release(inode, file);
return ret;
}
@@ -3187,11 +3157,9 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
WARN_ON_ONCE(!rcu_read_lock_held());
- /* if first iteration, visit the leftmost descendant */
- if (!pos) {
- next = css_leftmost_descendant(root);
- return next != root ? next : NULL;
- }
+ /* if first iteration, visit leftmost descendant which may be @root */
+ if (!pos)
+ return css_leftmost_descendant(root);
/* if we visited @root, we're done */
if (pos == root)
@@ -4034,8 +4002,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
struct cgroup_event *event;
struct cgroup_subsys_state *cfile_css;
unsigned int efd, cfd;
- struct file *efile;
- struct file *cfile;
+ struct fd efile;
+ struct fd cfile;
char *endp;
int ret;
@@ -4058,31 +4026,31 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
INIT_WORK(&event->remove, cgroup_event_remove);
- efile = eventfd_fget(efd);
- if (IS_ERR(efile)) {
- ret = PTR_ERR(efile);
+ efile = fdget(efd);
+ if (!efile.file) {
+ ret = -EBADF;
goto out_kfree;
}
- event->eventfd = eventfd_ctx_fileget(efile);
+ event->eventfd = eventfd_ctx_fileget(efile.file);
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
goto out_put_efile;
}
- cfile = fget(cfd);
- if (!cfile) {
+ cfile = fdget(cfd);
+ if (!cfile.file) {
ret = -EBADF;
goto out_put_eventfd;
}
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
- ret = inode_permission(file_inode(cfile), MAY_READ);
+ ret = inode_permission(file_inode(cfile.file), MAY_READ);
if (ret < 0)
goto out_put_cfile;
- event->cft = __file_cft(cfile);
+ event->cft = __file_cft(cfile.file);
if (IS_ERR(event->cft)) {
ret = PTR_ERR(event->cft);
goto out_put_cfile;
@@ -4103,7 +4071,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
ret = -EINVAL;
event->css = cgroup_css(cgrp, event->cft->ss);
- cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss);
+ cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
if (event->css && event->css == cfile_css && css_tryget(event->css))
ret = 0;
@@ -4121,25 +4089,25 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
if (ret)
goto out_put_css;
- efile->f_op->poll(efile, &event->pt);
+ efile.file->f_op->poll(efile.file, &event->pt);
spin_lock(&cgrp->event_list_lock);
list_add(&event->list, &cgrp->event_list);
spin_unlock(&cgrp->event_list_lock);
- fput(cfile);
- fput(efile);
+ fdput(cfile);
+ fdput(efile);
return 0;
out_put_css:
css_put(event->css);
out_put_cfile:
- fput(cfile);
+ fdput(cfile);
out_put_eventfd:
eventfd_ctx_put(event->eventfd);
out_put_efile:
- fput(efile);
+ fdput(efile);
out_kfree:
kfree(event);
@@ -4241,21 +4209,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
goto err;
}
}
-
- /* This cgroup is ready now */
- for_each_root_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
- struct css_id *id = rcu_dereference_protected(css->id, true);
-
- /*
- * Update id->css pointer and make this css visible from
- * CSS ID functions. This pointer will be dereferened
- * from RCU-read-side without locks.
- */
- if (id)
- rcu_assign_pointer(id->css, css);
- }
-
return 0;
err:
cgroup_clear_dir(cgrp, subsys_mask);
@@ -4307,7 +4260,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
* css_put(). dput() requires process context which we don't have.
*/
INIT_WORK(&css->destroy_work, css_free_work_fn);
- schedule_work(&css->destroy_work);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
}
static void css_release(struct percpu_ref *ref)
@@ -4324,7 +4277,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
css->cgroup = cgrp;
css->ss = ss;
css->flags = 0;
- css->id = NULL;
if (cgrp->parent)
css->parent = cgroup_css(cgrp->parent, ss);
@@ -4456,12 +4408,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
goto err_free_all;
init_css(css, ss, cgrp);
-
- if (ss->use_id) {
- err = alloc_css_id(css);
- if (err)
- goto err_free_all;
- }
}
/*
@@ -4604,7 +4550,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_killed_work_fn);
- schedule_work(&css->destroy_work);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
}
/**
@@ -4926,12 +4872,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
/* our new subsystem will be attached to the dummy hierarchy. */
init_css(css, ss, cgroup_dummy_top);
- /* init_idr must be after init_css() because it sets css->id. */
- if (ss->use_id) {
- ret = cgroup_init_idr(ss, css);
- if (ret)
- goto err_unload;
- }
/*
* Now we need to entangle the css into the existing css_sets. unlike
@@ -4997,9 +4937,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
offline_css(cgroup_css(cgroup_dummy_top, ss));
- if (ss->use_id)
- idr_destroy(&ss->idr);
-
/* deassign the subsys_id */
cgroup_subsys[ss->subsys_id] = NULL;
@@ -5026,8 +4963,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
/*
* remove subsystem's css from the cgroup_dummy_top and free it -
* need to free before marking as null because ss->css_free needs
- * the cgrp->subsys pointer to find their state. note that this
- * also takes care of freeing the css_id.
+ * the cgrp->subsys pointer to find their state.
*/
ss->css_free(cgroup_css(cgroup_dummy_top, ss));
RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
@@ -5098,8 +5034,6 @@ int __init cgroup_init(void)
for_each_builtin_subsys(ss, i) {
if (!ss->early_init)
cgroup_init_subsys(ss);
- if (ss->use_id)
- cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
}
/* allocate id for the dummy hierarchy */
@@ -5140,6 +5074,22 @@ out:
return err;
}
+static int __init cgroup_wq_init(void)
+{
+ /*
+ * There isn't much point in executing destruction path in
+ * parallel. Good chunk is serialized with cgroup_mutex anyway.
+ * Use 1 for @max_active.
+ *
+ * We would prefer to do this in cgroup_init() above, but that
+ * is called before init_workqueues(): so leave this until after.
+ */
+ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+ BUG_ON(!cgroup_destroy_wq);
+ return 0;
+}
+core_initcall(cgroup_wq_init);
+
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -5519,181 +5469,6 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
-/*
- * Functons for CSS ID.
- */
-
-/* to get ID other than 0, this should be called when !cgroup_is_dead() */
-unsigned short css_id(struct cgroup_subsys_state *css)
-{
- struct css_id *cssid;
-
- /*
- * This css_id() can return correct value when somone has refcnt
- * on this or this is under rcu_read_lock(). Once css->id is allocated,
- * it's unchanged until freed.
- */
- cssid = rcu_dereference_raw(css->id);
-
- if (cssid)
- return cssid->id;
- return 0;
-}
-EXPORT_SYMBOL_GPL(css_id);
-
-/**
- * css_is_ancestor - test "root" css is an ancestor of "child"
- * @child: the css to be tested.
- * @root: the css supporsed to be an ancestor of the child.
- *
- * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
- * this function reads css->id, the caller must hold rcu_read_lock().
- * But, considering usual usage, the csses should be valid objects after test.
- * Assuming that the caller will do some action to the child if this returns
- * returns true, the caller must take "child";s reference count.
- * If "child" is valid object and this returns true, "root" is valid, too.
- */
-
-bool css_is_ancestor(struct cgroup_subsys_state *child,
- const struct cgroup_subsys_state *root)
-{
- struct css_id *child_id;
- struct css_id *root_id;
-
- child_id = rcu_dereference(child->id);
- if (!child_id)
- return false;
- root_id = rcu_dereference(root->id);
- if (!root_id)
- return false;
- if (child_id->depth < root_id->depth)
- return false;
- if (child_id->stack[root_id->depth] != root_id->id)
- return false;
- return true;
-}
-
-void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
-{
- struct css_id *id = rcu_dereference_protected(css->id, true);
-
- /* When this is called before css_id initialization, id can be NULL */
- if (!id)
- return;
-
- BUG_ON(!ss->use_id);
-
- rcu_assign_pointer(id->css, NULL);
- rcu_assign_pointer(css->id, NULL);
- spin_lock(&ss->id_lock);
- idr_remove(&ss->idr, id->id);
- spin_unlock(&ss->id_lock);
- kfree_rcu(id, rcu_head);
-}
-EXPORT_SYMBOL_GPL(free_css_id);
-
-/*
- * This is called by init or create(). Then, calls to this function are
- * always serialized (By cgroup_mutex() at create()).
- */
-
-static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
-{
- struct css_id *newid;
- int ret, size;
-
- BUG_ON(!ss->use_id);
-
- size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
- newid = kzalloc(size, GFP_KERNEL);
- if (!newid)
- return ERR_PTR(-ENOMEM);
-
- idr_preload(GFP_KERNEL);
- spin_lock(&ss->id_lock);
- /* Don't use 0. allocates an ID of 1-65535 */
- ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
- spin_unlock(&ss->id_lock);
- idr_preload_end();
-
- /* Returns error when there are no free spaces for new ID.*/
- if (ret < 0)
- goto err_out;
-
- newid->id = ret;
- newid->depth = depth;
- return newid;
-err_out:
- kfree(newid);
- return ERR_PTR(ret);
-
-}
-
-static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
- struct cgroup_subsys_state *rootcss)
-{
- struct css_id *newid;
-
- spin_lock_init(&ss->id_lock);
- idr_init(&ss->idr);
-
- newid = get_new_cssid(ss, 0);
- if (IS_ERR(newid))
- return PTR_ERR(newid);
-
- newid->stack[0] = newid->id;
- RCU_INIT_POINTER(newid->css, rootcss);
- RCU_INIT_POINTER(rootcss->id, newid);
- return 0;
-}
-
-static int alloc_css_id(struct cgroup_subsys_state *child_css)
-{
- struct cgroup_subsys_state *parent_css = css_parent(child_css);
- struct css_id *child_id, *parent_id;
- int i, depth;
-
- parent_id = rcu_dereference_protected(parent_css->id, true);
- depth = parent_id->depth + 1;
-
- child_id = get_new_cssid(child_css->ss, depth);
- if (IS_ERR(child_id))
- return PTR_ERR(child_id);
-
- for (i = 0; i < depth; i++)
- child_id->stack[i] = parent_id->stack[i];
- child_id->stack[depth] = child_id->id;
- /*
- * child_id->css pointer will be set after this cgroup is available
- * see cgroup_populate_dir()
- */
- rcu_assign_pointer(child_css->id, child_id);
-
- return 0;
-}
-
-/**
- * css_lookup - lookup css by id
- * @ss: cgroup subsys to be looked into.
- * @id: the id
- *
- * Returns pointer to cgroup_subsys_state if there is valid one with id.
- * NULL if not. Should be called under rcu_read_lock()
- */
-struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
-{
- struct css_id *cssid = NULL;
-
- BUG_ON(!ss->use_id);
- cssid = idr_find(&ss->idr, id);
-
- if (unlikely(!cssid))
- return NULL;
-
- return rcu_dereference(cssid->css);
-}
-EXPORT_SYMBOL_GPL(css_lookup);
-
/**
* css_from_dir - get corresponding css from the dentry of a cgroup dir
* @dentry: directory dentry of interest