diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 1821 |
1 files changed, 1255 insertions, 566 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cd3d81961cc2..64a3b3db2484 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -59,6 +59,13 @@ #include <linux/delay.h> #include <linux/cpuset.h> #include <linux/atomic.h> +#include <linux/proc_ns.h> +#include <linux/nsproxy.h> +#include <linux/file.h> +#include <net/sock.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/cgroup.h> /* * pidlists linger the following amount before being destroyed. The goal @@ -177,10 +184,16 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ -static bool cgrp_dfl_root_visible; +static bool cgrp_dfl_visible; + +/* Controllers blocked by the commandline in v1 */ +static u16 cgroup_no_v1_mask; /* some controllers are not supported in the default hierarchy */ -static unsigned long cgrp_dfl_root_inhibit_ss_mask; +static u16 cgrp_dfl_inhibit_ss_mask; + +/* some controllers are implicitly enabled on the default hierarchy */ +static unsigned long cgrp_dfl_implicit_ss_mask; /* The list of hierarchy roots */ @@ -204,23 +217,34 @@ static u64 css_serial_nr_next = 1; * fork/exit handlers to call. This avoids us having to do extra work in the * fork/exit path to check which subsystems have fork/exit callbacks. */ -static unsigned long have_fork_callback __read_mostly; -static unsigned long have_exit_callback __read_mostly; -static unsigned long have_free_callback __read_mostly; +static u16 have_fork_callback __read_mostly; +static u16 have_exit_callback __read_mostly; +static u16 have_free_callback __read_mostly; + +/* cgroup namespace for init task */ +struct cgroup_namespace init_cgroup_ns = { + .count = { .counter = 2, }, + .user_ns = &init_user_ns, + .ns.ops = &cgroupns_operations, + .ns.inum = PROC_CGROUP_INIT_INO, + .root_cset = &init_css_set, +}; /* Ditto for the can_fork callback. */ -static unsigned long have_canfork_callback __read_mostly; +static u16 have_canfork_callback __read_mostly; static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask); +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); +static int cgroup_apply_control(struct cgroup *cgrp); +static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, - bool visible); +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, @@ -243,6 +267,11 @@ static bool cgroup_ssid_enabled(int ssid) return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } +static bool cgroup_ssid_no_v1(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest @@ -332,13 +361,30 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } -static struct cgroup *cgroup_parent(struct cgroup *cgrp) +/* subsystems visibly enabled on a cgroup */ +static u16 cgroup_control(struct cgroup *cgrp) { - struct cgroup_subsys_state *parent_css = cgrp->self.parent; + struct cgroup *parent = cgroup_parent(cgrp); + u16 root_ss_mask = cgrp->root->subsys_mask; - if (parent_css) - return container_of(parent_css, struct cgroup, self); - return NULL; + if (parent) + return parent->subtree_control; + + if (cgroup_on_dfl(cgrp)) + root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | + cgrp_dfl_implicit_ss_mask); + return root_ss_mask; +} + +/* subsystems enabled on a cgroup */ +static u16 cgroup_ss_mask(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + if (parent) + return parent->subtree_ss_mask; + + return cgrp->root->subsys_mask; } /** @@ -380,16 +426,15 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, if (!ss) return &cgrp->self; - if (!(cgrp->root->subsys_mask & (1 << ss->id))) - return NULL; - /* * This function is used while updating css associations and thus - * can't test the csses directly. Use ->child_subsys_mask. + * can't test the csses directly. Test ss_mask. */ - while (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) + while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { cgrp = cgroup_parent(cgrp); + if (!cgrp) + return NULL; + } return cgroup_css(cgrp, ss); } @@ -433,22 +478,6 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return !(cgrp->self.flags & CSS_ONLINE); } -static void cgroup_get(struct cgroup *cgrp) -{ - WARN_ON_ONCE(cgroup_is_dead(cgrp)); - css_get(&cgrp->self); -} - -static bool cgroup_tryget(struct cgroup *cgrp) -{ - return css_tryget(&cgrp->self); -} - -static void cgroup_put(struct cgroup *cgrp) -{ - css_put(&cgrp->self); -} - struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -469,25 +498,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); -/** - * cgroup_is_descendant - test ancestry - * @cgrp: the cgroup to be tested - * @ancestor: possible ancestor of @cgrp - * - * Test whether @cgrp is a descendant of @ancestor. It also returns %true - * if @cgrp == @ancestor. This function is safe to call as long as @cgrp - * and @ancestor are accessible. - */ -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) -{ - while (cgrp) { - if (cgrp == ancestor) - return true; - cgrp = cgroup_parent(cgrp); - } - return false; -} - static int notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -532,22 +542,28 @@ static int notify_on_release(const struct cgroup *cgrp) (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) /** - * for_each_subsys_which - filter for_each_subsys with a bitmask + * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - * @ss_maskp: a pointer to the bitmask + * @ss_mask: the bitmask * * The block will only run for cases where the ssid-th bit (1 << ssid) of - * mask is set to 1. + * @ss_mask is set. */ -#define for_each_subsys_which(ss, ssid, ss_maskp) \ - if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \ +#define do_each_subsys_mask(ss, ssid, ss_mask) do { \ + unsigned long __ss_mask = (ss_mask); \ + if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \ (ssid) = 0; \ - else \ - for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \ - if (((ss) = cgroup_subsys[ssid]) && false) \ - break; \ - else + break; \ + } \ + for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ + (ss) = cgroup_subsys[ssid]; \ + { + +#define while_each_subsys_mask() \ + } \ + } \ +} while (false) /* iterate across the hierarchies */ #define for_each_root(root) \ @@ -561,6 +577,24 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else +/* walk live descendants in preorder */ +#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ + css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + +/* walk live descendants in postorder */ +#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ + css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + static void cgroup_release_agent(struct work_struct *work); static void check_for_release(struct cgroup *cgrp); @@ -691,6 +725,9 @@ static void css_set_move_task(struct task_struct *task, { lockdep_assert_held(&css_set_lock); + if (to_cset && !css_set_populated(to_cset)) + css_set_update_populated(to_cset, true); + if (from_cset) { struct css_task_iter *it, *pos; @@ -724,8 +761,6 @@ static void css_set_move_task(struct task_struct *task, */ WARN_ON_ONCE(task->flags & PF_EXITING); - if (!css_set_populated(to_cset)) - css_set_update_populated(to_cset, true); rcu_assign_pointer(task->cgroups, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); @@ -1108,18 +1143,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); - if (root->hierarchy_id) { - idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); - root->hierarchy_id = 0; - } + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } static void cgroup_free_root(struct cgroup_root *root) { if (root) { - /* hierarchy ID should already have been released */ - WARN_ON_ONCE(root->hierarchy_id); - idr_destroy(&root->cgroup_idr); kfree(root); } @@ -1130,13 +1159,15 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; - mutex_lock(&cgroup_mutex); + trace_cgroup_destroy_root(root); + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); + WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's @@ -1165,6 +1196,41 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_free_root(root); } +/* + * look up cgroup associated with current task's cgroup namespace on the + * specified hierarchy + */ +static struct cgroup * +current_cgns_cgroup_from_root(struct cgroup_root *root) +{ + struct cgroup *res = NULL; + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + rcu_read_lock(); + + cset = current->nsproxy->cgroup_ns->root_cset; + if (cset == &init_css_set) { + res = &root->cgrp; + } else { + struct cgrp_cset_link *link; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + if (c->root == root) { + res = c; + break; + } + } + } + rcu_read_unlock(); + + BUG_ON(!res); + return res; +} + /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) @@ -1276,46 +1342,40 @@ static umode_t cgroup_file_mode(const struct cftype *cft) } /** - * cgroup_calc_child_subsys_mask - calculate child_subsys_mask - * @cgrp: the target cgroup + * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask * @subtree_control: the new subtree_control mask to consider + * @this_ss_mask: available subsystems * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * * This function calculates which subsystems need to be enabled if - * @subtree_control is to be applied to @cgrp. The returned mask is always - * a superset of @subtree_control and follows the usual hierarchy rules. + * @subtree_control is to be applied while restricted to @this_ss_mask. */ -static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, - unsigned long subtree_control) +static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) { - struct cgroup *parent = cgroup_parent(cgrp); - unsigned long cur_ss_mask = subtree_control; + u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); - if (!cgroup_on_dfl(cgrp)) - return cur_ss_mask; + cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { - unsigned long new_ss_mask = cur_ss_mask; + u16 new_ss_mask = cur_ss_mask; - for_each_subsys_which(ss, ssid, &cur_ss_mask) + do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; + } while_each_subsys_mask(); /* * Mask out subsystems which aren't available. This can * happen only if some depended-upon subsystems were bound * to non-default hierarchies. */ - if (parent) - new_ss_mask &= parent->child_subsys_mask; - else - new_ss_mask &= cgrp->root->subsys_mask; + new_ss_mask &= this_ss_mask; if (new_ss_mask == cur_ss_mask) break; @@ -1326,19 +1386,6 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, } /** - * cgroup_refresh_child_subsys_mask - update child_subsys_mask - * @cgrp: the target cgroup - * - * Update @cgrp->child_subsys_mask according to the current - * @cgrp->subtree_control using cgroup_calc_child_subsys_mask(). - */ -static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) -{ - cgrp->child_subsys_mask = - cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control); -} - -/** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @@ -1366,19 +1413,22 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) /** * cgroup_kn_lock_live - locking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced + * @drain_offline: perform offline draining on the cgroup * * This helper is to be used by a cgroup kernfs method currently servicing * @kn. It breaks the active protection, performs cgroup locking and * verifies that the associated cgroup is alive. Returns the cgroup if * alive; otherwise, %NULL. A successful return should be undone by a - * matching cgroup_kn_unlock() invocation. + * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the + * cgroup is drained of offlining csses before return. * * Any cgroup kernfs method implementation which requires locking the * associated cgroup should use this helper. It avoids nesting cgroup * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ -static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, + bool drain_offline) { struct cgroup *cgrp; @@ -1397,7 +1447,10 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) return NULL; kernfs_break_active_protection(kn); - mutex_lock(&cgroup_mutex); + if (drain_offline) + cgroup_lock_and_drain_offline(cgrp); + else + mutex_lock(&cgroup_mutex); if (!cgroup_is_dead(cgrp)) return cgrp; @@ -1427,14 +1480,17 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) /** * css_clear_dir - remove subsys files in a cgroup directory * @css: taget css - * @cgrp_override: specify if target cgroup is different from css->cgroup */ -static void css_clear_dir(struct cgroup_subsys_state *css, - struct cgroup *cgrp_override) +static void css_clear_dir(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cftype *cfts; + if (!(css->flags & CSS_VISIBLE)) + return; + + css->flags &= ~CSS_VISIBLE; + list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); } @@ -1442,17 +1498,18 @@ static void css_clear_dir(struct cgroup_subsys_state *css, /** * css_populate_dir - create subsys files in a cgroup directory * @css: target css - * @cgrp_overried: specify if target cgroup is different from css->cgroup * * On failure, no file is added. */ -static int css_populate_dir(struct cgroup_subsys_state *css, - struct cgroup *cgrp_override) +static int css_populate_dir(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cftype *cfts, *failed_cfts; int ret; + if ((css->flags & CSS_VISIBLE) || !cgrp->kn) + return 0; + if (!css->ss) { if (cgroup_on_dfl(cgrp)) cfts = cgroup_dfl_base_files; @@ -1469,6 +1526,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css, goto err; } } + + css->flags |= CSS_VISIBLE; + return 0; err: list_for_each_entry(cfts, &css->ss->cfts, node) { @@ -1479,67 +1539,51 @@ err: return ret; } -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - unsigned long tmp_ss_mask; int ssid, i, ret; + u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); - for_each_subsys_which(ss, ssid, &ss_mask) { - /* if @ss has non-root csses attached to it, can't move */ - if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) + do_each_subsys_mask(ss, ssid, ss_mask) { + /* + * If @ss has non-root csses attached to it, can't move. + * If @ss is an implicit controller, it is exempt from this + * rule and can be stolen. + */ + if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && + !ss->implicit_on_dfl) return -EBUSY; /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; - } - /* skip creating root files on dfl_root for inhibited subsystems */ - tmp_ss_mask = ss_mask; - if (dst_root == &cgrp_dfl_root) - tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; + /* + * Collect ssid's that need to be disabled from default + * hierarchy. + */ + if (ss->root == &cgrp_dfl_root) + dfl_disable_ss_mask |= 1 << ssid; - for_each_subsys_which(ss, ssid, &tmp_ss_mask) { - struct cgroup *scgrp = &ss->root->cgrp; - int tssid; + } while_each_subsys_mask(); - ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp); - if (!ret) - continue; + if (dfl_disable_ss_mask) { + struct cgroup *scgrp = &cgrp_dfl_root.cgrp; /* - * Rebinding back to the default root is not allowed to - * fail. Using both default and non-default roots should - * be rare. Moving subsystems back and forth even more so. - * Just warn about it and continue. + * Controllers from default hierarchy that need to be rebound + * are all disabled together in one go. */ - if (dst_root == &cgrp_dfl_root) { - if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", - ret, ss_mask); - pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); - } - continue; - } - - for_each_subsys_which(ss, tssid, &tmp_ss_mask) { - if (tssid == ssid) - break; - css_clear_dir(cgroup_css(scgrp, ss), dcgrp); - } - return ret; + cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); } - /* - * Nothing can fail from this point on. Remove files for the - * removed subsystems and rebind each subsystem. - */ - for_each_subsys_which(ss, ssid, &ss_mask) { + do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); @@ -1547,8 +1591,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, WARN_ON(!css || cgroup_css(dcgrp, ss)); - css_clear_dir(css, NULL); + if (src_root != &cgrp_dfl_root) { + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } + /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; @@ -1560,28 +1610,55 @@ static int rebind_subsystems(struct cgroup_root *dst_root, &dcgrp->e_csets[ss->id]); spin_unlock_irq(&css_set_lock); - src_root->subsys_mask &= ~(1 << ssid); - scgrp->subtree_control &= ~(1 << ssid); - cgroup_refresh_child_subsys_mask(scgrp); - /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; - cgroup_refresh_child_subsys_mask(dcgrp); static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } + ret = cgroup_apply_control(dcgrp); + if (ret) + pr_warn("partial failure to rebind %s controller (err=%d)\n", + ss->name, ret); + if (ss->bind) ss->bind(css); - } + } while_each_subsys_mask(); kernfs_activate(dcgrp->kn); return 0; } +static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root) +{ + int len = 0; + char *buf = NULL; + struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); + struct cgroup *ns_cgroup; + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + spin_lock_bh(&css_set_lock); + ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); + len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); + spin_unlock_bh(&css_set_lock); + + if (len >= PATH_MAX) + len = -ERANGE; + else if (len > 0) { + seq_escape(sf, buf, " \t\n\\"); + len = 0; + } + kfree(buf); + return len; +} + static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { @@ -1612,7 +1689,7 @@ static int cgroup_show_options(struct seq_file *seq, } struct cgroup_sb_opts { - unsigned long subsys_mask; + u16 subsys_mask; unsigned int flags; char *release_agent; bool cpuset_clone_children; @@ -1625,13 +1702,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; - unsigned long mask = -1UL; + u16 mask = U16_MAX; struct cgroup_subsys *ss; int nr_opts = 0; int i; #ifdef CONFIG_CPUSETS - mask = ~(1U << cpuset_cgrp_id); + mask = ~((u16)1 << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1706,6 +1783,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; if (!cgroup_ssid_enabled(i)) continue; + if (cgroup_ssid_no_v1(i)) + continue; /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) @@ -1726,7 +1805,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) */ if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i)) + if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) opts->subsys_mask |= (1 << i); /* @@ -1756,14 +1835,14 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; - unsigned long added_mask, removed_mask; + u16 added_mask, removed_mask; if (root == &cgrp_dfl_root) { pr_err("remount is not allowed\n"); return -EINVAL; } - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); @@ -1796,13 +1875,16 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - rebind_subsystems(&cgrp_dfl_root, removed_mask); + WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); if (opts.release_agent) { spin_lock(&release_agent_path_lock); strcpy(root->release_agent_path, opts.release_agent); spin_unlock(&release_agent_path_lock); } + + trace_cgroup_remount(root); + out_unlock: kfree(opts.release_agent); kfree(opts.name); @@ -1908,7 +1990,7 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -1921,6 +2003,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) if (ret < 0) goto out; root_cgrp->id = ret; + root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); @@ -1930,10 +2013,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding - * cgroup_lock, and that's us. The worst that can happen is that we - * have some link structures left over + * cgroup_lock, and that's us. Later rebinding may disable + * controllers on the default hierarchy and thus create new csets, + * which can't be more than the existing ones. Allocate 2x. */ - ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); + ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref; @@ -1950,7 +2034,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) } root_cgrp->kn = root->kf_root->kn; - ret = css_populate_dir(&root_cgrp->self, NULL); + ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root; @@ -1958,6 +2042,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) if (ret) goto destroy_root; + ret = cgroup_bpf_inherit(root_cgrp); + WARN_ON_ONCE(ret); + + trace_cgroup_setup_root(root); + /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in @@ -2003,6 +2092,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, { bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup_subsys *ss; struct cgroup_root *root = NULL; struct cgroup_sb_opts opts; @@ -2011,6 +2101,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int i; bool new_sb; + get_cgroup_ns(ns); + + /* Check if the caller has permission to mount. */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { + put_cgroup_ns(ns); + return ERR_PTR(-EPERM); + } + /* * The first time anyone tries to mount a cgroup, enable the list * linking each css_set to its tasks and fix up all existing tasks. @@ -2021,15 +2119,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (is_v2) { if (data) { pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + put_cgroup_ns(ns); return ERR_PTR(-EINVAL); } - cgrp_dfl_root_visible = true; + cgrp_dfl_visible = true; root = &cgrp_dfl_root; cgroup_get(&root->cgrp); goto out_mount; } - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); @@ -2126,6 +2225,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } + /* Hierarchies may only be created in the initial cgroup namespace. */ + if (ns != &init_cgroup_ns) { + ret = -EPERM; + goto out_unlock; + } + root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) { ret = -ENOMEM; @@ -2144,12 +2249,37 @@ out_free: kfree(opts.release_agent); kfree(opts.name); - if (ret) + if (ret) { + put_cgroup_ns(ns); return ERR_PTR(ret); + } out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, &new_sb); + + /* + * In non-init cgroup namespace, instead of root cgroup's + * dentry, we return the dentry corresponding to the + * cgroupns->root_cgrp. + */ + if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + struct dentry *nsdentry; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + cgrp = cset_cgroup_from_root(ns->root_cset, root); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); + dput(dentry); + dentry = nsdentry; + } + if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2162,6 +2292,7 @@ out_mount: deactivate_super(pinned_sb); } + put_cgroup_ns(ns); return dentry; } @@ -2190,14 +2321,41 @@ static struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { .name = "cgroup2", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; +static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); + + return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); +} + +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + int ret; + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(cgroup_path_ns); + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -2211,12 +2369,12 @@ static struct file_system_type cgroup2_fs_type = { * * Return value is the same as kernfs_path(). */ -char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroup_root *root; struct cgroup *cgrp; int hierarchy_id = 1; - char *path = NULL; + int ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); @@ -2225,16 +2383,15 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - path = cgroup_path(cgrp, buf, buflen); + ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ - if (strlcpy(buf, "/", buflen) < buflen) - path = buf; + ret = strlcpy(buf, "/", buflen); } spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); - return path; + return ret; } EXPORT_SYMBOL_GPL(task_cgroup_path); @@ -2369,38 +2526,38 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, } /** - * cgroup_taskset_migrate - migrate a taskset to a cgroup + * cgroup_taskset_migrate - migrate a taskset * @tset: taget taskset - * @dst_cgrp: destination cgroup + * @root: cgroup root the migration is taking place on * - * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the - * ->can_attach callbacks fails and guarantees that either all or none of - * the tasks in @tset are migrated. @tset is consumed regardless of - * success. + * Migrate tasks in @tset as setup by migration preparation functions. + * This function fails iff one of the ->can_attach callbacks fails and + * guarantees that either all or none of the tasks in @tset are migrated. + * @tset is consumed regardless of success. */ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, - struct cgroup *dst_cgrp) + struct cgroup_root *root) { - struct cgroup_subsys_state *css, *failed_css = NULL; + struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; - int i, ret; + int ssid, failed_ssid, ret; /* methods shouldn't be called if no task is actually migrating */ if (list_empty(&tset->src_csets)) return 0; /* check that we can legitimately attach to the cgroup */ - for_each_e_css(css, i, dst_cgrp) { - if (css->ss->can_attach) { - tset->ssid = i; - ret = css->ss->can_attach(tset); + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->can_attach) { + tset->ssid = ssid; + ret = ss->can_attach(tset); if (ret) { - failed_css = css; + failed_ssid = ssid; goto out_cancel_attach; } } - } + } while_each_subsys_mask(); /* * Now that we're guaranteed success, proceed to move all tasks to @@ -2427,25 +2584,25 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ tset->csets = &tset->dst_csets; - for_each_e_css(css, i, dst_cgrp) { - if (css->ss->attach) { - tset->ssid = i; - css->ss->attach(tset); + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->attach) { + tset->ssid = ssid; + ss->attach(tset); } - } + } while_each_subsys_mask(); ret = 0; goto out_release_tset; out_cancel_attach: - for_each_e_css(css, i, dst_cgrp) { - if (css == failed_css) + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ssid == failed_ssid) break; - if (css->ss->cancel_attach) { - tset->ssid = i; - css->ss->cancel_attach(tset); + if (ss->cancel_attach) { + tset->ssid = ssid; + ss->cancel_attach(tset); } - } + } while_each_subsys_mask(); out_release_tset: spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); @@ -2458,6 +2615,20 @@ out_release_tset: } /** + * cgroup_may_migrate_to - verify whether a cgroup can be migration destination + * @dst_cgrp: destination cgroup to test + * + * On the default hierarchy, except for the root, subtree_control must be + * zero for migration destination cgroups with tasks so that child cgroups + * don't compete against tasks. + */ +static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +{ + return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || + !dst_cgrp->subtree_control; +} + +/** * cgroup_migrate_finish - cleanup after attach * @preloaded_csets: list of preloaded css_sets * @@ -2473,6 +2644,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) spin_lock_irq(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { cset->mg_src_cgrp = NULL; + cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); @@ -2519,52 +2691,42 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, return; WARN_ON(src_cset->mg_src_cgrp); + WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); WARN_ON(!list_empty(&src_cset->mg_node)); src_cset->mg_src_cgrp = src_cgrp; + src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); list_add(&src_cset->mg_preload_node, preloaded_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @dst_cgrp: the destination cgroup (may be %NULL) * @preloaded_csets: list of preloaded source css_sets * - * Tasks are about to be moved to @dst_cgrp and all the source css_sets - * have been preloaded to @preloaded_csets. This function looks up and - * pins all destination css_sets, links each to its source, and append them - * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each - * source css_set is assumed to be its cgroup on the default hierarchy. + * Tasks are about to be moved and all the source css_sets have been + * preloaded to @preloaded_csets. This function looks up and pins all + * destination css_sets, links each to its source, and append them to + * @preloaded_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @preloaded_csets. */ -static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, - struct list_head *preloaded_csets) +static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) { LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); - /* - * Except for the root, child_subsys_mask must be zero for a cgroup - * with tasks so that child cgroups don't compete against tasks. - */ - if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && - dst_cgrp->child_subsys_mask) - return -EBUSY; - /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; - dst_cset = find_css_set(src_cset, - dst_cgrp ?: src_cset->dfl_cgrp); + dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) goto err; @@ -2577,6 +2739,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, */ if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; + src_cset->mg_dst_cgrp = NULL; list_del_init(&src_cset->mg_preload_node); put_css_set(src_cset); put_css_set(dst_cset); @@ -2602,11 +2765,11 @@ err: * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task - * @cgrp: the destination cgroup + * @root: cgroup root migration is taking place on * - * Migrate a process or task denoted by @leader to @cgrp. If migrating a - * process, the caller must be holding cgroup_threadgroup_rwsem. The - * caller is also responsible for invoking cgroup_migrate_add_src() and + * Migrate a process or task denoted by @leader. If migrating a process, + * the caller must be holding cgroup_threadgroup_rwsem. The caller is also + * responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). * @@ -2617,7 +2780,7 @@ err: * actually starting migrating. */ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup *cgrp) + struct cgroup_root *root) { struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; @@ -2638,7 +2801,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_unlock(); spin_unlock_irq(&css_set_lock); - return cgroup_taskset_migrate(&tset, cgrp); + return cgroup_taskset_migrate(&tset, root); } /** @@ -2656,6 +2819,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *task; int ret; + if (!cgroup_may_migrate_to(dst_cgrp)) + return -EBUSY; + /* look up all src csets */ spin_lock_irq(&css_set_lock); rcu_read_lock(); @@ -2670,11 +2836,15 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ - ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (!ret) - ret = cgroup_migrate(leader, threadgroup, dst_cgrp); + ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); cgroup_migrate_finish(&preloaded_csets); + + if (!ret) + trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); + return ret; } @@ -2737,7 +2907,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -2840,7 +3010,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); @@ -2868,38 +3038,28 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask) +static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; bool printed = false; int ssid; - for_each_subsys_which(ss, ssid, &ss_mask) { + do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); seq_printf(seq, "%s", ss->name); printed = true; - } + } while_each_subsys_mask(); if (printed) seq_putc(seq, '\n'); } -/* show controllers which are currently attached to the default hierarchy */ -static int cgroup_root_controllers_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & - ~cgrp_dfl_root_inhibit_ss_mask); - return 0; -} - /* show controllers which are enabled from the parent */ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control); + cgroup_print_ss_mask(seq, cgroup_control(cgrp)); return 0; } @@ -2916,16 +3076,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * - * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) - * css associations need to be updated accordingly. This function looks up - * all css_sets which are attached to the subtree, creates the matching - * updated css_sets and migrates the tasks to the new ones. + * @cgrp's control masks have changed and its subtree's css associations + * need to be updated accordingly. This function looks up all css_sets + * which are attached to the subtree, creates the matching updated css_sets + * and migrates the tasks to the new ones. */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { LIST_HEAD(preloaded_csets); struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); - struct cgroup_subsys_state *css; + struct cgroup_subsys_state *d_css; + struct cgroup *dsct; struct css_set *src_cset; int ret; @@ -2935,21 +3096,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); - css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; - /* self is not affected by child_subsys_mask change */ - if (css->cgroup == cgrp) - continue; - - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, cgrp, + list_for_each_entry(link, &dsct->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, dsct, &preloaded_csets); } spin_unlock_irq(&css_set_lock); /* NULL dst indicates self on default hierarchy */ - ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_finish; @@ -2967,20 +3124,272 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) } spin_unlock_irq(&css_set_lock); - ret = cgroup_taskset_migrate(&tset, cgrp); + ret = cgroup_taskset_migrate(&tset, cgrp->root); out_finish: cgroup_migrate_finish(&preloaded_csets); percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } +/** + * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses + * @cgrp: root of the target subtree + * + * Because css offlining is asynchronous, userland may try to re-enable a + * controller while the previous css is still around. This function grabs + * cgroup_mutex and drains the previous css instances of @cgrp's subtree. + */ +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) + __acquires(&cgroup_mutex) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + struct cgroup_subsys *ss; + int ssid; + +restart: + mutex_lock(&cgroup_mutex); + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + DEFINE_WAIT(wait); + + if (!css || !percpu_ref_is_dying(&css->refcnt)) + continue; + + cgroup_get(dsct); + prepare_to_wait(&dsct->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&dsct->offline_waitq, &wait); + + cgroup_put(dsct); + goto restart; + } + } +} + +/** + * cgroup_save_control - save control masks of a subtree + * @cgrp: root of the target subtree + * + * Save ->subtree_control and ->subtree_ss_mask to the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. + */ +static void cgroup_save_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + dsct->old_subtree_control = dsct->subtree_control; + dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; + } +} + +/** + * cgroup_propagate_control - refresh control masks of a subtree + * @cgrp: root of the target subtree + * + * For @cgrp and its subtree, ensure ->subtree_ss_mask matches + * ->subtree_control and propagate controller availability through the + * subtree so that descendants don't have unavailable controllers enabled. + */ +static void cgroup_propagate_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + dsct->subtree_control &= cgroup_control(dsct); + dsct->subtree_ss_mask = + cgroup_calc_subtree_ss_mask(dsct->subtree_control, + cgroup_ss_mask(dsct)); + } +} + +/** + * cgroup_restore_control - restore control masks of a subtree + * @cgrp: root of the target subtree + * + * Restore ->subtree_control and ->subtree_ss_mask from the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. + */ +static void cgroup_restore_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + dsct->subtree_control = dsct->old_subtree_control; + dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; + } +} + +static bool css_visible(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; + + if (cgroup_control(cgrp) & (1 << ss->id)) + return true; + if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) + return false; + return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; +} + +/** + * cgroup_apply_control_enable - enable or show csses according to control + * @cgrp: root of the target subtree + * + * Walk @cgrp's subtree and create new csses or make the existing ones + * visible. A css is created invisible if it's being implicitly enabled + * through dependency. An invisible css is made visible when the userland + * explicitly enables it. + * + * Returns 0 on success, -errno on failure. On failure, csses which have + * been processed already aren't cleaned up. The caller is responsible for + * cleaning up with cgroup_apply_control_disble(). + */ +static int cgroup_apply_control_enable(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + struct cgroup_subsys *ss; + int ssid, ret; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + + if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) + continue; + + if (!css) { + css = css_create(dsct, ss); + if (IS_ERR(css)) + return PTR_ERR(css); + } + + if (css_visible(css)) { + ret = css_populate_dir(css); + if (ret) + return ret; + } + } + } + + return 0; +} + +/** + * cgroup_apply_control_disable - kill or hide csses according to control + * @cgrp: root of the target subtree + * + * Walk @cgrp's subtree and kill and hide csses so that they match + * cgroup_ss_mask() and cgroup_visible_mask(). + * + * A css is hidden when the userland requests it to be disabled while other + * subsystems are still depending on it. The css must not actively control + * resources and be in the vanilla state if it's made visible again later. + * Controllers which may be depended upon should provide ->css_reset() for + * this purpose. + */ +static void cgroup_apply_control_disable(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + struct cgroup_subsys *ss; + int ssid; + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + + if (!css) + continue; + + if (css->parent && + !(cgroup_ss_mask(dsct) & (1 << ss->id))) { + kill_css(css); + } else if (!css_visible(css)) { + css_clear_dir(css); + if (ss->css_reset) + ss->css_reset(css); + } + } + } +} + +/** + * cgroup_apply_control - apply control mask updates to the subtree + * @cgrp: root of the target subtree + * + * subsystems can be enabled and disabled in a subtree using the following + * steps. + * + * 1. Call cgroup_save_control() to stash the current state. + * 2. Update ->subtree_control masks in the subtree as desired. + * 3. Call cgroup_apply_control() to apply the changes. + * 4. Optionally perform other related operations. + * 5. Call cgroup_finalize_control() to finish up. + * + * This function implements step 3 and propagates the mask changes + * throughout @cgrp's subtree, updates csses accordingly and perform + * process migrations. + */ +static int cgroup_apply_control(struct cgroup *cgrp) +{ + int ret; + + cgroup_propagate_control(cgrp); + + ret = cgroup_apply_control_enable(cgrp); + if (ret) + return ret; + + /* + * At this point, cgroup_e_css() results reflect the new csses + * making the following cgroup_update_dfl_csses() properly update + * css associations of all tasks in the subtree. + */ + ret = cgroup_update_dfl_csses(cgrp); + if (ret) + return ret; + + return 0; +} + +/** + * cgroup_finalize_control - finalize control mask update + * @cgrp: root of the target subtree + * @ret: the result of the update + * + * Finalize control mask update. See cgroup_apply_control() for more info. + */ +static void cgroup_finalize_control(struct cgroup *cgrp, int ret) +{ + if (ret) { + cgroup_restore_control(cgrp); + cgroup_propagate_control(cgrp); + } + + cgroup_apply_control_disable(cgrp); +} + /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - unsigned long enable = 0, disable = 0; - unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; + u16 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -2992,11 +3401,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { - unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask; - if (tok[0] == '\0') continue; - for_each_subsys_which(ss, ssid, &tmp_ss_mask) { + do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; @@ -3011,12 +3418,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return -EINVAL; } break; - } + } while_each_subsys_mask(); if (ssid == CGROUP_SUBSYS_COUNT) return -EINVAL; } - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENODEV; @@ -3027,10 +3434,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, continue; } - /* unavailable or not enabled on the parent? */ - if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) { + if (!(cgroup_control(cgrp) & (1 << ssid))) { ret = -ENOENT; goto out_unlock; } @@ -3059,155 +3463,45 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * Except for the root, subtree_control must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ - if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { - ret = -EBUSY; - goto out_unlock; - } - - /* - * Update subsys masks and calculate what needs to be done. More - * subsystems than specified may need to be enabled or disabled - * depending on subsystem dependencies. - */ - old_sc = cgrp->subtree_control; - old_ss = cgrp->child_subsys_mask; - new_sc = (old_sc | enable) & ~disable; - new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc); - - css_enable = ~old_ss & new_ss; - css_disable = old_ss & ~new_ss; - enable |= css_enable; - disable |= css_disable; - - /* - * Because css offlining is asynchronous, userland might try to - * re-enable the same controller while the previous instance is - * still around. In such cases, wait till it's gone using - * offline_waitq. - */ - for_each_subsys_which(ss, ssid, &css_enable) { - cgroup_for_each_live_child(child, cgrp) { - DEFINE_WAIT(wait); - - if (!cgroup_css(child, ss)) - continue; + if (enable && cgroup_parent(cgrp)) { + struct cgrp_cset_link *link; - cgroup_get(child); - prepare_to_wait(&child->offline_waitq, &wait, - TASK_UNINTERRUPTIBLE); - cgroup_kn_unlock(of->kn); - schedule(); - finish_wait(&child->offline_waitq, &wait); - cgroup_put(child); + /* + * Because namespaces pin csets too, @cgrp->cset_links + * might not be empty even when @cgrp is empty. Walk and + * verify each cset. + */ + spin_lock_irq(&css_set_lock); - return restart_syscall(); + ret = 0; + list_for_each_entry(link, &cgrp->cset_links, cset_link) { + if (css_set_populated(link->cset)) { + ret = -EBUSY; + break; + } } - } - - cgrp->subtree_control = new_sc; - cgrp->child_subsys_mask = new_ss; - /* - * Create new csses or make the existing ones visible. A css is - * created invisible if it's being implicitly enabled through - * dependency. An invisible css is made visible when the userland - * explicitly enables it. - */ - for_each_subsys(ss, ssid) { - if (!(enable & (1 << ssid))) - continue; + spin_unlock_irq(&css_set_lock); - cgroup_for_each_live_child(child, cgrp) { - if (css_enable & (1 << ssid)) - ret = create_css(child, ss, - cgrp->subtree_control & (1 << ssid)); - else - ret = css_populate_dir(cgroup_css(child, ss), - NULL); - if (ret) - goto err_undo_css; - } + if (ret) + goto out_unlock; } - /* - * At this point, cgroup_e_css() results reflect the new csses - * making the following cgroup_update_dfl_csses() properly update - * css associations of all tasks in the subtree. - */ - ret = cgroup_update_dfl_csses(cgrp); - if (ret) - goto err_undo_css; - - /* - * All tasks are migrated out of disabled csses. Kill or hide - * them. A css is hidden when the userland requests it to be - * disabled while other subsystems are still depending on it. The - * css must not actively control resources and be in the vanilla - * state if it's made visible again later. Controllers which may - * be depended upon should provide ->css_reset() for this purpose. - */ - for_each_subsys(ss, ssid) { - if (!(disable & (1 << ssid))) - continue; - - cgroup_for_each_live_child(child, cgrp) { - struct cgroup_subsys_state *css = cgroup_css(child, ss); - - if (css_disable & (1 << ssid)) { - kill_css(css); - } else { - css_clear_dir(css, NULL); - if (ss->css_reset) - ss->css_reset(css); - } - } - } + /* save and update control masks and prepare csses */ + cgroup_save_control(cgrp); - /* - * The effective csses of all the descendants (excluding @cgrp) may - * have changed. Subsystems can optionally subscribe to this event - * by implementing ->css_e_css_changed() which is invoked if any of - * the effective csses seen from the css's cgroup may have changed. - */ - for_each_subsys(ss, ssid) { - struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss); - struct cgroup_subsys_state *css; + cgrp->subtree_control |= enable; + cgrp->subtree_control &= ~disable; - if (!ss->css_e_css_changed || !this_css) - continue; + ret = cgroup_apply_control(cgrp); - css_for_each_descendant_pre(css, this_css) - if (css != this_css) - ss->css_e_css_changed(css); - } + cgroup_finalize_control(cgrp, ret); kernfs_activate(cgrp->kn); ret = 0; out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; - -err_undo_css: - cgrp->subtree_control = old_sc; - cgrp->child_subsys_mask = old_ss; - - for_each_subsys(ss, ssid) { - if (!(enable & (1 << ssid))) - continue; - - cgroup_for_each_live_child(child, cgrp) { - struct cgroup_subsys_state *css = cgroup_css(child, ss); - - if (!css) - continue; - - if (css_enable & (1 << ssid)) - kill_css(css); - else - css_clear_dir(css, NULL); - } - } - goto out_unlock; } static int cgroup_events_show(struct seq_file *seq, void *v) @@ -3217,6 +3511,23 @@ static int cgroup_events_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_file_open(struct kernfs_open_file *of) +{ + struct cftype *cft = of->kn->priv; + + if (cft->open) + return cft->open(of); + return 0; +} + +static void cgroup_file_release(struct kernfs_open_file *of) +{ + struct cftype *cft = of->kn->priv; + + if (cft->release) + cft->release(of); +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -3255,6 +3566,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +static unsigned int cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) +{ + struct cftype *cft = of->kn->priv; + + if (cft->poll) + return cft->poll(of, pt); + + return kernfs_generic_poll(of, pt); +} + static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); @@ -3267,7 +3588,8 @@ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { - seq_cft(seq)->seq_stop(seq, v); + if (seq_cft(seq)->seq_stop) + seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) @@ -3289,13 +3611,19 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) static struct kernfs_ops cgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, + .open = cgroup_file_open, + .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; static struct kernfs_ops cgroup_kf_ops = { .atomic_write_len = PAGE_SIZE, + .open = cgroup_file_open, + .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, @@ -3311,6 +3639,10 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, struct cgroup *cgrp = kn->priv; int ret; + /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ + if (strchr(new_name_str, '\n')) + return -EINVAL; + if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; if (kn->parent != new_parent) @@ -3334,6 +3666,8 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); mutex_unlock(&cgroup_mutex); @@ -3405,7 +3739,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, bool is_add) { struct cftype *cft, *cft_end = NULL; - int ret; + int ret = 0; lockdep_assert_held(&cgroup_mutex); @@ -3434,7 +3768,7 @@ restart: cgroup_rm_file(cgrp, cft); } } - return 0; + return ret; } static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) @@ -3451,7 +3785,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; - if (cgroup_is_dead(cgrp)) + if (!(css->flags & CSS_VISIBLE)) continue; ret = cgroup_addrm_files(css, cgrp, cfts, is_add); @@ -3641,7 +3975,9 @@ void cgroup_file_notify(struct cgroup_file *cfile) * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * - * Return the number of tasks in the cgroup. + * Return the number of tasks in the cgroup. The returned number can be + * higher than the actual number of tasks due to css_set references from + * namespace roots and temporary usages. */ static int cgroup_task_count(const struct cgroup *cgrp) { @@ -4072,6 +4408,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) struct task_struct *task; int ret; + if (!cgroup_may_migrate_to(to)) + return -EBUSY; + mutex_lock(&cgroup_mutex); percpu_down_write(&cgroup_threadgroup_rwsem); @@ -4082,12 +4421,12 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_migrate_add_src(link->cset, to, &preloaded_csets); spin_unlock_irq(&css_set_lock); - ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_err; /* - * Migrate tasks one-by-one until @form is empty. This fails iff + * Migrate tasks one-by-one until @from is empty. This fails iff * ->can_attach() fails. */ do { @@ -4102,7 +4441,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(task, false, to); + ret = cgroup_migrate(task, false, to->root); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); put_task_struct(task); } } while (task && !ret); @@ -4610,12 +4951,6 @@ static struct cftype cgroup_dfl_base_files[] = { }, { .name = "cgroup.controllers", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_root_controllers_show, - }, - { - .name = "cgroup.controllers", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_controllers_show, }, { @@ -4774,6 +5109,8 @@ static void css_release_work_fn(struct work_struct *work) ss->css_released(css); } else { /* cgroup release path */ + trace_cgroup_release(cgrp); + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -4784,7 +5121,11 @@ static void css_release_work_fn(struct work_struct *work) * Those are supported by RCU protecting clearing of * cgrp->kn->priv backpointer. */ - RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + if (cgrp->kn) + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, + NULL); + + cgroup_bpf_put(cgrp); } mutex_unlock(&cgroup_mutex); @@ -4856,6 +5197,9 @@ static void offline_css(struct cgroup_subsys_state *css) if (!(css->flags & CSS_ONLINE)) return; + if (ss->css_reset) + ss->css_reset(css); + if (ss->css_offline) ss->css_offline(css); @@ -4866,17 +5210,16 @@ static void offline_css(struct cgroup_subsys_state *css) } /** - * create_css - create a cgroup_subsys_state + * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css - * @visible: whether to create control knobs for the new css or not * * Create a new css associated with @cgrp - @ss pair. On success, the new - * css is online and installed in @cgrp with all interface files created if - * @visible. Returns 0 on success, -errno on failure. + * css is online and installed in @cgrp. This function doesn't create the + * interface files. Returns 0 on success, -errno on failure. */ -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, - bool visible) +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); @@ -4886,8 +5229,10 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, lockdep_assert_held(&cgroup_mutex); css = ss->css_alloc(parent_css); + if (!css) + css = ERR_PTR(-ENOMEM); if (IS_ERR(css)) - return PTR_ERR(css); + return css; init_and_link_css(css, ss, cgrp); @@ -4897,15 +5242,9 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); if (err < 0) - goto err_free_percpu_ref; + goto err_free_css; css->id = err; - if (visible) { - err = css_populate_dir(css, NULL); - if (err) - goto err_free_id; - } - /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); @@ -4923,45 +5262,32 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, ss->warned_broken_hierarchy = true; } - return 0; + return css; err_list_del: list_del_rcu(&css->sibling); - css_clear_dir(css, NULL); -err_free_id: - cgroup_idr_remove(&ss->css_idr, css->id); -err_free_percpu_ref: - percpu_ref_exit(&css->refcnt); err_free_css: call_rcu(&css->rcu_head, css_free_rcu_fn); - return err; + return ERR_PTR(err); } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +/* + * The returned cgroup is fully initialized including its control mask, but + * it isn't associated with its kernfs_node and doesn't have the control + * mask applied. + */ +static struct cgroup *cgroup_create(struct cgroup *parent) { - struct cgroup *parent, *cgrp; - struct cgroup_root *root; - struct cgroup_subsys *ss; - struct kernfs_node *kn; - int ssid, ret; - - /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. - */ - if (strchr(name, '\n')) - return -EINVAL; - - parent = cgroup_kn_lock_live(parent_kn); - if (!parent) - return -ENODEV; - root = parent->root; + struct cgroup_root *root = parent->root; + struct cgroup *cgrp, *tcgrp; + int level = parent->level + 1; + int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); - if (!cgrp) { - ret = -ENOMEM; - goto out_unlock; - } + cgrp = kzalloc(sizeof(*cgrp) + + sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); + if (!cgrp) + return ERR_PTR(-ENOMEM); ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) @@ -4981,6 +5307,15 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->self.parent = &parent->self; cgrp->root = root; + cgrp->level = level; + ret = cgroup_bpf_inherit(cgrp); + if (ret) { + cgroup_idr_remove(&root->cgroup_idr, cgrp->id); + goto out_cancel_ref; + } + + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -4988,20 +5323,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - goto out_free_id; - } - cgrp->kn = kn; - - /* - * This extra ref will be put in cgroup_free_fn() and guarantees - * that @cgrp->kn is always accessible. - */ - kernfs_get(kn); - cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ @@ -5015,51 +5336,84 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, */ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + /* + * On the default hierarchy, a child doesn't automatically inherit + * subtree_control from the parent. Each is configured manually. + */ + if (!cgroup_on_dfl(cgrp)) + cgrp->subtree_control = cgroup_control(cgrp); + + cgroup_propagate_control(cgrp); + + return cgrp; + +out_cancel_ref: + percpu_ref_exit(&cgrp->self.refcnt); +out_free_cgrp: + kfree(cgrp); + return ERR_PTR(ret); +} + +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + struct cgroup *parent, *cgrp; + struct kernfs_node *kn; + int ret; + + /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ + if (strchr(name, '\n')) + return -EINVAL; + + parent = cgroup_kn_lock_live(parent_kn, false); + if (!parent) + return -ENODEV; + + cgrp = cgroup_create(parent); + if (IS_ERR(cgrp)) { + ret = PTR_ERR(cgrp); + goto out_unlock; + } + + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + goto out_destroy; + } + cgrp->kn = kn; + + /* + * This extra ref will be put in cgroup_free_fn() and guarantees + * that @cgrp->kn is always accessible. + */ + kernfs_get(kn); + ret = cgroup_kn_set_ugid(kn); if (ret) goto out_destroy; - ret = css_populate_dir(&cgrp->self, NULL); + ret = css_populate_dir(&cgrp->self); if (ret) goto out_destroy; - /* let's create and online css's */ - for_each_subsys(ss, ssid) { - if (parent->child_subsys_mask & (1 << ssid)) { - ret = create_css(cgrp, ss, - parent->subtree_control & (1 << ssid)); - if (ret) - goto out_destroy; - } - } + ret = cgroup_apply_control_enable(cgrp); + if (ret) + goto out_destroy; - /* - * On the default hierarchy, a child doesn't automatically inherit - * subtree_control from the parent. Each is configured manually. - */ - if (!cgroup_on_dfl(cgrp)) { - cgrp->subtree_control = parent->subtree_control; - cgroup_refresh_child_subsys_mask(cgrp); - } + trace_cgroup_mkdir(cgrp); + /* let's create and online css's */ kernfs_activate(kn); ret = 0; goto out_unlock; -out_free_id: - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); -out_cancel_ref: - percpu_ref_exit(&cgrp->self.refcnt); -out_free_cgrp: - kfree(cgrp); +out_destroy: + cgroup_destroy_locked(cgrp); out_unlock: cgroup_kn_unlock(parent_kn); return ret; - -out_destroy: - cgroup_destroy_locked(cgrp); - goto out_unlock; } /* @@ -5113,7 +5467,7 @@ static void kill_css(struct cgroup_subsys_state *css) * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ - css_clear_dir(css, NULL); + css_clear_dir(css); /* * Killing would put the base ref, but we need to keep it alive @@ -5218,12 +5572,15 @@ static int cgroup_rmdir(struct kernfs_node *kn) struct cgroup *cgrp; int ret = 0; - cgrp = cgroup_kn_lock_live(kn); + cgrp = cgroup_kn_lock_live(kn, false); if (!cgrp) return 0; ret = cgroup_destroy_locked(cgrp); + if (!ret) + trace_cgroup_rmdir(cgrp); + cgroup_kn_unlock(kn); return ret; } @@ -5234,13 +5591,14 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, + .show_path = cgroup_show_path, }; static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; - printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + pr_debug("Initializing cgroup subsys %s\n", ss->name); mutex_lock(&cgroup_mutex); @@ -5308,7 +5666,7 @@ int __init cgroup_init_early(void) for_each_subsys(ss, i) { WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, - "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", + "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, @@ -5325,7 +5683,7 @@ int __init cgroup_init_early(void) return 0; } -static unsigned long cgroup_disable_mask __initdata; +static u16 cgroup_disable_mask __initdata; /** * cgroup_init - cgroup initialization @@ -5336,9 +5694,9 @@ static unsigned long cgroup_disable_mask __initdata; int __init cgroup_init(void) { struct cgroup_subsys *ss; - unsigned long key; int ssid; + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); @@ -5349,11 +5707,16 @@ int __init cgroup_init(void) */ rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); + get_user_ns(init_cgroup_ns.user_ns); + mutex_lock(&cgroup_mutex); - /* Add init_css_set to the hash table */ - key = css_set_hash(init_css_set.subsys); - hash_add(css_set_table, &init_css_set.hlist, key); + /* + * Add init_css_set to the hash table so that dfl_root can link to + * it during init. + */ + hash_add(css_set_table, &init_css_set.hlist, + css_set_hash(init_css_set.subsys)); BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); @@ -5386,10 +5749,16 @@ int __init cgroup_init(void) continue; } + if (cgroup_ssid_no_v1(ssid)) + printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", + ss->name); + cgrp_dfl_root.subsys_mask |= 1 << ss->id; - if (!ss->dfl_cftypes) - cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; + if (ss->implicit_on_dfl) + cgrp_dfl_implicit_ss_mask |= 1 << ss->id; + else if (!ss->dfl_cftypes) + cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); @@ -5402,6 +5771,11 @@ int __init cgroup_init(void) ss->bind(init_css_set.subsys[ssid]); } + /* init_css_set.subsys[] has been updated, re-hash */ + hash_del(&init_css_set.hlist); + hash_add(css_set_table, &init_css_set.hlist, + css_set_hash(init_css_set.subsys)); + WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); @@ -5443,7 +5817,7 @@ core_initcall(cgroup_wq_init); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { - char *buf, *path; + char *buf; int retval; struct cgroup_root *root; @@ -5460,7 +5834,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible) + if (root == &cgrp_dfl_root && !cgrp_dfl_visible) continue; seq_printf(m, "%d:", root->hierarchy_id); @@ -5486,17 +5860,18 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { - path = cgroup_path(cgrp, buf, PATH_MAX); - if (!path) { + retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + if (retval >= PATH_MAX) retval = -ENAMETOOLONG; + if (retval < 0) goto out_unlock; - } + + seq_puts(m, buf); } else { - path = "/"; + seq_puts(m, "/"); } - seq_puts(m, path); - if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) seq_puts(m, " (deleted)\n"); else @@ -5548,19 +5923,6 @@ static const struct file_operations proc_cgroupstats_operations = { .release = single_release, }; -static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i) -{ - if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END) - return &ss_priv[i - CGROUP_CANFORK_START]; - return NULL; -} - -static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i) -{ - void **private = subsys_canfork_priv_p(ss_priv, i); - return private ? *private : NULL; -} - /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. @@ -5583,17 +5945,16 @@ void cgroup_fork(struct task_struct *child) * returns an error, the fork aborts with that error code. This allows for * a cgroup subsystem to conditionally allow or deny new forks. */ -int cgroup_can_fork(struct task_struct *child, - void *ss_priv[CGROUP_CANFORK_COUNT]) +int cgroup_can_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i, j, ret; - for_each_subsys_which(ss, i, &have_canfork_callback) { - ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i)); + do_each_subsys_mask(ss, i, have_canfork_callback) { + ret = ss->can_fork(child); if (ret) goto out_revert; - } + } while_each_subsys_mask(); return 0; @@ -5602,7 +5963,7 @@ out_revert: if (j >= i) break; if (ss->cancel_fork) - ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j)); + ss->cancel_fork(child); } return ret; @@ -5615,15 +5976,14 @@ out_revert: * This calls the cancel_fork() callbacks if a fork failed *after* * cgroup_can_fork() succeded. */ -void cgroup_cancel_fork(struct task_struct *child, - void *ss_priv[CGROUP_CANFORK_COUNT]) +void cgroup_cancel_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) - ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i)); + ss->cancel_fork(child); } /** @@ -5636,8 +5996,7 @@ void cgroup_cancel_fork(struct task_struct *child, * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. */ -void cgroup_post_fork(struct task_struct *child, - void *old_ss_priv[CGROUP_CANFORK_COUNT]) +void cgroup_post_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i; @@ -5680,8 +6039,9 @@ void cgroup_post_fork(struct task_struct *child, * css_set; otherwise, @child might change state between ->fork() * and addition to css_set. */ - for_each_subsys_which(ss, i, &have_fork_callback) - ss->fork(child, subsys_canfork_priv(old_ss_priv, i)); + do_each_subsys_mask(ss, i, have_fork_callback) { + ss->fork(child); + } while_each_subsys_mask(); } /** @@ -5727,8 +6087,9 @@ void cgroup_exit(struct task_struct *tsk) spin_unlock_irq(&css_set_lock); /* see cgroup_post_fork() for details */ - for_each_subsys_which(ss, i, &have_exit_callback) + do_each_subsys_mask(ss, i, have_exit_callback) { ss->exit(tsk); + } while_each_subsys_mask(); } void cgroup_free(struct task_struct *task) @@ -5737,8 +6098,9 @@ void cgroup_free(struct task_struct *task) struct cgroup_subsys *ss; int ssid; - for_each_subsys_which(ss, ssid, &have_free_callback) + do_each_subsys_mask(ss, ssid, have_free_callback) { ss->free(task); + } while_each_subsys_mask(); put_css_set(cset); } @@ -5777,8 +6139,9 @@ static void cgroup_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL, *path; + char *pathbuf = NULL, *agentbuf = NULL; char *argv[3], *envp[3]; + int ret; mutex_lock(&cgroup_mutex); @@ -5788,13 +6151,13 @@ static void cgroup_release_agent(struct work_struct *work) goto out; spin_lock_irq(&css_set_lock); - path = cgroup_path(cgrp, pathbuf, PATH_MAX); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); spin_unlock_irq(&css_set_lock); - if (!path) + if (ret < 0 || ret >= PATH_MAX) goto out; argv[0] = agentbuf; - argv[1] = path; + argv[1] = pathbuf; argv[2] = NULL; /* minimal command environment */ @@ -5833,6 +6196,33 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); +static int __init cgroup_no_v1(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + if (!strcmp(token, "all")) { + cgroup_no_v1_mask = U16_MAX; + break; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + cgroup_no_v1_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_no_v1=", cgroup_no_v1); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -5846,12 +6236,13 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct file_system_type *s_type = dentry->d_sb->s_type; struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; /* is @dentry a cgroup dir? */ - if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || - kernfs_type(kn) != KERNFS_DIR) + if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || + !kn || kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); @@ -5883,8 +6274,306 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); - return id > 0 ? idr_find(&ss->css_idr, id) : NULL; + return idr_find(&ss->css_idr, id); +} + +/** + * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path + * @path: path on the default hierarchy + * + * Find the cgroup at @path on the default hierarchy, increment its + * reference count and return it. Returns pointer to the found cgroup on + * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) + * if @path points to a non-directory. + */ +struct cgroup *cgroup_get_from_path(const char *path) +{ + struct kernfs_node *kn; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + + kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); + if (kn) { + if (kernfs_type(kn) == KERNFS_DIR) { + cgrp = kn->priv; + cgroup_get(cgrp); + } else { + cgrp = ERR_PTR(-ENOTDIR); + } + kernfs_put(kn); + } else { + cgrp = ERR_PTR(-ENOENT); + } + + mutex_unlock(&cgroup_mutex); + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_path); + +/** + * cgroup_get_from_fd - get a cgroup pointer from a fd + * @fd: fd obtained by open(cgroup2_dir) + * + * Find the cgroup from a fd which should be obtained + * by opening a cgroup directory. Returns a pointer to the + * cgroup on success. ERR_PTR is returned if the cgroup + * cannot be found. + */ +struct cgroup *cgroup_get_from_fd(int fd) +{ + struct cgroup_subsys_state *css; + struct cgroup *cgrp; + struct file *f; + + f = fget_raw(fd); + if (!f) + return ERR_PTR(-EBADF); + + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + fput(f); + if (IS_ERR(css)) + return ERR_CAST(css); + + cgrp = css->cgroup; + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_fd); + +/* + * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data + * definition in cgroup-defs.h. + */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + +DEFINE_SPINLOCK(cgroup_sk_update_lock); +static bool cgroup_sk_alloc_disabled __read_mostly; + +void cgroup_sk_alloc_disable(void) +{ + if (cgroup_sk_alloc_disabled) + return; + pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); + cgroup_sk_alloc_disabled = true; +} + +#else + +#define cgroup_sk_alloc_disabled false + +#endif + +void cgroup_sk_alloc(struct sock_cgroup_data *skcd) +{ + if (cgroup_sk_alloc_disabled) { + skcd->no_refcnt = 1; + return; + } + + /* Don't associate the sock with unrelated interrupted task's cgroup. */ + if (in_interrupt()) + return; + + rcu_read_lock(); + + while (true) { + struct css_set *cset; + + cset = task_css_set(current); + if (likely(cgroup_tryget(cset->dfl_cgrp))) { + skcd->val = (unsigned long)cset->dfl_cgrp; + break; + } + cpu_relax(); + } + + rcu_read_unlock(); +} + +void cgroup_sk_clone(struct sock_cgroup_data *skcd) +{ + /* Socket clone path */ + if (skcd->val) { + if (skcd->no_refcnt) + return; + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(sock_cgroup_ptr(skcd)); + } +} + +void cgroup_sk_free(struct sock_cgroup_data *skcd) +{ + if (skcd->no_refcnt) + return; + + cgroup_put(sock_cgroup_ptr(skcd)); +} + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + +/* cgroup namespaces */ + +static struct cgroup_namespace *alloc_cgroup_ns(void) +{ + struct cgroup_namespace *new_ns; + int ret; + + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + atomic_set(&new_ns->count, 1); + new_ns->ns.ops = &cgroupns_operations; + return new_ns; +} + +void free_cgroup_ns(struct cgroup_namespace *ns) +{ + put_css_set(ns->root_cset); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} +EXPORT_SYMBOL(free_cgroup_ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + struct cgroup_namespace *new_ns; + struct css_set *cset; + + BUG_ON(!old_ns); + + if (!(flags & CLONE_NEWCGROUP)) { + get_cgroup_ns(old_ns); + return old_ns; + } + + /* Allow only sysadmin to create cgroup namespace. */ + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* It is not safe to take cgroup_mutex here */ + spin_lock_irq(&css_set_lock); + cset = task_css_set(current); + get_css_set(cset); + spin_unlock_irq(&css_set_lock); + + new_ns = alloc_cgroup_ns(); + if (IS_ERR(new_ns)) { + put_css_set(cset); + return new_ns; + } + + new_ns->user_ns = get_user_ns(user_ns); + new_ns->root_cset = cset; + + return new_ns; +} + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +{ + struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); + + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Don't need to do anything if we are attaching to our own cgroupns. */ + if (cgroup_ns == nsproxy->cgroup_ns) + return 0; + + get_cgroup_ns(cgroup_ns); + put_cgroup_ns(nsproxy->cgroup_ns); + nsproxy->cgroup_ns = cgroup_ns; + + return 0; +} + +static struct ns_common *cgroupns_get(struct task_struct *task) +{ + struct cgroup_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->cgroup_ns; + get_cgroup_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void cgroupns_put(struct ns_common *ns) +{ + put_cgroup_ns(to_cg_ns(ns)); +} + +static struct user_namespace *cgroupns_owner(struct ns_common *ns) +{ + return to_cg_ns(ns)->user_ns; +} + +const struct proc_ns_operations cgroupns_operations = { + .name = "cgroup", + .type = CLONE_NEWCGROUP, + .get = cgroupns_get, + .put = cgroupns_put, + .install = cgroupns_install, + .owner = cgroupns_owner, +}; + +static __init int cgroup_namespaces_init(void) +{ + return 0; +} +subsys_initcall(cgroup_namespaces_init); + +#ifdef CONFIG_CGROUP_BPF +int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_attach(cgrp, prog, type, flags); + mutex_unlock(&cgroup_mutex); + return ret; +} +int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_detach(cgrp, prog, type, flags); + mutex_unlock(&cgroup_mutex); + return ret; } +#endif /* CONFIG_CGROUP_BPF */ #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * |