diff options
Diffstat (limited to 'kernel/cgroup/cgroup.c')
-rw-r--r-- | kernel/cgroup/cgroup.c | 197 |
1 files changed, 148 insertions, 49 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 63eff85f251f..8df1b3cdfc42 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -55,6 +55,7 @@ #include <linux/nsproxy.h> #include <linux/file.h> #include <linux/sched/cputime.h> +#include <linux/cpu.h> #include <net/sock.h> #define CREATE_TRACE_POINTS @@ -677,7 +678,8 @@ struct css_set init_css_set = { .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), - .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), + .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), + .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), /* @@ -1151,7 +1153,8 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); - INIT_LIST_HEAD(&cset->mg_preload_node); + INIT_LIST_HEAD(&cset->mg_src_preload_node); + INIT_LIST_HEAD(&cset->mg_dst_preload_node); INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in @@ -1649,7 +1652,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - int ssid, i, ret; + int ssid, ret; u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1693,7 +1696,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); - struct css_set *cset; + struct css_set *cset, *cset_pos; + struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); @@ -1711,9 +1715,22 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) + WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); + list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], + e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); + /* + * all css_sets of scgrp together in same order to dcgrp, + * patch in-flight iterators to preserve correct iteration. + * since the iterator is always advanced right away and + * finished when it->cset_pos meets it->cset_head, so only + * update it->cset_head is enough here. + */ + list_for_each_entry(it, &cset->task_iters, iters_node) + if (it->cset_head == &scgrp->e_csets[ss->id]) + it->cset_head = &dcgrp->e_csets[ss->id]; + } spin_unlock_irq(&css_set_lock); /* default hierarchy doesn't enable controllers by default */ @@ -2208,6 +2225,45 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) EXPORT_SYMBOL_GPL(task_cgroup_path); /** + * cgroup_attach_lock - Lock for ->attach() + * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * + * cgroup migration sometimes needs to stabilize threadgroups against forks and + * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() + * implementations (e.g. cpuset), also need to disable CPU hotplug. + * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can + * lead to deadlocks. + * + * Bringing up a CPU may involve creating and destroying tasks which requires + * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside + * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while + * write-locking threadgroup_rwsem, the locking order is reversed and we end up + * waiting for an on-going CPU hotplug operation which in turn is waiting for + * the threadgroup_rwsem to be released to create new tasks. For more details: + * + * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu + * + * Resolve the situation by always acquiring cpus_read_lock() before optionally + * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that + * CPU hotplug is disabled on entry. + */ +static void cgroup_attach_lock(void) +{ + get_online_cpus(); + percpu_down_write(&cgroup_threadgroup_rwsem); +} + +/** + * cgroup_attach_unlock - Undo cgroup_attach_lock() + * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + */ +static void cgroup_attach_unlock(void) +{ + percpu_up_write(&cgroup_threadgroup_rwsem); + put_online_cpus(); +} + +/** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task * @mgctx: target migration context @@ -2455,21 +2511,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) */ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { - LIST_HEAD(preloaded); struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded); - list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded); + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, + mg_src_preload_node) { + cset->mg_src_cgrp = NULL; + cset->mg_dst_cgrp = NULL; + cset->mg_dst_cset = NULL; + list_del_init(&cset->mg_src_preload_node); + put_css_set_locked(cset); + } - list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) { + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, + mg_dst_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; - list_del_init(&cset->mg_preload_node); + list_del_init(&cset->mg_dst_preload_node); put_css_set_locked(cset); } @@ -2511,7 +2573,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - if (!list_empty(&src_cset->mg_preload_node)) + if (!list_empty(&src_cset->mg_src_preload_node)) return; WARN_ON(src_cset->mg_src_cgrp); @@ -2522,7 +2584,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); - list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets); + list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); } /** @@ -2547,7 +2609,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, - mg_preload_node) { + mg_src_preload_node) { struct css_set *dst_cset; struct cgroup_subsys *ss; int ssid; @@ -2566,7 +2628,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; - list_del_init(&src_cset->mg_preload_node); + list_del_init(&src_cset->mg_src_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; @@ -2574,8 +2636,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) src_cset->mg_dst_cset = dst_cset; - if (list_empty(&dst_cset->mg_preload_node)) - list_add_tail(&dst_cset->mg_preload_node, + if (list_empty(&dst_cset->mg_dst_preload_node)) + list_add_tail(&dst_cset->mg_dst_preload_node, &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); @@ -2686,7 +2748,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); - percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_attach_lock(); rcu_read_lock(); if (pid) { @@ -2717,7 +2779,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) goto out_unlock_rcu; out_unlock_threadgroup: - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(); out_unlock_rcu: rcu_read_unlock(); return tsk; @@ -2732,7 +2794,7 @@ void cgroup_procs_write_finish(struct task_struct *task) /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -2791,7 +2853,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_attach_lock(); /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); @@ -2809,7 +2871,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) goto out_finish; spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { + list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, + mg_src_preload_node) { struct task_struct *task, *ntask; /* all tasks in src_csets need to be migrated */ @@ -2821,7 +2884,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(); return ret; } @@ -3451,24 +3514,43 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; + struct cgroup_file_ctx *ctx; + int ret; - if (cft->open) - return cft->open(of); - return 0; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->ns = current->nsproxy->cgroup_ns; + get_cgroup_ns(ctx->ns); + of->priv = ctx; + + if (!cft->open) + return 0; + + ret = cft->open(of); + if (ret) { + put_cgroup_ns(ctx->ns); + kfree(ctx); + } + return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; + struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); + put_cgroup_ns(ctx->ns); + kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of->kn->priv; struct cgroup_subsys_state *css; @@ -3482,7 +3564,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && - ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) @@ -4376,21 +4458,21 @@ void css_task_iter_end(struct css_task_iter *it) static void cgroup_procs_release(struct kernfs_open_file *of) { - if (of->priv) { - css_task_iter_end(of->priv); - kfree(of->priv); - } + struct cgroup_file_ctx *ctx = of->priv; + + if (ctx->procs.started) + css_task_iter_end(&ctx->procs.iter); } static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; if (pos) (*pos)++; - return css_task_iter_next(it); + return css_task_iter_next(&ctx->procs.iter); } static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, @@ -4398,21 +4480,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct css_task_iter *it = &ctx->procs.iter; /* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos. */ - if (!it) { + if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); - - it = kzalloc(sizeof(*it), GFP_KERNEL); - if (!it) - return ERR_PTR(-ENOMEM); - of->priv = it; css_task_iter_start(&cgrp->self, iter_flags, it); + ctx->procs.started = true; } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); @@ -4447,9 +4526,9 @@ static int cgroup_procs_show(struct seq_file *s, void *v) static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb) + struct super_block *sb, + struct cgroup_namespace *ns) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *com_cgrp = src_cgrp; struct inode *inode; int ret; @@ -4485,8 +4564,10 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, static ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; + const struct cred *saved_cred; ssize_t ret; dst_cgrp = cgroup_kn_lock_live(of->kn, false); @@ -4503,8 +4584,16 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); + /* + * Process and thread migrations follow same delegation rule. Check + * permissions using the credentials from file open to protect against + * inherited fd attacks. + */ + saved_cred = override_creds(of->file->f_cred); ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb); + of->file->f_path.dentry->d_sb, + ctx->ns); + revert_creds(saved_cred); if (ret) goto out_finish; @@ -4526,8 +4615,10 @@ static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) static ssize_t cgroup_threads_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; + const struct cred *saved_cred; ssize_t ret; buf = strstrip(buf); @@ -4546,9 +4637,16 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); - /* thread migrations follow the cgroup.procs delegation rule */ + /* + * Process and thread migrations follow same delegation rule. Check + * permissions using the credentials from file open to protect against + * inherited fd attacks. + */ + saved_cred = override_creds(of->file->f_cred); ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb); + of->file->f_path.dentry->d_sb, + ctx->ns); + revert_creds(saved_cred); if (ret) goto out_finish; @@ -4703,10 +4801,10 @@ static void css_free_rwork_fn(struct work_struct *work) } } -static void css_release_work_fn(struct work_struct *work) +static void css_release_work_fn(struct swork_event *sev) { struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, destroy_work); + container_of(sev, struct cgroup_subsys_state, destroy_swork); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; @@ -4768,8 +4866,8 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + INIT_SWORK(&css->destroy_swork, css_release_work_fn); + swork_queue(&css->destroy_swork); } static void init_and_link_css(struct cgroup_subsys_state *css, @@ -5489,6 +5587,7 @@ static int __init cgroup_wq_init(void) */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); + BUG_ON(swork_get()); return 0; } core_initcall(cgroup_wq_init); |