aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup')
-rw-r--r--kernel/cgroup/cgroup-internal.h4
-rw-r--r--kernel/cgroup/cgroup-v1.c36
-rw-r--r--kernel/cgroup/cgroup.c80
-rw-r--r--kernel/cgroup/cpuset.c307
-rw-r--r--kernel/cgroup/rstat.c150
5 files changed, 398 insertions, 179 deletions
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index c56071f150f2..520b90dd97ec 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -164,13 +164,13 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
/* iterate across the hierarchies */
#define for_each_root(root) \
- list_for_each_entry((root), &cgroup_roots, root_list)
+ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \
+ lockdep_is_held(&cgroup_mutex))
/**
* for_each_subsys - iterate all enabled cgroup subsystems
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 76db6c67e39a..520a11cb12f4 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -802,7 +802,7 @@ void cgroup1_release_agent(struct work_struct *work)
goto out_free;
ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
- if (ret < 0 || ret >= PATH_MAX)
+ if (ret < 0)
goto out_free;
argv[0] = agentbuf;
@@ -1262,6 +1262,40 @@ int cgroup1_get_tree(struct fs_context *fc)
return ret;
}
+/**
+ * task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @tsk: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returned. On failure, ERR_PTR is returned.
+ * We limit it to cgroup1 only.
+ */
+struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id)
+{
+ struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup_root *root;
+ unsigned long flags;
+
+ rcu_read_lock();
+ for_each_root(root) {
+ /* cgroup1 only*/
+ if (root == &cgrp_dfl_root)
+ continue;
+ if (root->hierarchy_id != hierarchy_id)
+ continue;
+ spin_lock_irqsave(&css_set_lock, flags);
+ cgrp = task_cgroup_from_root(tsk, root);
+ if (!cgrp || !cgroup_tryget(cgrp))
+ cgrp = ERR_PTR(-ENOENT);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+ break;
+ }
+ rcu_read_unlock();
+ return cgrp;
+}
+
static int __init cgroup1_wq_init(void)
{
/*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4b9ff41ca603..a66c088c851c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
void cgroup_free_root(struct cgroup_root *root)
{
- kfree(root);
+ kfree_rcu(root, rcu);
}
static void cgroup_destroy_root(struct cgroup_root *root)
@@ -1347,10 +1347,9 @@ static void cgroup_destroy_root(struct cgroup_root *root)
spin_unlock_irq(&css_set_lock);
- if (!list_empty(&root->root_list)) {
- list_del(&root->root_list);
- cgroup_root_count--;
- }
+ WARN_ON_ONCE(list_empty(&root->root_list));
+ list_del_rcu(&root->root_list);
+ cgroup_root_count--;
if (!have_favordynmods)
cgroup_favor_dynmods(root, false);
@@ -1390,7 +1389,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
}
}
- BUG_ON(!res_cgroup);
+ /*
+ * If cgroup_mutex is not held, the cgrp_cset_link will be freed
+ * before we remove the cgroup root from the root_list. Consequently,
+ * when accessing a cgroup root, the cset_link may have already been
+ * freed, resulting in a NULL res_cgroup. However, by holding the
+ * cgroup_mutex, we ensure that res_cgroup can't be NULL.
+ * If we don't hold cgroup_mutex in the caller, we must do the NULL
+ * check.
+ */
return res_cgroup;
}
@@ -1413,6 +1420,11 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
rcu_read_unlock();
+ /*
+ * The namespace_sem is held by current, so the root cgroup can't
+ * be umounted. Therefore, we can ensure that the res is non-NULL.
+ */
+ WARN_ON_ONCE(!res);
return res;
}
@@ -1449,7 +1461,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
struct cgroup_root *root)
{
- lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
return __cset_cgroup_from_root(cset, root);
@@ -1457,7 +1468,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
/*
* Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_lock held.
+ * called with css_set_lock held to prevent task's groups from being modified.
+ * Must be called with either cgroup_mutex or rcu read lock to prevent the
+ * cgroup root from being destroyed.
*/
struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroup_root *root)
@@ -1893,7 +1906,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
spin_unlock_irq(&css_set_lock);
- if (len >= PATH_MAX)
+ if (len == -E2BIG)
len = -ERANGE;
else if (len > 0) {
seq_escape(sf, buf, " \t\n\\");
@@ -2032,7 +2045,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
struct cgroup_root *root = ctx->root;
struct cgroup *cgrp = &root->cgrp;
- INIT_LIST_HEAD(&root->root_list);
+ INIT_LIST_HEAD_RCU(&root->root_list);
atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
@@ -2115,7 +2128,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
* care of subsystems' refcounts, which are explicitly dropped in
* the failure exit path.
*/
- list_add(&root->root_list, &cgroup_roots);
+ list_add_rcu(&root->root_list, &cgroup_roots);
cgroup_root_count++;
/*
@@ -4169,20 +4182,6 @@ static struct kernfs_ops cgroup_kf_ops = {
.seq_show = cgroup_seqfile_show,
};
-/* set uid and gid of cgroup dirs and files to that of the creator */
-static int cgroup_kn_set_ugid(struct kernfs_node *kn)
-{
- struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
- .ia_uid = current_fsuid(),
- .ia_gid = current_fsgid(), };
-
- if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
- gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
- return 0;
-
- return kernfs_setattr(kn, &iattr);
-}
-
static void cgroup_file_notify_timer(struct timer_list *timer)
{
cgroup_file_notify(container_of(timer, struct cgroup_file,
@@ -4195,25 +4194,18 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
char name[CGROUP_FILE_NAME_MAX];
struct kernfs_node *kn;
struct lock_class_key *key = NULL;
- int ret;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
key = &cft->lockdep_key;
#endif
kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
cgroup_file_mode(cft),
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+ current_fsuid(), current_fsgid(),
0, cft->kf_ops, cft,
NULL, key);
if (IS_ERR(kn))
return PTR_ERR(kn);
- ret = cgroup_kn_set_ugid(kn);
- if (ret) {
- kernfs_remove(kn);
- return ret;
- }
-
if (cft->file_offset) {
struct cgroup_file *cfile = (void *)css + cft->file_offset;
@@ -5616,7 +5608,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
goto out_cancel_ref;
/* create the directory */
- kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ kn = kernfs_create_dir_ns(parent->kn, name, mode,
+ current_fsuid(), current_fsgid(),
+ cgrp, NULL);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
goto out_stat_exit;
@@ -5761,10 +5755,6 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
*/
kernfs_get(cgrp->kn);
- ret = cgroup_kn_set_ugid(cgrp->kn);
- if (ret)
- goto out_destroy;
-
ret = css_populate_dir(&cgrp->self);
if (ret)
goto out_destroy;
@@ -6265,7 +6255,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (!buf)
goto out;
- cgroup_lock();
+ rcu_read_lock();
spin_lock_irq(&css_set_lock);
for_each_root(root) {
@@ -6276,6 +6266,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
continue;
+ cgrp = task_cgroup_from_root(tsk, root);
+ /* The root has already been unmounted. */
+ if (!cgrp)
+ continue;
+
seq_printf(m, "%d:", root->hierarchy_id);
if (root != &cgrp_dfl_root)
for_each_subsys(ss, ssid)
@@ -6286,9 +6281,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
-
- cgrp = task_cgroup_from_root(tsk, root);
-
/*
* On traditional hierarchies, all zombie tasks show up as
* belonging to the root cgroup. On the default hierarchy,
@@ -6301,7 +6293,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
- if (retval >= PATH_MAX)
+ if (retval == -E2BIG)
retval = -ENAMETOOLONG;
if (retval < 0)
goto out_unlock;
@@ -6320,7 +6312,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
retval = 0;
out_unlock:
spin_unlock_irq(&css_set_lock);
- cgroup_unlock();
+ rcu_read_unlock();
kfree(buf);
out:
return retval;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 615daaf87f1f..927bef3a598a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -25,6 +25,7 @@
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
+#include <linux/delay.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
@@ -43,6 +44,7 @@
#include <linux/sched/isolation.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
+#include <linux/workqueue.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@@ -204,6 +206,11 @@ struct cpuset {
*/
static cpumask_var_t subpartitions_cpus;
+/*
+ * Exclusive CPUs in isolated partitions
+ */
+static cpumask_var_t isolated_cpus;
+
/* List of remote partition root children */
static struct list_head remote_children;
@@ -1317,6 +1324,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus,
*/
enum partition_cmd {
partcmd_enable, /* Enable partition root */
+ partcmd_enablei, /* Enable isolated partition root */
partcmd_disable, /* Disable partition root */
partcmd_update, /* Update parent's effective_cpus */
partcmd_invalidate, /* Make partition invalid */
@@ -1419,6 +1427,109 @@ static void reset_partition_data(struct cpuset *cs)
}
/*
+ * partition_xcpus_newstate - Exclusive CPUs state change
+ * @old_prs: old partition_root_state
+ * @new_prs: new partition_root_state
+ * @xcpus: exclusive CPUs with state change
+ */
+static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
+{
+ WARN_ON_ONCE(old_prs == new_prs);
+ if (new_prs == PRS_ISOLATED)
+ cpumask_or(isolated_cpus, isolated_cpus, xcpus);
+ else
+ cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
+}
+
+/*
+ * partition_xcpus_add - Add new exclusive CPUs to partition
+ * @new_prs: new partition_root_state
+ * @parent: parent cpuset
+ * @xcpus: exclusive CPUs to be added
+ * Return: true if isolated_cpus modified, false otherwise
+ *
+ * Remote partition if parent == NULL
+ */
+static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
+ struct cpumask *xcpus)
+{
+ bool isolcpus_updated;
+
+ WARN_ON_ONCE(new_prs < 0);
+ lockdep_assert_held(&callback_lock);
+ if (!parent)
+ parent = &top_cpuset;
+
+
+ if (parent == &top_cpuset)
+ cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
+
+ isolcpus_updated = (new_prs != parent->partition_root_state);
+ if (isolcpus_updated)
+ partition_xcpus_newstate(parent->partition_root_state, new_prs,
+ xcpus);
+
+ cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
+ return isolcpus_updated;
+}
+
+/*
+ * partition_xcpus_del - Remove exclusive CPUs from partition
+ * @old_prs: old partition_root_state
+ * @parent: parent cpuset
+ * @xcpus: exclusive CPUs to be removed
+ * Return: true if isolated_cpus modified, false otherwise
+ *
+ * Remote partition if parent == NULL
+ */
+static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
+ struct cpumask *xcpus)
+{
+ bool isolcpus_updated;
+
+ WARN_ON_ONCE(old_prs < 0);
+ lockdep_assert_held(&callback_lock);
+ if (!parent)
+ parent = &top_cpuset;
+
+ if (parent == &top_cpuset)
+ cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
+
+ isolcpus_updated = (old_prs != parent->partition_root_state);
+ if (isolcpus_updated)
+ partition_xcpus_newstate(old_prs, parent->partition_root_state,
+ xcpus);
+
+ cpumask_and(xcpus, xcpus, cpu_active_mask);
+ cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
+ return isolcpus_updated;
+}
+
+static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
+{
+ int ret;
+
+ lockdep_assert_cpus_held();
+
+ if (!isolcpus_updated)
+ return;
+
+ ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
+ WARN_ON_ONCE(ret < 0);
+}
+
+/**
+ * cpuset_cpu_is_isolated - Check if the given CPU is isolated
+ * @cpu: the CPU number to be checked
+ * Return: true if CPU is used in an isolated partition, false otherwise
+ */
+bool cpuset_cpu_is_isolated(int cpu)
+{
+ return cpumask_test_cpu(cpu, isolated_cpus);
+}
+EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
+
+/*
* compute_effective_exclusive_cpumask - compute effective exclusive CPUs
* @cs: cpuset
* @xcpus: effective exclusive CPUs value to be set
@@ -1456,14 +1567,18 @@ static inline bool is_local_partition(struct cpuset *cs)
/*
* remote_partition_enable - Enable current cpuset as a remote partition root
* @cs: the cpuset to update
+ * @new_prs: new partition_root_state
* @tmp: temparary masks
* Return: 1 if successful, 0 if error
*
* Enable the current cpuset to become a remote partition root taking CPUs
* directly from the top cpuset. cpuset_mutex must be held by the caller.
*/
-static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp)
+static int remote_partition_enable(struct cpuset *cs, int new_prs,
+ struct tmpmasks *tmp)
{
+ bool isolcpus_updated;
+
/*
* The user must have sysadmin privilege.
*/
@@ -1485,26 +1600,22 @@ static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp)
return 0;
spin_lock_irq(&callback_lock);
- cpumask_andnot(top_cpuset.effective_cpus,
- top_cpuset.effective_cpus, tmp->new_cpus);
- cpumask_or(subpartitions_cpus,
- subpartitions_cpus, tmp->new_cpus);
-
+ isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
+ list_add(&cs->remote_sibling, &remote_children);
if (cs->use_parent_ecpus) {
struct cpuset *parent = parent_cs(cs);
cs->use_parent_ecpus = false;
parent->child_ecpus_count--;
}
- list_add(&cs->remote_sibling, &remote_children);
spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
/*
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
*/
update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
-
return 1;
}
@@ -1519,23 +1630,22 @@ static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp)
*/
static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
{
+ bool isolcpus_updated;
+
compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
WARN_ON_ONCE(!is_remote_partition(cs));
WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
spin_lock_irq(&callback_lock);
- cpumask_andnot(subpartitions_cpus,
- subpartitions_cpus, tmp->new_cpus);
- cpumask_and(tmp->new_cpus,
- tmp->new_cpus, cpu_active_mask);
- cpumask_or(top_cpuset.effective_cpus,
- top_cpuset.effective_cpus, tmp->new_cpus);
list_del_init(&cs->remote_sibling);
+ isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
+ NULL, tmp->new_cpus);
cs->partition_root_state = -cs->partition_root_state;
if (!cs->prs_err)
cs->prs_err = PERR_INVCPUS;
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
/*
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1557,6 +1667,8 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
struct tmpmasks *tmp)
{
bool adding, deleting;
+ int prs = cs->partition_root_state;
+ int isolcpus_updated = 0;
if (WARN_ON_ONCE(!is_remote_partition(cs)))
return;
@@ -1580,21 +1692,12 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
goto invalidate;
spin_lock_irq(&callback_lock);
- if (adding) {
- cpumask_or(subpartitions_cpus,
- subpartitions_cpus, tmp->addmask);
- cpumask_andnot(top_cpuset.effective_cpus,
- top_cpuset.effective_cpus, tmp->addmask);
- }
- if (deleting) {
- cpumask_andnot(subpartitions_cpus,
- subpartitions_cpus, tmp->delmask);
- cpumask_and(tmp->delmask,
- tmp->delmask, cpu_active_mask);
- cpumask_or(top_cpuset.effective_cpus,
- top_cpuset.effective_cpus, tmp->delmask);
- }
+ if (adding)
+ isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
+ if (deleting)
+ isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
/*
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1676,11 +1779,11 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
* @tmp: Temporary addmask and delmask
* Return: 0 or a partition root state error code
*
- * For partcmd_enable, the cpuset is being transformed from a non-partition
- * root to a partition root. The effective_xcpus (cpus_allowed if effective_xcpus
- * not set) mask of the given cpuset will be taken away from parent's
- * effective_cpus. The function will return 0 if all the CPUs listed in
- * effective_xcpus can be granted or an error code will be returned.
+ * For partcmd_enable*, the cpuset is being transformed from a non-partition
+ * root to a partition root. The effective_xcpus (cpus_allowed if
+ * effective_xcpus not set) mask of the given cpuset will be taken away from
+ * parent's effective_cpus. The function will return 0 if all the CPUs listed
+ * in effective_xcpus can be granted or an error code will be returned.
*
* For partcmd_disable, the cpuset is being transformed from a partition
* root back to a non-partition root. Any CPUs in effective_xcpus will be
@@ -1695,7 +1798,7 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
*
* For partcmd_invalidate, the current partition will be made invalid.
*
- * The partcmd_enable and partcmd_disable commands are used by
+ * The partcmd_enable* and partcmd_disable commands are used by
* update_prstate(). An error code may be returned and the caller will check
* for error.
*
@@ -1716,6 +1819,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
int part_error = PERR_NONE; /* Partition error? */
int subparts_delta = 0;
struct cpumask *xcpus; /* cs effective_xcpus */
+ int isolcpus_updated = 0;
bool nocpu;
lockdep_assert_held(&cpuset_mutex);
@@ -1760,7 +1864,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
nocpu = tasks_nocpu_error(parent, cs, xcpus);
- if (cmd == partcmd_enable) {
+ if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
/*
* Enabling partition root is not allowed if its
* effective_xcpus is empty or doesn't overlap with
@@ -1783,6 +1887,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
cpumask_copy(tmp->delmask, xcpus);
deleting = true;
subparts_delta++;
+ new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
} else if (cmd == partcmd_disable) {
/*
* May need to add cpus to parent's effective_cpus for
@@ -1792,6 +1897,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
if (adding)
subparts_delta--;
+ new_prs = PRS_MEMBER;
} else if (newmask) {
/*
* Empty cpumask is not allowed
@@ -1940,38 +2046,28 @@ write_error:
* newly deleted ones will be added back to effective_cpus.
*/
spin_lock_irq(&callback_lock);
- if (adding) {
- if (parent == &top_cpuset)
- cpumask_andnot(subpartitions_cpus,
- subpartitions_cpus, tmp->addmask);
- /*
- * Some of the CPUs in effective_xcpus might have been offlined.
- */
- cpumask_or(parent->effective_cpus,
- parent->effective_cpus, tmp->addmask);
- cpumask_and(parent->effective_cpus,
- parent->effective_cpus, cpu_active_mask);
- }
- if (deleting) {
- if (parent == &top_cpuset)
- cpumask_or(subpartitions_cpus,
- subpartitions_cpus, tmp->delmask);
- cpumask_andnot(parent->effective_cpus,
- parent->effective_cpus, tmp->delmask);
- }
-
- if (is_partition_valid(parent)) {
- parent->nr_subparts += subparts_delta;
- WARN_ON_ONCE(parent->nr_subparts < 0);
- }
-
if (old_prs != new_prs) {
cs->partition_root_state = new_prs;
if (new_prs <= 0)
cs->nr_subparts = 0;
}
+ /*
+ * Adding to parent's effective_cpus means deletion CPUs from cs
+ * and vice versa.
+ */
+ if (adding)
+ isolcpus_updated += partition_xcpus_del(old_prs, parent,
+ tmp->addmask);
+ if (deleting)
+ isolcpus_updated += partition_xcpus_add(new_prs, parent,
+ tmp->delmask);
+ if (is_partition_valid(parent)) {
+ parent->nr_subparts += subparts_delta;
+ WARN_ON_ONCE(parent->nr_subparts < 0);
+ }
spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
if ((old_prs != new_prs) && (cmd == partcmd_update))
update_partition_exclusive(cs, new_prs);
@@ -2466,7 +2562,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
update_partition_sd_lb(cs, old_prs);
out_free:
free_cpumasks(NULL, &tmp);
- return 0;
+ return retval;
}
/**
@@ -2502,9 +2598,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
return 0;
- if (alloc_cpumasks(NULL, &tmp))
- return -ENOMEM;
-
if (*buf)
compute_effective_exclusive_cpumask(trialcs, NULL);
@@ -2519,6 +2612,9 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval)
return retval;
+ if (alloc_cpumasks(NULL, &tmp))
+ return -ENOMEM;
+
if (old_prs) {
if (cpumask_empty(trialcs->effective_xcpus)) {
invalidate = true;
@@ -2948,6 +3044,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
int err = PERR_NONE, old_prs = cs->partition_root_state;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
+ bool new_xcpus_state = false;
if (old_prs == new_prs)
return 0;
@@ -2977,6 +3074,9 @@ static int update_prstate(struct cpuset *cs, int new_prs)
goto out;
if (!old_prs) {
+ enum partition_cmd cmd = (new_prs == PRS_ROOT)
+ ? partcmd_enable : partcmd_enablei;
+
/*
* cpus_allowed cannot be empty.
*/
@@ -2985,19 +3085,18 @@ static int update_prstate(struct cpuset *cs, int new_prs)
goto out;
}
- err = update_parent_effective_cpumask(cs, partcmd_enable,
- NULL, &tmpmask);
+ err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
/*
* If an attempt to become local partition root fails,
* try to become a remote partition root instead.
*/
- if (err && remote_partition_enable(cs, &tmpmask))
+ if (err && remote_partition_enable(cs, new_prs, &tmpmask))
err = 0;
} else if (old_prs && new_prs) {
/*
* A change in load balance state only, no change in cpumasks.
*/
- ;
+ new_xcpus_state = true;
} else {
/*
* Switching back to member is always allowed even if it
@@ -3029,7 +3128,10 @@ out:
WRITE_ONCE(cs->prs_err, err);
if (!is_partition_valid(cs))
reset_partition_data(cs);
+ else if (new_xcpus_state)
+ partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(new_xcpus_state);
/* Force update if switching back to member */
update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
@@ -3386,6 +3488,7 @@ typedef enum {
FILE_SUBPARTS_CPULIST,
FILE_EXCLUSIVE_CPULIST,
FILE_EFFECTIVE_XCPULIST,
+ FILE_ISOLATED_CPULIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
@@ -3582,6 +3685,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_SUBPARTS_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
break;
+ case FILE_ISOLATED_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
+ break;
default:
ret = -EINVAL;
}
@@ -3875,6 +3981,13 @@ static struct cftype dfl_files[] = {
.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
},
+ {
+ .name = "cpus.isolated",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_ISOLATED_CPULIST,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
{ } /* terminate */
};
@@ -4194,6 +4307,7 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
@@ -4306,6 +4420,30 @@ void cpuset_force_rebuild(void)
force_rebuild = true;
}
+/*
+ * Attempt to acquire a cpus_read_lock while a hotplug operation may be in
+ * progress.
+ * Return: true if successful, false otherwise
+ *
+ * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
+ * cpus_read_trylock() is used here to acquire the lock.
+ */
+static bool cpuset_hotplug_cpus_read_trylock(void)
+{
+ int retries = 0;
+
+ while (!cpus_read_trylock()) {
+ /*
+ * CPU hotplug still in progress. Retry 5 times
+ * with a 10ms wait before bailing out.
+ */
+ if (++retries > 5)
+ return false;
+ msleep(10);
+ }
+ return true;
+}
+
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
@@ -4322,6 +4460,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
bool cpus_updated;
bool mems_updated;
bool remote;
+ int partcmd = -1;
struct cpuset *parent;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -4353,11 +4492,13 @@ retry:
compute_partition_effective_cpumask(cs, &new_cpus);
if (remote && cpumask_empty(&new_cpus) &&
- partition_is_populated(cs, NULL)) {
+ partition_is_populated(cs, NULL) &&
+ cpuset_hotplug_cpus_read_trylock()) {
remote_partition_disable(cs, tmp);
compute_effective_cpumask(&new_cpus, cs, parent);
remote = false;
cpuset_force_rebuild();
+ cpus_read_unlock();
}
/*
@@ -4368,18 +4509,28 @@ retry:
* partitions.
*/
if (is_local_partition(cs) && (!is_partition_valid(parent) ||
- tasks_nocpu_error(parent, cs, &new_cpus))) {
- update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp);
- compute_effective_cpumask(&new_cpus, cs, parent);
- cpuset_force_rebuild();
- }
+ tasks_nocpu_error(parent, cs, &new_cpus)))
+ partcmd = partcmd_invalidate;
/*
* On the other hand, an invalid partition root may be transitioned
* back to a regular one.
*/
- else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
- update_parent_effective_cpumask(cs, partcmd_update, NULL, tmp);
- if (is_partition_valid(cs)) {
+ else if (is_partition_valid(parent) && is_partition_invalid(cs))
+ partcmd = partcmd_update;
+
+ /*
+ * cpus_read_lock needs to be held before calling
+ * update_parent_effective_cpumask(). To avoid circular lock
+ * dependency between cpuset_mutex and cpus_read_lock,
+ * cpus_read_trylock() is used here to acquire the lock.
+ */
+ if (partcmd >= 0) {
+ if (!cpuset_hotplug_cpus_read_trylock())
+ goto update_tasks;
+
+ update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
+ cpus_read_unlock();
+ if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
compute_partition_effective_cpumask(cs, &new_cpus);
cpuset_force_rebuild();
}
@@ -4941,7 +5092,7 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
css_put(css);
- if (retval >= PATH_MAX)
+ if (retval == -E2BIG)
retval = -ENAMETOOLONG;
if (retval < 0)
goto out_free;
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index c0adb7254b45..a8350d2d63e6 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -74,64 +74,109 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
}
/**
- * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
- * @pos: current position
- * @root: root of the tree to traversal
+ * cgroup_rstat_push_children - push children cgroups into the given list
+ * @head: current head of the list (= subtree root)
+ * @child: first child of the root
* @cpu: target cpu
+ * Return: A new singly linked list of cgroups to be flush
*
- * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
- * the traversal and %NULL return indicates the end. During traversal,
- * each returned cgroup is unlinked from the tree. Must be called with the
- * matching cgroup_rstat_cpu_lock held.
+ * Iteratively traverse down the cgroup_rstat_cpu updated tree level by
+ * level and push all the parents first before their next level children
+ * into a singly linked list built from the tail backward like "pushing"
+ * cgroups into a stack. The root is pushed by the caller.
+ */
+static struct cgroup *cgroup_rstat_push_children(struct cgroup *head,
+ struct cgroup *child, int cpu)
+{
+ struct cgroup *chead = child; /* Head of child cgroup level */
+ struct cgroup *ghead = NULL; /* Head of grandchild cgroup level */
+ struct cgroup *parent, *grandchild;
+ struct cgroup_rstat_cpu *crstatc;
+
+ child->rstat_flush_next = NULL;
+
+next_level:
+ while (chead) {
+ child = chead;
+ chead = child->rstat_flush_next;
+ parent = cgroup_parent(child);
+
+ /* updated_next is parent cgroup terminated */
+ while (child != parent) {
+ child->rstat_flush_next = head;
+ head = child;
+ crstatc = cgroup_rstat_cpu(child, cpu);
+ grandchild = crstatc->updated_children;
+ if (grandchild != child) {
+ /* Push the grand child to the next level */
+ crstatc->updated_children = child;
+ grandchild->rstat_flush_next = ghead;
+ ghead = grandchild;
+ }
+ child = crstatc->updated_next;
+ crstatc->updated_next = NULL;
+ }
+ }
+
+ if (ghead) {
+ chead = ghead;
+ ghead = NULL;
+ goto next_level;
+ }
+ return head;
+}
+
+/**
+ * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
+ * @root: root of the cgroup subtree to traverse
+ * @cpu: target cpu
+ * Return: A singly linked list of cgroups to be flushed
+ *
+ * Walks the updated rstat_cpu tree on @cpu from @root. During traversal,
+ * each returned cgroup is unlinked from the updated tree.
*
* The only ordering guarantee is that, for a parent and a child pair
- * covered by a given traversal, if a child is visited, its parent is
- * guaranteed to be visited afterwards.
+ * covered by a given traversal, the child is before its parent in
+ * the list.
+ *
+ * Note that updated_children is self terminated and points to a list of
+ * child cgroups if not empty. Whereas updated_next is like a sibling link
+ * within the children list and terminated by the parent cgroup. An exception
+ * here is the cgroup root whose updated_next can be self terminated.
*/
-static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
- struct cgroup *root, int cpu)
+static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
{
- struct cgroup_rstat_cpu *rstatc;
- struct cgroup *parent;
-
- if (pos == root)
- return NULL;
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+ struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
+ struct cgroup *head = NULL, *parent, *child;
+ unsigned long flags;
/*
- * We're gonna walk down to the first leaf and visit/remove it. We
- * can pick whatever unvisited node as the starting point.
+ * The _irqsave() is needed because cgroup_rstat_lock is
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+ * this lock with the _irq() suffix only disables interrupts on
+ * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+ * interrupts on both configurations. The _irqsave() ensures
+ * that interrupts are always disabled and later restored.
*/
- if (!pos) {
- pos = root;
- /* return NULL if this subtree is not on-list */
- if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
- return NULL;
- } else {
- pos = cgroup_parent(pos);
- }
+ raw_spin_lock_irqsave(cpu_lock, flags);
- /* walk down to the first leaf */
- while (true) {
- rstatc = cgroup_rstat_cpu(pos, cpu);
- if (rstatc->updated_children == pos)
- break;
- pos = rstatc->updated_children;
- }
+ /* Return NULL if this subtree is not on-list */
+ if (!rstatc->updated_next)
+ goto unlock_ret;
/*
- * Unlink @pos from the tree. As the updated_children list is
+ * Unlink @root from its parent. As the updated_children list is
* singly linked, we have to walk it to find the removal point.
- * However, due to the way we traverse, @pos will be the first
- * child in most cases. The only exception is @root.
*/
- parent = cgroup_parent(pos);
+ parent = cgroup_parent(root);
if (parent) {
struct cgroup_rstat_cpu *prstatc;
struct cgroup **nextp;
prstatc = cgroup_rstat_cpu(parent, cpu);
nextp = &prstatc->updated_children;
- while (*nextp != pos) {
+ while (*nextp != root) {
struct cgroup_rstat_cpu *nrstatc;
nrstatc = cgroup_rstat_cpu(*nextp, cpu);
@@ -142,7 +187,17 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
}
rstatc->updated_next = NULL;
- return pos;
+
+ /* Push @root to the list first before pushing the children */
+ head = root;
+ root->rstat_flush_next = NULL;
+ child = rstatc->updated_children;
+ rstatc->updated_children = root;
+ if (child != root)
+ head = cgroup_rstat_push_children(head, child, cpu);
+unlock_ret:
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
+ return head;
}
/*
@@ -176,21 +231,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_rstat_lock);
for_each_possible_cpu(cpu) {
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
- cpu);
- struct cgroup *pos = NULL;
- unsigned long flags;
+ struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);
- /*
- * The _irqsave() is needed because cgroup_rstat_lock is
- * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
- * this lock with the _irq() suffix only disables interrupts on
- * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
- * interrupts on both configurations. The _irqsave() ensures
- * that interrupts are always disabled and later restored.
- */
- raw_spin_lock_irqsave(cpu_lock, flags);
- while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
+ for (; pos; pos = pos->rstat_flush_next) {
struct cgroup_subsys_state *css;
cgroup_base_stat_flush(pos, cpu);
@@ -202,7 +245,6 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock_irqrestore(cpu_lock, flags);
/* play nice and yield if necessary */
if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {