Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c | 2
-rw-r--r--  kernel/async.c | 4
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/audit_watch.c | 9
-rw-r--r--  kernel/auditsc.c | 2
-rw-r--r--  kernel/bpf/bpf_lru_list.c | 21
-rw-r--r--  kernel/bpf/bpf_lru_list.h | 7
-rw-r--r--  kernel/bpf/core.c | 5
-rw-r--r--  kernel/bpf/lpm_trie.c | 3
-rw-r--r--  kernel/bpf/verifier.c | 6
-rw-r--r--  kernel/cgroup/cgroup-internal.h | 1
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 8
-rw-r--r--  kernel/cgroup/cgroup.c | 70
-rw-r--r--  kernel/cgroup/cpuset.c | 29
-rw-r--r--  kernel/cgroup/namespace.c | 6
-rw-r--r--  kernel/compat.c | 2
-rw-r--r--  kernel/cpu.c | 8
-rw-r--r--  kernel/cred.c | 66
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 14
-rw-r--r--  kernel/dma/coherent.c | 4
-rw-r--r--  kernel/dma/debug.c | 2
-rw-r--r--  kernel/events/core.c | 152
-rw-r--r--  kernel/events/ring_buffer.c | 11
-rw-r--r--  kernel/events/uprobes.c | 2
-rw-r--r--  kernel/exit.c | 74
-rw-r--r--  kernel/extable.c | 3
-rw-r--r--  kernel/futex.c | 12
-rw-r--r--  kernel/gcov/gcc_4_7.c | 5
-rw-r--r--  kernel/irq/generic-chip.c | 25
-rw-r--r--  kernel/irq/irqdomain.c | 31
-rw-r--r--  kernel/irq/matrix.c | 6
-rw-r--r--  kernel/kexec_core.c | 5
-rw-r--r--  kernel/kexec_file.c | 14
-rw-r--r--  kernel/kprobes.c | 6
-rw-r--r--  kernel/locking/lockdep.c | 6
-rw-r--r--  kernel/locking/test-ww_mutex.c | 20
-rw-r--r--  kernel/module.c | 41
-rw-r--r--  kernel/padata.c | 2
-rw-r--r--  kernel/panic.c | 75
-rw-r--r--  kernel/power/snapshot.c | 16
-rw-r--r--  kernel/rcu/tree_exp.h | 2
-rw-r--r--  kernel/relay.c | 22
-rw-r--r--  kernel/sched/core.c | 10
-rw-r--r--  kernel/sched/fair.c | 56
-rw-r--r--  kernel/sched/idle.c | 22
-rw-r--r--  kernel/sched/rt.c | 5
-rw-r--r--  kernel/sys.c | 2
-rw-r--r--  kernel/time/alarmtimer.c | 33
-rw-r--r--  kernel/time/hrtimer.c | 35
-rw-r--r--  kernel/time/posix-stubs.c | 2
-rw-r--r--  kernel/time/posix-timers.c | 33
-rw-r--r--  kernel/time/tick-sched.c | 23
-rw-r--r--  kernel/trace/blktrace.c | 3
-rw-r--r--  kernel/trace/bpf_trace.c | 2
-rw-r--r--  kernel/trace/ftrace.c | 78
-rw-r--r--  kernel/trace/ring_buffer.c | 82
-rw-r--r--  kernel/trace/trace.c | 91
-rw-r--r--  kernel/trace/trace.h | 3
-rw-r--r--  kernel/trace/trace_events.c | 15
-rw-r--r--  kernel/trace/trace_events_hist.c | 16
-rw-r--r--  kernel/trace/trace_irqsoff.c | 3
-rw-r--r--  kernel/trace/trace_output.c | 9
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 2
-rw-r--r--  kernel/trace/trace_uprobe.c | 3
-rw-r--r--  kernel/watchdog_hld.c | 6
-rw-r--r--  kernel/workqueue.c | 21
66 files changed, 965 insertions, 391 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 81f9831a7859..6d98aed403ba 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -331,6 +331,8 @@ static comp_t encode_comp_t(unsigned long value)
exp++;
}
+ if (exp > (((comp_t) ~0U) >> MANTSIZE))
+ return (comp_t) ~0U;
/*
* Clean it up and polish it off.
*/
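
The encode_comp_t() change above saturates the accounting value once the exponent no longer fits its field, instead of letting the shifted exponent corrupt the mantissa. A minimal userspace sketch of the same saturation rule, assuming the kernel's 13-bit mantissa / 3-bit base-8 exponent comp_t layout (the rounding step of the real function is omitted):

/*
 * Sketch only: clamp to the all-ones encoding when the exponent overflows,
 * mirroring the check added to encode_comp_t().
 */
#include <stdint.h>

#define MANTSIZE 13
#define EXPSIZE  3
#define MAXFRACT ((1 << MANTSIZE) - 1)

static uint16_t encode_comp(unsigned long value)
{
    int exp = 0;

    while (value > MAXFRACT) {
        value >>= EXPSIZE;                  /* base-8 exponent: shift by 3 */
        exp++;
    }
    if (exp > ((uint16_t)~0U >> MANTSIZE))  /* exponent field overflow */
        return (uint16_t)~0U;               /* saturate instead of wrapping */
    return (uint16_t)((exp << MANTSIZE) | value);
}
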
diff --git a/kernel/async.c b/kernel/async.c
index 4bf1b00a28d8..e59bd2240cb8 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work)
struct async_entry *entry =
container_of(work, struct async_entry, work);
unsigned long flags;
- ktime_t uninitialized_var(calltime), delta, rettime;
+ ktime_t calltime, delta, rettime;
/* 1) run (and print duration) */
if (initcall_debug && system_state < SYSTEM_RUNNING) {
@@ -283,7 +283,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
*/
void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
- ktime_t uninitialized_var(starttime), delta, endtime;
+ ktime_t starttime, delta, endtime;
if (initcall_debug && system_state < SYSTEM_RUNNING) {
pr_debug("async_waiting @ %i\n", task_pid_nr(current));
diff --git a/kernel/audit.c b/kernel/audit.c
index 7dc14a4d9e3c..471d3ad910aa 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1796,7 +1796,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
{
struct audit_buffer *ab;
struct timespec64 t;
- unsigned int uninitialized_var(serial);
+ unsigned int serial;
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 50952d6d8120..ff33536ae1ad 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -557,11 +557,18 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
unsigned long ino;
dev_t dev;
- exe_file = get_task_exe_file(tsk);
+ /* only do exe filtering if we are recording @current events/records */
+ if (tsk != current)
+ return 0;
+
+ if (!current->mm)
+ return 0;
+ exe_file = get_mm_exe_file(current->mm);
if (!exe_file)
return 0;
ino = file_inode(exe_file)->i_ino;
dev = file_inode(exe_file)->i_sb->s_dev;
fput(exe_file);
+
return audit_mark_compare(mark, ino, dev);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1513873e23bd..e4de5b9d5d3f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1923,6 +1923,8 @@ void __audit_inode_child(struct inode *parent,
}
}
+ cond_resched();
+
/* is there a matching child entry? */
list_for_each_entry(n, &context->names_list, list) {
/* can only match entries that have a name */
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 9b5eeff72fd3..39a0e768adc3 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -44,7 +44,12 @@ static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
/* bpf_lru_node helpers */
static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
{
- return node->ref;
+ return READ_ONCE(node->ref);
+}
+
+static void bpf_lru_node_clear_ref(struct bpf_lru_node *node)
+{
+ WRITE_ONCE(node->ref, 0);
}
static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
@@ -92,7 +97,7 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_move(&node->list, &l->lists[tgt_type]);
}
@@ -113,7 +118,7 @@ static void __bpf_lru_node_move(struct bpf_lru_list *l,
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
}
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
/* If the moving node is the next_inactive_rotation candidate,
* move the next_inactive_rotation pointer also.
@@ -356,7 +361,7 @@ static void __local_list_add_pending(struct bpf_lru *lru,
*(u32 *)((void *)node + lru->hash_offset) = hash;
node->cpu = cpu;
node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, local_pending_list(loc_l));
}
@@ -422,7 +427,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
if (!list_empty(free_list)) {
node = list_first_entry(free_list, struct bpf_lru_node, list);
*(u32 *)((void *)node + lru->hash_offset) = hash;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
}
@@ -525,7 +530,7 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
}
node->type = BPF_LRU_LOCAL_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_move(&node->list, local_free_list(loc_l));
raw_spin_unlock_irqrestore(&loc_l->lock, flags);
@@ -571,7 +576,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
node = (struct bpf_lru_node *)(buf + node_offset);
node->type = BPF_LRU_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
buf += elem_size;
}
@@ -597,7 +602,7 @@ again:
node = (struct bpf_lru_node *)(buf + node_offset);
node->cpu = cpu;
node->type = BPF_LRU_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
i++;
buf += elem_size;
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 7d4f89b7cb84..08da78b59f0b 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -66,11 +66,8 @@ struct bpf_lru {
static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
{
- /* ref is an approximation on access frequency. It does not
- * have to be very accurate. Hence, no protection is used.
- */
- if (!node->ref)
- node->ref = 1;
+ if (!READ_ONCE(node->ref))
+ WRITE_ONCE(node->ref, 1);
}
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
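
The bpf_lru hunks above convert the racy node->ref accesses to READ_ONCE()/WRITE_ONCE(): the flag is only an access-frequency hint, so the race is intentional, but the accesses must be marked so the compiler cannot tear or fuse them. A rough userspace analogue, using C11 relaxed atomics as a stand-in for the kernel macros (types and names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>

struct lru_node {
    _Atomic unsigned char ref;   /* approximate "recently used" hint */
};

static bool node_is_ref(struct lru_node *node)
{
    return atomic_load_explicit(&node->ref, memory_order_relaxed);
}

static void node_set_ref(struct lru_node *node)
{
    /* test first so the common case does not dirty the cache line */
    if (!atomic_load_explicit(&node->ref, memory_order_relaxed))
        atomic_store_explicit(&node->ref, 1, memory_order_relaxed);
}

static void node_clear_ref(struct lru_node *node)
{
    atomic_store_explicit(&node->ref, 0, memory_order_relaxed);
}
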
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index cbbd0168f50c..285101772c75 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -32,6 +32,7 @@
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/perf_event.h>
+#include <linux/nospec.h>
#include <asm/barrier.h>
#include <asm/unaligned.h>
@@ -602,7 +603,7 @@ static int __init bpf_jit_charge_init(void)
{
/* Only used as heuristic here to derive limit. */
bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
- bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2,
+ bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1,
PAGE_SIZE), LONG_MAX);
return 0;
}
@@ -1373,9 +1374,7 @@ out:
* reuse preexisting logic from Spectre v1 mitigation that
* happens to produce the required code on x86 for v4 as well.
*/
-#ifdef CONFIG_X86
barrier_nospec();
-#endif
CONT;
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1a8b208f6c55..fcd3a15add41 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -194,6 +194,9 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
struct lpm_trie_node *node, *found = NULL;
struct bpf_lpm_trie_key *key = _key;
+ if (key->prefixlen > trie->max_prefixlen)
+ return NULL;
+
/* Start walking the trie from the root node ... */
for (node = rcu_dereference(trie->root); node;) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 694ee0b1fefe..a48de55f5630 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1012,7 +1012,9 @@ static int check_stack_write(struct bpf_verifier_env *env,
bool sanitize = reg && is_spillable_regtype(reg->type);
for (i = 0; i < size; i++) {
- if (state->stack[spi].slot_type[i] == STACK_INVALID) {
+ u8 type = state->stack[spi].slot_type[i];
+
+ if (type != STACK_MISC && type != STACK_ZERO) {
sanitize = true;
break;
}
@@ -5934,7 +5936,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
insn->dst_reg,
shift);
- insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
(1ULL << size * 8) - 1);
}
}
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 8f7729b0a638..4168e7d97e87 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -148,7 +148,6 @@ extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
-extern struct file_system_type cgroup_fs_type;
/* iterate across the hierarchies */
#define for_each_root(root) \
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 61644976225a..55a61deb3d04 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -13,6 +13,7 @@
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
#include <linux/cgroupstats.h>
+#include <linux/cpu.h>
#include <trace/events/cgroup.h>
@@ -55,6 +56,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
mutex_lock(&cgroup_mutex);
+ get_online_cpus();
percpu_down_write(&cgroup_threadgroup_rwsem);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -71,6 +73,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
break;
}
percpu_up_write(&cgroup_threadgroup_rwsem);
+ put_online_cpus();
mutex_unlock(&cgroup_mutex);
return retval;
@@ -392,10 +395,9 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
}
css_task_iter_end(&it);
length = n;
- /* now sort & (if procs) strip out duplicates */
+ /* now sort & strip out duplicates (tgids or recycled thread PIDs) */
sort(array, length, sizeof(pid_t), cmppid, NULL);
- if (type == CGROUP_FILE_PROCS)
- length = pidlist_uniq(array, length);
+ length = pidlist_uniq(array, length);
l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 992f0ee8dba1..8df1b3cdfc42 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -55,6 +55,7 @@
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/sched/cputime.h>
+#include <linux/cpu.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -1651,7 +1652,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
- int ssid, i, ret;
+ int ssid, ret;
u16 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -1695,7 +1696,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
- struct css_set *cset;
+ struct css_set *cset, *cset_pos;
+ struct css_task_iter *it;
WARN_ON(!css || cgroup_css(dcgrp, ss));
@@ -1713,9 +1715,22 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
css->cgroup = dcgrp;
spin_lock_irq(&css_set_lock);
- hash_for_each(css_set_table, i, cset, hlist)
+ WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
+ list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
+ e_cset_node[ss->id]) {
list_move_tail(&cset->e_cset_node[ss->id],
&dcgrp->e_csets[ss->id]);
+ /*
+ * all css_sets of scgrp together in same order to dcgrp,
+ * patch in-flight iterators to preserve correct iteration.
+ * since the iterator is always advanced right away and
+ * finished when it->cset_pos meets it->cset_head, so only
+ * update it->cset_head is enough here.
+ */
+ list_for_each_entry(it, &cset->task_iters, iters_node)
+ if (it->cset_head == &scgrp->e_csets[ss->id])
+ it->cset_head = &dcgrp->e_csets[ss->id];
+ }
spin_unlock_irq(&css_set_lock);
/* default hierarchy doesn't enable controllers by default */
@@ -2210,6 +2225,45 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
EXPORT_SYMBOL_GPL(task_cgroup_path);
/**
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ */
+static void cgroup_attach_lock(void)
+{
+ get_online_cpus();
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ */
+static void cgroup_attach_unlock(void)
+{
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ put_online_cpus();
+}
+
+/**
* cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
* @mgctx: target migration context
@@ -2694,7 +2748,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return ERR_PTR(-EINVAL);
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_lock();
rcu_read_lock();
if (pid) {
@@ -2725,7 +2779,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
goto out_unlock_rcu;
out_unlock_threadgroup:
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock();
out_unlock_rcu:
rcu_read_unlock();
return tsk;
@@ -2740,7 +2794,7 @@ void cgroup_procs_write_finish(struct task_struct *task)
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock();
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
@@ -2799,7 +2853,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_lock();
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
@@ -2830,7 +2884,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock();
return ret;
}
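
The cgroup_attach_lock()/cgroup_attach_unlock() helpers above pin a single lock order: the CPU-hotplug lock is always taken before cgroup_threadgroup_rwsem, so ->attach() callbacks such as cpuset's can assume hotplug is already disabled rather than acquiring the lock themselves in the wrong order. A toy pthread sketch of that ordering rule (get_online_cpus()/percpu_down_write() are kernel primitives and are only mirrored here by ordinary rwlocks):

#include <pthread.h>

static pthread_rwlock_t hotplug_lock     = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t threadgroup_lock = PTHREAD_RWLOCK_INITIALIZER;

static void attach_lock(void)
{
    pthread_rwlock_rdlock(&hotplug_lock);      /* outer lock first */
    pthread_rwlock_wrlock(&threadgroup_lock);  /* then the inner lock */
}

static void attach_unlock(void)
{
    pthread_rwlock_unlock(&threadgroup_lock);
    pthread_rwlock_unlock(&hotplug_lock);      /* release in reverse order */
}
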
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 42ed4e497336..41019614ca7c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -830,8 +830,8 @@ static void rebuild_sched_domains_locked(void)
cpumask_var_t *doms;
int ndoms;
+ lockdep_assert_cpus_held();
lockdep_assert_held(&cpuset_mutex);
- get_online_cpus();
/*
* We have raced with CPU hotplug. Don't do anything to avoid
@@ -839,15 +839,13 @@ static void rebuild_sched_domains_locked(void)
* Anyways, hotplug work item will rebuild sched domains.
*/
if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
- goto out;
+ return;
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-out:
- put_online_cpus();
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
@@ -857,9 +855,11 @@ static void rebuild_sched_domains_locked(void)
void rebuild_sched_domains(void)
{
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
rebuild_sched_domains_locked();
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
}
/**
@@ -1504,7 +1504,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
cs = css_cs(css);
mutex_lock(&cpuset_mutex);
- css_cs(css)->attach_in_progress--;
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
mutex_unlock(&cpuset_mutex);
}
@@ -1528,13 +1530,9 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
+ lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
mutex_lock(&cpuset_mutex);
- /*
- * It should hold cpus lock because a cpu offline event can
- * cause set_cpus_allowed_ptr() failed.
- */
- get_online_cpus();
/* prepare for attach */
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1553,7 +1551,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
cpuset_update_task_spread_flag(cs, task);
}
- put_online_cpus();
/*
* Change mm for all threadgroup leaders. This is expensive and may
@@ -1617,6 +1614,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
@@ -1654,6 +1652,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
return retval;
}
@@ -1664,6 +1663,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -1678,6 +1678,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
return retval;
}
@@ -1716,6 +1717,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -1741,6 +1743,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_trial_cpuset(trialcs);
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
@@ -1985,6 +1988,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
set_bit(CS_ONLINE, &cs->flags);
@@ -2035,6 +2039,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
raw_spin_unlock_irq(&callback_lock);
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
return 0;
}
@@ -2048,6 +2053,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (is_sched_load_balance(cs))
@@ -2057,6 +2063,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
clear_bit(CS_ONLINE, &cs->flags);
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index b05f1dd58a62..313e66b8c662 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -148,9 +148,3 @@ const struct proc_ns_operations cgroupns_operations = {
.install = cgroupns_install,
.owner = cgroupns_owner,
};
-
-static __init int cgroup_namespaces_init(void)
-{
- return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/compat.c b/kernel/compat.c
index e4548a9e9c52..5f320b0db8d0 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -307,7 +307,7 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
if (len & (sizeof(compat_ulong_t)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 57bd32c83c9e..4b0c5788a369 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1462,7 +1462,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_HRTIMERS_PREPARE] = {
.name = "hrtimers:prepare",
.startup.single = hrtimers_prepare_cpu,
- .teardown.single = hrtimers_dead_cpu,
+ .teardown.single = NULL,
},
[CPUHP_SMPCFD_PREPARE] = {
.name = "smpcfd:prepare",
@@ -1529,6 +1529,12 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = NULL,
.teardown.single = smpcfd_dying_cpu,
},
+ [CPUHP_AP_HRTIMERS_DYING] = {
+ .name = "hrtimers:dying",
+ .startup.single = NULL,
+ .teardown.single = hrtimers_cpu_dying,
+ },
+
/* Entry state on starting. Interrupts enabled from here on. Transient
* state for synchronsization */
[CPUHP_AP_ONLINE] = {
diff --git a/kernel/cred.c b/kernel/cred.c
index a9f0f8b21d8c..8c58f0f63af2 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -101,17 +101,17 @@ static void put_cred_rcu(struct rcu_head *rcu)
#ifdef CONFIG_DEBUG_CREDENTIALS
if (cred->magic != CRED_MAGIC_DEAD ||
- atomic_read(&cred->usage) != 0 ||
+ atomic_long_read(&cred->usage) != 0 ||
read_cred_subscribers(cred) != 0)
panic("CRED: put_cred_rcu() sees %p with"
- " mag %x, put %p, usage %d, subscr %d\n",
+ " mag %x, put %p, usage %ld, subscr %d\n",
cred, cred->magic, cred->put_addr,
- atomic_read(&cred->usage),
+ atomic_long_read(&cred->usage),
read_cred_subscribers(cred));
#else
- if (atomic_read(&cred->usage) != 0)
- panic("CRED: put_cred_rcu() sees %p with usage %d\n",
- cred, atomic_read(&cred->usage));
+ if (atomic_long_read(&cred->usage) != 0)
+ panic("CRED: put_cred_rcu() sees %p with usage %ld\n",
+ cred, atomic_long_read(&cred->usage));
#endif
security_cred_free(cred);
@@ -134,11 +134,11 @@ static void put_cred_rcu(struct rcu_head *rcu)
*/
void __put_cred(struct cred *cred)
{
- kdebug("__put_cred(%p{%d,%d})", cred,
- atomic_read(&cred->usage),
+ kdebug("__put_cred(%p{%ld,%d})", cred,
+ atomic_long_read(&cred->usage),
read_cred_subscribers(cred));
- BUG_ON(atomic_read(&cred->usage) != 0);
+ BUG_ON(atomic_long_read(&cred->usage) != 0);
#ifdef CONFIG_DEBUG_CREDENTIALS
BUG_ON(read_cred_subscribers(cred) != 0);
cred->magic = CRED_MAGIC_DEAD;
@@ -161,8 +161,8 @@ void exit_creds(struct task_struct *tsk)
{
struct cred *cred;
- kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
- atomic_read(&tsk->cred->usage),
+ kdebug("exit_creds(%u,%p,%p,{%ld,%d})", tsk->pid, tsk->real_cred, tsk->cred,
+ atomic_long_read(&tsk->cred->usage),
read_cred_subscribers(tsk->cred));
cred = (struct cred *) tsk->real_cred;
@@ -197,7 +197,7 @@ const struct cred *get_task_cred(struct task_struct *task)
do {
cred = __task_cred((task));
BUG_ON(!cred);
- } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
+ } while (!atomic_long_inc_not_zero(&((struct cred *)cred)->usage));
rcu_read_unlock();
return cred;
@@ -215,7 +215,7 @@ struct cred *cred_alloc_blank(void)
if (!new)
return NULL;
- atomic_set(&new->usage, 1);
+ atomic_long_set(&new->usage, 1);
#ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC;
#endif
@@ -262,7 +262,7 @@ struct cred *prepare_creds(void)
memcpy(new, old, sizeof(struct cred));
new->non_rcu = 0;
- atomic_set(&new->usage, 1);
+ atomic_long_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_group_info(new->group_info);
get_uid(new->user);
@@ -338,8 +338,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
p->real_cred = get_cred(p->cred);
get_cred(p->cred);
alter_cred_subscribers(p->cred, 2);
- kdebug("share_creds(%p{%d,%d})",
- p->cred, atomic_read(&p->cred->usage),
+ kdebug("share_creds(%p{%ld,%d})",
+ p->cred, atomic_long_read(&p->cred->usage),
read_cred_subscribers(p->cred));
atomic_inc(&p->cred->user->processes);
return 0;
@@ -429,8 +429,8 @@ int commit_creds(struct cred *new)
struct task_struct *task = current;
const struct cred *old = task->real_cred;
- kdebug("commit_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
+ kdebug("commit_creds(%p{%ld,%d})", new,
+ atomic_long_read(&new->usage),
read_cred_subscribers(new));
BUG_ON(task->cred != old);
@@ -439,7 +439,7 @@ int commit_creds(struct cred *new)
validate_creds(old);
validate_creds(new);
#endif
- BUG_ON(atomic_read(&new->usage) < 1);
+ BUG_ON(atomic_long_read(&new->usage) < 1);
get_cred(new); /* we will require a ref for the subj creds too */
@@ -512,14 +512,14 @@ EXPORT_SYMBOL(commit_creds);
*/
void abort_creds(struct cred *new)
{
- kdebug("abort_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
+ kdebug("abort_creds(%p{%ld,%d})", new,
+ atomic_long_read(&new->usage),
read_cred_subscribers(new));
#ifdef CONFIG_DEBUG_CREDENTIALS
BUG_ON(read_cred_subscribers(new) != 0);
#endif
- BUG_ON(atomic_read(&new->usage) < 1);
+ BUG_ON(atomic_long_read(&new->usage) < 1);
put_cred(new);
}
EXPORT_SYMBOL(abort_creds);
@@ -535,8 +535,8 @@ const struct cred *override_creds(const struct cred *new)
{
const struct cred *old = current->cred;
- kdebug("override_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
+ kdebug("override_creds(%p{%ld,%d})", new,
+ atomic_long_read(&new->usage),
read_cred_subscribers(new));
validate_creds(old);
@@ -558,8 +558,8 @@ const struct cred *override_creds(const struct cred *new)
rcu_assign_pointer(current->cred, new);
alter_cred_subscribers(old, -1);
- kdebug("override_creds() = %p{%d,%d}", old,
- atomic_read(&old->usage),
+ kdebug("override_creds() = %p{%ld,%d}", old,
+ atomic_long_read(&old->usage),
read_cred_subscribers(old));
return old;
}
@@ -576,8 +576,8 @@ void revert_creds(const struct cred *old)
{
const struct cred *override = current->cred;
- kdebug("revert_creds(%p{%d,%d})", old,
- atomic_read(&old->usage),
+ kdebug("revert_creds(%p{%ld,%d})", old,
+ atomic_long_read(&old->usage),
read_cred_subscribers(old));
validate_creds(old);
@@ -637,7 +637,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
*new = *old;
new->non_rcu = 0;
- atomic_set(&new->usage, 1);
+ atomic_long_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_uid(new->user);
get_user_ns(new->user_ns);
@@ -760,8 +760,8 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
cred == tsk->cred ? "[eff]" : "");
printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
cred->magic, cred->put_addr);
- printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
- atomic_read(&cred->usage),
+ printk(KERN_ERR "CRED: ->usage=%ld, subscr=%d\n",
+ atomic_long_read(&cred->usage),
read_cred_subscribers(cred));
printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
from_kuid_munged(&init_user_ns, cred->uid),
@@ -833,9 +833,9 @@ EXPORT_SYMBOL(__validate_process_creds);
*/
void validate_creds_for_do_exit(struct task_struct *tsk)
{
- kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
+ kdebug("validate_creds_for_do_exit(%p,%p{%ld,%d})",
tsk->real_cred, tsk->cred,
- atomic_read(&tsk->cred->usage),
+ atomic_long_read(&tsk->cred->usage),
read_cred_subscribers(tsk->cred));
__validate_process_creds(tsk, __FILE__, __LINE__);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index dc6bf35e7884..7f78657dfa00 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -400,6 +400,13 @@ int kdb_set(int argc, const char **argv)
return KDB_ARGCOUNT;
/*
+ * Censor sensitive variables
+ */
+ if (strcmp(argv[1], "PROMPT") == 0 &&
+ !kdb_check_flags(KDB_ENABLE_MEM_READ, kdb_cmd_enabled, false))
+ return KDB_NOPERM;
+
+ /*
* Check for internal variables
*/
if (strcmp(argv[1], "KDBDEBUG") == 0) {
@@ -1299,14 +1306,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
*(cmd_hist[cmd_head]) = '\0';
do_full_getstr:
-#if defined(CONFIG_SMP)
+ /* PROMPT can only be set if we have MEM_READ permission. */
snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
raw_smp_processor_id());
-#else
- snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"));
-#endif
- if (defcmd_in_progress)
- strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
/*
* Fetch command from keyboard
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 597d40893862..4c7ffd094a57 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -369,8 +369,10 @@ static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
static void rmem_dma_device_release(struct reserved_mem *rmem,
struct device *dev)
{
- if (dev)
+ if (dev) {
dev->dma_mem = NULL;
+ dev->dma_mem = NULL;
+ }
}
static const struct reserved_mem_ops rmem_dma_ops = {
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 7c6cd00d0fca..c345a6e2f7b7 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -963,7 +963,7 @@ static int device_dma_allocations(struct device *dev, struct dma_debug_entry **o
static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data)
{
struct device *dev = data;
- struct dma_debug_entry *uninitialized_var(entry);
+ struct dma_debug_entry *entry;
int count;
if (dma_debug_disabled())
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 11e9bbad4878..164a2912d4a3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1133,6 +1133,11 @@ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
return 0;
}
+static int perf_mux_hrtimer_restart_ipi(void *arg)
+{
+ return perf_mux_hrtimer_restart(arg);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1705,28 +1710,34 @@ static inline void perf_event__state_init(struct perf_event *event)
PERF_EVENT_STATE_INACTIVE;
}
-static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
+static int __perf_event_read_size(u64 read_format, int nr_siblings)
{
int entry = sizeof(u64); /* value */
int size = 0;
int nr = 1;
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
size += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
size += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_ID)
+ if (read_format & PERF_FORMAT_ID)
entry += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_GROUP) {
+ if (read_format & PERF_FORMAT_LOST)
+ entry += sizeof(u64);
+
+ if (read_format & PERF_FORMAT_GROUP) {
nr += nr_siblings;
size += sizeof(u64);
}
- size += entry * nr;
- event->read_size = size;
+ /*
+ * Since perf_event_validate_size() limits this to 16k and inhibits
+ * adding more siblings, this will never overflow.
+ */
+ return size + nr * entry;
}
static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
@@ -1767,8 +1778,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
*/
static void perf_event__header_size(struct perf_event *event)
{
- __perf_event_read_size(event,
- event->group_leader->nr_siblings);
+ event->read_size =
+ __perf_event_read_size(event->attr.read_format,
+ event->group_leader->nr_siblings);
__perf_event_header_size(event, event->attr.sample_type);
}
@@ -1799,23 +1811,44 @@ static void perf_event__id_header_size(struct perf_event *event)
event->id_header_size = size;
}
+/*
+ * Check that adding an event to the group does not result in anybody
+ * overflowing the 64k event limit imposed by the output buffer.
+ *
+ * Specifically, check that the read_size for the event does not exceed 16k,
+ * read_size being the one term that grows with groups size. Since read_size
+ * depends on per-event read_format, also (re)check the existing events.
+ *
+ * This leaves 48k for the constant size fields and things like callchains,
+ * branch stacks and register sets.
+ */
static bool perf_event_validate_size(struct perf_event *event)
{
- /*
- * The values computed here will be over-written when we actually
- * attach the event.
- */
- __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
- __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
- perf_event__id_header_size(event);
+ struct perf_event *sibling, *group_leader = event->group_leader;
+
+ if (__perf_event_read_size(event->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+
+ if (__perf_event_read_size(group_leader->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
/*
- * Sum the lot; should not exceed the 64k limit we have on records.
- * Conservative limit to allow for callchains and other variable fields.
+ * When creating a new group leader, group_leader->ctx is initialized
+ * after the size has been validated, but we cannot safely use
+ * for_each_sibling_event() until group_leader->ctx is set. A new group
+ * leader cannot have any siblings yet, so we can safely skip checking
+ * the non-existent siblings.
*/
- if (event->read_size + event->header_size +
- event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
- return false;
+ if (event == group_leader)
+ return true;
+
+ for_each_sibling_event(sibling, group_leader) {
+ if (__perf_event_read_size(sibling->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+ }
return true;
}
@@ -1843,6 +1876,7 @@ static void perf_group_attach(struct perf_event *event)
list_add_tail(&event->sibling_list, &group_leader->sibling_list);
group_leader->nr_siblings++;
+ group_leader->group_generation++;
perf_event__header_size(group_leader);
@@ -1913,6 +1947,7 @@ static void perf_group_detach(struct perf_event *event)
if (event->group_leader != event) {
list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
+ event->group_leader->group_generation++;
goto out;
}
@@ -4750,7 +4785,7 @@ static int __perf_read_group_add(struct perf_event *leader,
u64 read_format, u64 *values)
{
struct perf_event_context *ctx = leader->ctx;
- struct perf_event *sub;
+ struct perf_event *sub, *parent;
unsigned long flags;
int n = 1; /* skip @nr */
int ret;
@@ -4760,6 +4795,33 @@ static int __perf_read_group_add(struct perf_event *leader,
return ret;
raw_spin_lock_irqsave(&ctx->lock, flags);
+ /*
+ * Verify the grouping between the parent and child (inherited)
+ * events is still in tact.
+ *
+ * Specifically:
+ * - leader->ctx->lock pins leader->sibling_list
+ * - parent->child_mutex pins parent->child_list
+ * - parent->ctx->mutex pins parent->sibling_list
+ *
+ * Because parent->ctx != leader->ctx (and child_list nests inside
+ * ctx->mutex), group destruction is not atomic between children, also
+ * see perf_event_release_kernel(). Additionally, parent can grow the
+ * group.
+ *
+ * Therefore it is possible to have parent and child groups in a
+ * different configuration and summing over such a beast makes no sense
+ * what so ever.
+ *
+ * Reject this.
+ */
+ parent = leader->parent;
+ if (parent &&
+ (parent->group_generation != leader->group_generation ||
+ parent->nr_siblings != leader->nr_siblings)) {
+ ret = -ECHILD;
+ goto unlock;
+ }
/*
* Since we co-schedule groups, {enabled,running} times of siblings
@@ -4782,15 +4844,20 @@ static int __perf_read_group_add(struct perf_event *leader,
values[n++] += perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
}
+unlock:
raw_spin_unlock_irqrestore(&ctx->lock, flags);
- return 0;
+ return ret;
}
static int perf_read_group(struct perf_event *event,
@@ -4809,10 +4876,6 @@ static int perf_read_group(struct perf_event *event,
values[0] = 1 + leader->nr_siblings;
- /*
- * By locking the child_mutex of the leader we effectively
- * lock the child list of all siblings.. XXX explain how.
- */
mutex_lock(&leader->child_mutex);
ret = __perf_read_group_add(leader, read_format, values);
@@ -4843,7 +4906,7 @@ static int perf_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
u64 enabled, running;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = __perf_event_read_value(event, &enabled, &running);
@@ -4853,6 +4916,8 @@ static int perf_read_one(struct perf_event *event,
values[n++] = running;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
if (copy_to_user(buf, values, n * sizeof(u64)))
return -EFAULT;
@@ -6162,7 +6227,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 enabled, u64 running)
{
u64 read_format = event->attr.read_format;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = perf_event_count(event);
@@ -6176,6 +6241,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
@@ -6186,7 +6253,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
- u64 values[5];
+ u64 values[6];
int n = 0;
values[n++] = 1 + leader->nr_siblings;
@@ -6204,6 +6271,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
@@ -6217,6 +6286,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
@@ -7843,8 +7914,8 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
hwc->interrupts = 1;
} else {
hwc->interrupts++;
- if (unlikely(throttle
- && hwc->interrupts >= max_samples_per_tick)) {
+ if (unlikely(throttle &&
+ hwc->interrupts > max_samples_per_tick)) {
__this_cpu_inc(perf_throttled_count);
tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
hwc->interrupts = MAX_INTERRUPTS;
@@ -9644,8 +9715,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- cpu_function_call(cpu,
- (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
+ cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpuctx);
}
cpus_read_unlock();
mutex_unlock(&mux_interval_mutex);
@@ -9682,13 +9752,15 @@ static int pmu_dev_alloc(struct pmu *pmu)
pmu->dev->groups = pmu->attr_groups;
device_initialize(pmu->dev);
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
- if (ret)
- goto free_dev;
dev_set_drvdata(pmu->dev, pmu);
pmu->dev->bus = &pmu_bus;
pmu->dev->release = pmu_dev_release;
+
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;
+
ret = device_add(pmu->dev);
if (ret)
goto free_dev;
@@ -10429,7 +10501,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
/*
* If its not a per-cpu rb, it must be the same task.
*/
- if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
goto out;
/*
@@ -10573,7 +10645,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event *group_leader = NULL, *output_event = NULL;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx, *uninitialized_var(gctx);
+ struct perf_event_context *ctx, *gctx;
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
@@ -11597,6 +11669,8 @@ static int inherit_group(struct perf_event *parent_event,
if (IS_ERR(child_ctr))
return PTR_ERR(child_ctr);
}
+ if (leader)
+ leader->group_generation = parent_event->group_generation;
return 0;
}
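
The __perf_event_read_size() rework above computes read_size as size + nr * entry, where entry grows by one u64 for each of PERF_FORMAT_ID and PERF_FORMAT_LOST and nr grows with the group size; perf_event_validate_size() then rejects any group member whose read_size would exceed 16k. A small standalone re-statement of that formula, with one worked example (the FMT_* bit values mirror PERF_FORMAT_* from the uapi header):

#include <stdio.h>
#include <stdint.h>

#define FMT_TOTAL_TIME_ENABLED (1ULL << 0)
#define FMT_TOTAL_TIME_RUNNING (1ULL << 1)
#define FMT_ID                 (1ULL << 2)
#define FMT_GROUP              (1ULL << 3)
#define FMT_LOST               (1ULL << 4)

static int read_size(uint64_t read_format, int nr_siblings)
{
    int entry = sizeof(uint64_t);   /* the counter value itself */
    int size  = 0;                  /* fixed header words */
    int nr    = 1;

    if (read_format & FMT_TOTAL_TIME_ENABLED) size  += sizeof(uint64_t);
    if (read_format & FMT_TOTAL_TIME_RUNNING) size  += sizeof(uint64_t);
    if (read_format & FMT_ID)                 entry += sizeof(uint64_t);
    if (read_format & FMT_LOST)               entry += sizeof(uint64_t);
    if (read_format & FMT_GROUP) {
        nr   += nr_siblings;        /* one entry per group member */
        size += sizeof(uint64_t);   /* leading "nr" field */
    }
    return size + nr * entry;
}

int main(void)
{
    /* leader + 3 siblings, ID and LOST requested: 8 + 4 * 24 = 104 bytes */
    printf("%d bytes\n", read_size(FMT_GROUP | FMT_ID | FMT_LOST, 3));
    return 0;
}
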
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 12f351b253bb..ddcbd03eccbb 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -165,8 +165,10 @@ __perf_output_begin(struct perf_output_handle *handle,
goto out;
if (unlikely(rb->paused)) {
- if (rb->nr_pages)
+ if (rb->nr_pages) {
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
+ }
goto out;
}
@@ -249,6 +251,7 @@ __perf_output_begin(struct perf_output_handle *handle,
fail:
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
perf_output_put_handle(handle);
out:
rcu_read_unlock();
@@ -639,6 +642,12 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
}
}
+ /*
+ * kcalloc_node() is unable to allocate buffer if the size is larger
+ * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case.
+ */
+ if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER)
+ return -ENOMEM;
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
node);
if (!rb->aux_pages)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 24342bca11f2..72ae05d65066 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1887,7 +1887,7 @@ static void handle_swbp(struct pt_regs *regs)
{
struct uprobe *uprobe;
unsigned long bp_vaddr;
- int uninitialized_var(is_swbp);
+ int is_swbp;
bp_vaddr = uprobe_get_swbp_addr(regs);
if (bp_vaddr == get_trampoline_vaddr())
diff --git a/kernel/exit.c b/kernel/exit.c
index 2a414fc71b87..cbe99ab0ac19 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -62,12 +62,59 @@
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
+#include <linux/sysfs.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
+/*
+ * The default value should be high enough to not crash a system that randomly
+ * crashes its kernel from time to time, but low enough to at least not permit
+ * overflowing 32-bit refcounts or the ldsem writer count.
+ */
+static unsigned int oops_limit = 10000;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_exit_table[] = {
+ {
+ .procname = "oops_limit",
+ .data = &oops_limit,
+ .maxlen = sizeof(oops_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ { }
+};
+
+static __init int kernel_exit_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_exit_table);
+ return 0;
+}
+late_initcall(kernel_exit_sysctls_init);
+#endif
+
+static atomic_t oops_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
+}
+
+static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);
+
+static __init int kernel_exit_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_exit_sysfs_init);
+#endif
+
static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
@@ -93,7 +140,7 @@ static void __exit_signal(struct task_struct *tsk)
struct signal_struct *sig = tsk->signal;
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
- struct tty_struct *uninitialized_var(tty);
+ struct tty_struct *tty;
u64 utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
@@ -922,6 +969,31 @@ void __noreturn do_exit(long code)
}
EXPORT_SYMBOL_GPL(do_exit);
+void __noreturn make_task_dead(int signr)
+{
+ /*
+ * Take the task off the cpu after something catastrophic has
+ * happened.
+ */
+ unsigned int limit;
+
+ /*
+ * Every time the system oopses, if the oops happens while a reference
+ * to an object was held, the reference leaks.
+ * If the oops doesn't also leak memory, repeated oopsing can cause
+ * reference counters to wrap around (if they're not using refcount_t).
+ * This means that repeated oopsing can make unexploitable-looking bugs
+ * exploitable through repeated oopsing.
+ * To make sure this can't happen, place an upper bound on how often the
+ * kernel may oops without panic().
+ */
+ limit = READ_ONCE(oops_limit);
+ if (atomic_inc_return(&oops_count) >= limit && limit)
+ panic("Oopsed too often (kernel.oops_limit is %d)", limit);
+
+ do_exit(signr);
+}
+
void complete_and_exit(struct completion *comp, long code)
{
if (comp)
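
The exit.c hunks above add an oops budget: make_task_dead() bumps a global counter and panics once kernel.oops_limit (default 10000) is reached, so repeated oopses cannot be used to slowly wrap leaked reference counts. A hedged userspace sketch of the same counting rule:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

static atomic_int oops_count;
static int oops_limit = 10000;          /* like the kernel.oops_limit sysctl */

static void record_oops(void)
{
    int n = atomic_fetch_add(&oops_count, 1) + 1;   /* new counter value */

    if (oops_limit && n >= oops_limit) {
        fprintf(stderr, "Oopsed too often (limit %d)\n", oops_limit);
        abort();                        /* stand-in for panic() */
    }
}
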
diff --git a/kernel/extable.c b/kernel/extable.c
index 6a5b61ebc66c..b3ca75d6bf92 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -46,7 +46,8 @@ u32 __initdata __visible main_extable_sort_needed = 1;
/* Sort the kernel's built-in exception table */
void __init sort_main_extable(void)
{
- if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
+ if (main_extable_sort_needed &&
+ &__stop___ex_table > &__start___ex_table) {
pr_notice("Sorting __ex_table...\n");
sort_extable(__start___ex_table, __stop___ex_table);
}
diff --git a/kernel/futex.c b/kernel/futex.c
index 25294744ca36..9f64bbc8b57e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1400,7 +1400,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
int err;
- u32 uninitialized_var(curval);
+ u32 curval;
if (unlikely(should_fail_futex(true)))
return -EFAULT;
@@ -1571,7 +1571,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
*/
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
- u32 uninitialized_var(curval), newval;
+ u32 curval, newval;
struct task_struct *new_owner;
bool postunlock = false;
DEFINE_WAKE_Q(wake_q);
@@ -3114,7 +3114,7 @@ uaddr_faulted:
*/
static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
- u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
+ u32 curval, uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb;
struct futex_q *top_waiter;
@@ -3629,7 +3629,7 @@ err_unlock:
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
bool pi, bool pending_op)
{
- u32 uval, uninitialized_var(nval), mval;
+ u32 uval, nval, mval;
int err;
/* Futex address must be 32bit aligned */
@@ -3759,7 +3759,7 @@ static void exit_robust_list(struct task_struct *curr)
struct robust_list_head __user *head = curr->robust_list;
struct robust_list __user *entry, *next_entry, *pending;
unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int uninitialized_var(next_pi);
+ unsigned int next_pi;
unsigned long futex_offset;
int rc;
@@ -4058,7 +4058,7 @@ static void compat_exit_robust_list(struct task_struct *curr)
struct compat_robust_list_head __user *head = curr->compat_robust_list;
struct robust_list __user *entry, *next_entry, *pending;
unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int uninitialized_var(next_pi);
+ unsigned int next_pi;
compat_uptr_t uentry, next_uentry, upending;
compat_long_t futex_offset;
int rc;
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index bb6ccf456e2d..1c2bd1f23536 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -85,6 +85,7 @@ struct gcov_fn_info {
* @version: gcov version magic indicating the gcc version used for compilation
* @next: list head for a singly-linked list
* @stamp: uniquifying time stamp
+ * @checksum: unique object checksum
* @filename: name of the associated gcov data file
* @merge: merge functions (null for unused counter type)
* @n_functions: number of instrumented functions
@@ -97,6 +98,10 @@ struct gcov_info {
unsigned int version;
struct gcov_info *next;
unsigned int stamp;
+ /* Since GCC 12.1 a checksum field is added. */
+#if (__GNUC__ >= 12)
+ unsigned int checksum;
+#endif
const char *filename;
void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
unsigned int n_functions;
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index e2999a070a99..4195e7ad1ff2 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -537,21 +537,34 @@ EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
unsigned int clr, unsigned int set)
{
- unsigned int i = gc->irq_base;
+ unsigned int i, virq;
raw_spin_lock(&gc_lock);
list_del(&gc->list);
raw_spin_unlock(&gc_lock);
- for (; msk; msk >>= 1, i++) {
+ for (i = 0; msk; msk >>= 1, i++) {
if (!(msk & 0x01))
continue;
+ /*
+ * Interrupt domain based chips store the base hardware
+ * interrupt number in gc::irq_base. Otherwise gc::irq_base
+ * contains the base Linux interrupt number.
+ */
+ if (gc->domain) {
+ virq = irq_find_mapping(gc->domain, gc->irq_base + i);
+ if (!virq)
+ continue;
+ } else {
+ virq = gc->irq_base + i;
+ }
+
/* Remove handler first. That will mask the irq line */
- irq_set_handler(i, NULL);
- irq_set_chip(i, &no_irq_chip);
- irq_set_chip_data(i, NULL);
- irq_modify_status(i, clr, set);
+ irq_set_handler(virq, NULL);
+ irq_set_chip(virq, &no_irq_chip);
+ irq_set_chip_data(virq, NULL);
+ irq_modify_status(virq, clr, set);
}
}
EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 1e42fc2ad4d5..3ebe231a5eda 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -494,6 +494,9 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
return;
hwirq = irq_data->hwirq;
+
+ mutex_lock(&irq_domain_mutex);
+
irq_set_status_flags(irq, IRQ_NOREQUEST);
/* remove chip and handler */
@@ -513,10 +516,12 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
/* Clear reverse map for this hwirq */
irq_domain_clear_mapping(domain, hwirq);
+
+ mutex_unlock(&irq_domain_mutex);
}
-int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq)
+static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
{
struct irq_data *irq_data = irq_get_irq_data(virq);
int ret;
@@ -529,7 +534,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
irq_data->hwirq = hwirq;
irq_data->domain = domain;
if (domain->ops->map) {
@@ -546,7 +550,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
}
irq_data->domain = NULL;
irq_data->hwirq = 0;
- mutex_unlock(&irq_domain_mutex);
return ret;
}
@@ -557,12 +560,23 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
domain->mapcount++;
irq_domain_set_mapping(domain, hwirq, irq_data);
- mutex_unlock(&irq_domain_mutex);
irq_clear_status_flags(virq, IRQ_NOREQUEST);
return 0;
}
+
+int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
+{
+ int ret;
+
+ mutex_lock(&irq_domain_mutex);
+ ret = irq_domain_associate_locked(domain, virq, hwirq);
+ mutex_unlock(&irq_domain_mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(irq_domain_associate);
void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
@@ -818,13 +832,8 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
}
irq_data = irq_get_irq_data(virq);
- if (!irq_data) {
- if (irq_domain_is_hierarchy(domain))
- irq_domain_free_irqs(virq, 1);
- else
- irq_dispose_mapping(virq);
+ if (WARN_ON(!irq_data))
return 0;
- }
/* Store trigger type */
irqd_set_trigger_type(irq_data, type);
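
The irqdomain change above is the familiar "_locked helper plus locking wrapper" split: irq_domain_associate_locked() assumes irq_domain_mutex is held, while irq_domain_associate() preserves the old external behaviour by taking the mutex around it, letting internal paths that already hold the mutex call the helper directly. A generic sketch of the pattern with a pthread mutex (names are illustrative, not the kernel API):

#include <pthread.h>

static pthread_mutex_t domain_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Callers that already hold domain_mutex use the _locked variant directly. */
static int associate_locked(int virq, unsigned long hwirq)
{
    /* ... update the mapping; domain_mutex is held by the caller ... */
    (void)virq; (void)hwirq;
    return 0;
}

/* Public entry point keeps the old contract: take the lock, then delegate. */
int associate(int virq, unsigned long hwirq)
{
    int ret;

    pthread_mutex_lock(&domain_mutex);
    ret = associate_locked(virq, hwirq);
    pthread_mutex_unlock(&domain_mutex);
    return ret;
}
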
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 8e586858bcf4..d25edbb87119 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -466,16 +466,16 @@ unsigned int irq_matrix_reserved(struct irq_matrix *m)
}
/**
- * irq_matrix_allocated - Get the number of allocated irqs on the local cpu
+ * irq_matrix_allocated - Get the number of allocated non-managed irqs on the local CPU
* @m: Pointer to the matrix to search
*
- * This returns number of allocated irqs
+ * This returns number of allocated non-managed interrupts.
*/
unsigned int irq_matrix_allocated(struct irq_matrix *m)
{
struct cpumap *cm = this_cpu_ptr(m->maps);
- return cm->allocated;
+ return cm->allocated - cm->managed_allocated;
}
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 6b3d7f7211dd..3666d434a8f5 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1020,6 +1020,7 @@ int crash_shrink_memory(unsigned long new_size)
start = crashk_res.start;
end = crashk_res.end;
old_size = (end == 0) ? 0 : end - start + 1;
+ new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN);
if (new_size >= old_size) {
ret = (new_size == old_size) ? 0 : -EINVAL;
goto unlock;
@@ -1031,9 +1032,7 @@ int crash_shrink_memory(unsigned long new_size)
goto unlock;
}
- start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
- end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
-
+ end = start + new_size;
crash_free_reserved_phys_range(end, crashk_res.end);
if ((start == end) && (crashk_res.parent != NULL))
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index ab1934a2b2e6..416c1e0fdc91 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -793,10 +793,22 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
}
offset = ALIGN(offset, align);
+
+ /*
+ * Check if the segment contains the entry point, if so,
+ * calculate the value of image->start based on it.
+ * If the compiler has produced more than one .text section
+ * (Eg: .text.hot), they are generally after the main .text
+ * section, and they shall not be used to calculate
+ * image->start. So do not re-calculate image->start if it
+ * is not set to the initial value, and warn the user so they
+ * have a chance to fix their purgatory's linker script.
+ */
if (sechdrs[i].sh_flags & SHF_EXECINSTR &&
pi->ehdr->e_entry >= sechdrs[i].sh_addr &&
pi->ehdr->e_entry < (sechdrs[i].sh_addr
- + sechdrs[i].sh_size)) {
+ + sechdrs[i].sh_size) &&
+ !WARN_ON(kbuf->image->start != pi->ehdr->e_entry)) {
kbuf->image->start -= sechdrs[i].sh_addr;
kbuf->image->start += kbuf->mem + offset;
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 33aba4e2e3e3..e1fb6453e8e9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -418,8 +418,8 @@ static inline int kprobe_optready(struct kprobe *p)
return 0;
}
-/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
-static inline int kprobe_disarmed(struct kprobe *p)
+/* Return true if the kprobe is disarmed. Note: p must be on hash list */
+bool kprobe_disarmed(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -626,7 +626,7 @@ void wait_for_kprobe_optimizer(void)
mutex_unlock(&kprobe_mutex);
}
-static bool optprobe_queued_unopt(struct optimized_kprobe *op)
+bool optprobe_queued_unopt(struct optimized_kprobe *op)
{
struct optimized_kprobe *_op;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4dd83428b56d..87006e19c9b0 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1246,7 +1246,7 @@ static int noop_count(struct lock_list *entry, void *data)
static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
{
unsigned long count = 0;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
__bfs_forwards(this, (void *)&count, noop_count, &target_entry);
@@ -1274,7 +1274,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
{
unsigned long count = 0;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
__bfs_backwards(this, (void *)&count, noop_count, &target_entry);
@@ -2662,7 +2662,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
{
int ret;
struct lock_list root;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
root.parent = NULL;
root.class = hlock_class(this);
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 65a3b7e55b9f..4fd05d9d5d6d 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -439,7 +439,6 @@ retry:
} while (!time_after(jiffies, stress->timeout));
kfree(order);
- kfree(stress);
}
struct reorder_lock {
@@ -504,7 +503,6 @@ out:
list_for_each_entry_safe(ll, ln, &locks, link)
kfree(ll);
kfree(order);
- kfree(stress);
}
static void stress_one_work(struct work_struct *work)
@@ -525,8 +523,6 @@ static void stress_one_work(struct work_struct *work)
break;
}
} while (!time_after(jiffies, stress->timeout));
-
- kfree(stress);
}
#define STRESS_INORDER BIT(0)
@@ -537,15 +533,24 @@ static void stress_one_work(struct work_struct *work)
static int stress(int nlocks, int nthreads, unsigned int flags)
{
struct ww_mutex *locks;
- int n;
+ struct stress *stress_array;
+ int n, count;
locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL);
if (!locks)
return -ENOMEM;
+ stress_array = kmalloc_array(nthreads, sizeof(*stress_array),
+ GFP_KERNEL);
+ if (!stress_array) {
+ kfree(locks);
+ return -ENOMEM;
+ }
+
for (n = 0; n < nlocks; n++)
ww_mutex_init(&locks[n], &ww_class);
+ count = 0;
for (n = 0; nthreads; n++) {
struct stress *stress;
void (*fn)(struct work_struct *work);
@@ -569,9 +574,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
if (!fn)
continue;
- stress = kmalloc(sizeof(*stress), GFP_KERNEL);
- if (!stress)
- break;
+ stress = &stress_array[count++];
INIT_WORK(&stress->work, fn);
stress->locks = locks;
@@ -586,6 +589,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
for (n = 0; n < nlocks; n++)
ww_mutex_destroy(&locks[n]);
+ kfree(stress_array);
kfree(locks);
return 0;
diff --git a/kernel/module.c b/kernel/module.c
index 42a604401c4d..2ec961945fa8 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2261,15 +2261,26 @@ static void free_module(struct module *mod)
void *__symbol_get(const char *symbol)
{
struct module *owner;
+ enum mod_license license;
const struct kernel_symbol *sym;
preempt_disable();
- sym = find_symbol(symbol, &owner, NULL, NULL, true, true);
- if (sym && strong_try_module_get(owner))
+ sym = find_symbol(symbol, &owner, NULL, &license, true, true);
+ if (!sym)
+ goto fail;
+ if (license != GPL_ONLY) {
+ pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n",
+ symbol);
+ goto fail;
+ }
+ if (strong_try_module_get(owner))
sym = NULL;
preempt_enable();
return sym ? (void *)kernel_symbol_value(sym) : NULL;
+fail:
+ preempt_enable();
+ return NULL;
}
EXPORT_SYMBOL_GPL(__symbol_get);
@@ -3478,7 +3489,8 @@ static bool finished_loading(const char *name)
sched_annotate_sleep();
mutex_lock(&module_mutex);
mod = find_module_all(name, strlen(name), true);
- ret = !mod || mod->state == MODULE_STATE_LIVE;
+ ret = !mod || mod->state == MODULE_STATE_LIVE
+ || mod->state == MODULE_STATE_GOING;
mutex_unlock(&module_mutex);
return ret;
@@ -3632,20 +3644,35 @@ static int add_unformed_module(struct module *mod)
mod->state = MODULE_STATE_UNFORMED;
-again:
mutex_lock(&module_mutex);
old = find_module_all(mod->name, strlen(mod->name), true);
if (old != NULL) {
- if (old->state != MODULE_STATE_LIVE) {
+ if (old->state == MODULE_STATE_COMING
+ || old->state == MODULE_STATE_UNFORMED) {
/* Wait in case it fails to load. */
mutex_unlock(&module_mutex);
err = wait_event_interruptible(module_wq,
finished_loading(mod->name));
if (err)
goto out_unlocked;
- goto again;
+
+ /* The module might have gone in the meantime. */
+ mutex_lock(&module_mutex);
+ old = find_module_all(mod->name, strlen(mod->name),
+ true);
}
- err = -EEXIST;
+
+ /*
+ * We are here only when the same module was being loaded. Do
+ * not try to load it again right now. It prevents long delays
+ * caused by serialized module load failures. It might happen
+ * when more devices of the same type trigger load of
+ * a particular module.
+ */
+ if (old && old->state == MODULE_STATE_LIVE)
+ err = -EEXIST;
+ else
+ err = -EBUSY;
goto out;
}
mod_update_bounds(mod);
diff --git a/kernel/padata.c b/kernel/padata.c
index 7f2b6d369fd4..a9e14183e188 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -121,7 +121,7 @@ int padata_do_parallel(struct padata_instance *pinst,
if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
goto out;
- err = -EBUSY;
+ err = -EBUSY;
if ((pinst->flags & PADATA_RESET))
goto out;
diff --git a/kernel/panic.c b/kernel/panic.c
index fa114be5c30f..51d6c00f9d7a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -29,6 +29,7 @@
#include <linux/bug.h>
#include <linux/ratelimit.h>
#include <linux/debugfs.h>
+#include <linux/sysfs.h>
#include <asm/sections.h>
#define PANIC_TIMER_STEP 100
@@ -42,6 +43,7 @@ static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
bool crash_kexec_post_notifiers;
int panic_on_warn __read_mostly;
+static unsigned int warn_limit __read_mostly;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -50,6 +52,45 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_panic_table[] = {
+ {
+ .procname = "warn_limit",
+ .data = &warn_limit,
+ .maxlen = sizeof(warn_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ { }
+};
+
+static __init int kernel_panic_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_panic_table);
+ return 0;
+}
+late_initcall(kernel_panic_sysctls_init);
+#endif
+
+static atomic_t warn_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&warn_count));
+}
+
+static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count);
+
+static __init int kernel_panic_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_panic_sysfs_init);
+#endif
+
static long no_blink(int state)
{
return 0;
@@ -125,6 +166,19 @@ void nmi_panic(struct pt_regs *regs, const char *msg)
}
EXPORT_SYMBOL(nmi_panic);
+void check_panic_on_warn(const char *origin)
+{
+ unsigned int limit;
+
+ if (panic_on_warn)
+ panic("%s: panic_on_warn set ...\n", origin);
+
+ limit = READ_ONCE(warn_limit);
+ if (atomic_inc_return(&warn_count) >= limit && limit)
+ panic("%s: system warned too often (kernel.warn_limit is %d)",
+ origin, limit);
+}
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -142,6 +196,16 @@ void panic(const char *fmt, ...)
int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ }
+
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
@@ -532,16 +596,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
if (args)
vprintk(args->fmt, args->args);
- if (panic_on_warn) {
- /*
- * This thread may hit another WARN() in the panic path.
- * Resetting this prevents additional WARN() from panicking the
- * system on this thread. Other threads are blocked by the
- * panic_mutex in panic().
- */
- panic_on_warn = 0;
- panic("panic_on_warn set ...\n");
- }
+ check_panic_on_warn("kernel");
print_modules();
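
A rough userspace simulation of the new warn accounting; the limit value here is assumed, whereas the kernel reads it from the kernel.warn_limit sysctl and exposes the counter via sysfs:

    /* Simulation only: warn_limit is assumed; the kernel uses an atomic counter. */
    #include <stdio.h>

    static unsigned int warn_limit = 3;   /* as if kernel.warn_limit were set to 3 */
    static unsigned int warn_count;

    static void warn_once(const char *origin)
    {
            if (++warn_count >= warn_limit && warn_limit)
                    printf("panic: %s: system warned too often (kernel.warn_limit is %u)\n",
                           origin, warn_limit);
    }

    int main(void)
    {
            for (int i = 0; i < 3; i++)
                    warn_once("kernel");   /* the third warning trips the limit */
            return 0;
    }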
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f2635fc751d9..5abe4582df1b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2376,8 +2376,9 @@ static void *get_highmem_page_buffer(struct page *page,
pbe->copy_page = tmp;
} else {
/* Copy of the page will be stored in normal memory */
- kaddr = safe_pages_list;
- safe_pages_list = safe_pages_list->next;
+ kaddr = __get_safe_page(ca->gfp_mask);
+ if (!kaddr)
+ return ERR_PTR(-ENOMEM);
pbe->copy_page = virt_to_page(kaddr);
}
pbe->next = highmem_pblist;
@@ -2557,8 +2558,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
return ERR_PTR(-ENOMEM);
}
pbe->orig_address = page_address(page);
- pbe->address = safe_pages_list;
- safe_pages_list = safe_pages_list->next;
+ pbe->address = __get_safe_page(ca->gfp_mask);
+ if (!pbe->address)
+ return ERR_PTR(-ENOMEM);
pbe->next = restore_pblist;
restore_pblist = pbe;
return pbe->address;
@@ -2589,8 +2591,6 @@ int snapshot_write_next(struct snapshot_handle *handle)
if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
return 0;
- handle->sync_read = 1;
-
if (!handle->cur) {
if (!buffer)
/* This makes the buffer be freed by swsusp_free() */
@@ -2631,7 +2631,6 @@ int snapshot_write_next(struct snapshot_handle *handle)
memory_bm_position_reset(&orig_bm);
restore_pblist = NULL;
handle->buffer = get_buffer(&orig_bm, &ca);
- handle->sync_read = 0;
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
}
@@ -2643,9 +2642,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
handle->buffer = get_buffer(&orig_bm, &ca);
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
- if (handle->buffer != buffer)
- handle->sync_read = 0;
}
+ handle->sync_read = (handle->buffer == buffer);
handle->cur++;
return PAGE_SIZE;
}
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index ac6d6fdf5783..289a26b81fd7 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -570,7 +570,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
mask = leaf_node_cpu_bit(rnp, cpu);
if (!(rnp->expmask & mask))
continue;
+ preempt_disable(); // For smp_processor_id() in dump_cpu_task().
dump_cpu_task(cpu);
+ preempt_enable();
}
}
jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
diff --git a/kernel/relay.c b/kernel/relay.c
index 735cb208f023..e6f70f4c41a3 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -163,13 +163,13 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan)
{
struct rchan_buf *buf;
- if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t *))
+ if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t))
return NULL;
buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
if (!buf)
return NULL;
- buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t *),
+ buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t),
GFP_KERNEL);
if (!buf->padding)
goto free_buf;
@@ -997,14 +997,14 @@ static void relay_file_read_consume(struct rchan_buf *buf,
/*
* relay_file_read_avail - boolean, are there unconsumed bytes available?
*/
-static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
+static int relay_file_read_avail(struct rchan_buf *buf)
{
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t produced = buf->subbufs_produced;
size_t consumed = buf->subbufs_consumed;
- relay_file_read_consume(buf, read_pos, 0);
+ relay_file_read_consume(buf, 0, 0);
consumed = buf->subbufs_consumed;
@@ -1065,23 +1065,21 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos,
/**
* relay_file_read_start_pos - find the first available byte to read
- * @read_pos: file read position
* @buf: relay channel buffer
*
- * If the @read_pos is in the middle of padding, return the
+ * If the read_pos is in the middle of padding, return the
* position of the first actually available byte, otherwise
* return the original value.
*/
-static size_t relay_file_read_start_pos(size_t read_pos,
- struct rchan_buf *buf)
+static size_t relay_file_read_start_pos(struct rchan_buf *buf)
{
size_t read_subbuf, padding, padding_start, padding_end;
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t consumed = buf->subbufs_consumed % n_subbufs;
+ size_t read_pos = (consumed * subbuf_size + buf->bytes_consumed)
+ % (n_subbufs * subbuf_size);
- if (!read_pos)
- read_pos = consumed * subbuf_size + buf->bytes_consumed;
read_subbuf = read_pos / subbuf_size;
padding = buf->padding[read_subbuf];
padding_start = (read_subbuf + 1) * subbuf_size - padding;
@@ -1137,10 +1135,10 @@ static ssize_t relay_file_read(struct file *filp,
do {
void *from;
- if (!relay_file_read_avail(buf, *ppos))
+ if (!relay_file_read_avail(buf))
break;
- read_start = relay_file_read_start_pos(*ppos, buf);
+ read_start = relay_file_read_start_pos(buf);
avail = relay_file_read_subbuf_avail(read_start, buf);
if (!avail)
break;
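
A small worked example of the read position that relay_file_read_start_pos() now derives on its own, with assumed sub-buffer geometry:

    /* Worked example; all values are assumed. */
    #include <stdio.h>

    int main(void)
    {
            size_t subbuf_size = 4096, n_subbufs = 4;
            size_t subbufs_consumed = 6, bytes_consumed = 100;

            size_t consumed = subbufs_consumed % n_subbufs;              /* 2 */
            size_t read_pos = (consumed * subbuf_size + bytes_consumed)
                              % (n_subbufs * subbuf_size);               /* 8292 */
            printf("read_pos = %zu\n", read_pos);
            return 0;
    }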
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 91683193a33f..c6d3847eebf4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -801,6 +801,9 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
+ if (task_on_rq_migrating(p))
+ flags |= ENQUEUE_MIGRATED;
+
if (task_contributes_to_load(p))
rq->nr_uninterruptible--;
@@ -3380,8 +3383,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_ip_sym(preempt_disable_ip);
pr_cont("\n");
}
- if (panic_on_warn)
- panic("scheduling while atomic\n");
+ check_panic_on_warn("scheduling while atomic");
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
@@ -5060,14 +5062,14 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
if (len & (sizeof(unsigned long)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
unsigned int retlen = min(len, cpumask_size());
- if (copy_to_user(user_mask_ptr, mask, retlen))
+ if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
ret = -EFAULT;
else
ret = retlen;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 97ebedeeafce..4d4cc7400de1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3854,6 +3854,29 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
#endif
}
+static inline bool entity_is_long_sleeper(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+ u64 sleep_time;
+
+ if (se->exec_start == 0)
+ return false;
+
+ cfs_rq = cfs_rq_of(se);
+
+ sleep_time = rq_clock_task(rq_of(cfs_rq));
+
+ /* Happens while migrating because of clock task divergence */
+ if (sleep_time <= se->exec_start)
+ return false;
+
+ sleep_time -= se->exec_start;
+ if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
+ return true;
+
+ return false;
+}
+
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
@@ -3882,8 +3905,29 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
vruntime -= thresh;
}
- /* ensure we never gain time by being placed backwards. */
- se->vruntime = max_vruntime(se->vruntime, vruntime);
+ /*
+ * Pull vruntime of the entity being placed to the base level of
+ * cfs_rq, to prevent boosting it if placed backwards.
+ * However, min_vruntime can advance much faster than real time, with
+ * the extreme being when an entity with the minimal weight always runs
+ * on the cfs_rq. If the waking entity slept for a long time, its
+ * vruntime difference from min_vruntime may overflow s64 and their
+ * comparison may get inverted, so ignore the entity's original
+ * vruntime in that case.
+ * The maximal vruntime speedup is given by the ratio of normal to
+ * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
+ * When placing a migrated waking entity, its exec_start has been set
+ * from a different rq. In order to take into account a possible
+ * divergence between new and prev rq's clocks task because of irq and
+ * stolen time, we take an additional margin.
+ * So, cutting off on the sleep time of
+ * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
+ * should be safe.
+ */
+ if (entity_is_long_sleeper(se))
+ se->vruntime = vruntime;
+ else
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
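
The 104-day figure in the comment above can be checked directly; a minimal userspace computation assuming scale_load_down(NICE_0_LOAD) == 1024:

    /* Quick check of the cutoff; the 1024 weight scale is assumed. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long long cutoff_ns = (1ULL << 63) / 1024;       /* ~9.0e15 ns */

            printf("%.1f days\n", cutoff_ns / 1e9 / 3600.0 / 24.0);   /* ~104 days */
            return 0;
    }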
@@ -3978,6 +4022,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
+ /* Entity has migrated, no longer consider this task hot */
+ if (flags & ENQUEUE_MIGRATED)
+ se->exec_start = 0;
check_schedstat_required();
update_stats_enqueue(cfs_rq, se, flags);
@@ -6554,9 +6601,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
/* Tell new CPU we are migrated */
p->se.avg.last_update_time = 0;
- /* We have migrated, no longer consider this task hot */
- p->se.exec_start = 0;
-
update_scan_period(p, new_cpu);
}
@@ -8687,7 +8731,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
- .dst_grpmask = sched_group_span(sd->groups),
+ .dst_grpmask = group_balance_mask(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 44a17366c8ec..4e3d149d64ad 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -53,17 +53,18 @@ __setup("hlt", cpu_idle_nopoll_setup);
static noinline int __cpuidle cpu_idle_poll(void)
{
+ trace_cpu_idle(0, smp_processor_id());
+ stop_critical_timings();
rcu_idle_enter();
- trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
- stop_critical_timings();
while (!tif_need_resched() &&
- (cpu_idle_force_poll || tick_check_broadcast_expired()))
+ (cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
- start_critical_timings();
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+
rcu_idle_exit();
+ start_critical_timings();
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
return 1;
}
@@ -90,7 +91,9 @@ void __cpuidle default_idle_call(void)
local_irq_enable();
} else {
stop_critical_timings();
+ rcu_idle_enter();
arch_cpu_idle();
+ rcu_idle_exit();
start_critical_timings();
}
}
@@ -148,7 +151,6 @@ static void cpuidle_idle_call(void)
if (cpuidle_not_available(drv, dev)) {
tick_nohz_idle_stop_tick();
- rcu_idle_enter();
default_idle_call();
goto exit_idle;
@@ -166,19 +168,15 @@ static void cpuidle_idle_call(void)
if (idle_should_enter_s2idle() || dev->use_deepest_state) {
if (idle_should_enter_s2idle()) {
- rcu_idle_enter();
entered_state = cpuidle_enter_s2idle(drv, dev);
if (entered_state > 0) {
local_irq_enable();
goto exit_idle;
}
-
- rcu_idle_exit();
}
tick_nohz_idle_stop_tick();
- rcu_idle_enter();
next_state = cpuidle_find_deepest_state(drv, dev);
call_cpuidle(drv, dev, next_state);
@@ -195,8 +193,6 @@ static void cpuidle_idle_call(void)
else
tick_nohz_idle_retain_tick();
- rcu_idle_enter();
-
entered_state = call_cpuidle(drv, dev, next_state);
/*
* Give the governor an opportunity to reflect on the outcome
@@ -212,8 +208,6 @@ exit_idle:
*/
if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
-
- rcu_idle_exit();
}
/*
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index b15428ede6cf..41e73baf41ab 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1522,6 +1522,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
BUG_ON(idx >= MAX_RT_PRIO);
queue = array->queue + idx;
+ if (SCHED_WARN_ON(list_empty(queue)))
+ return NULL;
next = list_entry(queue->next, struct sched_rt_entity, run_list);
return next;
@@ -1535,7 +1537,8 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
do {
rt_se = pick_next_rt_entity(rq, rt_rq);
- BUG_ON(!rt_se);
+ if (unlikely(!rt_se))
+ return NULL;
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);
diff --git a/kernel/sys.c b/kernel/sys.c
index d0663f8e6fb8..3548467f6459 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1530,6 +1530,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
if (resource >= RLIM_NLIMITS)
return -EINVAL;
+ resource = array_index_nospec(resource, RLIM_NLIMITS);
+
if (new_rlim) {
if (new_rlim->rlim_cur > new_rlim->rlim_max)
return -EINVAL;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index e7b983df6996..85c7f4a6b9c1 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -476,11 +476,35 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
}
EXPORT_SYMBOL_GPL(alarm_forward);
-u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
+static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle)
{
struct alarm_base *base = &alarm_bases[alarm->type];
+ ktime_t now = base->gettime();
+
+ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) {
+ /*
+ * Same issue as with posix_timer_fn(). Timers which are
+ * periodic but the signal is ignored can starve the system
+ * with a very small interval. The real fix which was
+ * promised in the context of posix_timer_fn() never
+ * materialized, but someone should really work on it.
+ *
+ * To prevent DOS, fake @now to be 1 jiffy out, which keeps
+ * the overrun accounting correct but creates an
+ * inconsistency vs. timer_gettime(2).
+ */
+ ktime_t kj = NSEC_PER_SEC / HZ;
+
+ if (interval < kj)
+ now = ktime_add(now, kj);
+ }
+
+ return alarm_forward(alarm, now, interval);
+}
- return alarm_forward(alarm, base->gettime(), interval);
+u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
+{
+ return __alarm_forward_now(alarm, interval, false);
}
EXPORT_SYMBOL_GPL(alarm_forward_now);
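
For a feel of the throttle threshold, a userspace estimate of one jiffy in nanoseconds for a few assumed HZ values (the kernel uses its configured HZ):

    /* Illustration only: HZ values are assumed. */
    #include <stdio.h>

    int main(void)
    {
            long hz[] = { 100, 250, 1000 };

            for (int i = 0; i < 3; i++)
                    printf("HZ=%ld: intervals below %lld ns get 'now' pushed out one jiffy\n",
                           hz[i], 1000000000LL / hz[i]);
            return 0;
    }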
@@ -554,9 +578,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
if (posix_timer_event(ptr, si_private) && ptr->it_interval) {
/*
* Handle ignored signals and rearm the timer. This will go
- * away once we handle ignored signals proper.
+ * away once we handle ignored signals properly. Ensure that
+ * small intervals cannot starve the system.
*/
- ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval);
+ ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true);
++ptr->it_requeue_pending;
ptr->it_active = 1;
result = ALARMTIMER_RESTART;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index a84673149881..59a754881976 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1905,6 +1905,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
if (!timespec64_valid(&tu))
return -EINVAL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
@@ -1925,6 +1926,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
if (!timespec64_valid(&tu))
return -EINVAL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
@@ -2020,29 +2022,22 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
}
}
-int hrtimers_dead_cpu(unsigned int scpu)
+int hrtimers_cpu_dying(unsigned int dying_cpu)
{
struct hrtimer_cpu_base *old_base, *new_base;
- int i;
+ int i, ncpu = cpumask_first(cpu_active_mask);
- BUG_ON(cpu_online(scpu));
- tick_cancel_sched_timer(scpu);
+ tick_cancel_sched_timer(dying_cpu);
+
+ old_base = this_cpu_ptr(&hrtimer_bases);
+ new_base = &per_cpu(hrtimer_bases, ncpu);
- /*
- * this BH disable ensures that raise_softirq_irqoff() does
- * not wakeup ksoftirqd (and acquire the pi-lock) while
- * holding the cpu_base lock
- */
- local_bh_disable();
- local_irq_disable();
- old_base = &per_cpu(hrtimer_bases, scpu);
- new_base = this_cpu_ptr(&hrtimer_bases);
/*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
*/
- raw_spin_lock(&new_base->lock);
- raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock(&old_base->lock);
+ raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
migrate_hrtimer_list(&old_base->clock_base[i],
@@ -2053,15 +2048,13 @@ int hrtimers_dead_cpu(unsigned int scpu)
* The migration might have changed the first expiring softirq
* timer on this CPU. Update it.
*/
- hrtimer_update_softirq_timer(new_base, false);
+ __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
+ /* Tell the other CPU to retrigger the next event */
+ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
- raw_spin_unlock(&old_base->lock);
raw_spin_unlock(&new_base->lock);
+ raw_spin_unlock(&old_base->lock);
- /* Check, if we got expired work to do */
- __hrtimer_peek_ahead_timers();
- local_irq_enable();
- local_bh_enable();
return 0;
}
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 2c6847d5d69b..362c159fb3f8 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -144,6 +144,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
@@ -230,6 +231,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 3fd433d2a767..ba508c0924b9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -159,25 +159,30 @@ static struct k_itimer *posix_timer_by_id(timer_t id)
static int posix_timer_add(struct k_itimer *timer)
{
struct signal_struct *sig = current->signal;
- int first_free_id = sig->posix_timer_id;
struct hlist_head *head;
- int ret = -ENOENT;
+ unsigned int cnt, id;
- do {
+ /*
+ * FIXME: Replace this by a per signal struct xarray once there is
+ * a plan to handle the resulting CRIU regression gracefully.
+ */
+ for (cnt = 0; cnt <= INT_MAX; cnt++) {
spin_lock(&hash_lock);
- head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
- if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+ id = sig->next_posix_timer_id;
+
+ /* Write the next ID back. Clamp it to the positive space */
+ sig->next_posix_timer_id = (id + 1) & INT_MAX;
+
+ head = &posix_timers_hashtable[hash(sig, id)];
+ if (!__posix_timers_find(head, sig, id)) {
hlist_add_head_rcu(&timer->t_hash, head);
- ret = sig->posix_timer_id;
+ spin_unlock(&hash_lock);
+ return id;
}
- if (++sig->posix_timer_id < 0)
- sig->posix_timer_id = 0;
- if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
- /* Loop over all possible ids completed */
- ret = -EAGAIN;
spin_unlock(&hash_lock);
- } while (ret == -ENOENT);
- return ret;
+ }
+ /* POSIX return code when no timer ID could be allocated */
+ return -EAGAIN;
}
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
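
The INT_MAX masking keeps the next candidate ID non-negative; a tiny userspace sketch of the sequence around the wrap point:

    /* Sketch of the candidate-ID sequence; starting value chosen near the wrap point. */
    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int id = INT_MAX - 1;

            for (int i = 0; i < 4; i++) {
                    printf("%u\n", id);          /* 2147483646, 2147483647, 0, 1 */
                    id = (id + 1) & INT_MAX;
            }
            return 0;
    }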
@@ -1245,6 +1250,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
@@ -1272,6 +1278,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 4d31ec98e968..eae0992e71ad 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -204,6 +204,11 @@ static bool check_tick_dependency(atomic_t *dep)
return true;
}
+ if (val & TICK_DEP_MASK_RCU) {
+ trace_tick_stop(0, TICK_DEP_MASK_RCU);
+ return true;
+ }
+
return false;
}
@@ -331,6 +336,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
preempt_enable();
}
}
+EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);
void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
@@ -338,6 +344,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
atomic_andnot(BIT(bit), &ts->tick_dep_mask);
}
+EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
/*
* Set a per-task tick dependency. Posix CPU timers need this in order to elapse
@@ -405,7 +412,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
tick_nohz_full_running = true;
}
-static int tick_nohz_cpu_down(unsigned int cpu)
+bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
{
/*
* The boot CPU handles housekeeping duty (unbound timers,
@@ -413,8 +420,13 @@ static int tick_nohz_cpu_down(unsigned int cpu)
* CPUs. It must remain online when nohz full is enabled.
*/
if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
- return -EBUSY;
- return 0;
+ return false;
+ return true;
+}
+
+static int tick_nohz_cpu_down(unsigned int cpu)
+{
+ return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY;
}
void __init tick_nohz_init(void)
@@ -1332,13 +1344,18 @@ void tick_setup_sched_timer(void)
void tick_cancel_sched_timer(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t idle_sleeptime, iowait_sleeptime;
# ifdef CONFIG_HIGH_RES_TIMERS
if (ts->sched_timer.base)
hrtimer_cancel(&ts->sched_timer);
# endif
+ idle_sleeptime = ts->idle_sleeptime;
+ iowait_sleeptime = ts->iowait_sleeptime;
memset(ts, 0, sizeof(*ts));
+ ts->idle_sleeptime = idle_sleeptime;
+ ts->iowait_sleeptime = iowait_sleeptime;
}
#endif
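
A minimal sketch of saving the two accounting fields across the struct reset, with a simplified stand-in type:

    /* Simplified stand-in for struct tick_sched; only the idea is modelled. */
    #include <string.h>
    #include <stdio.h>

    struct ts_sim { long long idle_sleeptime, iowait_sleeptime, other[4]; };

    int main(void)
    {
            struct ts_sim ts = { .idle_sleeptime = 111, .iowait_sleeptime = 222, .other = { 7 } };
            long long idle = ts.idle_sleeptime, iowait = ts.iowait_sleeptime;

            memset(&ts, 0, sizeof(ts));          /* wipes everything, including .other */
            ts.idle_sleeptime = idle;            /* restore only the preserved stats */
            ts.iowait_sleeptime = iowait;
            printf("%lld %lld %lld\n", ts.idle_sleeptime, ts.iowait_sleeptime, ts.other[0]);
            return 0;
    }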
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 75ea1a5be31a..113cbe1a20aa 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1594,7 +1594,8 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
{
- if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+ if ((iter->ent->type != TRACE_BLK) ||
+ !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
return TRACE_TYPE_UNHANDLED;
return print_one_line(iter, true);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1cb13d6368f3..b794470bb42e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1274,7 +1274,7 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
#ifdef CONFIG_UPROBE_EVENTS
if (flags & TRACE_EVENT_FL_UPROBE)
err = bpf_get_uprobe_info(event, fd_type, buf,
- probe_offset,
+ probe_offset, probe_addr,
event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9c7795566436..81f5c9c85d06 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1124,7 +1124,7 @@ struct ftrace_page {
struct ftrace_page *next;
struct dyn_ftrace *records;
int index;
- int size;
+ int order;
};
#define ENTRY_SIZE sizeof(struct dyn_ftrace)
@@ -1581,7 +1581,8 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end)
key.flags = end; /* overload flags, as it is unsigned long */
for (pg = ftrace_pages_start; pg; pg = pg->next) {
- if (end < pg->records[0].ip ||
+ if (pg->index == 0 ||
+ end < pg->records[0].ip ||
start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
continue;
rec = bsearch(&key, pg->records, pg->index,
@@ -2914,6 +2915,8 @@ static void ftrace_shutdown_sysctl(void)
static u64 ftrace_update_time;
unsigned long ftrace_update_tot_cnt;
+unsigned long ftrace_number_of_pages;
+unsigned long ftrace_number_of_groups;
static inline int ops_traces_mod(struct ftrace_ops *ops)
{
@@ -3038,8 +3041,11 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
goto again;
}
+ ftrace_number_of_pages += 1 << order;
+ ftrace_number_of_groups++;
+
cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
- pg->size = cnt;
+ pg->order = order;
if (cnt > count)
cnt = count;
@@ -3047,12 +3053,27 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
return cnt;
}
+static void ftrace_free_pages(struct ftrace_page *pages)
+{
+ struct ftrace_page *pg = pages;
+
+ while (pg) {
+ if (pg->records) {
+ free_pages((unsigned long)pg->records, pg->order);
+ ftrace_number_of_pages -= 1 << pg->order;
+ }
+ pages = pg->next;
+ kfree(pg);
+ pg = pages;
+ ftrace_number_of_groups--;
+ }
+}
+
static struct ftrace_page *
ftrace_allocate_pages(unsigned long num_to_init)
{
struct ftrace_page *start_pg;
struct ftrace_page *pg;
- int order;
int cnt;
if (!num_to_init)
@@ -3086,14 +3107,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
return start_pg;
free_pages:
- pg = start_pg;
- while (pg) {
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
- start_pg = pg->next;
- kfree(pg);
- pg = start_pg;
- }
+ ftrace_free_pages(start_pg);
pr_info("ftrace: FAILED to allocate memory for functions\n");
return NULL;
}
@@ -5585,9 +5599,11 @@ static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
{
+ struct ftrace_page *pg_unuse = NULL;
struct ftrace_page *start_pg;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
+ unsigned long skipped = 0;
unsigned long count;
unsigned long *p;
unsigned long addr;
@@ -5633,6 +5649,7 @@ static int ftrace_process_locs(struct module *mod,
p = start;
pg = start_pg;
while (p < end) {
+ unsigned long end_offset;
addr = ftrace_call_adjust(*p++);
/*
* Some architecture linkers will pad between
@@ -5640,10 +5657,13 @@ static int ftrace_process_locs(struct module *mod,
* object files to satisfy alignments.
* Skip any NULL pointers.
*/
- if (!addr)
+ if (!addr) {
+ skipped++;
continue;
+ }
- if (pg->index == pg->size) {
+ end_offset = (pg->index+1) * sizeof(pg->records[0]);
+ if (end_offset > PAGE_SIZE << pg->order) {
/* We should have allocated enough */
if (WARN_ON(!pg->next))
break;
@@ -5654,8 +5674,10 @@ static int ftrace_process_locs(struct module *mod,
rec->ip = addr;
}
- /* We should have used all pages */
- WARN_ON(pg->next);
+ if (pg->next) {
+ pg_unuse = pg->next;
+ pg->next = NULL;
+ }
/* Assign the last page to ftrace_pages */
ftrace_pages = pg;
@@ -5677,6 +5699,11 @@ static int ftrace_process_locs(struct module *mod,
out:
mutex_unlock(&ftrace_lock);
+ /* We should have used all pages unless we skipped some */
+ if (pg_unuse) {
+ WARN_ON(!skipped);
+ ftrace_free_pages(pg_unuse);
+ }
return ret;
}
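
The capacity check is now derived from the page order instead of a cached element count; a small sketch of that bound with assumed sizes (the real entry size is sizeof(struct dyn_ftrace)):

    /* Sketch only: page size, entry size and counts are assumed. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long page_size = 4096, entry_size = 40;   /* entry size is illustrative */
            int order = 0;                                     /* pg->order */
            int index = 102;                                   /* records already stored */

            unsigned long end_offset = (index + 1) * entry_size;     /* 4120 */
            puts(end_offset > (page_size << order)
                 ? "page full: advance to pg->next" : "store the record in this page");
            return 0;
    }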
@@ -5783,7 +5810,6 @@ void ftrace_release_mod(struct module *mod)
struct ftrace_page **last_pg;
struct ftrace_page *tmp_page = NULL;
struct ftrace_page *pg;
- int order;
mutex_lock(&ftrace_lock);
@@ -5834,10 +5860,13 @@ void ftrace_release_mod(struct module *mod)
/* Needs to be called outside of ftrace_lock */
clear_mod_from_hashes(pg);
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
+ if (pg->records) {
+ free_pages((unsigned long)pg->records, pg->order);
+ ftrace_number_of_pages -= 1 << pg->order;
+ }
tmp_page = pg->next;
kfree(pg);
+ ftrace_number_of_groups--;
}
}
@@ -6143,7 +6172,6 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
struct ftrace_mod_map *mod_map = NULL;
struct ftrace_init_func *func, *func_next;
struct list_head clear_hash;
- int order;
INIT_LIST_HEAD(&clear_hash);
@@ -6181,8 +6209,11 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
ftrace_update_tot_cnt--;
if (!pg->index) {
*last_pg = pg->next;
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
+ if (pg->records) {
+ free_pages((unsigned long)pg->records, pg->order);
+ ftrace_number_of_pages -= 1 << pg->order;
+ }
+ ftrace_number_of_groups--;
kfree(pg);
pg = container_of(last_pg, struct ftrace_page, next);
if (!(*last_pg))
@@ -6238,6 +6269,9 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
+ pr_info("ftrace: allocated %ld pages with %ld groups\n",
+ ftrace_number_of_pages, ftrace_number_of_groups);
+
set_ftrace_early_filters();
return;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5e5b0c067f61..d2903d8834fe 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -493,6 +493,8 @@ struct ring_buffer_per_cpu {
unsigned long read_bytes;
u64 write_stamp;
u64 read_stamp;
+ /* pages removed since last reset */
+ unsigned long pages_removed;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -528,6 +530,7 @@ struct ring_buffer_iter {
struct buffer_page *head_page;
struct buffer_page *cache_reader_page;
unsigned long cache_read;
+ unsigned long cache_pages_removed;
u64 read_stamp;
};
@@ -561,7 +564,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
*/
int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
{
- struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
+ struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
int ret = 0;
@@ -1326,6 +1329,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
struct list_head *head = cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
+ irq_work_sync(&cpu_buffer->irq_work.work);
+
free_buffer_page(cpu_buffer->reader_page);
if (head) {
@@ -1339,6 +1344,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
free_buffer_page(bpage);
}
+ free_page((unsigned long)cpu_buffer->free_page);
+
kfree(cpu_buffer);
}
@@ -1431,6 +1438,8 @@ ring_buffer_free(struct ring_buffer *buffer)
cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
+ irq_work_sync(&buffer->irq_work.work);
+
for_each_buffer_cpu(buffer, cpu)
rb_free_cpu_buffer(buffer->buffers[cpu]);
@@ -1510,6 +1519,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
to_remove = rb_list_head(to_remove)->next;
head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
}
+ /* Read iterators need to reset themselves when some pages removed */
+ cpu_buffer->pages_removed += nr_removed;
next_page = rb_list_head(to_remove)->next;
@@ -1531,12 +1542,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
cpu_buffer->head_page = list_entry(next_page,
struct buffer_page, list);
- /*
- * change read pointer to make sure any read iterators reset
- * themselves
- */
- cpu_buffer->read = 0;
-
/* pages are removed, resume tracing and then free the pages */
atomic_dec(&cpu_buffer->record_disabled);
raw_spin_unlock_irq(&cpu_buffer->reader_lock);
@@ -1750,6 +1755,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
err = -ENOMEM;
goto out_err;
}
+
+ cond_resched();
}
get_online_cpus();
@@ -2463,6 +2470,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
if (RB_WARN_ON(cpu_buffer,
rb_is_reader_page(cpu_buffer->tail_page)))
return;
+ /*
+ * No need for a memory barrier here, as the update
+ * of the tail_page did it for this page.
+ */
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
@@ -2476,6 +2487,8 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
while (rb_commit_index(cpu_buffer) !=
rb_page_write(cpu_buffer->commit_page)) {
+ /* Make sure the readers see the content of what is committed. */
+ smp_wmb();
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
RB_WARN_ON(cpu_buffer,
@@ -2878,6 +2891,12 @@ rb_reserve_next_event(struct ring_buffer *buffer,
int nr_loops = 0;
u64 diff;
+ /* ring buffer does cmpxchg, make sure it is safe in NMI context */
+ if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+ (unlikely(in_nmi()))) {
+ return NULL;
+ }
+
rb_start_commit(cpu_buffer);
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
@@ -3572,6 +3591,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
iter->cache_reader_page = iter->head_page;
iter->cache_read = cpu_buffer->read;
+ iter->cache_pages_removed = cpu_buffer->pages_removed;
if (iter->head)
iter->read_stamp = cpu_buffer->read_stamp;
@@ -3841,7 +3861,12 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
/*
* Make sure we see any padding after the write update
- * (see rb_reset_tail())
+ * (see rb_reset_tail()).
+ *
+ * In addition, a writer may be writing on the reader page
+ * if the page has not been fully filled, so the read barrier
+ * is also needed to make sure we see the content of what is
+ * committed by the writer (see rb_set_commit_to_write()).
*/
smp_rmb();
@@ -4007,12 +4032,13 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
buffer = cpu_buffer->buffer;
/*
- * Check if someone performed a consuming read to
- * the buffer. A consuming read invalidates the iterator
- * and we need to reset the iterator in this case.
+ * Check if someone performed a consuming read to the buffer
+ * or removed some pages from the buffer. In these cases,
+ * iterator was invalidated and we need to reset it.
*/
if (unlikely(iter->cache_read != cpu_buffer->read ||
- iter->cache_reader_page != cpu_buffer->reader_page))
+ iter->cache_reader_page != cpu_buffer->reader_page ||
+ iter->cache_pages_removed != cpu_buffer->pages_removed))
rb_iter_reset(iter);
again:
@@ -4393,28 +4419,34 @@ unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_size);
+static void rb_clear_buffer_page(struct buffer_page *page)
+{
+ local_set(&page->write, 0);
+ local_set(&page->entries, 0);
+ rb_init_page(page->page);
+ page->read = 0;
+}
+
static void
rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
+ struct buffer_page *page;
+
rb_head_page_deactivate(cpu_buffer);
cpu_buffer->head_page
= list_entry(cpu_buffer->pages, struct buffer_page, list);
- local_set(&cpu_buffer->head_page->write, 0);
- local_set(&cpu_buffer->head_page->entries, 0);
- local_set(&cpu_buffer->head_page->page->commit, 0);
-
- cpu_buffer->head_page->read = 0;
+ rb_clear_buffer_page(cpu_buffer->head_page);
+ list_for_each_entry(page, cpu_buffer->pages, list) {
+ rb_clear_buffer_page(page);
+ }
cpu_buffer->tail_page = cpu_buffer->head_page;
cpu_buffer->commit_page = cpu_buffer->head_page;
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
INIT_LIST_HEAD(&cpu_buffer->new_pages);
- local_set(&cpu_buffer->reader_page->write, 0);
- local_set(&cpu_buffer->reader_page->entries, 0);
- local_set(&cpu_buffer->reader_page->page->commit, 0);
- cpu_buffer->reader_page->read = 0;
+ rb_clear_buffer_page(cpu_buffer->reader_page);
local_set(&cpu_buffer->entries_bytes, 0);
local_set(&cpu_buffer->overrun, 0);
@@ -4433,6 +4465,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->last_overrun = 0;
rb_head_page_activate(cpu_buffer);
+ cpu_buffer->pages_removed = 0;
}
/**
@@ -4685,11 +4718,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
*/
void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
{
- struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_data_page *bpage = data;
struct page *page = virt_to_page(bpage);
unsigned long flags;
+ if (!buffer || !buffer->buffers || !buffer->buffers[cpu])
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+
/* If the page is still in use someplace else, we can't reuse it */
if (page_ref_count(page) > 1)
goto out;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e1db56edeb0d..f73e112cd91c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2213,8 +2213,11 @@ void trace_buffered_event_enable(void)
for_each_tracing_cpu(cpu) {
page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
- if (!page)
- goto failed;
+ /* This is just an optimization and can handle failures */
+ if (!page) {
+ pr_err("Failed to allocate event buffer\n");
+ break;
+ }
event = page_address(page);
memset(event, 0, sizeof(*event));
@@ -2228,10 +2231,6 @@ void trace_buffered_event_enable(void)
WARN_ON_ONCE(1);
preempt_enable();
}
-
- return;
- failed:
- trace_buffered_event_disable();
}
static void enable_trace_buffered_event(void *data)
@@ -2266,11 +2265,9 @@ void trace_buffered_event_disable(void)
if (--trace_buffered_event_ref)
return;
- preempt_disable();
/* For each CPU, set the buffer as used. */
- smp_call_function_many(tracing_buffer_mask,
- disable_trace_buffered_event, NULL, 1);
- preempt_enable();
+ on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event,
+ NULL, true);
/* Wait for all current users to finish */
synchronize_sched();
@@ -2279,17 +2276,19 @@ void trace_buffered_event_disable(void)
free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
per_cpu(trace_buffered_event, cpu) = NULL;
}
+
/*
- * Make sure trace_buffered_event is NULL before clearing
- * trace_buffered_event_cnt.
+ * Wait for all CPUs that potentially started checking if they can use
+ * their event buffer only after the previous synchronize_rcu() call and
+ * they still read a valid pointer from trace_buffered_event. It must be
+ * ensured they don't see a cleared trace_buffered_event_cnt, else they
+ * could wrongly decide to use the pointed-to buffer which is now freed.
*/
- smp_wmb();
+ synchronize_rcu();
- preempt_disable();
- /* Do the work on each cpu */
- smp_call_function_many(tracing_buffer_mask,
- enable_trace_buffered_event, NULL, 1);
- preempt_enable();
+ /* For each CPU, relinquish the buffer */
+ on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL,
+ true);
}
static struct ring_buffer *temp_buffer;
@@ -3281,8 +3280,15 @@ static void *s_start(struct seq_file *m, loff_t *pos)
* will point to the same string as current_trace->name.
*/
mutex_lock(&trace_types_lock);
- if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
+ if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) {
+ /* Close iter->trace before switching to the new current tracer */
+ if (iter->trace->close)
+ iter->trace->close(iter);
*iter->trace = *tr->current_trace;
+ /* Reopen the new current tracer */
+ if (iter->trace->open)
+ iter->trace->open(iter);
+ }
mutex_unlock(&trace_types_lock);
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -3831,7 +3837,11 @@ static int s_show(struct seq_file *m, void *v)
iter->leftover = ret;
} else {
- print_trace_line(iter);
+ ret = print_trace_line(iter);
+ if (ret == TRACE_TYPE_PARTIAL_LINE) {
+ iter->seq.full = 0;
+ trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
+ }
ret = trace_print_seq(m, &iter->seq);
/*
* If we overflow the seq_file buffer, then it will
@@ -5261,8 +5271,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
return ret;
#ifdef CONFIG_TRACER_MAX_TRACE
- if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
- !tr->current_trace->use_max_tr)
+ if (!tr->current_trace->use_max_tr)
goto out;
ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
@@ -5812,7 +5821,20 @@ waitagain:
ret = print_trace_line(iter);
if (ret == TRACE_TYPE_PARTIAL_LINE) {
- /* don't print partial lines */
+ /*
+ * If one print_trace_line() fills entire trace_seq in one shot,
+ * trace_seq_to_user() will return -EBUSY because save_len == 0.
+ * In this case, we need to consume it; otherwise, the loop will peek
+ * this event next time, resulting in an infinite loop.
+ */
+ if (save_len == 0) {
+ iter->seq.full = 0;
+ trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
+ trace_consume(iter);
+ break;
+ }
+
+ /* In other cases, don't print partial lines */
iter->seq.seq.len = save_len;
break;
}
@@ -7103,14 +7125,23 @@ static ssize_t
tracing_read_dyn_info(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- unsigned long *p = filp->private_data;
- char buf[64]; /* Not too big for a shallow stack */
+ ssize_t ret;
+ char *buf;
int r;
- r = scnprintf(buf, 63, "%ld", *p);
- buf[r++] = '\n';
+ /* 256 should be plenty to hold the amount needed */
+ buf = kmalloc(256, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n",
+ ftrace_update_tot_cnt,
+ ftrace_number_of_pages,
+ ftrace_number_of_groups);
+
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ kfree(buf);
+ return ret;
}
static const struct file_operations tracing_dyn_info_fops = {
@@ -8242,7 +8273,7 @@ static __init int tracer_init_tracefs(void)
#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
- &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
+ NULL, &tracing_dyn_info_fops);
#endif
create_trace_instances(d_tracer);
@@ -8670,6 +8701,8 @@ void __init early_trace_init(void)
static_key_enable(&tracepoint_printk_key.key);
}
tracer_alloc_buffers();
+
+ init_events();
}
void __init trace_init(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7740bcdad355..00fbf2cb505c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -750,6 +750,8 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
+extern unsigned long ftrace_number_of_pages;
+extern unsigned long ftrace_number_of_groups;
void ftrace_init_trace_array(struct trace_array *tr);
#else
static inline void ftrace_init_trace_array(struct trace_array *tr) { }
@@ -1532,6 +1534,7 @@ extern void trace_event_enable_cmd_record(bool enable);
extern void trace_event_enable_tgid_record(bool enable);
extern int event_trace_init(void);
+extern int init_events(void);
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 91df34a11341..fc5c032d4b66 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -374,7 +374,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
{
struct trace_event_call *call = file->event_call;
struct trace_array *tr = file->tr;
- unsigned long file_flags = file->flags;
int ret = 0;
int disable;
@@ -398,6 +397,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
break;
disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ /* Disable use of trace_buffered_event */
+ trace_buffered_event_disable();
} else
disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
@@ -436,6 +437,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (atomic_inc_return(&file->sm_ref) > 1)
break;
set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ /* Enable use of trace_buffered_event */
+ trace_buffered_event_enable();
}
if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
@@ -475,15 +478,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
break;
}
- /* Enable or disable use of trace_buffered_event */
- if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) !=
- (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) {
- if (file->flags & EVENT_FILE_FL_SOFT_DISABLED)
- trace_buffered_event_enable();
- else
- trace_buffered_event_disable();
- }
-
return ret;
}
@@ -2250,6 +2244,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
update_event_printk(call, map[i]);
}
}
+ cond_resched();
}
up_write(&trace_event_sem);
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 48f85dab9ef1..e004daf8cad5 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1764,6 +1764,9 @@ static const char *hist_field_name(struct hist_field *field,
{
const char *field_name = "";
+ if (WARN_ON_ONCE(!field))
+ return field_name;
+
if (level > 1)
return field_name;
@@ -2314,6 +2317,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
hist_field->fn = hist_field_log2;
hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
+ if (!hist_field->operands[0])
+ goto free;
hist_field->size = hist_field->operands[0]->size;
hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
if (!hist_field->type)
@@ -5782,13 +5787,16 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
if (get_named_trigger_data(trigger_data))
goto enable;
- if (has_hist_vars(hist_data))
- save_hist_vars(hist_data);
-
ret = create_actions(hist_data, file);
if (ret)
goto out_unreg;
+ if (has_hist_vars(hist_data) || hist_data->n_var_refs) {
+ ret = save_hist_vars(hist_data);
+ if (ret)
+ goto out_unreg;
+ }
+
ret = tracing_map_init(hist_data->map);
if (ret)
goto out_unreg;
@@ -5807,7 +5815,7 @@ enable:
/* Just return zero, not the number of registered triggers */
ret = 0;
out:
- if (ret == 0)
+ if (ret == 0 && glob[0])
hist_err_clear();
return ret;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 98ea6d28df15..0f36bb59970d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -222,7 +222,8 @@ static void irqsoff_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
-
+ else
+ iter->private = NULL;
}
static void irqsoff_trace_close(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 3f78b0afb729..b7cda6d62333 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1335,11 +1335,12 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
+ int max = iter->ent_size - offsetof(struct print_entry, buf);
trace_assign_type(field, iter->ent);
seq_print_ip_sym(s, field->ip, flags);
- trace_seq_printf(s, ": %s", field->buf);
+ trace_seq_printf(s, ": %.*s", max, field->buf);
return trace_handle_return(s);
}
@@ -1348,10 +1349,11 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
struct print_entry *field;
+ int max = iter->ent_size - offsetof(struct print_entry, buf);
trace_assign_type(field, iter->ent);
- trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
+ trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf);
return trace_handle_return(&iter->seq);
}
@@ -1410,7 +1412,7 @@ static struct trace_event *events[] __initdata = {
NULL
};
-__init static int init_events(void)
+__init int init_events(void)
{
struct trace_event *event;
int i, ret;
@@ -1428,4 +1430,3 @@ __init static int init_events(void)
return 0;
}
-early_initcall(init_events);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 11f4dbd9526b..8041bd5e4262 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -287,6 +287,8 @@ static void wakeup_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
+ else
+ iter->private = NULL;
}
static void wakeup_trace_close(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 0da379b90249..0e3bdd69fa2d 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1159,7 +1159,7 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
const char **filename, u64 *probe_offset,
- bool perf_type_tracepoint)
+ u64 *probe_addr, bool perf_type_tracepoint)
{
const char *pevent = trace_event_name(event->tp_event);
const char *group = event->tp_event->class->system;
@@ -1176,6 +1176,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
: BPF_FD_TYPE_UPROBE;
*filename = tu->filename;
*probe_offset = tu->offset;
+ *probe_addr = 0;
return 0;
}
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 685443375dc0..72a119b97f3e 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -116,14 +116,14 @@ static void watchdog_overflow_callback(struct perf_event *event,
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
+ if (!watchdog_check_timestamp())
+ return;
+
if (__this_cpu_read(watchdog_nmi_touch) == true) {
__this_cpu_write(watchdog_nmi_touch, false);
return;
}
- if (!watchdog_check_timestamp())
- return;
-
/* check for a hardlockup
* This is done by making sure our timer interrupt
* is incrementing. The timer interrupt should have
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c047a647146e..8e012510425e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -680,12 +680,17 @@ static void clear_work_data(struct work_struct *work)
set_work_data(work, WORK_STRUCT_NO_POOL, 0);
}
+static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
+{
+ return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK);
+}
+
static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
if (data & WORK_STRUCT_PWQ)
- return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
+ return work_struct_pwq(data);
else
return NULL;
}
@@ -713,8 +718,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
assert_rcu_or_pool_mutex();
if (data & WORK_STRUCT_PWQ)
- return ((struct pool_workqueue *)
- (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
+ return work_struct_pwq(data)->pool;
pool_id = data >> WORK_OFFQ_POOL_SHIFT;
if (pool_id == WORK_OFFQ_POOL_NONE)
@@ -735,8 +739,7 @@ static int get_work_pool_id(struct work_struct *work)
unsigned long data = atomic_long_read(&work->data);
if (data & WORK_STRUCT_PWQ)
- return ((struct pool_workqueue *)
- (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
+ return work_struct_pwq(data)->pool->id;
return data >> WORK_OFFQ_POOL_SHIFT;
}
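
What the new work_struct_pwq() helper centralizes is a mask-and-cast; a userspace illustration with an assumed flag mask:

    /* Illustration only: the mask width is assumed, not the kernel's actual value. */
    #include <stdio.h>

    #define FLAG_MASK 0xffUL   /* stand-in for the low WORK_STRUCT_* flag bits */

    int main(void)
    {
            unsigned long data = 0x12345600UL | 0x05UL;   /* pwq pointer | flag bits */

            printf("pwq = %#lx, flags = %#lx\n", data & ~FLAG_MASK, data & FLAG_MASK);
            return 0;
    }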
@@ -5085,9 +5088,13 @@ static int workqueue_apply_unbound_cpumask(void)
list_for_each_entry(wq, &workqueues, list) {
if (!(wq->flags & WQ_UNBOUND))
continue;
+
/* creating multiple pwqs breaks ordering guarantee */
- if (wq->flags & __WQ_ORDERED)
- continue;
+ if (!list_empty(&wq->pwqs)) {
+ if (wq->flags & __WQ_ORDERED_EXPLICIT)
+ continue;
+ wq->flags &= ~__WQ_ORDERED;
+ }
ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
if (!ctx) {