aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks4
-rw-r--r--kernel/Kconfig.preempt33
-rw-r--r--kernel/audit.c95
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/auditfilter.c87
-rw-r--r--kernel/bpf/btf.c5
-rw-r--r--kernel/bpf/cgroup.c8
-rw-r--r--kernel/bpf/core.c2
-rw-r--r--kernel/bpf/cpumap.c2
-rw-r--r--kernel/bpf/hashtab.c12
-rw-r--r--kernel/bpf/inode.c3
-rw-r--r--kernel/bpf/offload.c2
-rw-r--r--kernel/bpf/stackmap.c7
-rw-r--r--kernel/bpf/syscall.c47
-rw-r--r--kernel/bpf/tnum.c9
-rw-r--r--kernel/bpf/verifier.c175
-rw-r--r--kernel/cgroup/cgroup-v1.c3
-rw-r--r--kernel/cgroup/cgroup.c80
-rw-r--r--kernel/cgroup/cpuset.c75
-rw-r--r--kernel/cgroup/freezer.c9
-rw-r--r--kernel/cgroup/pids.c11
-rw-r--r--kernel/cgroup/rstat.c21
-rw-r--r--kernel/cpu.c218
-rw-r--r--kernel/cpu_pm.c4
-rw-r--r--kernel/cred.c6
-rw-r--r--kernel/debug/debug_core.c9
-rw-r--r--kernel/debug/kdb/kdb_main.c8
-rw-r--r--kernel/dma/coherent.c27
-rw-r--r--kernel/dma/debug.c10
-rw-r--r--kernel/dma/direct.c5
-rw-r--r--kernel/events/core.c92
-rw-r--r--kernel/events/uprobes.c18
-rw-r--r--kernel/exit.c39
-rw-r--r--kernel/fork.c44
-rw-r--r--kernel/futex.c278
-rw-r--r--kernel/gcov/fs.c2
-rw-r--r--kernel/irq/debugfs.c12
-rw-r--r--kernel/irq/handle.c8
-rw-r--r--kernel/irq/internals.h10
-rw-r--r--kernel/irq/irqdesc.c6
-rw-r--r--kernel/irq/irqdomain.c11
-rw-r--r--kernel/irq/manage.c80
-rw-r--r--kernel/irq/msi.c5
-rw-r--r--kernel/irq/proc.c22
-rw-r--r--kernel/irq/resend.c5
-rw-r--r--kernel/irq/spurious.c8
-rw-r--r--kernel/irq/timings.c74
-rw-r--r--kernel/irq_work.c59
-rw-r--r--kernel/kallsyms.c17
-rw-r--r--kernel/kexec_core.c1
-rw-r--r--kernel/kmod.c4
-rw-r--r--kernel/kprobes.c129
-rw-r--r--kernel/ksysfs.c12
-rw-r--r--kernel/locking/Makefile10
-rw-r--r--kernel/locking/lockdep.c107
-rw-r--r--kernel/locking/locktorture.c9
-rw-r--r--kernel/locking/mutex-rt.c223
-rw-r--r--kernel/locking/rtmutex.c944
-rw-r--r--kernel/locking/rtmutex_common.h31
-rw-r--r--kernel/locking/rwlock-rt.c384
-rw-r--r--kernel/locking/rwsem-rt.c302
-rw-r--r--kernel/locking/rwsem.h2
-rw-r--r--kernel/locking/spinlock.c7
-rw-r--r--kernel/locking/spinlock_debug.c37
-rw-r--r--kernel/module.c89
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/padata.c49
-rw-r--r--kernel/panic.c37
-rw-r--r--kernel/power/hibernate.c14
-rw-r--r--kernel/power/snapshot.c9
-rw-r--r--kernel/power/suspend.c4
-rw-r--r--kernel/printk/Makefile1
-rw-r--r--kernel/printk/internal.h67
-rw-r--r--kernel/printk/printk.c1933
-rw-r--r--kernel/printk/printk_safe.c415
-rw-r--r--kernel/ptrace.c24
-rw-r--r--kernel/rcu/Kconfig4
-rw-r--r--kernel/rcu/Kconfig.debug11
-rw-r--r--kernel/rcu/rcutorture.c96
-rw-r--r--kernel/rcu/srcutree.c20
-rw-r--r--kernel/rcu/tree.c152
-rw-r--r--kernel/rcu/tree.h2
-rw-r--r--kernel/rcu/tree_exp.h32
-rw-r--r--kernel/rcu/tree_plugin.h151
-rw-r--r--kernel/rcu/update.c107
-rw-r--r--kernel/relay.c5
-rw-r--r--kernel/sched/completion.c34
-rw-r--r--kernel/sched/core.c487
-rw-r--r--kernel/sched/cpudeadline.c4
-rw-r--r--kernel/sched/cpufreq.c18
-rw-r--r--kernel/sched/cpufreq_schedutil.c8
-rw-r--r--kernel/sched/cpupri.c4
-rw-r--r--kernel/sched/cputime.c6
-rw-r--r--kernel/sched/deadline.c13
-rw-r--r--kernel/sched/debug.c6
-rw-r--r--kernel/sched/fair.c469
-rw-r--r--kernel/sched/features.h8
-rw-r--r--kernel/sched/loadavg.c33
-rw-r--r--kernel/sched/psi.c8
-rw-r--r--kernel/sched/rt.c20
-rw-r--r--kernel/sched/sched.h31
-rw-r--r--kernel/sched/swait.c22
-rw-r--r--kernel/sched/topology.c53
-rw-r--r--kernel/seccomp.c10
-rw-r--r--kernel/signal.c220
-rw-r--r--kernel/softirq.c231
-rw-r--r--kernel/stacktrace.c6
-rw-r--r--kernel/stop_machine.c7
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/taskstats.c30
-rw-r--r--kernel/time/alarmtimer.c30
-rw-r--r--kernel/time/clocksource.c11
-rw-r--r--kernel/time/hrtimer.c137
-rw-r--r--kernel/time/itimer.c1
-rw-r--r--kernel/time/jiffies.c7
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/posix-clock.c31
-rw-r--r--kernel/time/posix-cpu-timers.c181
-rw-r--r--kernel/time/posix-timers.c42
-rw-r--r--kernel/time/posix-timers.h2
-rw-r--r--kernel/time/sched_clock.c9
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c2
-rw-r--r--kernel/time/tick-common.c10
-rw-r--r--kernel/time/tick-sched.c45
-rw-r--r--kernel/time/time.c3
-rw-r--r--kernel/time/timekeeping.c8
-rw-r--r--kernel/time/timekeeping.h3
-rw-r--r--kernel/time/timer.c100
-rw-r--r--kernel/trace/Kconfig1
-rw-r--r--kernel/trace/blktrace.c213
-rw-r--r--kernel/trace/bpf_trace.c6
-rw-r--r--kernel/trace/ftrace.c41
-rw-r--r--kernel/trace/ftrace_internal.h8
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.c68
-rw-r--r--kernel/trace/trace.h31
-rw-r--r--kernel/trace/trace_events.c16
-rw-r--r--kernel/trace/trace_events_filter.c2
-rw-r--r--kernel/trace/trace_events_hist.c173
-rw-r--r--kernel/trace/trace_events_trigger.c36
-rw-r--r--kernel/trace/trace_hwlat.c7
-rw-r--r--kernel/trace/trace_kprobe.c29
-rw-r--r--kernel/trace/trace_output.c19
-rw-r--r--kernel/trace/trace_sched_switch.c4
-rw-r--r--kernel/trace/trace_sched_wakeup.c4
-rw-r--r--kernel/trace/trace_stack.c5
-rw-r--r--kernel/trace/trace_stat.c31
-rw-r--r--kernel/trace/tracing_map.c4
-rw-r--r--kernel/umh.c11
-rw-r--r--kernel/watchdog.c12
-rw-r--r--kernel/workqueue.c249
152 files changed, 7038 insertions, 3485 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index e0852dc333ac..5b3d5fd32f98 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -226,11 +226,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
config MUTEX_SPIN_ON_OWNER
def_bool y
- depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
+ depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
config RWSEM_SPIN_ON_OWNER
def_bool y
- depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
+ depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
config LOCK_SPIN_ON_OWNER
def_bool y
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index dc0b682ec2d9..deefffc83d6b 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,4 +1,17 @@
# SPDX-License-Identifier: GPL-2.0-only
+config PREEMPT
+ bool
+ select PREEMPT_COUNT
+
+config PREEMPT_RT_BASE
+ bool
+ select PREEMPT
+
+config HAVE_PREEMPT_LAZY
+ bool
+
+config PREEMPT_LAZY
+ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
choice
prompt "Preemption Model"
@@ -35,10 +48,10 @@ config PREEMPT_VOLUNTARY
Select this if you are building a kernel for a desktop system.
-config PREEMPT
+config PREEMPT__LL
bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPT_COUNT
+ select PREEMPT
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
help
This option reduces the latency of the kernel by making
@@ -55,6 +68,22 @@ config PREEMPT
embedded system with latency requirements in the milliseconds
range.
+config PREEMPT_RTB
+ bool "Preemptible Kernel (Basic RT)"
+ select PREEMPT_RT_BASE
+ help
+ This option is basically the same as (Low-Latency Desktop) but
+ enables changes which are preliminary for the full preemptible
+ RT kernel.
+
+config PREEMPT_RT_FULL
+ bool "Fully Preemptible Kernel (RT)"
+ depends on IRQ_FORCED_THREADING
+ select PREEMPT_RT_BASE
+ select PREEMPT_RCU
+ help
+ All and everything
+
endchoice
config PREEMPT_COUNT
diff --git a/kernel/audit.c b/kernel/audit.c
index 486c968214d9..a664fe710289 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -879,7 +879,7 @@ main_queue:
return 0;
}
-int audit_send_list(void *_dest)
+int audit_send_list_thread(void *_dest)
{
struct audit_netlink_list *dest = _dest;
struct sk_buff *skb;
@@ -923,19 +923,30 @@ out_kfree_skb:
return NULL;
}
+static void audit_free_reply(struct audit_reply *reply)
+{
+ if (!reply)
+ return;
+
+ if (reply->skb)
+ kfree_skb(reply->skb);
+ if (reply->net)
+ put_net(reply->net);
+ kfree(reply);
+}
+
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
- struct sock *sk = audit_get_sk(reply->net);
audit_ctl_lock();
audit_ctl_unlock();
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
- netlink_unicast(sk, reply->skb, reply->portid, 0);
- put_net(reply->net);
- kfree(reply);
+ netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0);
+ reply->skb = NULL;
+ audit_free_reply(reply);
return 0;
}
@@ -949,35 +960,32 @@ static int audit_send_reply_thread(void *arg)
* @payload: payload data
* @size: payload size
*
- * Allocates an skb, builds the netlink message, and sends it to the port id.
- * No failure notifications.
+ * Allocates a skb, builds the netlink message, and sends it to the port id.
*/
static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
int multi, const void *payload, int size)
{
- struct net *net = sock_net(NETLINK_CB(request_skb).sk);
- struct sk_buff *skb;
struct task_struct *tsk;
- struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
- GFP_KERNEL);
+ struct audit_reply *reply;
+ reply = kzalloc(sizeof(*reply), GFP_KERNEL);
if (!reply)
return;
- skb = audit_make_reply(seq, type, done, multi, payload, size);
- if (!skb)
- goto out;
-
- reply->net = get_net(net);
+ reply->skb = audit_make_reply(seq, type, done, multi, payload, size);
+ if (!reply->skb)
+ goto err;
+ reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
reply->portid = NETLINK_CB(request_skb).portid;
- reply->skb = skb;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
- if (!IS_ERR(tsk))
- return;
- kfree_skb(skb);
-out:
- kfree(reply);
+ if (IS_ERR(tsk))
+ goto err;
+
+ return;
+
+err:
+ audit_free_reply(reply);
}
/*
@@ -1100,13 +1108,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
audit_log_end(ab);
}
-static int audit_set_feature(struct sk_buff *skb)
+static int audit_set_feature(struct audit_features *uaf)
{
- struct audit_features *uaf;
int i;
BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));
- uaf = nlmsg_data(nlmsg_hdr(skb));
/* if there is ever a version 2 we should handle that here */
@@ -1174,6 +1180,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
u32 seq;
void *data;
+ int data_len;
int err;
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
@@ -1187,6 +1194,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
seq = nlh->nlmsg_seq;
data = nlmsg_data(nlh);
+ data_len = nlmsg_len(nlh);
switch (msg_type) {
case AUDIT_GET: {
@@ -1210,7 +1218,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
struct audit_status s;
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
- memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
if (s.mask & AUDIT_STATUS_ENABLED) {
err = audit_set_enabled(s.enabled);
if (err < 0)
@@ -1314,7 +1322,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
break;
case AUDIT_SET_FEATURE:
- err = audit_set_feature(skb);
+ if (data_len < sizeof(struct audit_features))
+ return -EINVAL;
+ err = audit_set_feature(data);
if (err)
return err;
break;
@@ -1323,9 +1333,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
+ /* exit early if there isn't at least one character to print */
+ if (data_len < 2)
+ return -EINVAL;
err = audit_filter(msg_type, AUDIT_FILTER_USER);
if (err == 1) { /* match or error */
+ char *str = data;
+
err = 0;
if (msg_type == AUDIT_USER_TTY) {
err = tty_audit_push();
@@ -1333,26 +1348,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
}
audit_log_user_recv_msg(&ab, msg_type);
- if (msg_type != AUDIT_USER_TTY)
+ if (msg_type != AUDIT_USER_TTY) {
+ /* ensure NULL termination */
+ str[data_len - 1] = '\0';
audit_log_format(ab, " msg='%.*s'",
AUDIT_MESSAGE_TEXT_MAX,
- (char *)data);
- else {
- int size;
-
+ str);
+ } else {
audit_log_format(ab, " data=");
- size = nlmsg_len(nlh);
- if (size > 0 &&
- ((unsigned char *)data)[size - 1] == '\0')
- size--;
- audit_log_n_untrustedstring(ab, data, size);
+ if (data_len > 0 && str[data_len - 1] == '\0')
+ data_len--;
+ audit_log_n_untrustedstring(ab, str, data_len);
}
audit_log_end(ab);
}
break;
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
- if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
+ if (data_len < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
audit_log_common_recv_msg(audit_context(), &ab,
@@ -1364,7 +1377,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
audit_log_end(ab);
return -EPERM;
}
- err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh));
+ err = audit_rule_change(msg_type, seq, data, data_len);
break;
case AUDIT_LIST_RULES:
err = audit_list_rules_send(skb, seq);
@@ -1379,7 +1392,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_MAKE_EQUIV: {
void *bufp = data;
u32 sizes[2];
- size_t msglen = nlmsg_len(nlh);
+ size_t msglen = data_len;
char *old, *new;
err = -EINVAL;
@@ -1455,7 +1468,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
- memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
/* check if new data is valid */
if ((s.enabled != 0 && s.enabled != 1) ||
(s.log_passwd != 0 && s.log_passwd != 1))
diff --git a/kernel/audit.h b/kernel/audit.h
index 6c076d4982da..c018dcab4b11 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -229,7 +229,7 @@ struct audit_netlink_list {
struct sk_buff_head q;
};
-int audit_send_list(void *_dest);
+int audit_send_list_thread(void *_dest);
extern int selinux_audit_rule_update(void);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 1f31c2f1e6fc..4508d5e0cf69 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -351,12 +351,12 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
struct dentry *d = kern_path_locked(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
- inode_unlock(d_backing_inode(parent->dentry));
if (d_is_positive(d)) {
/* update watch filter fields */
watch->dev = d->d_sb->s_dev;
watch->ino = d_backing_inode(d)->i_ino;
}
+ inode_unlock(d_backing_inode(parent->dentry));
dput(d);
return 0;
}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 9f8e190e3bea..d26db3c9a015 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -439,6 +439,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
bufp = data->buf;
for (i = 0; i < data->field_count; i++) {
struct audit_field *f = &entry->rule.fields[i];
+ u32 f_val;
err = -EINVAL;
@@ -447,12 +448,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
goto exit_free;
f->type = data->fields[i];
- f->val = data->values[i];
+ f_val = data->values[i];
/* Support legacy tests for a valid loginuid */
- if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
+ if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) {
f->type = AUDIT_LOGINUID_SET;
- f->val = 0;
+ f_val = 0;
entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
}
@@ -468,7 +469,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_SUID:
case AUDIT_FSUID:
case AUDIT_OBJ_UID:
- f->uid = make_kuid(current_user_ns(), f->val);
+ f->uid = make_kuid(current_user_ns(), f_val);
if (!uid_valid(f->uid))
goto exit_free;
break;
@@ -477,11 +478,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_SGID:
case AUDIT_FSGID:
case AUDIT_OBJ_GID:
- f->gid = make_kgid(current_user_ns(), f->val);
+ f->gid = make_kgid(current_user_ns(), f_val);
if (!gid_valid(f->gid))
goto exit_free;
break;
case AUDIT_ARCH:
+ f->val = f_val;
entry->rule.arch_f = f;
break;
case AUDIT_SUBJ_USER:
@@ -494,11 +496,13 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_OBJ_TYPE:
case AUDIT_OBJ_LEV_LOW:
case AUDIT_OBJ_LEV_HIGH:
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
-
+ }
+ entry->rule.buflen += f_val;
+ f->lsm_str = str;
err = security_audit_rule_init(f->type, f->op, str,
(void **)&f->lsm_rule);
/* Keep currently invalid fields around in case they
@@ -507,68 +511,71 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
pr_warn("audit rule for LSM \'%s\' is invalid\n",
str);
err = 0;
- }
- if (err) {
- kfree(str);
+ } else if (err)
goto exit_free;
- } else
- f->lsm_str = str;
break;
case AUDIT_WATCH:
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
-
- err = audit_to_watch(&entry->rule, str, f->val, f->op);
+ }
+ err = audit_to_watch(&entry->rule, str, f_val, f->op);
if (err) {
kfree(str);
goto exit_free;
}
+ entry->rule.buflen += f_val;
break;
case AUDIT_DIR:
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
-
+ }
err = audit_make_tree(&entry->rule, str, f->op);
kfree(str);
if (err)
goto exit_free;
+ entry->rule.buflen += f_val;
break;
case AUDIT_INODE:
+ f->val = f_val;
err = audit_to_inode(&entry->rule, f);
if (err)
goto exit_free;
break;
case AUDIT_FILTERKEY:
- if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
+ if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN)
goto exit_free;
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
+ }
+ entry->rule.buflen += f_val;
entry->rule.filterkey = str;
break;
case AUDIT_EXE:
- if (entry->rule.exe || f->val > PATH_MAX)
+ if (entry->rule.exe || f_val > PATH_MAX)
goto exit_free;
- str = audit_unpack_string(&bufp, &remain, f->val);
+ str = audit_unpack_string(&bufp, &remain, f_val);
if (IS_ERR(str)) {
err = PTR_ERR(str);
goto exit_free;
}
- entry->rule.buflen += f->val;
-
- audit_mark = audit_alloc_mark(&entry->rule, str, f->val);
+ audit_mark = audit_alloc_mark(&entry->rule, str, f_val);
if (IS_ERR(audit_mark)) {
kfree(str);
err = PTR_ERR(audit_mark);
goto exit_free;
}
+ entry->rule.buflen += f_val;
entry->rule.exe = audit_mark;
break;
+ default:
+ f->val = f_val;
+ break;
}
}
@@ -1137,11 +1144,8 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz)
*/
int audit_list_rules_send(struct sk_buff *request_skb, int seq)
{
- u32 portid = NETLINK_CB(request_skb).portid;
- struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct task_struct *tsk;
struct audit_netlink_list *dest;
- int err = 0;
/* We can't just spew out the rules here because we might fill
* the available socket buffer space and deadlock waiting for
@@ -1149,25 +1153,26 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq)
* happen if we're actually running in the context of auditctl
* trying to _send_ the stuff */
- dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
+ dest = kmalloc(sizeof(*dest), GFP_KERNEL);
if (!dest)
return -ENOMEM;
- dest->net = get_net(net);
- dest->portid = portid;
+ dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
+ dest->portid = NETLINK_CB(request_skb).portid;
skb_queue_head_init(&dest->q);
mutex_lock(&audit_filter_mutex);
audit_list_rules(seq, &dest->q);
mutex_unlock(&audit_filter_mutex);
- tsk = kthread_run(audit_send_list, dest, "audit_send_list");
+ tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list");
if (IS_ERR(tsk)) {
skb_queue_purge(&dest->q);
+ put_net(dest->net);
kfree(dest);
- err = PTR_ERR(tsk);
+ return PTR_ERR(tsk);
}
- return err;
+ return 0;
}
int audit_comparator(u32 left, u32 op, u32 right)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 5fcc7a17eb5a..6d37931145d8 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2309,7 +2309,7 @@ static int btf_enum_check_member(struct btf_verifier_env *env,
struct_size = struct_type->size;
bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
- if (struct_size - bytes_offset < sizeof(int)) {
+ if (struct_size - bytes_offset < member_type->size) {
btf_verifier_log_member(env, struct_type, member,
"Member exceeds struct_size");
return -EINVAL;
@@ -3449,7 +3449,7 @@ int btf_get_info_by_fd(const struct btf *btf,
union bpf_attr __user *uattr)
{
struct bpf_btf_info __user *uinfo;
- struct bpf_btf_info info = {};
+ struct bpf_btf_info info;
u32 info_copy, btf_copy;
void __user *ubtf;
u32 uinfo_len;
@@ -3458,6 +3458,7 @@ int btf_get_info_by_fd(const struct btf *btf,
uinfo_len = attr->info.info_len;
info_copy = min_t(u32, uinfo_len, sizeof(info));
+ memset(&info, 0, sizeof(info));
if (copy_from_user(&info, uinfo, info_copy))
return -EFAULT;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 92a7d0cf8d13..b6ad8cb50efc 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -239,8 +239,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
{
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
- struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
- *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
+ struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
enum bpf_cgroup_storage_type stype;
struct bpf_prog_list *pl;
bool pl_was_allocated;
@@ -1046,12 +1046,12 @@ static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
return false;
switch (off) {
- case offsetof(struct bpf_sysctl, write):
+ case bpf_ctx_range(struct bpf_sysctl, write):
if (type != BPF_READ)
return false;
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
- case offsetof(struct bpf_sysctl, file_pos):
+ case bpf_ctx_range(struct bpf_sysctl, file_pos):
if (type == BPF_READ) {
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ceee0730fba5..fe8a72dc3a47 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -502,7 +502,7 @@ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false));
}
-void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
+static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
int i;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8ebd0fa826f8..ae5de038f4a6 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -476,7 +476,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EOVERFLOW;
/* Make sure CPU is a valid possible cpu */
- if (!cpu_possible(key_cpu))
+ if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu))
return -ENODEV;
if (qsize == 0) {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 583df5cb302d..09ba58465758 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -679,15 +679,20 @@ static void htab_elem_free_rcu(struct rcu_head *head)
preempt_enable();
}
-static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
{
struct bpf_map *map = &htab->map;
+ void *ptr;
if (map->ops->map_fd_put_ptr) {
- void *ptr = fd_htab_map_get_ptr(map, l);
-
+ ptr = fd_htab_map_get_ptr(map, l);
map->ops->map_fd_put_ptr(ptr);
}
+}
+
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+ htab_put_fd_value(htab, l);
if (htab_is_prealloc(htab)) {
__pcpu_freelist_push(&htab->freelist, &l->fnode);
@@ -739,6 +744,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
*/
pl_new = this_cpu_ptr(htab->extra_elems);
l_new = *pl_new;
+ htab_put_fd_value(htab, old_elem);
*pl_new = old_elem;
} else {
struct pcpu_freelist_node *l;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index cc0d0cf114e3..482e259ffcfb 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -195,6 +195,7 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
void *key = map_iter(m)->key;
void *prev_key;
+ (*pos)++;
if (map_iter(m)->done)
return NULL;
@@ -207,8 +208,6 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
map_iter(m)->done = true;
return NULL;
}
-
- ++(*pos);
return key;
}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index ba635209ae9a..1853aa420f2f 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -321,7 +321,7 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
ulen = info->jited_prog_len;
info->jited_prog_len = aux->offload->jited_len;
- if (info->jited_prog_len & ulen) {
+ if (info->jited_prog_len && ulen) {
uinsns = u64_to_user_ptr(info->jited_prog_insns);
ulen = min_t(u32, info->jited_prog_len, ulen);
if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index d38e49f943a1..d41d4af5e3f3 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -289,7 +289,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
bool irq_work_busy = false;
struct stack_map_irq_work *work = NULL;
- if (in_nmi()) {
+ if (irqs_disabled()) {
work = this_cpu_ptr(&up_read_work);
if (work->irq_work.flags & IRQ_WORK_BUSY)
/* cannot queue more up_read, fallback */
@@ -297,8 +297,9 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
}
/*
- * We cannot do up_read() in nmi context. To do build_id lookup
- * in nmi context, we need to run up_read() in irq_work. We use
+ * We cannot do up_read() when the irq is disabled, because of
+ * risk to deadlock with rq_lock. To do build_id lookup when the
+ * irqs are disabled, we need to run up_read() in irq_work. We use
* a percpu variable to do the irq_work. If the irq_work is
* already used by another lookup, we fall back to report ips.
*
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d2146277071f..e1cc2d1f4003 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1102,7 +1102,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
+ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
@@ -1130,8 +1131,10 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
if (err)
goto free_value;
- if (copy_to_user(uvalue, value, value_size) != 0)
+ if (copy_to_user(uvalue, value, value_size) != 0) {
+ err = -EFAULT;
goto free_value;
+ }
err = 0;
@@ -1313,24 +1316,32 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+ kvfree(aux->func_info);
free_used_maps(aux);
bpf_prog_uncharge_memlock(aux->prog);
security_bpf_prog_free(aux);
bpf_prog_free(aux->prog);
}
+static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
+{
+ bpf_prog_kallsyms_del_all(prog);
+ btf_put(prog->aux->btf);
+ bpf_prog_free_linfo(prog);
+
+ if (deferred)
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+ else
+ __bpf_prog_put_rcu(&prog->aux->rcu);
+}
+
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
- bpf_prog_kallsyms_del_all(prog);
- btf_put(prog->aux->btf);
- kvfree(prog->aux->func_info);
- bpf_prog_free_linfo(prog);
-
- call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+ __bpf_prog_put_noref(prog, true);
}
}
@@ -1709,11 +1720,12 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
return err;
free_used_maps:
- bpf_prog_free_linfo(prog);
- kvfree(prog->aux->func_info);
- btf_put(prog->aux->btf);
- bpf_prog_kallsyms_del_subprogs(prog);
- free_used_maps(prog->aux);
+ /* In case we have subprogs, we need to wait for a grace
+ * period before we can tear down JIT memory since symbols
+ * are already exposed under kallsyms.
+ */
+ __bpf_prog_put_noref(prog, prog->aux->func_cnt);
+ return err;
free_prog:
bpf_prog_uncharge_memlock(prog);
free_prog_sec:
@@ -2266,7 +2278,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
union bpf_attr __user *uattr)
{
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
- struct bpf_prog_info info = {};
+ struct bpf_prog_info info;
u32 info_len = attr->info.info_len;
struct bpf_prog_stats stats;
char __user *uinsns;
@@ -2278,6 +2290,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
return err;
info_len = min_t(u32, sizeof(info), info_len);
+ memset(&info, 0, sizeof(info));
if (copy_from_user(&info, uinfo, info_len))
return -EFAULT;
@@ -2541,7 +2554,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
union bpf_attr __user *uattr)
{
struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
- struct bpf_map_info info = {};
+ struct bpf_map_info info;
u32 info_len = attr->info.info_len;
int err;
@@ -2550,6 +2563,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
return err;
info_len = min_t(u32, sizeof(info), info_len);
+ memset(&info, 0, sizeof(info));
info.type = map->map_type;
info.id = map->id;
info.key_size = map->key_size;
@@ -2777,7 +2791,7 @@ out:
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
- union bpf_attr attr = {};
+ union bpf_attr attr;
int err;
if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
@@ -2789,6 +2803,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
size = min_t(u32, size, sizeof(attr));
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
+ memset(&attr, 0, sizeof(attr));
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index ca52b9642943..d4f335a9a899 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -44,14 +44,19 @@ struct tnum tnum_rshift(struct tnum a, u8 shift)
return TNUM(a.value >> shift, a.mask >> shift);
}
-struct tnum tnum_arshift(struct tnum a, u8 min_shift)
+struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness)
{
/* if a.value is negative, arithmetic shifting by minimum shift
* will have larger negative offset compared to more shifting.
* If a.value is nonnegative, arithmetic shifting by minimum shift
* will have larger positive offset compare to more shifting.
*/
- return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift);
+ if (insn_bitness == 32)
+ return TNUM((u32)(((s32)a.value) >> min_shift),
+ (u32)(((s32)a.mask) >> min_shift));
+ else
+ return TNUM((s64)a.value >> min_shift,
+ (s64)a.mask >> min_shift);
}
struct tnum tnum_add(struct tnum a, struct tnum b)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 11528bdaa9dc..187ea0cfbcca 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -201,8 +201,7 @@ struct bpf_call_arg_meta {
bool pkt_access;
int regno;
int access_size;
- s64 msize_smax_value;
- u64 msize_umax_value;
+ u64 msize_max_value;
int ref_obj_id;
int func_id;
};
@@ -2657,8 +2656,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
/* remember the mem_size which may be used later
* to refine return values.
*/
- meta->msize_smax_value = reg->smax_value;
- meta->msize_umax_value = reg->umax_value;
+ meta->msize_max_value = reg->umax_value;
/* The register is SCALAR_VALUE; the access check
* happens using its boundaries.
@@ -3141,21 +3139,44 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return 0;
}
-static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
- int func_id,
- struct bpf_call_arg_meta *meta)
+static int do_refine_retval_range(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, int ret_type,
+ int func_id, struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
+ struct bpf_reg_state tmp_reg = *ret_reg;
+ bool ret;
if (ret_type != RET_INTEGER ||
(func_id != BPF_FUNC_get_stack &&
func_id != BPF_FUNC_probe_read_str))
- return;
+ return 0;
+
+ /* Error case where ret is in interval [S32MIN, -1]. */
+ ret_reg->smin_value = S32_MIN;
+ ret_reg->smax_value = -1;
+
+ __reg_deduce_bounds(ret_reg);
+ __reg_bound_offset(ret_reg);
+ __update_reg_bounds(ret_reg);
+
+ ret = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (!ret)
+ return -EFAULT;
+
+ *ret_reg = tmp_reg;
+
+ /* Success case where ret is in range [0, msize_max_value]. */
+ ret_reg->smin_value = 0;
+ ret_reg->smax_value = meta->msize_max_value;
+ ret_reg->umin_value = ret_reg->smin_value;
+ ret_reg->umax_value = ret_reg->smax_value;
- ret_reg->smax_value = meta->msize_smax_value;
- ret_reg->umax_value = meta->msize_umax_value;
__reg_deduce_bounds(ret_reg);
__reg_bound_offset(ret_reg);
+ __update_reg_bounds(ret_reg);
+
+ return 0;
}
static int
@@ -3384,7 +3405,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
regs[BPF_REG_0].ref_obj_id = id;
}
- do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+ err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta);
+ if (err)
+ return err;
err = check_map_func_compatibility(env, meta.map_ptr, func_id);
if (err)
@@ -4084,9 +4107,16 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* Upon reaching here, src_known is true and
* umax_val is equal to umin_val.
*/
- dst_reg->smin_value >>= umin_val;
- dst_reg->smax_value >>= umin_val;
- dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val);
+ if (insn_bitness == 32) {
+ dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val);
+ dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val);
+ } else {
+ dst_reg->smin_value >>= umin_val;
+ dst_reg->smax_value >>= umin_val;
+ }
+
+ dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val,
+ insn_bitness);
/* blow away the dst_reg umin_value/umax_value and rely on
* dst_reg var_off to refine the result.
@@ -4580,6 +4610,70 @@ static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg)
reg->smax_value <= 0 && reg->smin_value >= S32_MIN);
}
+/* Constrain the possible values of @reg with unsigned upper bound @bound.
+ * If @is_exclusive, @bound is an exclusive limit, otherwise it is inclusive.
+ * If @is_jmp32, @bound is a 32-bit value that only constrains the low 32 bits
+ * of @reg.
+ */
+static void set_upper_bound(struct bpf_reg_state *reg, u64 bound, bool is_jmp32,
+ bool is_exclusive)
+{
+ if (is_exclusive) {
+ /* There are no values for `reg` that make `reg<0` true. */
+ if (bound == 0)
+ return;
+ bound--;
+ }
+ if (is_jmp32) {
+ /* Constrain the register's value in the tnum representation.
+ * For 64-bit comparisons this happens later in
+ * __reg_bound_offset(), but for 32-bit comparisons, we can be
+ * more precise than what can be derived from the updated
+ * numeric bounds.
+ */
+ struct tnum t = tnum_range(0, bound);
+
+ t.mask |= ~0xffffffffULL; /* upper half is unknown */
+ reg->var_off = tnum_intersect(reg->var_off, t);
+
+ /* Compute the 64-bit bound from the 32-bit bound. */
+ bound += gen_hi_max(reg->var_off);
+ }
+ reg->umax_value = min(reg->umax_value, bound);
+}
+
+/* Constrain the possible values of @reg with unsigned lower bound @bound.
+ * If @is_exclusive, @bound is an exclusive limit, otherwise it is inclusive.
+ * If @is_jmp32, @bound is a 32-bit value that only constrains the low 32 bits
+ * of @reg.
+ */
+static void set_lower_bound(struct bpf_reg_state *reg, u64 bound, bool is_jmp32,
+ bool is_exclusive)
+{
+ if (is_exclusive) {
+ /* There are no values for `reg` that make `reg>MAX` true. */
+ if (bound == (is_jmp32 ? U32_MAX : U64_MAX))
+ return;
+ bound++;
+ }
+ if (is_jmp32) {
+ /* Constrain the register's value in the tnum representation.
+ * For 64-bit comparisons this happens later in
+ * __reg_bound_offset(), but for 32-bit comparisons, we can be
+ * more precise than what can be derived from the updated
+ * numeric bounds.
+ */
+ struct tnum t = tnum_range(bound, U32_MAX);
+
+ t.mask |= ~0xffffffffULL; /* upper half is unknown */
+ reg->var_off = tnum_intersect(reg->var_off, t);
+
+ /* Compute the 64-bit bound from the 32-bit bound. */
+ bound += gen_hi_min(reg->var_off);
+ }
+ reg->umin_value = max(reg->umin_value, bound);
+}
+
/* Adjusts the register min/max values in the case that the dst_reg is the
* variable register that we are working on, and src_reg is a constant or we're
* simply doing a BPF_K check.
@@ -4635,15 +4729,8 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
case BPF_JGE:
case BPF_JGT:
{
- u64 false_umax = opcode == BPF_JGT ? val : val - 1;
- u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
-
- if (is_jmp32) {
- false_umax += gen_hi_max(false_reg->var_off);
- true_umin += gen_hi_min(true_reg->var_off);
- }
- false_reg->umax_value = min(false_reg->umax_value, false_umax);
- true_reg->umin_value = max(true_reg->umin_value, true_umin);
+ set_upper_bound(false_reg, val, is_jmp32, opcode == BPF_JGE);
+ set_lower_bound(true_reg, val, is_jmp32, opcode == BPF_JGT);
break;
}
case BPF_JSGE:
@@ -4664,15 +4751,8 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
case BPF_JLE:
case BPF_JLT:
{
- u64 false_umin = opcode == BPF_JLT ? val : val + 1;
- u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
-
- if (is_jmp32) {
- false_umin += gen_hi_min(false_reg->var_off);
- true_umax += gen_hi_max(true_reg->var_off);
- }
- false_reg->umin_value = max(false_reg->umin_value, false_umin);
- true_reg->umax_value = min(true_reg->umax_value, true_umax);
+ set_lower_bound(false_reg, val, is_jmp32, opcode == BPF_JLE);
+ set_upper_bound(true_reg, val, is_jmp32, opcode == BPF_JLT);
break;
}
case BPF_JSLE:
@@ -4747,15 +4827,8 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
case BPF_JGE:
case BPF_JGT:
{
- u64 false_umin = opcode == BPF_JGT ? val : val + 1;
- u64 true_umax = opcode == BPF_JGT ? val - 1 : val;
-
- if (is_jmp32) {
- false_umin += gen_hi_min(false_reg->var_off);
- true_umax += gen_hi_max(true_reg->var_off);
- }
- false_reg->umin_value = max(false_reg->umin_value, false_umin);
- true_reg->umax_value = min(true_reg->umax_value, true_umax);
+ set_lower_bound(false_reg, val, is_jmp32, opcode == BPF_JGE);
+ set_upper_bound(true_reg, val, is_jmp32, opcode == BPF_JGT);
break;
}
case BPF_JSGE:
@@ -4773,15 +4846,8 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
case BPF_JLE:
case BPF_JLT:
{
- u64 false_umax = opcode == BPF_JLT ? val : val - 1;
- u64 true_umin = opcode == BPF_JLT ? val + 1 : val;
-
- if (is_jmp32) {
- false_umax += gen_hi_max(false_reg->var_off);
- true_umin += gen_hi_min(true_reg->var_off);
- }
- false_reg->umax_value = min(false_reg->umax_value, false_umax);
- true_reg->umin_value = max(true_reg->umin_value, true_umin);
+ set_upper_bound(false_reg, val, is_jmp32, opcode == BPF_JLE);
+ set_lower_bound(true_reg, val, is_jmp32, opcode == BPF_JLT);
break;
}
case BPF_JSLE:
@@ -5268,6 +5334,7 @@ static bool may_access_skb(enum bpf_prog_type type)
static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
struct bpf_reg_state *regs = cur_regs(env);
+ static const int ctx_reg = BPF_REG_6;
u8 mode = BPF_MODE(insn->code);
int i, err;
@@ -5301,7 +5368,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check whether implicit source operand (register R6) is readable */
- err = check_reg_arg(env, BPF_REG_6, SRC_OP);
+ err = check_reg_arg(env, ctx_reg, SRC_OP);
if (err)
return err;
@@ -5320,7 +5387,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- if (regs[BPF_REG_6].type != PTR_TO_CTX) {
+ if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
return -EINVAL;
@@ -5333,6 +5400,10 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
+ err = check_ctx_reg(env, &regs[ctx_reg], ctx_reg);
+ if (err < 0)
+ return err;
+
/* reset caller saved regs to unreadable */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 88006be40ea3..9dc41608b013 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -492,6 +492,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
*/
p++;
if (p >= end) {
+ (*pos)++;
return NULL;
} else {
*pos = *p;
@@ -802,7 +803,7 @@ void cgroup1_release_agent(struct work_struct *work)
pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
- if (!pathbuf || !agentbuf)
+ if (!pathbuf || !agentbuf || !strlen(agentbuf))
goto out;
spin_lock_irq(&css_set_lock);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d2cba714d3ee..fb24e7424411 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2119,11 +2119,12 @@ int cgroup_do_get_tree(struct fs_context *fc)
nsdentry = kernfs_node_dentry(cgrp->kn, sb);
dput(fc->root);
- fc->root = nsdentry;
if (IS_ERR(nsdentry)) {
- ret = PTR_ERR(nsdentry);
deactivate_locked_super(sb);
+ ret = PTR_ERR(nsdentry);
+ nsdentry = NULL;
}
+ fc->root = nsdentry;
}
if (!ctx->kfc.new_sb_created)
@@ -3067,8 +3068,6 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp)
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
- WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
-
if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
continue;
@@ -3078,6 +3077,8 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp)
return PTR_ERR(css);
}
+ WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
+
if (css_visible(css)) {
ret = css_populate_dir(css);
if (ret)
@@ -3113,11 +3114,11 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
- WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
-
if (!css)
continue;
+ WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
+
if (css->parent &&
!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
kill_css(css);
@@ -3404,7 +3405,8 @@ static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
if (strcmp(strstrip(buf), "threaded"))
return -EINVAL;
- cgrp = cgroup_kn_lock_live(of->kn, false);
+ /* drain dying csses before we re-apply (threaded) subtree control */
+ cgrp = cgroup_kn_lock_live(of->kn, true);
if (!cgrp)
return -ENOENT;
@@ -4415,12 +4417,16 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
}
} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
- if (!list_empty(&cset->tasks))
+ if (!list_empty(&cset->tasks)) {
it->task_pos = cset->tasks.next;
- else if (!list_empty(&cset->mg_tasks))
+ it->cur_tasks_head = &cset->tasks;
+ } else if (!list_empty(&cset->mg_tasks)) {
it->task_pos = cset->mg_tasks.next;
- else
+ it->cur_tasks_head = &cset->mg_tasks;
+ } else {
it->task_pos = cset->dying_tasks.next;
+ it->cur_tasks_head = &cset->dying_tasks;
+ }
it->tasks_head = &cset->tasks;
it->mg_tasks_head = &cset->mg_tasks;
@@ -4478,10 +4484,14 @@ repeat:
else
it->task_pos = it->task_pos->next;
- if (it->task_pos == it->tasks_head)
+ if (it->task_pos == it->tasks_head) {
it->task_pos = it->mg_tasks_head->next;
- if (it->task_pos == it->mg_tasks_head)
+ it->cur_tasks_head = it->mg_tasks_head;
+ }
+ if (it->task_pos == it->mg_tasks_head) {
it->task_pos = it->dying_tasks_head->next;
+ it->cur_tasks_head = it->dying_tasks_head;
+ }
if (it->task_pos == it->dying_tasks_head)
css_task_iter_advance_css_set(it);
} else {
@@ -4500,11 +4510,12 @@ repeat:
goto repeat;
/* and dying leaders w/o live member threads */
- if (!atomic_read(&task->signal->live))
+ if (it->cur_tasks_head == it->dying_tasks_head &&
+ !atomic_read(&task->signal->live))
goto repeat;
} else {
/* skip all dying ones */
- if (task->flags & PF_EXITING)
+ if (it->cur_tasks_head == it->dying_tasks_head)
goto repeat;
}
}
@@ -4613,6 +4624,9 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
struct kernfs_open_file *of = s->private;
struct css_task_iter *it = of->priv;
+ if (pos)
+ (*pos)++;
+
return css_task_iter_next(it);
}
@@ -4628,7 +4642,7 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
* from position 0, so we can simply keep iterating on !0 *pos.
*/
if (!it) {
- if (WARN_ON_ONCE((*pos)++))
+ if (WARN_ON_ONCE((*pos)))
return ERR_PTR(-EINVAL);
it = kzalloc(sizeof(*it), GFP_KERNEL);
@@ -4636,10 +4650,11 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
return ERR_PTR(-ENOMEM);
of->priv = it;
css_task_iter_start(&cgrp->self, iter_flags, it);
- } else if (!(*pos)++) {
+ } else if (!(*pos)) {
css_task_iter_end(it);
css_task_iter_start(&cgrp->self, iter_flags, it);
- }
+ } else
+ return it->cur_task;
return cgroup_procs_next(s, NULL, NULL);
}
@@ -6276,19 +6291,14 @@ void cgroup_sk_alloc_disable(void)
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- if (cgroup_sk_alloc_disabled)
+ if (cgroup_sk_alloc_disabled) {
+ skcd->no_refcnt = 1;
return;
+ }
- /* Socket clone path */
- if (skcd->val) {
- /*
- * We might be cloning a socket which is left in an empty
- * cgroup and the cgroup might have already been rmdir'd.
- * Don't use cgroup_get_live().
- */
- cgroup_get(sock_cgroup_ptr(skcd));
+ /* Don't associate the sock with unrelated interrupted task's cgroup. */
+ if (in_interrupt())
return;
- }
rcu_read_lock();
@@ -6306,8 +6316,24 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
rcu_read_unlock();
}
+void cgroup_sk_clone(struct sock_cgroup_data *skcd)
+{
+ /* Socket clone path */
+ if (skcd->val) {
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(sock_cgroup_ptr(skcd));
+ }
+}
+
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
+ if (skcd->no_refcnt)
+ return;
+
cgroup_put(sock_cgroup_ptr(skcd));
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 515525ff1cfd..e00ecf6e1311 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -333,7 +333,7 @@ static struct cpuset top_cpuset = {
*/
static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_SPINLOCK(callback_lock);
+static DEFINE_RAW_SPINLOCK(callback_lock);
static struct workqueue_struct *cpuset_migrate_mm_wq;
@@ -839,7 +839,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
continue;
- if (is_sched_load_balance(cp))
+ if (is_sched_load_balance(cp) &&
+ !cpumask_empty(cp->effective_cpus))
csa[csn++] = cp;
/* skip @cp's subtree if not a partition root */
@@ -1234,7 +1235,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
* Newly added CPUs will be removed from effective_cpus and
* newly deleted ones will be added back to effective_cpus.
*/
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
if (adding) {
cpumask_or(parent->subparts_cpus,
parent->subparts_cpus, tmp->addmask);
@@ -1253,7 +1254,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
}
parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
return cmd == partcmd_update;
}
@@ -1358,7 +1359,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
continue;
rcu_read_unlock();
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
if (cp->nr_subparts_cpus &&
@@ -1389,7 +1390,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
= cpumask_weight(cp->subparts_cpus);
}
}
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -1507,7 +1508,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
return -EINVAL;
}
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
/*
@@ -1518,7 +1519,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
cs->cpus_allowed);
cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
}
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
update_cpumasks_hier(cs, &tmp);
@@ -1712,9 +1713,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
continue;
rcu_read_unlock();
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cp->effective_mems = *new_mems;
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
WARN_ON(!is_in_v2_mode() &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1782,9 +1783,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
/* use trialcs->mems_allowed as a temp variable */
update_nodemasks_hier(cs, &trialcs->mems_allowed);
@@ -1875,9 +1876,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
|| (is_spread_page(cs) != is_spread_page(trialcs)));
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cs->flags = trialcs->flags;
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
rebuild_sched_domains_locked();
@@ -2380,7 +2381,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
cpuset_filetype_t type = seq_cft(sf)->private;
int ret = 0;
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
switch (type) {
case FILE_CPULIST:
@@ -2402,7 +2403,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
ret = -EINVAL;
}
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
return ret;
}
@@ -2712,14 +2713,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
}
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
@@ -2746,12 +2747,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
}
rcu_read_unlock();
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cs->mems_allowed = parent->mems_allowed;
cs->effective_mems = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
out_unlock:
mutex_unlock(&cpuset_mutex);
return 0;
@@ -2804,7 +2805,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
mutex_lock(&cpuset_mutex);
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2815,7 +2816,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
top_cpuset.mems_allowed = top_cpuset.effective_mems;
}
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
mutex_unlock(&cpuset_mutex);
}
@@ -2829,7 +2830,7 @@ static void cpuset_fork(struct task_struct *task)
if (task_css_is_root(task, cpuset_cgrp_id))
return;
- set_cpus_allowed_ptr(task, &current->cpus_allowed);
+ set_cpus_allowed_ptr(task, current->cpus_ptr);
task->mems_allowed = current->mems_allowed;
}
@@ -2916,12 +2917,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
{
bool is_empty;
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, new_cpus);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->mems_allowed = *new_mems;
cs->effective_mems = *new_mems;
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2958,10 +2959,10 @@ hotplug_update_tasks(struct cpuset *cs,
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->effective_mems = *new_mems;
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
if (cpus_updated)
update_tasks_cpumask(cs);
@@ -3116,7 +3117,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
/*
@@ -3136,17 +3137,17 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
}
}
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
if (!on_dfl)
top_cpuset.mems_allowed = new_mems;
top_cpuset.effective_mems = new_mems;
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
update_tasks_nodemask(&top_cpuset);
}
@@ -3247,11 +3248,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
unsigned long flags;
- spin_lock_irqsave(&callback_lock, flags);
+ raw_spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_cpus(task_cs(tsk), pmask);
rcu_read_unlock();
- spin_unlock_irqrestore(&callback_lock, flags);
+ raw_spin_unlock_irqrestore(&callback_lock, flags);
}
/**
@@ -3312,11 +3313,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
nodemask_t mask;
unsigned long flags;
- spin_lock_irqsave(&callback_lock, flags);
+ raw_spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_mems(task_cs(tsk), &mask);
rcu_read_unlock();
- spin_unlock_irqrestore(&callback_lock, flags);
+ raw_spin_unlock_irqrestore(&callback_lock, flags);
return mask;
}
@@ -3408,14 +3409,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
return true;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
- spin_lock_irqsave(&callback_lock, flags);
+ raw_spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
rcu_read_unlock();
- spin_unlock_irqrestore(&callback_lock, flags);
+ raw_spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
}
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 8cf010680678..3984dd6b8ddb 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -231,6 +231,15 @@ void cgroup_freezer_migrate_task(struct task_struct *task,
return;
/*
+ * It's not necessary to do changes if both of the src and dst cgroups
+ * are not freezing and task is not frozen.
+ */
+ if (!test_bit(CGRP_FREEZE, &src->flags) &&
+ !test_bit(CGRP_FREEZE, &dst->flags) &&
+ !task->frozen)
+ return;
+
+ /*
* Adjust counters of freezing and frozen tasks.
* Note, that if the task is frozen, but the destination cgroup is not
* frozen, we bump both counters to keep them balanced.
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 8e513a573fe9..138059eb730d 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -45,7 +45,7 @@ struct pids_cgroup {
* %PIDS_MAX = (%PID_MAX_LIMIT + 1).
*/
atomic64_t counter;
- int64_t limit;
+ atomic64_t limit;
/* Handle for "pids.events" */
struct cgroup_file events_file;
@@ -73,8 +73,8 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
if (!pids)
return ERR_PTR(-ENOMEM);
- pids->limit = PIDS_MAX;
atomic64_set(&pids->counter, 0);
+ atomic64_set(&pids->limit, PIDS_MAX);
atomic64_set(&pids->events_limit, 0);
return &pids->css;
}
@@ -146,13 +146,14 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
for (p = pids; parent_pids(p); p = parent_pids(p)) {
int64_t new = atomic64_add_return(num, &p->counter);
+ int64_t limit = atomic64_read(&p->limit);
/*
* Since new is capped to the maximum number of pid_t, if
* p->limit is %PIDS_MAX then we know that this test will never
* fail.
*/
- if (new > p->limit)
+ if (new > limit)
goto revert;
}
@@ -277,7 +278,7 @@ set_limit:
* Limit updates don't need to be mutex'd, since it isn't
* critical that any racing fork()s follow the new limit.
*/
- pids->limit = limit;
+ atomic64_set(&pids->limit, limit);
return nbytes;
}
@@ -285,7 +286,7 @@ static int pids_max_show(struct seq_file *sf, void *v)
{
struct cgroup_subsys_state *css = seq_css(sf);
struct pids_cgroup *pids = css_pids(css);
- int64_t limit = pids->limit;
+ int64_t limit = atomic64_read(&pids->limit);
if (limit >= PIDS_MAX)
seq_printf(sf, "%s\n", PIDS_MAX_STR);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index ca19b4c8acf5..be99d3c13ba4 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -33,12 +33,9 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
return;
/*
- * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we
- * see NULL updated_next or they see our updated stat.
- */
- smp_mb();
-
- /*
+ * Speculative already-on-list test. This may race leading to
+ * temporary inaccuracies, which is fine.
+ *
* Because @parent's updated_children is terminated with @parent
* instead of NULL, we can tell whether @cgrp is on the list by
* testing the next pointer for NULL.
@@ -134,13 +131,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
*nextp = rstatc->updated_next;
rstatc->updated_next = NULL;
- /*
- * Paired with the one in cgroup_rstat_cpu_updated().
- * Either they see NULL updated_next or we see their
- * updated stat.
- */
- smp_mb();
-
return pos;
}
@@ -160,8 +150,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
cpu);
struct cgroup *pos = NULL;
+ unsigned long flags;
- raw_spin_lock(cpu_lock);
+ raw_spin_lock_irqsave(cpu_lock, flags);
while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
struct cgroup_subsys_state *css;
@@ -173,7 +164,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock(cpu_lock);
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
/* if @may_sleep, play nice and yield if necessary */
if (may_sleep && (need_resched() ||
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 44fe5c7cfc07..f523bb170423 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3,6 +3,7 @@
*
* This code is licenced under the GPL.
*/
+#include <linux/sched/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
@@ -515,8 +516,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
if (WARN_ON_ONCE((!cpu_online(cpu))))
return -ECANCELED;
- /* Unpark the stopper thread and the hotplug thread of the target cpu */
- stop_machine_unpark(cpu);
+ /* Unpark the hotplug thread of the target cpu */
kthread_unpark(st->thread);
/*
@@ -555,6 +555,21 @@ static int bringup_cpu(unsigned int cpu)
return bringup_wait_for_ap(cpu);
}
+static int finish_cpu(unsigned int cpu)
+{
+ struct task_struct *idle = idle_thread_get(cpu);
+ struct mm_struct *mm = idle->active_mm;
+
+ /*
+ * idle_task_exit() will have switched to &init_mm, now
+ * clean up any remaining active_mm state.
+ */
+ if (mm != &init_mm)
+ idle->active_mm = &init_mm;
+ mmdrop(mm);
+ return 0;
+}
+
/*
* Hotplug state machine related functions
*/
@@ -839,6 +854,15 @@ static int take_cpu_down(void *_param)
int err, cpu = smp_processor_id();
int ret;
+#ifdef CONFIG_PREEMPT_RT_BASE
+ /*
+ * If any tasks disabled migration before we got here,
+ * go back and sleep again.
+ */
+ if (cpu_nr_pinned(cpu))
+ return -EAGAIN;
+#endif
+
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
@@ -868,6 +892,10 @@ static int take_cpu_down(void *_param)
return 0;
}
+#ifdef CONFIG_PREEMPT_RT_BASE
+struct task_struct *takedown_cpu_task;
+#endif
+
static int takedown_cpu(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -882,11 +910,39 @@ static int takedown_cpu(unsigned int cpu)
*/
irq_lock_sparse();
+#ifdef CONFIG_PREEMPT_RT_BASE
+ WARN_ON_ONCE(takedown_cpu_task);
+ takedown_cpu_task = current;
+
+again:
+ /*
+ * If a task pins this CPU after we pass this check, take_cpu_down
+ * will return -EAGAIN.
+ */
+ for (;;) {
+ int nr_pinned;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ nr_pinned = cpu_nr_pinned(cpu);
+ if (nr_pinned == 0)
+ break;
+ schedule();
+ }
+ set_current_state(TASK_RUNNING);
+#endif
+
/*
* So now all preempt/rcu users must observe !cpu_active().
*/
err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
+#ifdef CONFIG_PREEMPT_RT_BASE
+ if (err == -EAGAIN)
+ goto again;
+#endif
if (err) {
+#ifdef CONFIG_PREEMPT_RT_BASE
+ takedown_cpu_task = NULL;
+#endif
/* CPU refused to die */
irq_unlock_sparse();
/* Unpark the hotplug thread so we can rollback there */
@@ -905,6 +961,9 @@ static int takedown_cpu(unsigned int cpu)
wait_for_ap_thread(st, false);
BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
+#ifdef CONFIG_PREEMPT_RT_BASE
+ takedown_cpu_task = NULL;
+#endif
/* Interrupts are moved away from the dying cpu, reenable alloc/free */
irq_unlock_sparse();
@@ -1079,8 +1138,8 @@ void notify_cpu_starting(unsigned int cpu)
/*
* Called from the idle task. Wake up the controlling task which brings the
- * stopper and the hotplug thread of the upcoming CPU up and then delegates
- * the rest of the online bringup to the hotplug thread.
+ * hotplug thread of the upcoming CPU up and then delegates the rest of the
+ * online bringup to the hotplug thread.
*/
void cpuhp_online_idle(enum cpuhp_state state)
{
@@ -1090,6 +1149,12 @@ void cpuhp_online_idle(enum cpuhp_state state)
if (state != CPUHP_AP_ONLINE_IDLE)
return;
+ /*
+ * Unpart the stopper thread before we start the idle loop (and start
+ * scheduling); this ensures the stopper task is always available.
+ */
+ stop_machine_unpark(smp_processor_id());
+
st->state = CPUHP_AP_ONLINE_IDLE;
complete_ap_thread(st, true);
}
@@ -1412,7 +1477,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
.startup.single = bringup_cpu,
- .teardown.single = NULL,
+ .teardown.single = finish_cpu,
.cant_stop = true,
},
/* Final state before CPU kills itself */
@@ -1892,6 +1957,78 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
}
EXPORT_SYMBOL(__cpuhp_remove_state);
+#ifdef CONFIG_HOTPLUG_SMT
+static void cpuhp_offline_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = true;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+}
+
+static void cpuhp_online_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = false;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+}
+
+int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ for_each_online_cpu(cpu) {
+ if (topology_is_primary_thread(cpu))
+ continue;
+ ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
+ if (ret)
+ break;
+ /*
+ * As this needs to hold the cpu maps lock it's impossible
+ * to call device_offline() because that ends up calling
+ * cpu_down() which takes cpu maps lock. cpu maps lock
+ * needs to be held as this might race against in kernel
+ * abusers of the hotplug machinery (thermal management).
+ *
+ * So nothing would update device:offline state. That would
+ * leave the sysfs entry stale and prevent onlining after
+ * smt control has been changed to 'off' again. This is
+ * called under the sysfs hotplug lock, so it is properly
+ * serialized against the regular offline usage.
+ */
+ cpuhp_offline_cpu_device(cpu);
+ }
+ if (!ret)
+ cpu_smt_control = ctrlval;
+ cpu_maps_update_done();
+ return ret;
+}
+
+int cpuhp_smt_enable(void)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ cpu_smt_control = CPU_SMT_ENABLED;
+ for_each_present_cpu(cpu) {
+ /* Skip online CPUs and CPUs on offline nodes */
+ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
+ continue;
+ ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
+ if (ret)
+ break;
+ /* See comment in cpuhp_smt_disable() */
+ cpuhp_online_cpu_device(cpu);
+ }
+ cpu_maps_update_done();
+ return ret;
+}
+#endif
+
#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
static ssize_t show_cpuhp_state(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -2046,77 +2183,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
#ifdef CONFIG_HOTPLUG_SMT
-static void cpuhp_offline_cpu_device(unsigned int cpu)
-{
- struct device *dev = get_cpu_device(cpu);
-
- dev->offline = true;
- /* Tell user space about the state change */
- kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
-}
-
-static void cpuhp_online_cpu_device(unsigned int cpu)
-{
- struct device *dev = get_cpu_device(cpu);
-
- dev->offline = false;
- /* Tell user space about the state change */
- kobject_uevent(&dev->kobj, KOBJ_ONLINE);
-}
-
-int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
-{
- int cpu, ret = 0;
-
- cpu_maps_update_begin();
- for_each_online_cpu(cpu) {
- if (topology_is_primary_thread(cpu))
- continue;
- ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
- if (ret)
- break;
- /*
- * As this needs to hold the cpu maps lock it's impossible
- * to call device_offline() because that ends up calling
- * cpu_down() which takes cpu maps lock. cpu maps lock
- * needs to be held as this might race against in kernel
- * abusers of the hotplug machinery (thermal management).
- *
- * So nothing would update device:offline state. That would
- * leave the sysfs entry stale and prevent onlining after
- * smt control has been changed to 'off' again. This is
- * called under the sysfs hotplug lock, so it is properly
- * serialized against the regular offline usage.
- */
- cpuhp_offline_cpu_device(cpu);
- }
- if (!ret)
- cpu_smt_control = ctrlval;
- cpu_maps_update_done();
- return ret;
-}
-
-int cpuhp_smt_enable(void)
-{
- int cpu, ret = 0;
-
- cpu_maps_update_begin();
- cpu_smt_control = CPU_SMT_ENABLED;
- for_each_present_cpu(cpu) {
- /* Skip online CPUs and CPUs on offline nodes */
- if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
- continue;
- ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
- if (ret)
- break;
- /* See comment in cpuhp_smt_disable() */
- cpuhp_online_cpu_device(cpu);
- }
- cpu_maps_update_done();
- return ret;
-}
-
-
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index cbca6879ab7d..44a259338e33 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
*/
int cpu_pm_enter(void)
{
- int nr_calls;
+ int nr_calls = 0;
int ret = 0;
ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
*/
int cpu_cluster_pm_enter(void)
{
- int nr_calls;
+ int nr_calls = 0;
int ret = 0;
ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
diff --git a/kernel/cred.c b/kernel/cred.c
index 153ae369e024..7cfd1db529d4 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -218,7 +218,7 @@ struct cred *cred_alloc_blank(void)
new->magic = CRED_MAGIC;
#endif
- if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
+ if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
return new;
@@ -277,7 +277,7 @@ struct cred *prepare_creds(void)
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
validate_creds(new);
return new;
@@ -706,7 +706,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
put_cred(old);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 5cc608de6883..18a6a8642495 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -501,6 +501,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
if (exception_level > 1) {
dump_stack();
+ kgdb_io_module_registered = false;
panic("Recursive entry to debugger");
}
@@ -545,6 +546,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
arch_kgdb_ops.disable_hw_break(regs);
acquirelock:
+ rcu_read_lock();
/*
* Interrupts will be restored by the 'trap return' code, except when
* single stepping.
@@ -601,6 +603,7 @@ return_normal:
atomic_dec(&slaves_in_kgdb);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
return 0;
}
cpu_relax();
@@ -619,6 +622,7 @@ return_normal:
raw_spin_unlock(&dbg_master_lock);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
goto acquirelock;
}
@@ -634,6 +638,8 @@ return_normal:
if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
goto kgdb_restore;
+ atomic_inc(&ignore_console_lock_warning);
+
/* Call the I/O driver's pre_exception routine */
if (dbg_io_ops->pre_exception)
dbg_io_ops->pre_exception();
@@ -706,6 +712,8 @@ cpu_master_loop:
if (dbg_io_ops->post_exception)
dbg_io_ops->post_exception();
+ atomic_dec(&ignore_console_lock_warning);
+
if (!kgdb_single_step) {
raw_spin_unlock(&dbg_slave_lock);
/* Wait till all the CPUs have quit from the debugger. */
@@ -738,6 +746,7 @@ kgdb_restore:
raw_spin_unlock(&dbg_master_lock);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
return kgdb_info[cpu].ret_state;
}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 9ecfa37c7fbf..bb5dec762b69 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1103,12 +1103,12 @@ static int handle_ctrl_cmd(char *cmd)
case CTRL_P:
if (cmdptr != cmd_tail)
cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
- strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+ strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
return 1;
case CTRL_N:
if (cmdptr != cmd_head)
cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT;
- strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+ strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
return 1;
}
return 0;
@@ -1315,7 +1315,7 @@ do_full_getstr:
if (*cmdbuf != '\n') {
if (*cmdbuf < 32) {
if (cmdptr == cmd_head) {
- strncpy(cmd_hist[cmd_head], cmd_cur,
+ strscpy(cmd_hist[cmd_head], cmd_cur,
CMD_BUFLEN);
*(cmd_hist[cmd_head] +
strlen(cmd_hist[cmd_head])-1) = '\0';
@@ -1325,7 +1325,7 @@ do_full_getstr:
cmdbuf = cmd_cur;
goto do_full_getstr;
} else {
- strncpy(cmd_hist[cmd_head], cmd_cur,
+ strscpy(cmd_hist[cmd_head], cmd_cur,
CMD_BUFLEN);
}
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 29fd6590dc1e..3fe6e0119ee6 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -135,8 +135,9 @@ void dma_release_declared_memory(struct device *dev)
}
EXPORT_SYMBOL(dma_release_declared_memory);
-static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
- ssize_t size, dma_addr_t *dma_handle)
+static void *__dma_alloc_from_coherent(struct device *dev,
+ struct dma_coherent_mem *mem,
+ ssize_t size, dma_addr_t *dma_handle)
{
int order = get_order(size);
unsigned long flags;
@@ -145,7 +146,7 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
spin_lock_irqsave(&mem->spinlock, flags);
- if (unlikely(size > (mem->size << PAGE_SHIFT)))
+ if (unlikely(size > ((dma_addr_t)mem->size << PAGE_SHIFT)))
goto err;
pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
@@ -155,8 +156,9 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
/*
* Memory was found in the coherent area.
*/
- *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
- ret = mem->virt_base + (pageno << PAGE_SHIFT);
+ *dma_handle = dma_get_device_base(dev, mem) +
+ ((dma_addr_t)pageno << PAGE_SHIFT);
+ ret = mem->virt_base + ((dma_addr_t)pageno << PAGE_SHIFT);
spin_unlock_irqrestore(&mem->spinlock, flags);
memset(ret, 0, size);
return ret;
@@ -187,24 +189,25 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
if (!mem)
return 0;
- *ret = __dma_alloc_from_coherent(mem, size, dma_handle);
+ *ret = __dma_alloc_from_coherent(dev, mem, size, dma_handle);
return 1;
}
-void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle)
+void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size,
+ dma_addr_t *dma_handle)
{
if (!dma_coherent_default_memory)
return NULL;
- return __dma_alloc_from_coherent(dma_coherent_default_memory, size,
- dma_handle);
+ return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size,
+ dma_handle);
}
static int __dma_release_from_coherent(struct dma_coherent_mem *mem,
int order, void *vaddr)
{
if (mem && vaddr >= mem->virt_base && vaddr <
- (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+ (mem->virt_base + ((dma_addr_t)mem->size << PAGE_SHIFT))) {
int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
unsigned long flags;
@@ -248,10 +251,10 @@ static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem,
struct vm_area_struct *vma, void *vaddr, size_t size, int *ret)
{
if (mem && vaddr >= mem->virt_base && vaddr + size <=
- (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+ (mem->virt_base + ((dma_addr_t)mem->size << PAGE_SHIFT))) {
unsigned long off = vma->vm_pgoff;
int start = (vaddr - mem->virt_base) >> PAGE_SHIFT;
- int user_count = vma_pages(vma);
+ unsigned long user_count = vma_pages(vma);
int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
*ret = -ENXIO;
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 099002d84f46..cb6425e52bf7 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -137,9 +137,12 @@ static const char *const maperr2str[] = {
[MAP_ERR_CHECKED] = "dma map error checked",
};
-static const char *type2name[5] = { "single", "page",
- "scather-gather", "coherent",
- "resource" };
+static const char *type2name[] = {
+ [dma_debug_single] = "single",
+ [dma_debug_sg] = "scather-gather",
+ [dma_debug_coherent] = "coherent",
+ [dma_debug_resource] = "resource",
+};
static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
"DMA_FROM_DEVICE", "DMA_NONE" };
@@ -420,6 +423,7 @@ void debug_dma_dump_mappings(struct device *dev)
}
spin_unlock_irqrestore(&bucket->lock, flags);
+ cond_resched();
}
}
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 9912be7a970d..bebc5ee62cbb 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -53,7 +53,8 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev,
u64 dma_direct_get_required_mask(struct device *dev)
{
- u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
+ phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT;
+ u64 max_dma = phys_to_dma_direct(dev, phys);
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
}
@@ -357,7 +358,7 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
{
dma_addr_t dma_addr = paddr;
- if (unlikely(!dma_direct_possible(dev, dma_addr, size))) {
+ if (unlikely(!dma_capable(dev, dma_addr, size))) {
report_addr(dev, dma_addr, size);
return DMA_MAPPING_ERROR;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a690905247fe..24941a001438 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -93,11 +93,11 @@ static void remote_function(void *data)
* @info: the function call argument
*
* Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
+ * be on the current CPU, which just calls the function directly. This will
+ * retry due to any failures in smp_call_function_single(), such as if the
+ * task_cpu() goes offline concurrently.
*
- * returns: @func return value, or
- * -ESRCH - when the process isn't running
- * -EAGAIN - when the process moved away
+ * returns @func return value or -ESRCH when the process isn't running
*/
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
@@ -110,11 +110,16 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
};
int ret;
- do {
- ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
- if (!ret)
- ret = data.ret;
- } while (ret == -EAGAIN);
+ for (;;) {
+ ret = smp_call_function_single(task_cpu(p), remote_function,
+ &data, 1);
+ ret = !ret ? data.ret : -EAGAIN;
+
+ if (ret != -EAGAIN)
+ break;
+
+ cond_resched();
+ }
return ret;
}
@@ -1103,7 +1108,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
raw_spin_lock_init(&cpuctx->hrtimer_lock);
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
timer->function = perf_mux_hrtimer_handler;
}
@@ -3685,11 +3690,23 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
perf_event_groups_insert(&ctx->flexible_groups, event);
}
+/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
-ctx_first_active(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_context *ctx)
{
- return list_first_entry_or_null(&ctx->flexible_active,
- struct perf_event, active_list);
+ struct perf_event *event;
+
+ /* pick the first active flexible event */
+ event = list_first_entry_or_null(&ctx->flexible_active,
+ struct perf_event, active_list);
+
+ /* if no active flexible event, pick the first event */
+ if (!event) {
+ event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
+ typeof(*event), group_node);
+ }
+
+ return event;
}
static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
@@ -3721,9 +3738,9 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
perf_pmu_disable(cpuctx->ctx.pmu);
if (task_rotate)
- task_event = ctx_first_active(ctx);
+ task_event = ctx_event_to_rotate(ctx);
if (cpu_rotate)
- cpu_event = ctx_first_active(&cpuctx->ctx);
+ cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
/*
* As per the order given at ctx_resched() first 'pop' task flexible
@@ -5510,7 +5527,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
perf_pmu_output_stop(event);
/* now it's safe to free the pages */
- atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
+ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
/* this has to be the last one */
@@ -5583,7 +5600,8 @@ again:
* undo the VM accounting.
*/
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
+ &mmap_user->locked_vm);
atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
free_uid(mmap_user);
@@ -5727,8 +5745,20 @@ accounting:
user_locked = atomic_long_read(&user->locked_vm) + user_extra;
- if (user_locked > user_lock_limit)
+ if (user_locked <= user_lock_limit) {
+ /* charge all to locked_vm */
+ } else if (atomic_long_read(&user->locked_vm) >= user_lock_limit) {
+ /* charge all to pinned_vm */
+ extra = user_extra;
+ user_extra = 0;
+ } else {
+ /*
+ * charge locked_vm until it hits user_lock_limit;
+ * charge the rest from pinned_vm
+ */
extra = user_locked - user_lock_limit;
+ user_extra -= extra;
+ }
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
@@ -6419,9 +6449,12 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe __get_user_pages_fast first.
* If failed, leave phys_addr as 0.
*/
- if ((current->mm != NULL) &&
- (__get_user_pages_fast(virt, 1, 0, &p) == 1))
- phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ if (current->mm != NULL) {
+ pagefault_disable();
+ if (__get_user_pages_fast(virt, 1, 0, &p) == 1)
+ phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ pagefault_enable();
+ }
if (p)
put_page(p);
@@ -6931,10 +6964,17 @@ static void perf_event_task_output(struct perf_event *event,
goto out;
task_event->event_id.pid = perf_event_pid(event, task);
- task_event->event_id.ppid = perf_event_pid(event, current);
-
task_event->event_id.tid = perf_event_tid(event, task);
- task_event->event_id.ptid = perf_event_tid(event, current);
+
+ if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
+ task_event->event_id.ppid = perf_event_pid(event,
+ task->real_parent);
+ task_event->event_id.ptid = perf_event_pid(event,
+ task->real_parent);
+ } else { /* PERF_RECORD_FORK */
+ task_event->event_id.ppid = perf_event_pid(event, current);
+ task_event->event_id.ptid = perf_event_tid(event, current);
+ }
task_event->event_id.time = perf_event_clock(event);
@@ -9511,7 +9551,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
if (!is_sampling_event(event))
return;
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
hwc->hrtimer.function = perf_swevent_hrtimer;
/*
@@ -11686,7 +11726,7 @@ inherit_event(struct perf_event *parent_event,
GFP_KERNEL);
if (!child_ctx->task_ctx_data) {
free_event(child_event);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 78f61bfc6b79..673044edd9ad 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -815,10 +815,6 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
if (ret)
goto out;
- /* uprobe_write_opcode() assumes we don't cross page boundary */
- BUG_ON((uprobe->offset & ~PAGE_MASK) +
- UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
-
smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
@@ -1114,6 +1110,15 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
if (offset > i_size_read(inode))
return -EINVAL;
+ /*
+ * This ensures that copy_from_page(), copy_to_page() and
+ * __update_ref_ctr() can't cross page boundary.
+ */
+ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
+ return -EINVAL;
+ if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
+ return -EINVAL;
+
retry:
uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
if (!uprobe)
@@ -1962,6 +1967,9 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
uprobe_opcode_t opcode;
int result;
+ if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
+ return -EINVAL;
+
pagefault_disable();
result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
pagefault_enable();
@@ -2145,7 +2153,7 @@ static void handle_swbp(struct pt_regs *regs)
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
- send_sig(SIGTRAP, current, 0);
+ force_sig(SIGTRAP, current);
} else {
/*
* Either we raced with uprobe_unregister() or we can't
diff --git a/kernel/exit.c b/kernel/exit.c
index a75b6a7f458a..a7bab565eb19 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -161,7 +161,7 @@ static void __exit_signal(struct task_struct *tsk)
* Do this under ->siglock, we can race with another thread
* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
*/
- flush_sigqueue(&tsk->pending);
+ flush_task_sigqueue(tsk);
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
@@ -579,10 +579,6 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
}
write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns)) {
- panic("Attempted to kill init! exitcode=0x%08x\n",
- father->signal->group_exit_code ?: father->exit_code);
- }
list_for_each_entry_safe(p, n, dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
@@ -777,8 +773,12 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
- profile_task_exit(tsk);
- kcov_task_exit(tsk);
+ /*
+ * We can get here from a kernel oops, sometimes with preemption off.
+ * Start by checking for critical errors.
+ * Then fix up important state like USER_DS and preemption.
+ * Then do everything else.
+ */
WARN_ON(blk_needs_flush_plug(tsk));
@@ -796,6 +796,16 @@ void __noreturn do_exit(long code)
*/
set_fs(USER_DS);
+ if (unlikely(in_atomic())) {
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
+ preempt_count_set(PREEMPT_ENABLED);
+ }
+
+ profile_task_exit(tsk);
+ kcov_task_exit(tsk);
+
ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
@@ -833,19 +843,20 @@ void __noreturn do_exit(long code)
raw_spin_lock_irq(&tsk->pi_lock);
raw_spin_unlock_irq(&tsk->pi_lock);
- if (unlikely(in_atomic())) {
- pr_info("note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
- preempt_count_set(PREEMPT_ENABLED);
- }
-
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
sync_mm_rss(tsk->mm);
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
+ /*
+ * If the last thread of global init has exited, panic
+ * immediately to get a useable coredump.
+ */
+ if (unlikely(is_global_init(tsk)))
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ tsk->signal->group_exit_code ?: (int)code);
+
#ifdef CONFIG_POSIX_TIMERS
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
diff --git a/kernel/fork.c b/kernel/fork.c
index cfd1ad92c890..91f13a133be3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -43,6 +43,7 @@
#include <linux/hmm.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/kprobes.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
@@ -685,6 +686,19 @@ void __mmdrop(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(__mmdrop);
+#ifdef CONFIG_PREEMPT_RT_BASE
+/*
+ * RCU callback for delayed mm drop. Not strictly rcu, but we don't
+ * want another facility to make this work.
+ */
+void __mmdrop_delayed(struct rcu_head *rhp)
+{
+ struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
+
+ __mmdrop(mm);
+}
+#endif
+
static void mmdrop_async_fn(struct work_struct *work)
{
struct mm_struct *mm;
@@ -720,12 +734,24 @@ static inline void put_signal_struct(struct signal_struct *sig)
free_signal_struct(sig);
}
+#ifdef CONFIG_PREEMPT_RT_BASE
+static
+#endif
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
+ /*
+ * Remove function-return probe instances associated with this
+ * task and put them back on the free list.
+ */
+ kprobe_flush_task(tsk);
+
+ /* Task is done with its stack. */
+ put_task_stack(tsk);
+
cgroup_free(tsk);
task_numa_free(tsk, true);
security_task_free(tsk);
@@ -736,7 +762,18 @@ void __put_task_struct(struct task_struct *tsk)
if (!profile_handoff_task(tsk))
free_task(tsk);
}
+#ifndef CONFIG_PREEMPT_RT_BASE
EXPORT_SYMBOL_GPL(__put_task_struct);
+#else
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+ struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
+
+ __put_task_struct(tsk);
+
+}
+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
+#endif
void __init __weak arch_task_cache_init(void) { }
@@ -898,6 +935,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
#endif
+ if (orig->cpus_ptr == &orig->cpus_mask)
+ tsk->cpus_ptr = &tsk->cpus_mask;
/*
* One for us, one for whoever does the "release_task()" (usually
@@ -910,6 +949,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
+ tsk->wake_q_sleeper.next = NULL;
account_kernel_stack(tsk, 1);
@@ -1645,6 +1685,9 @@ static void rt_mutex_init_task(struct task_struct *p)
*/
static void posix_cpu_timers_init(struct task_struct *tsk)
{
+#ifdef CONFIG_PREEMPT_RT_BASE
+ tsk->posix_timer_list = NULL;
+#endif
tsk->cputime_expires.prof_exp = 0;
tsk->cputime_expires.virt_exp = 0;
tsk->cputime_expires.sched_exp = 0;
@@ -1889,6 +1932,7 @@ static __latent_entropy struct task_struct *copy_process(
spin_lock_init(&p->alloc_lock);
init_sigpending(&p->pending);
+ p->sigqueue_cache = NULL;
p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
diff --git a/kernel/futex.c b/kernel/futex.c
index 4b5b468c58b6..7d1b6cadd5d7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -379,9 +379,9 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
*/
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
- u32 hash = jhash2((u32*)&key->both.word,
- (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+ u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
key->both.offset);
+
return &futex_queues[hash & (futex_hashsize - 1)];
}
@@ -423,7 +423,7 @@ static void get_futex_key_refs(union futex_key *key)
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- ihold(key->shared.inode); /* implies smp_mb(); (B) */
+ smp_mb(); /* explicit smp_mb(); (B) */
break;
case FUT_OFF_MMSHARED:
futex_get_mm(key); /* implies smp_mb(); (B) */
@@ -457,7 +457,6 @@ static void drop_futex_key_refs(union futex_key *key)
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- iput(key->shared.inode);
break;
case FUT_OFF_MMSHARED:
mmdrop(key->private.mm);
@@ -470,6 +469,46 @@ enum futex_access {
FUTEX_WRITE
};
+/*
+ * Generate a machine wide unique identifier for this inode.
+ *
+ * This relies on u64 not wrapping in the life-time of the machine; which with
+ * 1ns resolution means almost 585 years.
+ *
+ * This further relies on the fact that a well formed program will not unmap
+ * the file while it has a (shared) futex waiting on it. This mapping will have
+ * a file reference which pins the mount and inode.
+ *
+ * If for some reason an inode gets evicted and read back in again, it will get
+ * a new sequence number and will _NOT_ match, even though it is the exact same
+ * file.
+ *
+ * It is important that match_futex() will never have a false-positive, esp.
+ * for PI futexes that can mess up the state. The above argues that false-negatives
+ * are only possible for malformed programs.
+ */
+static u64 get_inode_sequence_number(struct inode *inode)
+{
+ static atomic64_t i_seq;
+ u64 old;
+
+ /* Does the inode already have a sequence number? */
+ old = atomic64_read(&inode->i_sequence);
+ if (likely(old))
+ return old;
+
+ for (;;) {
+ u64 new = atomic64_add_return(1, &i_seq);
+ if (WARN_ON_ONCE(!new))
+ continue;
+
+ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
+ if (old)
+ return old;
+ return new;
+ }
+}
+
/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
@@ -482,9 +521,15 @@ enum futex_access {
*
* The key words are stored in @key on success.
*
- * For shared mappings, it's (page->index, file_inode(vma->vm_file),
- * offset_within_page). For private mappings, it's (uaddr, current->mm).
- * We can usually work out the index without swapping in the page.
+ * For shared mappings (when @fshared), the key is:
+ * ( inode->i_sequence, page->index, offset_within_page )
+ * [ also see get_inode_sequence_number() ]
+ *
+ * For private mappings (or when !@fshared), the key is:
+ * ( current->mm, address, 0 )
+ *
+ * This allows (cross process, where applicable) identification of the futex
+ * without keeping the page pinned for the duration of the FUTEX_WAIT.
*
* lock_page() might sleep, the caller should not hold a spinlock.
*/
@@ -624,8 +669,6 @@ again:
key->private.mm = mm;
key->private.address = address;
- get_futex_key_refs(key); /* implies smp_mb(); (B) */
-
} else {
struct inode *inode;
@@ -657,40 +700,14 @@ again:
goto again;
}
- /*
- * Take a reference unless it is about to be freed. Previously
- * this reference was taken by ihold under the page lock
- * pinning the inode in place so i_lock was unnecessary. The
- * only way for this check to fail is if the inode was
- * truncated in parallel which is almost certainly an
- * application bug. In such a case, just retry.
- *
- * We are not calling into get_futex_key_refs() in file-backed
- * cases, therefore a successful atomic_inc return below will
- * guarantee that get_futex_key() will still imply smp_mb(); (B).
- */
- if (!atomic_inc_not_zero(&inode->i_count)) {
- rcu_read_unlock();
- put_page(page);
-
- goto again;
- }
-
- /* Should be impossible but lets be paranoid for now */
- if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
- err = -EFAULT;
- rcu_read_unlock();
- iput(inode);
-
- goto out;
- }
-
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = inode;
+ key->shared.i_seq = get_inode_sequence_number(inode);
key->shared.pgoff = basepage_index(tail);
rcu_read_unlock();
}
+ get_futex_key_refs(key); /* implies smp_mb(); (B) */
+
out:
put_page(page);
return err;
@@ -910,7 +927,9 @@ void exit_pi_state_list(struct task_struct *curr)
if (head->next != next) {
/* retain curr->pi_lock for the loop invariant */
raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&curr->pi_lock);
spin_unlock(&hb->lock);
+ raw_spin_lock_irq(&curr->pi_lock);
put_pi_state(pi_state);
continue;
}
@@ -1465,6 +1484,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
struct task_struct *new_owner;
bool postunlock = false;
DEFINE_WAKE_Q(wake_q);
+ DEFINE_WAKE_Q(wake_sleeper_q);
int ret = 0;
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
@@ -1524,13 +1544,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
pi_state->owner = new_owner;
raw_spin_unlock(&new_owner->pi_lock);
- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
-
+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
+ &wake_sleeper_q);
out_unlock:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
if (postunlock)
- rt_mutex_postunlock(&wake_q);
+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
return ret;
}
@@ -2137,6 +2157,16 @@ retry_private:
requeue_pi_wake_futex(this, &key2, hb2);
drop_count++;
continue;
+ } else if (ret == -EAGAIN) {
+ /*
+ * Waiter was woken by timeout or
+ * signal and has set pi_blocked_on to
+ * PI_WAKEUP_INPROGRESS before we
+ * tried to enqueue it on the rtmutex.
+ */
+ this->pi_state = NULL;
+ put_pi_state(pi_state);
+ continue;
} else if (ret) {
/*
* rt_mutex_start_proxy_lock() detected a
@@ -2692,10 +2722,9 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
if (abs_time) {
to = &timeout;
- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
+ hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS, current);
hrtimer_set_expires_range_ns(&to->timer, *abs_time,
current->timer_slack_ns);
}
@@ -2794,9 +2823,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
if (time) {
to = &timeout;
- hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
+ hrtimer_init_sleeper_on_stack(to, CLOCK_REALTIME,
+ HRTIMER_MODE_ABS, current);
hrtimer_set_expires(&to->timer, *time);
}
@@ -2851,7 +2879,7 @@ retry_private:
goto no_block;
}
- rt_mutex_init_waiter(&rt_waiter);
+ rt_mutex_init_waiter(&rt_waiter, false);
/*
* On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
@@ -2867,6 +2895,14 @@ retry_private:
* before __rt_mutex_start_proxy_lock() is done.
*/
raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ /*
+ * the migrate_disable() here disables migration in the in_atomic() fast
+ * path which is enabled again in the following spin_unlock(). We have
+ * one migrate_disable() pending in the slow-path which is reversed
+ * after the raw_spin_unlock_irq() where we leave the atomic context.
+ */
+ migrate_disable();
+
spin_unlock(q.lock_ptr);
/*
* __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
@@ -2875,6 +2911,7 @@ retry_private:
*/
ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+ migrate_enable();
if (ret) {
if (ret == 1)
@@ -3023,10 +3060,19 @@ retry:
* rt_waiter. Also see the WARN in wake_futex_pi().
*/
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ /*
+ * Magic trickery for now to make the RT migrate disable
+ * logic happy. The following spin_unlock() happens with
+ * interrupts disabled so the internal migrate_enable()
+ * won't undo the migrate_disable() which was issued when
+ * locking hb->lock.
+ */
+ migrate_disable();
spin_unlock(&hb->lock);
/* drops pi_state->pi_mutex.wait_lock */
ret = wake_futex_pi(uaddr, uval, pi_state);
+ migrate_enable();
put_pi_state(pi_state);
@@ -3198,7 +3244,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
struct hrtimer_sleeper timeout, *to = NULL;
struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
+ struct futex_hash_bucket *hb, *hb2;
union futex_key key2 = FUTEX_KEY_INIT;
struct futex_q q = futex_q_init;
int res, ret;
@@ -3214,10 +3260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (abs_time) {
to = &timeout;
- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
+ hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS, current);
hrtimer_set_expires_range_ns(&to->timer, *abs_time,
current->timer_slack_ns);
}
@@ -3226,7 +3271,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* The waiter is allocated on our stack, manipulated by the requeue
* code while we sleep on uaddr.
*/
- rt_mutex_init_waiter(&rt_waiter);
+ rt_mutex_init_waiter(&rt_waiter, false);
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
if (unlikely(ret != 0))
@@ -3257,20 +3302,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to);
- spin_lock(&hb->lock);
- ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
- spin_unlock(&hb->lock);
- if (ret)
- goto out_put_keys;
+ /*
+ * On RT we must avoid races with requeue and trying to block
+ * on two mutexes (hb->lock and uaddr2's rtmutex) by
+ * serializing access to pi_blocked_on with pi_lock.
+ */
+ raw_spin_lock_irq(&current->pi_lock);
+ if (current->pi_blocked_on) {
+ /*
+ * We have been requeued or are in the process of
+ * being requeued.
+ */
+ raw_spin_unlock_irq(&current->pi_lock);
+ } else {
+ /*
+ * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
+ * prevents a concurrent requeue from moving us to the
+ * uaddr2 rtmutex. After that we can safely acquire
+ * (and possibly block on) hb->lock.
+ */
+ current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
+ raw_spin_unlock_irq(&current->pi_lock);
+
+ spin_lock(&hb->lock);
+
+ /*
+ * Clean up pi_blocked_on. We might leak it otherwise
+ * when we succeeded with the hb->lock in the fast
+ * path.
+ */
+ raw_spin_lock_irq(&current->pi_lock);
+ current->pi_blocked_on = NULL;
+ raw_spin_unlock_irq(&current->pi_lock);
+
+ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+ spin_unlock(&hb->lock);
+ if (ret)
+ goto out_put_keys;
+ }
/*
- * In order for us to be here, we know our q.key == key2, and since
- * we took the hb->lock above, we also know that futex_requeue() has
- * completed and we no longer have to concern ourselves with a wakeup
- * race with the atomic proxy lock acquisition by the requeue code. The
- * futex_requeue dropped our key1 reference and incremented our key2
- * reference count.
+ * In order to be here, we have either been requeued, are in
+ * the process of being requeued, or requeue successfully
+ * acquired uaddr2 on our behalf. If pi_blocked_on was
+ * non-null above, we may be racing with a requeue. Do not
+ * rely on q->lock_ptr to be hb2->lock until after blocking on
+ * hb->lock or hb2->lock. The futex_requeue dropped our key1
+ * reference and incremented our key2 reference count.
*/
+ hb2 = hash_futex(&key2);
/* Check if the requeue code acquired the second futex for us. */
if (!q.rt_waiter) {
@@ -3279,7 +3359,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* did a lock-steal - fix up the PI-state in that case.
*/
if (q.pi_state && (q.pi_state->owner != current)) {
- spin_lock(q.lock_ptr);
+ spin_lock(&hb2->lock);
+ BUG_ON(&hb2->lock != q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
pi_state = q.pi_state;
@@ -3290,7 +3371,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* the requeue_pi() code acquired for us.
*/
put_pi_state(q.pi_state);
- spin_unlock(q.lock_ptr);
+ spin_unlock(&hb2->lock);
}
} else {
struct rt_mutex *pi_mutex;
@@ -3304,7 +3385,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
- spin_lock(q.lock_ptr);
+ spin_lock(&hb2->lock);
+ BUG_ON(&hb2->lock != q.lock_ptr);
if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
ret = 0;
@@ -3445,11 +3527,16 @@ err_unlock:
return ret;
}
+/* Constants for the pending_op argument of handle_futex_death */
+#define HANDLE_DEATH_PENDING true
+#define HANDLE_DEATH_LIST false
+
/*
* Process a futex-list entry, check whether it's owned by the
* dying task, and do notification if so:
*/
-static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
+ bool pi, bool pending_op)
{
u32 uval, uninitialized_var(nval), mval;
int err;
@@ -3462,6 +3549,42 @@ retry:
if (get_user(uval, uaddr))
return -1;
+ /*
+ * Special case for regular (non PI) futexes. The unlock path in
+ * user space has two race scenarios:
+ *
+ * 1. The unlock path releases the user space futex value and
+ * before it can execute the futex() syscall to wake up
+ * waiters it is killed.
+ *
+ * 2. A woken up waiter is killed before it can acquire the
+ * futex in user space.
+ *
+ * In both cases the TID validation below prevents a wakeup of
+ * potential waiters which can cause these waiters to block
+ * forever.
+ *
+ * In both cases the following conditions are met:
+ *
+ * 1) task->robust_list->list_op_pending != NULL
+ * @pending_op == true
+ * 2) User space futex value == 0
+ * 3) Regular futex: @pi == false
+ *
+ * If these conditions are met, it is safe to attempt waking up a
+ * potential waiter without touching the user space futex value and
+ * trying to set the OWNER_DIED bit. The user space futex value is
+ * uncontended and the rest of the user space mutex state is
+ * consistent, so a woken waiter will just take over the
+ * uncontended futex. Setting the OWNER_DIED bit would create
+ * inconsistent state and malfunction of the user space owner died
+ * handling.
+ */
+ if (pending_op && !pi && !uval) {
+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ return 0;
+ }
+
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
return 0;
@@ -3581,10 +3704,11 @@ void exit_robust_list(struct task_struct *curr)
* A pending lock might already be on the list, so
* don't process it twice:
*/
- if (entry != pending)
+ if (entry != pending) {
if (handle_futex_death((void __user *)entry + futex_offset,
- curr, pi))
+ curr, pi, HANDLE_DEATH_LIST))
return;
+ }
if (rc)
return;
entry = next_entry;
@@ -3598,9 +3722,10 @@ void exit_robust_list(struct task_struct *curr)
cond_resched();
}
- if (pending)
+ if (pending) {
handle_futex_death((void __user *)pending + futex_offset,
- curr, pip);
+ curr, pip, HANDLE_DEATH_PENDING);
+ }
}
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
@@ -3777,7 +3902,8 @@ void compat_exit_robust_list(struct task_struct *curr)
if (entry != pending) {
void __user *uaddr = futex_uaddr(entry, futex_offset);
- if (handle_futex_death(uaddr, curr, pi))
+ if (handle_futex_death(uaddr, curr, pi,
+ HANDLE_DEATH_LIST))
return;
}
if (rc)
@@ -3796,7 +3922,7 @@ void compat_exit_robust_list(struct task_struct *curr)
if (pending) {
void __user *uaddr = futex_uaddr(pending, futex_offset);
- handle_futex_death(uaddr, curr, pip);
+ handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
}
}
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 6e40ff6be083..291e0797125b 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -109,9 +109,9 @@ static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
{
struct gcov_iterator *iter = data;
+ (*pos)++;
if (gcov_iter_next(iter))
return NULL;
- (*pos)++;
return iter;
}
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index c1eccd4f6520..d44c8fd17609 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -114,6 +114,7 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
BIT_MASK_DESCR(IRQD_CAN_RESERVE),
+ BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
@@ -205,8 +206,15 @@ static ssize_t irq_debug_write(struct file *file, const char __user *user_buf,
chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
- if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) {
- /* Can't do level nor NMIs, sorry */
+ /*
+ * Don't allow injection when the interrupt is:
+ * - Level or NMI type
+ * - not activated
+ * - replaying already
+ */
+ if (irq_settings_is_level(desc) ||
+ !irqd_is_activated(&desc->irq_data) ||
+ (desc->istate & (IRQS_NMI | IRQS_REPLAY))) {
err = -EINVAL;
} else {
desc->istate |= IRQS_PENDING;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a4ace611f47f..5c8047979569 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -185,10 +185,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval;
unsigned int flags = 0;
+ struct pt_regs *regs = get_irq_regs();
+ u64 ip = regs ? instruction_pointer(regs) : 0;
retval = __handle_irq_event_percpu(desc, &flags);
- add_interrupt_randomness(desc->irq_data.irq, flags);
+#ifdef CONFIG_PREEMPT_RT_FULL
+ desc->random_ip = ip;
+#else
+ add_interrupt_randomness(desc->irq_data.irq, flags, ip);
+#endif
if (!noirqdebug)
note_interrupt(desc, retval);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 3a948f41ab00..207bac6ad4c3 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -128,8 +128,6 @@ static inline void unregister_handler_proc(unsigned int irq,
extern bool irq_can_set_affinity_usr(unsigned int irq);
-extern int irq_select_affinity_usr(unsigned int irq);
-
extern void irq_set_thread_affinity(struct irq_desc *desc);
extern int irq_do_set_affinity(struct irq_data *data,
@@ -424,6 +422,10 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
{
return desc->pending_mask;
}
+static inline bool handle_enforce_irqctx(struct irq_data *data)
+{
+ return irqd_is_handle_enforce_irqctx(data);
+}
bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
#else /* CONFIG_GENERIC_PENDING_IRQ */
static inline bool irq_can_move_pcntxt(struct irq_data *data)
@@ -450,6 +452,10 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
{
return false;
}
+static inline bool handle_enforce_irqctx(struct irq_data *data)
+{
+ return false;
+}
#endif /* !CONFIG_GENERIC_PENDING_IRQ */
#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9be995fc3c5a..6a27603fe1cf 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -638,9 +638,15 @@ void irq_init_desc(unsigned int irq)
int generic_handle_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_data *data;
if (!desc)
return -EINVAL;
+
+ data = irq_desc_get_irq_data(desc);
+ if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data)))
+ return -EPERM;
+
generic_handle_irq_desc(desc);
return 0;
}
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index a453e229f99c..ee549d1007a4 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1291,6 +1291,11 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
unsigned int irq_base,
unsigned int nr_irqs, void *arg)
{
+ if (!domain->ops->alloc) {
+ pr_debug("domain->ops->alloc() is NULL\n");
+ return -ENOSYS;
+ }
+
return domain->ops->alloc(domain, irq_base, nr_irqs, arg);
}
@@ -1328,11 +1333,6 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
return -EINVAL;
}
- if (!domain->ops->alloc) {
- pr_debug("domain->ops->alloc() is NULL\n");
- return -ENOSYS;
- }
-
if (realloc && irq_base >= 0) {
virq = irq_base;
} else {
@@ -1457,6 +1457,7 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
if (rv) {
/* Restore the original irq_data. */
*root_irq_data = *child_irq_data;
+ kfree(child_irq_data);
goto error;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e8f7f179bf77..e3326af3b460 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -24,6 +24,7 @@
#include "internals.h"
#ifdef CONFIG_IRQ_FORCED_THREADING
+# ifndef CONFIG_PREEMPT_RT_BASE
__read_mostly bool force_irqthreads;
EXPORT_SYMBOL_GPL(force_irqthreads);
@@ -33,6 +34,7 @@ static int __init setup_forced_irqthreads(char *arg)
return 0;
}
early_param("threadirqs", setup_forced_irqthreads);
+# endif
#endif
static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
@@ -194,9 +196,9 @@ void irq_set_thread_affinity(struct irq_desc *desc)
set_bit(IRQTF_AFFINITY, &action->thread_flags);
}
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
static void irq_validate_effective_affinity(struct irq_data *data)
{
-#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
const struct cpumask *m = irq_data_get_effective_affinity_mask(data);
struct irq_chip *chip = irq_data_get_irq_chip(data);
@@ -204,9 +206,19 @@ static void irq_validate_effective_affinity(struct irq_data *data)
return;
pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
chip->name, data->irq);
-#endif
}
+static inline void irq_init_effective_affinity(struct irq_data *data,
+ const struct cpumask *mask)
+{
+ cpumask_copy(irq_data_get_effective_affinity_mask(data), mask);
+}
+#else
+static inline void irq_validate_effective_affinity(struct irq_data *data) { }
+static inline void irq_init_effective_affinity(struct irq_data *data,
+ const struct cpumask *mask) { }
+#endif
+
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
{
@@ -265,6 +277,30 @@ static int irq_try_set_affinity(struct irq_data *data,
return ret;
}
+static bool irq_set_affinity_deactivated(struct irq_data *data,
+ const struct cpumask *mask, bool force)
+{
+ struct irq_desc *desc = irq_data_to_desc(data);
+
+ /*
+ * Handle irq chips which can handle affinity only in activated
+ * state correctly
+ *
+ * If the interrupt is not yet activated, just store the affinity
+ * mask and do not call the chip driver at all. On activation the
+ * driver has to make sure anyway that the interrupt is in a
+ * useable state so startup works.
+ */
+ if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) ||
+ irqd_is_activated(data) || !irqd_affinity_on_activate(data))
+ return false;
+
+ cpumask_copy(desc->irq_common_data.affinity, mask);
+ irq_init_effective_affinity(data, mask);
+ irqd_set(data, IRQD_AFFINITY_SET);
+ return true;
+}
+
int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
bool force)
{
@@ -275,6 +311,9 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
+ if (irq_set_affinity_deactivated(data, mask, force))
+ return 0;
+
if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) {
ret = irq_try_set_affinity(data, mask, force);
} else {
@@ -284,7 +323,11 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (desc->affinity_notify) {
kref_get(&desc->affinity_notify->kref);
- schedule_work(&desc->affinity_notify->work);
+ if (!schedule_work(&desc->affinity_notify->work)) {
+ /* Work was already scheduled, drop our extra ref */
+ kref_put(&desc->affinity_notify->kref,
+ desc->affinity_notify->release);
+ }
}
irqd_set(data, IRQD_AFFINITY_SET);
@@ -384,7 +427,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
raw_spin_unlock_irqrestore(&desc->lock, flags);
if (old_notify) {
- cancel_work_sync(&old_notify->work);
+ if (cancel_work_sync(&old_notify->work)) {
+ /* Pending work had a ref, put that one too */
+ kref_put(&old_notify->kref, old_notify->release);
+ }
kref_put(&old_notify->kref, old_notify->release);
}
@@ -442,23 +488,9 @@ int irq_setup_affinity(struct irq_desc *desc)
{
return irq_select_affinity(irq_desc_get_irq(desc));
}
-#endif
+#endif /* CONFIG_AUTO_IRQ_AFFINITY */
+#endif /* CONFIG_SMP */
-/*
- * Called when a bogus affinity is set via /proc/irq
- */
-int irq_select_affinity_usr(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- int ret;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- ret = irq_setup_affinity(desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return ret;
-}
-#endif
/**
* irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
@@ -1099,6 +1131,12 @@ static int irq_thread(void *data)
if (action_ret == IRQ_WAKE_THREAD)
irq_wake_secondary(desc, action);
+#ifdef CONFIG_PREEMPT_RT_FULL
+ migrate_disable();
+ add_interrupt_randomness(action->irq, 0,
+ desc->random_ip ^ (unsigned long) action);
+ migrate_enable();
+#endif
wake_threads_waitq(desc);
}
@@ -2682,7 +2720,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
* This call sets the internal irqchip state of an interrupt,
* depending on the value of @which.
*
- * This function should be called with preemption disabled if the
+ * This function should be called with migration disabled if the
* interrupt controller has per-cpu registers.
*/
int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index ad26fbcfbfc8..eb95f6106a1e 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -453,8 +453,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
continue;
irq_data = irq_domain_get_irq_data(domain, desc->irq);
- if (!can_reserve)
+ if (!can_reserve) {
irqd_clr_can_reserve(irq_data);
+ if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+ irqd_set_msi_nomask_quirk(irq_data);
+ }
ret = irq_domain_activate_irq(irq_data, can_reserve);
if (ret)
goto cleanup;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index da9addb8d655..e8c655b7a430 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -115,6 +115,28 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
return show_irq_affinity(AFFINITY_LIST, m);
}
+#ifndef CONFIG_AUTO_IRQ_AFFINITY
+static inline int irq_select_affinity_usr(unsigned int irq)
+{
+ /*
+ * If the interrupt is started up already then this fails. The
+ * interrupt is assigned to an online CPU already. There is no
+ * point to move it around randomly. Tell user space that the
+ * selected mask is bogus.
+ *
+ * If not then any change to the affinity is pointless because the
+ * startup code invokes irq_setup_affinity() which will select
+ * a online CPU anyway.
+ */
+ return -EINVAL;
+}
+#else
+/* ALPHA magic affinity auto selector. Keep it for historical reasons. */
+static inline int irq_select_affinity_usr(unsigned int irq)
+{
+ return irq_select_affinity(irq);
+}
+#endif
static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 98c04ca5fa43..5064b13b80d6 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -72,8 +72,9 @@ void check_irq_resend(struct irq_desc *desc)
desc->istate &= ~IRQS_PENDING;
desc->istate |= IRQS_REPLAY;
- if (!desc->irq_data.chip->irq_retrigger ||
- !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
+ if ((!desc->irq_data.chip->irq_retrigger ||
+ !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) &&
+ !handle_enforce_irqctx(&desc->irq_data)) {
#ifdef CONFIG_HARDIRQS_SW_RESEND
unsigned int irq = irq_desc_get_irq(desc);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 2ed97a7c9b2a..b116166b0ae8 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -442,6 +442,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
static int __init irqfixup_setup(char *str)
{
+#ifdef CONFIG_PREEMPT_RT_BASE
+ pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
+ return 1;
+#endif
irqfixup = 1;
printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
printk(KERN_WARNING "This may impact system performance.\n");
@@ -454,6 +458,10 @@ module_param(irqfixup, int, 0644);
static int __init irqpoll_setup(char *str)
{
+#ifdef CONFIG_PREEMPT_RT_BASE
+ pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
+ return 1;
+#endif
irqfixup = 2;
printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
"enabled\n");
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 90c735da15d0..19d2fad379ee 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -267,6 +267,23 @@ void irq_timings_disable(void)
#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */
#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */
+/*
+ * Number of elements in the circular buffer: If it happens it was
+ * flushed before, then the number of elements could be smaller than
+ * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is
+ * used as we wrapped. The index begins from zero when we did not
+ * wrap. That could be done in a nicer way with the proper circular
+ * array structure type but with the cost of extra computation in the
+ * interrupt handler hot path. We choose efficiency.
+ */
+#define for_each_irqts(i, irqts) \
+ for (i = irqts->count < IRQ_TIMINGS_SIZE ? \
+ 0 : irqts->count & IRQ_TIMINGS_MASK, \
+ irqts->count = min(IRQ_TIMINGS_SIZE, \
+ irqts->count); \
+ irqts->count > 0; irqts->count--, \
+ i = (i + 1) & IRQ_TIMINGS_MASK)
+
struct irqt_stat {
u64 last_ts;
u64 ema_time[PREDICTION_BUFFER_SIZE];
@@ -297,7 +314,16 @@ static u64 irq_timings_ema_new(u64 value, u64 ema_old)
static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
{
- int i;
+ int period;
+
+ /*
+ * Move the beginning pointer to the end minus the max period x 3.
+ * We are at the point we can begin searching the pattern
+ */
+ buffer = &buffer[len - (period_max * 3)];
+
+ /* Adjust the length to the maximum allowed period x 3 */
+ len = period_max * 3;
/*
* The buffer contains the suite of intervals, in a ilog2
@@ -306,21 +332,45 @@ static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
* period beginning at the end of the buffer. We do that for
* each suffix.
*/
- for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) {
+ for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) {
- int *begin = &buffer[len - (i * 3)];
- int *ptr = begin;
+ /*
+ * The first comparison always succeed because the
+ * suffix is deduced from the first n-period bytes of
+ * the buffer and we compare the initial suffix with
+ * itself, so we can skip the first iteration.
+ */
+ int idx = period;
+ size_t size = period;
/*
* We look if the suite with period 'i' repeat
* itself. If it is truncated at the end, as it
* repeats we can use the period to find out the next
- * element.
+ * element with the modulo.
*/
- while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
- ptr += i;
- if (ptr >= &buffer[len])
- return begin[((i * 3) % i)];
+ while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
+
+ /*
+ * Move the index in a period basis
+ */
+ idx += size;
+
+ /*
+ * If this condition is reached, all previous
+ * memcmp were successful, so the period is
+ * found.
+ */
+ if (idx == len)
+ return buffer[len % period];
+
+ /*
+ * If the remaining elements to compare are
+ * smaller than the period, readjust the size
+ * of the comparison for the last iteration.
+ */
+ if (len - idx < period)
+ size = len - idx;
}
}
@@ -493,11 +543,7 @@ u64 irq_timings_next_event(u64 now)
* model while decrementing the counter because we consume the
* data from our circular buffer.
*/
-
- i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
- irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
-
- for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
+ for_each_irqts(i, irqts) {
irq = irq_timing_decode(irqts->values[i], &ts);
s = idr_find(&irqt_stats, irq);
if (s)
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index d42acaf81886..716fa93f0944 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -18,6 +18,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
+#include <linux/interrupt.h>
#include <asm/processor.h>
@@ -60,13 +61,19 @@ void __weak arch_irq_work_raise(void)
/* Enqueue on current CPU, work must already be claimed and preempt disabled */
static void __irq_work_queue_local(struct irq_work *work)
{
+ struct llist_head *list;
+ bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
+
+ lazy_work = work->flags & IRQ_WORK_LAZY;
+
/* If the work is "lazy", handle it from next tick if any */
- if (work->flags & IRQ_WORK_LAZY) {
- if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
- tick_nohz_tick_stopped())
- arch_irq_work_raise();
- } else {
- if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
+ if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
+ list = this_cpu_ptr(&lazy_list);
+ else
+ list = this_cpu_ptr(&raised_list);
+
+ if (llist_add(&work->llnode, list)) {
+ if (!lazy_work || tick_nohz_tick_stopped())
arch_irq_work_raise();
}
}
@@ -108,9 +115,16 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
preempt_disable();
if (cpu != smp_processor_id()) {
+ struct llist_head *list;
+
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
- if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
+ list = &per_cpu(lazy_list, cpu);
+ else
+ list = &per_cpu(raised_list, cpu);
+
+ if (llist_add(&work->llnode, list))
arch_send_call_function_single_ipi(cpu);
} else {
__irq_work_queue_local(work);
@@ -129,9 +143,8 @@ bool irq_work_needs_cpu(void)
raised = this_cpu_ptr(&raised_list);
lazy = this_cpu_ptr(&lazy_list);
- if (llist_empty(raised) || arch_irq_work_has_interrupt())
- if (llist_empty(lazy))
- return false;
+ if (llist_empty(raised) && llist_empty(lazy))
+ return false;
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
@@ -145,8 +158,12 @@ static void irq_work_run_list(struct llist_head *list)
struct llist_node *llnode;
unsigned long flags;
+#ifndef CONFIG_PREEMPT_RT_FULL
+ /*
+ * nort: On RT IRQ-work may run in SOFTIRQ context.
+ */
BUG_ON(!irqs_disabled());
-
+#endif
if (llist_empty(list))
return;
@@ -178,7 +195,16 @@ static void irq_work_run_list(struct llist_head *list)
void irq_work_run(void)
{
irq_work_run_list(this_cpu_ptr(&raised_list));
- irq_work_run_list(this_cpu_ptr(&lazy_list));
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
+ /*
+ * NOTE: we raise softirq via IPI for safety,
+ * and execute in irq_work_tick() to move the
+ * overhead from hard to soft irq context.
+ */
+ if (!llist_empty(this_cpu_ptr(&lazy_list)))
+ raise_softirq(TIMER_SOFTIRQ);
+ } else
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
}
EXPORT_SYMBOL_GPL(irq_work_run);
@@ -188,8 +214,17 @@ void irq_work_tick(void)
if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
irq_work_run_list(raised);
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+}
+
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
+void irq_work_tick_soft(void)
+{
irq_work_run_list(this_cpu_ptr(&lazy_list));
}
+#endif
/*
* Synchronize against the irq_work @entry, ensures the entry is not
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 136ce049c4ad..61f9d781f70a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -645,19 +645,20 @@ static inline int kallsyms_for_perf(void)
* Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
* block even that).
*/
-int kallsyms_show_value(void)
+bool kallsyms_show_value(const struct cred *cred)
{
switch (kptr_restrict) {
case 0:
if (kallsyms_for_perf())
- return 1;
+ return true;
/* fallthrough */
case 1:
- if (has_capability_noaudit(current, CAP_SYSLOG))
- return 1;
+ if (security_capable(cred, &init_user_ns, CAP_SYSLOG,
+ CAP_OPT_NOAUDIT) == 0)
+ return true;
/* fallthrough */
default:
- return 0;
+ return false;
}
}
@@ -674,7 +675,11 @@ static int kallsyms_open(struct inode *inode, struct file *file)
return -ENOMEM;
reset_iter(iter, 0);
- iter->show_value = kallsyms_show_value();
+ /*
+ * Instead of checking this on every s_show() call, cache
+ * the result here at open time.
+ */
+ iter->show_value = kallsyms_show_value(file->f_cred);
return 0;
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 15d70a90b50d..e7cef4e2e01f 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -972,7 +972,6 @@ void crash_kexec(struct pt_regs *regs)
old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
if (old_cpu == PANIC_CPU_INVALID) {
/* This is the 1st CPU which comes here, so go ahead. */
- printk_safe_flush_on_panic();
__crash_kexec(regs);
/*
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bc6addd9152b..a2de58de6ab6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -120,7 +120,7 @@ out:
* invoke it.
*
* If module auto-loading support is disabled then this function
- * becomes a no-operation.
+ * simply returns -ENOENT.
*/
int __request_module(bool wait, const char *fmt, ...)
{
@@ -137,7 +137,7 @@ int __request_module(bool wait, const char *fmt, ...)
WARN_ON_ONCE(wait && current_is_async());
if (!modprobe_path[0])
- return 0;
+ return -ENOENT;
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1010bde1146b..a0dfc7d425f0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -326,7 +326,8 @@ struct kprobe *get_kprobe(void *addr)
struct kprobe *p;
head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
- hlist_for_each_entry_rcu(p, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist,
+ lockdep_is_held(&kprobe_mutex)) {
if (p->addr == addr)
return p;
}
@@ -510,6 +511,8 @@ static void do_unoptimize_kprobes(void)
arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
/* Loop free_list for disarming */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
+ /* Switching from detour code to origin */
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
/* Disarm probes if marked disabled */
if (kprobe_disabled(&op->kp))
arch_disarm_kprobe(&op->kp);
@@ -584,11 +587,12 @@ static void kprobe_optimizer(struct work_struct *work)
mutex_unlock(&module_mutex);
mutex_unlock(&text_mutex);
cpus_read_unlock();
- mutex_unlock(&kprobe_mutex);
/* Step 5: Kick optimizer again if needed */
if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
kick_kprobe_optimizer();
+
+ mutex_unlock(&kprobe_mutex);
}
/* Wait for completing optimization and unoptimization */
@@ -610,6 +614,18 @@ void wait_for_kprobe_optimizer(void)
mutex_unlock(&kprobe_mutex);
}
+static bool optprobe_queued_unopt(struct optimized_kprobe *op)
+{
+ struct optimized_kprobe *_op;
+
+ list_for_each_entry(_op, &unoptimizing_list, list) {
+ if (op == _op)
+ return true;
+ }
+
+ return false;
+}
+
/* Optimize kprobe if p is ready to be optimized */
static void optimize_kprobe(struct kprobe *p)
{
@@ -631,17 +647,21 @@ static void optimize_kprobe(struct kprobe *p)
return;
/* Check if it is already optimized. */
- if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
+ if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) {
+ if (optprobe_queued_unopt(op)) {
+ /* This is under unoptimizing. Just dequeue the probe */
+ list_del_init(&op->list);
+ }
return;
+ }
op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
- if (!list_empty(&op->list))
- /* This is under unoptimizing. Just dequeue the probe */
- list_del_init(&op->list);
- else {
- list_add(&op->list, &optimizing_list);
- kick_kprobe_optimizer();
- }
+ /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */
+ if (WARN_ON_ONCE(!list_empty(&op->list)))
+ return;
+
+ list_add(&op->list, &optimizing_list);
+ kick_kprobe_optimizer();
}
/* Short cut to direct unoptimizing */
@@ -649,6 +669,7 @@ static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
lockdep_assert_cpus_held();
arch_unoptimize_kprobe(op);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (kprobe_disabled(&op->kp))
arch_disarm_kprobe(&op->kp);
}
@@ -662,31 +683,33 @@ static void unoptimize_kprobe(struct kprobe *p, bool force)
return; /* This is not an optprobe nor optimized */
op = container_of(p, struct optimized_kprobe, kp);
- if (!kprobe_optimized(p)) {
- /* Unoptimized or unoptimizing case */
- if (force && !list_empty(&op->list)) {
- /*
- * Only if this is unoptimizing kprobe and forced,
- * forcibly unoptimize it. (No need to unoptimize
- * unoptimized kprobe again :)
- */
- list_del_init(&op->list);
- force_unoptimize_kprobe(op);
- }
+ if (!kprobe_optimized(p))
return;
- }
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (!list_empty(&op->list)) {
- /* Dequeue from the optimization queue */
- list_del_init(&op->list);
+ if (optprobe_queued_unopt(op)) {
+ /* Queued in unoptimizing queue */
+ if (force) {
+ /*
+ * Forcibly unoptimize the kprobe here, and queue it
+ * in the freeing list for release afterwards.
+ */
+ force_unoptimize_kprobe(op);
+ list_move(&op->list, &freeing_list);
+ }
+ } else {
+ /* Dequeue from the optimizing queue */
+ list_del_init(&op->list);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ }
return;
}
+
/* Optimized kprobe case */
- if (force)
+ if (force) {
/* Forcibly update the code: this is a special case */
force_unoptimize_kprobe(op);
- else {
+ } else {
list_add(&op->list, &unoptimizing_list);
kick_kprobe_optimizer();
}
@@ -1029,9 +1052,20 @@ static int disarm_kprobe_ftrace(struct kprobe *p)
return ret;
}
#else /* !CONFIG_KPROBES_ON_FTRACE */
-#define prepare_kprobe(p) arch_prepare_kprobe(p)
-#define arm_kprobe_ftrace(p) (-ENODEV)
-#define disarm_kprobe_ftrace(p) (-ENODEV)
+static inline int prepare_kprobe(struct kprobe *p)
+{
+ return arch_prepare_kprobe(p);
+}
+
+static inline int arm_kprobe_ftrace(struct kprobe *p)
+{
+ return -ENODEV;
+}
+
+static inline int disarm_kprobe_ftrace(struct kprobe *p)
+{
+ return -ENODEV;
+}
#endif
/* Arm a kprobe with text_mutex */
@@ -1191,6 +1225,26 @@ __releases(hlist_lock)
}
NOKPROBE_SYMBOL(kretprobe_table_unlock);
+struct kprobe kprobe_busy = {
+ .addr = (void *) get_kprobe,
+};
+
+void kprobe_busy_begin(void)
+{
+ struct kprobe_ctlblk *kcb;
+
+ preempt_disable();
+ __this_cpu_write(current_kprobe, &kprobe_busy);
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+}
+
+void kprobe_busy_end(void)
+{
+ __this_cpu_write(current_kprobe, NULL);
+ preempt_enable();
+}
+
/*
* This function is called from finish_task_switch when task tk becomes dead,
* so that we can recycle any function-return probe instances associated
@@ -1208,6 +1262,8 @@ void kprobe_flush_task(struct task_struct *tk)
/* Early boot. kretprobe_table_locks not yet initialized. */
return;
+ kprobe_busy_begin();
+
INIT_HLIST_HEAD(&empty_rp);
hash = hash_ptr(tk, KPROBE_HASH_BITS);
head = &kretprobe_inst_table[hash];
@@ -1221,6 +1277,8 @@ void kprobe_flush_task(struct task_struct *tk)
hlist_del(&ri->hlist);
kfree(ri);
}
+
+ kprobe_busy_end();
}
NOKPROBE_SYMBOL(kprobe_flush_task);
@@ -2033,6 +2091,13 @@ static void kill_kprobe(struct kprobe *p)
* the original probed function (which will be freed soon) any more.
*/
arch_remove_kprobe(p);
+
+ /*
+ * The module is going away. We should disarm the kprobe which
+ * is using ftrace.
+ */
+ if (kprobe_ftrace(p))
+ disarm_kprobe_ftrace(p);
}
/* Disable one kprobe */
@@ -2290,7 +2355,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p,
else
kprobe_type = "k";
- if (!kallsyms_show_value())
+ if (!kallsyms_show_value(pi->file->f_cred))
addr = NULL;
if (sym)
@@ -2391,7 +2456,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
* If /proc/kallsyms is not showing kernel address, we won't
* show them here either.
*/
- if (!kallsyms_show_value())
+ if (!kallsyms_show_value(m->file->f_cred))
seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL,
(void *)ent->start_addr);
else
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 35859da8bd4f..340f0c0c347a 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -138,6 +138,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
#endif /* CONFIG_CRASH_CORE */
+#if defined(CONFIG_PREEMPT_RT_FULL)
+static ssize_t realtime_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", 1);
+}
+KERNEL_ATTR_RO(realtime);
+#endif
+
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -229,6 +238,9 @@ static struct attribute * kernel_attrs[] = {
&rcu_expedited_attr.attr,
&rcu_normal_attr.attr,
#endif
+#ifdef CONFIG_PREEMPT_RT_FULL
+ &realtime_attr.attr,
+#endif
NULL
};
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 6fe2f333aecb..28a6d53690fb 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -3,7 +3,7 @@
# and is generally not a function of system call inputs.
KCOV_INSTRUMENT := n
-obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o
+obj-y += semaphore.o percpu-rwsem.o
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -12,7 +12,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
endif
+ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
+obj-y += mutex.o
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+endif
+obj-y += rwsem.o
obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
@@ -25,6 +29,10 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
+ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
+obj-y += mutex.o rwsem.o rwsem-xadd.o
+endif
+obj-$(CONFIG_PREEMPT_RT_FULL) += mutex-rt.o rwsem-rt.o rwlock-rt.o
obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4fc8dc30cec0..2b741dae14db 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -421,13 +421,6 @@ static int verbose(struct lock_class *class)
return 0;
}
-/*
- * Stack-trace: tightly packed array of stack backtrace
- * addresses. Protected by the graph_lock.
- */
-unsigned long nr_stack_trace_entries;
-static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
-
static void print_lockdep_off(const char *bug_msg)
{
printk(KERN_DEBUG "%s\n", bug_msg);
@@ -437,6 +430,15 @@ static void print_lockdep_off(const char *bug_msg)
#endif
}
+unsigned long nr_stack_trace_entries;
+
+#ifdef CONFIG_PROVE_LOCKING
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the graph_lock.
+ */
+static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+
static int save_trace(struct lock_trace *trace)
{
unsigned long *entries = stack_trace + nr_stack_trace_entries;
@@ -459,6 +461,7 @@ static int save_trace(struct lock_trace *trace)
return 1;
}
+#endif
unsigned int nr_hardirq_chains;
unsigned int nr_softirq_chains;
@@ -472,6 +475,7 @@ unsigned int max_lockdep_depth;
DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
#endif
+#ifdef CONFIG_PROVE_LOCKING
/*
* Locking printouts:
*/
@@ -489,6 +493,7 @@ static const char *usage_str[] =
#undef LOCKDEP_STATE
[LOCK_USED] = "INITIAL USE",
};
+#endif
const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
{
@@ -734,7 +739,8 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
* Huh! same key, different name? Did someone trample
* on some memory? We're most confused.
*/
- WARN_ON_ONCE(class->name != lock->name);
+ WARN_ON_ONCE(class->name != lock->name &&
+ lock->key != &__lockdep_no_validate__);
return class;
}
}
@@ -1606,9 +1612,11 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
this.class = class;
raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
arch_spin_lock(&lockdep_lock);
ret = __lockdep_count_forward_deps(&this);
arch_spin_unlock(&lockdep_lock);
+ current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
return ret;
@@ -1633,9 +1641,11 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
this.class = class;
raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
arch_spin_lock(&lockdep_lock);
ret = __lockdep_count_backward_deps(&this);
arch_spin_unlock(&lockdep_lock);
+ current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
return ret;
@@ -2819,10 +2829,6 @@ static inline int validate_chain(struct task_struct *curr,
{
return 1;
}
-
-static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
-{
-}
#endif
/*
@@ -2876,12 +2882,10 @@ static void check_chain_key(struct task_struct *curr)
#endif
}
+#ifdef CONFIG_PROVE_LOCKING
static int mark_lock(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit);
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
-
-
static void
print_usage_bug_scenario(struct held_lock *lock)
{
@@ -3369,8 +3373,12 @@ void trace_softirqs_off(unsigned long ip)
debug_atomic_inc(redundant_softirqs_off);
}
-static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
+static int
+mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
{
+ if (!check)
+ goto lock_used;
+
/*
* If non-trylock use in a hardirq or softirq context, then
* mark the lock as used in these contexts:
@@ -3414,6 +3422,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
}
}
+lock_used:
+ /* mark it as used: */
+ if (!mark_lock(curr, hlock, LOCK_USED))
+ return 0;
+
return 1;
}
@@ -3445,35 +3458,6 @@ static int separate_irq_context(struct task_struct *curr,
return 0;
}
-#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
-
-static inline
-int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit)
-{
- WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
- return 1;
-}
-
-static inline int mark_irqflags(struct task_struct *curr,
- struct held_lock *hlock)
-{
- return 1;
-}
-
-static inline unsigned int task_irq_context(struct task_struct *task)
-{
- return 0;
-}
-
-static inline int separate_irq_context(struct task_struct *curr,
- struct held_lock *hlock)
-{
- return 0;
-}
-
-#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
-
/*
* Mark a lock with a usage bit, and validate the state transition:
*/
@@ -3541,6 +3525,27 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return ret;
}
+#else /* CONFIG_PROVE_LOCKING */
+
+static inline int
+mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
+{
+ return 1;
+}
+
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 0;
+}
+
+static inline int separate_irq_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ return 0;
+}
+
+#endif /* CONFIG_PROVE_LOCKING */
+
/*
* Initialize a lock instance's lock-class mapping info:
*/
@@ -3744,11 +3749,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
hlock->pin_count = pin_count;
- if (check && !mark_irqflags(curr, hlock))
- return 0;
-
- /* mark it as used: */
- if (!mark_lock(curr, hlock, LOCK_USED))
+ /* Initialize the lock usage bit */
+ if (!mark_usage(curr, hlock, check))
return 0;
/*
@@ -4230,8 +4232,7 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie
*/
static void check_flags(unsigned long flags)
{
-#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
- defined(CONFIG_TRACE_IRQFLAGS)
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP)
if (!debug_locks)
return;
@@ -4245,6 +4246,7 @@ static void check_flags(unsigned long flags)
}
}
+#ifndef CONFIG_PREEMPT_RT_FULL
/*
* We dont accurately track softirq state in e.g.
* hardirq contexts (such as on 4KSTACKS), so only
@@ -4259,6 +4261,7 @@ static void check_flags(unsigned long flags)
DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
}
}
+#endif
if (!debug_locks)
print_irqtrace_events(current);
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 80a463d31a8d..cc78188b4ffe 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -16,7 +16,6 @@
#include <linux/kthread.h>
#include <linux/sched/rt.h>
#include <linux/spinlock.h>
-#include <linux/rwlock.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/smp.h>
@@ -697,10 +696,10 @@ static void __torture_print_stats(char *page,
if (statp[i].n_lock_fail)
fail = true;
sum += statp[i].n_lock_acquired;
- if (max < statp[i].n_lock_fail)
- max = statp[i].n_lock_fail;
- if (min > statp[i].n_lock_fail)
- min = statp[i].n_lock_fail;
+ if (max < statp[i].n_lock_acquired)
+ max = statp[i].n_lock_acquired;
+ if (min > statp[i].n_lock_acquired)
+ min = statp[i].n_lock_acquired;
}
page += sprintf(page,
"%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
diff --git a/kernel/locking/mutex-rt.c b/kernel/locking/mutex-rt.c
new file mode 100644
index 000000000000..4f81595c0f52
--- /dev/null
+++ b/kernel/locking/mutex-rt.c
@@ -0,0 +1,223 @@
+/*
+ * kernel/rt.c
+ *
+ * Real-Time Preemption Support
+ *
+ * started by Ingo Molnar:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * historic credit for proving that Linux spinlocks can be implemented via
+ * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
+ * and others) who prototyped it on 2.4 and did lots of comparative
+ * research and analysis; TimeSys, for proving that you can implement a
+ * fully preemptible kernel via the use of IRQ threading and mutexes;
+ * Bill Huey for persuasively arguing on lkml that the mutex model is the
+ * right one; and to MontaVista, who ported pmutexes to 2.6.
+ *
+ * This code is a from-scratch implementation and is not based on pmutexes,
+ * but the idea of converting spinlocks to mutexes is used here too.
+ *
+ * lock debugging, locking tree, deadlock detection:
+ *
+ * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ * Released under the General Public License (GPL).
+ *
+ * Includes portions of the generic R/W semaphore implementation from:
+ *
+ * Copyright (c) 2001 David Howells (dhowells@redhat.com).
+ * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
+ * - Derived also from comments by Linus
+ *
+ * Pending ownership of locks and ownership stealing:
+ *
+ * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
+ *
+ * (also by Steven Rostedt)
+ * - Converted single pi_lock to individual task locks.
+ *
+ * By Esben Nielsen:
+ * Doing priority inheritance with help of the scheduler.
+ *
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ * - major rework based on Esben Nielsens initial patch
+ * - replaced thread_info references by task_struct refs
+ * - removed task->pending_owner dependency
+ * - BKL drop/reacquire for semaphore style locks to avoid deadlocks
+ * in the scheduler return path as discussed with Steven Rostedt
+ *
+ * Copyright (C) 2006, Kihon Technologies Inc.
+ * Steven Rostedt <rostedt@goodmis.org>
+ * - debugged and patched Thomas Gleixner's rework.
+ * - added back the cmpxchg to the rework.
+ * - turned atomic require back on for SMP.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/rtmutex.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+#include <linux/plist.h>
+#include <linux/fs.h>
+#include <linux/futex.h>
+#include <linux/hrtimer.h>
+
+#include "rtmutex_common.h"
+
+/*
+ * struct mutex functions
+ */
+void __mutex_do_init(struct mutex *mutex, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
+ lockdep_init_map(&mutex->dep_map, name, key, 0);
+#endif
+ mutex->lock.save_state = 0;
+}
+EXPORT_SYMBOL(__mutex_do_init);
+
+void __lockfunc _mutex_lock(struct mutex *lock)
+{
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(_mutex_lock);
+
+void __lockfunc _mutex_lock_io(struct mutex *lock)
+{
+ int token;
+
+ token = io_schedule_prepare();
+ _mutex_lock(lock);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(_mutex_lock_io);
+
+int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_interruptible);
+
+int __lockfunc _mutex_lock_killable(struct mutex *lock)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_killable);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
+{
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(_mutex_lock_nested);
+
+void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass)
+{
+ int token;
+
+ token = io_schedule_prepare();
+
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
+
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(_mutex_lock_io_nested);
+
+void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
+{
+ mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(_mutex_lock_nest_lock);
+
+int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
+{
+ int ret;
+
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
+ ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
+
+int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_killable_nested);
+#endif
+
+int __lockfunc _mutex_trylock(struct mutex *lock)
+{
+ int ret = __rt_mutex_trylock(&lock->lock);
+
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_trylock);
+
+void __lockfunc _mutex_unlock(struct mutex *lock)
+{
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ __rt_mutex_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(_mutex_unlock);
+
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+ /* dec if we can't possibly hit 0 */
+ if (atomic_add_unless(cnt, -1, 1))
+ return 0;
+ /* we might hit 0, so take the lock */
+ mutex_lock(lock);
+ if (!atomic_dec_and_test(cnt)) {
+ /* when we actually did the dec, we didn't hit 0 */
+ mutex_unlock(lock);
+ return 0;
+ }
+ /* we hit 0, and we hold the lock */
+ return 1;
+}
+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 38fbf9fa7f1b..ba4b151bf451 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -8,6 +8,11 @@
* Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
+ * Adaptive Spinlocks:
+ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
+ * and Peter Morreale,
+ * Adaptive Spinlocks simplification:
+ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
*
* See Documentation/locking/rt-mutex-design.txt for details.
*/
@@ -19,6 +24,8 @@
#include <linux/sched/wake_q.h>
#include <linux/sched/debug.h>
#include <linux/timer.h>
+#include <linux/ww_mutex.h>
+#include <linux/blkdev.h>
#include "rtmutex_common.h"
@@ -136,6 +143,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
}
+static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
+{
+ return waiter && waiter != PI_WAKEUP_INPROGRESS &&
+ waiter != PI_REQUEUE_INPROGRESS;
+}
+
/*
* We can speed up the acquire/release, if there's no debugging state to be
* set up.
@@ -229,7 +242,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
* Only use with rt_mutex_waiter_{less,equal}()
*/
#define task_to_waiter(p) \
- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline, .task = (p) }
static inline int
rt_mutex_waiter_less(struct rt_mutex_waiter *left,
@@ -269,6 +282,27 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
return 1;
}
+#define STEAL_NORMAL 0
+#define STEAL_LATERAL 1
+
+static inline int
+rt_mutex_steal(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int mode)
+{
+ struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
+
+ if (waiter == top_waiter || rt_mutex_waiter_less(waiter, top_waiter))
+ return 1;
+
+ /*
+ * Note that RT tasks are excluded from lateral-steals
+ * to prevent the introduction of an unbounded latency.
+ */
+ if (mode == STEAL_NORMAL || rt_task(waiter->task))
+ return 0;
+
+ return rt_mutex_waiter_equal(waiter, top_waiter);
+}
+
static void
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
@@ -373,6 +407,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
return debug_rt_mutex_detect_deadlock(waiter, chwalk);
}
+static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
+{
+ if (waiter->savestate)
+ wake_up_lock_sleeper(waiter->task);
+ else
+ wake_up_process(waiter->task);
+}
+
/*
* Max number of times we'll walk the boosting chain:
*/
@@ -380,7 +422,8 @@ int max_lock_depth = 1024;
static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
{
- return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
+ return rt_mutex_real_waiter(p->pi_blocked_on) ?
+ p->pi_blocked_on->lock : NULL;
}
/*
@@ -516,7 +559,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* reached or the state of the chain has changed while we
* dropped the locks.
*/
- if (!waiter)
+ if (!rt_mutex_real_waiter(waiter))
goto out_unlock_pi;
/*
@@ -697,13 +740,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* follow here. This is the end of the chain we are walking.
*/
if (!rt_mutex_owner(lock)) {
+ struct rt_mutex_waiter *lock_top_waiter;
+
/*
* If the requeue [7] above changed the top waiter,
* then we need to wake the new top waiter up to try
* to get the lock.
*/
- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
- wake_up_process(rt_mutex_top_waiter(lock)->task);
+ lock_top_waiter = rt_mutex_top_waiter(lock);
+ if (prerequeue_top_waiter != lock_top_waiter)
+ rt_mutex_wake_waiter(lock_top_waiter);
raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
@@ -805,9 +851,11 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* @task: The task which wants to acquire the lock
* @waiter: The waiter that is queued to the lock's wait tree if the
* callsite called task_blocked_on_lock(), otherwise NULL
+ * @mode: Lock steal mode (STEAL_NORMAL, STEAL_LATERAL)
*/
-static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
- struct rt_mutex_waiter *waiter)
+static int __try_to_take_rt_mutex(struct rt_mutex *lock,
+ struct task_struct *task,
+ struct rt_mutex_waiter *waiter, int mode)
{
lockdep_assert_held(&lock->wait_lock);
@@ -843,12 +891,11 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
*/
if (waiter) {
/*
- * If waiter is not the highest priority waiter of
- * @lock, give up.
+ * If waiter is not the highest priority waiter of @lock,
+ * or its peer when lateral steal is allowed, give up.
*/
- if (waiter != rt_mutex_top_waiter(lock))
+ if (!rt_mutex_steal(lock, waiter, mode))
return 0;
-
/*
* We can acquire the lock. Remove the waiter from the
* lock waiters tree.
@@ -866,14 +913,12 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
*/
if (rt_mutex_has_waiters(lock)) {
/*
- * If @task->prio is greater than or equal to
- * the top waiter priority (kernel view),
- * @task lost.
+ * If @task->prio is greater than the top waiter
+ * priority (kernel view), or equal to it when a
+ * lateral steal is forbidden, @task lost.
*/
- if (!rt_mutex_waiter_less(task_to_waiter(task),
- rt_mutex_top_waiter(lock)))
+ if (!rt_mutex_steal(lock, task_to_waiter(task), mode))
return 0;
-
/*
* The current top waiter stays enqueued. We
* don't have to change anything in the lock
@@ -920,6 +965,344 @@ takeit:
return 1;
}
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * preemptible spin_lock functions:
+ */
+static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
+ void (*slowfn)(struct rt_mutex *lock))
+{
+ might_sleep_no_state_check();
+
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return;
+ else
+ slowfn(lock);
+}
+
+static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
+ void (*slowfn)(struct rt_mutex *lock))
+{
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
+ else
+ slowfn(lock);
+}
+#ifdef CONFIG_SMP
+/*
+ * Note that owner is a speculative pointer and dereferencing relies
+ * on rcu_read_lock() and the check against the lock owner.
+ */
+static int adaptive_wait(struct rt_mutex *lock,
+ struct task_struct *owner)
+{
+ int res = 0;
+
+ rcu_read_lock();
+ for (;;) {
+ if (owner != rt_mutex_owner(lock))
+ break;
+ /*
+ * Ensure that owner->on_cpu is dereferenced _after_
+ * checking the above to be valid.
+ */
+ barrier();
+ if (!owner->on_cpu) {
+ res = 1;
+ break;
+ }
+ cpu_relax();
+ }
+ rcu_read_unlock();
+ return res;
+}
+#else
+static int adaptive_wait(struct rt_mutex *lock,
+ struct task_struct *orig_owner)
+{
+ return 1;
+}
+#endif
+
+static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
+ enum rtmutex_chainwalk chwalk);
+/*
+ * Slow path lock function spin_lock style: this variant is very
+ * careful not to miss any non-lock wakeups.
+ *
+ * We store the current state under p->pi_lock in p->saved_state and
+ * the try_to_wake_up() code handles this accordingly.
+ */
+void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ unsigned long flags)
+{
+ struct task_struct *lock_owner, *self = current;
+ struct rt_mutex_waiter *top_waiter;
+ int ret;
+
+ if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL))
+ return;
+
+ BUG_ON(rt_mutex_owner(lock) == self);
+
+ /*
+ * We save whatever state the task is in and we'll restore it
+ * after acquiring the lock taking real wakeups into account
+ * as well. We are serialized via pi_lock against wakeups. See
+ * try_to_wake_up().
+ */
+ raw_spin_lock(&self->pi_lock);
+ self->saved_state = self->state;
+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
+ raw_spin_unlock(&self->pi_lock);
+
+ ret = task_blocks_on_rt_mutex(lock, waiter, self, RT_MUTEX_MIN_CHAINWALK);
+ BUG_ON(ret);
+
+ for (;;) {
+ /* Try to acquire the lock again. */
+ if (__try_to_take_rt_mutex(lock, self, waiter, STEAL_LATERAL))
+ break;
+
+ top_waiter = rt_mutex_top_waiter(lock);
+ lock_owner = rt_mutex_owner(lock);
+
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ debug_rt_mutex_print_deadlock(waiter);
+
+ if (top_waiter != waiter || adaptive_wait(lock, lock_owner))
+ schedule();
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+
+ raw_spin_lock(&self->pi_lock);
+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
+ raw_spin_unlock(&self->pi_lock);
+ }
+
+ /*
+ * Restore the task state to current->saved_state. We set it
+ * to the original state above and the try_to_wake_up() code
+ * has possibly updated it when a real (non-rtmutex) wakeup
+ * happened while we were blocked. Clear saved_state so
+ * try_to_wakeup() does not get confused.
+ */
+ raw_spin_lock(&self->pi_lock);
+ __set_current_state_no_track(self->saved_state);
+ self->saved_state = TASK_RUNNING;
+ raw_spin_unlock(&self->pi_lock);
+
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit
+ * unconditionally. We might have to fix that up:
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ BUG_ON(rt_mutex_has_waiters(lock) && waiter == rt_mutex_top_waiter(lock));
+ BUG_ON(!RB_EMPTY_NODE(&waiter->tree_entry));
+}
+
+static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock)
+{
+ struct rt_mutex_waiter waiter;
+ unsigned long flags;
+
+ rt_mutex_init_waiter(&waiter, true);
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ rt_spin_lock_slowlock_locked(lock, &waiter, flags);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ debug_rt_mutex_free_waiter(&waiter);
+}
+
+static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
+ struct wake_q_head *wake_q,
+ struct wake_q_head *wq_sleeper);
+/*
+ * Slow path to release a rt_mutex spin_lock style
+ */
+void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
+{
+ unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
+ DEFINE_WAKE_Q(wake_sleeper_q);
+ bool postunlock;
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ postunlock = __rt_mutex_unlock_common(lock, &wake_q, &wake_sleeper_q);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
+}
+
+void __lockfunc rt_spin_lock(spinlock_t *lock)
+{
+ sleeping_lock_inc();
+ rcu_read_lock();
+ migrate_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
+}
+EXPORT_SYMBOL(rt_spin_lock);
+
+void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
+{
+ rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
+{
+ sleeping_lock_inc();
+ rcu_read_lock();
+ migrate_disable();
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nested);
+#endif
+
+void __lockfunc rt_spin_unlock(spinlock_t *lock)
+{
+ /* NOTE: we always pass in '1' for nested, for simplicity */
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
+ migrate_enable();
+ rcu_read_unlock();
+ sleeping_lock_dec();
+}
+EXPORT_SYMBOL(rt_spin_unlock);
+
+void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
+{
+ rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
+}
+EXPORT_SYMBOL(__rt_spin_unlock);
+
+/*
+ * Wait for the lock to get unlocked: instead of polling for an unlock
+ * (like raw spinlocks do), we lock and unlock, to force the kernel to
+ * schedule if there's contention:
+ */
+void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
+{
+ spin_lock(lock);
+ spin_unlock(lock);
+}
+EXPORT_SYMBOL(rt_spin_unlock_wait);
+
+int __lockfunc rt_spin_trylock(spinlock_t *lock)
+{
+ int ret;
+
+ sleeping_lock_inc();
+ migrate_disable();
+ ret = __rt_mutex_trylock(&lock->lock);
+ if (ret) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ } else {
+ migrate_enable();
+ sleeping_lock_dec();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock);
+
+int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
+{
+ int ret;
+
+ local_bh_disable();
+ ret = __rt_mutex_trylock(&lock->lock);
+ if (ret) {
+ sleeping_lock_inc();
+ rcu_read_lock();
+ migrate_disable();
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ } else
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_bh);
+
+int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
+{
+ int ret;
+
+ *flags = 0;
+ ret = __rt_mutex_trylock(&lock->lock);
+ if (ret) {
+ sleeping_lock_inc();
+ rcu_read_lock();
+ migrate_disable();
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_irqsave);
+
+void
+__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+}
+EXPORT_SYMBOL(__rt_spin_lock_init);
+
+#endif /* PREEMPT_RT_FULL */
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+ static inline int __sched
+__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
+ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
+
+ if (!hold_ctx)
+ return 0;
+
+ if (unlikely(ctx == hold_ctx))
+ return -EALREADY;
+
+ if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
+ (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
+ ctx->contending_lock = ww;
+#endif
+ return -EDEADLK;
+ }
+
+ return 0;
+}
+#else
+ static inline int __sched
+__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ BUG();
+ return 0;
+}
+
+#endif
+
+static inline int
+try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
+ struct rt_mutex_waiter *waiter)
+{
+ return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
+}
+
/*
* Task blocks on lock.
*
@@ -952,6 +1335,22 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
+ /*
+ * In the case of futex requeue PI, this will be a proxy
+ * lock. The task will wake unaware that it is enqueueed on
+ * this lock. Avoid blocking on two locks and corrupting
+ * pi_blocked_on via the PI_WAKEUP_INPROGRESS
+ * flag. futex_wait_requeue_pi() sets this when it wakes up
+ * before requeue (due to a signal or timeout). Do not enqueue
+ * the task if PI_WAKEUP_INPROGRESS is set.
+ */
+ if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
+ raw_spin_unlock(&task->pi_lock);
+ return -EAGAIN;
+ }
+
+ BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
+
waiter->task = task;
waiter->lock = lock;
waiter->prio = task->prio;
@@ -975,7 +1374,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
rt_mutex_enqueue_pi(owner, waiter);
rt_mutex_adjust_prio(owner);
- if (owner->pi_blocked_on)
+ if (rt_mutex_real_waiter(owner->pi_blocked_on))
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
chain_walk = 1;
@@ -1017,6 +1416,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
* Called with lock->wait_lock held and interrupts disabled.
*/
static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
+ struct wake_q_head *wake_sleeper_q,
struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
@@ -1056,7 +1456,10 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
* Pairs with preempt_enable() in rt_mutex_postunlock();
*/
preempt_disable();
- wake_q_add(wake_q, waiter->task);
+ if (waiter->savestate)
+ wake_q_add_sleeper(wake_sleeper_q, waiter->task);
+ else
+ wake_q_add(wake_q, waiter->task);
raw_spin_unlock(&current->pi_lock);
}
@@ -1071,7 +1474,7 @@ static void remove_waiter(struct rt_mutex *lock,
{
bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
- struct rt_mutex *next_lock;
+ struct rt_mutex *next_lock = NULL;
lockdep_assert_held(&lock->wait_lock);
@@ -1097,7 +1500,8 @@ static void remove_waiter(struct rt_mutex *lock,
rt_mutex_adjust_prio(owner);
/* Store the lock on which owner is blocked or NULL */
- next_lock = task_blocked_on_lock(owner);
+ if (rt_mutex_real_waiter(owner->pi_blocked_on))
+ next_lock = task_blocked_on_lock(owner);
raw_spin_unlock(&owner->pi_lock);
@@ -1133,26 +1537,28 @@ void rt_mutex_adjust_pi(struct task_struct *task)
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ if (!rt_mutex_real_waiter(waiter) ||
+ rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
next_lock = waiter->lock;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(task);
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
next_lock, NULL, task);
}
-void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
{
debug_rt_mutex_init_waiter(waiter);
RB_CLEAR_NODE(&waiter->pi_tree_entry);
RB_CLEAR_NODE(&waiter->tree_entry);
waiter->task = NULL;
+ waiter->savestate = savestate;
}
/**
@@ -1168,7 +1574,8 @@ void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
static int __sched
__rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter)
+ struct rt_mutex_waiter *waiter,
+ struct ww_acquire_ctx *ww_ctx)
{
int ret = 0;
@@ -1177,16 +1584,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
if (try_to_take_rt_mutex(lock, current, waiter))
break;
- /*
- * TASK_INTERRUPTIBLE checks for signals and
- * timeout. Ignored otherwise.
- */
- if (likely(state == TASK_INTERRUPTIBLE)) {
- /* Signal pending? */
- if (signal_pending(current))
- ret = -EINTR;
- if (timeout && !timeout->task)
- ret = -ETIMEDOUT;
+ if (timeout && !timeout->task) {
+ ret = -ETIMEDOUT;
+ break;
+ }
+ if (signal_pending_state(state, current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ if (ww_ctx && ww_ctx->acquired > 0) {
+ ret = __mutex_lock_check_stamp(lock, ww_ctx);
if (ret)
break;
}
@@ -1225,33 +1633,104 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
}
}
-/*
- * Slow path lock function:
- */
-static int __sched
-rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk)
+static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
+ struct ww_acquire_ctx *ww_ctx)
{
- struct rt_mutex_waiter waiter;
- unsigned long flags;
- int ret = 0;
+#ifdef CONFIG_DEBUG_MUTEXES
+ /*
+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+ * but released with a normal mutex_unlock in this call.
+ *
+ * This should never happen, always use ww_mutex_unlock.
+ */
+ DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+ /*
+ * Not quite done after calling ww_acquire_done() ?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
- rt_mutex_init_waiter(&waiter);
+ if (ww_ctx->contending_lock) {
+ /*
+ * After -EDEADLK you tried to
+ * acquire a different ww_mutex? Bad!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+ /*
+ * You called ww_mutex_lock after receiving -EDEADLK,
+ * but 'forgot' to unlock everything else first?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+ ww_ctx->contending_lock = NULL;
+ }
/*
- * Technically we could use raw_spin_[un]lock_irq() here, but this can
- * be called in early boot if the cmpxchg() fast path is disabled
- * (debug, no architecture support). In this case we will acquire the
- * rtmutex with lock->wait_lock held. But we cannot unconditionally
- * enable interrupts in that early boot case. So we need to use the
- * irqsave/restore variants.
+ * Naughty, using a different class will lead to undefined behavior!
*/
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+ ww_ctx->acquired++;
+}
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void ww_mutex_account_lock(struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
+ struct rt_mutex_waiter *waiter, *n;
+
+ /*
+ * This branch gets optimized out for the common case,
+ * and is only important for ww_mutex_lock.
+ */
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ ww->ctx = ww_ctx;
+
+ /*
+ * Give any possible sleeping processes the chance to wake up,
+ * so they can recheck if they have to back off.
+ */
+ rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters.rb_root,
+ tree_entry) {
+ /* XXX debug rt mutex waiter wakeup */
+
+ BUG_ON(waiter->lock != lock);
+ rt_mutex_wake_waiter(waiter);
+ }
+}
+
+#else
+
+static void ww_mutex_account_lock(struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ BUG();
+}
+#endif
+
+int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk,
+ struct ww_acquire_ctx *ww_ctx,
+ struct rt_mutex_waiter *waiter)
+{
+ int ret;
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+ if (ww_ctx) {
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base.lock);
+ if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
+ return -EALREADY;
+ }
+#endif
/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ if (ww_ctx)
+ ww_mutex_account_lock(lock, ww_ctx);
return 0;
}
@@ -1261,16 +1740,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
if (unlikely(timeout))
hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
- ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
+ ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk);
- if (likely(!ret))
+ if (likely(!ret)) {
/* sleep on the mutex */
- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
+ ret = __rt_mutex_slowlock(lock, state, timeout, waiter,
+ ww_ctx);
+ } else if (ww_ctx) {
+ /* ww_mutex received EDEADLK, let it become EALREADY */
+ ret = __mutex_lock_check_stamp(lock, ww_ctx);
+ BUG_ON(!ret);
+ }
if (unlikely(ret)) {
__set_current_state(TASK_RUNNING);
- remove_waiter(lock, &waiter);
- rt_mutex_handle_deadlock(ret, chwalk, &waiter);
+ remove_waiter(lock, waiter);
+ /* ww_mutex wants to report EDEADLK/EALREADY, let it */
+ if (!ww_ctx)
+ rt_mutex_handle_deadlock(ret, chwalk, waiter);
+ } else if (ww_ctx) {
+ ww_mutex_account_lock(lock, ww_ctx);
}
/*
@@ -1278,6 +1767,36 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
* unconditionally. We might have to fix that up.
*/
fixup_rt_mutex_waiters(lock);
+ return ret;
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ struct rt_mutex_waiter waiter;
+ unsigned long flags;
+ int ret = 0;
+
+ rt_mutex_init_waiter(&waiter, false);
+
+ /*
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+
+ ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx,
+ &waiter);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
@@ -1338,7 +1857,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
* Return whether the current task needs to call rt_mutex_postunlock().
*/
static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
- struct wake_q_head *wake_q)
+ struct wake_q_head *wake_q,
+ struct wake_q_head *wake_sleeper_q)
{
unsigned long flags;
@@ -1392,7 +1912,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*
* Queue the next waiter for wakeup once we release the wait_lock.
*/
- mark_wakeup_next_waiter(wake_q, lock);
+ mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return true; /* call rt_mutex_postunlock() */
@@ -1406,29 +1926,45 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*/
static inline int
rt_mutex_fastlock(struct rt_mutex *lock, int state,
+ struct ww_acquire_ctx *ww_ctx,
int (*slowfn)(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk))
+ enum rtmutex_chainwalk chwalk,
+ struct ww_acquire_ctx *ww_ctx))
{
if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+ /*
+ * If rt_mutex blocks, the function sched_submit_work will not call
+ * blk_schedule_flush_plug (because tsk_is_pi_blocked would be true).
+ * We must call blk_schedule_flush_plug here, if we don't call it,
+ * a deadlock in I/O may happen.
+ */
+ if (unlikely(blk_needs_flush_plug(current)))
+ blk_schedule_flush_plug(current);
+
+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
}
static inline int
rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
enum rtmutex_chainwalk chwalk,
+ struct ww_acquire_ctx *ww_ctx,
int (*slowfn)(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk))
+ enum rtmutex_chainwalk chwalk,
+ struct ww_acquire_ctx *ww_ctx))
{
if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- return slowfn(lock, state, timeout, chwalk);
+ if (unlikely(blk_needs_flush_plug(current)))
+ blk_schedule_flush_plug(current);
+
+ return slowfn(lock, state, timeout, chwalk, ww_ctx);
}
static inline int
@@ -1444,9 +1980,11 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
/*
* Performs the wakeup of the the top-waiter and re-enables preemption.
*/
-void rt_mutex_postunlock(struct wake_q_head *wake_q)
+void rt_mutex_postunlock(struct wake_q_head *wake_q,
+ struct wake_q_head *wake_sleeper_q)
{
wake_up_q(wake_q);
+ wake_up_q_sleeper(wake_sleeper_q);
/* Pairs with preempt_disable() in rt_mutex_slowunlock() */
preempt_enable();
@@ -1455,23 +1993,46 @@ void rt_mutex_postunlock(struct wake_q_head *wake_q)
static inline void
rt_mutex_fastunlock(struct rt_mutex *lock,
bool (*slowfn)(struct rt_mutex *lock,
- struct wake_q_head *wqh))
+ struct wake_q_head *wqh,
+ struct wake_q_head *wq_sleeper))
{
DEFINE_WAKE_Q(wake_q);
+ DEFINE_WAKE_Q(wake_sleeper_q);
if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
return;
- if (slowfn(lock, &wake_q))
- rt_mutex_postunlock(&wake_q);
+ if (slowfn(lock, &wake_q, &wake_sleeper_q))
+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
}
-static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass)
+int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state)
{
might_sleep();
+ return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock);
+}
+
+/**
+ * rt_mutex_lock_state - lock a rt_mutex with a given state
+ *
+ * @lock: The rt_mutex to be locked
+ * @state: The state to set when blocking on the rt_mutex
+ */
+static inline int __sched rt_mutex_lock_state(struct rt_mutex *lock,
+ unsigned int subclass, int state)
+{
+ int ret;
mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
+ ret = __rt_mutex_lock_state(lock, state);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+
+static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass)
+{
+ rt_mutex_lock_state(lock, subclass, TASK_UNINTERRUPTIBLE);
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -1512,16 +2073,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
*/
int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
{
- int ret;
-
- might_sleep();
-
- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
- if (ret)
- mutex_release(&lock->dep_map, 1, _RET_IP_);
-
- return ret;
+ return rt_mutex_lock_state(lock, 0, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
@@ -1539,6 +2091,22 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
}
/**
+ * rt_mutex_lock_killable - lock a rt_mutex killable
+ *
+ * @lock: the rt_mutex to be locked
+ * @detect_deadlock: deadlock detection on/off
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
+{
+ return rt_mutex_lock_state(lock, 0, TASK_KILLABLE);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
+
+/**
* rt_mutex_timed_lock - lock a rt_mutex interruptible
* the timeout structure is provided
* by the caller
@@ -1561,6 +2129,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
RT_MUTEX_MIN_CHAINWALK,
+ NULL,
rt_mutex_slowlock);
if (ret)
mutex_release(&lock->dep_map, 1, _RET_IP_);
@@ -1569,6 +2138,18 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
}
EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+int __sched __rt_mutex_trylock(struct rt_mutex *lock)
+{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ if (WARN_ON_ONCE(in_irq() || in_nmi()))
+#else
+ if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
+#endif
+ return 0;
+
+ return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+}
+
/**
* rt_mutex_trylock - try to lock a rt_mutex
*
@@ -1584,10 +2165,7 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock)
{
int ret;
- if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
- return 0;
-
- ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+ ret = __rt_mutex_trylock(lock);
if (ret)
mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
@@ -1595,6 +2173,11 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock)
}
EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+void __sched __rt_mutex_unlock(struct rt_mutex *lock)
+{
+ rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+}
+
/**
* rt_mutex_unlock - unlock a rt_mutex
*
@@ -1603,16 +2186,13 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock);
void __sched rt_mutex_unlock(struct rt_mutex *lock)
{
mutex_release(&lock->dep_map, 1, _RET_IP_);
- rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+ __rt_mutex_unlock(lock);
}
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
-/**
- * Futex variant, that since futex variants do not use the fast-path, can be
- * simple and will not need to retry.
- */
-bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wake_q)
+static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
+ struct wake_q_head *wake_q,
+ struct wake_q_head *wq_sleeper)
{
lockdep_assert_held(&lock->wait_lock);
@@ -1629,23 +2209,35 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
* avoid inversion prior to the wakeup. preempt_disable()
* therein pairs with rt_mutex_postunlock().
*/
- mark_wakeup_next_waiter(wake_q, lock);
+ mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
return true; /* call postunlock() */
}
+/**
+ * Futex variant, that since futex variants do not use the fast-path, can be
+ * simple and will not need to retry.
+ */
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wake_q,
+ struct wake_q_head *wq_sleeper)
+{
+ return __rt_mutex_unlock_common(lock, wake_q, wq_sleeper);
+}
+
void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
{
DEFINE_WAKE_Q(wake_q);
+ DEFINE_WAKE_Q(wake_sleeper_q);
unsigned long flags;
bool postunlock;
raw_spin_lock_irqsave(&lock->wait_lock, flags);
- postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
+ postunlock = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
if (postunlock)
- rt_mutex_postunlock(&wake_q);
+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
}
/**
@@ -1684,7 +2276,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name,
if (name && key)
debug_rt_mutex_init(lock, name, key);
}
-EXPORT_SYMBOL_GPL(__rt_mutex_init);
+EXPORT_SYMBOL(__rt_mutex_init);
/**
* rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
@@ -1704,6 +2296,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner)
{
__rt_mutex_init(lock, NULL, NULL);
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /*
+ * get another key class for the wait_lock. LOCK_PI and UNLOCK_PI is
+ * holding the ->wait_lock of the proxy_lock while unlocking a sleeping
+ * lock.
+ */
+ raw_spin_lock_init(&lock->wait_lock);
+#endif
debug_rt_mutex_proxy_lock(lock, proxy_owner);
rt_mutex_set_owner(lock, proxy_owner);
}
@@ -1727,6 +2327,26 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
rt_mutex_set_owner(lock, NULL);
}
+static void fixup_rt_mutex_blocked(struct rt_mutex *lock)
+{
+ struct task_struct *tsk = current;
+ /*
+ * RT has a problem here when the wait got interrupted by a timeout
+ * or a signal. task->pi_blocked_on is still set. The task must
+ * acquire the hash bucket lock when returning from this function.
+ *
+ * If the hash bucket lock is contended then the
+ * BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)) in
+ * task_blocks_on_rt_mutex() will trigger. This can be avoided by
+ * clearing task->pi_blocked_on which removes the task from the
+ * boosting chain of the rtmutex. That's correct because the task
+ * is not longer blocked on it.
+ */
+ raw_spin_lock(&tsk->pi_lock);
+ tsk->pi_blocked_on = NULL;
+ raw_spin_unlock(&tsk->pi_lock);
+}
+
/**
* __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
* @lock: the rt_mutex to take
@@ -1757,6 +2377,34 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
if (try_to_take_rt_mutex(lock, task, NULL))
return 1;
+#ifdef CONFIG_PREEMPT_RT_FULL
+ /*
+ * In PREEMPT_RT there's an added race.
+ * If the task, that we are about to requeue, times out,
+ * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
+ * to skip this task. But right after the task sets
+ * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
+ * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
+ * This will replace the PI_WAKEUP_INPROGRESS with the actual
+ * lock that it blocks on. We *must not* place this task
+ * on this proxy lock in that case.
+ *
+ * To prevent this race, we first take the task's pi_lock
+ * and check if it has updated its pi_blocked_on. If it has,
+ * we assume that it woke up and we return -EAGAIN.
+ * Otherwise, we set the task's pi_blocked_on to
+ * PI_REQUEUE_INPROGRESS, so that if the task is waking up
+ * it will know that we are in the process of requeuing it.
+ */
+ raw_spin_lock(&task->pi_lock);
+ if (task->pi_blocked_on) {
+ raw_spin_unlock(&task->pi_lock);
+ return -EAGAIN;
+ }
+ task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
+ raw_spin_unlock(&task->pi_lock);
+#endif
+
/* We enforce deadlock detection for futexes */
ret = task_blocks_on_rt_mutex(lock, waiter, task,
RT_MUTEX_FULL_CHAINWALK);
@@ -1771,6 +2419,9 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
ret = 0;
}
+ if (ret)
+ fixup_rt_mutex_blocked(lock);
+
debug_rt_mutex_print_deadlock(waiter);
return ret;
@@ -1856,12 +2507,15 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
raw_spin_lock_irq(&lock->wait_lock);
/* sleep on the mutex */
set_current_state(TASK_INTERRUPTIBLE);
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
*/
fixup_rt_mutex_waiters(lock);
+ if (ret)
+ fixup_rt_mutex_blocked(lock);
+
raw_spin_unlock_irq(&lock->wait_lock);
return ret;
@@ -1923,3 +2577,99 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
return cleanup;
}
+
+static inline int
+ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+ unsigned tmp;
+
+ if (ctx->deadlock_inject_countdown-- == 0) {
+ tmp = ctx->deadlock_inject_interval;
+ if (tmp > UINT_MAX/4)
+ tmp = UINT_MAX;
+ else
+ tmp = tmp*2 + tmp + tmp/2;
+
+ ctx->deadlock_inject_interval = tmp;
+ ctx->deadlock_inject_countdown = tmp;
+ ctx->contending_lock = lock;
+
+ ww_mutex_unlock(lock);
+
+ return -EDEADLK;
+ }
+#endif
+
+ return 0;
+}
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+int __sched
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ int ret;
+
+ might_sleep();
+
+ mutex_acquire_nest(&lock->base.dep_map, 0, 0,
+ ctx ? &ctx->dep_map : NULL, _RET_IP_);
+ ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0,
+ ctx);
+ if (ret)
+ mutex_release(&lock->base.dep_map, 1, _RET_IP_);
+ else if (!ret && ctx && ctx->acquired > 1)
+ return ww_mutex_deadlock_injection(lock, ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
+
+int __sched
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ int ret;
+
+ might_sleep();
+
+ mutex_acquire_nest(&lock->base.dep_map, 0, 0,
+ ctx ? &ctx->dep_map : NULL, _RET_IP_);
+ ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0,
+ ctx);
+ if (ret)
+ mutex_release(&lock->base.dep_map, 1, _RET_IP_);
+ else if (!ret && ctx && ctx->acquired > 1)
+ return ww_mutex_deadlock_injection(lock, ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ww_mutex_lock);
+
+void __sched ww_mutex_unlock(struct ww_mutex *lock)
+{
+ int nest = !!lock->ctx;
+
+ /*
+ * The unlocking fastpath is the 0->1 transition from 'locked'
+ * into 'unlocked' state:
+ */
+ if (nest) {
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
+#endif
+ if (lock->ctx->acquired > 0)
+ lock->ctx->acquired--;
+ lock->ctx = NULL;
+ }
+
+ mutex_release(&lock->base.dep_map, nest, _RET_IP_);
+ __rt_mutex_unlock(&lock->base.lock);
+}
+EXPORT_SYMBOL(ww_mutex_unlock);
+
+int __rt_mutex_owner_current(struct rt_mutex *lock)
+{
+ return rt_mutex_owner(lock) == current;
+}
+EXPORT_SYMBOL(__rt_mutex_owner_current);
+#endif
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index d1d62f942be2..546aaf058b9e 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -15,6 +15,7 @@
#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
+#include <linux/sched/debug.h>
/*
* This is the control structure for tasks blocked on a rt_mutex,
@@ -29,6 +30,7 @@ struct rt_mutex_waiter {
struct rb_node pi_tree_entry;
struct task_struct *task;
struct rt_mutex *lock;
+ bool savestate;
#ifdef CONFIG_DEBUG_RT_MUTEXES
unsigned long ip;
struct pid *deadlock_task_pid;
@@ -130,12 +132,15 @@ enum rtmutex_chainwalk {
/*
* PI-futex support (proxy locking functions, etc.):
*/
+#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1)
+#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2)
+
extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savetate);
extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task);
@@ -153,9 +158,27 @@ extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh);
-
-extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
+ struct wake_q_head *wqh,
+ struct wake_q_head *wq_sleeper);
+
+extern void rt_mutex_postunlock(struct wake_q_head *wake_q,
+ struct wake_q_head *wake_sleeper_q);
+
+/* RW semaphore special interface */
+struct ww_acquire_ctx;
+
+extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state);
+extern int __rt_mutex_trylock(struct rt_mutex *lock);
+extern void __rt_mutex_unlock(struct rt_mutex *lock);
+int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk,
+ struct ww_acquire_ctx *ww_ctx,
+ struct rt_mutex_waiter *waiter);
+void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ unsigned long flags);
+void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock);
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
diff --git a/kernel/locking/rwlock-rt.c b/kernel/locking/rwlock-rt.c
new file mode 100644
index 000000000000..0ae8c62ea832
--- /dev/null
+++ b/kernel/locking/rwlock-rt.c
@@ -0,0 +1,384 @@
+/*
+ */
+#include <linux/sched/debug.h>
+#include <linux/export.h>
+
+#include "rtmutex_common.h"
+#include <linux/rwlock_types_rt.h>
+
+/*
+ * RT-specific reader/writer locks
+ *
+ * write_lock()
+ * 1) Lock lock->rtmutex
+ * 2) Remove the reader BIAS to force readers into the slow path
+ * 3) Wait until all readers have left the critical region
+ * 4) Mark it write locked
+ *
+ * write_unlock()
+ * 1) Remove the write locked marker
+ * 2) Set the reader BIAS so readers can use the fast path again
+ * 3) Unlock lock->rtmutex to release blocked readers
+ *
+ * read_lock()
+ * 1) Try fast path acquisition (reader BIAS is set)
+ * 2) Take lock->rtmutex.wait_lock which protects the writelocked flag
+ * 3) If !writelocked, acquire it for read
+ * 4) If writelocked, block on lock->rtmutex
+ * 5) unlock lock->rtmutex, goto 1)
+ *
+ * read_unlock()
+ * 1) Try fast path release (reader count != 1)
+ * 2) Wake the writer waiting in write_lock()#3
+ *
+ * read_lock()#3 has the consequence, that rw locks on RT are not writer
+ * fair, but writers, which should be avoided in RT tasks (think tasklist
+ * lock), are subject to the rtmutex priority/DL inheritance mechanism.
+ *
+ * It's possible to make the rw locks writer fair by keeping a list of
+ * active readers. A blocked writer would force all newly incoming readers
+ * to block on the rtmutex, but the rtmutex would have to be proxy locked
+ * for one reader after the other. We can't use multi-reader inheritance
+ * because there is no way to support that with
+ * SCHED_DEADLINE. Implementing the one by one reader boosting/handover
+ * mechanism is a major surgery for a very dubious value.
+ *
+ * The risk of writer starvation is there, but the pathological use cases
+ * which trigger it are not necessarily the typical RT workloads.
+ */
+
+void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held semaphore:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+ atomic_set(&lock->readers, READER_BIAS);
+ rt_mutex_init(&lock->rtmutex);
+ lock->rtmutex.save_state = 1;
+}
+
+int __read_rt_trylock(struct rt_rw_lock *lock)
+{
+ int r, old;
+
+ /*
+ * Increment reader count, if lock->readers < 0, i.e. READER_BIAS is
+ * set.
+ */
+ for (r = atomic_read(&lock->readers); r < 0;) {
+ old = atomic_cmpxchg(&lock->readers, r, r + 1);
+ if (likely(old == r))
+ return 1;
+ r = old;
+ }
+ return 0;
+}
+
+void __sched __read_rt_lock(struct rt_rw_lock *lock)
+{
+ struct rt_mutex *m = &lock->rtmutex;
+ struct rt_mutex_waiter waiter;
+ unsigned long flags;
+
+ if (__read_rt_trylock(lock))
+ return;
+
+ raw_spin_lock_irqsave(&m->wait_lock, flags);
+ /*
+ * Allow readers as long as the writer has not completely
+ * acquired the semaphore for write.
+ */
+ if (atomic_read(&lock->readers) != WRITER_BIAS) {
+ atomic_inc(&lock->readers);
+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
+ return;
+ }
+
+ /*
+ * Call into the slow lock path with the rtmutex->wait_lock
+ * held, so this can't result in the following race:
+ *
+ * Reader1 Reader2 Writer
+ * read_lock()
+ * write_lock()
+ * rtmutex_lock(m)
+ * swait()
+ * read_lock()
+ * unlock(m->wait_lock)
+ * read_unlock()
+ * swake()
+ * lock(m->wait_lock)
+ * lock->writelocked=true
+ * unlock(m->wait_lock)
+ *
+ * write_unlock()
+ * lock->writelocked=false
+ * rtmutex_unlock(m)
+ * read_lock()
+ * write_lock()
+ * rtmutex_lock(m)
+ * swait()
+ * rtmutex_lock(m)
+ *
+ * That would put Reader1 behind the writer waiting on
+ * Reader2 to call read_unlock() which might be unbound.
+ */
+ rt_mutex_init_waiter(&waiter, true);
+ rt_spin_lock_slowlock_locked(m, &waiter, flags);
+ /*
+ * The slowlock() above is guaranteed to return with the rtmutex is
+ * now held, so there can't be a writer active. Increment the reader
+ * count and immediately drop the rtmutex again.
+ */
+ atomic_inc(&lock->readers);
+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
+ rt_spin_lock_slowunlock(m);
+
+ debug_rt_mutex_free_waiter(&waiter);
+}
+
+void __read_rt_unlock(struct rt_rw_lock *lock)
+{
+ struct rt_mutex *m = &lock->rtmutex;
+ struct task_struct *tsk;
+
+ /*
+ * sem->readers can only hit 0 when a writer is waiting for the
+ * active readers to leave the critical region.
+ */
+ if (!atomic_dec_and_test(&lock->readers))
+ return;
+
+ raw_spin_lock_irq(&m->wait_lock);
+ /*
+ * Wake the writer, i.e. the rtmutex owner. It might release the
+ * rtmutex concurrently in the fast path, but to clean up the rw
+ * lock it needs to acquire m->wait_lock. The worst case which can
+ * happen is a spurious wakeup.
+ */
+ tsk = rt_mutex_owner(m);
+ if (tsk)
+ wake_up_process(tsk);
+
+ raw_spin_unlock_irq(&m->wait_lock);
+}
+
+static void __write_unlock_common(struct rt_rw_lock *lock, int bias,
+ unsigned long flags)
+{
+ struct rt_mutex *m = &lock->rtmutex;
+
+ atomic_add(READER_BIAS - bias, &lock->readers);
+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
+ rt_spin_lock_slowunlock(m);
+}
+
+void __sched __write_rt_lock(struct rt_rw_lock *lock)
+{
+ struct rt_mutex *m = &lock->rtmutex;
+ struct task_struct *self = current;
+ unsigned long flags;
+
+ /* Take the rtmutex as a first step */
+ __rt_spin_lock(m);
+
+ /* Force readers into slow path */
+ atomic_sub(READER_BIAS, &lock->readers);
+
+ raw_spin_lock_irqsave(&m->wait_lock, flags);
+
+ raw_spin_lock(&self->pi_lock);
+ self->saved_state = self->state;
+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
+ raw_spin_unlock(&self->pi_lock);
+
+ for (;;) {
+ /* Have all readers left the critical region? */
+ if (!atomic_read(&lock->readers)) {
+ atomic_set(&lock->readers, WRITER_BIAS);
+ raw_spin_lock(&self->pi_lock);
+ __set_current_state_no_track(self->saved_state);
+ self->saved_state = TASK_RUNNING;
+ raw_spin_unlock(&self->pi_lock);
+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
+ return;
+ }
+
+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
+
+ if (atomic_read(&lock->readers) != 0)
+ schedule();
+
+ raw_spin_lock_irqsave(&m->wait_lock, flags);
+
+ raw_spin_lock(&self->pi_lock);
+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
+ raw_spin_unlock(&self->pi_lock);
+ }
+}
+
+int __write_rt_trylock(struct rt_rw_lock *lock)
+{
+ struct rt_mutex *m = &lock->rtmutex;
+ unsigned long flags;
+
+ if (!__rt_mutex_trylock(m))
+ return 0;
+
+ atomic_sub(READER_BIAS, &lock->readers);
+
+ raw_spin_lock_irqsave(&m->wait_lock, flags);
+ if (!atomic_read(&lock->readers)) {
+ atomic_set(&lock->readers, WRITER_BIAS);
+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
+ return 1;
+ }
+ __write_unlock_common(lock, 0, flags);
+ return 0;
+}
+
+void __write_rt_unlock(struct rt_rw_lock *lock)
+{
+ struct rt_mutex *m = &lock->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&m->wait_lock, flags);
+ __write_unlock_common(lock, WRITER_BIAS, flags);
+}
+
+/* Map the reader biased implementation */
+static inline int do_read_rt_trylock(rwlock_t *rwlock)
+{
+ return __read_rt_trylock(rwlock);
+}
+
+static inline int do_write_rt_trylock(rwlock_t *rwlock)
+{
+ return __write_rt_trylock(rwlock);
+}
+
+static inline void do_read_rt_lock(rwlock_t *rwlock)
+{
+ __read_rt_lock(rwlock);
+}
+
+static inline void do_write_rt_lock(rwlock_t *rwlock)
+{
+ __write_rt_lock(rwlock);
+}
+
+static inline void do_read_rt_unlock(rwlock_t *rwlock)
+{
+ __read_rt_unlock(rwlock);
+}
+
+static inline void do_write_rt_unlock(rwlock_t *rwlock)
+{
+ __write_rt_unlock(rwlock);
+}
+
+static inline void do_rwlock_rt_init(rwlock_t *rwlock, const char *name,
+ struct lock_class_key *key)
+{
+ __rwlock_biased_rt_init(rwlock, name, key);
+}
+
+int __lockfunc rt_read_can_lock(rwlock_t *rwlock)
+{
+ return atomic_read(&rwlock->readers) < 0;
+}
+
+int __lockfunc rt_write_can_lock(rwlock_t *rwlock)
+{
+ return atomic_read(&rwlock->readers) == READER_BIAS;
+}
+
+/*
+ * The common functions which get wrapped into the rwlock API.
+ */
+int __lockfunc rt_read_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ sleeping_lock_inc();
+ migrate_disable();
+ ret = do_read_rt_trylock(rwlock);
+ if (ret) {
+ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ } else {
+ migrate_enable();
+ sleeping_lock_dec();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_read_trylock);
+
+int __lockfunc rt_write_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ sleeping_lock_inc();
+ migrate_disable();
+ ret = do_write_rt_trylock(rwlock);
+ if (ret) {
+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ } else {
+ migrate_enable();
+ sleeping_lock_dec();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_write_trylock);
+
+void __lockfunc rt_read_lock(rwlock_t *rwlock)
+{
+ sleeping_lock_inc();
+ rcu_read_lock();
+ migrate_disable();
+ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
+ do_read_rt_lock(rwlock);
+}
+EXPORT_SYMBOL(rt_read_lock);
+
+void __lockfunc rt_write_lock(rwlock_t *rwlock)
+{
+ sleeping_lock_inc();
+ rcu_read_lock();
+ migrate_disable();
+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
+ do_write_rt_lock(rwlock);
+}
+EXPORT_SYMBOL(rt_write_lock);
+
+void __lockfunc rt_read_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
+ do_read_rt_unlock(rwlock);
+ migrate_enable();
+ rcu_read_unlock();
+ sleeping_lock_dec();
+}
+EXPORT_SYMBOL(rt_read_unlock);
+
+void __lockfunc rt_write_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
+ do_write_rt_unlock(rwlock);
+ migrate_enable();
+ rcu_read_unlock();
+ sleeping_lock_dec();
+}
+EXPORT_SYMBOL(rt_write_unlock);
+
+void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
+{
+ do_rwlock_rt_init(rwlock, name, key);
+}
+EXPORT_SYMBOL(__rt_rwlock_init);
diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c
new file mode 100644
index 000000000000..f518495bd6cc
--- /dev/null
+++ b/kernel/locking/rwsem-rt.c
@@ -0,0 +1,302 @@
+/*
+ */
+#include <linux/blkdev.h>
+#include <linux/rwsem.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/signal.h>
+#include <linux/export.h>
+
+#include "rtmutex_common.h"
+
+/*
+ * RT-specific reader/writer semaphores
+ *
+ * down_write()
+ * 1) Lock sem->rtmutex
+ * 2) Remove the reader BIAS to force readers into the slow path
+ * 3) Wait until all readers have left the critical region
+ * 4) Mark it write locked
+ *
+ * up_write()
+ * 1) Remove the write locked marker
+ * 2) Set the reader BIAS so readers can use the fast path again
+ * 3) Unlock sem->rtmutex to release blocked readers
+ *
+ * down_read()
+ * 1) Try fast path acquisition (reader BIAS is set)
+ * 2) Take sem->rtmutex.wait_lock which protects the writelocked flag
+ * 3) If !writelocked, acquire it for read
+ * 4) If writelocked, block on sem->rtmutex
+ * 5) unlock sem->rtmutex, goto 1)
+ *
+ * up_read()
+ * 1) Try fast path release (reader count != 1)
+ * 2) Wake the writer waiting in down_write()#3
+ *
+ * down_read()#3 has the consequence, that rw semaphores on RT are not writer
+ * fair, but writers, which should be avoided in RT tasks (think mmap_sem),
+ * are subject to the rtmutex priority/DL inheritance mechanism.
+ *
+ * It's possible to make the rw semaphores writer fair by keeping a list of
+ * active readers. A blocked writer would force all newly incoming readers to
+ * block on the rtmutex, but the rtmutex would have to be proxy locked for one
+ * reader after the other. We can't use multi-reader inheritance because there
+ * is no way to support that with SCHED_DEADLINE. Implementing the one by one
+ * reader boosting/handover mechanism is a major surgery for a very dubious
+ * value.
+ *
+ * The risk of writer starvation is there, but the pathological use cases
+ * which trigger it are not necessarily the typical RT workloads.
+ */
+
+void __rwsem_init(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held semaphore:
+ */
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
+ atomic_set(&sem->readers, READER_BIAS);
+}
+EXPORT_SYMBOL(__rwsem_init);
+
+int __down_read_trylock(struct rw_semaphore *sem)
+{
+ int r, old;
+
+ /*
+ * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
+ * set.
+ */
+ for (r = atomic_read(&sem->readers); r < 0;) {
+ old = atomic_cmpxchg(&sem->readers, r, r + 1);
+ if (likely(old == r))
+ return 1;
+ r = old;
+ }
+ return 0;
+}
+
+static int __sched __down_read_common(struct rw_semaphore *sem, int state)
+{
+ struct rt_mutex *m = &sem->rtmutex;
+ struct rt_mutex_waiter waiter;
+ int ret;
+
+ if (__down_read_trylock(sem))
+ return 0;
+ /*
+ * If rt_mutex blocks, the function sched_submit_work will not call
+ * blk_schedule_flush_plug (because tsk_is_pi_blocked would be true).
+ * We must call blk_schedule_flush_plug here, if we don't call it,
+ * a deadlock in I/O may happen.
+ */
+ if (unlikely(blk_needs_flush_plug(current)))
+ blk_schedule_flush_plug(current);
+
+ might_sleep();
+ raw_spin_lock_irq(&m->wait_lock);
+ /*
+ * Allow readers as long as the writer has not completely
+ * acquired the semaphore for write.
+ */
+ if (atomic_read(&sem->readers) != WRITER_BIAS) {
+ atomic_inc(&sem->readers);
+ raw_spin_unlock_irq(&m->wait_lock);
+ return 0;
+ }
+
+ /*
+ * Call into the slow lock path with the rtmutex->wait_lock
+ * held, so this can't result in the following race:
+ *
+ * Reader1 Reader2 Writer
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * swait()
+ * down_read()
+ * unlock(m->wait_lock)
+ * up_read()
+ * swake()
+ * lock(m->wait_lock)
+ * sem->writelocked=true
+ * unlock(m->wait_lock)
+ *
+ * up_write()
+ * sem->writelocked=false
+ * rtmutex_unlock(m)
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * swait()
+ * rtmutex_lock(m)
+ *
+ * That would put Reader1 behind the writer waiting on
+ * Reader2 to call up_read() which might be unbound.
+ */
+ rt_mutex_init_waiter(&waiter, false);
+ ret = rt_mutex_slowlock_locked(m, state, NULL, RT_MUTEX_MIN_CHAINWALK,
+ NULL, &waiter);
+ /*
+ * The slowlock() above is guaranteed to return with the rtmutex (for
+ * ret = 0) is now held, so there can't be a writer active. Increment
+ * the reader count and immediately drop the rtmutex again.
+ * For ret != 0 we don't hold the rtmutex and need unlock the wait_lock.
+ * We don't own the lock then.
+ */
+ if (!ret)
+ atomic_inc(&sem->readers);
+ raw_spin_unlock_irq(&m->wait_lock);
+ if (!ret)
+ __rt_mutex_unlock(m);
+
+ debug_rt_mutex_free_waiter(&waiter);
+ return ret;
+}
+
+void __down_read(struct rw_semaphore *sem)
+{
+ int ret;
+
+ ret = __down_read_common(sem, TASK_UNINTERRUPTIBLE);
+ WARN_ON_ONCE(ret);
+}
+
+int __down_read_killable(struct rw_semaphore *sem)
+{
+ int ret;
+
+ ret = __down_read_common(sem, TASK_KILLABLE);
+ if (likely(!ret))
+ return ret;
+ WARN_ONCE(ret != -EINTR, "Unexpected state: %d\n", ret);
+ return -EINTR;
+}
+
+void __up_read(struct rw_semaphore *sem)
+{
+ struct rt_mutex *m = &sem->rtmutex;
+ struct task_struct *tsk;
+
+ /*
+ * sem->readers can only hit 0 when a writer is waiting for the
+ * active readers to leave the critical region.
+ */
+ if (!atomic_dec_and_test(&sem->readers))
+ return;
+
+ might_sleep();
+ raw_spin_lock_irq(&m->wait_lock);
+ /*
+ * Wake the writer, i.e. the rtmutex owner. It might release the
+ * rtmutex concurrently in the fast path (due to a signal), but to
+ * clean up the rwsem it needs to acquire m->wait_lock. The worst
+ * case which can happen is a spurious wakeup.
+ */
+ tsk = rt_mutex_owner(m);
+ if (tsk)
+ wake_up_process(tsk);
+
+ raw_spin_unlock_irq(&m->wait_lock);
+}
+
+static void __up_write_unlock(struct rw_semaphore *sem, int bias,
+ unsigned long flags)
+{
+ struct rt_mutex *m = &sem->rtmutex;
+
+ atomic_add(READER_BIAS - bias, &sem->readers);
+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
+ __rt_mutex_unlock(m);
+}
+
+static int __sched __down_write_common(struct rw_semaphore *sem, int state)
+{
+ struct rt_mutex *m = &sem->rtmutex;
+ unsigned long flags;
+
+ /* Take the rtmutex as a first step */
+ if (__rt_mutex_lock_state(m, state))
+ return -EINTR;
+
+ /* Force readers into slow path */
+ atomic_sub(READER_BIAS, &sem->readers);
+ might_sleep();
+
+ set_current_state(state);
+ for (;;) {
+ raw_spin_lock_irqsave(&m->wait_lock, flags);
+ /* Have all readers left the critical region? */
+ if (!atomic_read(&sem->readers)) {
+ atomic_set(&sem->readers, WRITER_BIAS);
+ __set_current_state(TASK_RUNNING);
+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
+ return 0;
+ }
+
+ if (signal_pending_state(state, current)) {
+ __set_current_state(TASK_RUNNING);
+ __up_write_unlock(sem, 0, flags);
+ return -EINTR;
+ }
+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
+
+ if (atomic_read(&sem->readers) != 0) {
+ schedule();
+ set_current_state(state);
+ }
+ }
+}
+
+void __sched __down_write(struct rw_semaphore *sem)
+{
+ __down_write_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+int __sched __down_write_killable(struct rw_semaphore *sem)
+{
+ return __down_write_common(sem, TASK_KILLABLE);
+}
+
+int __down_write_trylock(struct rw_semaphore *sem)
+{
+ struct rt_mutex *m = &sem->rtmutex;
+ unsigned long flags;
+
+ if (!__rt_mutex_trylock(m))
+ return 0;
+
+ atomic_sub(READER_BIAS, &sem->readers);
+
+ raw_spin_lock_irqsave(&m->wait_lock, flags);
+ if (!atomic_read(&sem->readers)) {
+ atomic_set(&sem->readers, WRITER_BIAS);
+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
+ return 1;
+ }
+ __up_write_unlock(sem, 0, flags);
+ return 0;
+}
+
+void __up_write(struct rw_semaphore *sem)
+{
+ struct rt_mutex *m = &sem->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&m->wait_lock, flags);
+ __up_write_unlock(sem, WRITER_BIAS, flags);
+}
+
+void __downgrade_write(struct rw_semaphore *sem)
+{
+ struct rt_mutex *m = &sem->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&m->wait_lock, flags);
+ /* Release it and account current as reader */
+ __up_write_unlock(sem, WRITER_BIAS - 1, flags);
+}
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 64877f5294e3..0a7fdad68a58 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -169,6 +169,7 @@ extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore
extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
+#ifndef CONFIG_PREEMPT_RT_FULL
/*
* lock for reading
*/
@@ -302,3 +303,4 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
if (tmp < 0)
rwsem_downgrade_wake(sem);
}
+#endif
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 0ff08380f531..9f76e532fee2 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
* __[spin|read|write]_lock_bh()
*/
BUILD_LOCK_OPS(spin, raw_spinlock);
+
+#ifndef CONFIG_PREEMPT_RT_FULL
BUILD_LOCK_OPS(read, rwlock);
BUILD_LOCK_OPS(write, rwlock);
+#endif
#endif
@@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
EXPORT_SYMBOL(_raw_spin_unlock_bh);
#endif
+#ifndef CONFIG_PREEMPT_RT_FULL
+
#ifndef CONFIG_INLINE_READ_TRYLOCK
int __lockfunc _raw_read_trylock(rwlock_t *lock)
{
@@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
EXPORT_SYMBOL(_raw_write_unlock_bh);
#endif
+#endif /* !PREEMPT_RT_FULL */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 399669f7eba8..3bf168e8e869 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
EXPORT_SYMBOL(__raw_spin_lock_init);
+#ifndef CONFIG_PREEMPT_RT_FULL
void __rwlock_init(rwlock_t *lock, const char *name,
struct lock_class_key *key)
{
@@ -48,22 +49,23 @@ void __rwlock_init(rwlock_t *lock, const char *name,
}
EXPORT_SYMBOL(__rwlock_init);
+#endif
static void spin_dump(raw_spinlock_t *lock, const char *msg)
{
- struct task_struct *owner = NULL;
+ struct task_struct *owner = READ_ONCE(lock->owner);
- if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
- owner = lock->owner;
+ if (owner == SPINLOCK_OWNER_INIT)
+ owner = NULL;
printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
msg, raw_smp_processor_id(),
current->comm, task_pid_nr(current));
printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
".owner_cpu: %d\n",
- lock, lock->magic,
+ lock, READ_ONCE(lock->magic),
owner ? owner->comm : "<none>",
owner ? task_pid_nr(owner) : -1,
- lock->owner_cpu);
+ READ_ONCE(lock->owner_cpu));
dump_stack();
}
@@ -80,16 +82,16 @@ static void spin_bug(raw_spinlock_t *lock, const char *msg)
static inline void
debug_spin_lock_before(raw_spinlock_t *lock)
{
- SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
- SPIN_BUG_ON(lock->owner == current, lock, "recursion");
- SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ SPIN_BUG_ON(READ_ONCE(lock->magic) != SPINLOCK_MAGIC, lock, "bad magic");
+ SPIN_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
+ SPIN_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
lock, "cpu recursion");
}
static inline void debug_spin_lock_after(raw_spinlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_spin_unlock(raw_spinlock_t *lock)
@@ -99,8 +101,8 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock)
SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
/*
@@ -139,6 +141,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
arch_spin_unlock(&lock->raw_lock);
}
+#ifndef CONFIG_PREEMPT_RT_FULL
static void rwlock_bug(rwlock_t *lock, const char *msg)
{
if (!debug_locks_off())
@@ -187,8 +190,8 @@ static inline void debug_write_lock_before(rwlock_t *lock)
static inline void debug_write_lock_after(rwlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_write_unlock(rwlock_t *lock)
@@ -197,8 +200,8 @@ static inline void debug_write_unlock(rwlock_t *lock)
RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
void do_raw_write_lock(rwlock_t *lock)
@@ -228,3 +231,5 @@ void do_raw_write_unlock(rwlock_t *lock)
debug_write_unlock(lock);
arch_write_unlock(&lock->raw_lock);
}
+
+#endif
diff --git a/kernel/module.c b/kernel/module.c
index dcf2cc656e7c..9f35e1927807 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -208,7 +208,8 @@ static struct module *mod_find(unsigned long addr)
{
struct module *mod;
- list_for_each_entry_rcu(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
if (within_module(addr, mod))
return mod;
}
@@ -442,7 +443,8 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
return true;
- list_for_each_entry_rcu(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
struct symsearch arr[] = {
{ mod->syms, mod->syms + mod->num_syms, mod->crcs,
NOT_GPL_ONLY, false },
@@ -602,7 +604,8 @@ static struct module *find_module_all(const char *name, size_t len,
module_assert_mutex_or_preempt();
- list_for_each_entry_rcu(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
continue;
if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
@@ -1019,6 +1022,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
free_module(mod);
+ /* someone could wait for the module in add_unformed_module() */
+ wake_up_all(&module_wq);
return 0;
out:
mutex_unlock(&module_mutex);
@@ -1447,8 +1452,7 @@ static inline bool sect_empty(const Elf_Shdr *sect)
}
struct module_sect_attr {
- struct module_attribute mattr;
- char *name;
+ struct bin_attribute battr;
unsigned long address;
};
@@ -1458,13 +1462,34 @@ struct module_sect_attrs {
struct module_sect_attr attrs[0];
};
-static ssize_t module_sect_show(struct module_attribute *mattr,
- struct module_kobject *mk, char *buf)
+#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
+static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *battr,
+ char *buf, loff_t pos, size_t count)
{
struct module_sect_attr *sattr =
- container_of(mattr, struct module_sect_attr, mattr);
- return sprintf(buf, "0x%px\n", kptr_restrict < 2 ?
- (void *)sattr->address : NULL);
+ container_of(battr, struct module_sect_attr, battr);
+ char bounce[MODULE_SECT_READ_SIZE + 1];
+ size_t wrote;
+
+ if (pos != 0)
+ return -EINVAL;
+
+ /*
+ * Since we're a binary read handler, we must account for the
+ * trailing NUL byte that sprintf will write: if "buf" is
+ * too small to hold the NUL, or the NUL is exactly the last
+ * byte, the read will look like it got truncated by one byte.
+ * Since there is no way to ask sprintf nicely to not write
+ * the NUL, we have to use a bounce buffer.
+ */
+ wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
+ kallsyms_show_value(file->f_cred)
+ ? (void *)sattr->address : NULL);
+ count = min(count, wrote);
+ memcpy(buf, bounce, count);
+
+ return count;
}
static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -1472,7 +1497,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
unsigned int section;
for (section = 0; section < sect_attrs->nsections; section++)
- kfree(sect_attrs->attrs[section].name);
+ kfree(sect_attrs->attrs[section].battr.attr.name);
kfree(sect_attrs);
}
@@ -1481,7 +1506,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
unsigned int nloaded = 0, i, size[2];
struct module_sect_attrs *sect_attrs;
struct module_sect_attr *sattr;
- struct attribute **gattr;
+ struct bin_attribute **gattr;
/* Count loaded sections and allocate structures */
for (i = 0; i < info->hdr->e_shnum; i++)
@@ -1489,35 +1514,34 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
nloaded++;
size[0] = ALIGN(sizeof(*sect_attrs)
+ nloaded * sizeof(sect_attrs->attrs[0]),
- sizeof(sect_attrs->grp.attrs[0]));
- size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
+ sizeof(sect_attrs->grp.bin_attrs[0]));
+ size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
if (sect_attrs == NULL)
return;
/* Setup section attributes. */
sect_attrs->grp.name = "sections";
- sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
+ sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
sect_attrs->nsections = 0;
sattr = &sect_attrs->attrs[0];
- gattr = &sect_attrs->grp.attrs[0];
+ gattr = &sect_attrs->grp.bin_attrs[0];
for (i = 0; i < info->hdr->e_shnum; i++) {
Elf_Shdr *sec = &info->sechdrs[i];
if (sect_empty(sec))
continue;
+ sysfs_bin_attr_init(&sattr->battr);
sattr->address = sec->sh_addr;
- sattr->name = kstrdup(info->secstrings + sec->sh_name,
- GFP_KERNEL);
- if (sattr->name == NULL)
+ sattr->battr.attr.name =
+ kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
+ if (sattr->battr.attr.name == NULL)
goto out;
sect_attrs->nsections++;
- sysfs_attr_init(&sattr->mattr.attr);
- sattr->mattr.show = module_sect_show;
- sattr->mattr.store = NULL;
- sattr->mattr.attr.name = sattr->name;
- sattr->mattr.attr.mode = S_IRUSR;
- *(gattr++) = &(sattr++)->mattr.attr;
+ sattr->battr.read = module_sect_read;
+ sattr->battr.size = MODULE_SECT_READ_SIZE;
+ sattr->battr.attr.mode = 0400;
+ *(gattr++) = &(sattr++)->battr;
}
*gattr = NULL;
@@ -1607,7 +1631,7 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info)
continue;
if (info->sechdrs[i].sh_type == SHT_NOTE) {
sysfs_bin_attr_init(nattr);
- nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
+ nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;
nattr->attr.mode = S_IRUGO;
nattr->size = info->sechdrs[i].sh_size;
nattr->private = (void *) info->sechdrs[i].sh_addr;
@@ -1725,6 +1749,8 @@ static int module_add_modinfo_attrs(struct module *mod)
error_out:
if (i > 0)
module_remove_modinfo_attrs(mod, --i);
+ else
+ kfree(mod->modinfo_attrs);
return error;
}
@@ -2957,9 +2983,7 @@ static int setup_load_info(struct load_info *info, int flags)
/* Try to find a name early so we can log errors with a module name */
info->index.info = find_sec(info, ".modinfo");
- if (!info->index.info)
- info->name = "(missing .modinfo section)";
- else
+ if (info->index.info)
info->name = get_modinfo(info, "name");
/* Find internal symbols and strings. */
@@ -2974,14 +2998,15 @@ static int setup_load_info(struct load_info *info, int flags)
}
if (info->index.sym == 0) {
- pr_warn("%s: module has no symbols (stripped?)\n", info->name);
+ pr_warn("%s: module has no symbols (stripped?)\n",
+ info->name ?: "(missing .modinfo section or name field)");
return -ENOEXEC;
}
info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
if (!info->index.mod) {
pr_warn("%s: No module found in object\n",
- info->name ?: "(missing .modinfo name field)");
+ info->name ?: "(missing .modinfo section or name field)");
return -ENOEXEC;
}
/* This is temporary: point mod into copy of data. */
@@ -4285,7 +4310,7 @@ static int modules_open(struct inode *inode, struct file *file)
if (!err) {
struct seq_file *m = file->private_data;
- m->private = kallsyms_show_value() ? NULL : (void *)8ul;
+ m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
}
return err;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index d9f5081d578d..157d7c29f720 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -554,7 +554,7 @@ NOKPROBE_SYMBOL(notify_die);
int register_die_notifier(struct notifier_block *nb)
{
- vmalloc_sync_all();
+ vmalloc_sync_mappings();
return atomic_notifier_chain_register(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(register_die_notifier);
diff --git a/kernel/padata.c b/kernel/padata.c
index 15a8ad63f4ff..4f39a5e7a853 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -35,6 +35,8 @@
#define MAX_OBJ_NUM 1000
+static void padata_free_pd(struct parallel_data *pd);
+
static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
{
int cpu, target_cpu;
@@ -334,6 +336,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
struct padata_serial_queue *squeue;
struct parallel_data *pd;
LIST_HEAD(local_list);
+ int cnt;
local_bh_disable();
squeue = container_of(serial_work, struct padata_serial_queue, work);
@@ -343,6 +346,8 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_replace_init(&squeue->serial.list, &local_list);
spin_unlock(&squeue->serial.lock);
+ cnt = 0;
+
while (!list_empty(&local_list)) {
struct padata_priv *padata;
@@ -352,9 +357,12 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_del_init(&padata->list);
padata->serial(padata);
- atomic_dec(&pd->refcnt);
+ cnt++;
}
local_bh_enable();
+
+ if (atomic_sub_and_test(cnt, &pd->refcnt))
+ padata_free_pd(pd);
}
/**
@@ -501,7 +509,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
timer_setup(&pd->timer, padata_reorder_timer, 0);
atomic_set(&pd->seq_nr, -1);
atomic_set(&pd->reorder_objects, 0);
- atomic_set(&pd->refcnt, 0);
+ atomic_set(&pd->refcnt, 1);
pd->pinst = pinst;
spin_lock_init(&pd->lock);
@@ -526,31 +534,6 @@ static void padata_free_pd(struct parallel_data *pd)
kfree(pd);
}
-/* Flush all objects out of the padata queues. */
-static void padata_flush_queues(struct parallel_data *pd)
-{
- int cpu;
- struct padata_parallel_queue *pqueue;
- struct padata_serial_queue *squeue;
-
- for_each_cpu(cpu, pd->cpumask.pcpu) {
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
- flush_work(&pqueue->work);
- }
-
- del_timer_sync(&pd->timer);
-
- if (atomic_read(&pd->reorder_objects))
- padata_reorder(pd);
-
- for_each_cpu(cpu, pd->cpumask.cbcpu) {
- squeue = per_cpu_ptr(pd->squeue, cpu);
- flush_work(&squeue->work);
- }
-
- BUG_ON(atomic_read(&pd->refcnt) != 0);
-}
-
static void __padata_start(struct padata_instance *pinst)
{
pinst->flags |= PADATA_INIT;
@@ -564,10 +547,6 @@ static void __padata_stop(struct padata_instance *pinst)
pinst->flags &= ~PADATA_INIT;
synchronize_rcu();
-
- get_online_cpus();
- padata_flush_queues(pinst->pd);
- put_online_cpus();
}
/* Replace the internal control structure with a new one. */
@@ -588,8 +567,8 @@ static void padata_replace(struct padata_instance *pinst,
if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
notification_mask |= PADATA_CPU_SERIAL;
- padata_flush_queues(pd_old);
- padata_free_pd(pd_old);
+ if (atomic_dec_and_test(&pd_old->refcnt))
+ padata_free_pd(pd_old);
if (notification_mask)
blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
@@ -692,8 +671,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
struct cpumask *serial_mask, *parallel_mask;
int err = -EINVAL;
- mutex_lock(&pinst->lock);
get_online_cpus();
+ mutex_lock(&pinst->lock);
switch (cpumask_type) {
case PADATA_CPU_PARALLEL:
@@ -711,8 +690,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
out:
- put_online_cpus();
mutex_unlock(&pinst->lock);
+ put_online_cpus();
return err;
}
diff --git a/kernel/panic.c b/kernel/panic.c
index 3406a7a31449..48482af7987b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -229,7 +229,6 @@ void panic(const char *fmt, ...)
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (!_crash_kexec_post_notifiers) {
- printk_safe_flush_on_panic();
__crash_kexec(NULL);
/*
@@ -253,8 +252,6 @@ void panic(const char *fmt, ...)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
- /* Call flush even twice. It tries harder with a single online CPU */
- printk_safe_flush_on_panic();
kmsg_dump(KMSG_DUMP_PANIC);
/*
@@ -516,9 +513,11 @@ static u64 oops_id;
static int init_oops_id(void)
{
+#ifndef CONFIG_PREEMPT_RT_FULL
if (!oops_id)
get_random_bytes(&oops_id, sizeof(oops_id));
else
+#endif
oops_id++;
return 0;
@@ -592,37 +591,25 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
add_taint(taint, LOCKDEP_STILL_OK);
}
-#ifdef WANT_WARN_ON_SLOWPATH
-void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
+#ifndef __WARN_FLAGS
+void warn_slowpath_fmt(const char *file, int line, unsigned taint,
+ const char *fmt, ...)
{
struct warn_args args;
- args.fmt = fmt;
- va_start(args.args, fmt);
- __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL,
- &args);
- va_end(args.args);
-}
-EXPORT_SYMBOL(warn_slowpath_fmt);
-
-void warn_slowpath_fmt_taint(const char *file, int line,
- unsigned taint, const char *fmt, ...)
-{
- struct warn_args args;
+ if (!fmt) {
+ pr_warn(CUT_HERE);
+ __warn(file, line, __builtin_return_address(0), taint,
+ NULL, NULL);
+ return;
+ }
args.fmt = fmt;
va_start(args.args, fmt);
__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
}
-EXPORT_SYMBOL(warn_slowpath_fmt_taint);
-
-void warn_slowpath_null(const char *file, int line)
-{
- pr_warn(CUT_HERE);
- __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
-}
-EXPORT_SYMBOL(warn_slowpath_null);
+EXPORT_SYMBOL(warn_slowpath_fmt);
#else
void __warn_printk(const char *fmt, ...)
{
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index cd7434e6000d..dcd1977b2535 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -688,6 +688,10 @@ static int load_image_and_restore(void)
return error;
}
+#ifndef CONFIG_SUSPEND
+bool pm_in_action;
+#endif
+
/**
* hibernate - Carry out system hibernation, including saving the image.
*/
@@ -701,6 +705,8 @@ int hibernate(void)
return -EPERM;
}
+ pm_in_action = true;
+
lock_system_sleep();
/* The snapshot device should not be opened while we're running */
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
@@ -777,6 +783,7 @@ int hibernate(void)
atomic_inc(&snapshot_device_available);
Unlock:
unlock_system_sleep();
+ pm_in_action = false;
pr_info("hibernation exit\n");
return error;
@@ -897,6 +904,13 @@ static int software_resume(void)
error = freeze_processes();
if (error)
goto Close_Finish;
+
+ error = freeze_kernel_threads();
+ if (error) {
+ thaw_processes();
+ goto Close_Finish;
+ }
+
error = load_image_and_restore();
thaw_processes();
Finish:
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 83105874f255..26b9168321e7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -734,8 +734,15 @@ zone_found:
* We have found the zone. Now walk the radix tree to find the leaf node
* for our PFN.
*/
+
+ /*
+ * If the zone we wish to scan is the the current zone and the
+ * pfn falls into the current node then we do not need to walk
+ * the tree.
+ */
node = bm->cur.node;
- if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
+ if (zone == bm->cur.zone &&
+ ((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
goto node_found;
node = zone->rtree;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 096211299c07..9b2219d5116d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -605,6 +605,8 @@ static int enter_state(suspend_state_t state)
return error;
}
+bool pm_in_action;
+
/**
* pm_suspend - Externally visible function for suspending the system.
* @state: System sleep state to enter.
@@ -619,6 +621,7 @@ int pm_suspend(suspend_state_t state)
if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
return -EINVAL;
+ pm_in_action = true;
pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
error = enter_state(state);
if (error) {
@@ -628,6 +631,7 @@ int pm_suspend(suspend_state_t state)
suspend_stats.success++;
}
pr_info("suspend exit\n");
+ pm_in_action = false;
return error;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 4d052fc6bcde..7b219d824c0f 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,4 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y = printk.o
-obj-$(CONFIG_PRINTK) += printk_safe.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
deleted file mode 100644
index c8e6ab689d42..000000000000
--- a/kernel/printk/internal.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * internal.h - printk internal definitions
- */
-#include <linux/percpu.h>
-
-#ifdef CONFIG_PRINTK
-
-#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff
-#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000
-#define PRINTK_NMI_CONTEXT_MASK 0x80000000
-
-extern raw_spinlock_t logbuf_lock;
-
-__printf(5, 0)
-int vprintk_store(int facility, int level,
- const char *dict, size_t dictlen,
- const char *fmt, va_list args);
-
-__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
-__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
-void __printk_safe_enter(void);
-void __printk_safe_exit(void);
-
-#define printk_safe_enter_irqsave(flags) \
- do { \
- local_irq_save(flags); \
- __printk_safe_enter(); \
- } while (0)
-
-#define printk_safe_exit_irqrestore(flags) \
- do { \
- __printk_safe_exit(); \
- local_irq_restore(flags); \
- } while (0)
-
-#define printk_safe_enter_irq() \
- do { \
- local_irq_disable(); \
- __printk_safe_enter(); \
- } while (0)
-
-#define printk_safe_exit_irq() \
- do { \
- __printk_safe_exit(); \
- local_irq_enable(); \
- } while (0)
-
-void defer_console_output(void);
-
-#else
-
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
-
-/*
- * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem
- * semaphore and some of console functions (console_unlock()/etc.), so
- * printk-safe must preserve the existing local IRQ guarantees.
- */
-#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
-#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
-
-#define printk_safe_enter_irq() local_irq_disable()
-#define printk_safe_exit_irq() local_irq_enable()
-
-#endif /* CONFIG_PRINTK */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 424abf802f02..e22a0b0191f1 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -45,6 +45,9 @@
#include <linux/irq_work.h>
#include <linux/ctype.h>
#include <linux/uio.h>
+#include <linux/kthread.h>
+#include <linux/clocksource.h>
+#include <linux/printk_ringbuffer.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
@@ -58,13 +61,13 @@
#include "console_cmdline.h"
#include "braille.h"
-#include "internal.h"
-int console_printk[4] = {
+int console_printk[5] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
+ CONSOLE_LOGLEVEL_EMERGENCY, /* emergency_console_loglevel */
};
EXPORT_SYMBOL_GPL(console_printk);
@@ -215,19 +218,7 @@ static int nr_ext_console_drivers;
static int __down_trylock_console_sem(unsigned long ip)
{
- int lock_failed;
- unsigned long flags;
-
- /*
- * Here and in __up_console_sem() we need to be in safe mode,
- * because spindump/WARN/etc from under console ->lock will
- * deadlock in printk()->down_trylock_console_sem() otherwise.
- */
- printk_safe_enter_irqsave(flags);
- lock_failed = down_trylock(&console_sem);
- printk_safe_exit_irqrestore(flags);
-
- if (lock_failed)
+ if (down_trylock(&console_sem))
return 1;
mutex_acquire(&console_lock_dep_map, 0, 1, ip);
return 0;
@@ -236,13 +227,9 @@ static int __down_trylock_console_sem(unsigned long ip)
static void __up_console_sem(unsigned long ip)
{
- unsigned long flags;
-
mutex_release(&console_lock_dep_map, 1, ip);
- printk_safe_enter_irqsave(flags);
up(&console_sem);
- printk_safe_exit_irqrestore(flags);
}
#define up_console_sem() __up_console_sem(_RET_IP_)
@@ -257,11 +244,6 @@ static void __up_console_sem(unsigned long ip)
static int console_locked, console_suspended;
/*
- * If exclusive_console is non-NULL then only this console is to be printed to.
- */
-static struct console *exclusive_console;
-
-/*
* Array of consoles built from command line options (console=)
*/
@@ -357,6 +339,7 @@ enum log_flags {
struct printk_log {
u64 ts_nsec; /* timestamp in nanoseconds */
+ u16 cpu; /* cpu that generated record */
u16 len; /* length of entire record */
u16 text_len; /* length of text buffer */
u16 dict_len; /* length of dictionary buffer */
@@ -372,65 +355,22 @@ __packed __aligned(4)
#endif
;
-/*
- * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
- * within the scheduler's rq lock. It must be released before calling
- * console_unlock() or anything else that might wake up a process.
- */
-DEFINE_RAW_SPINLOCK(logbuf_lock);
-
-/*
- * Helper macros to lock/unlock logbuf_lock and switch between
- * printk-safe/unsafe modes.
- */
-#define logbuf_lock_irq() \
- do { \
- printk_safe_enter_irq(); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
-
-#define logbuf_unlock_irq() \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irq(); \
- } while (0)
-
-#define logbuf_lock_irqsave(flags) \
- do { \
- printk_safe_enter_irqsave(flags); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
-
-#define logbuf_unlock_irqrestore(flags) \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irqrestore(flags); \
- } while (0)
+DECLARE_STATIC_PRINTKRB_CPULOCK(printk_cpulock);
#ifdef CONFIG_PRINTK
-DECLARE_WAIT_QUEUE_HEAD(log_wait);
-/* the next printk record to read by syslog(READ) or /proc/kmsg */
+/* record buffer */
+DECLARE_STATIC_PRINTKRB(printk_rb, CONFIG_LOG_BUF_SHIFT, &printk_cpulock);
+
+static DEFINE_MUTEX(syslog_lock);
+DECLARE_STATIC_PRINTKRB_ITER(syslog_iter, &printk_rb);
+
+/* the last printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
-static u32 syslog_idx;
static size_t syslog_partial;
static bool syslog_time;
-/* index and sequence number of the first record stored in the buffer */
-static u64 log_first_seq;
-static u32 log_first_idx;
-
-/* index and sequence number of the next record to store in the buffer */
-static u64 log_next_seq;
-static u32 log_next_idx;
-
-/* the next printk record to write to the console */
-static u64 console_seq;
-static u32 console_idx;
-static u64 exclusive_console_stop_seq;
-
/* the next printk record to read after the last 'clear' command */
static u64 clear_seq;
-static u32 clear_idx;
#ifdef CONFIG_PRINTK_CALLER
#define PREFIX_MAX 48
@@ -442,24 +382,30 @@ static u32 clear_idx;
#define LOG_LEVEL(v) ((v) & 0x07)
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
-/* record buffer */
-#define LOG_ALIGN __alignof__(struct printk_log)
-#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
-#define LOG_BUF_LEN_MAX (u32)(1 << 31)
-static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
-static char *log_buf = __log_buf;
-static u32 log_buf_len = __LOG_BUF_LEN;
+#if 0
+/*
+ * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
+ * per_cpu_areas are initialised. This variable is set to true when
+ * it's safe to access per-CPU data.
+ */
+static bool __printk_percpu_data_ready __read_mostly;
+
+bool printk_percpu_data_ready(void)
+{
+ return __printk_percpu_data_ready;
+}
+#endif
/* Return log buffer address */
char *log_buf_addr_get(void)
{
- return log_buf;
+ return printk_rb.buffer;
}
/* Return log buffer size */
u32 log_buf_len_get(void)
{
- return log_buf_len;
+ return (1 << printk_rb.size_bits);
}
/* human readable text of the record */
@@ -474,180 +420,50 @@ static char *log_dict(const struct printk_log *msg)
return (char *)msg + sizeof(struct printk_log) + msg->text_len;
}
-/* get record by index; idx must point to valid msg */
-static struct printk_log *log_from_idx(u32 idx)
-{
- struct printk_log *msg = (struct printk_log *)(log_buf + idx);
-
- /*
- * A length == 0 record is the end of buffer marker. Wrap around and
- * read the message at the start of the buffer.
- */
- if (!msg->len)
- return (struct printk_log *)log_buf;
- return msg;
-}
-
-/* get next record; idx must point to valid msg */
-static u32 log_next(u32 idx)
-{
- struct printk_log *msg = (struct printk_log *)(log_buf + idx);
-
- /* length == 0 indicates the end of the buffer; wrap */
- /*
- * A length == 0 record is the end of buffer marker. Wrap around and
- * read the message at the start of the buffer as *this* one, and
- * return the one after that.
- */
- if (!msg->len) {
- msg = (struct printk_log *)log_buf;
- return msg->len;
- }
- return idx + msg->len;
-}
-
-/*
- * Check whether there is enough free space for the given message.
- *
- * The same values of first_idx and next_idx mean that the buffer
- * is either empty or full.
- *
- * If the buffer is empty, we must respect the position of the indexes.
- * They cannot be reset to the beginning of the buffer.
- */
-static int logbuf_has_space(u32 msg_size, bool empty)
-{
- u32 free;
-
- if (log_next_idx > log_first_idx || empty)
- free = max(log_buf_len - log_next_idx, log_first_idx);
- else
- free = log_first_idx - log_next_idx;
-
- /*
- * We need space also for an empty header that signalizes wrapping
- * of the buffer.
- */
- return free >= msg_size + sizeof(struct printk_log);
-}
-
-static int log_make_free_space(u32 msg_size)
-{
- while (log_first_seq < log_next_seq &&
- !logbuf_has_space(msg_size, false)) {
- /* drop old messages until we have enough contiguous space */
- log_first_idx = log_next(log_first_idx);
- log_first_seq++;
- }
-
- if (clear_seq < log_first_seq) {
- clear_seq = log_first_seq;
- clear_idx = log_first_idx;
- }
-
- /* sequence numbers are equal, so the log buffer is empty */
- if (logbuf_has_space(msg_size, log_first_seq == log_next_seq))
- return 0;
-
- return -ENOMEM;
-}
-
-/* compute the message size including the padding bytes */
-static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
-{
- u32 size;
-
- size = sizeof(struct printk_log) + text_len + dict_len;
- *pad_len = (-size) & (LOG_ALIGN - 1);
- size += *pad_len;
-
- return size;
-}
-
-/*
- * Define how much of the log buffer we could take at maximum. The value
- * must be greater than two. Note that only half of the buffer is available
- * when the index points to the middle.
- */
-#define MAX_LOG_TAKE_PART 4
-static const char trunc_msg[] = "<truncated>";
-
-static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
- u16 *dict_len, u32 *pad_len)
-{
- /*
- * The message should not take the whole buffer. Otherwise, it might
- * get removed too soon.
- */
- u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
- if (*text_len > max_text_len)
- *text_len = max_text_len;
- /* enable the warning message */
- *trunc_msg_len = strlen(trunc_msg);
- /* disable the "dict" completely */
- *dict_len = 0;
- /* compute the size again, count also the warning message */
- return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
-}
+static void printk_emergency(char *buffer, int level, u64 ts_nsec, u16 cpu,
+ char *text, u16 text_len);
/* insert record into the buffer, discard old ones, update heads */
static int log_store(u32 caller_id, int facility, int level,
- enum log_flags flags, u64 ts_nsec,
+ enum log_flags flags, u64 ts_nsec, u16 cpu,
const char *dict, u16 dict_len,
const char *text, u16 text_len)
{
struct printk_log *msg;
- u32 size, pad_len;
- u16 trunc_msg_len = 0;
-
- /* number of '\0' padding bytes to next message */
- size = msg_used_size(text_len, dict_len, &pad_len);
-
- if (log_make_free_space(size)) {
- /* truncate the message if it is too long for empty buffer */
- size = truncate_msg(&text_len, &trunc_msg_len,
- &dict_len, &pad_len);
- /* survive when the log buffer is too small for trunc_msg */
- if (log_make_free_space(size))
- return 0;
- }
+ struct prb_handle h;
+ char *rbuf;
+ u32 size;
+
+ size = sizeof(*msg) + text_len + dict_len;
- if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
+ rbuf = prb_reserve(&h, &printk_rb, size);
+ if (!rbuf) {
/*
- * This message + an additional empty header does not fit
- * at the end of the buffer. Add an empty header with len == 0
- * to signify a wrap around.
+ * An emergency message would have been printed, but
+ * it cannot be stored in the log.
*/
- memset(log_buf + log_next_idx, 0, sizeof(struct printk_log));
- log_next_idx = 0;
+ prb_inc_lost(&printk_rb);
+ return 0;
}
/* fill message */
- msg = (struct printk_log *)(log_buf + log_next_idx);
+ msg = (struct printk_log *)rbuf;
memcpy(log_text(msg), text, text_len);
msg->text_len = text_len;
- if (trunc_msg_len) {
- memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
- msg->text_len += trunc_msg_len;
- }
memcpy(log_dict(msg), dict, dict_len);
msg->dict_len = dict_len;
msg->facility = facility;
msg->level = level & 7;
msg->flags = flags & 0x1f;
- if (ts_nsec > 0)
- msg->ts_nsec = ts_nsec;
- else
- msg->ts_nsec = local_clock();
+ msg->ts_nsec = ts_nsec;
#ifdef CONFIG_PRINTK_CALLER
msg->caller_id = caller_id;
#endif
- memset(log_dict(msg) + dict_len, 0, pad_len);
+ msg->cpu = cpu;
msg->len = size;
/* insert message */
- log_next_idx += msg->len;
- log_next_seq++;
+ prb_commit(&h);
return msg->text_len;
}
@@ -717,9 +533,9 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
do_div(ts_usec, 1000);
- return scnprintf(buf, size, "%u,%llu,%llu,%c%s;",
+ return scnprintf(buf, size, "%u,%llu,%llu,%c%s,%hu;",
(msg->facility << 3) | msg->level, seq, ts_usec,
- msg->flags & LOG_CONT ? 'c' : '-', caller);
+ msg->flags & LOG_CONT ? 'c' : '-', caller, msg->cpu);
}
static ssize_t msg_print_ext_body(char *buf, size_t size,
@@ -770,13 +586,18 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
return p - buf;
}
+#define PRINTK_SPRINT_MAX (LOG_LINE_MAX + PREFIX_MAX)
+#define PRINTK_RECORD_MAX (sizeof(struct printk_log) + \
+ CONSOLE_EXT_LOG_MAX + PRINTK_SPRINT_MAX)
+
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
u64 seq;
- u32 idx;
+ struct prb_iterator iter;
struct ratelimit_state rs;
struct mutex lock;
char buf[CONSOLE_EXT_LOG_MAX];
+ char msgbuf[PRINTK_RECORD_MAX];
};
static __printf(3, 4) __cold
@@ -859,9 +680,11 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct devkmsg_user *user = file->private_data;
+ struct prb_iterator backup_iter;
struct printk_log *msg;
- size_t len;
ssize_t ret;
+ size_t len;
+ u64 seq;
if (!user)
return -EBADF;
@@ -870,52 +693,63 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
if (ret)
return ret;
- logbuf_lock_irq();
- while (user->seq == log_next_seq) {
- if (file->f_flags & O_NONBLOCK) {
- ret = -EAGAIN;
- logbuf_unlock_irq();
- goto out;
- }
+ /* make a backup copy in case there is a problem */
+ prb_iter_copy(&backup_iter, &user->iter);
- logbuf_unlock_irq();
- ret = wait_event_interruptible(log_wait,
- user->seq != log_next_seq);
- if (ret)
- goto out;
- logbuf_lock_irq();
+ if (file->f_flags & O_NONBLOCK) {
+ ret = prb_iter_next(&user->iter, &user->msgbuf[0],
+ sizeof(user->msgbuf), &seq);
+ } else {
+ ret = prb_iter_wait_next(&user->iter, &user->msgbuf[0],
+ sizeof(user->msgbuf), &seq);
}
-
- if (user->seq < log_first_seq) {
- /* our last seen message is gone, return error and reset */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ if (ret == 0) {
+ /* end of list */
+ ret = -EAGAIN;
+ goto out;
+ } else if (ret == -EINVAL) {
+ /* iterator invalid, return error and reset */
ret = -EPIPE;
- logbuf_unlock_irq();
+ prb_iter_init(&user->iter, &printk_rb, &user->seq);
+ goto out;
+ } else if (ret < 0) {
+ /* interrupted by signal */
goto out;
}
- msg = log_from_idx(user->idx);
+ user->seq++;
+ if (user->seq < seq) {
+ ret = -EPIPE;
+ goto restore_out;
+ }
+
+ msg = (struct printk_log *)&user->msgbuf[0];
len = msg_print_ext_header(user->buf, sizeof(user->buf),
msg, user->seq);
len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
log_dict(msg), msg->dict_len,
log_text(msg), msg->text_len);
- user->idx = log_next(user->idx);
- user->seq++;
- logbuf_unlock_irq();
-
if (len > count) {
ret = -EINVAL;
- goto out;
+ goto restore_out;
}
if (copy_to_user(buf, user->buf, len)) {
ret = -EFAULT;
- goto out;
+ goto restore_out;
}
+
ret = len;
+ goto out;
+restore_out:
+ /*
+ * There was an error, but this message should not be
+ * lost because of it. Restore the backup and setup
+ * seq so that it will work with the next read.
+ */
+ prb_iter_copy(&user->iter, &backup_iter);
+ user->seq = seq - 1;
out:
mutex_unlock(&user->lock);
return ret;
@@ -924,19 +758,22 @@ out:
static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
{
struct devkmsg_user *user = file->private_data;
- loff_t ret = 0;
+ loff_t ret;
+ u64 seq;
if (!user)
return -EBADF;
if (offset)
return -ESPIPE;
- logbuf_lock_irq();
+ ret = mutex_lock_interruptible(&user->lock);
+ if (ret)
+ return ret;
+
switch (whence) {
case SEEK_SET:
/* the first record */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ prb_iter_init(&user->iter, &printk_rb, &user->seq);
break;
case SEEK_DATA:
/*
@@ -944,40 +781,87 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
* like issued by 'dmesg -c'. Reading /dev/kmsg itself
* changes no global state, and does not clear anything.
*/
- user->idx = clear_idx;
- user->seq = clear_seq;
+ for (;;) {
+ prb_iter_init(&user->iter, &printk_rb, &seq);
+ ret = prb_iter_seek(&user->iter, clear_seq);
+ if (ret > 0) {
+ /* seeked to clear seq */
+ user->seq = clear_seq;
+ break;
+ } else if (ret == 0) {
+ /*
+ * The end of the list was hit without
+ * ever seeing the clear seq. Just
+ * seek to the beginning of the list.
+ */
+ prb_iter_init(&user->iter, &printk_rb,
+ &user->seq);
+ break;
+ }
+ /* iterator invalid, start over */
+
+ /* reset clear_seq if it is no longer available */
+ if (seq > clear_seq)
+ clear_seq = 0;
+ }
+ ret = 0;
break;
case SEEK_END:
/* after the last record */
- user->idx = log_next_idx;
- user->seq = log_next_seq;
+ for (;;) {
+ ret = prb_iter_next(&user->iter, NULL, 0, &user->seq);
+ if (ret == 0)
+ break;
+ else if (ret > 0)
+ continue;
+ /* iterator invalid, start over */
+ prb_iter_init(&user->iter, &printk_rb, &user->seq);
+ }
+ ret = 0;
break;
default:
ret = -EINVAL;
}
- logbuf_unlock_irq();
+
+ mutex_unlock(&user->lock);
return ret;
}
+struct wait_queue_head *printk_wait_queue(void)
+{
+ /* FIXME: using prb internals! */
+ return printk_rb.wq;
+}
+
static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
{
struct devkmsg_user *user = file->private_data;
+ struct prb_iterator iter;
__poll_t ret = 0;
+ int rbret;
+ u64 seq;
if (!user)
return EPOLLERR|EPOLLNVAL;
- poll_wait(file, &log_wait, wait);
+ poll_wait(file, printk_wait_queue(), wait);
- logbuf_lock_irq();
- if (user->seq < log_next_seq) {
- /* return error when data has vanished underneath us */
- if (user->seq < log_first_seq)
- ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
- else
- ret = EPOLLIN|EPOLLRDNORM;
- }
- logbuf_unlock_irq();
+ mutex_lock(&user->lock);
+
+ /* use copy so no actual iteration takes place */
+ prb_iter_copy(&iter, &user->iter);
+
+ rbret = prb_iter_next(&iter, &user->msgbuf[0],
+ sizeof(user->msgbuf), &seq);
+ if (rbret == 0)
+ goto out;
+
+ ret = EPOLLIN|EPOLLRDNORM;
+
+ if (rbret < 0 || (seq - user->seq) != 1)
+ ret |= EPOLLERR|EPOLLPRI;
+out:
+ mutex_unlock(&user->lock);
return ret;
}
@@ -1007,10 +891,7 @@ static int devkmsg_open(struct inode *inode, struct file *file)
mutex_init(&user->lock);
- logbuf_lock_irq();
- user->idx = log_first_idx;
- user->seq = log_first_seq;
- logbuf_unlock_irq();
+ prb_iter_init(&user->iter, &printk_rb, &user->seq);
file->private_data = user;
return 0;
@@ -1050,11 +931,6 @@ const struct file_operations kmsg_fops = {
*/
void log_buf_vmcoreinfo_setup(void)
{
- VMCOREINFO_SYMBOL(log_buf);
- VMCOREINFO_SYMBOL(log_buf_len);
- VMCOREINFO_SYMBOL(log_first_idx);
- VMCOREINFO_SYMBOL(clear_idx);
- VMCOREINFO_SYMBOL(log_next_idx);
/*
* Export struct printk_log size and field offsets. User space tools can
* parse it and detect any changes to structure down the line.
@@ -1070,6 +946,8 @@ void log_buf_vmcoreinfo_setup(void)
}
#endif
+/* FIXME: no support for buffer resizing */
+#if 0
/* requested log_buf_len from kernel cmdline */
static unsigned long __initdata new_log_buf_len;
@@ -1135,13 +1013,33 @@ static void __init log_buf_add_cpu(void)
#else /* !CONFIG_SMP */
static inline void log_buf_add_cpu(void) {}
#endif /* CONFIG_SMP */
+#endif /* 0 */
+
+#if 0
+static void __init set_percpu_data_ready(void)
+{
+ /* Make sure we set this flag only after printk_safe() init is done */
+ barrier();
+ __printk_percpu_data_ready = true;
+}
+#endif
void __init setup_log_buf(int early)
{
+/* FIXME: no support for buffer resizing */
+#if 0
unsigned long flags;
char *new_log_buf;
unsigned int free;
+ /*
+ * Some archs call setup_log_buf() multiple times - first is very
+ * early, e.g. from setup_arch(), and second - when percpu_areas
+ * are initialised.
+ */
+ if (!early)
+ set_percpu_data_ready();
+
if (log_buf != __log_buf)
return;
@@ -1169,6 +1067,7 @@ void __init setup_log_buf(int early)
pr_info("log_buf_len: %u bytes\n", log_buf_len);
pr_info("early log buf free: %u(%u%%)\n",
free, (free * 100) / __LOG_BUF_LEN);
+#endif
}
static bool __read_mostly ignore_loglevel;
@@ -1249,6 +1148,11 @@ static inline void boot_delay_msec(int level)
static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
+static size_t print_cpu(u16 cpu, char *buf)
+{
+ return sprintf(buf, "%03hu: ", cpu);
+}
+
static size_t print_syslog(unsigned int level, char *buf)
{
return sprintf(buf, "<%u>", level);
@@ -1292,6 +1196,7 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog,
buf[len++] = ' ';
buf[len] = '\0';
}
+ len += print_cpu(msg->cpu, buf + len);
return len;
}
@@ -1337,30 +1242,42 @@ static size_t msg_print_text(const struct printk_log *msg, bool syslog,
return len;
}
-static int syslog_print(char __user *buf, int size)
+static int syslog_print(char __user *buf, int size, char *text,
+ char *msgbuf, int *locked)
{
- char *text;
+ struct prb_iterator iter;
struct printk_log *msg;
int len = 0;
-
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
- if (!text)
- return -ENOMEM;
+ u64 seq;
+ int ret;
while (size > 0) {
size_t n;
size_t skip;
- logbuf_lock_irq();
- if (syslog_seq < log_first_seq) {
- /* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
- syslog_partial = 0;
+ for (;;) {
+ prb_iter_copy(&iter, &syslog_iter);
+ ret = prb_iter_next(&iter, msgbuf,
+ PRINTK_RECORD_MAX, &seq);
+ if (ret < 0) {
+ /* messages are gone, move to first one */
+ prb_iter_init(&syslog_iter, &printk_rb,
+ &syslog_seq);
+ syslog_partial = 0;
+ continue;
+ }
+ break;
}
- if (syslog_seq == log_next_seq) {
- logbuf_unlock_irq();
+ if (ret == 0)
break;
+
+ /*
+ * If messages have been missed, the partial tracker
+ * is no longer valid and must be reset.
+ */
+ if (syslog_seq > 0 && seq - 1 != syslog_seq) {
+ syslog_seq = seq - 1;
+ syslog_partial = 0;
}
/*
@@ -1370,131 +1287,215 @@ static int syslog_print(char __user *buf, int size)
if (!syslog_partial)
syslog_time = printk_time;
+ msg = (struct printk_log *)msgbuf;
+
skip = syslog_partial;
- msg = log_from_idx(syslog_idx);
n = msg_print_text(msg, true, syslog_time, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ PRINTK_SPRINT_MAX);
if (n - syslog_partial <= size) {
/* message fits into buffer, move forward */
- syslog_idx = log_next(syslog_idx);
- syslog_seq++;
+ prb_iter_next(&syslog_iter, NULL, 0, &syslog_seq);
n -= syslog_partial;
syslog_partial = 0;
- } else if (!len){
+ } else if (!len) {
/* partial read(), remember position */
n = size;
syslog_partial += n;
} else
n = 0;
- logbuf_unlock_irq();
if (!n)
break;
+ mutex_unlock(&syslog_lock);
if (copy_to_user(buf, text + skip, n)) {
if (!len)
len = -EFAULT;
+ *locked = 0;
break;
}
+ ret = mutex_lock_interruptible(&syslog_lock);
len += n;
size -= n;
buf += n;
+
+ if (ret) {
+ if (!len)
+ len = ret;
+ *locked = 0;
+ break;
+ }
}
- kfree(text);
return len;
}
-static int syslog_print_all(char __user *buf, int size, bool clear)
+static int count_remaining(struct prb_iterator *iter, u64 until_seq,
+ char *msgbuf, int size, bool records, bool time)
{
- char *text;
+ struct prb_iterator local_iter;
+ struct printk_log *msg;
int len = 0;
- u64 next_seq;
u64 seq;
- u32 idx;
+ int ret;
+
+ prb_iter_copy(&local_iter, iter);
+ for (;;) {
+ ret = prb_iter_next(&local_iter, msgbuf, size, &seq);
+ if (ret == 0) {
+ break;
+ } else if (ret < 0) {
+ /* the iter is invalid, restart from head */
+ prb_iter_init(&local_iter, &printk_rb, NULL);
+ len = 0;
+ continue;
+ }
+
+ if (until_seq && seq >= until_seq)
+ break;
+
+ if (records) {
+ len++;
+ } else {
+ msg = (struct printk_log *)msgbuf;
+ len += msg_print_text(msg, true, time, NULL, 0);
+ }
+ }
+
+ return len;
+}
+
+static void syslog_clear(void)
+{
+ struct prb_iterator iter;
+ int ret;
+
+ prb_iter_init(&iter, &printk_rb, &clear_seq);
+ for (;;) {
+ ret = prb_iter_next(&iter, NULL, 0, &clear_seq);
+ if (ret == 0)
+ break;
+ else if (ret < 0)
+ prb_iter_init(&iter, &printk_rb, &clear_seq);
+ }
+}
+
+static int syslog_print_all(char __user *buf, int size, bool clear)
+{
+ struct prb_iterator iter;
+ struct printk_log *msg;
+ char *msgbuf = NULL;
+ char *text = NULL;
+ int textlen;
+ u64 seq = 0;
+ int len = 0;
bool time;
+ int ret;
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
+ msgbuf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
+ if (!msgbuf) {
+ kfree(text);
+ return -ENOMEM;
+ }
time = printk_time;
- logbuf_lock_irq();
+
/*
- * Find first record that fits, including all following records,
- * into the user-provided buffer for this dump.
+ * Setup iter to last event before clear. Clear may
+ * be lost, but keep going with a best effort.
*/
- seq = clear_seq;
- idx = clear_idx;
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- len += msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
-
- /* move first record forward until length fits into the buffer */
- seq = clear_seq;
- idx = clear_idx;
- while (len > size && seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ prb_iter_init(&iter, &printk_rb, NULL);
+ prb_iter_seek(&iter, clear_seq);
+
+ /* count the total bytes after clear */
+ len = count_remaining(&iter, 0, msgbuf, PRINTK_RECORD_MAX,
+ false, time);
+
+ /* move iter forward until length fits into the buffer */
+ while (len > size) {
+ ret = prb_iter_next(&iter, msgbuf,
+ PRINTK_RECORD_MAX, &seq);
+ if (ret == 0) {
+ break;
+ } else if (ret < 0) {
+ /*
+ * The iter is now invalid so clear will
+ * also be invalid. Restart from the head.
+ */
+ prb_iter_init(&iter, &printk_rb, NULL);
+ len = count_remaining(&iter, 0, msgbuf,
+ PRINTK_RECORD_MAX, false, time);
+ continue;
+ }
+ msg = (struct printk_log *)msgbuf;
len -= msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
- /* last message fitting into this dump */
- next_seq = log_next_seq;
+ if (clear)
+ clear_seq = seq;
+ }
+ /* copy messages to buffer */
len = 0;
- while (len >= 0 && seq < next_seq) {
- struct printk_log *msg = log_from_idx(idx);
- int textlen = msg_print_text(msg, true, time, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ while (len >= 0 && len < size) {
+ if (clear)
+ clear_seq = seq;
- idx = log_next(idx);
- seq++;
+ ret = prb_iter_next(&iter, msgbuf,
+ PRINTK_RECORD_MAX, &seq);
+ if (ret == 0) {
+ break;
+ } else if (ret < 0) {
+ /*
+ * The iter is now invalid. Make a best
+ * effort to grab the rest of the log
+ * from the new head.
+ */
+ prb_iter_init(&iter, &printk_rb, NULL);
+ continue;
+ }
+
+ msg = (struct printk_log *)msgbuf;
+ textlen = msg_print_text(msg, true, time, text,
+ PRINTK_SPRINT_MAX);
+ if (textlen < 0) {
+ len = textlen;
+ break;
+ }
+
+ if (len + textlen > size)
+ break;
- logbuf_unlock_irq();
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
- logbuf_lock_irq();
-
- if (seq < log_first_seq) {
- /* messages are gone, move to next one */
- seq = log_first_seq;
- idx = log_first_idx;
- }
}
- if (clear) {
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
- }
- logbuf_unlock_irq();
+ if (clear && !seq)
+ syslog_clear();
- kfree(text);
+ if (text)
+ kfree(text);
+ if (msgbuf)
+ kfree(msgbuf);
return len;
}
-static void syslog_clear(void)
-{
- logbuf_lock_irq();
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
- logbuf_unlock_irq();
-}
-
int do_syslog(int type, char __user *buf, int len, int source)
{
bool clear = false;
static int saved_console_loglevel = LOGLEVEL_DEFAULT;
+ struct prb_iterator iter;
+ char *msgbuf = NULL;
+ char *text = NULL;
+ int locked;
int error;
+ int ret;
error = check_syslog_permissions(type, source);
if (error)
@@ -1512,11 +1513,49 @@ int do_syslog(int type, char __user *buf, int len, int source)
return 0;
if (!access_ok(buf, len))
return -EFAULT;
- error = wait_event_interruptible(log_wait,
- syslog_seq != log_next_seq);
+
+ text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL);
+ msgbuf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
+ if (!text || !msgbuf) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ error = mutex_lock_interruptible(&syslog_lock);
if (error)
- return error;
- error = syslog_print(buf, len);
+ goto out;
+
+ /*
+ * Wait until a first message is available. Use a copy
+ * because no iteration should occur for syslog now.
+ */
+ for (;;) {
+ prb_iter_copy(&iter, &syslog_iter);
+
+ mutex_unlock(&syslog_lock);
+ ret = prb_iter_wait_next(&iter, NULL, 0, NULL);
+ if (ret == -ERESTARTSYS) {
+ error = ret;
+ goto out;
+ }
+ error = mutex_lock_interruptible(&syslog_lock);
+ if (error)
+ goto out;
+
+ if (ret == -EINVAL) {
+ prb_iter_init(&syslog_iter, &printk_rb,
+ &syslog_seq);
+ syslog_partial = 0;
+ continue;
+ }
+ break;
+ }
+
+ /* print as much as will fit in the user buffer */
+ locked = 1;
+ error = syslog_print(buf, len, text, msgbuf, &locked);
+ if (locked)
+ mutex_unlock(&syslog_lock);
break;
/* Read/clear last kernel messages */
case SYSLOG_ACTION_READ_CLEAR:
@@ -1561,47 +1600,45 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- logbuf_lock_irq();
- if (syslog_seq < log_first_seq) {
- /* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
- syslog_partial = 0;
- }
+ msgbuf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
+ if (!msgbuf)
+ return -ENOMEM;
+
+ error = mutex_lock_interruptible(&syslog_lock);
+ if (error)
+ goto out;
+
if (source == SYSLOG_FROM_PROC) {
/*
* Short-cut for poll(/"proc/kmsg") which simply checks
* for pending data, not the size; return the count of
* records, not the length.
*/
- error = log_next_seq - syslog_seq;
+ error = count_remaining(&syslog_iter, 0, msgbuf,
+ PRINTK_RECORD_MAX, true,
+ printk_time);
} else {
- u64 seq = syslog_seq;
- u32 idx = syslog_idx;
- bool time = syslog_partial ? syslog_time : printk_time;
-
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- error += msg_print_text(msg, true, time, NULL,
- 0);
- time = printk_time;
- idx = log_next(idx);
- seq++;
- }
+ error = count_remaining(&syslog_iter, 0, msgbuf,
+ PRINTK_RECORD_MAX, false,
+ printk_time);
error -= syslog_partial;
}
- logbuf_unlock_irq();
+
+ mutex_unlock(&syslog_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
- error = log_buf_len;
+ error = prb_buffer_size(&printk_rb);
break;
default:
error = -EINVAL;
break;
}
-
+out:
+ if (msgbuf)
+ kfree(msgbuf);
+ if (text)
+ kfree(text);
return error;
}
@@ -1610,144 +1647,128 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
return do_syslog(type, buf, len, SYSLOG_FROM_READER);
}
-/*
- * Special console_lock variants that help to reduce the risk of soft-lockups.
- * They allow to pass console_lock to another printk() call using a busy wait.
- */
+int printk_delay_msec __read_mostly;
-#ifdef CONFIG_LOCKDEP
-static struct lockdep_map console_owner_dep_map = {
- .name = "console_owner"
-};
-#endif
+static inline void printk_delay(int level)
+{
+ boot_delay_msec(level);
+ if (unlikely(printk_delay_msec)) {
+ int m = printk_delay_msec;
-static DEFINE_RAW_SPINLOCK(console_owner_lock);
-static struct task_struct *console_owner;
-static bool console_waiter;
+ while (m--) {
+ mdelay(1);
+ touch_nmi_watchdog();
+ }
+ }
+}
-/**
- * console_lock_spinning_enable - mark beginning of code where another
- * thread might safely busy wait
- *
- * This basically converts console_lock into a spinlock. This marks
- * the section where the console_lock owner can not sleep, because
- * there may be a waiter spinning (like a spinlock). Also it must be
- * ready to hand over the lock at the end of the section.
- */
-static void console_lock_spinning_enable(void)
+static void print_console_dropped(struct console *con, u64 count)
{
- raw_spin_lock(&console_owner_lock);
- console_owner = current;
- raw_spin_unlock(&console_owner_lock);
+ char text[64];
+ int len;
- /* The waiter may spin on us after setting console_owner */
- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
+ len = sprintf(text, "** %llu printk message%s dropped **\n",
+ count, count > 1 ? "s" : "");
+ con->write(con, text, len);
}
-/**
- * console_lock_spinning_disable_and_check - mark end of code where another
- * thread was able to busy wait and check if there is a waiter
- *
- * This is called at the end of the section where spinning is allowed.
- * It has two functions. First, it is a signal that it is no longer
- * safe to start busy waiting for the lock. Second, it checks if
- * there is a busy waiter and passes the lock rights to her.
- *
- * Important: Callers lose the lock if there was a busy waiter.
- * They must not touch items synchronized by console_lock
- * in this case.
- *
- * Return: 1 if the lock rights were passed, 0 otherwise.
- */
-static int console_lock_spinning_disable_and_check(void)
+static void format_text(struct printk_log *msg, u64 seq,
+ char *ext_text, size_t *ext_len,
+ char *text, size_t *len, bool time)
{
- int waiter;
-
- raw_spin_lock(&console_owner_lock);
- waiter = READ_ONCE(console_waiter);
- console_owner = NULL;
- raw_spin_unlock(&console_owner_lock);
+ if (suppress_message_printing(msg->level)) {
+ /*
+ * Skip record that has level above the console
+ * loglevel and update each console's local seq.
+ */
+ *len = 0;
+ *ext_len = 0;
+ return;
+ }
- if (!waiter) {
- spin_release(&console_owner_dep_map, 1, _THIS_IP_);
- return 0;
+ *len = msg_print_text(msg, console_msg_format & MSG_FORMAT_SYSLOG,
+ time, text, PRINTK_SPRINT_MAX);
+ if (nr_ext_console_drivers) {
+ *ext_len = msg_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX,
+ msg, seq);
+ *ext_len += msg_print_ext_body(ext_text + *ext_len,
+ CONSOLE_EXT_LOG_MAX - *ext_len,
+ log_dict(msg), msg->dict_len,
+ log_text(msg), msg->text_len);
+ } else {
+ *ext_len = 0;
}
+}
- /* The waiter is now free to continue */
- WRITE_ONCE(console_waiter, false);
+static void printk_write_history(struct console *con, u64 master_seq)
+{
+ struct prb_iterator iter;
+ bool time = printk_time;
+ static char *ext_text;
+ static char *text;
+ static char *buf;
+ u64 seq;
- spin_release(&console_owner_dep_map, 1, _THIS_IP_);
+ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL);
+ buf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
+ if (!ext_text || !text || !buf)
+ return;
- /*
- * Hand off console_lock to waiter. The waiter will perform
- * the up(). After this, the waiter is the console_lock owner.
- */
- mutex_release(&console_lock_dep_map, 1, _THIS_IP_);
- return 1;
-}
+ if (!(con->flags & CON_ENABLED))
+ goto out;
-/**
- * console_trylock_spinning - try to get console_lock by busy waiting
- *
- * This allows to busy wait for the console_lock when the current
- * owner is running in specially marked sections. It means that
- * the current owner is running and cannot reschedule until it
- * is ready to lose the lock.
- *
- * Return: 1 if we got the lock, 0 othrewise
- */
-static int console_trylock_spinning(void)
-{
- struct task_struct *owner = NULL;
- bool waiter;
- bool spin = false;
- unsigned long flags;
+ if (!con->write)
+ goto out;
- if (console_trylock())
- return 1;
+ if (!cpu_online(raw_smp_processor_id()) &&
+ !(con->flags & CON_ANYTIME))
+ goto out;
- printk_safe_enter_irqsave(flags);
+ prb_iter_init(&iter, &printk_rb, NULL);
- raw_spin_lock(&console_owner_lock);
- owner = READ_ONCE(console_owner);
- waiter = READ_ONCE(console_waiter);
- if (!waiter && owner && owner != current) {
- WRITE_ONCE(console_waiter, true);
- spin = true;
- }
- raw_spin_unlock(&console_owner_lock);
+ for (;;) {
+ struct printk_log *msg;
+ size_t ext_len;
+ size_t len;
+ int ret;
- /*
- * If there is an active printk() writing to the
- * consoles, instead of having it write our data too,
- * see if we can offload that load from the active
- * printer, and do some printing ourselves.
- * Go into a spin only if there isn't already a waiter
- * spinning, and there is an active printer, and
- * that active printer isn't us (recursive printk?).
- */
- if (!spin) {
- printk_safe_exit_irqrestore(flags);
- return 0;
- }
+ ret = prb_iter_next(&iter, buf, PRINTK_RECORD_MAX, &seq);
+ if (ret == 0) {
+ break;
+ } else if (ret < 0) {
+ prb_iter_init(&iter, &printk_rb, NULL);
+ continue;
+ }
- /* We spin waiting for the owner to release us */
- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
- /* Owner will clear console_waiter on hand off */
- while (READ_ONCE(console_waiter))
- cpu_relax();
- spin_release(&console_owner_dep_map, 1, _THIS_IP_);
+ if (seq > master_seq)
+ break;
- printk_safe_exit_irqrestore(flags);
- /*
- * The owner passed the console lock to us.
- * Since we did not spin on console lock, annotate
- * this as a trylock. Otherwise lockdep will
- * complain.
- */
- mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);
+ con->printk_seq++;
+ if (con->printk_seq < seq) {
+ print_console_dropped(con, seq - con->printk_seq);
+ con->printk_seq = seq;
+ }
- return 1;
+ msg = (struct printk_log *)buf;
+ format_text(msg, master_seq, ext_text, &ext_len, text,
+ &len, time);
+
+ if (len == 0 && ext_len == 0)
+ continue;
+
+ if (con->flags & CON_EXTENDED)
+ con->write(con, ext_text, ext_len);
+ else
+ con->write(con, text, len);
+
+ printk_delay(msg->level);
+ }
+out:
+ con->wrote_history = 1;
+ kfree(ext_text);
+ kfree(text);
+ kfree(buf);
}
/*
@@ -1755,8 +1776,9 @@ static int console_trylock_spinning(void)
* log_buf[start] to log_buf[end - 1].
* The console_lock must be held.
*/
-static void call_console_drivers(const char *ext_text, size_t ext_len,
- const char *text, size_t len)
+static void call_console_drivers(u64 seq, const char *ext_text, size_t ext_len,
+ const char *text, size_t len, int level,
+ int facility)
{
struct console *con;
@@ -1766,15 +1788,40 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
return;
for_each_console(con) {
- if (exclusive_console && con != exclusive_console)
- continue;
if (!(con->flags & CON_ENABLED))
continue;
+ if (!con->wrote_history) {
+ if (con->flags & CON_PRINTBUFFER) {
+ printk_write_history(con, seq);
+ continue;
+ }
+ con->wrote_history = 1;
+ con->printk_seq = seq - 1;
+ }
+ if (con->flags & CON_BOOT && facility == 0) {
+ /* skip boot messages, already printed */
+ if (con->printk_seq < seq)
+ con->printk_seq = seq;
+ continue;
+ }
if (!con->write)
continue;
- if (!cpu_online(smp_processor_id()) &&
+ if (!cpu_online(raw_smp_processor_id()) &&
!(con->flags & CON_ANYTIME))
continue;
+ if (con->printk_seq >= seq)
+ continue;
+
+ con->printk_seq++;
+ if (con->printk_seq < seq) {
+ print_console_dropped(con, seq - con->printk_seq);
+ con->printk_seq = seq;
+ }
+
+ /* for supressed messages, only seq is updated */
+ if (len == 0 && ext_len == 0)
+ continue;
+
if (con->flags & CON_EXTENDED)
con->write(con, ext_text, ext_len);
else
@@ -1782,20 +1829,6 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
}
}
-int printk_delay_msec __read_mostly;
-
-static inline void printk_delay(void)
-{
- if (unlikely(printk_delay_msec)) {
- int m = printk_delay_msec;
-
- while (m--) {
- mdelay(1);
- touch_nmi_watchdog();
- }
- }
-}
-
static inline u32 printk_caller_id(void)
{
return in_task() ? task_pid_nr(current) :
@@ -1812,101 +1845,94 @@ static struct cont {
char buf[LOG_LINE_MAX];
size_t len; /* length == 0 means unused buffer */
u32 caller_id; /* printk_caller_id() of first print */
+ int cpu_owner; /* cpu of first print */
u64 ts_nsec; /* time of first print */
u8 level; /* log level of first message */
u8 facility; /* log facility of first message */
enum log_flags flags; /* prefix, newline flags */
-} cont;
+} cont[2];
-static void cont_flush(void)
+static void cont_flush(int ctx)
{
- if (cont.len == 0)
+ struct cont *c = &cont[ctx];
+
+ if (c->len == 0)
return;
- log_store(cont.caller_id, cont.facility, cont.level, cont.flags,
- cont.ts_nsec, NULL, 0, cont.buf, cont.len);
- cont.len = 0;
+ log_store(c->caller_id, c->facility, c->level, c->flags,
+ c->ts_nsec, c->cpu_owner, NULL, 0, c->buf, c->len);
+ c->len = 0;
}
-static bool cont_add(u32 caller_id, int facility, int level,
+static void cont_add(int ctx, int cpu, u32 caller_id, int facility, int level,
enum log_flags flags, const char *text, size_t len)
{
+ struct cont *c = &cont[ctx];
+
+ if (cpu != c->cpu_owner || !(flags & LOG_CONT))
+ cont_flush(ctx);
+
/* If the line gets too long, split it up in separate records. */
- if (cont.len + len > sizeof(cont.buf)) {
- cont_flush();
- return false;
- }
+ while (c->len + len > sizeof(c->buf))
+ cont_flush(ctx);
- if (!cont.len) {
- cont.facility = facility;
- cont.level = level;
- cont.caller_id = caller_id;
- cont.ts_nsec = local_clock();
- cont.flags = flags;
+ if (!c->len) {
+ c->facility = facility;
+ c->level = level;
+ c->caller_id = caller_id;
+ c->ts_nsec = local_clock();
+ c->flags = flags;
+ c->cpu_owner = cpu;
}
- memcpy(cont.buf + cont.len, text, len);
- cont.len += len;
+ memcpy(c->buf + c->len, text, len);
+ c->len += len;
// The original flags come from the first line,
// but later continuations can add a newline.
if (flags & LOG_NEWLINE) {
- cont.flags |= LOG_NEWLINE;
- cont_flush();
+ c->flags |= LOG_NEWLINE;
}
-
- return true;
}
-static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len)
+/* ring buffer used as memory allocator for temporary sprint buffers */
+DECLARE_STATIC_PRINTKRB(sprint_rb,
+ ilog2(PRINTK_RECORD_MAX + sizeof(struct prb_entry) +
+ sizeof(long)) + 2, &printk_cpulock);
+
+asmlinkage int vprintk_emit(int facility, int level,
+ const char *dict, size_t dictlen,
+ const char *fmt, va_list args)
{
const u32 caller_id = printk_caller_id();
+ int ctx = !!in_nmi();
+ enum log_flags lflags = 0;
+ int printed_len = 0;
+ struct prb_handle h;
+ size_t text_len;
+ u64 ts_nsec;
+ char *text;
+ char *rbuf;
+ int cpu;
- /*
- * If an earlier line was buffered, and we're a continuation
- * write from the same context, try to add it to the buffer.
- */
- if (cont.len) {
- if (cont.caller_id == caller_id && (lflags & LOG_CONT)) {
- if (cont_add(caller_id, facility, level, lflags, text, text_len))
- return text_len;
- }
- /* Otherwise, make sure it's flushed */
- cont_flush();
- }
-
- /* Skip empty continuation lines that couldn't be added - they just flush */
- if (!text_len && (lflags & LOG_CONT))
- return 0;
+ ts_nsec = local_clock();
- /* If it doesn't end in a newline, try to buffer the current line */
- if (!(lflags & LOG_NEWLINE)) {
- if (cont_add(caller_id, facility, level, lflags, text, text_len))
- return text_len;
+ rbuf = prb_reserve(&h, &sprint_rb, PRINTK_SPRINT_MAX);
+ if (!rbuf) {
+ prb_inc_lost(&printk_rb);
+ return printed_len;
}
- /* Store it in the record log */
- return log_store(caller_id, facility, level, lflags, 0,
- dict, dictlen, text, text_len);
-}
-
-/* Must be called under logbuf_lock. */
-int vprintk_store(int facility, int level,
- const char *dict, size_t dictlen,
- const char *fmt, va_list args)
-{
- static char textbuf[LOG_LINE_MAX];
- char *text = textbuf;
- size_t text_len;
- enum log_flags lflags = 0;
+ cpu = raw_smp_processor_id();
/*
- * The printf needs to come first; we need the syslog
- * prefix which might be passed-in as a parameter.
+ * If this turns out to be an emergency message, there
+ * may need to be a prefix added. Leave room for it.
*/
- text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
+ text = rbuf + PREFIX_MAX;
+ text_len = vscnprintf(text, PRINTK_SPRINT_MAX - PREFIX_MAX, fmt, args);
- /* mark and strip a trailing newline */
+ /* strip and flag a trailing newline */
if (text_len && text[text_len-1] == '\n') {
text_len--;
lflags |= LOG_NEWLINE;
@@ -1937,62 +1963,37 @@ int vprintk_store(int facility, int level,
if (dict)
lflags |= LOG_NEWLINE;
- return log_output(facility, level, lflags,
- dict, dictlen, text, text_len);
-}
-
-asmlinkage int vprintk_emit(int facility, int level,
- const char *dict, size_t dictlen,
- const char *fmt, va_list args)
-{
- int printed_len;
- bool in_sched = false, pending_output;
- unsigned long flags;
- u64 curr_log_seq;
-
- /* Suppress unimportant messages after panic happens */
- if (unlikely(suppress_printk))
- return 0;
-
- if (level == LOGLEVEL_SCHED) {
- level = LOGLEVEL_DEFAULT;
- in_sched = true;
+ /*
+ * NOTE:
+ * - rbuf points to beginning of allocated buffer
+ * - text points to beginning of text
+ * - there is room before text for prefix
+ */
+ if (facility == 0) {
+ /* only the kernel can create emergency messages */
+ printk_emergency(rbuf, level & 7, ts_nsec, cpu, text, text_len);
}
- boot_delay_msec(level);
- printk_delay();
-
- /* This stops the holder of console_sem just where we want him */
- logbuf_lock_irqsave(flags);
- curr_log_seq = log_next_seq;
- printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args);
- pending_output = (curr_log_seq != log_next_seq);
- logbuf_unlock_irqrestore(flags);
-
- /* If called from the scheduler, we can not call up(). */
- if (!in_sched && pending_output) {
- /*
- * Disable preemption to avoid being preempted while holding
- * console_sem which would prevent anyone from printing to
- * console
- */
- preempt_disable();
- /*
- * Try to acquire and then immediately release the console
- * semaphore. The release will print out buffers and wake up
- * /dev/kmsg and syslog() users.
- */
- if (console_trylock_spinning())
- console_unlock();
- preempt_enable();
+ if ((lflags & LOG_CONT) || !(lflags & LOG_NEWLINE)) {
+ cont_add(ctx, cpu, caller_id, facility, level, lflags, text, text_len);
+ printed_len = text_len;
+ } else {
+ if (cpu == cont[ctx].cpu_owner)
+ cont_flush(ctx);
+ printed_len = log_store(caller_id, facility, level, lflags, ts_nsec, cpu,
+ dict, dictlen, text, text_len);
}
- if (pending_output)
- wake_up_klogd();
+ prb_commit(&h);
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
+static __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+}
+
asmlinkage int vprintk(const char *fmt, va_list args)
{
return vprintk_func(fmt, args);
@@ -2049,39 +2050,6 @@ asmlinkage __visible int printk(const char *fmt, ...)
return r;
}
EXPORT_SYMBOL(printk);
-
-#else /* CONFIG_PRINTK */
-
-#define LOG_LINE_MAX 0
-#define PREFIX_MAX 0
-#define printk_time false
-
-static u64 syslog_seq;
-static u32 syslog_idx;
-static u64 console_seq;
-static u32 console_idx;
-static u64 exclusive_console_stop_seq;
-static u64 log_first_seq;
-static u32 log_first_idx;
-static u64 log_next_seq;
-static char *log_text(const struct printk_log *msg) { return NULL; }
-static char *log_dict(const struct printk_log *msg) { return NULL; }
-static struct printk_log *log_from_idx(u32 idx) { return NULL; }
-static u32 log_next(u32 idx) { return 0; }
-static ssize_t msg_print_ext_header(char *buf, size_t size,
- struct printk_log *msg,
- u64 seq) { return 0; }
-static ssize_t msg_print_ext_body(char *buf, size_t size,
- char *dict, size_t dict_len,
- char *text, size_t text_len) { return 0; }
-static void console_lock_spinning_enable(void) { }
-static int console_lock_spinning_disable_and_check(void) { return 0; }
-static void call_console_drivers(const char *ext_text, size_t ext_len,
- const char *text, size_t len) {}
-static size_t msg_print_text(const struct printk_log *msg, bool syslog,
- bool time, char *buf, size_t size) { return 0; }
-static bool suppress_message_printing(int level) { return false; }
-
#endif /* CONFIG_PRINTK */
#ifdef CONFIG_EARLY_PRINTK
@@ -2312,187 +2280,23 @@ int is_console_locked(void)
}
EXPORT_SYMBOL(is_console_locked);
-/*
- * Check if we have any console that is capable of printing while cpu is
- * booting or shutting down. Requires console_sem.
- */
-static int have_callable_console(void)
-{
- struct console *con;
-
- for_each_console(con)
- if ((con->flags & CON_ENABLED) &&
- (con->flags & CON_ANYTIME))
- return 1;
-
- return 0;
-}
-
-/*
- * Can we actually use the console at this time on this cpu?
- *
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
- */
-static inline int can_use_console(void)
-{
- return cpu_online(raw_smp_processor_id()) || have_callable_console();
-}
-
/**
* console_unlock - unlock the console system
*
* Releases the console_lock which the caller holds on the console system
* and the console driver list.
*
- * While the console_lock was held, console output may have been buffered
- * by printk(). If this is the case, console_unlock(); emits
- * the output prior to releasing the lock.
- *
- * If there is output waiting, we wake /dev/kmsg and syslog() users.
- *
* console_unlock(); may be called from any context.
*/
void console_unlock(void)
{
- static char ext_text[CONSOLE_EXT_LOG_MAX];
- static char text[LOG_LINE_MAX + PREFIX_MAX];
- unsigned long flags;
- bool do_cond_resched, retry;
-
if (console_suspended) {
up_console_sem();
return;
}
- /*
- * Console drivers are called with interrupts disabled, so
- * @console_may_schedule should be cleared before; however, we may
- * end up dumping a lot of lines, for example, if called from
- * console registration path, and should invoke cond_resched()
- * between lines if allowable. Not doing so can cause a very long
- * scheduling stall on a slow console leading to RCU stall and
- * softlockup warnings which exacerbate the issue with more
- * messages practically incapacitating the system.
- *
- * console_trylock() is not able to detect the preemptive
- * context reliably. Therefore the value must be stored before
- * and cleared after the the "again" goto label.
- */
- do_cond_resched = console_may_schedule;
-again:
- console_may_schedule = 0;
-
- /*
- * We released the console_sem lock, so we need to recheck if
- * cpu is online and (if not) is there at least one CON_ANYTIME
- * console.
- */
- if (!can_use_console()) {
- console_locked = 0;
- up_console_sem();
- return;
- }
-
- for (;;) {
- struct printk_log *msg;
- size_t ext_len = 0;
- size_t len;
-
- printk_safe_enter_irqsave(flags);
- raw_spin_lock(&logbuf_lock);
- if (console_seq < log_first_seq) {
- len = sprintf(text,
- "** %llu printk messages dropped **\n",
- log_first_seq - console_seq);
-
- /* messages are gone, move to first one */
- console_seq = log_first_seq;
- console_idx = log_first_idx;
- } else {
- len = 0;
- }
-skip:
- if (console_seq == log_next_seq)
- break;
-
- msg = log_from_idx(console_idx);
- if (suppress_message_printing(msg->level)) {
- /*
- * Skip record we have buffered and already printed
- * directly to the console when we received it, and
- * record that has level above the console loglevel.
- */
- console_idx = log_next(console_idx);
- console_seq++;
- goto skip;
- }
-
- /* Output to all consoles once old messages replayed. */
- if (unlikely(exclusive_console &&
- console_seq >= exclusive_console_stop_seq)) {
- exclusive_console = NULL;
- }
-
- len += msg_print_text(msg,
- console_msg_format & MSG_FORMAT_SYSLOG,
- printk_time, text + len, sizeof(text) - len);
- if (nr_ext_console_drivers) {
- ext_len = msg_print_ext_header(ext_text,
- sizeof(ext_text),
- msg, console_seq);
- ext_len += msg_print_ext_body(ext_text + ext_len,
- sizeof(ext_text) - ext_len,
- log_dict(msg), msg->dict_len,
- log_text(msg), msg->text_len);
- }
- console_idx = log_next(console_idx);
- console_seq++;
- raw_spin_unlock(&logbuf_lock);
-
- /*
- * While actively printing out messages, if another printk()
- * were to occur on another CPU, it may wait for this one to
- * finish. This task can not be preempted if there is a
- * waiter waiting to take over.
- */
- console_lock_spinning_enable();
-
- stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(ext_text, ext_len, text, len);
- start_critical_timings();
-
- if (console_lock_spinning_disable_and_check()) {
- printk_safe_exit_irqrestore(flags);
- return;
- }
-
- printk_safe_exit_irqrestore(flags);
-
- if (do_cond_resched)
- cond_resched();
- }
-
console_locked = 0;
-
- raw_spin_unlock(&logbuf_lock);
-
up_console_sem();
-
- /*
- * Someone could have filled up the buffer again, so re-check if there's
- * something to flush. In case we cannot trylock the console_sem again,
- * there's a new owner and the console_unlock() from them will do the
- * flush, no worries.
- */
- raw_spin_lock(&logbuf_lock);
- retry = console_seq != log_next_seq;
- raw_spin_unlock(&logbuf_lock);
- printk_safe_exit_irqrestore(flags);
-
- if (retry && console_trylock())
- goto again;
}
EXPORT_SYMBOL(console_unlock);
@@ -2543,24 +2347,10 @@ void console_unblank(void)
void console_flush_on_panic(enum con_flush_mode mode)
{
/*
- * If someone else is holding the console lock, trylock will fail
- * and may_schedule may be set. Ignore and proceed to unlock so
- * that messages are flushed out. As this can be called from any
- * context and we don't want to get preempted while flushing,
- * ensure may_schedule is cleared.
+ * FIXME: This is currently a NOP. Emergency messages will have been
+ * printed, but what about if write_atomic is not available on the
+ * console? What if the printk kthread is still alive?
*/
- console_trylock();
- console_may_schedule = 0;
-
- if (mode == CONSOLE_REPLAY_ALL) {
- unsigned long flags;
-
- logbuf_lock_irqsave(flags);
- console_seq = log_first_seq;
- console_idx = log_first_idx;
- logbuf_unlock_irqrestore(flags);
- }
- console_unlock();
}
/*
@@ -2638,7 +2428,6 @@ early_param("keep_bootcon", keep_bootcon_setup);
void register_console(struct console *newcon)
{
int i;
- unsigned long flags;
struct console *bcon = NULL;
struct console_cmdline *c;
static bool has_preferred;
@@ -2754,27 +2543,6 @@ void register_console(struct console *newcon)
if (newcon->flags & CON_EXTENDED)
nr_ext_console_drivers++;
- if (newcon->flags & CON_PRINTBUFFER) {
- /*
- * console_unlock(); will print out the buffered messages
- * for us.
- */
- logbuf_lock_irqsave(flags);
- console_seq = syslog_seq;
- console_idx = syslog_idx;
- /*
- * We're about to replay the log buffer. Only do this to the
- * just-registered console to avoid excessive message spam to
- * the already-registered consoles.
- *
- * Set exclusive_console with disabled interrupts to reduce
- * race window with eventual console_flush_on_panic() that
- * ignores console_lock.
- */
- exclusive_console = newcon;
- exclusive_console_stop_seq = console_seq;
- logbuf_unlock_irqrestore(flags);
- }
console_unlock();
console_sysfs_notify();
@@ -2784,6 +2552,10 @@ void register_console(struct console *newcon)
* boot consoles, real consoles, etc - this is to ensure that end
* users know there might be something in the kernel's log buffer that
* went to the bootconsole (that they do not see on the real console)
+ *
+ * This message is also important because it will trigger the
+ * printk kthread to begin dumping the log buffer to the newly
+ * registered console.
*/
pr_info("%sconsole [%s%d] enabled\n",
(newcon->flags & CON_BOOT) ? "boot" : "" ,
@@ -2927,59 +2699,74 @@ static int __init printk_late_init(void)
late_initcall(printk_late_init);
#if defined CONFIG_PRINTK
-/*
- * Delayed printk version, for scheduler-internal messages:
- */
-#define PRINTK_PENDING_WAKEUP 0x01
-#define PRINTK_PENDING_OUTPUT 0x02
+static int printk_kthread_func(void *data)
+{
+ struct prb_iterator iter;
+ struct printk_log *msg;
+ size_t ext_len;
+ char *ext_text;
+ u64 master_seq;
+ size_t len;
+ char *text;
+ char *buf;
+ int ret;
-static DEFINE_PER_CPU(int, printk_pending);
+ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL);
+ buf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
+ if (!ext_text || !text || !buf)
+ return -1;
-static void wake_up_klogd_work_func(struct irq_work *irq_work)
-{
- int pending = __this_cpu_xchg(printk_pending, 0);
+ prb_iter_init(&iter, &printk_rb, NULL);
- if (pending & PRINTK_PENDING_OUTPUT) {
- /* If trylock fails, someone else is doing the printing */
- if (console_trylock())
- console_unlock();
+ /* the printk kthread never exits */
+ for (;;) {
+ ret = prb_iter_wait_next(&iter, buf,
+ PRINTK_RECORD_MAX, &master_seq);
+ if (ret == -ERESTARTSYS) {
+ continue;
+ } else if (ret < 0) {
+ /* iterator invalid, start over */
+ prb_iter_init(&iter, &printk_rb, NULL);
+ continue;
+ }
+
+ msg = (struct printk_log *)buf;
+ format_text(msg, master_seq, ext_text, &ext_len, text,
+ &len, printk_time);
+
+ console_lock();
+ call_console_drivers(master_seq, ext_text, ext_len, text, len,
+ msg->level, msg->facility);
+ if (len > 0 || ext_len > 0)
+ printk_delay(msg->level);
+ console_unlock();
}
- if (pending & PRINTK_PENDING_WAKEUP)
- wake_up_interruptible(&log_wait);
-}
+ kfree(ext_text);
+ kfree(text);
+ kfree(buf);
-static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
- .func = wake_up_klogd_work_func,
- .flags = IRQ_WORK_LAZY,
-};
+ return 0;
+}
-void wake_up_klogd(void)
+static int __init init_printk_kthread(void)
{
- preempt_disable();
- if (waitqueue_active(&log_wait)) {
- this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
+ struct task_struct *thread;
+
+ thread = kthread_run(printk_kthread_func, NULL, "printk");
+ if (IS_ERR(thread)) {
+ pr_err("printk: unable to create printing thread\n");
+ return PTR_ERR(thread);
}
- preempt_enable();
-}
-void defer_console_output(void)
-{
- preempt_disable();
- __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
- preempt_enable();
+ return 0;
}
+late_initcall(init_printk_kthread);
-int vprintk_deferred(const char *fmt, va_list args)
+static int vprintk_deferred(const char *fmt, va_list args)
{
- int r;
-
- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
- defer_console_output();
-
- return r;
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
}
int printk_deferred(const char *fmt, ...)
@@ -3101,8 +2888,8 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
*/
void kmsg_dump(enum kmsg_dump_reason reason)
{
+ struct kmsg_dumper dumper_local;
struct kmsg_dumper *dumper;
- unsigned long flags;
if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
return;
@@ -3112,21 +2899,18 @@ void kmsg_dump(enum kmsg_dump_reason reason)
if (dumper->max_reason && reason > dumper->max_reason)
continue;
- /* initialize iterator with data about the stored records */
- dumper->active = true;
+ /*
+ * use a local copy to avoid modifying the
+ * iterator used by any other cpus/contexts
+ */
+ memcpy(&dumper_local, dumper, sizeof(dumper_local));
- logbuf_lock_irqsave(flags);
- dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
- logbuf_unlock_irqrestore(flags);
+ /* initialize iterator with data about the stored records */
+ dumper_local.active = true;
+ kmsg_dump_rewind(&dumper_local);
/* invoke dumper which will iterate over records */
- dumper->dump(dumper, reason);
-
- /* reset iterator */
- dumper->active = false;
+ dumper_local.dump(&dumper_local, reason);
}
rcu_read_unlock();
}
@@ -3153,33 +2937,67 @@ void kmsg_dump(enum kmsg_dump_reason reason)
bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
char *line, size_t size, size_t *len)
{
+ struct prb_iterator iter;
struct printk_log *msg;
- size_t l = 0;
- bool ret = false;
+ struct prb_handle h;
+ bool cont = false;
+ char *msgbuf;
+ char *rbuf;
+ size_t l;
+ u64 seq;
+ int ret;
if (!dumper->active)
- goto out;
+ return cont;
- if (dumper->cur_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
+ rbuf = prb_reserve(&h, &sprint_rb, PRINTK_RECORD_MAX);
+ if (!rbuf)
+ return cont;
+ msgbuf = rbuf;
+retry:
+ for (;;) {
+ prb_iter_init(&iter, &printk_rb, &seq);
+
+ if (dumper->line_seq == seq) {
+ /* already where we want to be */
+ break;
+ } else if (dumper->line_seq < seq) {
+ /* messages are gone, move to first available one */
+ dumper->line_seq = seq;
+ break;
+ }
+
+ ret = prb_iter_seek(&iter, dumper->line_seq);
+ if (ret > 0) {
+ /* seeked to line_seq */
+ break;
+ } else if (ret == 0) {
+ /*
+ * The end of the list was hit without ever seeing
+ * line_seq. Reset it to the beginning of the list.
+ */
+ prb_iter_init(&iter, &printk_rb, &dumper->line_seq);
+ break;
+ }
+ /* iterator invalid, start over */
}
- /* last entry */
- if (dumper->cur_seq >= log_next_seq)
+ ret = prb_iter_next(&iter, msgbuf, PRINTK_RECORD_MAX,
+ &dumper->line_seq);
+ if (ret == 0)
goto out;
+ else if (ret < 0)
+ goto retry;
- msg = log_from_idx(dumper->cur_idx);
+ msg = (struct printk_log *)msgbuf;
l = msg_print_text(msg, syslog, printk_time, line, size);
- dumper->cur_idx = log_next(dumper->cur_idx);
- dumper->cur_seq++;
- ret = true;
-out:
if (len)
*len = l;
- return ret;
+ cont = true;
+out:
+ prb_commit(&h);
+ return cont;
}
/**
@@ -3202,12 +3020,9 @@ out:
bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
char *line, size_t size, size_t *len)
{
- unsigned long flags;
bool ret;
- logbuf_lock_irqsave(flags);
ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
- logbuf_unlock_irqrestore(flags);
return ret;
}
@@ -3235,74 +3050,101 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
char *buf, size_t size, size_t *len)
{
- unsigned long flags;
- u64 seq;
- u32 idx;
- u64 next_seq;
- u32 next_idx;
- size_t l = 0;
- bool ret = false;
+ struct prb_iterator iter;
bool time = printk_time;
+ struct printk_log *msg;
+ u64 new_end_seq = 0;
+ struct prb_handle h;
+ bool cont = false;
+ char *msgbuf;
+ u64 end_seq;
+ int textlen;
+ u64 seq = 0;
+ char *rbuf;
+ int l = 0;
+ int ret;
if (!dumper->active)
- goto out;
+ return cont;
- logbuf_lock_irqsave(flags);
- if (dumper->cur_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
- }
+ rbuf = prb_reserve(&h, &sprint_rb, PRINTK_RECORD_MAX);
+ if (!rbuf)
+ return cont;
+ msgbuf = rbuf;
- /* last entry */
- if (dumper->cur_seq >= dumper->next_seq) {
- logbuf_unlock_irqrestore(flags);
- goto out;
- }
+ prb_iter_init(&iter, &printk_rb, NULL);
- /* calculate length of entire buffer */
- seq = dumper->cur_seq;
- idx = dumper->cur_idx;
- while (seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ /*
+ * seek to the start record, which is set/modified
+ * by kmsg_dump_get_line_nolock()
+ */
+ ret = prb_iter_seek(&iter, dumper->line_seq);
+ if (ret <= 0)
+ prb_iter_init(&iter, &printk_rb, &seq);
+
+ /* work with a local end seq to have a constant value */
+ end_seq = dumper->buffer_end_seq;
+ if (!end_seq) {
+ /* initialize end seq to "infinity" */
+ end_seq = -1;
+ dumper->buffer_end_seq = end_seq;
+ }
+retry:
+ if (seq >= end_seq)
+ goto out;
- l += msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
+ /* count the total bytes after seq */
+ textlen = count_remaining(&iter, end_seq, msgbuf,
+ PRINTK_RECORD_MAX, 0, time);
- /* move first record forward until length fits into the buffer */
- seq = dumper->cur_seq;
- idx = dumper->cur_idx;
- while (l >= size && seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ /* move iter forward until length fits into the buffer */
+ while (textlen > size) {
+ ret = prb_iter_next(&iter, msgbuf, PRINTK_RECORD_MAX, &seq);
+ if (ret == 0) {
+ break;
+ } else if (ret < 0 || seq >= end_seq) {
+ prb_iter_init(&iter, &printk_rb, &seq);
+ goto retry;
+ }
- l -= msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
+ msg = (struct printk_log *)msgbuf;
+ textlen -= msg_print_text(msg, true, time, NULL, 0);
}
- /* last message in next interation */
- next_seq = seq;
- next_idx = idx;
+ /* save end seq for the next interation */
+ new_end_seq = seq + 1;
+
+ /* copy messages to buffer */
+ while (l < size) {
+ ret = prb_iter_next(&iter, msgbuf, PRINTK_RECORD_MAX, &seq);
+ if (ret == 0) {
+ break;
+ } else if (ret < 0) {
+ /*
+ * iterator (and thus also the start position)
+ * invalid, start over from beginning of list
+ */
+ prb_iter_init(&iter, &printk_rb, NULL);
+ continue;
+ }
- l = 0;
- while (seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ if (seq >= end_seq)
+ break;
- l += msg_print_text(msg, syslog, time, buf + l, size - l);
- idx = log_next(idx);
- seq++;
+ msg = (struct printk_log *)msgbuf;
+ textlen = msg_print_text(msg, syslog, time, buf + l, size - l);
+ if (textlen > 0)
+ l += textlen;
+ cont = true;
}
- dumper->next_seq = next_seq;
- dumper->next_idx = next_idx;
- ret = true;
- logbuf_unlock_irqrestore(flags);
-out:
- if (len)
+ if (cont && len)
*len = l;
- return ret;
+out:
+ prb_commit(&h);
+ if (new_end_seq)
+ dumper->buffer_end_seq = new_end_seq;
+ return cont;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
@@ -3318,10 +3160,8 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
*/
void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
{
- dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
+ dumper->line_seq = 0;
+ dumper->buffer_end_seq = 0;
}
/**
@@ -3334,12 +3174,89 @@ void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
*/
void kmsg_dump_rewind(struct kmsg_dumper *dumper)
{
- unsigned long flags;
-
- logbuf_lock_irqsave(flags);
kmsg_dump_rewind_nolock(dumper);
- logbuf_unlock_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
+static bool console_can_emergency(int level)
+{
+ struct console *con;
+
+ for_each_console(con) {
+ if (!(con->flags & CON_ENABLED))
+ continue;
+ if (con->write_atomic && oops_in_progress)
+ return true;
+ if (con->write && (con->flags & CON_BOOT))
+ return true;
+ }
+ return false;
+}
+
+static void call_emergency_console_drivers(int level, const char *text,
+ size_t text_len)
+{
+ struct console *con;
+
+ for_each_console(con) {
+ if (!(con->flags & CON_ENABLED))
+ continue;
+ if (con->write_atomic && oops_in_progress) {
+ con->write_atomic(con, text, text_len);
+ continue;
+ }
+ if (con->write && (con->flags & CON_BOOT)) {
+ con->write(con, text, text_len);
+ continue;
+ }
+ }
+}
+
+static void printk_emergency(char *buffer, int level, u64 ts_nsec, u16 cpu,
+ char *text, u16 text_len)
+{
+ struct printk_log msg;
+ size_t prefix_len;
+
+ if (!console_can_emergency(level))
+ return;
+
+ msg.level = level;
+ msg.ts_nsec = ts_nsec;
+ msg.cpu = cpu;
+ msg.facility = 0;
+
+ /* "text" must have PREFIX_MAX preceding bytes available */
+
+ prefix_len = print_prefix(&msg,
+ console_msg_format & MSG_FORMAT_SYSLOG,
+ printk_time, buffer);
+ /* move the prefix forward to the beginning of the message text */
+ text -= prefix_len;
+ memmove(text, buffer, prefix_len);
+ text_len += prefix_len;
+
+ text[text_len++] = '\n';
+
+ call_emergency_console_drivers(level, text, text_len);
+
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+ touch_nmi_watchdog();
+
+ printk_delay(level);
+}
#endif
+
+void console_atomic_lock(unsigned int *flags)
+{
+ prb_lock(&printk_cpulock, flags);
+}
+EXPORT_SYMBOL(console_atomic_lock);
+
+void console_atomic_unlock(unsigned int flags)
+{
+ prb_unlock(&printk_cpulock, flags);
+}
+EXPORT_SYMBOL(console_atomic_unlock);
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
deleted file mode 100644
index b4045e782743..000000000000
--- a/kernel/printk/printk_safe.c
+++ /dev/null
@@ -1,415 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * printk_safe.c - Safe printk for printk-deadlock-prone contexts
- */
-
-#include <linux/preempt.h>
-#include <linux/spinlock.h>
-#include <linux/debug_locks.h>
-#include <linux/smp.h>
-#include <linux/cpumask.h>
-#include <linux/irq_work.h>
-#include <linux/printk.h>
-
-#include "internal.h"
-
-/*
- * printk() could not take logbuf_lock in NMI context. Instead,
- * it uses an alternative implementation that temporary stores
- * the strings into a per-CPU buffer. The content of the buffer
- * is later flushed into the main ring buffer via IRQ work.
- *
- * The alternative implementation is chosen transparently
- * by examinig current printk() context mask stored in @printk_context
- * per-CPU variable.
- *
- * The implementation allows to flush the strings also from another CPU.
- * There are situations when we want to make sure that all buffers
- * were handled or when IRQs are blocked.
- */
-static int printk_safe_irq_ready __read_mostly;
-
-#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
- sizeof(atomic_t) - \
- sizeof(atomic_t) - \
- sizeof(struct irq_work))
-
-struct printk_safe_seq_buf {
- atomic_t len; /* length of written data */
- atomic_t message_lost;
- struct irq_work work; /* IRQ work that flushes the buffer */
- unsigned char buffer[SAFE_LOG_BUF_LEN];
-};
-
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq);
-static DEFINE_PER_CPU(int, printk_context);
-
-#ifdef CONFIG_PRINTK_NMI
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
-#endif
-
-/* Get flushed in a more safe context. */
-static void queue_flush_work(struct printk_safe_seq_buf *s)
-{
- if (printk_safe_irq_ready)
- irq_work_queue(&s->work);
-}
-
-/*
- * Add a message to per-CPU context-dependent buffer. NMI and printk-safe
- * have dedicated buffers, because otherwise printk-safe preempted by
- * NMI-printk would have overwritten the NMI messages.
- *
- * The messages are flushed from irq work (or from panic()), possibly,
- * from other CPU, concurrently with printk_safe_log_store(). Should this
- * happen, printk_safe_log_store() will notice the buffer->len mismatch
- * and repeat the write.
- */
-static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
- const char *fmt, va_list args)
-{
- int add;
- size_t len;
- va_list ap;
-
-again:
- len = atomic_read(&s->len);
-
- /* The trailing '\0' is not counted into len. */
- if (len >= sizeof(s->buffer) - 1) {
- atomic_inc(&s->message_lost);
- queue_flush_work(s);
- return 0;
- }
-
- /*
- * Make sure that all old data have been read before the buffer
- * was reset. This is not needed when we just append data.
- */
- if (!len)
- smp_rmb();
-
- va_copy(ap, args);
- add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap);
- va_end(ap);
- if (!add)
- return 0;
-
- /*
- * Do it once again if the buffer has been flushed in the meantime.
- * Note that atomic_cmpxchg() is an implicit memory barrier that
- * makes sure that the data were written before updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, len + add) != len)
- goto again;
-
- queue_flush_work(s);
- return add;
-}
-
-static inline void printk_safe_flush_line(const char *text, int len)
-{
- /*
- * Avoid any console drivers calls from here, because we may be
- * in NMI or printk_safe context (when in panic). The messages
- * must go only into the ring buffer at this stage. Consoles will
- * get explicitly called later when a crashdump is not generated.
- */
- printk_deferred("%.*s", len, text);
-}
-
-/* printk part of the temporary buffer line by line */
-static int printk_safe_flush_buffer(const char *start, size_t len)
-{
- const char *c, *end;
- bool header;
-
- c = start;
- end = start + len;
- header = true;
-
- /* Print line by line. */
- while (c < end) {
- if (*c == '\n') {
- printk_safe_flush_line(start, c - start + 1);
- start = ++c;
- header = true;
- continue;
- }
-
- /* Handle continuous lines or missing new line. */
- if ((c + 1 < end) && printk_get_level(c)) {
- if (header) {
- c = printk_skip_level(c);
- continue;
- }
-
- printk_safe_flush_line(start, c - start);
- start = c++;
- header = true;
- continue;
- }
-
- header = false;
- c++;
- }
-
- /* Check if there was a partial line. Ignore pure header. */
- if (start < end && !header) {
- static const char newline[] = KERN_CONT "\n";
-
- printk_safe_flush_line(start, end - start);
- printk_safe_flush_line(newline, strlen(newline));
- }
-
- return len;
-}
-
-static void report_message_lost(struct printk_safe_seq_buf *s)
-{
- int lost = atomic_xchg(&s->message_lost, 0);
-
- if (lost)
- printk_deferred("Lost %d message(s)!\n", lost);
-}
-
-/*
- * Flush data from the associated per-CPU buffer. The function
- * can be called either via IRQ work or independently.
- */
-static void __printk_safe_flush(struct irq_work *work)
-{
- static raw_spinlock_t read_lock =
- __RAW_SPIN_LOCK_INITIALIZER(read_lock);
- struct printk_safe_seq_buf *s =
- container_of(work, struct printk_safe_seq_buf, work);
- unsigned long flags;
- size_t len;
- int i;
-
- /*
- * The lock has two functions. First, one reader has to flush all
- * available message to make the lockless synchronization with
- * writers easier. Second, we do not want to mix messages from
- * different CPUs. This is especially important when printing
- * a backtrace.
- */
- raw_spin_lock_irqsave(&read_lock, flags);
-
- i = 0;
-more:
- len = atomic_read(&s->len);
-
- /*
- * This is just a paranoid check that nobody has manipulated
- * the buffer an unexpected way. If we printed something then
- * @len must only increase. Also it should never overflow the
- * buffer size.
- */
- if ((i && i >= len) || len > sizeof(s->buffer)) {
- const char *msg = "printk_safe_flush: internal error\n";
-
- printk_safe_flush_line(msg, strlen(msg));
- len = 0;
- }
-
- if (!len)
- goto out; /* Someone else has already flushed the buffer. */
-
- /* Make sure that data has been written up to the @len */
- smp_rmb();
- i += printk_safe_flush_buffer(s->buffer + i, len - i);
-
- /*
- * Check that nothing has got added in the meantime and truncate
- * the buffer. Note that atomic_cmpxchg() is an implicit memory
- * barrier that makes sure that the data were copied before
- * updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, 0) != len)
- goto more;
-
-out:
- report_message_lost(s);
- raw_spin_unlock_irqrestore(&read_lock, flags);
-}
-
-/**
- * printk_safe_flush - flush all per-cpu nmi buffers.
- *
- * The buffers are flushed automatically via IRQ work. This function
- * is useful only when someone wants to be sure that all buffers have
- * been flushed at some point.
- */
-void printk_safe_flush(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
-#ifdef CONFIG_PRINTK_NMI
- __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work);
-#endif
- __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work);
- }
-}
-
-/**
- * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system
- * goes down.
- *
- * Similar to printk_safe_flush() but it can be called even in NMI context when
- * the system goes down. It does the best effort to get NMI messages into
- * the main ring buffer.
- *
- * Note that it could try harder when there is only one CPU online.
- */
-void printk_safe_flush_on_panic(void)
-{
- /*
- * Make sure that we could access the main ring buffer.
- * Do not risk a double release when more CPUs are up.
- */
- if (raw_spin_is_locked(&logbuf_lock)) {
- if (num_online_cpus() > 1)
- return;
-
- debug_locks_off();
- raw_spin_lock_init(&logbuf_lock);
- }
-
- printk_safe_flush();
-}
-
-#ifdef CONFIG_PRINTK_NMI
-/*
- * Safe printk() for NMI context. It uses a per-CPU buffer to
- * store the message. NMIs are not nested, so there is always only
- * one writer running. But the buffer might get flushed from another
- * CPU, so we need to be careful.
- */
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
-void notrace printk_nmi_enter(void)
-{
- this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
-}
-
-void notrace printk_nmi_exit(void)
-{
- this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
-}
-
-/*
- * Marks a code that might produce many messages in NMI context
- * and the risk of losing them is more critical than eventual
- * reordering.
- *
- * It has effect only when called in NMI context. Then printk()
- * will try to store the messages into the main logbuf directly
- * and use the per-CPU buffers only as a fallback when the lock
- * is not available.
- */
-void printk_nmi_direct_enter(void)
-{
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-void printk_nmi_direct_exit(void)
-{
- this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-#else
-
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- return 0;
-}
-
-#endif /* CONFIG_PRINTK_NMI */
-
-/*
- * Lock-less printk(), to avoid deadlocks should the printk() recurse
- * into itself. It uses a per-CPU buffer to store the message, just like
- * NMI.
- */
-static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
-/* Can be preempted by NMI. */
-void __printk_safe_enter(void)
-{
- this_cpu_inc(printk_context);
-}
-
-/* Can be preempted by NMI. */
-void __printk_safe_exit(void)
-{
- this_cpu_dec(printk_context);
-}
-
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
-{
- /*
- * Try to use the main logbuf even in NMI. But avoid calling console
- * drivers that might have their own locks.
- */
- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) &&
- raw_spin_trylock(&logbuf_lock)) {
- int len;
-
- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
- raw_spin_unlock(&logbuf_lock);
- defer_console_output();
- return len;
- }
-
- /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- return vprintk_nmi(fmt, args);
-
- /* Use extra buffer to prevent a recursion deadlock in safe mode. */
- if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
- return vprintk_safe(fmt, args);
-
- /* No obstacles. */
- return vprintk_default(fmt, args);
-}
-
-void __init printk_safe_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct printk_safe_seq_buf *s;
-
- s = &per_cpu(safe_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-
-#ifdef CONFIG_PRINTK_NMI
- s = &per_cpu(nmi_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-#endif
- }
-
- /*
- * In the highly unlikely event that a NMI were to trigger at
- * this moment. Make sure IRQ work is set up before this
- * variable is set.
- */
- barrier();
- printk_safe_irq_ready = 1;
-
- /* Flush pending messages that did not have scheduled IRQ works. */
- printk_safe_flush();
-}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 705887f63288..0220e89796e5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -175,7 +175,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
spin_lock_irq(&task->sighand->siglock);
if (task_is_traced(task) && !__fatal_signal_pending(task)) {
- task->state = __TASK_TRACED;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+ if (task->state & __TASK_TRACED)
+ task->state = __TASK_TRACED;
+ else
+ task->saved_state = __TASK_TRACED;
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
ret = true;
}
spin_unlock_irq(&task->sighand->siglock);
@@ -259,12 +266,17 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
return ret;
}
-static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
+static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns,
+ unsigned int mode)
{
+ int ret;
+
if (mode & PTRACE_MODE_NOAUDIT)
- return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
+ ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT);
else
- return has_ns_capability(current, ns, CAP_SYS_PTRACE);
+ ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE);
+
+ return ret == 0;
}
/* Returns 0 on success, -errno on denial. */
@@ -316,7 +328,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
gid_eq(caller_gid, tcred->sgid) &&
gid_eq(caller_gid, tcred->gid))
goto ok;
- if (ptrace_has_cap(tcred->user_ns, mode))
+ if (ptrace_has_cap(cred, tcred->user_ns, mode))
goto ok;
rcu_read_unlock();
return -EPERM;
@@ -335,7 +347,7 @@ ok:
mm = task->mm;
if (mm &&
((get_dumpable(mm) != SUID_DUMP_USER) &&
- !ptrace_has_cap(mm->user_ns, mode)))
+ !ptrace_has_cap(cred, mm->user_ns, mode)))
return -EPERM;
return security_ptrace_access_check(task, mode);
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 480edf328b51..5f7f1be29244 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -161,8 +161,8 @@ config RCU_FAST_NO_HZ
config RCU_BOOST
bool "Enable RCU priority boosting"
- depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
- default n
+ depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT_FULL
+ default y if PREEMPT_RT_FULL
help
This option boosts the priority of preempted RCU readers that
block the current preemptible RCU grace period for too long.
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 5ec3ea4028e2..4aa02eee8f6c 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -8,6 +8,17 @@ menu "RCU Debugging"
config PROVE_RCU
def_bool PROVE_LOCKING
+config PROVE_RCU_LIST
+ bool "RCU list lockdep debugging"
+ depends on PROVE_RCU && RCU_EXPERT
+ default n
+ help
+ Enable RCU lockdep checking for list usages. By default it is
+ turned off since there are several list RCU users that still
+ need to be converted to pass a lockdep expression. To prevent
+ false-positive splats, we keep it default disabled but once all
+ users are converted, we can remove this config option.
+
config TORTURE_TEST
tristate
default n
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index efaa5b3f4d3f..ecb82cc432af 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -60,10 +60,13 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@
#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */
#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */
#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */
-#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */
+#define RCUTORTURE_RDR_ATOM_BH 0x40 /* ... disabling bh while atomic */
+#define RCUTORTURE_RDR_ATOM_RBH 0x80 /* ... RBH while atomic */
+#define RCUTORTURE_RDR_NBITS 8 /* Number of bits defined above. */
#define RCUTORTURE_MAX_EXTEND \
(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
- RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
+ RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED | \
+ RCUTORTURE_RDR_ATOM_BH | RCUTORTURE_RDR_ATOM_RBH)
#define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */
/* Must be power of two minus one. */
#define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3)
@@ -1092,31 +1095,52 @@ static void rcutorture_one_extend(int *readstate, int newstate,
WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1);
rtrsp->rt_readstate = newstate;
- /* First, put new protection in place to avoid critical-section gap. */
+ /*
+ * First, put new protection in place to avoid critical-section gap.
+ * Disable preemption around the ATOM disables to ensure that
+ * in_atomic() is true.
+ */
if (statesnew & RCUTORTURE_RDR_BH)
local_bh_disable();
+ if (statesnew & RCUTORTURE_RDR_RBH)
+ rcu_read_lock_bh();
if (statesnew & RCUTORTURE_RDR_IRQ)
local_irq_disable();
if (statesnew & RCUTORTURE_RDR_PREEMPT)
preempt_disable();
- if (statesnew & RCUTORTURE_RDR_RBH)
- rcu_read_lock_bh();
if (statesnew & RCUTORTURE_RDR_SCHED)
rcu_read_lock_sched();
+ preempt_disable();
+ if (statesnew & RCUTORTURE_RDR_ATOM_BH)
+ local_bh_disable();
+ if (statesnew & RCUTORTURE_RDR_ATOM_RBH)
+ rcu_read_lock_bh();
+ preempt_enable();
if (statesnew & RCUTORTURE_RDR_RCU)
idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT;
- /* Next, remove old protection, irq first due to bh conflict. */
+ /*
+ * Next, remove old protection, in decreasing order of strength
+ * to avoid unlock paths that aren't safe in the stronger
+ * context. Disable preemption around the ATOM enables in
+ * case the context was only atomic due to IRQ disabling.
+ */
+ preempt_disable();
if (statesold & RCUTORTURE_RDR_IRQ)
local_irq_enable();
- if (statesold & RCUTORTURE_RDR_BH)
+ if (statesold & RCUTORTURE_RDR_ATOM_BH)
local_bh_enable();
+ if (statesold & RCUTORTURE_RDR_ATOM_RBH)
+ rcu_read_unlock_bh();
+ preempt_enable();
if (statesold & RCUTORTURE_RDR_PREEMPT)
preempt_enable();
- if (statesold & RCUTORTURE_RDR_RBH)
- rcu_read_unlock_bh();
if (statesold & RCUTORTURE_RDR_SCHED)
rcu_read_unlock_sched();
+ if (statesold & RCUTORTURE_RDR_BH)
+ local_bh_enable();
+ if (statesold & RCUTORTURE_RDR_RBH)
+ rcu_read_unlock_bh();
if (statesold & RCUTORTURE_RDR_RCU)
cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT);
@@ -1152,6 +1176,12 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
int mask = rcutorture_extend_mask_max();
unsigned long randmask1 = torture_random(trsp) >> 8;
unsigned long randmask2 = randmask1 >> 3;
+ unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
+ unsigned long nonatomic_bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
+ unsigned long atomic_bhs = RCUTORTURE_RDR_ATOM_BH |
+ RCUTORTURE_RDR_ATOM_RBH;
+ unsigned long tmp;
WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
/* Mostly only one bit (need preemption!), sometimes lots of bits. */
@@ -1159,11 +1189,49 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
mask = mask & randmask2;
else
mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS));
- /* Can't enable bh w/irq disabled. */
- if ((mask & RCUTORTURE_RDR_IRQ) &&
- ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||
- (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))
- mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
+
+ /*
+ * Can't enable bh w/irq disabled.
+ */
+ tmp = atomic_bhs | nonatomic_bhs;
+ if (mask & RCUTORTURE_RDR_IRQ)
+ mask |= oldmask & tmp;
+
+ /*
+ * Ideally these sequences would be detected in debug builds
+ * (regardless of RT), but until then don't stop testing
+ * them on non-RT.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
+ /*
+ * Can't release the outermost rcu lock in an irq disabled
+ * section without preemption also being disabled, if irqs
+ * had ever been enabled during this RCU critical section
+ * (could leak a special flag and delay reporting the qs).
+ */
+ if ((oldmask & RCUTORTURE_RDR_RCU) &&
+ (mask & RCUTORTURE_RDR_IRQ) &&
+ !(mask & preempts))
+ mask |= RCUTORTURE_RDR_RCU;
+
+ /* Can't modify atomic bh in non-atomic context */
+ if ((oldmask & atomic_bhs) && (mask & atomic_bhs) &&
+ !(mask & preempts_irq)) {
+ mask |= oldmask & preempts_irq;
+ if (mask & RCUTORTURE_RDR_IRQ)
+ mask |= oldmask & tmp;
+ }
+ if ((mask & atomic_bhs) && !(mask & preempts_irq))
+ mask |= RCUTORTURE_RDR_PREEMPT;
+
+ /* Can't modify non-atomic bh in atomic context */
+ tmp = nonatomic_bhs;
+ if (oldmask & preempts_irq)
+ mask &= ~tmp;
+ if ((oldmask | mask) & preempts_irq)
+ mask |= oldmask & tmp;
+ }
+
return mask ?: RCUTORTURE_RDR_RCU;
}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 9b761e546de8..ef73d967edf4 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -25,6 +25,7 @@
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/srcu.h>
+#include <linux/locallock.h>
#include "rcu.h"
#include "rcu_segcblist.h"
@@ -530,7 +531,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
idx = rcu_seq_state(ssp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
cbdelay = srcu_get_delay(ssp);
- ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
rcu_seq_end(&ssp->srcu_gp_seq);
gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
@@ -735,6 +736,7 @@ static void srcu_flip(struct srcu_struct *ssp)
smp_mb(); /* D */ /* Pairs with C. */
}
+static DEFINE_LOCAL_IRQ_LOCK(sp_llock);
/*
* If SRCU is likely idle, return true, otherwise return false.
*
@@ -762,15 +764,16 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
unsigned long flags;
struct srcu_data *sdp;
unsigned long t;
+ unsigned long tlast;
/* If the local srcu_data structure has callbacks, not idle. */
- local_irq_save(flags);
+ local_lock_irqsave(sp_llock, flags);
sdp = this_cpu_ptr(ssp->sda);
if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(sp_llock, flags);
return false; /* Callbacks already present, so not idle. */
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(sp_llock, flags);
/*
* No local callbacks, so probabalistically probe global state.
@@ -780,9 +783,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
/* First, see if enough time has passed since the last GP. */
t = ktime_get_mono_fast_ns();
+ tlast = READ_ONCE(ssp->srcu_last_gp_end);
if (exp_holdoff == 0 ||
- time_in_range_open(t, ssp->srcu_last_gp_end,
- ssp->srcu_last_gp_end + exp_holdoff))
+ time_in_range_open(t, tlast, tlast + exp_holdoff))
return false; /* Too soon after last GP. */
/* Next, check for probable idleness. */
@@ -850,7 +853,7 @@ void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
}
rhp->func = func;
idx = srcu_read_lock(ssp);
- local_irq_save(flags);
+ local_lock_irqsave(sp_llock, flags);
sdp = this_cpu_ptr(ssp->sda);
spin_lock_rcu_node(sdp);
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
@@ -866,7 +869,8 @@ void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
sdp->srcu_gp_seq_needed_exp = s;
needexp = true;
}
- spin_unlock_irqrestore_rcu_node(sdp, flags);
+ spin_unlock_rcu_node(sdp);
+ local_unlock_irqrestore(sp_llock, flags);
if (needgp)
srcu_funnel_gp_start(ssp, sdp, s, do_norm);
else if (needexp)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index affa7aae758f..2f117164fcc7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -51,6 +51,12 @@
#include <linux/tick.h>
#include <linux/sysrq.h>
#include <linux/kprobes.h>
+#include <linux/gfp.h>
+#include <linux/oom.h>
+#include <linux/smpboot.h>
+#include <linux/jiffies.h>
+#include <linux/sched/isolation.h>
+#include "../time/tick-internal.h"
#include "tree.h"
#include "rcu.h"
@@ -92,6 +98,11 @@ struct rcu_state rcu_state = {
/* Dump rcu_node combining tree at boot to verify correct setup. */
static bool dump_tree;
module_param(dump_tree, bool, 0444);
+/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
+static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
+#ifndef CONFIG_PREEMPT_RT_FULL
+module_param(use_softirq, bool, 0444);
+#endif
/* Control rcu_node-tree auto-balancing at boot time. */
static bool rcu_fanout_exact;
module_param(rcu_fanout_exact, bool, 0444);
@@ -547,7 +558,7 @@ static void rcu_eqs_enter(bool user)
}
lockdep_assert_irqs_disabled();
- trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks);
+ trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
rdp = this_cpu_ptr(&rcu_data);
do_nocb_deferred_wakeup(rdp);
@@ -620,14 +631,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
* leave it in non-RCU-idle state.
*/
if (rdp->dynticks_nmi_nesting != 1) {
- trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks);
+ trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
+ atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
rdp->dynticks_nmi_nesting - 2);
return;
}
/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
- trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks);
+ trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
if (irq)
@@ -714,7 +726,7 @@ static void rcu_eqs_exit(bool user)
rcu_dynticks_task_exit();
rcu_dynticks_eqs_exit();
rcu_cleanup_after_idle();
- trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks);
+ trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
WRITE_ONCE(rdp->dynticks_nesting, 1);
WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
@@ -798,7 +810,7 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
}
trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
rdp->dynticks_nmi_nesting,
- rdp->dynticks_nmi_nesting + incby, rdp->dynticks);
+ rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
rdp->dynticks_nmi_nesting + incby);
barrier();
@@ -1065,6 +1077,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
(rnp->ffmask & rdp->grpmask)) {
init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
+ rdp->rcu_iw.flags = IRQ_WORK_HARD_IRQ;
rdp->rcu_iw_pending = true;
rdp->rcu_iw_gp_seq = rnp->gp_seq;
irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
@@ -2253,7 +2266,7 @@ void rcu_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/* Perform RCU core processing work for the current CPU. */
-static __latent_entropy void rcu_core(struct softirq_action *unused)
+static __latent_entropy void rcu_core(void)
{
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
@@ -2295,29 +2308,131 @@ static __latent_entropy void rcu_core(struct softirq_action *unused)
trace_rcu_utilization(TPS("End RCU core"));
}
+static void rcu_core_si(struct softirq_action *h)
+{
+ rcu_core();
+}
+
+static void rcu_wake_cond(struct task_struct *t, int status)
+{
+ /*
+ * If the thread is yielding, only wake it when this
+ * is invoked from idle
+ */
+ if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
+ wake_up_process(t);
+}
+
+static void invoke_rcu_core_kthread(void)
+{
+ struct task_struct *t;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
+ t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
+ if (t != NULL && t != current)
+ rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
+ local_irq_restore(flags);
+}
+
/*
- * Schedule RCU callback invocation. If the running implementation of RCU
- * does not support RCU priority boosting, just do a direct call, otherwise
- * wake up the per-CPU kernel kthread. Note that because we are running
- * on the current CPU with softirqs disabled, the rcu_cpu_kthread_task
- * cannot disappear out from under us.
+ * Do RCU callback invocation. Not that if we are running !use_softirq,
+ * we are already in the rcuc kthread. If callbacks are offloaded, then
+ * ->cblist is always empty, so we don't get here. Therefore, we only
+ * ever need to check for the scheduler being operational (some callbacks
+ * do wakeups, so we do need the scheduler).
*/
static void invoke_rcu_callbacks(struct rcu_data *rdp)
{
if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
return;
- if (likely(!rcu_state.boost)) {
- rcu_do_batch(rdp);
- return;
- }
- invoke_rcu_callbacks_kthread();
+ rcu_do_batch(rdp);
}
+/*
+ * Wake up this CPU's rcuc kthread to do RCU core processing.
+ */
static void invoke_rcu_core(void)
{
- if (cpu_online(smp_processor_id()))
+ if (!cpu_online(smp_processor_id()))
+ return;
+ if (use_softirq)
raise_softirq(RCU_SOFTIRQ);
+ else
+ invoke_rcu_core_kthread();
+}
+
+static void rcu_cpu_kthread_park(unsigned int cpu)
+{
+ per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+}
+
+static int rcu_cpu_kthread_should_run(unsigned int cpu)
+{
+ return __this_cpu_read(rcu_data.rcu_cpu_has_work);
+}
+
+/*
+ * Per-CPU kernel thread that invokes RCU callbacks. This replaces
+ * the RCU softirq used in configurations of RCU that do not support RCU
+ * priority boosting.
+ */
+static void rcu_cpu_kthread(unsigned int cpu)
+{
+ unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
+ char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
+ int spincnt;
+
+ for (spincnt = 0; spincnt < 10; spincnt++) {
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
+ local_bh_disable();
+ *statusp = RCU_KTHREAD_RUNNING;
+ local_irq_disable();
+ work = *workp;
+ *workp = 0;
+ local_irq_enable();
+ if (work)
+ rcu_core();
+ local_bh_enable();
+ if (*workp == 0) {
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
+ *statusp = RCU_KTHREAD_WAITING;
+ return;
+ }
+ }
+ *statusp = RCU_KTHREAD_YIELDING;
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
+ schedule_timeout_interruptible(2);
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
+ *statusp = RCU_KTHREAD_WAITING;
+}
+
+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+ .store = &rcu_data.rcu_cpu_kthread_task,
+ .thread_should_run = rcu_cpu_kthread_should_run,
+ .thread_fn = rcu_cpu_kthread,
+ .thread_comm = "rcuc/%u",
+ .setup = rcu_cpu_kthread_setup,
+ .park = rcu_cpu_kthread_park,
+};
+
+/*
+ * Spawn per-CPU RCU core processing kthreads.
+ */
+static int __init rcu_spawn_core_kthreads(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
+ if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
+ return 0;
+ WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
+ "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
+ return 0;
}
+early_initcall(rcu_spawn_core_kthreads);
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
@@ -3355,7 +3470,8 @@ void __init rcu_init(void)
rcu_init_one();
if (dump_tree)
rcu_dump_rcu_node_tree();
- open_softirq(RCU_SOFTIRQ, rcu_core);
+ if (use_softirq)
+ open_softirq(RCU_SOFTIRQ, rcu_core_si);
/*
* We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e253d11af3c4..a1a72a1ecb02 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -407,8 +407,8 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
-static void invoke_rcu_callbacks_kthread(void);
static bool rcu_is_callbacks_kthread(void);
+static void rcu_cpu_kthread_setup(unsigned int cpu);
static void __init rcu_spawn_boost_kthreads(void);
static void rcu_prepare_kthreads(int cpu);
static void rcu_cleanup_after_idle(void);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 9c990df880d1..91357562f168 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -134,7 +134,7 @@ static void __maybe_unused sync_exp_reset_tree(void)
rcu_for_each_node_breadth_first(rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WARN_ON_ONCE(rnp->expmask);
- rnp->expmask = rnp->expmaskinit;
+ WRITE_ONCE(rnp->expmask, rnp->expmaskinit);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -211,7 +211,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
rnp = rnp->parent;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
WARN_ON_ONCE(!(rnp->expmask & mask));
- rnp->expmask &= ~mask;
+ WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
}
}
@@ -241,7 +241,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
- rnp->expmask &= ~mask;
+ WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
__rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */
}
@@ -373,12 +373,10 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* IPI the remaining CPUs for expedited quiescent state. */
- for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) {
unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- if (!(mask_ofl_ipi & mask))
- continue;
retry_ipi:
if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) {
mask_ofl_test |= mask;
@@ -487,7 +485,7 @@ static void synchronize_sched_expedited_wait(void)
struct rcu_data *rdp;
mask = leaf_node_cpu_bit(rnp, cpu);
- if (!(rnp->expmask & mask))
+ if (!(READ_ONCE(rnp->expmask) & mask))
continue;
ndetected++;
rdp = per_cpu_ptr(&rcu_data, cpu);
@@ -499,7 +497,8 @@ static void synchronize_sched_expedited_wait(void)
}
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
jiffies - jiffies_start, rcu_state.expedited_sequence,
- rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
+ READ_ONCE(rnp_root->expmask),
+ ".T"[!!rnp_root->exp_tasks]);
if (ndetected) {
pr_err("blocking rcu_node structures:");
rcu_for_each_node_breadth_first(rnp) {
@@ -509,7 +508,7 @@ static void synchronize_sched_expedited_wait(void)
continue;
pr_cont(" l=%u:%d-%d:%#lx/%c",
rnp->level, rnp->grplo, rnp->grphi,
- rnp->expmask,
+ READ_ONCE(rnp->expmask),
".T"[!!rnp->exp_tasks]);
}
pr_cont("\n");
@@ -517,7 +516,7 @@ static void synchronize_sched_expedited_wait(void)
rcu_for_each_leaf_node(rnp) {
for_each_leaf_node_possible_cpu(rnp, cpu) {
mask = leaf_node_cpu_bit(rnp, cpu);
- if (!(rnp->expmask & mask))
+ if (!(READ_ONCE(rnp->expmask) & mask))
continue;
dump_cpu_task(cpu);
}
@@ -537,14 +536,13 @@ static void rcu_exp_wait_wake(unsigned long s)
struct rcu_node *rnp;
synchronize_sched_expedited_wait();
- rcu_exp_gp_seq_end();
- trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end"));
- /*
- * Switch over to wakeup mode, allowing the next GP, but -only- the
- * next GP, to proceed.
- */
+ // Switch over to wakeup mode, allowing the next GP to proceed.
+ // End the previous grace period only after acquiring the mutex
+ // to ensure that only one GP runs concurrently with wakeups.
mutex_lock(&rcu_state.exp_wake_mutex);
+ rcu_exp_gp_seq_end();
+ trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end"));
rcu_for_each_node_breadth_first(rnp) {
if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
@@ -555,7 +553,7 @@ static void rcu_exp_wait_wake(unsigned long s)
spin_unlock(&rnp->exp_lock);
}
smp_mb(); /* All above changes before wakeup. */
- wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]);
+ wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]);
}
trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake"));
mutex_unlock(&rcu_state.exp_wake_mutex);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 1102765f91fd..bbe6d8b3cbd2 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -11,29 +11,7 @@
* Paul E. McKenney <paulmck@linux.ibm.com>
*/
-#include <linux/delay.h>
-#include <linux/gfp.h>
-#include <linux/oom.h>
-#include <linux/sched/debug.h>
-#include <linux/smpboot.h>
-#include <linux/sched/isolation.h>
-#include <uapi/linux/sched/types.h>
-#include "../time/tick-internal.h"
-
-#ifdef CONFIG_RCU_BOOST
#include "../locking/rtmutex_common.h"
-#else /* #ifdef CONFIG_RCU_BOOST */
-
-/*
- * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
- * all uses are in dead code. Provide a definition to keep the compiler
- * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
- * This probably needs to be excluded from -rt builds.
- */
-#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
-#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)
-
-#endif /* #else #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@ -94,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
if (gp_cleanup_delay)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
+ if (!use_softirq)
+ pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
pr_info("\tRCU debug extended QS entry/exit.\n");
rcupdate_announce_bootup_oddness();
@@ -240,7 +220,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* blocked tasks.
*/
if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
- rnp->gp_tasks = &t->rcu_node_entry;
+ WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry);
WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
}
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
@@ -307,11 +287,15 @@ void rcu_note_context_switch(bool preempt)
struct task_struct *t = current;
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp;
+ int sleeping_l = 0;
barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
lockdep_assert_irqs_disabled();
- WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
+#if defined(CONFIG_PREEMPT_RT_FULL)
+ sleeping_l = t->sleeping_lock;
+#endif
+ WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !sleeping_l);
if (t->rcu_read_lock_nesting > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
@@ -371,7 +355,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
*/
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
- return rnp->gp_tasks != NULL;
+ return READ_ONCE(rnp->gp_tasks) != NULL;
}
/* Bias and limit values for ->rcu_read_lock_nesting. */
@@ -523,7 +507,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
rnp->gp_seq, t->pid);
if (&t->rcu_node_entry == rnp->gp_tasks)
- rnp->gp_tasks = np;
+ WRITE_ONCE(rnp->gp_tasks, np);
if (&t->rcu_node_entry == rnp->exp_tasks)
rnp->exp_tasks = np;
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
@@ -627,7 +611,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
if (preempt_bh_were_disabled || irqs_were_disabled) {
WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
/* Need to defer quiescent state until everything is enabled. */
- if (irqs_were_disabled) {
+ if (irqs_were_disabled && use_softirq) {
/* Enabling irqs does not reschedule, so... */
raise_softirq_irqoff(RCU_SOFTIRQ);
} else {
@@ -661,7 +645,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
dump_blkd_tasks(rnp, 10);
if (rcu_preempt_has_tasks(rnp) &&
(rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
- rnp->gp_tasks = rnp->blkd_tasks.next;
+ WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next);
t = container_of(rnp->gp_tasks, struct task_struct,
rcu_node_entry);
trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
@@ -755,7 +739,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n",
- __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks);
+ __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks,
+ rnp->exp_tasks);
pr_info("%s: ->blkd_tasks", __func__);
i = 0;
list_for_each(lhp, &rnp->blkd_tasks) {
@@ -944,18 +929,21 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+/*
+ * If boosting, set rcuc kthreads to realtime priority.
+ */
+static void rcu_cpu_kthread_setup(unsigned int cpu)
+{
#ifdef CONFIG_RCU_BOOST
+ struct sched_param sp;
-static void rcu_wake_cond(struct task_struct *t, int status)
-{
- /*
- * If the thread is yielding, only wake it when this
- * is invoked from idle
- */
- if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
- wake_up_process(t);
+ sp.sched_priority = kthread_prio;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+#endif /* #ifdef CONFIG_RCU_BOOST */
}
+#ifdef CONFIG_RCU_BOOST
+
/*
* Carry out RCU priority boosting on the task indicated by ->exp_tasks
* or ->boost_tasks, advancing the pointer to the next task in the
@@ -1091,23 +1079,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
}
/*
- * Wake up the per-CPU kthread to invoke RCU callbacks.
- */
-static void invoke_rcu_callbacks_kthread(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
- if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL &&
- current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) {
- rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task),
- __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
- }
- local_irq_restore(flags);
-}
-
-/*
* Is the current CPU running the RCU-callbacks kthread?
* Caller must have preemption disabled.
*/
@@ -1160,59 +1131,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
return 0;
}
-static void rcu_cpu_kthread_setup(unsigned int cpu)
-{
- struct sched_param sp;
-
- sp.sched_priority = kthread_prio;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
-}
-
-static void rcu_cpu_kthread_park(unsigned int cpu)
-{
- per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
-}
-
-static int rcu_cpu_kthread_should_run(unsigned int cpu)
-{
- return __this_cpu_read(rcu_data.rcu_cpu_has_work);
-}
-
-/*
- * Per-CPU kernel thread that invokes RCU callbacks. This replaces
- * the RCU softirq used in configurations of RCU that do not support RCU
- * priority boosting.
- */
-static void rcu_cpu_kthread(unsigned int cpu)
-{
- unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
- char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
- int spincnt;
-
- for (spincnt = 0; spincnt < 10; spincnt++) {
- trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
- local_bh_disable();
- *statusp = RCU_KTHREAD_RUNNING;
- local_irq_disable();
- work = *workp;
- *workp = 0;
- local_irq_enable();
- if (work)
- rcu_do_batch(this_cpu_ptr(&rcu_data));
- local_bh_enable();
- if (*workp == 0) {
- trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
- *statusp = RCU_KTHREAD_WAITING;
- return;
- }
- }
- *statusp = RCU_KTHREAD_YIELDING;
- trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
- schedule_timeout_interruptible(2);
- trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
- *statusp = RCU_KTHREAD_WAITING;
-}
-
/*
* Set the per-rcu_node kthread's affinity to cover all CPUs that are
* served by the rcu_node in question. The CPU hotplug lock is still
@@ -1243,27 +1161,13 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
free_cpumask_var(cm);
}
-static struct smp_hotplug_thread rcu_cpu_thread_spec = {
- .store = &rcu_data.rcu_cpu_kthread_task,
- .thread_should_run = rcu_cpu_kthread_should_run,
- .thread_fn = rcu_cpu_kthread,
- .thread_comm = "rcuc/%u",
- .setup = rcu_cpu_kthread_setup,
- .park = rcu_cpu_kthread_park,
-};
-
/*
* Spawn boost kthreads -- called as soon as the scheduler is running.
*/
static void __init rcu_spawn_boost_kthreads(void)
{
struct rcu_node *rnp;
- int cpu;
- for_each_possible_cpu(cpu)
- per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
- if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__))
- return;
rcu_for_each_leaf_node(rnp)
(void)rcu_spawn_one_boost_kthread(rnp);
}
@@ -1286,11 +1190,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
-static void invoke_rcu_callbacks_kthread(void)
-{
- WARN_ON_ONCE(1);
-}
-
static bool rcu_is_callbacks_kthread(void)
{
return false;
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c3bf44ba42e5..034ea8eddcfe 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -55,15 +55,23 @@ extern int rcu_expedited; /* from sysctl */
module_param(rcu_expedited, int, 0);
extern int rcu_normal; /* from sysctl */
module_param(rcu_normal, int, 0);
-static int rcu_normal_after_boot;
+static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
+#ifndef CONFIG_PREEMPT_RT_FULL
module_param(rcu_normal_after_boot, int, 0);
+#endif
#endif /* #ifndef CONFIG_TINY_RCU */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/**
- * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
+ * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section?
+ * @ret: Best guess answer if lockdep cannot be relied on
+ *
+ * Returns true if lockdep must be ignored, in which case *ret contains
+ * the best guess described below. Otherwise returns false, in which
+ * case *ret tells the caller nothing and the caller should instead
+ * consult lockdep.
*
- * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an
* RCU-sched read-side critical section. In absence of
* CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
* critical section unless it can prove otherwise. Note that disabling
@@ -75,35 +83,45 @@ module_param(rcu_normal_after_boot, int, 0);
* Check debug_lockdep_rcu_enabled() to prevent false positives during boot
* and while lockdep is disabled.
*
- * Note that if the CPU is in the idle loop from an RCU point of
- * view (ie: that we are in the section between rcu_idle_enter() and
- * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
- * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs
- * that are in such a section, considering these as in extended quiescent
- * state, so such a CPU is effectively never in an RCU read-side critical
- * section regardless of what RCU primitives it invokes. This state of
- * affairs is required --- we need to keep an RCU-free window in idle
- * where the CPU may possibly enter into low power mode. This way we can
- * notice an extended quiescent state to other CPUs that started a grace
- * period. Otherwise we would delay any grace period as long as we run in
- * the idle task.
+ * Note that if the CPU is in the idle loop from an RCU point of view (ie:
+ * that we are in the section between rcu_idle_enter() and rcu_idle_exit())
+ * then rcu_read_lock_held() sets *ret to false even if the CPU did an
+ * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are
+ * in such a section, considering these as in extended quiescent state,
+ * so such a CPU is effectively never in an RCU read-side critical section
+ * regardless of what RCU primitives it invokes. This state of affairs is
+ * required --- we need to keep an RCU-free window in idle where the CPU may
+ * possibly enter into low power mode. This way we can notice an extended
+ * quiescent state to other CPUs that started a grace period. Otherwise
+ * we would delay any grace period as long as we run in the idle task.
*
- * Similarly, we avoid claiming an SRCU read lock held if the current
+ * Similarly, we avoid claiming an RCU read lock held if the current
* CPU is offline.
*/
+static bool rcu_read_lock_held_common(bool *ret)
+{
+ if (!debug_lockdep_rcu_enabled()) {
+ *ret = 1;
+ return true;
+ }
+ if (!rcu_is_watching()) {
+ *ret = 0;
+ return true;
+ }
+ if (!rcu_lockdep_current_cpu_online()) {
+ *ret = 0;
+ return true;
+ }
+ return false;
+}
+
int rcu_read_lock_sched_held(void)
{
- int lockdep_opinion = 0;
+ bool ret;
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
- if (debug_locks)
- lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
- return lockdep_opinion || !preemptible();
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
+ return lock_is_held(&rcu_sched_lock_map) || !preemptible();
}
EXPORT_SYMBOL(rcu_read_lock_sched_held);
#endif
@@ -136,8 +154,7 @@ static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
*/
bool rcu_gp_is_expedited(void)
{
- return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
- rcu_scheduler_active == RCU_SCHEDULER_INIT;
+ return rcu_expedited || atomic_read(&rcu_expedited_nesting);
}
EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
@@ -261,12 +278,10 @@ NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled);
*/
int rcu_read_lock_held(void)
{
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
return lock_is_held(&rcu_lock_map);
}
EXPORT_SYMBOL_GPL(rcu_read_lock_held);
@@ -288,16 +303,28 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held);
*/
int rcu_read_lock_bh_held(void)
{
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
return in_softirq() || irqs_disabled();
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
+int rcu_read_lock_any_held(void)
+{
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
+ if (lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map))
+ return 1;
+ return !preemptible();
+}
+EXPORT_SYMBOL_GPL(rcu_read_lock_any_held);
+
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/**
diff --git a/kernel/relay.c b/kernel/relay.c
index ade14fb7ce2e..4b760ec16342 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -581,6 +581,11 @@ struct rchan *relay_open(const char *base_filename,
return NULL;
chan->buf = alloc_percpu(struct rchan_buf *);
+ if (!chan->buf) {
+ kfree(chan);
+ return NULL;
+ }
+
chan->version = RELAYFS_CHANNEL_VERSION;
chan->n_subbufs = n_subbufs;
chan->subbuf_size = subbuf_size;
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a1ad5b7d5521..49c14137988e 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -29,12 +29,12 @@ void complete(struct completion *x)
{
unsigned long flags;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
if (x->done != UINT_MAX)
x->done++;
- __wake_up_locked(&x->wait, TASK_NORMAL, 1);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ swake_up_locked(&x->wait);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
@@ -58,10 +58,10 @@ void complete_all(struct completion *x)
{
unsigned long flags;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
x->done = UINT_MAX;
- __wake_up_locked(&x->wait, TASK_NORMAL, 0);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ swake_up_all_locked(&x->wait);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete_all);
@@ -70,20 +70,20 @@ do_wait_for_common(struct completion *x,
long (*action)(long), long timeout, int state)
{
if (!x->done) {
- DECLARE_WAITQUEUE(wait, current);
+ DECLARE_SWAITQUEUE(wait);
- __add_wait_queue_entry_tail_exclusive(&x->wait, &wait);
do {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
+ __prepare_to_swait(&x->wait, &wait);
__set_current_state(state);
- spin_unlock_irq(&x->wait.lock);
+ raw_spin_unlock_irq(&x->wait.lock);
timeout = action(timeout);
- spin_lock_irq(&x->wait.lock);
+ raw_spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
- __remove_wait_queue(&x->wait, &wait);
+ __finish_swait(&x->wait, &wait);
if (!x->done)
return timeout;
}
@@ -100,9 +100,9 @@ __wait_for_common(struct completion *x,
complete_acquire(x);
- spin_lock_irq(&x->wait.lock);
+ raw_spin_lock_irq(&x->wait.lock);
timeout = do_wait_for_common(x, action, timeout, state);
- spin_unlock_irq(&x->wait.lock);
+ raw_spin_unlock_irq(&x->wait.lock);
complete_release(x);
@@ -291,12 +291,12 @@ bool try_wait_for_completion(struct completion *x)
if (!READ_ONCE(x->done))
return false;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = false;
else if (x->done != UINT_MAX)
x->done--;
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
return ret;
}
EXPORT_SYMBOL(try_wait_for_completion);
@@ -322,8 +322,8 @@ bool completion_done(struct completion *x)
* otherwise we can end up freeing the completion before complete()
* is done referencing it.
*/
- spin_lock_irqsave(&x->wait.lock, flags);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
return true;
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index be7c380c07e6..8c9c77f73e69 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -45,7 +45,11 @@ const_debug unsigned int sysctl_sched_features =
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
+#ifdef CONFIG_PREEMPT_RT_FULL
+const_debug unsigned int sysctl_sched_nr_migrate = 8;
+#else
const_debug unsigned int sysctl_sched_nr_migrate = 32;
+#endif
/*
* period over which we measure -rt task CPU usage in us.
@@ -317,7 +321,7 @@ static void hrtick_rq_init(struct rq *rq)
rq->hrtick_csd.info = rq;
#endif
- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
@@ -399,9 +403,15 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
#endif
-static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task,
+ bool sleeper)
{
- struct wake_q_node *node = &task->wake_q;
+ struct wake_q_node *node;
+
+ if (sleeper)
+ node = &task->wake_q_sleeper;
+ else
+ node = &task->wake_q;
/*
* Atomically grab the task, if ->wake_q is !nil already it means
@@ -437,7 +447,13 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
*/
void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
- if (__wake_q_add(head, task))
+ if (__wake_q_add(head, task, false))
+ get_task_struct(task);
+}
+
+void wake_q_add_sleeper(struct wake_q_head *head, struct task_struct *task)
+{
+ if (__wake_q_add(head, task, true))
get_task_struct(task);
}
@@ -460,28 +476,39 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
*/
void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
{
- if (!__wake_q_add(head, task))
+ if (!__wake_q_add(head, task, false))
put_task_struct(task);
}
-void wake_up_q(struct wake_q_head *head)
+void __wake_up_q(struct wake_q_head *head, bool sleeper)
{
struct wake_q_node *node = head->first;
while (node != WAKE_Q_TAIL) {
struct task_struct *task;
- task = container_of(node, struct task_struct, wake_q);
+ if (sleeper)
+ task = container_of(node, struct task_struct, wake_q_sleeper);
+ else
+ task = container_of(node, struct task_struct, wake_q);
+
BUG_ON(!task);
/* Task can safely be re-inserted now: */
node = node->next;
- task->wake_q.next = NULL;
+ if (sleeper)
+ task->wake_q_sleeper.next = NULL;
+ else
+ task->wake_q.next = NULL;
/*
* wake_up_process() executes a full barrier, which pairs with
* the queueing in wake_q_add() so as not to miss wakeups.
*/
- wake_up_process(task);
+ if (sleeper)
+ wake_up_lock_sleeper(task);
+ else
+ wake_up_process(task);
+
put_task_struct(task);
}
}
@@ -517,6 +544,48 @@ void resched_curr(struct rq *rq)
trace_sched_wake_idle_without_ipi(cpu);
}
+#ifdef CONFIG_PREEMPT_LAZY
+
+static int tsk_is_polling(struct task_struct *p)
+{
+#ifdef TIF_POLLING_NRFLAG
+ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
+#else
+ return 0;
+#endif
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ int cpu;
+
+ if (!sched_feat(PREEMPT_LAZY)) {
+ resched_curr(rq);
+ return;
+ }
+
+ lockdep_assert_held(&rq->lock);
+
+ if (test_tsk_need_resched(curr))
+ return;
+
+ if (test_tsk_need_resched_lazy(curr))
+ return;
+
+ set_tsk_need_resched_lazy(curr);
+
+ cpu = cpu_of(rq);
+ if (cpu == smp_processor_id())
+ return;
+
+ /* NEED_RESCHED_LAZY must be visible before we test polling */
+ smp_mb();
+ if (!tsk_is_polling(curr))
+ smp_send_reschedule(cpu);
+}
+#endif
+
void resched_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -930,10 +999,10 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
*/
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
- if (is_per_cpu_kthread(p))
+ if (is_per_cpu_kthread(p) || __migrate_disabled(p))
return cpu_online(cpu);
return cpu_active(cpu);
@@ -982,6 +1051,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
struct migration_arg {
struct task_struct *task;
int dest_cpu;
+ bool done;
};
/*
@@ -1017,6 +1087,11 @@ static int migration_cpu_stop(void *data)
struct task_struct *p = arg->task;
struct rq *rq = this_rq();
struct rq_flags rf;
+ int dest_cpu = arg->dest_cpu;
+
+ /* We don't look at arg after this point. */
+ smp_mb();
+ arg->done = true;
/*
* The original target CPU might have gone down and we might
@@ -1025,7 +1100,7 @@ static int migration_cpu_stop(void *data)
local_irq_disable();
/*
* We need to explicitly wake pending tasks before running
- * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
sched_ttwu_pending();
@@ -1039,9 +1114,9 @@ static int migration_cpu_stop(void *data)
*/
if (task_rq(p) == rq) {
if (task_on_rq_queued(p))
- rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
+ rq = __migrate_task(rq, &rf, p, dest_cpu);
else
- p->wake_cpu = arg->dest_cpu;
+ p->wake_cpu = dest_cpu;
}
rq_unlock(rq, &rf);
raw_spin_unlock(&p->pi_lock);
@@ -1056,10 +1131,19 @@ static int migration_cpu_stop(void *data)
*/
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
- cpumask_copy(&p->cpus_allowed, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
+ cpumask_copy(&p->cpus_mask, new_mask);
+ if (p->cpus_ptr == &p->cpus_mask)
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
}
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
+int __migrate_disabled(struct task_struct *p)
+{
+ return p->migrate_disable;
+}
+EXPORT_SYMBOL_GPL(__migrate_disabled);
+#endif
+
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
struct rq *rq = task_rq(p);
@@ -1126,7 +1210,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
goto out;
}
- if (cpumask_equal(&p->cpus_allowed, new_mask))
+ if (cpumask_equal(&p->cpus_mask, new_mask))
goto out;
dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
@@ -1148,7 +1232,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
}
/* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
+ if (cpumask_test_cpu(task_cpu(p), new_mask) ||
+ p->cpus_ptr != &p->cpus_mask)
goto out;
if (task_running(rq, p) || p->state == TASK_WAKING) {
@@ -1286,10 +1371,10 @@ static int migrate_swap_stop(void *data)
if (task_cpu(arg->src_task) != arg->src_cpu)
goto unlock;
- if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
goto unlock;
- if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
goto unlock;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1331,10 +1416,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
goto out;
- if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
goto out;
- if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
goto out;
trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1345,6 +1430,18 @@ out:
}
#endif /* CONFIG_NUMA_BALANCING */
+static bool check_task_state(struct task_struct *p, long match_state)
+{
+ bool match = false;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ if (p->state == match_state || p->saved_state == match_state)
+ match = true;
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return match;
+}
+
/*
* wait_task_inactive - wait for a thread to unschedule.
*
@@ -1389,7 +1486,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* is actually now running somewhere else!
*/
while (task_running(rq, p)) {
- if (match_state && unlikely(p->state != match_state))
+ if (match_state && !check_task_state(p, match_state))
return 0;
cpu_relax();
}
@@ -1404,7 +1501,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
running = task_running(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
- if (!match_state || p->state == match_state)
+ if (!match_state || p->state == match_state ||
+ p->saved_state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, p, &rf);
@@ -1479,7 +1577,7 @@ void kick_process(struct task_struct *p)
EXPORT_SYMBOL_GPL(kick_process);
/*
- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
*
* A few notes on cpu_active vs cpu_online:
*
@@ -1519,14 +1617,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for_each_cpu(dest_cpu, nodemask) {
if (!cpu_active(dest_cpu))
continue;
- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
return dest_cpu;
}
}
for (;;) {
/* Any allowed, online CPU? */
- for_each_cpu(dest_cpu, &p->cpus_allowed) {
+ for_each_cpu(dest_cpu, p->cpus_ptr) {
if (!is_cpu_allowed(p, dest_cpu))
continue;
@@ -1570,7 +1668,7 @@ out:
}
/*
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
*/
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
@@ -1580,11 +1678,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
if (p->nr_cpus_allowed > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
else
- cpu = cpumask_any(&p->cpus_allowed);
+ cpu = cpumask_any(p->cpus_ptr);
/*
* In order not to call set_task_cpu() on a blocking task we need
- * to rely on ttwu() to place the task on a valid ->cpus_allowed
+ * to rely on ttwu() to place the task on a valid ->cpus_ptr
* CPU.
*
* Since this is common to all placement strategies, this lives here.
@@ -1999,8 +2097,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
- if (!(p->state & state))
+ if (!(p->state & state)) {
+ /*
+ * The task might be running due to a spinlock sleeper
+ * wakeup. Check the saved state and set it to running
+ * if the wakeup condition is true.
+ */
+ if (!(wake_flags & WF_LOCK_SLEEPER)) {
+ if (p->saved_state & state) {
+ p->saved_state = TASK_RUNNING;
+ success = 1;
+ }
+ }
goto out;
+ }
+
+ /*
+ * If this is a regular wakeup, then we can unconditionally
+ * clear the saved state of a "lock sleeper".
+ */
+ if (!(wake_flags & WF_LOCK_SLEEPER))
+ p->saved_state = TASK_RUNNING;
trace_sched_waking(p);
@@ -2115,6 +2232,18 @@ int wake_up_process(struct task_struct *p)
}
EXPORT_SYMBOL(wake_up_process);
+/**
+ * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
+ * @p: The process to be woken up.
+ *
+ * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
+ * the nature of the wakeup.
+ */
+int wake_up_lock_sleeper(struct task_struct *p)
+{
+ return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER);
+}
+
int wake_up_state(struct task_struct *p, unsigned int state)
{
return try_to_wake_up(p, state, 0);
@@ -2338,6 +2467,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ rseq_migrate(p);
/*
* We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
@@ -2355,6 +2485,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->on_cpu = 0;
#endif
init_task_preempt_count(p);
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+ task_thread_info(p)->preempt_lazy_count = 0;
+#endif
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
@@ -2395,13 +2528,14 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
- * - cpus_allowed can change in the fork path
+ * - cpus_ptr can change in the fork path
* - any previously selected CPU might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
*/
p->recent_used_cpu = task_cpu(p);
+ rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
@@ -2682,23 +2816,18 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* provided by mmdrop(),
* - a sync_core for SYNC_CORE.
*/
+ /*
+ * We use mmdrop_delayed() here so we don't have to do the
+ * full __mmdrop() when we are the last user.
+ */
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
- mmdrop(mm);
+ mmdrop_delayed(mm);
}
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
- /*
- * Remove function-return probe instances associated with this
- * task and put them back on the free list.
- */
- kprobe_flush_task(prev);
-
- /* Task is done with its stack. */
- put_task_stack(prev);
-
put_task_struct(prev);
}
@@ -3102,28 +3231,31 @@ static void sched_tick_remote(struct work_struct *work)
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
- if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+ if (!tick_nohz_tick_stopped_cpu(cpu))
goto out_requeue;
rq_lock_irq(rq, &rf);
curr = rq->curr;
- if (is_idle_task(curr) || cpu_is_offline(cpu))
+ if (cpu_is_offline(cpu))
goto out_unlock;
update_rq_clock(rq);
- delta = rq_clock_task(rq) - curr->se.exec_start;
- /*
- * Make sure the next tick runs within a reasonable
- * amount of time.
- */
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ if (!is_idle_task(curr)) {
+ /*
+ * Make sure the next tick runs within a reasonable
+ * amount of time.
+ */
+ delta = rq_clock_task(rq) - curr->se.exec_start;
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ }
curr->sched_class->task_tick(rq, curr, 0);
+ calc_load_nohz_remote(rq);
out_unlock:
rq_unlock_irq(rq, &rf);
-
out_requeue:
+
/*
* Run the remote tick once per second (1Hz). This arbitrary
* frequency is large enough to avoid overload but short enough
@@ -3368,6 +3500,8 @@ again:
BUG();
}
+static void migrate_disabled_sched(struct task_struct *p);
+
/*
* __schedule() is the main scheduler function.
*
@@ -3438,6 +3572,9 @@ static void __sched notrace __schedule(bool preempt)
rq_lock(rq, &rf);
smp_mb__after_spinlock();
+ if (__migrate_disabled(prev))
+ migrate_disabled_sched(prev);
+
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
update_rq_clock(rq);
@@ -3459,6 +3596,7 @@ static void __sched notrace __schedule(bool preempt)
next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
+ clear_tsk_need_resched_lazy(prev);
clear_preempt_need_resched();
if (likely(prev != next)) {
@@ -3642,6 +3780,30 @@ static void __sched notrace preempt_schedule_common(void)
} while (need_resched());
}
+#ifdef CONFIG_PREEMPT_LAZY
+/*
+ * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is
+ * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as
+ * preempt_lazy_count counter >0.
+ */
+static __always_inline int preemptible_lazy(void)
+{
+ if (test_thread_flag(TIF_NEED_RESCHED))
+ return 1;
+ if (current_thread_info()->preempt_lazy_count)
+ return 0;
+ return 1;
+}
+
+#else
+
+static inline int preemptible_lazy(void)
+{
+ return 1;
+}
+
+#endif
+
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
@@ -3656,7 +3818,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
*/
if (likely(!preemptible()))
return;
-
+ if (!preemptible_lazy())
+ return;
preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
@@ -3683,6 +3846,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
if (likely(!preemptible()))
return;
+ if (!preemptible_lazy())
+ return;
+
do {
/*
* Because the function tracer can trace preempt_count_sub()
@@ -3857,7 +4023,8 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
*/
if (dl_prio(prio)) {
if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
queue_flag |= ENQUEUE_REPLENISH;
} else
@@ -4311,7 +4478,7 @@ change:
* the entire root_domain to become SCHED_DEADLINE. We
* will also fail if there's no bandwidth available.
*/
- if (!cpumask_subset(span, &p->cpus_allowed) ||
+ if (!cpumask_subset(span, p->cpus_ptr) ||
rq->rd->dl_bw.bw == 0) {
task_rq_unlock(rq, p, &rf);
return -EPERM;
@@ -4910,7 +5077,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
@@ -5405,10 +5572,11 @@ void init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ __sched_fork(0, idle);
+
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_lock(&rq->lock);
- __sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
idle->flags |= PF_IDLE;
@@ -5448,7 +5616,9 @@ void init_idle(struct task_struct *idle, int cpu)
/* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu);
-
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+ task_thread_info(idle)->preempt_lazy_count = 0;
+#endif
/*
* The idle tasks have their own, simple scheduling class:
*/
@@ -5487,7 +5657,7 @@ int task_can_attach(struct task_struct *p,
* allowed nodes is unnecessary. Thus, cpusets are not
* applicable for such threads. This prevents checking for
* success of set_cpus_allowed_ptr() on all attached tasks
- * before cpus_allowed may be changed.
+ * before cpus_mask may be changed.
*/
if (p->flags & PF_NO_SETAFFINITY) {
ret = -EINVAL;
@@ -5514,7 +5684,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (curr_cpu == target_cpu)
return 0;
- if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
/* TODO: This is not properly updating schedstats */
@@ -5553,6 +5723,8 @@ void sched_setnuma(struct task_struct *p, int nid)
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_HOTPLUG_CPU
+static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
+
/*
* Ensure that the idle task is using init_mm right before its CPU goes
* offline.
@@ -5562,13 +5734,17 @@ void idle_task_exit(void)
struct mm_struct *mm = current->active_mm;
BUG_ON(cpu_online(smp_processor_id()));
+ BUG_ON(current != this_rq()->idle);
if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
- current->active_mm = &init_mm;
finish_arch_post_lock_switch();
}
- mmdrop(mm);
+ /*
+ * Defer the cleanup to an alive cpu. On RT we can neither
+ * call mmdrop() nor mmdrop_delayed() from here.
+ */
+ per_cpu(idle_last_mm, smp_processor_id()) = mm;
}
/*
@@ -5651,8 +5827,10 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
BUG_ON(!next);
put_prev_task(rq, next);
+ WARN_ON_ONCE(__migrate_disabled(next));
+
/*
- * Rules for changing task_struct::cpus_allowed are holding
+ * Rules for changing task_struct::cpus_mask are holding
* both pi_lock and rq->lock, such that holding either
* stabilizes the mask.
*
@@ -5880,6 +6058,10 @@ int sched_cpu_dying(unsigned int cpu)
update_max_interval();
nohz_balance_exit_idle(rq);
hrtick_clear(rq);
+ if (per_cpu(idle_last_mm, cpu)) {
+ mmdrop_delayed(per_cpu(idle_last_mm, cpu));
+ per_cpu(idle_last_mm, cpu) = NULL;
+ }
return 0;
}
#endif
@@ -6113,7 +6295,7 @@ void __init sched_init(void)
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline int preempt_count_equals(int preempt_offset)
{
- int nested = preempt_count() + rcu_preempt_depth();
+ int nested = preempt_count() + sched_rcu_preempt_depth();
return (nested == preempt_offset);
}
@@ -6429,8 +6611,15 @@ void sched_move_task(struct task_struct *tsk)
if (queued)
enqueue_task(rq, tsk, queue_flags);
- if (running)
+ if (running) {
set_curr_task(rq, tsk);
+ /*
+ * After changing group, the running task may have joined a
+ * throttled one but it's still the running task. Trigger a
+ * resched to make sure that task can still run.
+ */
+ resched_curr(rq);
+ }
task_rq_unlock(rq, tsk, &rf);
}
@@ -6565,6 +6754,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
@@ -6593,6 +6784,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
return -EINVAL;
/*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+ return -EINVAL;
+
+ /*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
@@ -7123,3 +7320,165 @@ const u32 sched_prio_to_wmult[40] = {
};
#undef CREATE_TRACE_POINTS
+
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
+
+static inline void
+update_nr_migratory(struct task_struct *p, long delta)
+{
+ if (unlikely((p->sched_class == &rt_sched_class ||
+ p->sched_class == &dl_sched_class) &&
+ p->nr_cpus_allowed > 1)) {
+ if (p->sched_class == &rt_sched_class)
+ task_rq(p)->rt.rt_nr_migratory += delta;
+ else
+ task_rq(p)->dl.dl_nr_migratory += delta;
+ }
+}
+
+static inline void
+migrate_disable_update_cpus_allowed(struct task_struct *p)
+{
+ p->cpus_ptr = cpumask_of(smp_processor_id());
+ update_nr_migratory(p, -1);
+ p->nr_cpus_allowed = 1;
+}
+
+static inline void
+migrate_enable_update_cpus_allowed(struct task_struct *p)
+{
+ struct rq *rq;
+ struct rq_flags rf;
+
+ rq = task_rq_lock(p, &rf);
+ p->cpus_ptr = &p->cpus_mask;
+ p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
+ update_nr_migratory(p, 1);
+ task_rq_unlock(rq, p, &rf);
+}
+
+void migrate_disable(void)
+{
+ preempt_disable();
+
+ if (++current->migrate_disable == 1) {
+ this_rq()->nr_pinned++;
+ preempt_lazy_disable();
+#ifdef CONFIG_SCHED_DEBUG
+ WARN_ON_ONCE(current->pinned_on_cpu >= 0);
+ current->pinned_on_cpu = smp_processor_id();
+#endif
+ }
+
+ preempt_enable();
+}
+EXPORT_SYMBOL(migrate_disable);
+
+static void migrate_disabled_sched(struct task_struct *p)
+{
+ if (p->migrate_disable_scheduled)
+ return;
+
+ migrate_disable_update_cpus_allowed(p);
+ p->migrate_disable_scheduled = 1;
+}
+
+void migrate_enable(void)
+{
+ struct task_struct *p = current;
+ struct rq *rq = this_rq();
+ int cpu = task_cpu(p);
+
+ WARN_ON_ONCE(p->migrate_disable <= 0);
+ if (p->migrate_disable > 1) {
+ p->migrate_disable--;
+ return;
+ }
+
+ preempt_disable();
+
+#ifdef CONFIG_SCHED_DEBUG
+ WARN_ON_ONCE(current->pinned_on_cpu != cpu);
+ current->pinned_on_cpu = -1;
+#endif
+
+ WARN_ON_ONCE(rq->nr_pinned < 1);
+
+ p->migrate_disable = 0;
+ rq->nr_pinned--;
+#ifdef CONFIG_HOTPLUG_CPU
+ if (rq->nr_pinned == 0 && unlikely(!cpu_active(cpu)) &&
+ takedown_cpu_task)
+ wake_up_process(takedown_cpu_task);
+#endif
+
+ if (!p->migrate_disable_scheduled)
+ goto out;
+
+ p->migrate_disable_scheduled = 0;
+
+ migrate_enable_update_cpus_allowed(p);
+
+ WARN_ON(smp_processor_id() != cpu);
+ if (!is_cpu_allowed(p, cpu)) {
+ struct migration_arg arg = { .task = p };
+ struct cpu_stop_work work;
+ struct rq_flags rf;
+
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+ arg.dest_cpu = select_fallback_rq(cpu, p);
+ task_rq_unlock(rq, p, &rf);
+
+ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+ &arg, &work);
+ __schedule(true);
+ if (!work.disabled) {
+ while (!arg.done)
+ cpu_relax();
+ }
+ }
+
+out:
+ preempt_lazy_enable();
+ preempt_enable();
+}
+EXPORT_SYMBOL(migrate_enable);
+
+int cpu_nr_pinned(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ return rq->nr_pinned;
+}
+
+#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
+static void migrate_disabled_sched(struct task_struct *p)
+{
+}
+
+void migrate_disable(void)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ current->migrate_disable++;
+#endif
+ barrier();
+}
+EXPORT_SYMBOL(migrate_disable);
+
+void migrate_enable(void)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ struct task_struct *p = current;
+
+ WARN_ON_ONCE(p->migrate_disable <= 0);
+ p->migrate_disable--;
+#endif
+ barrier();
+}
+EXPORT_SYMBOL(migrate_enable);
+#else
+static void migrate_disabled_sched(struct task_struct *p)
+{
+}
+#endif
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index ec4e4a9aab5f..5cc4012572ec 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -120,14 +120,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
- cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
+ cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
- if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
+ if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
if (later_mask)
cpumask_set_cpu(best_cpu, later_mask);
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index b5dcd1d83c7f..7c2fe50fd76d 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,6 +5,8 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
+#include <linux/cpufreq.h>
+
#include "sched.h"
DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
@@ -57,3 +59,19 @@ void cpufreq_remove_update_util_hook(int cpu)
rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
}
EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
+
+/**
+ * cpufreq_this_cpu_can_update - Check if cpufreq policy can be updated.
+ * @policy: cpufreq policy to check.
+ *
+ * Return 'true' if:
+ * - the local and remote CPUs share @policy,
+ * - dvfs_possible_from_any_cpu is set in @policy and the local CPU is not going
+ * offline (in which case it is not expected to run cpufreq updates any more).
+ */
+bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy)
+{
+ return cpumask_test_cpu(smp_processor_id(), policy->cpus) ||
+ (policy->dvfs_possible_from_any_cpu &&
+ rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)));
+}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e139b54716b4..dd44eb122514 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -82,12 +82,10 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
* by the hardware, as calculating the frequency is pointless if
* we cannot in fact act on it.
*
- * For the slow switching platforms, the kthread is always scheduled on
- * the right set of CPUs and any CPU can find the next frequency and
- * schedule the kthread.
+ * This is needed on the slow switching platforms too to prevent CPUs
+ * going offline from leaving stale IRQ work items behind.
*/
- if (sg_policy->policy->fast_switch_enabled &&
- !cpufreq_this_cpu_can_update(sg_policy->policy))
+ if (!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
if (unlikely(sg_policy->limits_changed)) {
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 9c6480e6d62d..b7abca987d94 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -94,11 +94,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
continue;
- if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
+ if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
continue;
if (lowest_mask) {
- cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+ cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
/*
* We have to ensure that we have at least one bit
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2305ce89a26c..46ed4e1383e2 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -740,7 +740,7 @@ void vtime_account_system(struct task_struct *tsk)
write_seqcount_begin(&vtime->seqcount);
/* We might have scheduled out from guest path */
- if (current->flags & PF_VCPU)
+ if (tsk->flags & PF_VCPU)
vtime_account_guest(tsk, vtime);
else
__vtime_account_system(tsk, vtime);
@@ -783,7 +783,7 @@ void vtime_guest_enter(struct task_struct *tsk)
*/
write_seqcount_begin(&vtime->seqcount);
__vtime_account_system(tsk, vtime);
- current->flags |= PF_VCPU;
+ tsk->flags |= PF_VCPU;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
@@ -794,7 +794,7 @@ void vtime_guest_exit(struct task_struct *tsk)
write_seqcount_begin(&vtime->seqcount);
vtime_account_guest(tsk, vtime);
- current->flags &= ~PF_VCPU;
+ tsk->flags &= ~PF_VCPU;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fcdafdcb129c..de9f74874ec1 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -287,7 +287,7 @@ static void task_non_contending(struct task_struct *p)
dl_se->dl_non_contending = 1;
get_task_struct(p);
- hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
+ hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
}
static void task_contending(struct sched_dl_entity *dl_se, int flags)
@@ -539,7 +539,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* If we cannot preempt any rq, fall back to pick any
* online CPU:
*/
- cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
+ cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
if (cpu >= nr_cpu_ids) {
/*
* Failed to find any suitable CPU.
@@ -1086,7 +1086,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
timer->function = dl_task_timer;
}
@@ -1325,7 +1325,7 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->inactive_timer;
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
timer->function = inactive_task_timer;
}
@@ -1857,7 +1857,7 @@ static void set_curr_task_dl(struct rq *rq)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
}
@@ -2007,7 +2007,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
+ !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
task_running(rq, task) ||
!dl_task(task) ||
!task_on_rq_queued(task))) {
@@ -2679,6 +2679,7 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_bw = 0;
dl_se->dl_density = 0;
+ dl_se->dl_boosted = 0;
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 14c6a8716ba1..333ce07c78c6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -272,7 +272,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0444, proc_dointvec_minmax, false);
set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
/* &table[13] is terminator */
@@ -979,6 +979,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(dl.runtime);
P(dl.deadline);
}
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
+ P(migrate_disable);
+#endif
+ P(nr_cpus_allowed);
#undef PN_SCHEDSTAT
#undef PN
#undef __PN
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5a312c030b8d..ba052a06da61 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1653,7 +1653,7 @@ static void task_numa_compare(struct task_numa_env *env,
* be incurred if the tasks were swapped.
*/
/* Skip this swap candidate if cannot move to the source cpu */
- if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
goto unlock;
/*
@@ -1751,7 +1751,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
- if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
env->dst_cpu = cpu;
@@ -2659,7 +2659,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
/*
* We don't care about NUMA placement if we don't have memory.
*/
- if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
return;
/*
@@ -3488,9 +3488,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (decayed)
- cfs_rq_util_change(cfs_rq, 0);
-
return decayed;
}
@@ -3596,8 +3593,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
update_tg_load_avg(cfs_rq, 0);
- } else if (decayed && (flags & UPDATE_TG))
- update_tg_load_avg(cfs_rq, 0);
+ } else if (decayed) {
+ cfs_rq_util_change(cfs_rq, 0);
+
+ if (flags & UPDATE_TG)
+ update_tg_load_avg(cfs_rq, 0);
+ }
}
#ifndef CONFIG_64BIT
@@ -3805,7 +3806,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
return;
}
- rq->misfit_task_load = task_h_load(p);
+ /*
+ * Make sure that misfit_task_load will not be null even if
+ * task_h_load() returns 0.
+ */
+ rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
}
#else /* CONFIG_SMP */
@@ -3908,6 +3913,7 @@ static inline void check_schedstat_required(void)
#endif
}
+static inline bool cfs_bandwidth_used(void);
/*
* MIGRATION
@@ -3986,10 +3992,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- if (cfs_rq->nr_running == 1) {
+ /*
+ * When bandwidth control is enabled, cfs might have been removed
+ * because of a parent been throttled but cfs->nr_running > 1. Try to
+ * add it unconditionnally.
+ */
+ if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
list_add_leaf_cfs_rq(cfs_rq);
+
+ if (cfs_rq->nr_running == 1)
check_enqueue_throttle(cfs_rq);
- }
}
static void __clear_buddies_last(struct sched_entity *se)
@@ -4104,7 +4116,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
@@ -4128,7 +4140,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
return;
if (delta > ideal_runtime)
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
}
static void
@@ -4270,7 +4282,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* validating it and just reschedule.
*/
if (queued) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
return;
}
/*
@@ -4334,23 +4346,16 @@ static inline u64 sched_cfs_bandwidth_slice(void)
}
/*
- * Replenish runtime according to assigned quota and update expiration time.
- * We use sched_clock_cpu directly instead of rq->clock to avoid adding
- * additional synchronization around rq->lock.
+ * Replenish runtime according to assigned quota. We use sched_clock_cpu
+ * directly instead of rq->clock to avoid adding additional synchronization
+ * around rq->lock.
*
* requires cfs_b->lock
*/
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
- u64 now;
-
- if (cfs_b->quota == RUNTIME_INF)
- return;
-
- now = sched_clock_cpu(smp_processor_id());
- cfs_b->runtime = cfs_b->quota;
- cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
- cfs_b->expires_seq++;
+ if (cfs_b->quota != RUNTIME_INF)
+ cfs_b->runtime = cfs_b->quota;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4372,8 +4377,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount, expires;
- int expires_seq;
+ u64 amount = 0, min_amount;
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4390,61 +4394,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- expires_seq = cfs_b->expires_seq;
- expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
- /*
- * we may have advanced our local expiration to account for allowed
- * spread between our sched_clock and the one on which runtime was
- * issued.
- */
- if (cfs_rq->expires_seq != expires_seq) {
- cfs_rq->expires_seq = expires_seq;
- cfs_rq->runtime_expires = expires;
- }
return cfs_rq->runtime_remaining > 0;
}
-/*
- * Note: This depends on the synchronization provided by sched_clock and the
- * fact that rq->clock snapshots this value.
- */
-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
- /* if the deadline is ahead of our clock, nothing to do */
- if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
- return;
-
- if (cfs_rq->runtime_remaining < 0)
- return;
-
- /*
- * If the local deadline has passed we have to consider the
- * possibility that our sched_clock is 'fast' and the global deadline
- * has not truly expired.
- *
- * Fortunately we can check determine whether this the case by checking
- * whether the global deadline(cfs_b->expires_seq) has advanced.
- */
- if (cfs_rq->expires_seq == cfs_b->expires_seq) {
- /* extend local deadline, drift is bounded above by 2 ticks */
- cfs_rq->runtime_expires += TICK_NSEC;
- } else {
- /* global deadline is ahead, expiration has passed */
- cfs_rq->runtime_remaining = 0;
- }
-}
-
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
- expire_cfs_rq_runtime(cfs_rq);
if (likely(cfs_rq->runtime_remaining > 0))
return;
@@ -4456,7 +4416,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
* hierarchy can be throttled
*/
if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
}
static __always_inline
@@ -4535,7 +4495,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, dequeue = 1;
+ long task_delta, idle_task_delta, dequeue = 1;
bool empty;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -4546,6 +4506,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
rcu_read_unlock();
task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
@@ -4555,6 +4516,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (dequeue)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
qcfs_rq->h_nr_running -= task_delta;
+ qcfs_rq->idle_h_nr_running -= idle_task_delta;
if (qcfs_rq->load.weight)
dequeue = 0;
@@ -4593,8 +4555,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- int enqueue = 1;
- long task_delta;
+ long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -4614,31 +4575,64 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
return;
task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
if (se->on_rq)
- enqueue = 0;
+ break;
+ cfs_rq = cfs_rq_of(se);
+ enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+
+ cfs_rq->h_nr_running += task_delta;
+ cfs_rq->idle_h_nr_running += idle_task_delta;
+
+ /* end evaluation on encountering a throttled cfs_rq */
+ if (cfs_rq_throttled(cfs_rq))
+ goto unthrottle_throttle;
+ }
+ for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- if (enqueue)
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+
cfs_rq->h_nr_running += task_delta;
+ cfs_rq->idle_h_nr_running += idle_task_delta;
+
+ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
+ goto unthrottle_throttle;
+
+ /*
+ * One parent has been throttled and cfs_rq removed from the
+ * list. Add it back to not break the leaf list.
+ */
+ if (throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+ }
+
+ /* At this point se is NULL and we are at root level*/
+ add_nr_running(rq, task_delta);
+
+unthrottle_throttle:
+ /*
+ * The cfs_rq_throttled() breaks in the above iteration can result in
+ * incomplete leaf list maintenance, resulting in triggering the
+ * assertion below.
+ */
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (list_add_leaf_cfs_rq(cfs_rq))
break;
}
assert_list_leaf_cfs_rq(rq);
- if (!se)
- add_nr_running(rq, task_delta);
-
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
}
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
- u64 remaining, u64 expires)
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
{
struct cfs_rq *cfs_rq;
u64 runtime;
@@ -4663,7 +4657,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
remaining -= runtime;
cfs_rq->runtime_remaining += runtime;
- cfs_rq->runtime_expires = expires;
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0)
@@ -4688,7 +4681,7 @@ next:
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
- u64 runtime, runtime_expires;
+ u64 runtime;
int throttled;
/* no need to continue the timer with no bandwidth constraint */
@@ -4716,8 +4709,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
- runtime_expires = cfs_b->runtime_expires;
-
/*
* This check is repeated as we are holding onto the new bandwidth while
* we unthrottle. This can potentially race with an unthrottled group
@@ -4730,8 +4721,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
cfs_b->distribute_running = 1;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- runtime = distribute_cfs_runtime(cfs_b, runtime,
- runtime_expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->distribute_running = 0;
@@ -4808,8 +4798,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
return;
raw_spin_lock(&cfs_b->lock);
- if (cfs_b->quota != RUNTIME_INF &&
- cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+ if (cfs_b->quota != RUNTIME_INF) {
cfs_b->runtime += slack_runtime;
/* we are under rq->lock, defer unthrottling using a timer */
@@ -4842,7 +4831,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
unsigned long flags;
- u64 expires;
/* confirm we're still not at a refresh boundary */
raw_spin_lock_irqsave(&cfs_b->lock, flags);
@@ -4859,7 +4847,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- expires = cfs_b->runtime_expires;
if (runtime)
cfs_b->distribute_running = 1;
@@ -4868,11 +4855,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (!runtime)
return;
- runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- if (expires == cfs_b->runtime_expires)
- lsub_positive(&cfs_b->runtime, runtime);
+ lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
@@ -4965,29 +4951,37 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
- new = (old * 147) / 128; /* ~115% */
- new = min(new, max_cfs_quota_period);
-
- cfs_b->period = ns_to_ktime(new);
-
- /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
- cfs_b->quota *= new;
- cfs_b->quota = div64_u64(cfs_b->quota, old);
-
- pr_warn_ratelimited(
- "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
- smp_processor_id(),
- div_u64(new, NSEC_PER_USEC),
- div_u64(cfs_b->quota, NSEC_PER_USEC));
+ /*
+ * Grow period by a factor of 2 to avoid losing precision.
+ * Precision loss in the quota/period ratio can cause __cfs_schedulable
+ * to fail.
+ */
+ new = old * 2;
+ if (new < max_cfs_quota_period) {
+ cfs_b->period = ns_to_ktime(new);
+ cfs_b->quota *= 2;
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ } else {
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(old, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ }
/* reset count so we don't come right back in here */
count = 0;
}
-
- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;
@@ -5019,17 +5013,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
- u64 overrun;
-
lockdep_assert_held(&cfs_b->lock);
if (cfs_b->period_active)
return;
cfs_b->period_active = 1;
- overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
- cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
- cfs_b->expires_seq++;
+ hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
}
@@ -5169,7 +5159,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
if (delta < 0) {
if (rq->curr == p)
- resched_curr(rq);
+ resched_curr_lazy(rq);
return;
}
hrtick_start(rq, delta);
@@ -5229,6 +5219,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ int idle_h_nr_running = task_has_idle_policy(p);
/*
* The code below (indirectly) updates schedutil which looks at
@@ -5252,30 +5243,38 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);
- /*
- * end evaluation on encountering a throttled cfs_rq
- *
- * note: in the case of encountering a throttled cfs_rq we will
- * post the final h_nr_running increment below.
- */
- if (cfs_rq_throttled(cfs_rq))
- break;
cfs_rq->h_nr_running++;
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
+
+ /* end evaluation on encountering a throttled cfs_rq */
+ if (cfs_rq_throttled(cfs_rq))
+ goto enqueue_throttle;
flags = ENQUEUE_WAKEUP;
}
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
+
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_cfs_group(se);
+
cfs_rq->h_nr_running++;
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- break;
+ goto enqueue_throttle;
- update_load_avg(cfs_rq, se, UPDATE_TG);
- update_cfs_group(se);
+ /*
+ * One parent has been throttled and cfs_rq removed from the
+ * list. Add it back to not break the leaf list.
+ */
+ if (throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
}
+enqueue_throttle:
if (!se) {
add_nr_running(rq, 1);
/*
@@ -5329,20 +5328,18 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
+ int idle_h_nr_running = task_has_idle_policy(p);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
- /*
- * end evaluation on encountering a throttled cfs_rq
- *
- * note: in the case of encountering a throttled cfs_rq we will
- * post the final h_nr_running decrement below.
- */
- if (cfs_rq_throttled(cfs_rq))
- break;
cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+
+ /* end evaluation on encountering a throttled cfs_rq */
+ if (cfs_rq_throttled(cfs_rq))
+ goto dequeue_throttle;
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -5361,15 +5358,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
+
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_cfs_group(se);
+
cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- break;
+ goto dequeue_throttle;
- update_load_avg(cfs_rq, se, UPDATE_TG);
- update_cfs_group(se);
}
+dequeue_throttle:
if (!se)
sub_nr_running(rq, 1);
@@ -5890,7 +5892,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_span(group),
- &p->cpus_allowed))
+ p->cpus_ptr))
continue;
local_group = cpumask_test_cpu(this_cpu,
@@ -6022,7 +6024,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -6062,7 +6064,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
{
int new_cpu = cpu;
- if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
return prev_cpu;
/*
@@ -6179,7 +6181,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
if (!test_idle_cores(target, false))
return -1;
- cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
for_each_cpu_wrap(core, cpus, target) {
bool idle = true;
@@ -6213,7 +6215,7 @@ static int select_idle_smt(struct task_struct *p, int target)
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (available_idle_cpu(cpu))
return cpu;
@@ -6243,6 +6245,7 @@ static inline int select_idle_smt(struct task_struct *p, int target)
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
u64 time, cost;
@@ -6273,11 +6276,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = local_clock();
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
return -1;
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
- continue;
if (available_idle_cpu(cpu))
break;
}
@@ -6313,7 +6316,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
available_idle_cpu(recent_used_cpu) &&
- cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
/*
* Replace recent_used_cpu with prev as it is a potential
* candidate for the next wake:
@@ -6659,7 +6662,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
int max_spare_cap_cpu = -1;
for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
/* Skip CPUs that will be overutilized. */
@@ -6748,7 +6751,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed);
+ cpumask_test_cpu(cpu, p->cpus_ptr);
}
rcu_read_lock();
@@ -7009,7 +7012,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
preempt:
- resched_curr(rq);
+ resched_curr_lazy(rq);
/*
* Only set the backward buddy when the current task is still
* on the rq. This can happen when a wakeup gets interleaved
@@ -7504,14 +7507,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
- * 2) cannot be migrated to this CPU due to cpus_allowed, or
+ * 2) cannot be migrated to this CPU due to cpus_ptr, or
* 3) running (obviously), or
* 4) are cache-hot on their current CPU.
*/
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
- if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
+ if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
@@ -7531,7 +7534,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Prevent to re-select dst_cpu via env's CPUs: */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
env->flags |= LBF_DST_PINNED;
env->new_dst_cpu = cpu;
break;
@@ -7659,7 +7662,15 @@ static int detach_tasks(struct lb_env *env)
if (!can_migrate_task(p, env))
goto next;
- load = task_h_load(p);
+ /*
+ * Depending of the number of CPUs and tasks and the
+ * cgroup hierarchy, task_h_load() can return a null
+ * value. Make sure that env->imbalance decreases
+ * otherwise detach_tasks() will stop only after
+ * detaching up to loop_max tasks.
+ */
+ load = max_t(unsigned long, task_h_load(p), 1);
+
if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
goto next;
@@ -7754,6 +7765,7 @@ static void attach_tasks(struct lb_env *env)
rq_unlock(env->dst_rq, &rf);
}
+#ifdef CONFIG_NO_HZ_COMMON
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
{
if (cfs_rq->avg.load_avg)
@@ -7781,6 +7793,41 @@ static inline bool others_have_blocked(struct rq *rq)
return false;
}
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
+ rq->last_blocked_load_update_tick = jiffies;
+
+ if (!has_blocked)
+ rq->has_blocked_load = 0;
+}
+#else
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
+static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
+#endif
+
+static bool __update_blocked_others(struct rq *rq, bool *done)
+{
+ const struct sched_class *curr_class;
+ u64 now = rq_clock_pelt(rq);
+ bool decayed;
+
+ /*
+ * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
+ * DL and IRQ signals have been updated before updating CFS.
+ */
+ curr_class = rq->curr->sched_class;
+
+ decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
+ update_irq_load_avg(rq, 0);
+
+ if (others_have_blocked(rq))
+ *done = false;
+
+ return decayed;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7800,16 +7847,11 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
return true;
}
-static void update_blocked_averages(int cpu)
+static bool __update_blocked_fair(struct rq *rq, bool *done)
{
- struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
- const struct sched_class *curr_class;
- struct rq_flags rf;
- bool done = true;
-
- rq_lock_irqsave(rq, &rf);
- update_rq_clock(rq);
+ bool decayed = false;
+ int cpu = cpu_of(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
@@ -7818,9 +7860,13 @@ static void update_blocked_averages(int cpu)
for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
- if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq, 0);
+ if (cfs_rq == &rq->cfs)
+ decayed = true;
+ }
+
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
@@ -7835,23 +7881,10 @@ static void update_blocked_averages(int cpu)
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
- done = false;
+ *done = false;
}
- curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
- /* Don't need periodic decay once load/util_avg are null */
- if (others_have_blocked(rq))
- done = false;
-
-#ifdef CONFIG_NO_HZ_COMMON
- rq->last_blocked_load_update_tick = jiffies;
- if (done)
- rq->has_blocked_load = 0;
-#endif
- rq_unlock_irqrestore(rq, &rf);
+ return decayed;
}
/*
@@ -7901,27 +7934,16 @@ static unsigned long task_h_load(struct task_struct *p)
cfs_rq_load_avg(cfs_rq) + 1);
}
#else
-static inline void update_blocked_averages(int cpu)
+static bool __update_blocked_fair(struct rq *rq, bool *done)
{
- struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
- const struct sched_class *curr_class;
- struct rq_flags rf;
+ bool decayed;
- rq_lock_irqsave(rq, &rf);
- update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+ decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+ if (cfs_rq_has_blocked(cfs_rq))
+ *done = false;
- curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
-#ifdef CONFIG_NO_HZ_COMMON
- rq->last_blocked_load_update_tick = jiffies;
- if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
- rq->has_blocked_load = 0;
-#endif
- rq_unlock_irqrestore(rq, &rf);
+ return decayed;
}
static unsigned long task_h_load(struct task_struct *p)
@@ -7930,6 +7952,24 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif
+static void update_blocked_averages(int cpu)
+{
+ bool decayed = false, done = true;
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+
+ decayed |= __update_blocked_others(rq, &done);
+ decayed |= __update_blocked_fair(rq, &done);
+
+ update_blocked_load_status(rq, !done);
+ if (decayed)
+ cpufreq_update_util(rq, 0);
+ rq_unlock_irqrestore(rq, &rf);
+}
+
/********** Helpers for find_busiest_group ************************/
/*
@@ -8158,7 +8198,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
/*
* Group imbalance indicates (and tries to solve) the problem where balancing
- * groups is inadequate due to ->cpus_allowed constraints.
+ * groups is inadequate due to ->cpus_ptr constraints.
*
* Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
* cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
@@ -8827,7 +8867,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
/*
* If the busiest group is imbalanced the below checks don't
* work because they assume all things are equal, which typically
- * isn't true due to cpus_allowed constraints and the like.
+ * isn't true due to cpus_ptr constraints and the like.
*/
if (busiest->group_type == group_imbalanced)
goto force_balance;
@@ -9269,7 +9309,7 @@ more_balance:
* if the curr task on busiest CPU can't be
* moved to this_cpu:
*/
- if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
+ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
@@ -9633,7 +9673,12 @@ static void kick_ilb(unsigned int flags)
{
int ilb_cpu;
- nohz.next_balance++;
+ /*
+ * Increase nohz.next_balance only when if full ilb is triggered but
+ * not if we only update stats.
+ */
+ if (flags & NOHZ_BALANCE_KICK)
+ nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb();
@@ -9952,6 +9997,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
}
}
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the CPU is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ nohz.next_balance = next_balance;
+
/* Newly idle CPU doesn't need an update */
if (idle != CPU_NEWLY_IDLE) {
update_blocked_averages(this_cpu);
@@ -9972,14 +10025,6 @@ abort:
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
- /*
- * next_balance will be updated only when there is a need.
- * When the CPU is attached to null domain for ex, it will not be
- * updated.
- */
- if (likely(update_next_balance))
- nohz.next_balance = next_balance;
-
return ret;
}
@@ -10282,7 +10327,7 @@ static void task_fork_fair(struct task_struct *p)
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
+ resched_curr_lazy(rq);
}
se->vruntime -= cfs_rq->min_vruntime;
@@ -10306,7 +10351,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
*/
if (rq->curr == p) {
if (p->prio > oldprio)
- resched_curr(rq);
+ resched_curr_lazy(rq);
} else
check_preempt_curr(rq, p, 0);
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 858589b83377..4d020e6b1501 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,11 +46,19 @@ SCHED_FEAT(LB_BIAS, false)
*/
SCHED_FEAT(NONTASK_CAPACITY, true)
+#ifdef CONFIG_PREEMPT_RT_FULL
+SCHED_FEAT(TTWU_QUEUE, false)
+# ifdef CONFIG_PREEMPT_LAZY
+SCHED_FEAT(PREEMPT_LAZY, true)
+# endif
+#else
+
/*
* Queue remote wakeups on the target CPU and process them
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
SCHED_FEAT(TTWU_QUEUE, true)
+#endif
/*
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 28a516575c18..de22da666ac7 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -231,16 +231,11 @@ static inline int calc_load_read_idx(void)
return calc_load_idx & 1;
}
-void calc_load_nohz_start(void)
+static void calc_load_nohz_fold(struct rq *rq)
{
- struct rq *this_rq = this_rq();
long delta;
- /*
- * We're going into NO_HZ mode, if there's any pending delta, fold it
- * into the pending NO_HZ delta.
- */
- delta = calc_load_fold_active(this_rq, 0);
+ delta = calc_load_fold_active(rq, 0);
if (delta) {
int idx = calc_load_write_idx();
@@ -248,6 +243,24 @@ void calc_load_nohz_start(void)
}
}
+void calc_load_nohz_start(void)
+{
+ /*
+ * We're going into NO_HZ mode, if there's any pending delta, fold it
+ * into the pending NO_HZ delta.
+ */
+ calc_load_nohz_fold(this_rq());
+}
+
+/*
+ * Keep track of the load for NOHZ_FULL, must be called between
+ * calc_load_nohz_{start,stop}().
+ */
+void calc_load_nohz_remote(struct rq *rq)
+{
+ calc_load_nohz_fold(rq);
+}
+
void calc_load_nohz_stop(void)
{
struct rq *this_rq = this_rq();
@@ -268,7 +281,7 @@ void calc_load_nohz_stop(void)
this_rq->calc_load_update += LOAD_FREQ;
}
-static long calc_load_nohz_fold(void)
+static long calc_load_nohz_read(void)
{
int idx = calc_load_read_idx();
long delta = 0;
@@ -323,7 +336,7 @@ static void calc_global_nohz(void)
}
#else /* !CONFIG_NO_HZ_COMMON */
-static inline long calc_load_nohz_fold(void) { return 0; }
+static inline long calc_load_nohz_read(void) { return 0; }
static inline void calc_global_nohz(void) { }
#endif /* CONFIG_NO_HZ_COMMON */
@@ -346,7 +359,7 @@ void calc_global_load(unsigned long ticks)
/*
* Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
*/
- delta = calc_load_nohz_fold();
+ delta = calc_load_nohz_read();
if (delta)
atomic_long_add(delta, &calc_load_tasks);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 517e3719027e..9154e745f097 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -185,7 +185,8 @@ static void group_init(struct psi_group *group)
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
- group->avg_next_update = sched_clock() + psi_period;
+ group->avg_last_update = sched_clock();
+ group->avg_next_update = group->avg_last_update + psi_period;
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
@@ -481,7 +482,7 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
u32 remaining;
remaining = win->size - elapsed;
- growth += div_u64(win->prev_growth * remaining, win->size);
+ growth += div64_u64(win->prev_growth * remaining, win->size);
}
return growth;
@@ -1198,6 +1199,9 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
+ if (!nbytes)
+ return -EINVAL;
+
buf_size = min(nbytes, sizeof(buf));
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1e6b909dca36..a910c8de0d49 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,8 @@
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+/* More than 4 hours if BW_SHIFT equals 20. */
+static const u64 max_rt_runtime = MAX_BW;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -45,8 +47,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
raw_spin_lock_init(&rt_b->rt_runtime_lock);
- hrtimer_init(&rt_b->rt_period_timer,
- CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_HARD);
rt_b->rt_period_timer.function = sched_rt_period_timer;
}
@@ -1614,7 +1616,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
@@ -1751,7 +1753,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* Also make sure that it wasn't scheduled on its rq.
*/
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
+ !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
task_running(rq, task) ||
!rt_task(task) ||
!task_on_rq_queued(task))) {
@@ -2522,6 +2524,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
if (rt_period == 0)
return -EINVAL;
+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -2643,7 +2651,9 @@ static int sched_rt_global_validate(void)
return -EINVAL;
if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
- (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+ ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
+ ((u64)sysctl_sched_rt_runtime *
+ NSEC_PER_USEC > max_rt_runtime)))
return -EINVAL;
return 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b52ed1ada0be..62d55d5a4fbf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -124,7 +124,13 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
#ifdef CONFIG_64BIT
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
-# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT)
+# define scale_load_down(w) \
+({ \
+ unsigned long __w = (w); \
+ if (__w) \
+ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
+ __w; \
+})
#else
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) (w)
@@ -341,8 +347,6 @@ struct cfs_bandwidth {
u64 quota;
u64 runtime;
s64 hierarchical_quota;
- u64 runtime_expires;
- int expires_seq;
short idle;
short period_active;
@@ -489,7 +493,8 @@ struct cfs_rq {
struct load_weight load;
unsigned long runnable_weight;
unsigned int nr_running;
- unsigned int h_nr_running;
+ unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int idle_h_nr_running; /* SCHED_IDLE */
u64 exec_clock;
u64 min_vruntime;
@@ -562,8 +567,6 @@ struct cfs_rq {
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
- int expires_seq;
- u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_clock;
@@ -952,6 +955,10 @@ struct rq {
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
#endif
+
+#if defined(CONFIG_PREEMPT_RT_BASE) && defined(CONFIG_SMP)
+ int nr_pinned;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1591,6 +1598,7 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
#define WF_FORK 0x02 /* Child wakeup after fork */
#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
+#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1785,6 +1793,15 @@ extern void reweight_task(struct task_struct *p, int prio);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
+#ifdef CONFIG_PREEMPT_LAZY
+extern void resched_curr_lazy(struct rq *rq);
+#else
+static inline void resched_curr_lazy(struct rq *rq)
+{
+ resched_curr(rq);
+}
+#endif
+
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
@@ -1797,6 +1814,8 @@ extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
+#define MAX_BW_BITS (64 - BW_SHIFT)
+#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index e83a3f8449f6..c58068d2ee06 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -32,6 +32,25 @@ void swake_up_locked(struct swait_queue_head *q)
}
EXPORT_SYMBOL(swake_up_locked);
+void swake_up_all_locked(struct swait_queue_head *q)
+{
+ struct swait_queue *curr;
+ int wakes = 0;
+
+ while (!list_empty(&q->task_list)) {
+
+ curr = list_first_entry(&q->task_list, typeof(*curr),
+ task_list);
+ wake_up_process(curr->task);
+ list_del_init(&curr->task_list);
+ wakes++;
+ }
+ if (pm_in_action)
+ return;
+ WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
+}
+EXPORT_SYMBOL(swake_up_all_locked);
+
void swake_up_one(struct swait_queue_head *q)
{
unsigned long flags;
@@ -51,6 +70,7 @@ void swake_up_all(struct swait_queue_head *q)
struct swait_queue *curr;
LIST_HEAD(tmp);
+ WARN_ON(irqs_disabled());
raw_spin_lock_irq(&q->lock);
list_splice_init(&q->task_list, &tmp);
while (!list_empty(&tmp)) {
@@ -69,7 +89,7 @@ void swake_up_all(struct swait_queue_head *q)
}
EXPORT_SYMBOL(swake_up_all);
-static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
{
wait->task = current;
if (list_empty(&wait->task_list))
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f53f89df837d..aa7e33d467da 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -502,6 +502,7 @@ static int init_rootdomain(struct root_domain *rd)
rd->rto_cpu = -1;
raw_spin_lock_init(&rd->rto_lock);
init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+ rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ;
#endif
init_dl_bw(&rd->dl_bw);
@@ -1332,7 +1333,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd_flags = (*tl->sd_flags)();
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
"wrong sd_flags in topology description\n"))
- sd_flags &= ~TOPOLOGY_SD_FLAGS;
+ sd_flags &= TOPOLOGY_SD_FLAGS;
/* Apply detected topology flags */
sd_flags |= dflags;
@@ -1872,6 +1873,42 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
}
/*
+ * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
+ * any two given CPUs at this (non-NUMA) topology level.
+ */
+static bool topology_span_sane(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map, int cpu)
+{
+ int i;
+
+ /* NUMA levels are allowed to overlap */
+ if (tl->flags & SDTL_OVERLAP)
+ return true;
+
+ /*
+ * Non-NUMA levels cannot partially overlap - they must be either
+ * completely equal or completely disjoint. Otherwise we can end up
+ * breaking the sched_group lists - i.e. a later get_group() pass
+ * breaks the linking done for an earlier span.
+ */
+ for_each_cpu(i, cpu_map) {
+ if (i == cpu)
+ continue;
+ /*
+ * We should 'and' all those masks with 'cpu_map' to exactly
+ * match the topology we're about to build, but that can only
+ * remove CPUs, which only lessens our ability to detect
+ * overlaps
+ */
+ if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
+ cpumask_intersects(tl->mask(cpu), tl->mask(i)))
+ return false;
+ }
+
+ return true;
+}
+
+/*
* Find the sched_domain_topology_level where all CPU capacities are visible
* for all CPUs.
*/
@@ -1937,7 +1974,7 @@ next_level:
static int
build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
{
- enum s_alloc alloc_state;
+ enum s_alloc alloc_state = sa_none;
struct sched_domain *sd;
struct s_data d;
struct rq *rq = NULL;
@@ -1945,6 +1982,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
struct sched_domain_topology_level *tl_asym;
bool has_asym = false;
+ if (WARN_ON(cpumask_empty(cpu_map)))
+ goto error;
+
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
@@ -1964,6 +2004,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
has_asym = true;
}
+ if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
+ goto error;
+
sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
if (tl == sched_domain_topology)
@@ -2015,7 +2058,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
rcu_read_unlock();
if (has_asym)
- static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
+ static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
if (rq && sched_debug_enabled) {
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
@@ -2110,8 +2153,12 @@ int sched_init_domains(const struct cpumask *cpu_map)
*/
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
+ unsigned int cpu = cpumask_any(cpu_map);
int i;
+ if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
+ static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
+
rcu_read_lock();
for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 811b4a86cdf6..e914d72b3290 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -42,6 +42,14 @@
#include <linux/uaccess.h>
#include <linux/anon_inodes.h>
+/*
+ * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
+ * wrong direction flag in the ioctl number. This is the broken one,
+ * which the kernel needs to keep supporting until all userspaces stop
+ * using the wrong command number.
+ */
+#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64)
+
enum notify_state {
SECCOMP_NOTIFY_INIT,
SECCOMP_NOTIFY_SENT,
@@ -1161,6 +1169,7 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
return seccomp_notify_recv(filter, buf);
case SECCOMP_IOCTL_NOTIF_SEND:
return seccomp_notify_send(filter, buf);
+ case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
case SECCOMP_IOCTL_NOTIF_ID_VALID:
return seccomp_notify_id_valid(filter, buf);
default:
@@ -1198,6 +1207,7 @@ static const struct file_operations seccomp_notify_ops = {
.poll = seccomp_notify_poll,
.release = seccomp_notify_release,
.unlocked_ioctl = seccomp_notify_ioctl,
+ .compat_ioctl = seccomp_notify_ioctl,
};
static struct file *init_listener(struct seccomp_filter *filter)
diff --git a/kernel/signal.c b/kernel/signal.c
index 07da8def4329..0f39cc1c6ac1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -20,6 +20,7 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/rt.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
@@ -398,37 +399,62 @@ void task_join_group_stop(struct task_struct *task)
}
}
+static inline struct sigqueue *get_task_cache(struct task_struct *t)
+{
+ struct sigqueue *q = t->sigqueue_cache;
+
+ if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
+ return NULL;
+ return q;
+}
+
+static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
+{
+ if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
+ return 0;
+ return 1;
+}
+
/*
* allocate a new signal queue record
* - this may be called without locks if and only if t == current, otherwise an
* appropriate lock must be held to stop the target task from exiting
*/
static struct sigqueue *
-__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
+__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
+ int override_rlimit, int fromslab)
{
struct sigqueue *q = NULL;
struct user_struct *user;
+ int sigpending;
/*
* Protect access to @t credentials. This can go away when all
* callers hold rcu read lock.
+ *
+ * NOTE! A pending signal will hold on to the user refcount,
+ * and we get/put the refcount only when the sigpending count
+ * changes from/to zero.
*/
rcu_read_lock();
- user = get_uid(__task_cred(t)->user);
- atomic_inc(&user->sigpending);
+ user = __task_cred(t)->user;
+ sigpending = atomic_inc_return(&user->sigpending);
+ if (sigpending == 1)
+ get_uid(user);
rcu_read_unlock();
- if (override_rlimit ||
- atomic_read(&user->sigpending) <=
- task_rlimit(t, RLIMIT_SIGPENDING)) {
- q = kmem_cache_alloc(sigqueue_cachep, flags);
+ if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
+ if (!fromslab)
+ q = get_task_cache(t);
+ if (!q)
+ q = kmem_cache_alloc(sigqueue_cachep, flags);
} else {
print_dropped_signal(sig);
}
if (unlikely(q == NULL)) {
- atomic_dec(&user->sigpending);
- free_uid(user);
+ if (atomic_dec_and_test(&user->sigpending))
+ free_uid(user);
} else {
INIT_LIST_HEAD(&q->list);
q->flags = 0;
@@ -438,15 +464,37 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
return q;
}
+static struct sigqueue *
+__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
+ int override_rlimit)
+{
+ return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
+}
+
static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
- atomic_dec(&q->user->sigpending);
- free_uid(q->user);
+ if (atomic_dec_and_test(&q->user->sigpending))
+ free_uid(q->user);
kmem_cache_free(sigqueue_cachep, q);
}
+static void sigqueue_free_current(struct sigqueue *q)
+{
+ struct user_struct *up;
+
+ if (q->flags & SIGQUEUE_PREALLOC)
+ return;
+
+ up = q->user;
+ if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
+ atomic_dec(&up->sigpending);
+ free_uid(up);
+ } else
+ __sigqueue_free(q);
+}
+
void flush_sigqueue(struct sigpending *queue)
{
struct sigqueue *q;
@@ -460,6 +508,21 @@ void flush_sigqueue(struct sigpending *queue)
}
/*
+ * Called from __exit_signal. Flush tsk->pending and
+ * tsk->sigqueue_cache
+ */
+void flush_task_sigqueue(struct task_struct *tsk)
+{
+ struct sigqueue *q;
+
+ flush_sigqueue(&tsk->pending);
+
+ q = get_task_cache(tsk);
+ if (q)
+ kmem_cache_free(sigqueue_cachep, q);
+}
+
+/*
* Flush all pending signals for this kthread.
*/
void flush_signals(struct task_struct *t)
@@ -583,7 +646,7 @@ still_pending:
(info->si_code == SI_TIMER) &&
(info->si_sys_private);
- __sigqueue_free(first);
+ sigqueue_free_current(first);
} else {
/*
* Ok, it wasn't in the queue. This must be
@@ -620,6 +683,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in
bool resched_timer = false;
int signr;
+ WARN_ON_ONCE(tsk != current);
+
/* We only dequeue private signals from ourselves, we don't let
* signalfd steal them
*/
@@ -1287,8 +1352,8 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p
* We don't want to have recursive SIGSEGV's etc, for example,
* that is why we also clear SIGNAL_UNKILLABLE.
*/
-int
-force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t)
+static int
+do_force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t)
{
unsigned long int flags;
int ret, blocked, ignored;
@@ -1317,6 +1382,39 @@ force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t)
return ret;
}
+int force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t)
+{
+/*
+ * On some archs, PREEMPT_RT has to delay sending a signal from a trap
+ * since it can not enable preemption, and the signal code's spin_locks
+ * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
+ * send the signal on exit of the trap.
+ */
+#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
+ if (in_atomic()) {
+ if (WARN_ON_ONCE(t != current))
+ return 0;
+ if (WARN_ON_ONCE(t->forced_info.si_signo))
+ return 0;
+
+ if (is_si_special(info)) {
+ WARN_ON_ONCE(info != SEND_SIG_PRIV);
+ t->forced_info.si_signo = sig;
+ t->forced_info.si_errno = 0;
+ t->forced_info.si_code = SI_KERNEL;
+ t->forced_info.si_pid = 0;
+ t->forced_info.si_uid = 0;
+ } else {
+ t->forced_info = *info;
+ }
+
+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ return 0;
+ }
+#endif
+ return do_force_sig_info(sig, info, t);
+}
+
/*
* Nuke all other threads in the group.
*/
@@ -1486,15 +1584,15 @@ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
unsigned long flags;
int ret = -EINVAL;
+ if (!valid_signal(sig))
+ return ret;
+
clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = errno;
info.si_code = SI_ASYNCIO;
*((sigval_t *)&info.si_pid) = addr;
- if (!valid_signal(sig))
- return ret;
-
rcu_read_lock();
p = pid_task(pid, PIDTYPE_PID);
if (!p) {
@@ -1767,7 +1865,8 @@ EXPORT_SYMBOL(kill_pid);
*/
struct sigqueue *sigqueue_alloc(void)
{
- struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
+ /* Preallocated sigqueue objects always from the slabcache ! */
+ struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
if (q)
q->flags |= SIGQUEUE_PREALLOC;
@@ -1876,7 +1975,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
* This is only possible if parent == real_parent.
* Check if it has changed security domain.
*/
- if (tsk->parent_exec_id != tsk->parent->self_exec_id)
+ if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id))
sig = SIGCHLD;
}
@@ -1938,8 +2037,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
sig = 0;
}
+ /*
+ * Send with __send_signal as si_pid and si_uid are in the
+ * parent's namespaces.
+ */
if (valid_signal(sig) && sig)
- __group_send_sig_info(sig, &info, tsk->parent);
+ __send_signal(sig, &info, tsk->parent, PIDTYPE_TGID, false);
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);
@@ -2147,16 +2250,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
if (gstop_done && ptrace_reparented(current))
do_notify_parent_cldstop(current, false, why);
- /*
- * Don't want to allow preemption here, because
- * sys_ptrace() needs this task to be inactive.
- *
- * XXX: implement read_unlock_no_resched().
- */
- preempt_disable();
read_unlock(&tasklist_lock);
cgroup_enter_frozen();
- preempt_enable_no_resched();
freezable_schedule();
cgroup_leave_frozen(true);
} else {
@@ -2907,80 +3002,49 @@ EXPORT_SYMBOL(sigprocmask);
*
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and
* epoll_pwait where a new sigmask is passed from userland for the syscalls.
+ *
+ * Note that it does set_restore_sigmask() in advance, so it must be always
+ * paired with restore_saved_sigmask_unless() before return from syscall.
*/
-int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
- sigset_t *oldset, size_t sigsetsize)
+int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
- if (!usigmask)
- return 0;
+ sigset_t kmask;
+ if (!umask)
+ return 0;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
- if (copy_from_user(set, usigmask, sizeof(sigset_t)))
+ if (copy_from_user(&kmask, umask, sizeof(sigset_t)))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(&kmask);
return 0;
}
-EXPORT_SYMBOL(set_user_sigmask);
#ifdef CONFIG_COMPAT
-int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
- sigset_t *set, sigset_t *oldset,
+int set_compat_user_sigmask(const compat_sigset_t __user *umask,
size_t sigsetsize)
{
- if (!usigmask)
- return 0;
+ sigset_t kmask;
+ if (!umask)
+ return 0;
if (sigsetsize != sizeof(compat_sigset_t))
return -EINVAL;
- if (get_compat_sigset(set, usigmask))
+ if (get_compat_sigset(&kmask, umask))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(&kmask);
return 0;
}
-EXPORT_SYMBOL(set_compat_user_sigmask);
#endif
-/*
- * restore_user_sigmask:
- * usigmask: sigmask passed in from userland.
- * sigsaved: saved sigmask when the syscall started and changed the sigmask to
- * usigmask.
- *
- * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
- * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
- */
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved,
- bool interrupted)
-{
-
- if (!usigmask)
- return;
- /*
- * When signals are pending, do not restore them here.
- * Restoring sigmask here can lead to delivering signals that the above
- * syscalls are intended to block because of the sigmask passed in.
- */
- if (interrupted) {
- current->saved_sigmask = *sigsaved;
- set_restore_sigmask();
- return;
- }
-
- /*
- * This is needed because the fast syscall return path does not restore
- * saved_sigmask when signals are not pending.
- */
- set_current_blocked(sigsaved);
-}
-EXPORT_SYMBOL(restore_user_sigmask);
-
/**
* sys_rt_sigprocmask - change the list of currently blocked signals
* @how: whether to add, remove, or set signals
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a6b81c6b6bff..6080c9328df1 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -25,6 +25,9 @@
#include <linux/smpboot.h>
#include <linux/tick.h>
#include <linux/irq.h>
+#ifdef CONFIG_PREEMPT_RT_FULL
+#include <linux/locallock.h>
+#endif
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
@@ -102,6 +105,104 @@ static bool ksoftirqd_running(unsigned long pending)
* softirq and whether we just have bh disabled.
*/
+#ifdef CONFIG_PREEMPT_RT_FULL
+static DEFINE_LOCAL_IRQ_LOCK(bh_lock);
+static DEFINE_PER_CPU(long, softirq_counter);
+
+void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
+{
+ unsigned long __maybe_unused flags;
+ long soft_cnt;
+
+ WARN_ON_ONCE(in_irq());
+ if (!in_atomic()) {
+ local_lock(bh_lock);
+ rcu_read_lock();
+ }
+ soft_cnt = this_cpu_inc_return(softirq_counter);
+ WARN_ON_ONCE(soft_cnt == 0);
+ current->softirq_count += SOFTIRQ_DISABLE_OFFSET;
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ local_irq_save(flags);
+ if (soft_cnt == 1)
+ trace_softirqs_off(ip);
+ local_irq_restore(flags);
+#endif
+}
+EXPORT_SYMBOL(__local_bh_disable_ip);
+
+static void local_bh_disable_rt(void)
+{
+ local_bh_disable();
+}
+
+void _local_bh_enable(void)
+{
+ unsigned long __maybe_unused flags;
+ long soft_cnt;
+
+ soft_cnt = this_cpu_dec_return(softirq_counter);
+ WARN_ON_ONCE(soft_cnt < 0);
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ local_irq_save(flags);
+ if (soft_cnt == 0)
+ trace_softirqs_on(_RET_IP_);
+ local_irq_restore(flags);
+#endif
+
+ current->softirq_count -= SOFTIRQ_DISABLE_OFFSET;
+ if (!in_atomic()) {
+ rcu_read_unlock();
+ local_unlock(bh_lock);
+ }
+}
+
+void _local_bh_enable_rt(void)
+{
+ _local_bh_enable();
+}
+
+void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
+{
+ u32 pending;
+ long count;
+
+ WARN_ON_ONCE(in_irq());
+ lockdep_assert_irqs_enabled();
+
+ local_irq_disable();
+ count = this_cpu_read(softirq_counter);
+
+ if (unlikely(count == 1)) {
+ pending = local_softirq_pending();
+ if (pending && !ksoftirqd_running(pending)) {
+ if (!in_atomic())
+ __do_softirq();
+ else
+ wakeup_softirqd();
+ }
+ trace_softirqs_on(ip);
+ }
+ count = this_cpu_dec_return(softirq_counter);
+ WARN_ON_ONCE(count < 0);
+ local_irq_enable();
+
+ if (!in_atomic()) {
+ rcu_read_unlock();
+ local_unlock(bh_lock);
+ }
+
+ current->softirq_count -= SOFTIRQ_DISABLE_OFFSET;
+ preempt_check_resched();
+}
+EXPORT_SYMBOL(__local_bh_enable_ip);
+
+#else
+static void local_bh_disable_rt(void) { }
+static void _local_bh_enable_rt(void) { }
+
/*
* This one is for softirq.c-internal use,
* where hardirqs are disabled legitimately:
@@ -196,6 +297,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
preempt_check_resched();
}
EXPORT_SYMBOL(__local_bh_enable_ip);
+#endif
/*
* We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
@@ -266,7 +368,11 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
pending = local_softirq_pending();
account_irq_enter_time(current);
+#ifdef CONFIG_PREEMPT_RT_FULL
+ current->softirq_count |= SOFTIRQ_OFFSET;
+#else
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+#endif
in_hardirq = lockdep_softirq_start();
restart:
@@ -300,9 +406,10 @@ restart:
h++;
pending >>= softirq_bit;
}
-
+#ifndef CONFIG_PREEMPT_RT_FULL
if (__this_cpu_read(ksoftirqd) == current)
rcu_softirq_qs();
+#endif
local_irq_disable();
pending = local_softirq_pending();
@@ -316,11 +423,16 @@ restart:
lockdep_softirq_end(in_hardirq);
account_irq_exit_time(current);
+#ifdef CONFIG_PREEMPT_RT_FULL
+ current->softirq_count &= ~SOFTIRQ_OFFSET;
+#else
__local_bh_enable(SOFTIRQ_OFFSET);
+#endif
WARN_ON_ONCE(in_interrupt());
current_restore_flags(old_flags, PF_MEMALLOC);
}
+#ifndef CONFIG_PREEMPT_RT_FULL
asmlinkage __visible void do_softirq(void)
{
__u32 pending;
@@ -338,6 +450,7 @@ asmlinkage __visible void do_softirq(void)
local_irq_restore(flags);
}
+#endif
/*
* Enter an interrupt context.
@@ -358,6 +471,16 @@ void irq_enter(void)
__irq_enter();
}
+#ifdef CONFIG_PREEMPT_RT_FULL
+
+static inline void invoke_softirq(void)
+{
+ if (this_cpu_read(softirq_counter) == 0)
+ wakeup_softirqd();
+}
+
+#else
+
static inline void invoke_softirq(void)
{
if (ksoftirqd_running(local_softirq_pending()))
@@ -383,6 +506,7 @@ static inline void invoke_softirq(void)
wakeup_softirqd();
}
}
+#endif
static inline void tick_irq_exit(void)
{
@@ -420,6 +544,27 @@ void irq_exit(void)
/*
* This function must run with irqs disabled!
*/
+#ifdef CONFIG_PREEMPT_RT_FULL
+void raise_softirq_irqoff(unsigned int nr)
+{
+ __raise_softirq_irqoff(nr);
+
+ /*
+ * If we're in an hard interrupt we let irq return code deal
+ * with the wakeup of ksoftirqd.
+ */
+ if (in_irq())
+ return;
+ /*
+ * If were are not in BH-disabled section then we have to wake
+ * ksoftirqd.
+ */
+ if (this_cpu_read(softirq_counter) == 0)
+ wakeup_softirqd();
+}
+
+#else
+
inline void raise_softirq_irqoff(unsigned int nr)
{
__raise_softirq_irqoff(nr);
@@ -437,6 +582,8 @@ inline void raise_softirq_irqoff(unsigned int nr)
wakeup_softirqd();
}
+#endif
+
void raise_softirq(unsigned int nr)
{
unsigned long flags;
@@ -564,7 +711,8 @@ void tasklet_kill(struct tasklet_struct *t)
while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
do {
- yield();
+ local_bh_disable();
+ local_bh_enable();
} while (test_bit(TASKLET_STATE_SCHED, &t->state));
}
tasklet_unlock_wait(t);
@@ -594,6 +742,7 @@ static int ksoftirqd_should_run(unsigned int cpu)
static void run_ksoftirqd(unsigned int cpu)
{
+ local_bh_disable_rt();
local_irq_disable();
if (local_softirq_pending()) {
/*
@@ -602,10 +751,12 @@ static void run_ksoftirqd(unsigned int cpu)
*/
__do_softirq();
local_irq_enable();
+ _local_bh_enable_rt();
cond_resched();
return;
}
local_irq_enable();
+ _local_bh_enable_rt();
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -679,6 +830,13 @@ static struct smp_hotplug_thread softirq_threads = {
static __init int spawn_ksoftirqd(void)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ lockdep_set_novalidate_class(per_cpu_ptr(&bh_lock.lock, cpu));
+#endif
+
cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
takeover_tasklets);
BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
@@ -687,6 +845,75 @@ static __init int spawn_ksoftirqd(void)
}
early_initcall(spawn_ksoftirqd);
+#ifdef CONFIG_PREEMPT_RT_FULL
+
+/*
+ * On preempt-rt a softirq running context might be blocked on a
+ * lock. There might be no other runnable task on this CPU because the
+ * lock owner runs on some other CPU. So we have to go into idle with
+ * the pending bit set. Therefor we need to check this otherwise we
+ * warn about false positives which confuses users and defeats the
+ * whole purpose of this test.
+ *
+ * This code is called with interrupts disabled.
+ */
+void softirq_check_pending_idle(void)
+{
+ struct task_struct *tsk = __this_cpu_read(ksoftirqd);
+ static int rate_limit;
+ bool okay = false;
+ u32 warnpending;
+
+ if (rate_limit >= 10)
+ return;
+
+ warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
+ if (!warnpending)
+ return;
+
+ if (!tsk)
+ return;
+ /*
+ * If ksoftirqd is blocked on a lock then we may go idle with pending
+ * softirq.
+ */
+ raw_spin_lock(&tsk->pi_lock);
+ if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING ||
+ (tsk->state == TASK_UNINTERRUPTIBLE && tsk->sleeping_lock)) {
+ okay = true;
+ }
+ raw_spin_unlock(&tsk->pi_lock);
+ if (okay)
+ return;
+ /*
+ * The softirq lock is held in non-atomic context and the owner is
+ * blocking on a lock. It will schedule softirqs once the counter goes
+ * back to zero.
+ */
+ if (this_cpu_read(softirq_counter) > 0)
+ return;
+
+ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
+ warnpending);
+ rate_limit++;
+}
+
+#else
+
+void softirq_check_pending_idle(void)
+{
+ static int ratelimit;
+
+ if (ratelimit < 10 &&
+ (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
+ pr_warn("NOHZ: local_softirq_pending %02x\n",
+ (unsigned int) local_softirq_pending());
+ ratelimit++;
+ }
+}
+
+#endif
+
/*
* [ These __weak aliases are kept in a separate compilation unit, so that
* GCC does not inline them incorrectly. ]
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 899b726c9e98..97c44cff0229 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -141,7 +141,8 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store,
struct stacktrace_cookie c = {
.store = store,
.size = size,
- .skip = skipnr + 1,
+ /* skip this function if they are tracing us */
+ .skip = skipnr + !!(current == tsk),
};
if (!try_get_task_stack(tsk))
@@ -306,7 +307,8 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
struct stack_trace trace = {
.entries = store,
.max_entries = size,
- .skip = skipnr + 1,
+ /* skip this function if they are tracing us */
+ .skip = skipnr + !!(current == task),
};
save_stack_trace_tsk(task, &trace);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2b5a6754646f..fa53a472dd44 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -85,8 +85,11 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
enabled = stopper->enabled;
if (enabled)
__cpu_stop_queue_work(stopper, work, &wakeq);
- else if (work->done)
- cpu_stop_signal_done(work->done);
+ else {
+ work->disabled = true;
+ if (work->done)
+ cpu_stop_signal_done(work->done);
+ }
raw_spin_unlock_irqrestore(&stopper->lock, flags);
wake_up_q(&wakeq);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1beca96fb625..952fc547ea50 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1452,7 +1452,7 @@ static struct ctl_table vm_table[] = {
.procname = "drop_caches",
.data = &sysctl_drop_caches,
.maxlen = sizeof(int),
- .mode = 0644,
+ .mode = 0200,
.proc_handler = drop_caches_sysctl_handler,
.extra1 = &one,
.extra2 = &four,
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 13a0f2e6ebc2..e2ac0e37c4ae 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -554,25 +554,33 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
- struct taskstats *stats;
+ struct taskstats *stats_new, *stats;
- if (sig->stats || thread_group_empty(tsk))
- goto ret;
+ /* Pairs with smp_store_release() below. */
+ stats = smp_load_acquire(&sig->stats);
+ if (stats || thread_group_empty(tsk))
+ return stats;
/* No problem if kmem_cache_zalloc() fails */
- stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
+ stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
spin_lock_irq(&tsk->sighand->siglock);
- if (!sig->stats) {
- sig->stats = stats;
- stats = NULL;
+ stats = sig->stats;
+ if (!stats) {
+ /*
+ * Pairs with smp_store_release() above and order the
+ * kmem_cache_zalloc().
+ */
+ smp_store_release(&sig->stats, stats_new);
+ stats = stats_new;
+ stats_new = NULL;
}
spin_unlock_irq(&tsk->sighand->siglock);
- if (stats)
- kmem_cache_free(taskstats_cache, stats);
-ret:
- return sig->stats;
+ if (stats_new)
+ kmem_cache_free(taskstats_cache, stats_new);
+
+ return stats;
}
/* Send pid data out on exit */
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index bfe0e0656f02..c59218867288 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -88,6 +88,8 @@ static int alarmtimer_rtc_add_device(struct device *dev,
unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
struct wakeup_source *__ws;
+ struct platform_device *pdev;
+ int ret = 0;
if (rtcdev)
return -EBUSY;
@@ -98,12 +100,14 @@ static int alarmtimer_rtc_add_device(struct device *dev,
return -1;
__ws = wakeup_source_register("alarmtimer");
+ pdev = platform_device_register_data(dev, "alarmtimer",
+ PLATFORM_DEVID_AUTO, NULL, 0);
spin_lock_irqsave(&rtcdev_lock, flags);
- if (!rtcdev) {
+ if (__ws && !IS_ERR(pdev) && !rtcdev) {
if (!try_module_get(rtc->owner)) {
- spin_unlock_irqrestore(&rtcdev_lock, flags);
- return -1;
+ ret = -1;
+ goto unlock;
}
rtcdev = rtc;
@@ -111,12 +115,17 @@ static int alarmtimer_rtc_add_device(struct device *dev,
get_device(dev);
ws = __ws;
__ws = NULL;
+ pdev = NULL;
+ } else {
+ ret = -1;
}
+unlock:
spin_unlock_irqrestore(&rtcdev_lock, flags);
+ platform_device_unregister(pdev);
wakeup_source_unregister(__ws);
- return 0;
+ return ret;
}
static inline void alarmtimer_rtc_timer_init(void)
@@ -433,7 +442,7 @@ int alarm_cancel(struct alarm *alarm)
int ret = alarm_try_to_cancel(alarm);
if (ret >= 0)
return ret;
- cpu_relax();
+ hrtimer_grab_expiry_lock(&alarm->timer);
}
}
EXPORT_SYMBOL_GPL(alarm_cancel);
@@ -861,8 +870,7 @@ static struct platform_driver alarmtimer_driver = {
*/
static int __init alarmtimer_init(void)
{
- struct platform_device *pdev;
- int error = 0;
+ int error;
int i;
alarmtimer_rtc_timer_init();
@@ -885,15 +893,7 @@ static int __init alarmtimer_init(void)
if (error)
goto out_if;
- pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0);
- if (IS_ERR(pdev)) {
- error = PTR_ERR(pdev);
- goto out_drv;
- }
return 0;
-
-out_drv:
- platform_driver_unregister(&alarmtimer_driver);
out_if:
alarmtimer_rtc_interface_remove();
return error;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 3bcc19ceb073..cf74160e79c7 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -293,8 +293,15 @@ static void clocksource_watchdog(struct timer_list *unused)
next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(cpu_online_mask);
- watchdog_timer.expires += WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer, next_cpu);
+
+ /*
+ * Arm timer if not already pending: could race with concurrent
+ * pair clocksource_stop_watchdog() clocksource_start_watchdog().
+ */
+ if (!timer_pending(&watchdog_timer)) {
+ watchdog_timer.expires += WATCHDOG_INTERVAL;
+ add_timer_on(&watchdog_timer, next_cpu);
+ }
out:
spin_unlock(&watchdog_lock);
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 41dfff23c1f9..0c5d7a90c86d 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -141,6 +141,11 @@ static struct hrtimer_cpu_base migration_cpu_base = {
#define migration_base migration_cpu_base.clock_base[0]
+static inline bool is_migration_base(struct hrtimer_clock_base *base)
+{
+ return base == &migration_base;
+}
+
/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
* means that all timers which are tied to this base via timer->base are
@@ -265,6 +270,11 @@ again:
#else /* CONFIG_SMP */
+static inline bool is_migration_base(struct hrtimer_clock_base *base)
+{
+ return false;
+}
+
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
{
@@ -930,6 +940,16 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
}
EXPORT_SYMBOL_GPL(hrtimer_forward);
+void hrtimer_grab_expiry_lock(const struct hrtimer *timer)
+{
+ struct hrtimer_clock_base *base = READ_ONCE(timer->base);
+
+ if (timer->is_soft && is_migration_base(base)) {
+ spin_lock(&base->cpu_base->softirq_expiry_lock);
+ spin_unlock(&base->cpu_base->softirq_expiry_lock);
+ }
+}
+
/*
* enqueue_hrtimer - internal function to (re)start a timer
*
@@ -946,7 +966,8 @@ static int enqueue_hrtimer(struct hrtimer *timer,
base->cpu_base->active_bases |= 1 << base->index;
- timer->state = HRTIMER_STATE_ENQUEUED;
+ /* Pairs with the lockless read in hrtimer_is_queued() */
+ WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);
return timerqueue_add(&base->active, &timer->node);
}
@@ -968,7 +989,8 @@ static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
u8 state = timer->state;
- timer->state = newstate;
+ /* Pairs with the lockless read in hrtimer_is_queued() */
+ WRITE_ONCE(timer->state, newstate);
if (!(state & HRTIMER_STATE_ENQUEUED))
return;
@@ -993,8 +1015,9 @@ static void __remove_hrtimer(struct hrtimer *timer,
static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
{
- if (hrtimer_is_queued(timer)) {
- u8 state = timer->state;
+ u8 state = timer->state;
+
+ if (state & HRTIMER_STATE_ENQUEUED) {
int reprogram;
/*
@@ -1099,7 +1122,9 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
* match.
*/
+#ifndef CONFIG_PREEMPT_RT_BASE
WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
+#endif
base = lock_hrtimer_base(timer, &flags);
@@ -1162,7 +1187,7 @@ int hrtimer_cancel(struct hrtimer *timer)
if (ret >= 0)
return ret;
- cpu_relax();
+ hrtimer_grab_expiry_lock(timer);
}
}
EXPORT_SYMBOL_GPL(hrtimer_cancel);
@@ -1259,10 +1284,17 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
- bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
- int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
+ bool softtimer;
+ int base;
struct hrtimer_cpu_base *cpu_base;
+ softtimer = !!(mode & HRTIMER_MODE_SOFT);
+#ifdef CONFIG_PREEMPT_RT_FULL
+ if (!softtimer && !(mode & HRTIMER_MODE_HARD))
+ softtimer = true;
+#endif
+ base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
+
memset(timer, 0, sizeof(struct hrtimer));
cpu_base = raw_cpu_ptr(&hrtimer_bases);
@@ -1459,6 +1491,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
unsigned long flags;
ktime_t now;
+ spin_lock(&cpu_base->softirq_expiry_lock);
raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
@@ -1468,6 +1501,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
hrtimer_update_softirq_timer(cpu_base, true);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ spin_unlock(&cpu_base->softirq_expiry_lock);
}
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -1639,13 +1673,52 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
+static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
+ clockid_t clock_id,
+ enum hrtimer_mode mode,
+ struct task_struct *task)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ if (!(mode & (HRTIMER_MODE_SOFT | HRTIMER_MODE_HARD))) {
+ if (task_is_realtime(current) || system_state != SYSTEM_RUNNING)
+ mode |= HRTIMER_MODE_HARD;
+ else
+ mode |= HRTIMER_MODE_SOFT;
+ }
+#endif
+ __hrtimer_init(&sl->timer, clock_id, mode);
sl->timer.function = hrtimer_wakeup;
sl->task = task;
}
+
+/**
+ * hrtimer_init_sleeper - initialize sleeper to the given clock
+ * @sl: sleeper to be initialized
+ * @clock_id: the clock to be used
+ * @mode: timer mode abs/rel
+ * @task: the task to wake up
+ */
+void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
+ enum hrtimer_mode mode, struct task_struct *task)
+{
+ debug_init(&sl->timer, clock_id, mode);
+ __hrtimer_init_sleeper(sl, clock_id, mode, task);
+
+}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
+void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
+ clockid_t clock_id,
+ enum hrtimer_mode mode,
+ struct task_struct *task)
+{
+ debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
+ __hrtimer_init_sleeper(sl, clock_id, mode, task);
+}
+EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
+#endif
+
int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
{
switch(restart->nanosleep.type) {
@@ -1669,8 +1742,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
{
struct restart_block *restart;
- hrtimer_init_sleeper(t, current);
-
do {
set_current_state(TASK_INTERRUPTIBLE);
hrtimer_start_expires(&t->timer, mode);
@@ -1678,12 +1749,12 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
if (likely(t->task))
freezable_schedule();
+ __set_current_state(TASK_RUNNING);
hrtimer_cancel(&t->timer);
mode = HRTIMER_MODE_ABS;
} while (t->task && !signal_pending(current));
- __set_current_state(TASK_RUNNING);
if (!t->task)
return 0;
@@ -1707,10 +1778,9 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
struct hrtimer_sleeper t;
int ret;
- hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
- HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
+ HRTIMER_MODE_ABS, current);
hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
-
ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
destroy_hrtimer_on_stack(&t.timer);
return ret;
@@ -1728,7 +1798,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp,
if (dl_task(current) || rt_task(current))
slack = 0;
- hrtimer_init_on_stack(&t.timer, clockid, mode);
+ hrtimer_init_sleeper_on_stack(&t, clockid, mode, current);
hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
ret = do_nanosleep(&t, mode);
if (ret != -ERESTART_RESTARTBLOCK)
@@ -1788,6 +1858,38 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
}
#endif
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * Sleep for 1 ms in hope whoever holds what we want will let it go.
+ */
+void cpu_chill(void)
+{
+ unsigned int freeze_flag = current->flags & PF_NOFREEZE;
+ struct task_struct *self = current;
+ ktime_t chill_time;
+
+ raw_spin_lock_irq(&self->pi_lock);
+ self->saved_state = self->state;
+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
+ raw_spin_unlock_irq(&self->pi_lock);
+
+ chill_time = ktime_set(0, NSEC_PER_MSEC);
+
+ current->flags |= PF_NOFREEZE;
+ sleeping_lock_inc();
+ schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD);
+ sleeping_lock_dec();
+ if (!freeze_flag)
+ current->flags &= ~PF_NOFREEZE;
+
+ raw_spin_lock_irq(&self->pi_lock);
+ __set_current_state_no_track(self->saved_state);
+ self->saved_state = TASK_RUNNING;
+ raw_spin_unlock_irq(&self->pi_lock);
+}
+EXPORT_SYMBOL(cpu_chill);
+#endif
+
/*
* Functions related to boot-time initialization:
*/
@@ -1809,6 +1911,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
+ spin_lock_init(&cpu_base->softirq_expiry_lock);
return 0;
}
@@ -1927,11 +2030,9 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
return -EINTR;
}
- hrtimer_init_on_stack(&t.timer, clock_id, mode);
+ hrtimer_init_sleeper_on_stack(&t, clock_id, mode, current);
hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
- hrtimer_init_sleeper(&t, current);
-
hrtimer_start_expires(&t.timer, mode);
if (likely(t.task))
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 02068b2d5862..d999294adeef 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -213,6 +213,7 @@ again:
/* We are sharing ->siglock with it_real_fn() */
if (hrtimer_try_to_cancel(timer) < 0) {
spin_unlock_irq(&tsk->sighand->siglock);
+ hrtimer_grab_expiry_lock(timer);
goto again;
}
expires = timeval_to_ktime(value->it_value);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index d23b434c2ca7..eddcf4970444 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -58,7 +58,8 @@ static struct clocksource clocksource_jiffies = {
.max_cycles = 10,
};
-__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
+__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
+__cacheline_aligned_in_smp seqcount_t jiffies_seq;
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
@@ -67,9 +68,9 @@ u64 get_jiffies_64(void)
u64 ret;
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
ret = jiffies_64;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
return ret;
}
EXPORT_SYMBOL(get_jiffies_64);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 65eb796610dc..069ca78fb0bf 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -771,7 +771,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts,
/* fill PPS status fields */
pps_fill_timex(txc);
- txc->time.tv_sec = (time_t)ts->tv_sec;
+ txc->time.tv_sec = ts->tv_sec;
txc->time.tv_usec = ts->tv_nsec;
if (!(time_status & STA_NANO))
txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC;
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index ec960bb939fd..200fb2d3be99 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -14,8 +14,6 @@
#include "posix-timers.h"
-static void delete_clock(struct kref *kref);
-
/*
* Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
*/
@@ -125,7 +123,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
err = 0;
if (!err) {
- kref_get(&clk->kref);
+ get_device(clk->dev);
fp->private_data = clk;
}
out:
@@ -141,7 +139,7 @@ static int posix_clock_release(struct inode *inode, struct file *fp)
if (clk->ops.release)
err = clk->ops.release(clk);
- kref_put(&clk->kref, delete_clock);
+ put_device(clk->dev);
fp->private_data = NULL;
@@ -161,38 +159,35 @@ static const struct file_operations posix_clock_file_operations = {
#endif
};
-int posix_clock_register(struct posix_clock *clk, dev_t devid)
+int posix_clock_register(struct posix_clock *clk, struct device *dev)
{
int err;
- kref_init(&clk->kref);
init_rwsem(&clk->rwsem);
cdev_init(&clk->cdev, &posix_clock_file_operations);
+ err = cdev_device_add(&clk->cdev, dev);
+ if (err) {
+ pr_err("%s unable to add device %d:%d\n",
+ dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt));
+ return err;
+ }
clk->cdev.owner = clk->ops.owner;
- err = cdev_add(&clk->cdev, devid, 1);
+ clk->dev = dev;
- return err;
+ return 0;
}
EXPORT_SYMBOL_GPL(posix_clock_register);
-static void delete_clock(struct kref *kref)
-{
- struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
-
- if (clk->release)
- clk->release(clk);
-}
-
void posix_clock_unregister(struct posix_clock *clk)
{
- cdev_del(&clk->cdev);
+ cdev_device_del(&clk->cdev, clk->dev);
down_write(&clk->rwsem);
clk->zombie = true;
up_write(&clk->rwsem);
- kref_put(&clk->kref, delete_clock);
+ put_device(clk->dev);
}
EXPORT_SYMBOL_GPL(posix_clock_unregister);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 5bbad147a90c..22e4e85724af 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -3,8 +3,10 @@
* Implement CPU time clocks for the POSIX clock interface.
*/
+#include <uapi/linux/sched/types.h>
#include <linux/sched/signal.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/rt.h>
#include <linux/posix-timers.h>
#include <linux/errno.h>
#include <linux/math64.h>
@@ -15,6 +17,7 @@
#include <linux/workqueue.h>
#include <linux/compat.h>
#include <linux/sched/deadline.h>
+#include <linux/smpboot.h>
#include "posix-timers.h"
@@ -788,6 +791,7 @@ check_timers_list(struct list_head *timers,
return t->expires;
t->firing = 1;
+ t->firing_cpu = smp_processor_id();
list_move_tail(&t->entry, firing);
}
@@ -1131,18 +1135,31 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
return 0;
}
+static DEFINE_PER_CPU(spinlock_t, cpu_timer_expiry_lock) = __SPIN_LOCK_UNLOCKED(cpu_timer_expiry_lock);
+
+void cpu_timers_grab_expiry_lock(struct k_itimer *timer)
+{
+ int cpu = timer->it.cpu.firing_cpu;
+
+ if (cpu >= 0) {
+ spinlock_t *expiry_lock = per_cpu_ptr(&cpu_timer_expiry_lock, cpu);
+
+ spin_lock_irq(expiry_lock);
+ spin_unlock_irq(expiry_lock);
+ }
+}
+
/*
* This is called from the timer interrupt handler. The irq handler has
* already updated our counts. We need to check if any timers fire now.
* Interrupts are disabled.
*/
-void run_posix_cpu_timers(struct task_struct *tsk)
+static void __run_posix_cpu_timers(struct task_struct *tsk)
{
LIST_HEAD(firing);
struct k_itimer *timer, *next;
unsigned long flags;
-
- lockdep_assert_irqs_disabled();
+ spinlock_t *expiry_lock;
/*
* The fast path checks that there are no expired thread or thread
@@ -1151,8 +1168,13 @@ void run_posix_cpu_timers(struct task_struct *tsk)
if (!fastpath_timer_check(tsk))
return;
- if (!lock_task_sighand(tsk, &flags))
+ expiry_lock = this_cpu_ptr(&cpu_timer_expiry_lock);
+ spin_lock(expiry_lock);
+
+ if (!lock_task_sighand(tsk, &flags)) {
+ spin_unlock(expiry_lock);
return;
+ }
/*
* Here we take off tsk->signal->cpu_timers[N] and
* tsk->cpu_timers[N] all the timers that are firing, and
@@ -1185,6 +1207,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
list_del_init(&timer->it.cpu.entry);
cpu_firing = timer->it.cpu.firing;
timer->it.cpu.firing = 0;
+ timer->it.cpu.firing_cpu = -1;
/*
* The firing flag is -1 if we collided with a reset
* of the timer, which already reported this
@@ -1194,8 +1217,156 @@ void run_posix_cpu_timers(struct task_struct *tsk)
cpu_timer_fire(timer);
spin_unlock(&timer->it_lock);
}
+ spin_unlock(expiry_lock);
+}
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+#include <linux/kthread.h>
+#include <linux/cpu.h>
+DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
+DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
+DEFINE_PER_CPU(bool, posix_timer_th_active);
+
+static void posix_cpu_kthread_fn(unsigned int cpu)
+{
+ struct task_struct *tsk = NULL;
+ struct task_struct *next = NULL;
+
+ BUG_ON(per_cpu(posix_timer_task, cpu) != current);
+
+ /* grab task list */
+ raw_local_irq_disable();
+ tsk = per_cpu(posix_timer_tasklist, cpu);
+ per_cpu(posix_timer_tasklist, cpu) = NULL;
+ raw_local_irq_enable();
+
+ /* its possible the list is empty, just return */
+ if (!tsk)
+ return;
+
+ /* Process task list */
+ while (1) {
+ /* save next */
+ next = tsk->posix_timer_list;
+
+ /* run the task timers, clear its ptr and
+ * unreference it
+ */
+ __run_posix_cpu_timers(tsk);
+ tsk->posix_timer_list = NULL;
+ put_task_struct(tsk);
+
+ /* check if this is the last on the list */
+ if (next == tsk)
+ break;
+ tsk = next;
+ }
+}
+
+static inline int __fastpath_timer_check(struct task_struct *tsk)
+{
+ /* tsk == current, ensure it is safe to use ->signal/sighand */
+ if (unlikely(tsk->exit_state))
+ return 0;
+
+ if (!task_cputime_zero(&tsk->cputime_expires))
+ return 1;
+
+ if (!task_cputime_zero(&tsk->signal->cputime_expires))
+ return 1;
+
+ return 0;
}
+void run_posix_cpu_timers(struct task_struct *tsk)
+{
+ unsigned int cpu = smp_processor_id();
+ struct task_struct *tasklist;
+
+ BUG_ON(!irqs_disabled());
+
+ if (per_cpu(posix_timer_th_active, cpu) != true)
+ return;
+
+ /* get per-cpu references */
+ tasklist = per_cpu(posix_timer_tasklist, cpu);
+
+ /* check to see if we're already queued */
+ if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
+ get_task_struct(tsk);
+ if (tasklist) {
+ tsk->posix_timer_list = tasklist;
+ } else {
+ /*
+ * The list is terminated by a self-pointing
+ * task_struct
+ */
+ tsk->posix_timer_list = tsk;
+ }
+ per_cpu(posix_timer_tasklist, cpu) = tsk;
+
+ wake_up_process(per_cpu(posix_timer_task, cpu));
+ }
+}
+
+static int posix_cpu_kthread_should_run(unsigned int cpu)
+{
+ return __this_cpu_read(posix_timer_tasklist) != NULL;
+}
+
+static void posix_cpu_kthread_park(unsigned int cpu)
+{
+ this_cpu_write(posix_timer_th_active, false);
+}
+
+static void posix_cpu_kthread_unpark(unsigned int cpu)
+{
+ this_cpu_write(posix_timer_th_active, true);
+}
+
+static void posix_cpu_kthread_setup(unsigned int cpu)
+{
+ struct sched_param sp;
+
+ sp.sched_priority = MAX_RT_PRIO - 1;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+ posix_cpu_kthread_unpark(cpu);
+}
+
+static struct smp_hotplug_thread posix_cpu_thread = {
+ .store = &posix_timer_task,
+ .thread_should_run = posix_cpu_kthread_should_run,
+ .thread_fn = posix_cpu_kthread_fn,
+ .thread_comm = "posixcputmr/%u",
+ .setup = posix_cpu_kthread_setup,
+ .park = posix_cpu_kthread_park,
+ .unpark = posix_cpu_kthread_unpark,
+};
+
+static int __init posix_cpu_thread_init(void)
+{
+ /* Start one for boot CPU. */
+ unsigned long cpu;
+ int ret;
+
+ /* init the per-cpu posix_timer_tasklets */
+ for_each_possible_cpu(cpu)
+ per_cpu(posix_timer_tasklist, cpu) = NULL;
+
+ ret = smpboot_register_percpu_thread(&posix_cpu_thread);
+ WARN_ON(ret);
+
+ return 0;
+}
+early_initcall(posix_cpu_thread_init);
+#else /* CONFIG_PREEMPT_RT_BASE */
+void run_posix_cpu_timers(struct task_struct *tsk)
+{
+ lockdep_assert_irqs_disabled();
+ __run_posix_cpu_timers(tsk);
+}
+#endif /* CONFIG_PREEMPT_RT_BASE */
+
/*
* Set one of the process-wide special case CPU timers or RLIMIT_CPU.
* The tsk->sighand->siglock must be held by the caller.
@@ -1314,6 +1485,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
spin_unlock_irq(&timer.it_lock);
while (error == TIMER_RETRY) {
+
+ cpu_timers_grab_expiry_lock(&timer);
/*
* We need to handle case when timer was or is in the
* middle of firing. In other cases we already freed
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 29176635991f..1cf04812b3f5 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -442,7 +442,7 @@ static struct k_itimer * alloc_posix_timer(void)
static void k_itimer_rcu_free(struct rcu_head *head)
{
- struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
+ struct k_itimer *tmr = container_of(head, struct k_itimer, rcu);
kmem_cache_free(posix_timers_cache, tmr);
}
@@ -459,7 +459,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
}
put_pid(tmr->it_pid);
sigqueue_free(tmr->sigq);
- call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
+ call_rcu(&tmr->rcu, k_itimer_rcu_free);
}
static int common_timer_create(struct k_itimer *new_timer)
@@ -805,6 +805,17 @@ static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
return hrtimer_try_to_cancel(&timr->it.real.timer);
}
+static void timer_wait_for_callback(const struct k_clock *kc, struct k_itimer *timer)
+{
+ if (kc->timer_arm == common_hrtimer_arm)
+ hrtimer_grab_expiry_lock(&timer->it.real.timer);
+ else if (kc == &alarm_clock)
+ hrtimer_grab_expiry_lock(&timer->it.alarm.alarmtimer.timer);
+ else
+ /* posix-cpu-timers */
+ cpu_timers_grab_expiry_lock(timer);
+}
+
/* Set a POSIX.1b interval timer. */
int common_timer_set(struct k_itimer *timr, int flags,
struct itimerspec64 *new_setting,
@@ -870,11 +881,15 @@ retry:
else
error = kc->timer_set(timr, flags, new_spec64, old_spec64);
- unlock_timer(timr, flag);
if (error == TIMER_RETRY) {
+ rcu_read_lock();
+ unlock_timer(timr, flag);
+ timer_wait_for_callback(kc, timr);
+ rcu_read_unlock();
old_spec64 = NULL; // We already got the old time...
goto retry;
}
+ unlock_timer(timr, flag);
return error;
}
@@ -936,13 +951,21 @@ int common_timer_del(struct k_itimer *timer)
return 0;
}
-static inline int timer_delete_hook(struct k_itimer *timer)
+static int timer_delete_hook(struct k_itimer *timer)
{
const struct k_clock *kc = timer->kclock;
+ int ret;
if (WARN_ON_ONCE(!kc || !kc->timer_del))
return -EINVAL;
- return kc->timer_del(timer);
+ ret = kc->timer_del(timer);
+ if (ret == TIMER_RETRY) {
+ rcu_read_lock();
+ spin_unlock_irq(&timer->it_lock);
+ timer_wait_for_callback(kc, timer);
+ rcu_read_unlock();
+ }
+ return ret;
}
/* Delete a POSIX.1b interval timer. */
@@ -956,10 +979,8 @@ retry_delete:
if (!timer)
return -EINVAL;
- if (timer_delete_hook(timer) == TIMER_RETRY) {
- unlock_timer(timer, flags);
+ if (timer_delete_hook(timer) == TIMER_RETRY)
goto retry_delete;
- }
spin_lock(&current->sighand->siglock);
list_del(&timer->list);
@@ -985,10 +1006,9 @@ static void itimer_delete(struct k_itimer *timer)
retry_delete:
spin_lock_irqsave(&timer->it_lock, flags);
- if (timer_delete_hook(timer) == TIMER_RETRY) {
- unlock_timer(timer, flags);
+ if (timer_delete_hook(timer) == TIMER_RETRY)
goto retry_delete;
- }
+
list_del(&timer->list);
/*
* This keeps any tasks waiting on the spin lock from thinking
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index de5daa6d975a..c106ef83e405 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -32,6 +32,8 @@ extern const struct k_clock clock_process;
extern const struct k_clock clock_thread;
extern const struct k_clock alarm_clock;
+extern void cpu_timers_grab_expiry_lock(struct k_itimer *timer);
+
int posix_timer_event(struct k_itimer *timr, int si_private);
void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 142b07619918..3f16750ec7a3 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -205,7 +205,8 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
if (sched_clock_timer.function != NULL) {
/* update timeout for clock wrap */
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt,
+ HRTIMER_MODE_REL_HARD);
}
r = rate;
@@ -249,9 +250,9 @@ void __init generic_sched_clock_init(void)
* Start the timer to keep sched_clock() properly updated and
* sets the initial epoch.
*/
- hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
sched_clock_timer.function = sched_clock_poll;
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
}
/*
@@ -288,7 +289,7 @@ void sched_clock_resume(void)
struct clock_read_data *rd = &cd.read_data[0];
rd->epoch_cyc = cd.actual_read_sched_clock();
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
rd->read_sched_clock = cd.actual_read_sched_clock;
}
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 99fbfb8d9117..9faa7d5d6321 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -105,7 +105,7 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
void tick_setup_hrtimer_broadcast(void)
{
- hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
bctimer.function = bc_handler;
clockevents_register_device(&ce_broadcast_hrtimer);
}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 59225b484e4e..885cc1a86269 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -83,13 +83,15 @@ int tick_is_oneshot_available(void)
static void tick_periodic(int cpu)
{
if (tick_do_timer_cpu == cpu) {
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
/* Keep track of the next tick event */
tick_next_period = ktime_add(tick_next_period, tick_period);
do_timer(1);
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
@@ -161,9 +163,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
ktime_t next;
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
next = tick_next_period;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4ee1a3428ae..857441e18d16 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -58,20 +58,23 @@ static void tick_do_update_jiffies64(ktime_t now)
/*
* Do a quick check without holding jiffies_lock:
+ * The READ_ONCE() pairs with two updates done later in this function.
*/
- delta = ktime_sub(now, last_jiffies_update);
+ delta = ktime_sub(now, READ_ONCE(last_jiffies_update));
if (delta < tick_period)
return;
/* Reevaluate with jiffies_lock held */
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
delta = ktime_sub(now, last_jiffies_update);
if (delta >= tick_period) {
delta = ktime_sub(delta, tick_period);
- last_jiffies_update = ktime_add(last_jiffies_update,
- tick_period);
+ /* Pairs with the lockless read in this function. */
+ WRITE_ONCE(last_jiffies_update,
+ ktime_add(last_jiffies_update, tick_period));
/* Slow path for long timeouts */
if (unlikely(delta >= tick_period)) {
@@ -79,18 +82,22 @@ static void tick_do_update_jiffies64(ktime_t now)
ticks = ktime_divns(delta, incr);
- last_jiffies_update = ktime_add_ns(last_jiffies_update,
- incr * ticks);
+ /* Pairs with the lockless read in this function. */
+ WRITE_ONCE(last_jiffies_update,
+ ktime_add_ns(last_jiffies_update,
+ incr * ticks));
}
do_timer(++ticks);
/* Keep the tick_next_period variable up to date */
tick_next_period = ktime_add(last_jiffies_update, tick_period);
} else {
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
return;
}
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
@@ -101,12 +108,14 @@ static ktime_t tick_init_jiffy_update(void)
{
ktime_t period;
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
/* Did we start the jiffies update yet ? */
if (last_jiffies_update == 0)
last_jiffies_update = tick_next_period;
period = last_jiffies_update;
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
return period;
}
@@ -230,6 +239,7 @@ static void nohz_full_kick_func(struct irq_work *work)
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
.func = nohz_full_kick_func,
+ .flags = IRQ_WORK_HARD_IRQ,
};
/*
@@ -659,10 +669,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
/* Read jiffies and the time when jiffies were updated last */
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
basemono = last_jiffies_update;
basejiff = jiffies;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
ts->last_jiffies = basejiff;
ts->timer_expires_base = basemono;
@@ -893,14 +903,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return false;
if (unlikely(local_softirq_pending())) {
- static int ratelimit;
-
- if (ratelimit < 10 &&
- (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
- pr_warn("NOHZ: local_softirq_pending %02x\n",
- (unsigned int) local_softirq_pending());
- ratelimit++;
- }
+ softirq_check_pending_idle();
return false;
}
@@ -1329,7 +1332,7 @@ void tick_setup_sched_timer(void)
/*
* Emulate tick processing via per-CPU hrtimers:
*/
- hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
ts->sched_timer.function = tick_sched_timer;
/* Get the next period (per-CPU) */
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 7f7d6914ddd5..f5a6814c6743 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -877,7 +877,8 @@ int get_timespec64(struct timespec64 *ts,
ts->tv_sec = kts.tv_sec;
/* Zero out the padding for 32 bit systems or in compat mode */
- if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall())
+ if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) ||
+ in_compat_syscall()))
kts.tv_nsec &= 0xFFFFFFFFUL;
ts->tv_nsec = kts.tv_nsec;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 44b726bab4bd..98f82eae082f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -819,7 +819,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
} while (read_seqcount_retry(&tk_core.seq, seq));
- return base + nsecs;
+ return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
@@ -2392,8 +2392,10 @@ EXPORT_SYMBOL(hardpps);
*/
void xtime_update(unsigned long ticks)
{
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
do_timer(ticks);
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 141ab3ab0354..099737f6f10c 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -25,7 +25,8 @@ static inline void sched_clock_resume(void) { }
extern void do_timer(unsigned long ticks);
extern void update_wall_time(void);
-extern seqlock_t jiffies_lock;
+extern raw_spinlock_t jiffies_lock;
+extern seqcount_t jiffies_seq;
#define CS_NAME_LEN 32
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 7d63b7347066..4a243860097a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -43,6 +43,7 @@
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
+#include <linux/random.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -196,6 +197,7 @@ EXPORT_SYMBOL(jiffies_64);
struct timer_base {
raw_spinlock_t lock;
struct timer_list *running_timer;
+ spinlock_t expiry_lock;
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
@@ -518,8 +520,8 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk)
* Force expire obscene large timeouts to expire at the
* capacity limit of the wheel.
*/
- if (expires >= WHEEL_TIMEOUT_CUTOFF)
- expires = WHEEL_TIMEOUT_MAX;
+ if (delta >= WHEEL_TIMEOUT_CUTOFF)
+ expires = clk + WHEEL_TIMEOUT_MAX;
idx = calc_index(expires, LVL_DEPTH - 1);
}
@@ -581,7 +583,15 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
* Set the next expiry time and kick the CPU so it can reevaluate the
* wheel:
*/
- base->next_expiry = timer->expires;
+ if (time_before(timer->expires, base->clk)) {
+ /*
+ * Prevent from forward_timer_base() moving the base->clk
+ * backward
+ */
+ base->next_expiry = base->clk;
+ } else {
+ base->next_expiry = timer->expires;
+ }
wake_up_nohz_cpu(base->cpu);
}
@@ -893,10 +903,13 @@ static inline void forward_timer_base(struct timer_base *base)
* If the next expiry value is > jiffies, then we fast forward to
* jiffies otherwise we forward to the next expiry value.
*/
- if (time_after(base->next_expiry, jnow))
+ if (time_after(base->next_expiry, jnow)) {
base->clk = jnow;
- else
+ } else {
+ if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
+ return;
base->clk = base->next_expiry;
+ }
#endif
}
@@ -1201,14 +1214,8 @@ int del_timer(struct timer_list *timer)
}
EXPORT_SYMBOL(del_timer);
-/**
- * try_to_del_timer_sync - Try to deactivate a timer
- * @timer: timer to delete
- *
- * This function tries to deactivate a timer. Upon successful (ret >= 0)
- * exit the timer is not queued and the handler is not running on any CPU.
- */
-int try_to_del_timer_sync(struct timer_list *timer)
+static int __try_to_del_timer_sync(struct timer_list *timer,
+ struct timer_base **basep)
{
struct timer_base *base;
unsigned long flags;
@@ -1216,7 +1223,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
debug_assert_init(timer);
- base = lock_timer_base(timer, &flags);
+ *basep = base = lock_timer_base(timer, &flags);
if (base->running_timer != timer)
ret = detach_if_pending(timer, base, true);
@@ -1225,9 +1232,42 @@ int try_to_del_timer_sync(struct timer_list *timer)
return ret;
}
+
+/**
+ * try_to_del_timer_sync - Try to deactivate a timer
+ * @timer: timer to delete
+ *
+ * This function tries to deactivate a timer. Upon successful (ret >= 0)
+ * exit the timer is not queued and the handler is not running on any CPU.
+ */
+int try_to_del_timer_sync(struct timer_list *timer)
+{
+ struct timer_base *base;
+
+ return __try_to_del_timer_sync(timer, &base);
+}
EXPORT_SYMBOL(try_to_del_timer_sync);
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
+static int __del_timer_sync(struct timer_list *timer)
+{
+ struct timer_base *base;
+ int ret;
+
+ for (;;) {
+ ret = __try_to_del_timer_sync(timer, &base);
+ if (ret >= 0)
+ return ret;
+
+ /*
+ * When accessing the lock, timers of base are no longer expired
+ * and so timer is no longer running.
+ */
+ spin_lock(&base->expiry_lock);
+ spin_unlock(&base->expiry_lock);
+ }
+}
+
/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
@@ -1283,12 +1323,8 @@ int del_timer_sync(struct timer_list *timer)
* could lead to deadlock.
*/
WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
- for (;;) {
- int ret = try_to_del_timer_sync(timer);
- if (ret >= 0)
- return ret;
- cpu_relax();
- }
+
+ return __del_timer_sync(timer);
}
EXPORT_SYMBOL(del_timer_sync);
#endif
@@ -1357,13 +1393,20 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
fn = timer->function;
- if (timer->flags & TIMER_IRQSAFE) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
+ timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
call_timer_fn(timer, fn, baseclk);
+ base->running_timer = NULL;
+ spin_unlock(&base->expiry_lock);
+ spin_lock(&base->expiry_lock);
raw_spin_lock(&base->lock);
} else {
raw_spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn, baseclk);
+ base->running_timer = NULL;
+ spin_unlock(&base->expiry_lock);
+ spin_lock(&base->expiry_lock);
raw_spin_lock_irq(&base->lock);
}
}
@@ -1646,6 +1689,13 @@ void update_process_times(int user_tick)
scheduler_tick();
if (IS_ENABLED(CONFIG_POSIX_TIMERS))
run_posix_cpu_timers(p);
+
+ /* The current CPU might make use of net randoms without receiving IRQs
+ * to renew them often enough. Let's update the net_rand_state from a
+ * non-constant value that's not affine to the number of calls to make
+ * sure it's updated when there's some activity (we don't care in idle).
+ */
+ this_cpu_add(net_rand_state.s1, rol32(jiffies, 24) + user_tick);
}
/**
@@ -1660,6 +1710,7 @@ static inline void __run_timers(struct timer_base *base)
if (!time_after_eq(jiffies, base->clk))
return;
+ spin_lock(&base->expiry_lock);
raw_spin_lock_irq(&base->lock);
/*
@@ -1686,8 +1737,8 @@ static inline void __run_timers(struct timer_base *base)
while (levels--)
expire_timers(base, heads + levels);
}
- base->running_timer = NULL;
raw_spin_unlock_irq(&base->lock);
+ spin_unlock(&base->expiry_lock);
}
/*
@@ -1697,6 +1748,8 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ irq_work_tick_soft();
+
__run_timers(base);
if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
@@ -1932,6 +1985,7 @@ static void __init init_timer_cpu(int cpu)
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
+ spin_lock_init(&base->expiry_lock);
}
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 564e5fdb025f..9669aaf0cd76 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -371,7 +371,6 @@ config PROFILE_ANNOTATED_BRANCHES
config PROFILE_ALL_BRANCHES
bool "Profile all if conditionals" if !FORTIFY_SOURCE
select TRACE_BRANCH_PROFILING
- imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives
help
This tracer profiles all branch conditions. Every if ()
taken in the kernel is recorded whether it hit or miss.
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e1c6d79fb4cc..8f18311f9be5 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -3,6 +3,9 @@
* Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
*
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
@@ -336,6 +339,7 @@ static void put_probe_ref(void)
static void blk_trace_cleanup(struct blk_trace *bt)
{
+ synchronize_rcu();
blk_trace_free(bt);
put_probe_ref();
}
@@ -344,7 +348,8 @@ static int __blk_trace_remove(struct request_queue *q)
{
struct blk_trace *bt;
- bt = xchg(&q->blk_trace, NULL);
+ bt = rcu_replace_pointer(q->blk_trace, NULL,
+ lockdep_is_held(&q->blk_trace_mutex));
if (!bt)
return -EINVAL;
@@ -479,6 +484,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
struct dentry *dir = NULL;
int ret;
+ lockdep_assert_held(&q->blk_trace_mutex);
+
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
@@ -494,6 +501,17 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
*/
strreplace(buts->name, '/', '_');
+ /*
+ * bdev can be NULL, as with scsi-generic, this is a helpful as
+ * we can be.
+ */
+ if (rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex))) {
+ pr_warn("Concurrent blktraces are not allowed on %s\n",
+ buts->name);
+ return -EBUSY;
+ }
+
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
return -ENOMEM;
@@ -507,13 +525,31 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!bt->msg_data)
goto err;
- ret = -ENOENT;
-
- dir = debugfs_lookup(buts->name, blk_debugfs_root);
- if (!dir)
+#ifdef CONFIG_BLK_DEBUG_FS
+ /*
+ * When tracing whole make_request drivers (multiqueue) block devices,
+ * reuse the existing debugfs directory created by the block layer on
+ * init. For request-based block devices, all partitions block devices,
+ * and scsi-generic block devices we create a temporary new debugfs
+ * directory that will be removed once the trace ends.
+ */
+ if (queue_is_mq(q) && bdev && bdev == bdev->bd_contains)
+ dir = q->debugfs_dir;
+ else
+#endif
bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
- if (!dir)
+
+ /*
+ * As blktrace relies on debugfs for its interface the debugfs directory
+ * is required, contrary to the usual mantra of not checking for debugfs
+ * files or directories.
+ */
+ if (IS_ERR_OR_NULL(dir)) {
+ pr_warn("debugfs_dir not present for %s so skipping\n",
+ buts->name);
+ ret = -ENOENT;
goto err;
+ }
bt->dev = dev;
atomic_set(&bt->dropped, 0);
@@ -522,12 +558,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
ret = -EIO;
bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
&blk_dropped_fops);
- if (!bt->dropped_file)
- goto err;
bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
- if (!bt->msg_file)
- goto err;
bt->rchan = relay_open("trace", dir, buts->buf_size,
buts->buf_nr, &blk_relay_callbacks, bt);
@@ -549,16 +581,11 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
bt->pid = buts->pid;
bt->trace_state = Blktrace_setup;
- ret = -EBUSY;
- if (cmpxchg(&q->blk_trace, NULL, bt))
- goto err;
-
+ rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
ret = 0;
err:
- if (dir && !bt->dir)
- dput(dir);
if (ret)
blk_trace_free(bt);
return ret;
@@ -636,8 +663,10 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
static int __blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
if (bt == NULL)
return -EINVAL;
@@ -747,8 +776,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
void blk_trace_shutdown(struct request_queue *q)
{
mutex_lock(&q->blk_trace_mutex);
-
- if (q->blk_trace) {
+ if (rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex))) {
__blk_trace_startstop(q, 0);
__blk_trace_remove(q);
}
@@ -760,8 +789,10 @@ void blk_trace_shutdown(struct request_queue *q)
static union kernfs_node_id *
blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ /* We don't use the 'bt' value here except as an optimization... */
+ bt = rcu_dereference_protected(q->blk_trace, 1);
if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
return NULL;
@@ -806,10 +837,14 @@ static void blk_add_trace_rq(struct request *rq, int error,
unsigned int nr_bytes, u32 what,
union kernfs_node_id *cgid)
{
- struct blk_trace *bt = rq->q->blk_trace;
+ struct blk_trace *bt;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(rq->q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
if (blk_rq_is_passthrough(rq))
what |= BLK_TC_ACT(BLK_TC_PC);
@@ -818,6 +853,7 @@ static void blk_add_trace_rq(struct request *rq, int error,
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
rq->cmd_flags, what, error, 0, NULL, cgid);
+ rcu_read_unlock();
}
static void blk_add_trace_rq_insert(void *ignore,
@@ -863,14 +899,19 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
u32 what, int error)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio_op(bio), bio->bi_opf, what, error, 0, NULL,
blk_trace_bio_get_cgid(q, bio));
+ rcu_read_unlock();
}
static void blk_add_trace_bio_bounce(void *ignore,
@@ -915,11 +956,14 @@ static void blk_add_trace_getrq(void *ignore,
if (bio)
blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
else {
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
NULL, NULL);
+ rcu_read_unlock();
}
}
@@ -931,27 +975,35 @@ static void blk_add_trace_sleeprq(void *ignore,
if (bio)
blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
else {
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
0, 0, NULL, NULL);
+ rcu_read_unlock();
}
}
static void blk_add_trace_plug(void *ignore, struct request_queue *q)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL);
+ rcu_read_unlock();
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
unsigned int depth, bool explicit)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(depth);
u32 what;
@@ -963,22 +1015,28 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL);
}
+ rcu_read_unlock();
}
static void blk_add_trace_split(void *ignore,
struct request_queue *q, struct bio *bio,
unsigned int pdu)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(pdu);
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
- BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
- &rpdu, blk_trace_bio_get_cgid(q, bio));
+ BLK_TA_SPLIT,
+ blk_status_to_errno(bio->bi_status),
+ sizeof(rpdu), &rpdu,
+ blk_trace_bio_get_cgid(q, bio));
}
+ rcu_read_unlock();
}
/**
@@ -998,19 +1056,25 @@ static void blk_add_trace_bio_remap(void *ignore,
struct request_queue *q, struct bio *bio,
dev_t dev, sector_t from)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
struct blk_io_trace_remap r;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
r.device_from = cpu_to_be32(dev);
r.device_to = cpu_to_be32(bio_dev(bio));
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
+ bio_op(bio), bio->bi_opf, BLK_TA_REMAP,
+ blk_status_to_errno(bio->bi_status),
sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
+ rcu_read_unlock();
}
/**
@@ -1031,11 +1095,15 @@ static void blk_add_trace_rq_remap(void *ignore,
struct request *rq, dev_t dev,
sector_t from)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
struct blk_io_trace_remap r;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
r.device_from = cpu_to_be32(dev);
r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
@@ -1044,6 +1112,7 @@ static void blk_add_trace_rq_remap(void *ignore,
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
+ rcu_read_unlock();
}
/**
@@ -1061,14 +1130,19 @@ void blk_add_driver_data(struct request_queue *q,
struct request *rq,
void *data, size_t len)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, 0, len, data,
blk_trace_request_get_cgid(q, rq));
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1220,21 +1294,10 @@ static inline __u16 t_error(const struct trace_entry *ent)
static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
{
- const __u64 *val = pdu_start(ent, has_cg);
+ const __be64 *val = pdu_start(ent, has_cg);
return be64_to_cpu(*val);
}
-static void get_pdu_remap(const struct trace_entry *ent,
- struct blk_io_trace_remap *r, bool has_cg)
-{
- const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
- __u64 sector_from = __r->sector_from;
-
- r->device_from = be32_to_cpu(__r->device_from);
- r->device_to = be32_to_cpu(__r->device_to);
- r->sector_from = be64_to_cpu(sector_from);
-}
-
typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
bool has_cg);
@@ -1360,13 +1423,13 @@ static void blk_log_with_error(struct trace_seq *s,
static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
- struct blk_io_trace_remap r = { .device_from = 0, };
+ const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
- get_pdu_remap(ent, &r, has_cg);
trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
t_sector(ent), t_sec(ent),
- MAJOR(r.device_from), MINOR(r.device_from),
- (unsigned long long)r.sector_from);
+ MAJOR(be32_to_cpu(__r->device_from)),
+ MINOR(be32_to_cpu(__r->device_from)),
+ be64_to_cpu(__r->sector_from));
}
static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
@@ -1590,11 +1653,13 @@ static int blk_trace_remove_queue(struct request_queue *q)
{
struct blk_trace *bt;
- bt = xchg(&q->blk_trace, NULL);
+ bt = rcu_replace_pointer(q->blk_trace, NULL,
+ lockdep_is_held(&q->blk_trace_mutex));
if (bt == NULL)
return -EINVAL;
put_probe_ref();
+ synchronize_rcu();
blk_trace_free(bt);
return 0;
}
@@ -1621,10 +1686,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
blk_trace_setup_lba(bt, bdev);
- ret = -EBUSY;
- if (cmpxchg(&q->blk_trace, NULL, bt))
- goto free_bt;
-
+ rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
return 0;
@@ -1756,6 +1818,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct hd_struct *p = dev_to_part(dev);
struct request_queue *q;
struct block_device *bdev;
+ struct blk_trace *bt;
ssize_t ret = -ENXIO;
bdev = bdget(part_devt(p));
@@ -1768,21 +1831,23 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
mutex_lock(&q->blk_trace_mutex);
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
if (attr == &dev_attr_enable) {
- ret = sprintf(buf, "%u\n", !!q->blk_trace);
+ ret = sprintf(buf, "%u\n", !!bt);
goto out_unlock_bdev;
}
- if (q->blk_trace == NULL)
+ if (bt == NULL)
ret = sprintf(buf, "disabled\n");
else if (attr == &dev_attr_act_mask)
- ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
+ ret = blk_trace_mask2str(buf, bt->act_mask);
else if (attr == &dev_attr_pid)
- ret = sprintf(buf, "%u\n", q->blk_trace->pid);
+ ret = sprintf(buf, "%u\n", bt->pid);
else if (attr == &dev_attr_start_lba)
- ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
+ ret = sprintf(buf, "%llu\n", bt->start_lba);
else if (attr == &dev_attr_end_lba)
- ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
+ ret = sprintf(buf, "%llu\n", bt->end_lba);
out_unlock_bdev:
mutex_unlock(&q->blk_trace_mutex);
@@ -1799,6 +1864,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
struct block_device *bdev;
struct request_queue *q;
struct hd_struct *p;
+ struct blk_trace *bt;
u64 value;
ssize_t ret = -EINVAL;
@@ -1829,8 +1895,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
mutex_lock(&q->blk_trace_mutex);
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
if (attr == &dev_attr_enable) {
- if (!!value == !!q->blk_trace) {
+ if (!!value == !!bt) {
ret = 0;
goto out_unlock_bdev;
}
@@ -1842,18 +1910,21 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
}
ret = 0;
- if (q->blk_trace == NULL)
+ if (bt == NULL) {
ret = blk_trace_setup_queue(q, bdev);
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
+ }
if (ret == 0) {
if (attr == &dev_attr_act_mask)
- q->blk_trace->act_mask = value;
+ bt->act_mask = value;
else if (attr == &dev_attr_pid)
- q->blk_trace->pid = value;
+ bt->pid = value;
else if (attr == &dev_attr_start_lba)
- q->blk_trace->start_lba = value;
+ bt->start_lba = value;
else if (attr == &dev_attr_end_lba)
- q->blk_trace->end_lba = value;
+ bt->end_lba = value;
}
out_unlock_bdev:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index aaf66cd9daa6..a2c30e1eb7e4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -155,7 +155,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
+BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
u32, size)
{
/*
@@ -178,10 +178,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
- if (!access_ok(unsafe_ptr, size))
- return -EPERM;
- return probe_kernel_write(unsafe_ptr, src, size);
+ return probe_user_write(unsafe_ptr, src, size);
}
static const struct bpf_func_proto bpf_probe_write_user_proto = {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index baa54a76c6aa..98751023c05a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -523,8 +523,7 @@ static int function_stat_show(struct seq_file *m, void *v)
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- avg = rec->time;
- do_div(avg, rec->counter);
+ avg = div64_ul(rec->time, rec->counter);
if (tracing_thresh && (avg < tracing_thresh))
goto out;
#endif
@@ -550,7 +549,8 @@ static int function_stat_show(struct seq_file *m, void *v)
* Divide only 1000 for ns^2 -> us^2 conversion.
* trace_print_graph_duration will divide 1000 again.
*/
- do_div(stddev, rec->counter * (rec->counter - 1) * 1000);
+ stddev = div64_ul(stddev,
+ rec->counter * (rec->counter - 1) * 1000);
}
trace_seq_init(&s);
@@ -5085,8 +5085,8 @@ static const struct file_operations ftrace_notrace_fops = {
static DEFINE_MUTEX(graph_lock);
-struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH;
-struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH;
+struct ftrace_hash __rcu *ftrace_graph_hash = EMPTY_HASH;
+struct ftrace_hash __rcu *ftrace_graph_notrace_hash = EMPTY_HASH;
enum graph_filter_type {
GRAPH_FILTER_NOTRACE = 0,
@@ -5357,8 +5357,15 @@ ftrace_graph_release(struct inode *inode, struct file *file)
mutex_unlock(&graph_lock);
- /* Wait till all users are no longer using the old hash */
- synchronize_rcu();
+ /*
+ * We need to do a hard force of sched synchronization.
+ * This is because we use preempt_disable() to do RCU, but
+ * the function tracers can be called where RCU is not watching
+ * (like before user_exit()). We can not rely on the RCU
+ * infrastructure to do the synchronization, thus we must do it
+ * ourselves.
+ */
+ schedule_on_each_cpu(ftrace_sync);
free_ftrace_hash(old_hash);
}
@@ -5671,8 +5678,11 @@ static int referenced_filters(struct dyn_ftrace *rec)
int cnt = 0;
for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec))
- cnt++;
+ if (ops_references_rec(ops, rec)) {
+ cnt++;
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+ rec->flags |= FTRACE_FL_REGS;
+ }
}
return cnt;
@@ -5849,8 +5859,8 @@ void ftrace_module_enable(struct module *mod)
if (ftrace_start_up)
cnt += referenced_filters(rec);
- /* This clears FTRACE_FL_DISABLED */
- rec->flags = cnt;
+ rec->flags &= ~FTRACE_FL_DISABLED;
+ rec->flags += cnt;
if (ftrace_start_up && cnt) {
int failed = __ftrace_replace_code(rec, 1);
@@ -6435,12 +6445,12 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
if (enable) {
register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
tr);
- register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
tr);
} else {
unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
tr);
- unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
tr);
}
}
@@ -6513,9 +6523,10 @@ static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
struct trace_array *tr = m->private;
struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids);
- if (v == FTRACE_NO_PIDS)
+ if (v == FTRACE_NO_PIDS) {
+ (*pos)++;
return NULL;
-
+ }
return trace_pid_next(pid_list, v, pos);
}
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 0515a2096f90..0456e0a3dab1 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -6,22 +6,22 @@
/*
* Traverse the ftrace_global_list, invoking all entries. The reason that we
- * can use rcu_dereference_raw_notrace() is that elements removed from this list
+ * can use rcu_dereference_raw_check() is that elements removed from this list
* are simply leaked, so there is no need to interact with a grace-period
- * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle
+ * mechanism. The rcu_dereference_raw_check() calls are needed to handle
* concurrent insertions into the ftrace_global_list.
*
* Silly Alpha and silly pointer-speculation compiler optimizations!
*/
#define do_for_each_ftrace_op(op, list) \
- op = rcu_dereference_raw_notrace(list); \
+ op = rcu_dereference_raw_check(list); \
do
/*
* Optimized for just a single item in the list (as that is the normal case).
*/
#define while_for_each_ftrace_op(op) \
- while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \
+ while (likely(op = rcu_dereference_raw_check((op)->next)) && \
unlikely((op) != &ftrace_list_end))
extern struct ftrace_ops __rcu *ftrace_ops_list;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 05b0b3139ebc..0e4af9941a23 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2366,7 +2366,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
if (unlikely(info->add_timestamp)) {
bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
- event = rb_add_time_stamp(event, info->delta, abs);
+ event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
length -= RB_LEN_TIME_EXTEND;
delta = 0;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1a65ca0a91f6..6afa88420792 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1726,6 +1726,7 @@ static __init int init_trace_selftests(void)
pr_info("Running postponed tracer tests:\n");
+ tracing_selftest_running = true;
list_for_each_entry_safe(p, n, &postponed_selftests, list) {
/* This loop can take minutes when sanitizers are enabled, so
* lets make sure we allow RCU processing.
@@ -1748,6 +1749,7 @@ static __init int init_trace_selftests(void)
list_del(&p->list);
kfree(p);
}
+ tracing_selftest_running = false;
out:
mutex_unlock(&trace_types_lock);
@@ -2318,6 +2320,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
struct task_struct *tsk = current;
entry->preempt_count = pc & 0xff;
+ entry->preempt_lazy_count = preempt_lazy_count();
entry->pid = (tsk) ? tsk->pid : 0;
entry->flags =
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
@@ -2328,8 +2331,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
- (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
+ (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
+ (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
(test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
+
+ entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
}
EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
@@ -2642,10 +2648,10 @@ static void ftrace_exports(struct ring_buffer_event *event)
preempt_disable_notrace();
- export = rcu_dereference_raw_notrace(ftrace_exports_list);
+ export = rcu_dereference_raw_check(ftrace_exports_list);
while (export) {
trace_process_export(export, event);
- export = rcu_dereference_raw_notrace(export->next);
+ export = rcu_dereference_raw_check(export->next);
}
preempt_enable_notrace();
@@ -3553,14 +3559,17 @@ unsigned long trace_total_entries(struct trace_array *tr)
static void print_lat_help_header(struct seq_file *m)
{
- seq_puts(m, "# _------=> CPU# \n"
- "# / _-----=> irqs-off \n"
- "# | / _----=> need-resched \n"
- "# || / _---=> hardirq/softirq \n"
- "# ||| / _--=> preempt-depth \n"
- "# |||| / delay \n"
- "# cmd pid ||||| time | caller \n"
- "# \\ / ||||| \\ | / \n");
+ seq_puts(m, "# _--------=> CPU# \n"
+ "# / _-------=> irqs-off \n"
+ "# | / _------=> need-resched \n"
+ "# || / _-----=> need-resched_lazy \n"
+ "# ||| / _----=> hardirq/softirq \n"
+ "# |||| / _---=> preempt-depth \n"
+ "# ||||| / _--=> preempt-lazy-depth\n"
+ "# |||||| / _-=> migrate-disable \n"
+ "# ||||||| / delay \n"
+ "# cmd pid |||||||| time | caller \n"
+ "# \\ / |||||||| \\ | / \n");
}
static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
@@ -3596,11 +3605,12 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space);
seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
- seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
- seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
- seq_printf(m, "# %.*s||| / delay\n", prec, space);
- seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID ");
- seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | ");
+ seq_printf(m, "# %.*s| / _----=> need-resched\n", prec, space);
+ seq_printf(m, "# %.*s|| / _---=> hardirq/softirq\n", prec, space);
+ seq_printf(m, "# %.*s||| / _--=> preempt-depth\n", prec, space);
+ seq_printf(m, "# %.*s||||/ delay\n", prec, space);
+ seq_printf(m, "# TASK-PID %.*sCPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID ");
+ seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | ");
}
void
@@ -4569,6 +4579,10 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
+ if ((mask == TRACE_ITER_RECORD_TGID) ||
+ (mask == TRACE_ITER_RECORD_CMD))
+ lockdep_assert_held(&event_mutex);
+
/* do nothing if flag is already set */
if (!!(tr->trace_flags & mask) == !!enabled)
return 0;
@@ -4588,7 +4602,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_TGID) {
if (!tgid_map)
- tgid_map = kcalloc(PID_MAX_DEFAULT + 1,
+ tgid_map = kvcalloc(PID_MAX_DEFAULT + 1,
sizeof(*tgid_map),
GFP_KERNEL);
if (!tgid_map) {
@@ -4636,6 +4650,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
cmp += len;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = match_string(trace_options, -1, cmp);
@@ -4646,6 +4661,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
ret = set_tracer_flag(tr, 1 << ret, !neg);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
/*
* If the first trailing whitespace is replaced with '\0' by strstrip,
@@ -6009,6 +6025,7 @@ waitagain:
sizeof(struct trace_iterator) -
offsetof(struct trace_iterator, seq));
cpumask_clear(iter->started);
+ trace_seq_init(&iter->seq);
iter->pos = -1;
trace_event_read_lock();
@@ -7947,9 +7964,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (val != 0 && val != 1)
return -EINVAL;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = set_tracer_flag(tr, 1 << index, val);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
if (ret < 0)
return ret;
@@ -8277,6 +8296,19 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
*/
allocate_snapshot = false;
#endif
+
+ /*
+ * Because of some magic with the way alloc_percpu() works on
+ * x86_64, we need to synchronize the pgd of all the tables,
+ * otherwise the trace events that happen in x86_64 page fault
+ * handlers can't cope with accessing the chance that a
+ * alloc_percpu()'d memory might be touched in the page fault trace
+ * event. Oh, and we need to audit all other alloc_percpu() and vmalloc()
+ * calls in tracing, because something might get triggered within a
+ * page fault trace event!
+ */
+ vmalloc_sync_mappings();
+
return 0;
}
@@ -8887,7 +8919,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
tracing_off();
local_irq_save(flags);
- printk_nmi_direct_enter();
/* Simulate the iterator */
trace_init_global_iter(&iter);
@@ -8964,7 +8995,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
atomic_dec(&dump_running);
- printk_nmi_direct_exit();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ftrace_dump);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 005f08629b8b..eb7f91e5e1de 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -126,6 +126,7 @@ struct kretprobe_trace_entry_head {
* NEED_RESCHED - reschedule is requested
* HARDIRQ - inside an interrupt handler
* SOFTIRQ - inside a softirq handler
+ * NEED_RESCHED_LAZY - lazy reschedule is requested
*/
enum trace_flag_type {
TRACE_FLAG_IRQS_OFF = 0x01,
@@ -135,6 +136,7 @@ enum trace_flag_type {
TRACE_FLAG_SOFTIRQ = 0x10,
TRACE_FLAG_PREEMPT_RESCHED = 0x20,
TRACE_FLAG_NMI = 0x40,
+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x80,
};
#define TRACE_BUF_SIZE 1024
@@ -931,22 +933,31 @@ extern void __trace_graph_return(struct trace_array *tr,
unsigned long flags, int pc);
#ifdef CONFIG_DYNAMIC_FTRACE
-extern struct ftrace_hash *ftrace_graph_hash;
-extern struct ftrace_hash *ftrace_graph_notrace_hash;
+extern struct ftrace_hash __rcu *ftrace_graph_hash;
+extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash;
static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
{
unsigned long addr = trace->func;
int ret = 0;
+ struct ftrace_hash *hash;
preempt_disable_notrace();
- if (ftrace_hash_empty(ftrace_graph_hash)) {
+ /*
+ * Have to open code "rcu_dereference_sched()" because the
+ * function graph tracer can be called when RCU is not
+ * "watching".
+ * Protected with schedule_on_each_cpu(ftrace_sync)
+ */
+ hash = rcu_dereference_protected(ftrace_graph_hash, !preemptible());
+
+ if (ftrace_hash_empty(hash)) {
ret = 1;
goto out;
}
- if (ftrace_lookup_ip(ftrace_graph_hash, addr)) {
+ if (ftrace_lookup_ip(hash, addr)) {
/*
* This needs to be cleared on the return functions
@@ -982,10 +993,20 @@ static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
int ret = 0;
+ struct ftrace_hash *notrace_hash;
preempt_disable_notrace();
- if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr))
+ /*
+ * Have to open code "rcu_dereference_sched()" because the
+ * function graph tracer can be called when RCU is not
+ * "watching".
+ * Protected with schedule_on_each_cpu(ftrace_sync)
+ */
+ notrace_hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+ !preemptible());
+
+ if (ftrace_lookup_ip(notrace_hash, addr))
ret = 1;
preempt_enable_notrace();
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0ce3db67f556..6593a4f3b244 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -188,6 +188,8 @@ static int trace_define_common_fields(void)
__common_field(unsigned char, flags);
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
+ __common_field(unsigned short, migrate_disable);
+ __common_field(unsigned short, padding);
return ret;
}
@@ -327,7 +329,8 @@ void trace_event_enable_cmd_record(bool enable)
struct trace_event_file *file;
struct trace_array *tr;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
+
do_for_each_event_file(tr, file) {
if (!(file->flags & EVENT_FILE_FL_ENABLED))
@@ -341,7 +344,6 @@ void trace_event_enable_cmd_record(bool enable)
clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
} while_for_each_event_file();
- mutex_unlock(&event_mutex);
}
void trace_event_enable_tgid_record(bool enable)
@@ -349,7 +351,8 @@ void trace_event_enable_tgid_record(bool enable)
struct trace_event_file *file;
struct trace_array *tr;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
+
do_for_each_event_file(tr, file) {
if (!(file->flags & EVENT_FILE_FL_ENABLED))
continue;
@@ -363,7 +366,6 @@ void trace_event_enable_tgid_record(bool enable)
&file->flags);
}
} while_for_each_event_file();
- mutex_unlock(&event_mutex);
}
static int __ftrace_event_enable_disable(struct trace_event_file *file,
@@ -534,12 +536,12 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
if (enable) {
register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork,
tr, INT_MIN);
- register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit,
+ register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit,
tr, INT_MAX);
} else {
unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork,
tr);
- unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit,
+ unregister_trace_sched_process_free(event_filter_pid_sched_process_exit,
tr);
}
}
@@ -795,7 +797,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
return ret;
}
-static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
+int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
{
char *event = NULL, *sub = NULL, *match;
int ret;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d5a4bda9ce7b..b878d0ec8181 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1659,7 +1659,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
return -EINVAL;
fail_mem:
- kfree(filter);
+ __free_filter(filter);
/* If any call succeeded, we still need to sync */
if (!fail)
tracepoint_synchronize_unregister();
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index dd310d3b5843..21caeb2052ef 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -111,6 +111,7 @@ struct hist_field {
struct ftrace_event_field *field;
unsigned long flags;
hist_field_fn_t fn;
+ unsigned int ref;
unsigned int size;
unsigned int offset;
unsigned int is_signed;
@@ -464,11 +465,12 @@ struct action_data {
* When a histogram trigger is hit, the values of any
* references to variables, including variables being passed
* as parameters to synthetic events, are collected into a
- * var_ref_vals array. This var_ref_idx is the index of the
- * first param in the array to be passed to the synthetic
- * event invocation.
+ * var_ref_vals array. This var_ref_idx array is an array of
+ * indices into the var_ref_vals array, one for each synthetic
+ * event param, and is passed to the synthetic event
+ * invocation.
*/
- unsigned int var_ref_idx;
+ unsigned int var_ref_idx[TRACING_MAP_VARS_MAX];
struct synth_event *synth_event;
bool use_trace_keyword;
char *synth_event_name;
@@ -674,6 +676,8 @@ static bool synth_field_signed(char *type)
{
if (str_has_prefix(type, "u"))
return false;
+ if (strcmp(type, "gfp_t") == 0)
+ return false;
return true;
}
@@ -852,14 +856,14 @@ static struct trace_event_functions synth_event_funcs = {
static notrace void trace_event_raw_event_synth(void *__data,
u64 *var_ref_vals,
- unsigned int var_ref_idx)
+ unsigned int *var_ref_idx)
{
struct trace_event_file *trace_file = __data;
struct synth_trace_event *entry;
struct trace_event_buffer fbuffer;
struct ring_buffer *buffer;
struct synth_event *event;
- unsigned int i, n_u64;
+ unsigned int i, n_u64, val_idx;
int fields_size = 0;
event = trace_file->event_call->data;
@@ -882,14 +886,34 @@ static notrace void trace_event_raw_event_synth(void *__data,
goto out;
for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+ val_idx = var_ref_idx[i];
if (event->fields[i]->is_string) {
- char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i];
+ char *str_val = (char *)(long)var_ref_vals[val_idx];
char *str_field = (char *)&entry->fields[n_u64];
strscpy(str_field, str_val, STR_VAR_LEN_MAX);
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
} else {
- entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
+ struct synth_field *field = event->fields[i];
+ u64 val = var_ref_vals[val_idx];
+
+ switch (field->size) {
+ case 1:
+ *(u8 *)&entry->fields[n_u64] = (u8)val;
+ break;
+
+ case 2:
+ *(u16 *)&entry->fields[n_u64] = (u16)val;
+ break;
+
+ case 4:
+ *(u32 *)&entry->fields[n_u64] = (u32)val;
+ break;
+
+ default:
+ entry->fields[n_u64] = val;
+ break;
+ }
n_u64++;
}
}
@@ -1071,10 +1095,10 @@ static struct tracepoint *alloc_synth_tracepoint(char *name)
}
typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
- unsigned int var_ref_idx);
+ unsigned int *var_ref_idx);
static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
- unsigned int var_ref_idx)
+ unsigned int *var_ref_idx)
{
struct tracepoint *tp = event->tp;
@@ -1985,12 +2009,6 @@ static int parse_map_size(char *str)
unsigned long size, map_bits;
int ret;
- strsep(&str, "=");
- if (!str) {
- ret = -EINVAL;
- goto out;
- }
-
ret = kstrtoul(str, 0, &size);
if (ret)
goto out;
@@ -2050,25 +2068,25 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs)
static int parse_assignment(struct trace_array *tr,
char *str, struct hist_trigger_attrs *attrs)
{
- int ret = 0;
+ int len, ret = 0;
- if ((str_has_prefix(str, "key=")) ||
- (str_has_prefix(str, "keys="))) {
- attrs->keys_str = kstrdup(str, GFP_KERNEL);
+ if ((len = str_has_prefix(str, "key=")) ||
+ (len = str_has_prefix(str, "keys="))) {
+ attrs->keys_str = kstrdup(str + len, GFP_KERNEL);
if (!attrs->keys_str) {
ret = -ENOMEM;
goto out;
}
- } else if ((str_has_prefix(str, "val=")) ||
- (str_has_prefix(str, "vals=")) ||
- (str_has_prefix(str, "values="))) {
- attrs->vals_str = kstrdup(str, GFP_KERNEL);
+ } else if ((len = str_has_prefix(str, "val=")) ||
+ (len = str_has_prefix(str, "vals=")) ||
+ (len = str_has_prefix(str, "values="))) {
+ attrs->vals_str = kstrdup(str + len, GFP_KERNEL);
if (!attrs->vals_str) {
ret = -ENOMEM;
goto out;
}
- } else if (str_has_prefix(str, "sort=")) {
- attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
+ } else if ((len = str_has_prefix(str, "sort="))) {
+ attrs->sort_key_str = kstrdup(str + len, GFP_KERNEL);
if (!attrs->sort_key_str) {
ret = -ENOMEM;
goto out;
@@ -2079,12 +2097,8 @@ static int parse_assignment(struct trace_array *tr,
ret = -ENOMEM;
goto out;
}
- } else if (str_has_prefix(str, "clock=")) {
- strsep(&str, "=");
- if (!str) {
- ret = -EINVAL;
- goto out;
- }
+ } else if ((len = str_has_prefix(str, "clock="))) {
+ str += len;
str = strstrip(str);
attrs->clock = kstrdup(str, GFP_KERNEL);
@@ -2092,8 +2106,8 @@ static int parse_assignment(struct trace_array *tr,
ret = -ENOMEM;
goto out;
}
- } else if (str_has_prefix(str, "size=")) {
- int map_bits = parse_map_size(str);
+ } else if ((len = str_has_prefix(str, "size="))) {
+ int map_bits = parse_map_size(str + len);
if (map_bits < 0) {
ret = map_bits;
@@ -2133,8 +2147,14 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str)
while (trigger_str) {
char *str = strsep(&trigger_str, ":");
+ char *rhs;
- if (strchr(str, '=')) {
+ rhs = strchr(str, '=');
+ if (rhs) {
+ if (!strlen(++rhs)) {
+ ret = -EINVAL;
+ goto free;
+ }
ret = parse_assignment(tr, str, attrs);
if (ret)
goto free;
@@ -2378,12 +2398,23 @@ static int contains_operator(char *str)
return field_op;
}
+static void get_hist_field(struct hist_field *hist_field)
+{
+ hist_field->ref++;
+}
+
static void __destroy_hist_field(struct hist_field *hist_field)
{
+ if (--hist_field->ref > 1)
+ return;
+
kfree(hist_field->var.name);
kfree(hist_field->name);
kfree(hist_field->type);
+ kfree(hist_field->system);
+ kfree(hist_field->event_name);
+
kfree(hist_field);
}
@@ -2421,6 +2452,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (!hist_field)
return NULL;
+ hist_field->ref = 1;
+
hist_field->hist_data = hist_data;
if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
@@ -2595,6 +2628,22 @@ static int init_var_ref(struct hist_field *ref_field,
goto out;
}
+static int find_var_ref_idx(struct hist_trigger_data *hist_data,
+ struct hist_field *var_field)
+{
+ struct hist_field *ref_field;
+ int i;
+
+ for (i = 0; i < hist_data->n_var_refs; i++) {
+ ref_field = hist_data->var_refs[i];
+ if (ref_field->var.idx == var_field->var.idx &&
+ ref_field->var.hist_data == var_field->hist_data)
+ return i;
+ }
+
+ return -ENOENT;
+}
+
/**
* create_var_ref - Create a variable reference and attach it to trigger
* @hist_data: The trigger that will be referencing the variable
@@ -2616,6 +2665,17 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data,
{
unsigned long flags = HIST_FIELD_FL_VAR_REF;
struct hist_field *ref_field;
+ int i;
+
+ /* Check if the variable already exists */
+ for (i = 0; i < hist_data->n_var_refs; i++) {
+ ref_field = hist_data->var_refs[i];
+ if (ref_field->var.idx == var_field->var.idx &&
+ ref_field->var.hist_data == var_field->hist_data) {
+ get_hist_field(ref_field);
+ return ref_field;
+ }
+ }
ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
if (ref_field) {
@@ -3413,6 +3473,7 @@ static struct hist_field *create_var(struct hist_trigger_data *hist_data,
goto out;
}
+ var->ref = 1;
var->flags = HIST_FIELD_FL_VAR;
var->var.idx = idx;
var->var.hist_data = var->hist_data = hist_data;
@@ -4042,6 +4103,9 @@ static void destroy_field_vars(struct hist_trigger_data *hist_data)
for (i = 0; i < hist_data->n_field_vars; i++)
destroy_field_var(hist_data->field_vars[i]);
+
+ for (i = 0; i < hist_data->n_save_vars; i++)
+ destroy_field_var(hist_data->save_vars[i]);
}
static void save_field_var(struct hist_trigger_data *hist_data,
@@ -4153,11 +4217,11 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
struct trace_array *tr = hist_data->event_file->tr;
char *event_name, *param, *system = NULL;
struct hist_field *hist_field, *var_ref;
- unsigned int i, var_ref_idx;
+ unsigned int i;
unsigned int field_pos = 0;
struct synth_event *event;
char *synth_event_name;
- int ret = 0;
+ int var_ref_idx, ret = 0;
lockdep_assert_held(&event_mutex);
@@ -4174,8 +4238,6 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
event->ref++;
- var_ref_idx = hist_data->n_var_refs;
-
for (i = 0; i < data->n_params; i++) {
char *p;
@@ -4224,6 +4286,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
goto err;
}
+ var_ref_idx = find_var_ref_idx(hist_data, var_ref);
+ if (WARN_ON(var_ref_idx < 0)) {
+ ret = var_ref_idx;
+ goto err;
+ }
+
+ data->var_ref_idx[i] = var_ref_idx;
+
field_pos++;
kfree(p);
continue;
@@ -4242,7 +4312,6 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
}
data->synth_event = event;
- data->var_ref_idx = var_ref_idx;
out:
return ret;
err:
@@ -4461,10 +4530,6 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
if (!fields_str)
goto out;
- strsep(&fields_str, "=");
- if (!fields_str)
- goto out;
-
for (i = 0, j = 1; i < TRACING_MAP_VALS_MAX &&
j < TRACING_MAP_VALS_MAX; i++) {
field_str = strsep(&fields_str, ",");
@@ -4559,10 +4624,6 @@ static int create_key_fields(struct hist_trigger_data *hist_data,
if (!fields_str)
goto out;
- strsep(&fields_str, "=");
- if (!fields_str)
- goto out;
-
for (i = n_vals; i < n_vals + TRACING_MAP_KEYS_MAX; i++) {
field_str = strsep(&fields_str, ",");
if (!field_str)
@@ -4720,12 +4781,6 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
if (!fields_str)
goto out;
- strsep(&fields_str, "=");
- if (!fields_str) {
- ret = -EINVAL;
- goto out;
- }
-
for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
struct hist_field *hist_field;
char *field_str, *field_name;
@@ -4734,9 +4789,11 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
sort_key = &hist_data->sort_keys[i];
field_str = strsep(&fields_str, ",");
- if (!field_str) {
- if (i == 0)
- ret = -EINVAL;
+ if (!field_str)
+ break;
+
+ if (!*field_str) {
+ ret = -EINVAL;
break;
}
@@ -4746,7 +4803,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
}
field_name = strsep(&field_str, ".");
- if (!field_name) {
+ if (!field_name || !*field_name) {
ret = -EINVAL;
break;
}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 2a2912cb4533..586ea94e0db1 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -115,9 +115,10 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
{
struct trace_event_file *event_file = event_file_data(m->private);
- if (t == SHOW_AVAILABLE_TRIGGERS)
+ if (t == SHOW_AVAILABLE_TRIGGERS) {
+ (*pos)++;
return NULL;
-
+ }
return seq_list_next(t, &event_file->triggers, pos);
}
@@ -210,11 +211,17 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
static int trigger_process_regex(struct trace_event_file *file, char *buff)
{
- char *command, *next = buff;
+ char *command, *next;
struct event_command *p;
int ret = -EINVAL;
+ next = buff = skip_spaces(buff);
command = strsep(&next, ": \t");
+ if (next) {
+ next = skip_spaces(next);
+ if (!*next)
+ next = NULL;
+ }
command = (command[0] != '!') ? command : command + 1;
mutex_lock(&trigger_cmd_mutex);
@@ -617,8 +624,14 @@ event_trigger_callback(struct event_command *cmd_ops,
int ret;
/* separate the trigger from the filter (t:n [if filter]) */
- if (param && isdigit(param[0]))
+ if (param && isdigit(param[0])) {
trigger = strsep(&param, " \t");
+ if (param) {
+ param = skip_spaces(param);
+ if (!*param)
+ param = NULL;
+ }
+ }
trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
@@ -1075,14 +1088,10 @@ register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *data,
struct trace_event_file *file)
{
- int ret = register_trigger(glob, ops, data, file);
-
- if (ret > 0 && tracing_alloc_snapshot_instance(file->tr) != 0) {
- unregister_trigger(glob, ops, data, file);
- ret = 0;
- }
+ if (tracing_alloc_snapshot_instance(file->tr) != 0)
+ return 0;
- return ret;
+ return register_trigger(glob, ops, data, file);
}
static int
@@ -1359,6 +1368,11 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
trigger = strsep(&param, " \t");
if (!trigger)
return -EINVAL;
+ if (param) {
+ param = skip_spaces(param);
+ if (!*param)
+ param = NULL;
+ }
system = strsep(&trigger, ":");
if (!trigger)
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 8030e24dbf14..35512ed26d9f 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -270,6 +270,7 @@ static bool disable_migrate;
static void move_to_next_cpu(void)
{
struct cpumask *current_mask = &save_cpumask;
+ struct trace_array *tr = hwlat_trace;
int next_cpu;
if (disable_migrate)
@@ -279,11 +280,11 @@ static void move_to_next_cpu(void)
* of this thread, than stop migrating for the duration
* of the current test.
*/
- if (!cpumask_equal(current_mask, &current->cpus_allowed))
+ if (!cpumask_equal(current_mask, current->cpus_ptr))
goto disable;
get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
next_cpu = cpumask_next(smp_processor_id(), current_mask);
put_online_cpus();
@@ -360,7 +361,7 @@ static int start_kthread(struct trace_array *tr)
/* Just pick the first CPU on first iteration */
current_mask = &save_cpumask;
get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
put_online_cpus();
next_cpu = cpumask_first(current_mask);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 7d736248a070..4666bceb1383 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -389,11 +389,10 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
#if defined(CONFIG_KPROBES_ON_FTRACE) && \
!defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE)
-static bool within_notrace_func(struct trace_kprobe *tk)
+static bool __within_notrace_func(unsigned long addr)
{
- unsigned long offset, size, addr;
+ unsigned long offset, size;
- addr = trace_kprobe_address(tk);
if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset))
return false;
@@ -406,6 +405,28 @@ static bool within_notrace_func(struct trace_kprobe *tk)
*/
return !ftrace_location_range(addr, addr + size - 1);
}
+
+static bool within_notrace_func(struct trace_kprobe *tk)
+{
+ unsigned long addr = trace_kprobe_address(tk);
+ char symname[KSYM_NAME_LEN], *p;
+
+ if (!__within_notrace_func(addr))
+ return false;
+
+ /* Check if the address is on a suffixed-symbol */
+ if (!lookup_symbol_name(addr, symname)) {
+ p = strchr(symname, '.');
+ if (!p)
+ return true;
+ *p = '\0';
+ addr = (unsigned long)kprobe_lookup_name(symname, 0);
+ if (addr)
+ return __within_notrace_func(addr);
+ }
+
+ return true;
+}
#else
#define within_notrace_func(tk) (false)
#endif
@@ -767,6 +788,8 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev)
int i;
seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
+ if (trace_kprobe_is_return(tk) && tk->rp.maxactive)
+ seq_printf(m, "%d", tk->rp.maxactive);
seq_printf(m, ":%s/%s", tk->tp.call.class->system,
trace_event_name(&tk->tp.call));
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index cab4a5398f1d..d323f789698a 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -426,6 +426,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
{
char hardsoft_irq;
char need_resched;
+ char need_resched_lazy;
char irqs_off;
int hardirq;
int softirq;
@@ -456,6 +457,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
break;
}
+ need_resched_lazy =
+ (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
+
hardsoft_irq =
(nmi && hardirq) ? 'Z' :
nmi ? 'z' :
@@ -464,14 +468,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
softirq ? 's' :
'.' ;
- trace_seq_printf(s, "%c%c%c",
- irqs_off, need_resched, hardsoft_irq);
+ trace_seq_printf(s, "%c%c%c%c",
+ irqs_off, need_resched, need_resched_lazy,
+ hardsoft_irq);
if (entry->preempt_count)
trace_seq_printf(s, "%x", entry->preempt_count);
else
trace_seq_putc(s, '.');
+ if (entry->preempt_lazy_count)
+ trace_seq_printf(s, "%x", entry->preempt_lazy_count);
+ else
+ trace_seq_putc(s, '.');
+
+ if (entry->migrate_disable)
+ trace_seq_printf(s, "%x", entry->migrate_disable);
+ else
+ trace_seq_putc(s, '.');
+
return !trace_seq_has_overflowed(s);
}
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e288168661e1..e304196d7c28 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -89,8 +89,10 @@ static void tracing_sched_unregister(void)
static void tracing_start_sched_switch(int ops)
{
- bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
+ bool sched_register;
+
mutex_lock(&sched_register_mutex);
+ sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
switch (ops) {
case RECORD_CMDLINE:
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 743b2b520d34..7152062d6af6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -631,7 +631,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_migrate_task\n");
- return;
+ goto fail_deprobe_sched_switch;
}
wakeup_reset(tr);
@@ -649,6 +649,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
printk(KERN_ERR "failed to start wakeup tracer\n");
return;
+fail_deprobe_sched_switch:
+ unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
fail_deprobe_wake_new:
unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
fail_deprobe:
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 642a850af81a..5b4f50932478 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -184,6 +184,11 @@ static void check_stack(unsigned long ip, unsigned long *stack)
local_irq_restore(flags);
}
+/* Some archs may not define MCOUNT_INSN_SIZE */
+#ifndef MCOUNT_INSN_SIZE
+# define MCOUNT_INSN_SIZE 0
+#endif
+
static void
stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 75bf1bcb4a8a..92b76f9e25ed 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -278,18 +278,22 @@ static int tracing_stat_init(void)
d_tracing = tracing_init_dentry();
if (IS_ERR(d_tracing))
- return 0;
+ return -ENODEV;
stat_dir = tracefs_create_dir("trace_stat", d_tracing);
- if (!stat_dir)
+ if (!stat_dir) {
pr_warn("Could not create tracefs 'trace_stat' entry\n");
+ return -ENOMEM;
+ }
return 0;
}
static int init_stat_file(struct stat_session *session)
{
- if (!stat_dir && tracing_stat_init())
- return -ENODEV;
+ int ret;
+
+ if (!stat_dir && (ret = tracing_stat_init()))
+ return ret;
session->file = tracefs_create_file(session->ts->name, 0644,
stat_dir,
@@ -302,7 +306,7 @@ static int init_stat_file(struct stat_session *session)
int register_stat_tracer(struct tracer_stat *trace)
{
struct stat_session *session, *node;
- int ret;
+ int ret = -EINVAL;
if (!trace)
return -EINVAL;
@@ -313,17 +317,15 @@ int register_stat_tracer(struct tracer_stat *trace)
/* Already registered? */
mutex_lock(&all_stat_sessions_mutex);
list_for_each_entry(node, &all_stat_sessions, session_list) {
- if (node->ts == trace) {
- mutex_unlock(&all_stat_sessions_mutex);
- return -EINVAL;
- }
+ if (node->ts == trace)
+ goto out;
}
- mutex_unlock(&all_stat_sessions_mutex);
+ ret = -ENOMEM;
/* Init the session */
session = kzalloc(sizeof(*session), GFP_KERNEL);
if (!session)
- return -ENOMEM;
+ goto out;
session->ts = trace;
INIT_LIST_HEAD(&session->session_list);
@@ -332,15 +334,16 @@ int register_stat_tracer(struct tracer_stat *trace)
ret = init_stat_file(session);
if (ret) {
destroy_session(session);
- return ret;
+ goto out;
}
+ ret = 0;
/* Register */
- mutex_lock(&all_stat_sessions_mutex);
list_add_tail(&session->session_list, &all_stat_sessions);
+ out:
mutex_unlock(&all_stat_sessions_mutex);
- return 0;
+ return ret;
}
void unregister_stat_tracer(struct tracer_stat *trace)
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 9a1c22310323..9e31bfc818ff 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -148,8 +148,8 @@ static int tracing_map_cmp_atomic64(void *val_a, void *val_b)
#define DEFINE_TRACING_MAP_CMP_FN(type) \
static int tracing_map_cmp_##type(void *val_a, void *val_b) \
{ \
- type a = *(type *)val_a; \
- type b = *(type *)val_b; \
+ type a = (type)(*(u64 *)val_a); \
+ type b = (type)(*(u64 *)val_b); \
\
return (a > b) ? 1 : ((a < b) ? -1 : 0); \
}
diff --git a/kernel/umh.c b/kernel/umh.c
index 7f255b5a8845..3474d6aa55d8 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -475,6 +475,12 @@ static void umh_clean_and_save_pid(struct subprocess_info *info)
{
struct umh_info *umh_info = info->data;
+ /* cleanup if umh_pipe_setup() was successful but exec failed */
+ if (info->pid && info->retval) {
+ fput(umh_info->pipe_to_umh);
+ fput(umh_info->pipe_from_umh);
+ }
+
argv_free(info->argv);
umh_info->pid = info->pid;
}
@@ -544,6 +550,11 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob);
* Runs a user-space application. The application is started
* asynchronously if wait is not set, and runs as a child of system workqueues.
* (ie. it runs with full root capabilities and optimized affinity).
+ *
+ * Note: successful return value does not guarantee the helper was called at
+ * all. You can't rely on sub_info->{init,cleanup} being called even for
+ * UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers
+ * into a successful no-op.
*/
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7f9e7b9306fe..6133ec40abaf 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -161,6 +161,8 @@ static void lockup_detector_update_enable(void)
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+#define SOFTLOCKUP_RESET ULONG_MAX
+
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
@@ -274,7 +276,7 @@ notrace void touch_softlockup_watchdog_sched(void)
* Preemption can be enabled. It doesn't matter which CPU's timestamp
* gets zeroed here, so use the raw_ operation.
*/
- raw_cpu_write(watchdog_touch_ts, 0);
+ raw_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
}
notrace void touch_softlockup_watchdog(void)
@@ -298,14 +300,14 @@ void touch_all_softlockup_watchdogs(void)
* the softlockup check.
*/
for_each_cpu(cpu, &watchdog_allowed_mask)
- per_cpu(watchdog_touch_ts, cpu) = 0;
+ per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET;
wq_watchdog_touch(-1);
}
void touch_softlockup_watchdog_sync(void)
{
__this_cpu_write(softlockup_touch_sync, true);
- __this_cpu_write(watchdog_touch_ts, 0);
+ __this_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
}
static int is_softlockup(unsigned long touch_ts)
@@ -383,7 +385,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
- if (touch_ts == 0) {
+ if (touch_ts == SOFTLOCKUP_RESET) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
/*
* If the time stamp was touched atomically
@@ -490,7 +492,7 @@ static void watchdog_enable(unsigned int cpu)
* Start the timer first to prevent the NMI watchdog triggering
* before the timer has a chance to fire.
*/
- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
hrtimer->function = watchdog_timer_fn;
hrtimer_start(hrtimer, ns_to_ktime(sample_period),
HRTIMER_MODE_REL_PINNED);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 95aea04ff722..bd46bdf733cf 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -50,6 +50,7 @@
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
#include <linux/nmi.h>
+#include <linux/swait.h>
#include "workqueue_internal.h"
@@ -145,7 +146,7 @@ enum {
/* struct worker is defined in workqueue_internal.h */
struct worker_pool {
- spinlock_t lock; /* the pool lock */
+ raw_spinlock_t lock; /* the pool lock */
int cpu; /* I: the associated cpu */
int node; /* I: the associated node ID */
int id; /* I: pool ID */
@@ -300,8 +301,8 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
-static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
-static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
+static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
+static DECLARE_SWAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
@@ -425,7 +426,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
* ignored.
*/
#define for_each_pwq(pwq, wq) \
- list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \
+ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \
+ lockdep_is_held(&wq->mutex)) \
if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
else
@@ -831,7 +833,7 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
* Wake up the first idle worker of @pool.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void wake_up_worker(struct worker_pool *pool)
{
@@ -884,7 +886,7 @@ void wq_worker_sleeping(struct task_struct *task)
return;
worker->sleeping = 1;
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* The counterpart of the following dec_and_test, implied mb,
@@ -903,7 +905,7 @@ void wq_worker_sleeping(struct task_struct *task)
if (next)
wake_up_process(next->task);
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
/**
@@ -914,7 +916,7 @@ void wq_worker_sleeping(struct task_struct *task)
* the scheduler to get a worker's last known identity.
*
* CONTEXT:
- * spin_lock_irq(rq->lock)
+ * raw_spin_lock_irq(rq->lock)
*
* This function is called during schedule() when a kworker is going
* to sleep. It's used by psi to identify aggregation workers during
@@ -945,7 +947,7 @@ work_func_t wq_worker_last_func(struct task_struct *task)
* Set @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
- * spin_lock_irq(pool->lock)
+ * raw_spin_lock_irq(pool->lock)
*/
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
@@ -970,7 +972,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags)
* Clear @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
- * spin_lock_irq(pool->lock)
+ * raw_spin_lock_irq(pool->lock)
*/
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
@@ -1018,7 +1020,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
* actually occurs, it should be easy to locate the culprit work function.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*
* Return:
* Pointer to worker which is executing @work if found, %NULL
@@ -1053,7 +1055,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
* nested inside outer list_for_each_entry_safe().
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void move_linked_works(struct work_struct *work, struct list_head *head,
struct work_struct **nextp)
@@ -1131,9 +1133,9 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
* As both pwqs and pools are RCU protected, the
* following lock operations are safe.
*/
- spin_lock_irq(&pwq->pool->lock);
+ raw_spin_lock_irq(&pwq->pool->lock);
put_pwq(pwq);
- spin_unlock_irq(&pwq->pool->lock);
+ raw_spin_unlock_irq(&pwq->pool->lock);
}
}
@@ -1166,7 +1168,7 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
* decrement nr_in_flight of its pwq and handle workqueue flushing.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
{
@@ -1265,7 +1267,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
if (!pool)
goto fail;
- spin_lock(&pool->lock);
+ raw_spin_lock(&pool->lock);
/*
* work->data is guaranteed to point to pwq only while the work
* item is queued on pwq->wq, and both updating work->data to point
@@ -1294,11 +1296,11 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
/* work->data points to pwq iff queued, point to pool */
set_work_pool_and_keep_pending(work, pool->id);
- spin_unlock(&pool->lock);
+ raw_spin_unlock(&pool->lock);
rcu_read_unlock();
return 1;
}
- spin_unlock(&pool->lock);
+ raw_spin_unlock(&pool->lock);
fail:
rcu_read_unlock();
local_irq_restore(*flags);
@@ -1319,7 +1321,7 @@ fail:
* work_struct flags.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
struct list_head *head, unsigned int extra_flags)
@@ -1416,14 +1418,16 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
return;
rcu_read_lock();
retry:
- if (req_cpu == WORK_CPU_UNBOUND)
- cpu = wq_select_unbound_cpu(raw_smp_processor_id());
-
/* pwq which will be used unless @work is executing elsewhere */
- if (!(wq->flags & WQ_UNBOUND))
- pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
- else
+ if (wq->flags & WQ_UNBOUND) {
+ if (req_cpu == WORK_CPU_UNBOUND)
+ cpu = wq_select_unbound_cpu(raw_smp_processor_id());
pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
+ } else {
+ if (req_cpu == WORK_CPU_UNBOUND)
+ cpu = raw_smp_processor_id();
+ pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
+ }
/*
* If @work was previously on a different pool, it might still be
@@ -1434,7 +1438,7 @@ retry:
if (last_pool && last_pool != pwq->pool) {
struct worker *worker;
- spin_lock(&last_pool->lock);
+ raw_spin_lock(&last_pool->lock);
worker = find_worker_executing_work(last_pool, work);
@@ -1442,11 +1446,11 @@ retry:
pwq = worker->current_pwq;
} else {
/* meh... not running there, queue here */
- spin_unlock(&last_pool->lock);
- spin_lock(&pwq->pool->lock);
+ raw_spin_unlock(&last_pool->lock);
+ raw_spin_lock(&pwq->pool->lock);
}
} else {
- spin_lock(&pwq->pool->lock);
+ raw_spin_lock(&pwq->pool->lock);
}
/*
@@ -1459,7 +1463,7 @@ retry:
*/
if (unlikely(!pwq->refcnt)) {
if (wq->flags & WQ_UNBOUND) {
- spin_unlock(&pwq->pool->lock);
+ raw_spin_unlock(&pwq->pool->lock);
cpu_relax();
goto retry;
}
@@ -1491,7 +1495,7 @@ retry:
insert_work(pwq, work, worklist, work_flags);
out:
- spin_unlock(&pwq->pool->lock);
+ raw_spin_unlock(&pwq->pool->lock);
rcu_read_unlock();
}
@@ -1611,9 +1615,11 @@ EXPORT_SYMBOL_GPL(queue_work_node);
void delayed_work_timer_fn(struct timer_list *t)
{
struct delayed_work *dwork = from_timer(dwork, t, timer);
+ unsigned long flags;
- /* should have been called from irqsafe timer with irq already off */
+ local_irq_save(flags);
__queue_work(dwork->cpu, dwork->wq, &dwork->work);
+ local_irq_restore(flags);
}
EXPORT_SYMBOL(delayed_work_timer_fn);
@@ -1760,7 +1766,7 @@ EXPORT_SYMBOL(queue_rcu_work);
* necessary.
*
* LOCKING:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void worker_enter_idle(struct worker *worker)
{
@@ -1800,7 +1806,7 @@ static void worker_enter_idle(struct worker *worker)
* @worker is leaving idle state. Update stats.
*
* LOCKING:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void worker_leave_idle(struct worker *worker)
{
@@ -1938,11 +1944,11 @@ static struct worker *create_worker(struct worker_pool *pool)
worker_attach_to_pool(worker, pool);
/* start the newly created worker */
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
worker->pool->nr_workers++;
worker_enter_idle(worker);
wake_up_process(worker->task);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
return worker;
@@ -1961,7 +1967,7 @@ fail:
* be idle.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void destroy_worker(struct worker *worker)
{
@@ -1987,7 +1993,7 @@ static void idle_worker_timeout(struct timer_list *t)
{
struct worker_pool *pool = from_timer(pool, t, idle_timer);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
while (too_many_workers(pool)) {
struct worker *worker;
@@ -2005,7 +2011,7 @@ static void idle_worker_timeout(struct timer_list *t)
destroy_worker(worker);
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
static void send_mayday(struct work_struct *work)
@@ -2036,8 +2042,8 @@ static void pool_mayday_timeout(struct timer_list *t)
struct worker_pool *pool = from_timer(pool, t, mayday_timer);
struct work_struct *work;
- spin_lock_irq(&pool->lock);
- spin_lock(&wq_mayday_lock); /* for wq->maydays */
+ raw_spin_lock_irq(&pool->lock);
+ raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */
if (need_to_create_worker(pool)) {
/*
@@ -2050,8 +2056,8 @@ static void pool_mayday_timeout(struct timer_list *t)
send_mayday(work);
}
- spin_unlock(&wq_mayday_lock);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock(&wq_mayday_lock);
+ raw_spin_unlock_irq(&pool->lock);
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}
@@ -2070,7 +2076,7 @@ static void pool_mayday_timeout(struct timer_list *t)
* may_start_working() %true.
*
* LOCKING:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
+ * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations. Called only from
* manager.
*/
@@ -2079,7 +2085,7 @@ __releases(&pool->lock)
__acquires(&pool->lock)
{
restart:
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
@@ -2095,7 +2101,7 @@ restart:
}
del_timer_sync(&pool->mayday_timer);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* This is necessary even after a new worker was just successfully
* created as @pool->lock was dropped and the new worker might have
@@ -2118,7 +2124,7 @@ restart:
* and may_start_working() is true.
*
* CONTEXT:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
+ * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations.
*
* Return:
@@ -2141,7 +2147,7 @@ static bool manage_workers(struct worker *worker)
pool->manager = NULL;
pool->flags &= ~POOL_MANAGER_ACTIVE;
- wake_up(&wq_manager_wait);
+ swake_up_one(&wq_manager_wait);
return true;
}
@@ -2157,7 +2163,7 @@ static bool manage_workers(struct worker *worker)
* call this function to process a work.
*
* CONTEXT:
- * spin_lock_irq(pool->lock) which is released and regrabbed.
+ * raw_spin_lock_irq(pool->lock) which is released and regrabbed.
*/
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
@@ -2239,7 +2245,7 @@ __acquires(&pool->lock)
*/
set_work_pool_and_clear_pending(work, pool->id);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
lock_map_acquire(&pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
@@ -2294,7 +2300,7 @@ __acquires(&pool->lock)
*/
cond_resched();
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/* clear cpu intensive status */
if (unlikely(cpu_intensive))
@@ -2320,7 +2326,7 @@ __acquires(&pool->lock)
* fetches a work from the top and executes it.
*
* CONTEXT:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
+ * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times.
*/
static void process_scheduled_works(struct worker *worker)
@@ -2362,11 +2368,11 @@ static int worker_thread(void *__worker)
/* tell the scheduler that this is a workqueue worker */
set_pf_worker(true);
woke_up:
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/* am I supposed to die? */
if (unlikely(worker->flags & WORKER_DIE)) {
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
WARN_ON_ONCE(!list_empty(&worker->entry));
set_pf_worker(false);
@@ -2432,7 +2438,7 @@ sleep:
*/
worker_enter_idle(worker);
__set_current_state(TASK_IDLE);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
schedule();
goto woke_up;
}
@@ -2486,7 +2492,7 @@ repeat:
should_stop = kthread_should_stop();
/* see whether any pwq is asking for help */
- spin_lock_irq(&wq_mayday_lock);
+ raw_spin_lock_irq(&wq_mayday_lock);
while (!list_empty(&wq->maydays)) {
struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
@@ -2498,11 +2504,11 @@ repeat:
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
- spin_unlock_irq(&wq_mayday_lock);
+ raw_spin_unlock_irq(&wq_mayday_lock);
worker_attach_to_pool(rescuer, pool);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* Slurp in all works issued via this workqueue and
@@ -2531,10 +2537,10 @@ repeat:
* incur MAYDAY_INTERVAL delay inbetween.
*/
if (need_to_create_worker(pool)) {
- spin_lock(&wq_mayday_lock);
+ raw_spin_lock(&wq_mayday_lock);
get_pwq(pwq);
list_move_tail(&pwq->mayday_node, &wq->maydays);
- spin_unlock(&wq_mayday_lock);
+ raw_spin_unlock(&wq_mayday_lock);
}
}
@@ -2552,14 +2558,14 @@ repeat:
if (need_more_worker(pool))
wake_up_worker(pool);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
worker_detach_from_pool(rescuer);
- spin_lock_irq(&wq_mayday_lock);
+ raw_spin_lock_irq(&wq_mayday_lock);
}
- spin_unlock_irq(&wq_mayday_lock);
+ raw_spin_unlock_irq(&wq_mayday_lock);
if (should_stop) {
__set_current_state(TASK_RUNNING);
@@ -2639,7 +2645,7 @@ static void wq_barrier_func(struct work_struct *work)
* underneath us, so we can't reliably determine pwq from @target.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void insert_wq_barrier(struct pool_workqueue *pwq,
struct wq_barrier *barr,
@@ -2726,7 +2732,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
for_each_pwq(pwq, wq) {
struct worker_pool *pool = pwq->pool;
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
if (flush_color >= 0) {
WARN_ON_ONCE(pwq->flush_color != -1);
@@ -2743,7 +2749,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
pwq->work_color = work_color;
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
@@ -2943,9 +2949,9 @@ reflush:
for_each_pwq(pwq, wq) {
bool drained;
- spin_lock_irq(&pwq->pool->lock);
+ raw_spin_lock_irq(&pwq->pool->lock);
drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
- spin_unlock_irq(&pwq->pool->lock);
+ raw_spin_unlock_irq(&pwq->pool->lock);
if (drained)
continue;
@@ -2981,7 +2987,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
return false;
}
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/* see the comment in try_to_grab_pending() with the same code */
pwq = get_work_pwq(work);
if (pwq) {
@@ -2997,7 +3003,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
check_flush_dependency(pwq->wq, work);
insert_wq_barrier(pwq, barr, work, worker);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
/*
* Force a lock recursion deadlock when using flush_work() inside a
@@ -3016,7 +3022,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
rcu_read_unlock();
return true;
already_gone:
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
rcu_read_unlock();
return false;
}
@@ -3329,7 +3335,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
*
* Undo alloc_workqueue_attrs().
*/
-void free_workqueue_attrs(struct workqueue_attrs *attrs)
+static void free_workqueue_attrs(struct workqueue_attrs *attrs)
{
if (attrs) {
free_cpumask_var(attrs->cpumask);
@@ -3339,21 +3345,20 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
/**
* alloc_workqueue_attrs - allocate a workqueue_attrs
- * @gfp_mask: allocation mask to use
*
* Allocate a new workqueue_attrs, initialize with default settings and
* return it.
*
* Return: The allocated new workqueue_attr on success. %NULL on failure.
*/
-struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
+static struct workqueue_attrs *alloc_workqueue_attrs(void)
{
struct workqueue_attrs *attrs;
- attrs = kzalloc(sizeof(*attrs), gfp_mask);
+ attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
if (!attrs)
goto fail;
- if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
+ if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL))
goto fail;
cpumask_copy(attrs->cpumask, cpu_possible_mask);
@@ -3410,7 +3415,7 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
*/
static int init_worker_pool(struct worker_pool *pool)
{
- spin_lock_init(&pool->lock);
+ raw_spin_lock_init(&pool->lock);
pool->id = -1;
pool->cpu = -1;
pool->node = NUMA_NO_NODE;
@@ -3431,7 +3436,7 @@ static int init_worker_pool(struct worker_pool *pool)
pool->refcnt = 1;
/* shouldn't fail above this point */
- pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ pool->attrs = alloc_workqueue_attrs();
if (!pool->attrs)
return -ENOMEM;
return 0;
@@ -3487,7 +3492,6 @@ static void rcu_free_wq(struct rcu_head *rcu)
else
free_workqueue_attrs(wq->unbound_attrs);
- kfree(wq->rescuer);
kfree(wq);
}
@@ -3536,15 +3540,15 @@ static void put_unbound_pool(struct worker_pool *pool)
* @pool's workers from blocking on attach_mutex. We're the last
* manager and @pool gets freed with the flag set.
*/
- spin_lock_irq(&pool->lock);
- wait_event_lock_irq(wq_manager_wait,
+ raw_spin_lock_irq(&pool->lock);
+ swait_event_lock_irq(wq_manager_wait,
!(pool->flags & POOL_MANAGER_ACTIVE), pool->lock);
pool->flags |= POOL_MANAGER_ACTIVE;
while ((worker = first_idle_worker(pool)))
destroy_worker(worker);
WARN_ON(pool->nr_workers || pool->nr_idle);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
mutex_lock(&wq_pool_attach_mutex);
if (!list_empty(&pool->workers))
@@ -3700,7 +3704,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
return;
/* this function can be called during early boot w/ irq disabled */
- spin_lock_irqsave(&pwq->pool->lock, flags);
+ raw_spin_lock_irqsave(&pwq->pool->lock, flags);
/*
* During [un]freezing, the caller is responsible for ensuring that
@@ -3723,7 +3727,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
pwq->max_active = 0;
}
- spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
}
/* initialize newly alloced @pwq which is associated with @wq and @pool */
@@ -3896,8 +3900,8 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);
- new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
- tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ new_attrs = alloc_workqueue_attrs();
+ tmp_attrs = alloc_workqueue_attrs();
if (!ctx || !new_attrs || !tmp_attrs)
goto out_free;
@@ -4033,7 +4037,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
*
* Return: 0 on success and -errno on failure.
*/
-int apply_workqueue_attrs(struct workqueue_struct *wq,
+static int apply_workqueue_attrs(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
int ret;
@@ -4044,7 +4048,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
return ret;
}
-EXPORT_SYMBOL_GPL(apply_workqueue_attrs);
/**
* wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
@@ -4122,9 +4125,9 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
use_dfl_pwq:
mutex_lock(&wq->mutex);
- spin_lock_irq(&wq->dfl_pwq->pool->lock);
+ raw_spin_lock_irq(&wq->dfl_pwq->pool->lock);
get_pwq(wq->dfl_pwq);
- spin_unlock_irq(&wq->dfl_pwq->pool->lock);
+ raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock);
old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
out_unlock:
mutex_unlock(&wq->mutex);
@@ -4242,7 +4245,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
return NULL;
if (flags & WQ_UNBOUND) {
- wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ wq->unbound_attrs = alloc_workqueue_attrs();
if (!wq->unbound_attrs)
goto err_free_wq;
}
@@ -4318,9 +4321,29 @@ void destroy_workqueue(struct workqueue_struct *wq)
struct pool_workqueue *pwq;
int node;
+ /*
+ * Remove it from sysfs first so that sanity check failure doesn't
+ * lead to sysfs name conflicts.
+ */
+ workqueue_sysfs_unregister(wq);
+
/* drain it before proceeding with destruction */
drain_workqueue(wq);
+ /* kill rescuer, if sanity checks fail, leave it w/o rescuer */
+ if (wq->rescuer) {
+ struct worker *rescuer = wq->rescuer;
+
+ /* this prevents new queueing */
+ raw_spin_lock_irq(&wq_mayday_lock);
+ wq->rescuer = NULL;
+ raw_spin_unlock_irq(&wq_mayday_lock);
+
+ /* rescuer will empty maydays list before exiting */
+ kthread_stop(rescuer->task);
+ kfree(rescuer);
+ }
+
/* sanity checks */
mutex_lock(&wq->mutex);
for_each_pwq(pwq, wq) {
@@ -4352,11 +4375,6 @@ void destroy_workqueue(struct workqueue_struct *wq)
list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
- workqueue_sysfs_unregister(wq);
-
- if (wq->rescuer)
- kthread_stop(wq->rescuer->task);
-
if (!(wq->flags & WQ_UNBOUND)) {
wq_unregister_lockdep(wq);
/*
@@ -4515,10 +4533,10 @@ unsigned int work_busy(struct work_struct *work)
rcu_read_lock();
pool = get_work_pool(work);
if (pool) {
- spin_lock_irqsave(&pool->lock, flags);
+ raw_spin_lock_irqsave(&pool->lock, flags);
if (find_worker_executing_work(pool, work))
ret |= WORK_BUSY_RUNNING;
- spin_unlock_irqrestore(&pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pool->lock, flags);
}
rcu_read_unlock();
@@ -4631,7 +4649,8 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_info(" pwq %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+ pr_cont(" active=%d/%d refcnt=%d%s\n",
+ pwq->nr_active, pwq->max_active, pwq->refcnt,
!list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
hash_for_each(pool->busy_hash, bkt, worker, hentry) {
@@ -4724,10 +4743,10 @@ void show_workqueue_state(void)
pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
for_each_pwq(pwq, wq) {
- spin_lock_irqsave(&pwq->pool->lock, flags);
+ raw_spin_lock_irqsave(&pwq->pool->lock, flags);
if (pwq->nr_active || !list_empty(&pwq->delayed_works))
show_pwq(pwq);
- spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
/*
* We could be printing a lot from atomic context, e.g.
* sysrq-t -> show_workqueue_state(). Avoid triggering
@@ -4741,7 +4760,7 @@ void show_workqueue_state(void)
struct worker *worker;
bool first = true;
- spin_lock_irqsave(&pool->lock, flags);
+ raw_spin_lock_irqsave(&pool->lock, flags);
if (pool->nr_workers == pool->nr_idle)
goto next_pool;
@@ -4760,7 +4779,7 @@ void show_workqueue_state(void)
}
pr_cont("\n");
next_pool:
- spin_unlock_irqrestore(&pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pool->lock, flags);
/*
* We could be printing a lot from atomic context, e.g.
* sysrq-t -> show_workqueue_state(). Avoid triggering
@@ -4790,7 +4809,7 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
struct worker_pool *pool = worker->pool;
if (pool) {
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* ->desc tracks information (wq name or
* set_worker_desc()) for the latest execution. If
@@ -4804,7 +4823,7 @@ void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
scnprintf(buf + off, size - off, "-%s",
worker->desc);
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
}
@@ -4835,7 +4854,7 @@ static void unbind_workers(int cpu)
for_each_cpu_worker_pool(pool, cpu) {
mutex_lock(&wq_pool_attach_mutex);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* We've blocked all attach/detach operations. Make all workers
@@ -4849,7 +4868,7 @@ static void unbind_workers(int cpu)
pool->flags |= POOL_DISASSOCIATED;
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
mutex_unlock(&wq_pool_attach_mutex);
/*
@@ -4875,9 +4894,9 @@ static void unbind_workers(int cpu)
* worker blocking could lead to lengthy stalls. Kick off
* unbound chain execution of currently pending work items.
*/
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
wake_up_worker(pool);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
}
@@ -4904,7 +4923,7 @@ static void rebind_workers(struct worker_pool *pool)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
pool->attrs->cpumask) < 0);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
pool->flags &= ~POOL_DISASSOCIATED;
@@ -4943,7 +4962,7 @@ static void rebind_workers(struct worker_pool *pool)
WRITE_ONCE(worker->flags, worker_flags);
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
/**
@@ -5395,7 +5414,7 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
lockdep_assert_held(&wq_pool_mutex);
- attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ attrs = alloc_workqueue_attrs();
if (!attrs)
return NULL;
@@ -5817,7 +5836,7 @@ static void __init wq_numa_init(void)
return;
}
- wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
+ wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs();
BUG_ON(!wq_update_unbound_numa_attrs_buf);
/*
@@ -5892,7 +5911,7 @@ int __init workqueue_init_early(void)
for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
struct workqueue_attrs *attrs;
- BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
+ BUG_ON(!(attrs = alloc_workqueue_attrs()));
attrs->nice = std_nice[i];
unbound_std_wq_attrs[i] = attrs;
@@ -5901,7 +5920,7 @@ int __init workqueue_init_early(void)
* guaranteed by max_active which is enforced by pwqs.
* Turn off NUMA so that dfl_pwq is used for all nodes.
*/
- BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
+ BUG_ON(!(attrs = alloc_workqueue_attrs()));
attrs->nice = std_nice[i];
attrs->no_numa = true;
ordered_wq_attrs[i] = attrs;