aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c95
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/auditfilter.c87
-rw-r--r--kernel/bpf/btf.c5
-rw-r--r--kernel/bpf/cgroup.c8
-rw-r--r--kernel/bpf/core.c2
-rw-r--r--kernel/bpf/cpumap.c2
-rw-r--r--kernel/bpf/hashtab.c12
-rw-r--r--kernel/bpf/inode.c3
-rw-r--r--kernel/bpf/offload.c2
-rw-r--r--kernel/bpf/stackmap.c7
-rw-r--r--kernel/bpf/syscall.c47
-rw-r--r--kernel/bpf/tnum.c9
-rw-r--r--kernel/bpf/verifier.c175
-rw-r--r--kernel/cgroup/cgroup-v1.c3
-rw-r--r--kernel/cgroup/cgroup.c80
-rw-r--r--kernel/cgroup/cpuset.c3
-rw-r--r--kernel/cgroup/freezer.c9
-rw-r--r--kernel/cgroup/pids.c11
-rw-r--r--kernel/cgroup/rstat.c16
-rw-r--r--kernel/cpu.c174
-rw-r--r--kernel/cpu_pm.c4
-rw-r--r--kernel/cred.c6
-rw-r--r--kernel/debug/debug_core.c9
-rw-r--r--kernel/debug/kdb/kdb_main.c8
-rw-r--r--kernel/dma/coherent.c27
-rw-r--r--kernel/dma/debug.c10
-rw-r--r--kernel/dma/direct.c5
-rw-r--r--kernel/events/core.c88
-rw-r--r--kernel/events/uprobes.c18
-rw-r--r--kernel/exit.c37
-rw-r--r--kernel/futex.c151
-rw-r--r--kernel/gcov/fs.c2
-rw-r--r--kernel/irq/debugfs.c12
-rw-r--r--kernel/irq/internals.h10
-rw-r--r--kernel/irq/irqdesc.c6
-rw-r--r--kernel/irq/irqdomain.c11
-rw-r--r--kernel/irq/manage.c70
-rw-r--r--kernel/irq/msi.c5
-rw-r--r--kernel/irq/proc.c22
-rw-r--r--kernel/irq/resend.c5
-rw-r--r--kernel/irq/timings.c74
-rw-r--r--kernel/kallsyms.c17
-rw-r--r--kernel/kmod.c4
-rw-r--r--kernel/kprobes.c129
-rw-r--r--kernel/locking/lockdep.c102
-rw-r--r--kernel/locking/locktorture.c8
-rw-r--r--kernel/locking/spinlock_debug.c32
-rw-r--r--kernel/module.c89
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/padata.c49
-rw-r--r--kernel/panic.c32
-rw-r--r--kernel/power/hibernate.c7
-rw-r--r--kernel/power/snapshot.c9
-rw-r--r--kernel/printk/internal.h5
-rw-r--r--kernel/printk/printk.c38
-rw-r--r--kernel/printk/printk_safe.c11
-rw-r--r--kernel/ptrace.c15
-rw-r--r--kernel/rcu/Kconfig.debug11
-rw-r--r--kernel/rcu/srcutree.c7
-rw-r--r--kernel/rcu/tree.c11
-rw-r--r--kernel/rcu/tree_exp.h32
-rw-r--r--kernel/rcu/tree_plugin.h11
-rw-r--r--kernel/rcu/update.c103
-rw-r--r--kernel/relay.c5
-rw-r--r--kernel/sched/core.c51
-rw-r--r--kernel/sched/cpufreq.c18
-rw-r--r--kernel/sched/cpufreq_schedutil.c8
-rw-r--r--kernel/sched/cputime.c6
-rw-r--r--kernel/sched/deadline.c1
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/fair.c421
-rw-r--r--kernel/sched/loadavg.c33
-rw-r--r--kernel/sched/psi.c8
-rw-r--r--kernel/sched/rt.c12
-rw-r--r--kernel/sched/sched.h17
-rw-r--r--kernel/sched/topology.c52
-rw-r--r--kernel/seccomp.c10
-rw-r--r--kernel/signal.c106
-rw-r--r--kernel/stacktrace.c6
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/taskstats.c30
-rw-r--r--kernel/time/alarmtimer.c28
-rw-r--r--kernel/time/clocksource.c11
-rw-r--r--kernel/time/hrtimer.c11
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/posix-clock.c31
-rw-r--r--kernel/time/sched_clock.c9
-rw-r--r--kernel/time/tick-sched.c14
-rw-r--r--kernel/time/time.c3
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/time/timer.c29
-rw-r--r--kernel/trace/Kconfig1
-rw-r--r--kernel/trace/blktrace.c213
-rw-r--r--kernel/trace/bpf_trace.c6
-rw-r--r--kernel/trace/ftrace.c41
-rw-r--r--kernel/trace/ftrace_internal.h8
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.c30
-rw-r--r--kernel/trace/trace.h29
-rw-r--r--kernel/trace/trace_events.c14
-rw-r--r--kernel/trace/trace_events_filter.c2
-rw-r--r--kernel/trace/trace_events_hist.c173
-rw-r--r--kernel/trace/trace_events_trigger.c36
-rw-r--r--kernel/trace/trace_hwlat.c5
-rw-r--r--kernel/trace/trace_kprobe.c29
-rw-r--r--kernel/trace/trace_sched_switch.c4
-rw-r--r--kernel/trace/trace_sched_wakeup.c4
-rw-r--r--kernel/trace/trace_stack.c5
-rw-r--r--kernel/trace/trace_stat.c31
-rw-r--r--kernel/trace/tracing_map.c4
-rw-r--r--kernel/umh.c11
-rw-r--r--kernel/watchdog.c10
-rw-r--r--kernel/workqueue.c56
115 files changed, 2368 insertions, 1332 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 486c968214d9..a664fe710289 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -879,7 +879,7 @@ main_queue:
return 0;
}
-int audit_send_list(void *_dest)
+int audit_send_list_thread(void *_dest)
{
struct audit_netlink_list *dest = _dest;
struct sk_buff *skb;
@@ -923,19 +923,30 @@ out_kfree_skb:
return NULL;
}
+static void audit_free_reply(struct audit_reply *reply)
+{
+ if (!reply)
+ return;
+
+ if (reply->skb)
+ kfree_skb(reply->skb);
+ if (reply->net)
+ put_net(reply->net);
+ kfree(reply);
+}
+
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
- struct sock *sk = audit_get_sk(reply->net);
audit_ctl_lock();
audit_ctl_unlock();
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
- netlink_unicast(sk, reply->skb, reply->portid, 0);
- put_net(reply->net);
- kfree(reply);
+ netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0);
+ reply->skb = NULL;
+ audit_free_reply(reply);
return 0;
}
@@ -949,35 +960,32 @@ static int audit_send_reply_thread(void *arg)
* @payload: payload data
* @size: payload size
*
- * Allocates an skb, builds the netlink message, and sends it to the port id.
- * No failure notifications.
+ * Allocates a skb, builds the netlink message, and sends it to the port id.
*/
static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
int multi, const void *payload, int size)
{
- struct net *net = sock_net(NETLINK_CB(request_skb).sk);
- struct sk_buff *skb;
struct task_struct *tsk;
- struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
- GFP_KERNEL);
+ struct audit_reply *reply;
+ reply = kzalloc(sizeof(*reply), GFP_KERNEL);
if (!reply)
return;
- skb = audit_make_reply(seq, type, done, multi, payload, size);
- if (!skb)
- goto out;
-
- reply->net = get_net(net);
+ reply->skb = audit_make_reply(seq, type, done, multi, payload, size);
+ if (!reply->skb)
+ goto err;
+ reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
reply->portid = NETLINK_CB(request_skb).portid;
- reply->skb = skb;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
- if (!IS_ERR(tsk))
- return;
- kfree_skb(skb);
-out:
- kfree(reply);
+ if (IS_ERR(tsk))
+ goto err;
+
+ return;
+
+err:
+ audit_free_reply(reply);
}
/*
@@ -1100,13 +1108,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
audit_log_end(ab);
}
-static int audit_set_feature(struct sk_buff *skb)
+static int audit_set_feature(struct audit_features *uaf)
{
- struct audit_features *uaf;
int i;
BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));
- uaf = nlmsg_data(nlmsg_hdr(skb));
/* if there is ever a version 2 we should handle that here */
@@ -1174,6 +1180,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
u32 seq;
void *data;
+ int data_len;
int err;
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
@@ -1187,6 +1194,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
seq = nlh->nlmsg_seq;
data = nlmsg_data(nlh);
+ data_len = nlmsg_len(nlh);
switch (msg_type) {
case AUDIT_GET: {
@@ -1210,7 +1218,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
struct audit_status s;
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
- memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
if (s.mask & AUDIT_STATUS_ENABLED) {
err = audit_set_enabled(s.enabled);
if (err < 0)
@@ -1314,7 +1322,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
break;
case AUDIT_SET_FEATURE:
- err = audit_set_feature(skb);
+ if (data_len < sizeof(struct audit_features))
+ return -EINVAL;
+ err = audit_set_feature(data);
if (err)
return err;
break;
@@ -1323,9 +1333,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
+ /* exit early if there isn't at least one character to print */
+ if (data_len < 2)
+ return -EINVAL;
err = audit_filter(msg_type, AUDIT_FILTER_USER);
if (err == 1) { /* match or error */
+ char *str = data;
+
err = 0;
if (msg_type == AUDIT_USER_TTY) {
err = tty_audit_push();
@@ -1333,26 +1348,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
}
audit_log_user_recv_msg(&ab, msg_type);
- if (msg_type != AUDIT_USER_TTY)
+ if (msg_type != AUDIT_USER_TTY) {
+ /* ensure NULL termination */
+ str[data_len - 1] = '\0';
audit_log_format(ab, " msg='%.*s'",
AUDIT_MESSAGE_TEXT_MAX,
- (char *)data);
- else {
- int size;
-
+ str);
+ } else {
audit_log_format(ab, " data=");
- size = nlmsg_len(nlh);
- if (size > 0 &&
- ((unsigned char *)data)[size - 1] == '\0')
- size--;
- audit_log_n_untrustedstring(ab, data, size);
+ if (data_len > 0 && str[data_len - 1] == '\0')
+ data_len--;
+ audit_log_n_untrustedstring(ab, str, data_len);
}
audit_log_end(ab);
}
break;
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
- if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
+ if (data_len < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
audit_log_common_recv_msg(audit_context(), &ab,
@@ -1364,7 +1377,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
audit_log_end(ab);
return -EPERM;
}
- err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh));
+ err = audit_rule_change(msg_type, seq, data, data_len);
break;
case AUDIT_LIST_RULES:
err = audit_list_rules_send(skb, seq);
@@ -1379,7 +1392,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_MAKE_EQUIV: {
void *bufp = data;
u32 sizes[2];
- size_t msglen = nlmsg_len(nlh);
+ size_t msglen = data_len;
char *old, *new;
err = -EINVAL;
@@ -1455,7 +1468,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
- memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
/* check if new data is valid */
if ((s.enabled != 0 && s.enabled != 1) ||
(s.log_passwd != 0 && s.log_passwd != 1))
diff --git a/kernel/audit.h b/kernel/audit.h
index 6c076d4982da..c018dcab4b11 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -229,7 +229,7 @@ struct audit_netlink_list {
struct sk_buff_head q;
};
-int audit_send_list(void *_dest);
+int audit_send_list_thread(void *_dest);
extern int selinux_audit_rule_update(void);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 1f31c2f1e6fc..4508d5e0cf69 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -351,12 +351,12 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
struct dentry *d = kern_path_locked(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
- inode_unlock(d_backing_inode(parent->dentry));
if (d_is_positive(d)) {
/* update watch filter fields */
watch->dev = d->d_sb->s_dev;
watch->ino = d_backing_inode(d)->i_ino;
}
+ inode_unlock(d_backing_inode(parent->dentry));
dput(d);
return 0;
}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 9f8e190e3bea..d26db3c9a015 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -439,6 +439,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
bufp = data->buf;
for (i = 0; i < data->field_count; i++) {
struct audit_field *f = &entry->rule.fields[i];
+ u32 f_val;
err = -EINVAL;
@@ -447,12 +448,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
goto exit_free;
f->type = data->fields[i];
- f->val = data->values[i];
+ f_val = data->values[i];
/* Support legacy tests for a valid loginuid */
- if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
+ if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) {
f->type = AUDIT_LOGINUID_SET;
- f->val = 0;
+ f_val = 0;
entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
}
@@ -468,7 +469,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_SUID:
case AUDIT_FSUID:
case AUDIT_OBJ_UID:
- f->uid = make_kuid(current_user_ns(), f->val);
+ f->uid = make_kuid(current_user_ns(), f_val);
if (!uid_valid(f->uid))
goto exit_free;
break;
@@ -477,11 +478,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_SGID:
case AUDIT_FSGID:
case AUDIT_OBJ_GID:
- f->gid = make_kgid(current_user_ns(), f->val);
+ f->gid = make_kgid(current_user_ns(), f_val);
if (!gid_valid(f->gid))
goto exit_free;
break;
case AUDIT_ARCH:
+ f->val = f_val;
entry->rule.arch_f = f;
break;
case AUDIT_SUBJ_USER:
@@ -494,11 +496,13 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_OBJ_TYPE:
case AUDIT_OBJ_LEV_LOW:
case AUDIT_OBJ_LEV_HIGH:
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
-
+ }
+ entry->rule.buflen += f_val;
+ f->lsm_str = str;
err = security_audit_rule_init(f->type, f->op, str,
(void **)&f->lsm_rule);
/* Keep currently invalid fields around in case they
@@ -507,68 +511,71 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
pr_warn("audit rule for LSM \'%s\' is invalid\n",
str);
err = 0;
- }
- if (err) {
- kfree(str);
+ } else if (err)
goto exit_free;
- } else
- f->lsm_str = str;
break;
case AUDIT_WATCH:
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
-
- err = audit_to_watch(&entry->rule, str, f->val, f->op);
+ }
+ err = audit_to_watch(&entry->rule, str, f_val, f->op);
if (err) {
kfree(str);
goto exit_free;
}
+ entry->rule.buflen += f_val;
break;
case AUDIT_DIR:
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
-
+ }
err = audit_make_tree(&entry->rule, str, f->op);
kfree(str);
if (err)
goto exit_free;
+ entry->rule.buflen += f_val;
break;
case AUDIT_INODE:
+ f->val = f_val;
err = audit_to_inode(&entry->rule, f);
if (err)
goto exit_free;
break;
case AUDIT_FILTERKEY:
- if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
+ if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN)
goto exit_free;
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
+ }
+ entry->rule.buflen += f_val;
entry->rule.filterkey = str;
break;
case AUDIT_EXE:
- if (entry->rule.exe || f->val > PATH_MAX)
+ if (entry->rule.exe || f_val > PATH_MAX)
goto exit_free;
- str = audit_unpack_string(&bufp, &remain, f->val);
+ str = audit_unpack_string(&bufp, &remain, f_val);
if (IS_ERR(str)) {
err = PTR_ERR(str);
goto exit_free;
}
- entry->rule.buflen += f->val;
-
- audit_mark = audit_alloc_mark(&entry->rule, str, f->val);
+ audit_mark = audit_alloc_mark(&entry->rule, str, f_val);
if (IS_ERR(audit_mark)) {
kfree(str);
err = PTR_ERR(audit_mark);
goto exit_free;
}
+ entry->rule.buflen += f_val;
entry->rule.exe = audit_mark;
break;
+ default:
+ f->val = f_val;
+ break;
}
}
@@ -1137,11 +1144,8 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz)
*/
int audit_list_rules_send(struct sk_buff *request_skb, int seq)
{
- u32 portid = NETLINK_CB(request_skb).portid;
- struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct task_struct *tsk;
struct audit_netlink_list *dest;
- int err = 0;
/* We can't just spew out the rules here because we might fill
* the available socket buffer space and deadlock waiting for
@@ -1149,25 +1153,26 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq)
* happen if we're actually running in the context of auditctl
* trying to _send_ the stuff */
- dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
+ dest = kmalloc(sizeof(*dest), GFP_KERNEL);
if (!dest)
return -ENOMEM;
- dest->net = get_net(net);
- dest->portid = portid;
+ dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
+ dest->portid = NETLINK_CB(request_skb).portid;
skb_queue_head_init(&dest->q);
mutex_lock(&audit_filter_mutex);
audit_list_rules(seq, &dest->q);
mutex_unlock(&audit_filter_mutex);
- tsk = kthread_run(audit_send_list, dest, "audit_send_list");
+ tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list");
if (IS_ERR(tsk)) {
skb_queue_purge(&dest->q);
+ put_net(dest->net);
kfree(dest);
- err = PTR_ERR(tsk);
+ return PTR_ERR(tsk);
}
- return err;
+ return 0;
}
int audit_comparator(u32 left, u32 op, u32 right)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 5fcc7a17eb5a..6d37931145d8 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2309,7 +2309,7 @@ static int btf_enum_check_member(struct btf_verifier_env *env,
struct_size = struct_type->size;
bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
- if (struct_size - bytes_offset < sizeof(int)) {
+ if (struct_size - bytes_offset < member_type->size) {
btf_verifier_log_member(env, struct_type, member,
"Member exceeds struct_size");
return -EINVAL;
@@ -3449,7 +3449,7 @@ int btf_get_info_by_fd(const struct btf *btf,
union bpf_attr __user *uattr)
{
struct bpf_btf_info __user *uinfo;
- struct bpf_btf_info info = {};
+ struct bpf_btf_info info;
u32 info_copy, btf_copy;
void __user *ubtf;
u32 uinfo_len;
@@ -3458,6 +3458,7 @@ int btf_get_info_by_fd(const struct btf *btf,
uinfo_len = attr->info.info_len;
info_copy = min_t(u32, uinfo_len, sizeof(info));
+ memset(&info, 0, sizeof(info));
if (copy_from_user(&info, uinfo, info_copy))
return -EFAULT;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 92a7d0cf8d13..b6ad8cb50efc 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -239,8 +239,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
{
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
- struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
- *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
+ struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
enum bpf_cgroup_storage_type stype;
struct bpf_prog_list *pl;
bool pl_was_allocated;
@@ -1046,12 +1046,12 @@ static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
return false;
switch (off) {
- case offsetof(struct bpf_sysctl, write):
+ case bpf_ctx_range(struct bpf_sysctl, write):
if (type != BPF_READ)
return false;
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
- case offsetof(struct bpf_sysctl, file_pos):
+ case bpf_ctx_range(struct bpf_sysctl, file_pos):
if (type == BPF_READ) {
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ceee0730fba5..fe8a72dc3a47 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -502,7 +502,7 @@ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false));
}
-void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
+static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
int i;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8ebd0fa826f8..ae5de038f4a6 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -476,7 +476,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EOVERFLOW;
/* Make sure CPU is a valid possible cpu */
- if (!cpu_possible(key_cpu))
+ if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu))
return -ENODEV;
if (qsize == 0) {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 583df5cb302d..09ba58465758 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -679,15 +679,20 @@ static void htab_elem_free_rcu(struct rcu_head *head)
preempt_enable();
}
-static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
{
struct bpf_map *map = &htab->map;
+ void *ptr;
if (map->ops->map_fd_put_ptr) {
- void *ptr = fd_htab_map_get_ptr(map, l);
-
+ ptr = fd_htab_map_get_ptr(map, l);
map->ops->map_fd_put_ptr(ptr);
}
+}
+
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+ htab_put_fd_value(htab, l);
if (htab_is_prealloc(htab)) {
__pcpu_freelist_push(&htab->freelist, &l->fnode);
@@ -739,6 +744,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
*/
pl_new = this_cpu_ptr(htab->extra_elems);
l_new = *pl_new;
+ htab_put_fd_value(htab, old_elem);
*pl_new = old_elem;
} else {
struct pcpu_freelist_node *l;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index cc0d0cf114e3..482e259ffcfb 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -195,6 +195,7 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
void *key = map_iter(m)->key;
void *prev_key;
+ (*pos)++;
if (map_iter(m)->done)
return NULL;
@@ -207,8 +208,6 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
map_iter(m)->done = true;
return NULL;
}
-
- ++(*pos);
return key;
}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index ba635209ae9a..1853aa420f2f 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -321,7 +321,7 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
ulen = info->jited_prog_len;
info->jited_prog_len = aux->offload->jited_len;
- if (info->jited_prog_len & ulen) {
+ if (info->jited_prog_len && ulen) {
uinsns = u64_to_user_ptr(info->jited_prog_insns);
ulen = min_t(u32, info->jited_prog_len, ulen);
if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index d38e49f943a1..d41d4af5e3f3 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -289,7 +289,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
bool irq_work_busy = false;
struct stack_map_irq_work *work = NULL;
- if (in_nmi()) {
+ if (irqs_disabled()) {
work = this_cpu_ptr(&up_read_work);
if (work->irq_work.flags & IRQ_WORK_BUSY)
/* cannot queue more up_read, fallback */
@@ -297,8 +297,9 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
}
/*
- * We cannot do up_read() in nmi context. To do build_id lookup
- * in nmi context, we need to run up_read() in irq_work. We use
+ * We cannot do up_read() when the irq is disabled, because of
+ * risk to deadlock with rq_lock. To do build_id lookup when the
+ * irqs are disabled, we need to run up_read() in irq_work. We use
* a percpu variable to do the irq_work. If the irq_work is
* already used by another lookup, we fall back to report ips.
*
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d2146277071f..e1cc2d1f4003 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1102,7 +1102,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
+ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
@@ -1130,8 +1131,10 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
if (err)
goto free_value;
- if (copy_to_user(uvalue, value, value_size) != 0)
+ if (copy_to_user(uvalue, value, value_size) != 0) {
+ err = -EFAULT;
goto free_value;
+ }
err = 0;
@@ -1313,24 +1316,32 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+ kvfree(aux->func_info);
free_used_maps(aux);
bpf_prog_uncharge_memlock(aux->prog);
security_bpf_prog_free(aux);
bpf_prog_free(aux->prog);
}
+static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
+{
+ bpf_prog_kallsyms_del_all(prog);
+ btf_put(prog->aux->btf);
+ bpf_prog_free_linfo(prog);
+
+ if (deferred)
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+ else
+ __bpf_prog_put_rcu(&prog->aux->rcu);
+}
+
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
- bpf_prog_kallsyms_del_all(prog);
- btf_put(prog->aux->btf);
- kvfree(prog->aux->func_info);
- bpf_prog_free_linfo(prog);
-
- call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+ __bpf_prog_put_noref(prog, true);
}
}
@@ -1709,11 +1720,12 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
return err;
free_used_maps:
- bpf_prog_free_linfo(prog);
- kvfree(prog->aux->func_info);
- btf_put(prog->aux->btf);
- bpf_prog_kallsyms_del_subprogs(prog);
- free_used_maps(prog->aux);
+ /* In case we have subprogs, we need to wait for a grace
+ * period before we can tear down JIT memory since symbols
+ * are already exposed under kallsyms.
+ */
+ __bpf_prog_put_noref(prog, prog->aux->func_cnt);
+ return err;
free_prog:
bpf_prog_uncharge_memlock(prog);
free_prog_sec:
@@ -2266,7 +2278,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
union bpf_attr __user *uattr)
{
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
- struct bpf_prog_info info = {};
+ struct bpf_prog_info info;
u32 info_len = attr->info.info_len;
struct bpf_prog_stats stats;
char __user *uinsns;
@@ -2278,6 +2290,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
return err;
info_len = min_t(u32, sizeof(info), info_len);
+ memset(&info, 0, sizeof(info));
if (copy_from_user(&info, uinfo, info_len))
return -EFAULT;
@@ -2541,7 +2554,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
union bpf_attr __user *uattr)
{
struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
- struct bpf_map_info info = {};
+ struct bpf_map_info info;
u32 info_len = attr->info.info_len;
int err;
@@ -2550,6 +2563,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
return err;
info_len = min_t(u32, sizeof(info), info_len);
+ memset(&info, 0, sizeof(info));
info.type = map->map_type;
info.id = map->id;
info.key_size = map->key_size;
@@ -2777,7 +2791,7 @@ out:
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
- union bpf_attr attr = {};
+ union bpf_attr attr;
int err;
if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
@@ -2789,6 +2803,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
size = min_t(u32, size, sizeof(attr));
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
+ memset(&attr, 0, sizeof(attr));
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index ca52b9642943..d4f335a9a899 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -44,14 +44,19 @@ struct tnum tnum_rshift(struct tnum a, u8 shift)
return TNUM(a.value >> shift, a.mask >> shift);
}
-struct tnum tnum_arshift(struct tnum a, u8 min_shift)
+struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness)
{
/* if a.value is negative, arithmetic shifting by minimum shift
* will have larger negative offset compared to more shifting.
* If a.value is nonnegative, arithmetic shifting by minimum shift
* will have larger positive offset compare to more shifting.
*/
- return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift);
+ if (insn_bitness == 32)
+ return TNUM((u32)(((s32)a.value) >> min_shift),
+ (u32)(((s32)a.mask) >> min_shift));
+ else
+ return TNUM((s64)a.value >> min_shift,
+ (s64)a.mask >> min_shift);
}
struct tnum tnum_add(struct tnum a, struct tnum b)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 11528bdaa9dc..187ea0cfbcca 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -201,8 +201,7 @@ struct bpf_call_arg_meta {
bool pkt_access;
int regno;
int access_size;
- s64 msize_smax_value;
- u64 msize_umax_value;
+ u64 msize_max_value;
int ref_obj_id;
int func_id;
};
@@ -2657,8 +2656,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
/* remember the mem_size which may be used later
* to refine return values.
*/
- meta->msize_smax_value = reg->smax_value;
- meta->msize_umax_value = reg->umax_value;
+ meta->msize_max_value = reg->umax_value;
/* The register is SCALAR_VALUE; the access check
* happens using its boundaries.
@@ -3141,21 +3139,44 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return 0;
}
-static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
- int func_id,
- struct bpf_call_arg_meta *meta)
+static int do_refine_retval_range(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, int ret_type,
+ int func_id, struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
+ struct bpf_reg_state tmp_reg = *ret_reg;
+ bool ret;
if (ret_type != RET_INTEGER ||
(func_id != BPF_FUNC_get_stack &&
func_id != BPF_FUNC_probe_read_str))
- return;
+ return 0;
+
+ /* Error case where ret is in interval [S32MIN, -1]. */
+ ret_reg->smin_value = S32_MIN;
+ ret_reg->smax_value = -1;
+
+ __reg_deduce_bounds(ret_reg);
+ __reg_bound_offset(ret_reg);
+ __update_reg_bounds(ret_reg);
+
+ ret = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (!ret)
+ return -EFAULT;
+
+ *ret_reg = tmp_reg;
+
+ /* Success case where ret is in range [0, msize_max_value]. */
+ ret_reg->smin_value = 0;
+ ret_reg->smax_value = meta->msize_max_value;
+ ret_reg->umin_value = ret_reg->smin_value;
+ ret_reg->umax_value = ret_reg->smax_value;
- ret_reg->smax_value = meta->msize_smax_value;
- ret_reg->umax_value = meta->msize_umax_value;
__reg_deduce_bounds(ret_reg);
__reg_bound_offset(ret_reg);
+ __update_reg_bounds(ret_reg);
+
+ return 0;
}
static int
@@ -3384,7 +3405,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
regs[BPF_REG_0].ref_obj_id = id;
}
- do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+ err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta);
+ if (err)
+ return err;
err = check_map_func_compatibility(env, meta.map_ptr, func_id);
if (err)
@@ -4084,9 +4107,16 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* Upon reaching here, src_known is true and
* umax_val is equal to umin_val.
*/
- dst_reg->smin_value >>= umin_val;
- dst_reg->smax_value >>= umin_val;
- dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val);
+ if (insn_bitness == 32) {
+ dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val);
+ dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val);
+ } else {
+ dst_reg->smin_value >>= umin_val;
+ dst_reg->smax_value >>= umin_val;
+ }
+
+ dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val,
+ insn_bitness);
/* blow away the dst_reg umin_value/umax_value and rely on
* dst_reg var_off to refine the result.
@@ -4580,6 +4610,70 @@ static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg)
reg->smax_value <= 0 && reg->smin_value >= S32_MIN);
}
+/* Constrain the possible values of @reg with unsigned upper bound @bound.
+ * If @is_exclusive, @bound is an exclusive limit, otherwise it is inclusive.
+ * If @is_jmp32, @bound is a 32-bit value that only constrains the low 32 bits
+ * of @reg.
+ */
+static void set_upper_bound(struct bpf_reg_state *reg, u64 bound, bool is_jmp32,
+ bool is_exclusive)
+{
+ if (is_exclusive) {
+ /* There are no values for `reg` that make `reg<0` true. */
+ if (bound == 0)
+ return;
+ bound--;
+ }
+ if (is_jmp32) {
+ /* Constrain the register's value in the tnum representation.
+ * For 64-bit comparisons this happens later in
+ * __reg_bound_offset(), but for 32-bit comparisons, we can be
+ * more precise than what can be derived from the updated
+ * numeric bounds.
+ */
+ struct tnum t = tnum_range(0, bound);
+
+ t.mask |= ~0xffffffffULL; /* upper half is unknown */
+ reg->var_off = tnum_intersect(reg->var_off, t);
+
+ /* Compute the 64-bit bound from the 32-bit bound. */
+ bound += gen_hi_max(reg->var_off);
+ }
+ reg->umax_value = min(reg->umax_value, bound);
+}
+
+/* Constrain the possible values of @reg with unsigned lower bound @bound.
+ * If @is_exclusive, @bound is an exclusive limit, otherwise it is inclusive.
+ * If @is_jmp32, @bound is a 32-bit value that only constrains the low 32 bits
+ * of @reg.
+ */
+static void set_lower_bound(struct bpf_reg_state *reg, u64 bound, bool is_jmp32,
+ bool is_exclusive)
+{
+ if (is_exclusive) {
+ /* There are no values for `reg` that make `reg>MAX` true. */
+ if (bound == (is_jmp32 ? U32_MAX : U64_MAX))
+ return;
+ bound++;
+ }
+ if (is_jmp32) {
+ /* Constrain the register's value in the tnum representation.
+ * For 64-bit comparisons this happens later in
+ * __reg_bound_offset(), but for 32-bit comparisons, we can be
+ * more precise than what can be derived from the updated
+ * numeric bounds.
+ */
+ struct tnum t = tnum_range(bound, U32_MAX);
+
+ t.mask |= ~0xffffffffULL; /* upper half is unknown */
+ reg->var_off = tnum_intersect(reg->var_off, t);
+
+ /* Compute the 64-bit bound from the 32-bit bound. */
+ bound += gen_hi_min(reg->var_off);
+ }
+ reg->umin_value = max(reg->umin_value, bound);
+}
+
/* Adjusts the register min/max values in the case that the dst_reg is the
* variable register that we are working on, and src_reg is a constant or we're
* simply doing a BPF_K check.
@@ -4635,15 +4729,8 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
case BPF_JGE:
case BPF_JGT:
{
- u64 false_umax = opcode == BPF_JGT ? val : val - 1;
- u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
-
- if (is_jmp32) {
- false_umax += gen_hi_max(false_reg->var_off);
- true_umin += gen_hi_min(true_reg->var_off);
- }
- false_reg->umax_value = min(false_reg->umax_value, false_umax);
- true_reg->umin_value = max(true_reg->umin_value, true_umin);
+ set_upper_bound(false_reg, val, is_jmp32, opcode == BPF_JGE);
+ set_lower_bound(true_reg, val, is_jmp32, opcode == BPF_JGT);
break;
}
case BPF_JSGE:
@@ -4664,15 +4751,8 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
case BPF_JLE:
case BPF_JLT:
{
- u64 false_umin = opcode == BPF_JLT ? val : val + 1;
- u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
-
- if (is_jmp32) {
- false_umin += gen_hi_min(false_reg->var_off);
- true_umax += gen_hi_max(true_reg->var_off);
- }
- false_reg->umin_value = max(false_reg->umin_value, false_umin);
- true_reg->umax_value = min(true_reg->umax_value, true_umax);
+ set_lower_bound(false_reg, val, is_jmp32, opcode == BPF_JLE);
+ set_upper_bound(true_reg, val, is_jmp32, opcode == BPF_JLT);
break;
}
case BPF_JSLE:
@@ -4747,15 +4827,8 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
case BPF_JGE:
case BPF_JGT:
{
- u64 false_umin = opcode == BPF_JGT ? val : val + 1;
- u64 true_umax = opcode == BPF_JGT ? val - 1 : val;
-
- if (is_jmp32) {
- false_umin += gen_hi_min(false_reg->var_off);
- true_umax += gen_hi_max(true_reg->var_off);
- }
- false_reg->umin_value = max(false_reg->umin_value, false_umin);
- true_reg->umax_value = min(true_reg->umax_value, true_umax);
+ set_lower_bound(false_reg, val, is_jmp32, opcode == BPF_JGE);
+ set_upper_bound(true_reg, val, is_jmp32, opcode == BPF_JGT);
break;
}
case BPF_JSGE:
@@ -4773,15 +4846,8 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
case BPF_JLE:
case BPF_JLT:
{
- u64 false_umax = opcode == BPF_JLT ? val : val - 1;
- u64 true_umin = opcode == BPF_JLT ? val + 1 : val;
-
- if (is_jmp32) {
- false_umax += gen_hi_max(false_reg->var_off);
- true_umin += gen_hi_min(true_reg->var_off);
- }
- false_reg->umax_value = min(false_reg->umax_value, false_umax);
- true_reg->umin_value = max(true_reg->umin_value, true_umin);
+ set_upper_bound(false_reg, val, is_jmp32, opcode == BPF_JLE);
+ set_lower_bound(true_reg, val, is_jmp32, opcode == BPF_JLT);
break;
}
case BPF_JSLE:
@@ -5268,6 +5334,7 @@ static bool may_access_skb(enum bpf_prog_type type)
static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
struct bpf_reg_state *regs = cur_regs(env);
+ static const int ctx_reg = BPF_REG_6;
u8 mode = BPF_MODE(insn->code);
int i, err;
@@ -5301,7 +5368,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check whether implicit source operand (register R6) is readable */
- err = check_reg_arg(env, BPF_REG_6, SRC_OP);
+ err = check_reg_arg(env, ctx_reg, SRC_OP);
if (err)
return err;
@@ -5320,7 +5387,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- if (regs[BPF_REG_6].type != PTR_TO_CTX) {
+ if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
return -EINVAL;
@@ -5333,6 +5400,10 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
+ err = check_ctx_reg(env, &regs[ctx_reg], ctx_reg);
+ if (err < 0)
+ return err;
+
/* reset caller saved regs to unreadable */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 88006be40ea3..9dc41608b013 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -492,6 +492,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
*/
p++;
if (p >= end) {
+ (*pos)++;
return NULL;
} else {
*pos = *p;
@@ -802,7 +803,7 @@ void cgroup1_release_agent(struct work_struct *work)
pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
- if (!pathbuf || !agentbuf)
+ if (!pathbuf || !agentbuf || !strlen(agentbuf))
goto out;
spin_lock_irq(&css_set_lock);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d2cba714d3ee..fb24e7424411 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2119,11 +2119,12 @@ int cgroup_do_get_tree(struct fs_context *fc)
nsdentry = kernfs_node_dentry(cgrp->kn, sb);
dput(fc->root);
- fc->root = nsdentry;
if (IS_ERR(nsdentry)) {
- ret = PTR_ERR(nsdentry);
deactivate_locked_super(sb);
+ ret = PTR_ERR(nsdentry);
+ nsdentry = NULL;
}
+ fc->root = nsdentry;
}
if (!ctx->kfc.new_sb_created)
@@ -3067,8 +3068,6 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp)
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
- WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
-
if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
continue;
@@ -3078,6 +3077,8 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp)
return PTR_ERR(css);
}
+ WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
+
if (css_visible(css)) {
ret = css_populate_dir(css);
if (ret)
@@ -3113,11 +3114,11 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
- WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
-
if (!css)
continue;
+ WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
+
if (css->parent &&
!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
kill_css(css);
@@ -3404,7 +3405,8 @@ static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
if (strcmp(strstrip(buf), "threaded"))
return -EINVAL;
- cgrp = cgroup_kn_lock_live(of->kn, false);
+ /* drain dying csses before we re-apply (threaded) subtree control */
+ cgrp = cgroup_kn_lock_live(of->kn, true);
if (!cgrp)
return -ENOENT;
@@ -4415,12 +4417,16 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
}
} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
- if (!list_empty(&cset->tasks))
+ if (!list_empty(&cset->tasks)) {
it->task_pos = cset->tasks.next;
- else if (!list_empty(&cset->mg_tasks))
+ it->cur_tasks_head = &cset->tasks;
+ } else if (!list_empty(&cset->mg_tasks)) {
it->task_pos = cset->mg_tasks.next;
- else
+ it->cur_tasks_head = &cset->mg_tasks;
+ } else {
it->task_pos = cset->dying_tasks.next;
+ it->cur_tasks_head = &cset->dying_tasks;
+ }
it->tasks_head = &cset->tasks;
it->mg_tasks_head = &cset->mg_tasks;
@@ -4478,10 +4484,14 @@ repeat:
else
it->task_pos = it->task_pos->next;
- if (it->task_pos == it->tasks_head)
+ if (it->task_pos == it->tasks_head) {
it->task_pos = it->mg_tasks_head->next;
- if (it->task_pos == it->mg_tasks_head)
+ it->cur_tasks_head = it->mg_tasks_head;
+ }
+ if (it->task_pos == it->mg_tasks_head) {
it->task_pos = it->dying_tasks_head->next;
+ it->cur_tasks_head = it->dying_tasks_head;
+ }
if (it->task_pos == it->dying_tasks_head)
css_task_iter_advance_css_set(it);
} else {
@@ -4500,11 +4510,12 @@ repeat:
goto repeat;
/* and dying leaders w/o live member threads */
- if (!atomic_read(&task->signal->live))
+ if (it->cur_tasks_head == it->dying_tasks_head &&
+ !atomic_read(&task->signal->live))
goto repeat;
} else {
/* skip all dying ones */
- if (task->flags & PF_EXITING)
+ if (it->cur_tasks_head == it->dying_tasks_head)
goto repeat;
}
}
@@ -4613,6 +4624,9 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
struct kernfs_open_file *of = s->private;
struct css_task_iter *it = of->priv;
+ if (pos)
+ (*pos)++;
+
return css_task_iter_next(it);
}
@@ -4628,7 +4642,7 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
* from position 0, so we can simply keep iterating on !0 *pos.
*/
if (!it) {
- if (WARN_ON_ONCE((*pos)++))
+ if (WARN_ON_ONCE((*pos)))
return ERR_PTR(-EINVAL);
it = kzalloc(sizeof(*it), GFP_KERNEL);
@@ -4636,10 +4650,11 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
return ERR_PTR(-ENOMEM);
of->priv = it;
css_task_iter_start(&cgrp->self, iter_flags, it);
- } else if (!(*pos)++) {
+ } else if (!(*pos)) {
css_task_iter_end(it);
css_task_iter_start(&cgrp->self, iter_flags, it);
- }
+ } else
+ return it->cur_task;
return cgroup_procs_next(s, NULL, NULL);
}
@@ -6276,19 +6291,14 @@ void cgroup_sk_alloc_disable(void)
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- if (cgroup_sk_alloc_disabled)
+ if (cgroup_sk_alloc_disabled) {
+ skcd->no_refcnt = 1;
return;
+ }
- /* Socket clone path */
- if (skcd->val) {
- /*
- * We might be cloning a socket which is left in an empty
- * cgroup and the cgroup might have already been rmdir'd.
- * Don't use cgroup_get_live().
- */
- cgroup_get(sock_cgroup_ptr(skcd));
+ /* Don't associate the sock with unrelated interrupted task's cgroup. */
+ if (in_interrupt())
return;
- }
rcu_read_lock();
@@ -6306,8 +6316,24 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
rcu_read_unlock();
}
+void cgroup_sk_clone(struct sock_cgroup_data *skcd)
+{
+ /* Socket clone path */
+ if (skcd->val) {
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(sock_cgroup_ptr(skcd));
+ }
+}
+
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
+ if (skcd->no_refcnt)
+ return;
+
cgroup_put(sock_cgroup_ptr(skcd));
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 515525ff1cfd..e843e6351561 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -839,7 +839,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
continue;
- if (is_sched_load_balance(cp))
+ if (is_sched_load_balance(cp) &&
+ !cpumask_empty(cp->effective_cpus))
csa[csn++] = cp;
/* skip @cp's subtree if not a partition root */
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 8cf010680678..3984dd6b8ddb 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -231,6 +231,15 @@ void cgroup_freezer_migrate_task(struct task_struct *task,
return;
/*
+ * It's not necessary to do changes if both of the src and dst cgroups
+ * are not freezing and task is not frozen.
+ */
+ if (!test_bit(CGRP_FREEZE, &src->flags) &&
+ !test_bit(CGRP_FREEZE, &dst->flags) &&
+ !task->frozen)
+ return;
+
+ /*
* Adjust counters of freezing and frozen tasks.
* Note, that if the task is frozen, but the destination cgroup is not
* frozen, we bump both counters to keep them balanced.
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 8e513a573fe9..138059eb730d 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -45,7 +45,7 @@ struct pids_cgroup {
* %PIDS_MAX = (%PID_MAX_LIMIT + 1).
*/
atomic64_t counter;
- int64_t limit;
+ atomic64_t limit;
/* Handle for "pids.events" */
struct cgroup_file events_file;
@@ -73,8 +73,8 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
if (!pids)
return ERR_PTR(-ENOMEM);
- pids->limit = PIDS_MAX;
atomic64_set(&pids->counter, 0);
+ atomic64_set(&pids->limit, PIDS_MAX);
atomic64_set(&pids->events_limit, 0);
return &pids->css;
}
@@ -146,13 +146,14 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
for (p = pids; parent_pids(p); p = parent_pids(p)) {
int64_t new = atomic64_add_return(num, &p->counter);
+ int64_t limit = atomic64_read(&p->limit);
/*
* Since new is capped to the maximum number of pid_t, if
* p->limit is %PIDS_MAX then we know that this test will never
* fail.
*/
- if (new > p->limit)
+ if (new > limit)
goto revert;
}
@@ -277,7 +278,7 @@ set_limit:
* Limit updates don't need to be mutex'd, since it isn't
* critical that any racing fork()s follow the new limit.
*/
- pids->limit = limit;
+ atomic64_set(&pids->limit, limit);
return nbytes;
}
@@ -285,7 +286,7 @@ static int pids_max_show(struct seq_file *sf, void *v)
{
struct cgroup_subsys_state *css = seq_css(sf);
struct pids_cgroup *pids = css_pids(css);
- int64_t limit = pids->limit;
+ int64_t limit = atomic64_read(&pids->limit);
if (limit >= PIDS_MAX)
seq_printf(sf, "%s\n", PIDS_MAX_STR);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index ca19b4c8acf5..4a942d4e9763 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -33,12 +33,9 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
return;
/*
- * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we
- * see NULL updated_next or they see our updated stat.
- */
- smp_mb();
-
- /*
+ * Speculative already-on-list test. This may race leading to
+ * temporary inaccuracies, which is fine.
+ *
* Because @parent's updated_children is terminated with @parent
* instead of NULL, we can tell whether @cgrp is on the list by
* testing the next pointer for NULL.
@@ -134,13 +131,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
*nextp = rstatc->updated_next;
rstatc->updated_next = NULL;
- /*
- * Paired with the one in cgroup_rstat_cpu_updated().
- * Either they see NULL updated_next or we see their
- * updated stat.
- */
- smp_mb();
-
return pos;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 44fe5c7cfc07..0e483a4aaf84 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3,6 +3,7 @@
*
* This code is licenced under the GPL.
*/
+#include <linux/sched/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
@@ -515,8 +516,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
if (WARN_ON_ONCE((!cpu_online(cpu))))
return -ECANCELED;
- /* Unpark the stopper thread and the hotplug thread of the target cpu */
- stop_machine_unpark(cpu);
+ /* Unpark the hotplug thread of the target cpu */
kthread_unpark(st->thread);
/*
@@ -555,6 +555,21 @@ static int bringup_cpu(unsigned int cpu)
return bringup_wait_for_ap(cpu);
}
+static int finish_cpu(unsigned int cpu)
+{
+ struct task_struct *idle = idle_thread_get(cpu);
+ struct mm_struct *mm = idle->active_mm;
+
+ /*
+ * idle_task_exit() will have switched to &init_mm, now
+ * clean up any remaining active_mm state.
+ */
+ if (mm != &init_mm)
+ idle->active_mm = &init_mm;
+ mmdrop(mm);
+ return 0;
+}
+
/*
* Hotplug state machine related functions
*/
@@ -1079,8 +1094,8 @@ void notify_cpu_starting(unsigned int cpu)
/*
* Called from the idle task. Wake up the controlling task which brings the
- * stopper and the hotplug thread of the upcoming CPU up and then delegates
- * the rest of the online bringup to the hotplug thread.
+ * hotplug thread of the upcoming CPU up and then delegates the rest of the
+ * online bringup to the hotplug thread.
*/
void cpuhp_online_idle(enum cpuhp_state state)
{
@@ -1090,6 +1105,12 @@ void cpuhp_online_idle(enum cpuhp_state state)
if (state != CPUHP_AP_ONLINE_IDLE)
return;
+ /*
+ * Unpart the stopper thread before we start the idle loop (and start
+ * scheduling); this ensures the stopper task is always available.
+ */
+ stop_machine_unpark(smp_processor_id());
+
st->state = CPUHP_AP_ONLINE_IDLE;
complete_ap_thread(st, true);
}
@@ -1412,7 +1433,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
.startup.single = bringup_cpu,
- .teardown.single = NULL,
+ .teardown.single = finish_cpu,
.cant_stop = true,
},
/* Final state before CPU kills itself */
@@ -1892,6 +1913,78 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
}
EXPORT_SYMBOL(__cpuhp_remove_state);
+#ifdef CONFIG_HOTPLUG_SMT
+static void cpuhp_offline_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = true;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+}
+
+static void cpuhp_online_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = false;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+}
+
+int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ for_each_online_cpu(cpu) {
+ if (topology_is_primary_thread(cpu))
+ continue;
+ ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
+ if (ret)
+ break;
+ /*
+ * As this needs to hold the cpu maps lock it's impossible
+ * to call device_offline() because that ends up calling
+ * cpu_down() which takes cpu maps lock. cpu maps lock
+ * needs to be held as this might race against in kernel
+ * abusers of the hotplug machinery (thermal management).
+ *
+ * So nothing would update device:offline state. That would
+ * leave the sysfs entry stale and prevent onlining after
+ * smt control has been changed to 'off' again. This is
+ * called under the sysfs hotplug lock, so it is properly
+ * serialized against the regular offline usage.
+ */
+ cpuhp_offline_cpu_device(cpu);
+ }
+ if (!ret)
+ cpu_smt_control = ctrlval;
+ cpu_maps_update_done();
+ return ret;
+}
+
+int cpuhp_smt_enable(void)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ cpu_smt_control = CPU_SMT_ENABLED;
+ for_each_present_cpu(cpu) {
+ /* Skip online CPUs and CPUs on offline nodes */
+ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
+ continue;
+ ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
+ if (ret)
+ break;
+ /* See comment in cpuhp_smt_disable() */
+ cpuhp_online_cpu_device(cpu);
+ }
+ cpu_maps_update_done();
+ return ret;
+}
+#endif
+
#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
static ssize_t show_cpuhp_state(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -2046,77 +2139,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
#ifdef CONFIG_HOTPLUG_SMT
-static void cpuhp_offline_cpu_device(unsigned int cpu)
-{
- struct device *dev = get_cpu_device(cpu);
-
- dev->offline = true;
- /* Tell user space about the state change */
- kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
-}
-
-static void cpuhp_online_cpu_device(unsigned int cpu)
-{
- struct device *dev = get_cpu_device(cpu);
-
- dev->offline = false;
- /* Tell user space about the state change */
- kobject_uevent(&dev->kobj, KOBJ_ONLINE);
-}
-
-int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
-{
- int cpu, ret = 0;
-
- cpu_maps_update_begin();
- for_each_online_cpu(cpu) {
- if (topology_is_primary_thread(cpu))
- continue;
- ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
- if (ret)
- break;
- /*
- * As this needs to hold the cpu maps lock it's impossible
- * to call device_offline() because that ends up calling
- * cpu_down() which takes cpu maps lock. cpu maps lock
- * needs to be held as this might race against in kernel
- * abusers of the hotplug machinery (thermal management).
- *
- * So nothing would update device:offline state. That would
- * leave the sysfs entry stale and prevent onlining after
- * smt control has been changed to 'off' again. This is
- * called under the sysfs hotplug lock, so it is properly
- * serialized against the regular offline usage.
- */
- cpuhp_offline_cpu_device(cpu);
- }
- if (!ret)
- cpu_smt_control = ctrlval;
- cpu_maps_update_done();
- return ret;
-}
-
-int cpuhp_smt_enable(void)
-{
- int cpu, ret = 0;
-
- cpu_maps_update_begin();
- cpu_smt_control = CPU_SMT_ENABLED;
- for_each_present_cpu(cpu) {
- /* Skip online CPUs and CPUs on offline nodes */
- if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
- continue;
- ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
- if (ret)
- break;
- /* See comment in cpuhp_smt_disable() */
- cpuhp_online_cpu_device(cpu);
- }
- cpu_maps_update_done();
- return ret;
-}
-
-
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index cbca6879ab7d..44a259338e33 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
*/
int cpu_pm_enter(void)
{
- int nr_calls;
+ int nr_calls = 0;
int ret = 0;
ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
*/
int cpu_cluster_pm_enter(void)
{
- int nr_calls;
+ int nr_calls = 0;
int ret = 0;
ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
diff --git a/kernel/cred.c b/kernel/cred.c
index 153ae369e024..7cfd1db529d4 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -218,7 +218,7 @@ struct cred *cred_alloc_blank(void)
new->magic = CRED_MAGIC;
#endif
- if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
+ if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
return new;
@@ -277,7 +277,7 @@ struct cred *prepare_creds(void)
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
validate_creds(new);
return new;
@@ -706,7 +706,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
put_cred(old);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 5cc608de6883..18a6a8642495 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -501,6 +501,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
if (exception_level > 1) {
dump_stack();
+ kgdb_io_module_registered = false;
panic("Recursive entry to debugger");
}
@@ -545,6 +546,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
arch_kgdb_ops.disable_hw_break(regs);
acquirelock:
+ rcu_read_lock();
/*
* Interrupts will be restored by the 'trap return' code, except when
* single stepping.
@@ -601,6 +603,7 @@ return_normal:
atomic_dec(&slaves_in_kgdb);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
return 0;
}
cpu_relax();
@@ -619,6 +622,7 @@ return_normal:
raw_spin_unlock(&dbg_master_lock);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
goto acquirelock;
}
@@ -634,6 +638,8 @@ return_normal:
if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
goto kgdb_restore;
+ atomic_inc(&ignore_console_lock_warning);
+
/* Call the I/O driver's pre_exception routine */
if (dbg_io_ops->pre_exception)
dbg_io_ops->pre_exception();
@@ -706,6 +712,8 @@ cpu_master_loop:
if (dbg_io_ops->post_exception)
dbg_io_ops->post_exception();
+ atomic_dec(&ignore_console_lock_warning);
+
if (!kgdb_single_step) {
raw_spin_unlock(&dbg_slave_lock);
/* Wait till all the CPUs have quit from the debugger. */
@@ -738,6 +746,7 @@ kgdb_restore:
raw_spin_unlock(&dbg_master_lock);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
return kgdb_info[cpu].ret_state;
}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 9ecfa37c7fbf..bb5dec762b69 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1103,12 +1103,12 @@ static int handle_ctrl_cmd(char *cmd)
case CTRL_P:
if (cmdptr != cmd_tail)
cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
- strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+ strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
return 1;
case CTRL_N:
if (cmdptr != cmd_head)
cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT;
- strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+ strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
return 1;
}
return 0;
@@ -1315,7 +1315,7 @@ do_full_getstr:
if (*cmdbuf != '\n') {
if (*cmdbuf < 32) {
if (cmdptr == cmd_head) {
- strncpy(cmd_hist[cmd_head], cmd_cur,
+ strscpy(cmd_hist[cmd_head], cmd_cur,
CMD_BUFLEN);
*(cmd_hist[cmd_head] +
strlen(cmd_hist[cmd_head])-1) = '\0';
@@ -1325,7 +1325,7 @@ do_full_getstr:
cmdbuf = cmd_cur;
goto do_full_getstr;
} else {
- strncpy(cmd_hist[cmd_head], cmd_cur,
+ strscpy(cmd_hist[cmd_head], cmd_cur,
CMD_BUFLEN);
}
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 29fd6590dc1e..3fe6e0119ee6 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -135,8 +135,9 @@ void dma_release_declared_memory(struct device *dev)
}
EXPORT_SYMBOL(dma_release_declared_memory);
-static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
- ssize_t size, dma_addr_t *dma_handle)
+static void *__dma_alloc_from_coherent(struct device *dev,
+ struct dma_coherent_mem *mem,
+ ssize_t size, dma_addr_t *dma_handle)
{
int order = get_order(size);
unsigned long flags;
@@ -145,7 +146,7 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
spin_lock_irqsave(&mem->spinlock, flags);
- if (unlikely(size > (mem->size << PAGE_SHIFT)))
+ if (unlikely(size > ((dma_addr_t)mem->size << PAGE_SHIFT)))
goto err;
pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
@@ -155,8 +156,9 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
/*
* Memory was found in the coherent area.
*/
- *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
- ret = mem->virt_base + (pageno << PAGE_SHIFT);
+ *dma_handle = dma_get_device_base(dev, mem) +
+ ((dma_addr_t)pageno << PAGE_SHIFT);
+ ret = mem->virt_base + ((dma_addr_t)pageno << PAGE_SHIFT);
spin_unlock_irqrestore(&mem->spinlock, flags);
memset(ret, 0, size);
return ret;
@@ -187,24 +189,25 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
if (!mem)
return 0;
- *ret = __dma_alloc_from_coherent(mem, size, dma_handle);
+ *ret = __dma_alloc_from_coherent(dev, mem, size, dma_handle);
return 1;
}
-void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle)
+void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size,
+ dma_addr_t *dma_handle)
{
if (!dma_coherent_default_memory)
return NULL;
- return __dma_alloc_from_coherent(dma_coherent_default_memory, size,
- dma_handle);
+ return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size,
+ dma_handle);
}
static int __dma_release_from_coherent(struct dma_coherent_mem *mem,
int order, void *vaddr)
{
if (mem && vaddr >= mem->virt_base && vaddr <
- (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+ (mem->virt_base + ((dma_addr_t)mem->size << PAGE_SHIFT))) {
int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
unsigned long flags;
@@ -248,10 +251,10 @@ static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem,
struct vm_area_struct *vma, void *vaddr, size_t size, int *ret)
{
if (mem && vaddr >= mem->virt_base && vaddr + size <=
- (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+ (mem->virt_base + ((dma_addr_t)mem->size << PAGE_SHIFT))) {
unsigned long off = vma->vm_pgoff;
int start = (vaddr - mem->virt_base) >> PAGE_SHIFT;
- int user_count = vma_pages(vma);
+ unsigned long user_count = vma_pages(vma);
int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
*ret = -ENXIO;
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 099002d84f46..cb6425e52bf7 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -137,9 +137,12 @@ static const char *const maperr2str[] = {
[MAP_ERR_CHECKED] = "dma map error checked",
};
-static const char *type2name[5] = { "single", "page",
- "scather-gather", "coherent",
- "resource" };
+static const char *type2name[] = {
+ [dma_debug_single] = "single",
+ [dma_debug_sg] = "scather-gather",
+ [dma_debug_coherent] = "coherent",
+ [dma_debug_resource] = "resource",
+};
static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
"DMA_FROM_DEVICE", "DMA_NONE" };
@@ -420,6 +423,7 @@ void debug_dma_dump_mappings(struct device *dev)
}
spin_unlock_irqrestore(&bucket->lock, flags);
+ cond_resched();
}
}
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 9912be7a970d..bebc5ee62cbb 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -53,7 +53,8 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev,
u64 dma_direct_get_required_mask(struct device *dev)
{
- u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
+ phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT;
+ u64 max_dma = phys_to_dma_direct(dev, phys);
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
}
@@ -357,7 +358,7 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
{
dma_addr_t dma_addr = paddr;
- if (unlikely(!dma_direct_possible(dev, dma_addr, size))) {
+ if (unlikely(!dma_capable(dev, dma_addr, size))) {
report_addr(dev, dma_addr, size);
return DMA_MAPPING_ERROR;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a690905247fe..0e6168c4cb7a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -93,11 +93,11 @@ static void remote_function(void *data)
* @info: the function call argument
*
* Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
+ * be on the current CPU, which just calls the function directly. This will
+ * retry due to any failures in smp_call_function_single(), such as if the
+ * task_cpu() goes offline concurrently.
*
- * returns: @func return value, or
- * -ESRCH - when the process isn't running
- * -EAGAIN - when the process moved away
+ * returns @func return value or -ESRCH when the process isn't running
*/
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
@@ -110,11 +110,16 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
};
int ret;
- do {
- ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
- if (!ret)
- ret = data.ret;
- } while (ret == -EAGAIN);
+ for (;;) {
+ ret = smp_call_function_single(task_cpu(p), remote_function,
+ &data, 1);
+ ret = !ret ? data.ret : -EAGAIN;
+
+ if (ret != -EAGAIN)
+ break;
+
+ cond_resched();
+ }
return ret;
}
@@ -3685,11 +3690,23 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
perf_event_groups_insert(&ctx->flexible_groups, event);
}
+/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
-ctx_first_active(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_context *ctx)
{
- return list_first_entry_or_null(&ctx->flexible_active,
- struct perf_event, active_list);
+ struct perf_event *event;
+
+ /* pick the first active flexible event */
+ event = list_first_entry_or_null(&ctx->flexible_active,
+ struct perf_event, active_list);
+
+ /* if no active flexible event, pick the first event */
+ if (!event) {
+ event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
+ typeof(*event), group_node);
+ }
+
+ return event;
}
static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
@@ -3721,9 +3738,9 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
perf_pmu_disable(cpuctx->ctx.pmu);
if (task_rotate)
- task_event = ctx_first_active(ctx);
+ task_event = ctx_event_to_rotate(ctx);
if (cpu_rotate)
- cpu_event = ctx_first_active(&cpuctx->ctx);
+ cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
/*
* As per the order given at ctx_resched() first 'pop' task flexible
@@ -5510,7 +5527,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
perf_pmu_output_stop(event);
/* now it's safe to free the pages */
- atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
+ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
/* this has to be the last one */
@@ -5583,7 +5600,8 @@ again:
* undo the VM accounting.
*/
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
+ &mmap_user->locked_vm);
atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
free_uid(mmap_user);
@@ -5727,8 +5745,20 @@ accounting:
user_locked = atomic_long_read(&user->locked_vm) + user_extra;
- if (user_locked > user_lock_limit)
+ if (user_locked <= user_lock_limit) {
+ /* charge all to locked_vm */
+ } else if (atomic_long_read(&user->locked_vm) >= user_lock_limit) {
+ /* charge all to pinned_vm */
+ extra = user_extra;
+ user_extra = 0;
+ } else {
+ /*
+ * charge locked_vm until it hits user_lock_limit;
+ * charge the rest from pinned_vm
+ */
extra = user_locked - user_lock_limit;
+ user_extra -= extra;
+ }
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
@@ -6419,9 +6449,12 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe __get_user_pages_fast first.
* If failed, leave phys_addr as 0.
*/
- if ((current->mm != NULL) &&
- (__get_user_pages_fast(virt, 1, 0, &p) == 1))
- phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ if (current->mm != NULL) {
+ pagefault_disable();
+ if (__get_user_pages_fast(virt, 1, 0, &p) == 1)
+ phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ pagefault_enable();
+ }
if (p)
put_page(p);
@@ -6931,10 +6964,17 @@ static void perf_event_task_output(struct perf_event *event,
goto out;
task_event->event_id.pid = perf_event_pid(event, task);
- task_event->event_id.ppid = perf_event_pid(event, current);
-
task_event->event_id.tid = perf_event_tid(event, task);
- task_event->event_id.ptid = perf_event_tid(event, current);
+
+ if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
+ task_event->event_id.ppid = perf_event_pid(event,
+ task->real_parent);
+ task_event->event_id.ptid = perf_event_pid(event,
+ task->real_parent);
+ } else { /* PERF_RECORD_FORK */
+ task_event->event_id.ppid = perf_event_pid(event, current);
+ task_event->event_id.ptid = perf_event_tid(event, current);
+ }
task_event->event_id.time = perf_event_clock(event);
@@ -11686,7 +11726,7 @@ inherit_event(struct perf_event *parent_event,
GFP_KERNEL);
if (!child_ctx->task_ctx_data) {
free_event(child_event);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 78f61bfc6b79..673044edd9ad 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -815,10 +815,6 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
if (ret)
goto out;
- /* uprobe_write_opcode() assumes we don't cross page boundary */
- BUG_ON((uprobe->offset & ~PAGE_MASK) +
- UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
-
smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
@@ -1114,6 +1110,15 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
if (offset > i_size_read(inode))
return -EINVAL;
+ /*
+ * This ensures that copy_from_page(), copy_to_page() and
+ * __update_ref_ctr() can't cross page boundary.
+ */
+ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
+ return -EINVAL;
+ if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
+ return -EINVAL;
+
retry:
uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
if (!uprobe)
@@ -1962,6 +1967,9 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
uprobe_opcode_t opcode;
int result;
+ if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
+ return -EINVAL;
+
pagefault_disable();
result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
pagefault_enable();
@@ -2145,7 +2153,7 @@ static void handle_swbp(struct pt_regs *regs)
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
- send_sig(SIGTRAP, current, 0);
+ force_sig(SIGTRAP, current);
} else {
/*
* Either we raced with uprobe_unregister() or we can't
diff --git a/kernel/exit.c b/kernel/exit.c
index 235cf6825593..2dbaa5c86b73 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -579,10 +579,6 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
}
write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns)) {
- panic("Attempted to kill init! exitcode=0x%08x\n",
- father->signal->group_exit_code ?: father->exit_code);
- }
list_for_each_entry_safe(p, n, dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
@@ -839,8 +835,12 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
- profile_task_exit(tsk);
- kcov_task_exit(tsk);
+ /*
+ * We can get here from a kernel oops, sometimes with preemption off.
+ * Start by checking for critical errors.
+ * Then fix up important state like USER_DS and preemption.
+ * Then do everything else.
+ */
WARN_ON(blk_needs_flush_plug(tsk));
@@ -858,6 +858,16 @@ void __noreturn do_exit(long code)
*/
set_fs(USER_DS);
+ if (unlikely(in_atomic())) {
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
+ preempt_count_set(PREEMPT_ENABLED);
+ }
+
+ profile_task_exit(tsk);
+ kcov_task_exit(tsk);
+
ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
@@ -895,19 +905,20 @@ void __noreturn do_exit(long code)
raw_spin_lock_irq(&tsk->pi_lock);
raw_spin_unlock_irq(&tsk->pi_lock);
- if (unlikely(in_atomic())) {
- pr_info("note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
- preempt_count_set(PREEMPT_ENABLED);
- }
-
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
sync_mm_rss(tsk->mm);
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
+ /*
+ * If the last thread of global init has exited, panic
+ * immediately to get a useable coredump.
+ */
+ if (unlikely(is_global_init(tsk)))
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ tsk->signal->group_exit_code ?: (int)code);
+
#ifdef CONFIG_POSIX_TIMERS
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
diff --git a/kernel/futex.c b/kernel/futex.c
index 4b5b468c58b6..5758f44c994a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -379,9 +379,9 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
*/
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
- u32 hash = jhash2((u32*)&key->both.word,
- (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+ u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
key->both.offset);
+
return &futex_queues[hash & (futex_hashsize - 1)];
}
@@ -423,7 +423,7 @@ static void get_futex_key_refs(union futex_key *key)
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- ihold(key->shared.inode); /* implies smp_mb(); (B) */
+ smp_mb(); /* explicit smp_mb(); (B) */
break;
case FUT_OFF_MMSHARED:
futex_get_mm(key); /* implies smp_mb(); (B) */
@@ -457,7 +457,6 @@ static void drop_futex_key_refs(union futex_key *key)
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- iput(key->shared.inode);
break;
case FUT_OFF_MMSHARED:
mmdrop(key->private.mm);
@@ -470,6 +469,46 @@ enum futex_access {
FUTEX_WRITE
};
+/*
+ * Generate a machine wide unique identifier for this inode.
+ *
+ * This relies on u64 not wrapping in the life-time of the machine; which with
+ * 1ns resolution means almost 585 years.
+ *
+ * This further relies on the fact that a well formed program will not unmap
+ * the file while it has a (shared) futex waiting on it. This mapping will have
+ * a file reference which pins the mount and inode.
+ *
+ * If for some reason an inode gets evicted and read back in again, it will get
+ * a new sequence number and will _NOT_ match, even though it is the exact same
+ * file.
+ *
+ * It is important that match_futex() will never have a false-positive, esp.
+ * for PI futexes that can mess up the state. The above argues that false-negatives
+ * are only possible for malformed programs.
+ */
+static u64 get_inode_sequence_number(struct inode *inode)
+{
+ static atomic64_t i_seq;
+ u64 old;
+
+ /* Does the inode already have a sequence number? */
+ old = atomic64_read(&inode->i_sequence);
+ if (likely(old))
+ return old;
+
+ for (;;) {
+ u64 new = atomic64_add_return(1, &i_seq);
+ if (WARN_ON_ONCE(!new))
+ continue;
+
+ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
+ if (old)
+ return old;
+ return new;
+ }
+}
+
/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
@@ -482,9 +521,15 @@ enum futex_access {
*
* The key words are stored in @key on success.
*
- * For shared mappings, it's (page->index, file_inode(vma->vm_file),
- * offset_within_page). For private mappings, it's (uaddr, current->mm).
- * We can usually work out the index without swapping in the page.
+ * For shared mappings (when @fshared), the key is:
+ * ( inode->i_sequence, page->index, offset_within_page )
+ * [ also see get_inode_sequence_number() ]
+ *
+ * For private mappings (or when !@fshared), the key is:
+ * ( current->mm, address, 0 )
+ *
+ * This allows (cross process, where applicable) identification of the futex
+ * without keeping the page pinned for the duration of the FUTEX_WAIT.
*
* lock_page() might sleep, the caller should not hold a spinlock.
*/
@@ -624,8 +669,6 @@ again:
key->private.mm = mm;
key->private.address = address;
- get_futex_key_refs(key); /* implies smp_mb(); (B) */
-
} else {
struct inode *inode;
@@ -657,40 +700,14 @@ again:
goto again;
}
- /*
- * Take a reference unless it is about to be freed. Previously
- * this reference was taken by ihold under the page lock
- * pinning the inode in place so i_lock was unnecessary. The
- * only way for this check to fail is if the inode was
- * truncated in parallel which is almost certainly an
- * application bug. In such a case, just retry.
- *
- * We are not calling into get_futex_key_refs() in file-backed
- * cases, therefore a successful atomic_inc return below will
- * guarantee that get_futex_key() will still imply smp_mb(); (B).
- */
- if (!atomic_inc_not_zero(&inode->i_count)) {
- rcu_read_unlock();
- put_page(page);
-
- goto again;
- }
-
- /* Should be impossible but lets be paranoid for now */
- if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
- err = -EFAULT;
- rcu_read_unlock();
- iput(inode);
-
- goto out;
- }
-
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = inode;
+ key->shared.i_seq = get_inode_sequence_number(inode);
key->shared.pgoff = basepage_index(tail);
rcu_read_unlock();
}
+ get_futex_key_refs(key); /* implies smp_mb(); (B) */
+
out:
put_page(page);
return err;
@@ -3445,11 +3462,16 @@ err_unlock:
return ret;
}
+/* Constants for the pending_op argument of handle_futex_death */
+#define HANDLE_DEATH_PENDING true
+#define HANDLE_DEATH_LIST false
+
/*
* Process a futex-list entry, check whether it's owned by the
* dying task, and do notification if so:
*/
-static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
+ bool pi, bool pending_op)
{
u32 uval, uninitialized_var(nval), mval;
int err;
@@ -3462,6 +3484,42 @@ retry:
if (get_user(uval, uaddr))
return -1;
+ /*
+ * Special case for regular (non PI) futexes. The unlock path in
+ * user space has two race scenarios:
+ *
+ * 1. The unlock path releases the user space futex value and
+ * before it can execute the futex() syscall to wake up
+ * waiters it is killed.
+ *
+ * 2. A woken up waiter is killed before it can acquire the
+ * futex in user space.
+ *
+ * In both cases the TID validation below prevents a wakeup of
+ * potential waiters which can cause these waiters to block
+ * forever.
+ *
+ * In both cases the following conditions are met:
+ *
+ * 1) task->robust_list->list_op_pending != NULL
+ * @pending_op == true
+ * 2) User space futex value == 0
+ * 3) Regular futex: @pi == false
+ *
+ * If these conditions are met, it is safe to attempt waking up a
+ * potential waiter without touching the user space futex value and
+ * trying to set the OWNER_DIED bit. The user space futex value is
+ * uncontended and the rest of the user space mutex state is
+ * consistent, so a woken waiter will just take over the
+ * uncontended futex. Setting the OWNER_DIED bit would create
+ * inconsistent state and malfunction of the user space owner died
+ * handling.
+ */
+ if (pending_op && !pi && !uval) {
+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ return 0;
+ }
+
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
return 0;
@@ -3581,10 +3639,11 @@ void exit_robust_list(struct task_struct *curr)
* A pending lock might already be on the list, so
* don't process it twice:
*/
- if (entry != pending)
+ if (entry != pending) {
if (handle_futex_death((void __user *)entry + futex_offset,
- curr, pi))
+ curr, pi, HANDLE_DEATH_LIST))
return;
+ }
if (rc)
return;
entry = next_entry;
@@ -3598,9 +3657,10 @@ void exit_robust_list(struct task_struct *curr)
cond_resched();
}
- if (pending)
+ if (pending) {
handle_futex_death((void __user *)pending + futex_offset,
- curr, pip);
+ curr, pip, HANDLE_DEATH_PENDING);
+ }
}
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
@@ -3777,7 +3837,8 @@ void compat_exit_robust_list(struct task_struct *curr)
if (entry != pending) {
void __user *uaddr = futex_uaddr(entry, futex_offset);
- if (handle_futex_death(uaddr, curr, pi))
+ if (handle_futex_death(uaddr, curr, pi,
+ HANDLE_DEATH_LIST))
return;
}
if (rc)
@@ -3796,7 +3857,7 @@ void compat_exit_robust_list(struct task_struct *curr)
if (pending) {
void __user *uaddr = futex_uaddr(pending, futex_offset);
- handle_futex_death(uaddr, curr, pip);
+ handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
}
}
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 6e40ff6be083..291e0797125b 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -109,9 +109,9 @@ static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
{
struct gcov_iterator *iter = data;
+ (*pos)++;
if (gcov_iter_next(iter))
return NULL;
- (*pos)++;
return iter;
}
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index c1eccd4f6520..d44c8fd17609 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -114,6 +114,7 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
BIT_MASK_DESCR(IRQD_CAN_RESERVE),
+ BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
@@ -205,8 +206,15 @@ static ssize_t irq_debug_write(struct file *file, const char __user *user_buf,
chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
- if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) {
- /* Can't do level nor NMIs, sorry */
+ /*
+ * Don't allow injection when the interrupt is:
+ * - Level or NMI type
+ * - not activated
+ * - replaying already
+ */
+ if (irq_settings_is_level(desc) ||
+ !irqd_is_activated(&desc->irq_data) ||
+ (desc->istate & (IRQS_NMI | IRQS_REPLAY))) {
err = -EINVAL;
} else {
desc->istate |= IRQS_PENDING;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 3a948f41ab00..207bac6ad4c3 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -128,8 +128,6 @@ static inline void unregister_handler_proc(unsigned int irq,
extern bool irq_can_set_affinity_usr(unsigned int irq);
-extern int irq_select_affinity_usr(unsigned int irq);
-
extern void irq_set_thread_affinity(struct irq_desc *desc);
extern int irq_do_set_affinity(struct irq_data *data,
@@ -424,6 +422,10 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
{
return desc->pending_mask;
}
+static inline bool handle_enforce_irqctx(struct irq_data *data)
+{
+ return irqd_is_handle_enforce_irqctx(data);
+}
bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
#else /* CONFIG_GENERIC_PENDING_IRQ */
static inline bool irq_can_move_pcntxt(struct irq_data *data)
@@ -450,6 +452,10 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
{
return false;
}
+static inline bool handle_enforce_irqctx(struct irq_data *data)
+{
+ return false;
+}
#endif /* !CONFIG_GENERIC_PENDING_IRQ */
#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9be995fc3c5a..6a27603fe1cf 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -638,9 +638,15 @@ void irq_init_desc(unsigned int irq)
int generic_handle_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_data *data;
if (!desc)
return -EINVAL;
+
+ data = irq_desc_get_irq_data(desc);
+ if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data)))
+ return -EPERM;
+
generic_handle_irq_desc(desc);
return 0;
}
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index a453e229f99c..ee549d1007a4 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1291,6 +1291,11 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
unsigned int irq_base,
unsigned int nr_irqs, void *arg)
{
+ if (!domain->ops->alloc) {
+ pr_debug("domain->ops->alloc() is NULL\n");
+ return -ENOSYS;
+ }
+
return domain->ops->alloc(domain, irq_base, nr_irqs, arg);
}
@@ -1328,11 +1333,6 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
return -EINVAL;
}
- if (!domain->ops->alloc) {
- pr_debug("domain->ops->alloc() is NULL\n");
- return -ENOSYS;
- }
-
if (realloc && irq_base >= 0) {
virq = irq_base;
} else {
@@ -1457,6 +1457,7 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
if (rv) {
/* Restore the original irq_data. */
*root_irq_data = *child_irq_data;
+ kfree(child_irq_data);
goto error;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e8f7f179bf77..e557a375cd02 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -194,9 +194,9 @@ void irq_set_thread_affinity(struct irq_desc *desc)
set_bit(IRQTF_AFFINITY, &action->thread_flags);
}
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
static void irq_validate_effective_affinity(struct irq_data *data)
{
-#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
const struct cpumask *m = irq_data_get_effective_affinity_mask(data);
struct irq_chip *chip = irq_data_get_irq_chip(data);
@@ -204,9 +204,19 @@ static void irq_validate_effective_affinity(struct irq_data *data)
return;
pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
chip->name, data->irq);
-#endif
}
+static inline void irq_init_effective_affinity(struct irq_data *data,
+ const struct cpumask *mask)
+{
+ cpumask_copy(irq_data_get_effective_affinity_mask(data), mask);
+}
+#else
+static inline void irq_validate_effective_affinity(struct irq_data *data) { }
+static inline void irq_init_effective_affinity(struct irq_data *data,
+ const struct cpumask *mask) { }
+#endif
+
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
{
@@ -265,6 +275,30 @@ static int irq_try_set_affinity(struct irq_data *data,
return ret;
}
+static bool irq_set_affinity_deactivated(struct irq_data *data,
+ const struct cpumask *mask, bool force)
+{
+ struct irq_desc *desc = irq_data_to_desc(data);
+
+ /*
+ * Handle irq chips which can handle affinity only in activated
+ * state correctly
+ *
+ * If the interrupt is not yet activated, just store the affinity
+ * mask and do not call the chip driver at all. On activation the
+ * driver has to make sure anyway that the interrupt is in a
+ * useable state so startup works.
+ */
+ if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) ||
+ irqd_is_activated(data) || !irqd_affinity_on_activate(data))
+ return false;
+
+ cpumask_copy(desc->irq_common_data.affinity, mask);
+ irq_init_effective_affinity(data, mask);
+ irqd_set(data, IRQD_AFFINITY_SET);
+ return true;
+}
+
int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
bool force)
{
@@ -275,6 +309,9 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
+ if (irq_set_affinity_deactivated(data, mask, force))
+ return 0;
+
if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) {
ret = irq_try_set_affinity(data, mask, force);
} else {
@@ -284,7 +321,11 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (desc->affinity_notify) {
kref_get(&desc->affinity_notify->kref);
- schedule_work(&desc->affinity_notify->work);
+ if (!schedule_work(&desc->affinity_notify->work)) {
+ /* Work was already scheduled, drop our extra ref */
+ kref_put(&desc->affinity_notify->kref,
+ desc->affinity_notify->release);
+ }
}
irqd_set(data, IRQD_AFFINITY_SET);
@@ -384,7 +425,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
raw_spin_unlock_irqrestore(&desc->lock, flags);
if (old_notify) {
- cancel_work_sync(&old_notify->work);
+ if (cancel_work_sync(&old_notify->work)) {
+ /* Pending work had a ref, put that one too */
+ kref_put(&old_notify->kref, old_notify->release);
+ }
kref_put(&old_notify->kref, old_notify->release);
}
@@ -442,23 +486,9 @@ int irq_setup_affinity(struct irq_desc *desc)
{
return irq_select_affinity(irq_desc_get_irq(desc));
}
-#endif
-
-/*
- * Called when a bogus affinity is set via /proc/irq
- */
-int irq_select_affinity_usr(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- int ret;
+#endif /* CONFIG_AUTO_IRQ_AFFINITY */
+#endif /* CONFIG_SMP */
- raw_spin_lock_irqsave(&desc->lock, flags);
- ret = irq_setup_affinity(desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return ret;
-}
-#endif
/**
* irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index ad26fbcfbfc8..eb95f6106a1e 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -453,8 +453,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
continue;
irq_data = irq_domain_get_irq_data(domain, desc->irq);
- if (!can_reserve)
+ if (!can_reserve) {
irqd_clr_can_reserve(irq_data);
+ if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+ irqd_set_msi_nomask_quirk(irq_data);
+ }
ret = irq_domain_activate_irq(irq_data, can_reserve);
if (ret)
goto cleanup;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index da9addb8d655..e8c655b7a430 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -115,6 +115,28 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
return show_irq_affinity(AFFINITY_LIST, m);
}
+#ifndef CONFIG_AUTO_IRQ_AFFINITY
+static inline int irq_select_affinity_usr(unsigned int irq)
+{
+ /*
+ * If the interrupt is started up already then this fails. The
+ * interrupt is assigned to an online CPU already. There is no
+ * point to move it around randomly. Tell user space that the
+ * selected mask is bogus.
+ *
+ * If not then any change to the affinity is pointless because the
+ * startup code invokes irq_setup_affinity() which will select
+ * a online CPU anyway.
+ */
+ return -EINVAL;
+}
+#else
+/* ALPHA magic affinity auto selector. Keep it for historical reasons. */
+static inline int irq_select_affinity_usr(unsigned int irq)
+{
+ return irq_select_affinity(irq);
+}
+#endif
static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 98c04ca5fa43..5064b13b80d6 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -72,8 +72,9 @@ void check_irq_resend(struct irq_desc *desc)
desc->istate &= ~IRQS_PENDING;
desc->istate |= IRQS_REPLAY;
- if (!desc->irq_data.chip->irq_retrigger ||
- !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
+ if ((!desc->irq_data.chip->irq_retrigger ||
+ !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) &&
+ !handle_enforce_irqctx(&desc->irq_data)) {
#ifdef CONFIG_HARDIRQS_SW_RESEND
unsigned int irq = irq_desc_get_irq(desc);
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 90c735da15d0..19d2fad379ee 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -267,6 +267,23 @@ void irq_timings_disable(void)
#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */
#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */
+/*
+ * Number of elements in the circular buffer: If it happens it was
+ * flushed before, then the number of elements could be smaller than
+ * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is
+ * used as we wrapped. The index begins from zero when we did not
+ * wrap. That could be done in a nicer way with the proper circular
+ * array structure type but with the cost of extra computation in the
+ * interrupt handler hot path. We choose efficiency.
+ */
+#define for_each_irqts(i, irqts) \
+ for (i = irqts->count < IRQ_TIMINGS_SIZE ? \
+ 0 : irqts->count & IRQ_TIMINGS_MASK, \
+ irqts->count = min(IRQ_TIMINGS_SIZE, \
+ irqts->count); \
+ irqts->count > 0; irqts->count--, \
+ i = (i + 1) & IRQ_TIMINGS_MASK)
+
struct irqt_stat {
u64 last_ts;
u64 ema_time[PREDICTION_BUFFER_SIZE];
@@ -297,7 +314,16 @@ static u64 irq_timings_ema_new(u64 value, u64 ema_old)
static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
{
- int i;
+ int period;
+
+ /*
+ * Move the beginning pointer to the end minus the max period x 3.
+ * We are at the point we can begin searching the pattern
+ */
+ buffer = &buffer[len - (period_max * 3)];
+
+ /* Adjust the length to the maximum allowed period x 3 */
+ len = period_max * 3;
/*
* The buffer contains the suite of intervals, in a ilog2
@@ -306,21 +332,45 @@ static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
* period beginning at the end of the buffer. We do that for
* each suffix.
*/
- for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) {
+ for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) {
- int *begin = &buffer[len - (i * 3)];
- int *ptr = begin;
+ /*
+ * The first comparison always succeed because the
+ * suffix is deduced from the first n-period bytes of
+ * the buffer and we compare the initial suffix with
+ * itself, so we can skip the first iteration.
+ */
+ int idx = period;
+ size_t size = period;
/*
* We look if the suite with period 'i' repeat
* itself. If it is truncated at the end, as it
* repeats we can use the period to find out the next
- * element.
+ * element with the modulo.
*/
- while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
- ptr += i;
- if (ptr >= &buffer[len])
- return begin[((i * 3) % i)];
+ while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
+
+ /*
+ * Move the index in a period basis
+ */
+ idx += size;
+
+ /*
+ * If this condition is reached, all previous
+ * memcmp were successful, so the period is
+ * found.
+ */
+ if (idx == len)
+ return buffer[len % period];
+
+ /*
+ * If the remaining elements to compare are
+ * smaller than the period, readjust the size
+ * of the comparison for the last iteration.
+ */
+ if (len - idx < period)
+ size = len - idx;
}
}
@@ -493,11 +543,7 @@ u64 irq_timings_next_event(u64 now)
* model while decrementing the counter because we consume the
* data from our circular buffer.
*/
-
- i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
- irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
-
- for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
+ for_each_irqts(i, irqts) {
irq = irq_timing_decode(irqts->values[i], &ts);
s = idr_find(&irqt_stats, irq);
if (s)
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 136ce049c4ad..61f9d781f70a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -645,19 +645,20 @@ static inline int kallsyms_for_perf(void)
* Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
* block even that).
*/
-int kallsyms_show_value(void)
+bool kallsyms_show_value(const struct cred *cred)
{
switch (kptr_restrict) {
case 0:
if (kallsyms_for_perf())
- return 1;
+ return true;
/* fallthrough */
case 1:
- if (has_capability_noaudit(current, CAP_SYSLOG))
- return 1;
+ if (security_capable(cred, &init_user_ns, CAP_SYSLOG,
+ CAP_OPT_NOAUDIT) == 0)
+ return true;
/* fallthrough */
default:
- return 0;
+ return false;
}
}
@@ -674,7 +675,11 @@ static int kallsyms_open(struct inode *inode, struct file *file)
return -ENOMEM;
reset_iter(iter, 0);
- iter->show_value = kallsyms_show_value();
+ /*
+ * Instead of checking this on every s_show() call, cache
+ * the result here at open time.
+ */
+ iter->show_value = kallsyms_show_value(file->f_cred);
return 0;
}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bc6addd9152b..a2de58de6ab6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -120,7 +120,7 @@ out:
* invoke it.
*
* If module auto-loading support is disabled then this function
- * becomes a no-operation.
+ * simply returns -ENOENT.
*/
int __request_module(bool wait, const char *fmt, ...)
{
@@ -137,7 +137,7 @@ int __request_module(bool wait, const char *fmt, ...)
WARN_ON_ONCE(wait && current_is_async());
if (!modprobe_path[0])
- return 0;
+ return -ENOENT;
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1010bde1146b..a0dfc7d425f0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -326,7 +326,8 @@ struct kprobe *get_kprobe(void *addr)
struct kprobe *p;
head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
- hlist_for_each_entry_rcu(p, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist,
+ lockdep_is_held(&kprobe_mutex)) {
if (p->addr == addr)
return p;
}
@@ -510,6 +511,8 @@ static void do_unoptimize_kprobes(void)
arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
/* Loop free_list for disarming */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
+ /* Switching from detour code to origin */
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
/* Disarm probes if marked disabled */
if (kprobe_disabled(&op->kp))
arch_disarm_kprobe(&op->kp);
@@ -584,11 +587,12 @@ static void kprobe_optimizer(struct work_struct *work)
mutex_unlock(&module_mutex);
mutex_unlock(&text_mutex);
cpus_read_unlock();
- mutex_unlock(&kprobe_mutex);
/* Step 5: Kick optimizer again if needed */
if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
kick_kprobe_optimizer();
+
+ mutex_unlock(&kprobe_mutex);
}
/* Wait for completing optimization and unoptimization */
@@ -610,6 +614,18 @@ void wait_for_kprobe_optimizer(void)
mutex_unlock(&kprobe_mutex);
}
+static bool optprobe_queued_unopt(struct optimized_kprobe *op)
+{
+ struct optimized_kprobe *_op;
+
+ list_for_each_entry(_op, &unoptimizing_list, list) {
+ if (op == _op)
+ return true;
+ }
+
+ return false;
+}
+
/* Optimize kprobe if p is ready to be optimized */
static void optimize_kprobe(struct kprobe *p)
{
@@ -631,17 +647,21 @@ static void optimize_kprobe(struct kprobe *p)
return;
/* Check if it is already optimized. */
- if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
+ if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) {
+ if (optprobe_queued_unopt(op)) {
+ /* This is under unoptimizing. Just dequeue the probe */
+ list_del_init(&op->list);
+ }
return;
+ }
op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
- if (!list_empty(&op->list))
- /* This is under unoptimizing. Just dequeue the probe */
- list_del_init(&op->list);
- else {
- list_add(&op->list, &optimizing_list);
- kick_kprobe_optimizer();
- }
+ /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */
+ if (WARN_ON_ONCE(!list_empty(&op->list)))
+ return;
+
+ list_add(&op->list, &optimizing_list);
+ kick_kprobe_optimizer();
}
/* Short cut to direct unoptimizing */
@@ -649,6 +669,7 @@ static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
lockdep_assert_cpus_held();
arch_unoptimize_kprobe(op);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (kprobe_disabled(&op->kp))
arch_disarm_kprobe(&op->kp);
}
@@ -662,31 +683,33 @@ static void unoptimize_kprobe(struct kprobe *p, bool force)
return; /* This is not an optprobe nor optimized */
op = container_of(p, struct optimized_kprobe, kp);
- if (!kprobe_optimized(p)) {
- /* Unoptimized or unoptimizing case */
- if (force && !list_empty(&op->list)) {
- /*
- * Only if this is unoptimizing kprobe and forced,
- * forcibly unoptimize it. (No need to unoptimize
- * unoptimized kprobe again :)
- */
- list_del_init(&op->list);
- force_unoptimize_kprobe(op);
- }
+ if (!kprobe_optimized(p))
return;
- }
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (!list_empty(&op->list)) {
- /* Dequeue from the optimization queue */
- list_del_init(&op->list);
+ if (optprobe_queued_unopt(op)) {
+ /* Queued in unoptimizing queue */
+ if (force) {
+ /*
+ * Forcibly unoptimize the kprobe here, and queue it
+ * in the freeing list for release afterwards.
+ */
+ force_unoptimize_kprobe(op);
+ list_move(&op->list, &freeing_list);
+ }
+ } else {
+ /* Dequeue from the optimizing queue */
+ list_del_init(&op->list);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ }
return;
}
+
/* Optimized kprobe case */
- if (force)
+ if (force) {
/* Forcibly update the code: this is a special case */
force_unoptimize_kprobe(op);
- else {
+ } else {
list_add(&op->list, &unoptimizing_list);
kick_kprobe_optimizer();
}
@@ -1029,9 +1052,20 @@ static int disarm_kprobe_ftrace(struct kprobe *p)
return ret;
}
#else /* !CONFIG_KPROBES_ON_FTRACE */
-#define prepare_kprobe(p) arch_prepare_kprobe(p)
-#define arm_kprobe_ftrace(p) (-ENODEV)
-#define disarm_kprobe_ftrace(p) (-ENODEV)
+static inline int prepare_kprobe(struct kprobe *p)
+{
+ return arch_prepare_kprobe(p);
+}
+
+static inline int arm_kprobe_ftrace(struct kprobe *p)
+{
+ return -ENODEV;
+}
+
+static inline int disarm_kprobe_ftrace(struct kprobe *p)
+{
+ return -ENODEV;
+}
#endif
/* Arm a kprobe with text_mutex */
@@ -1191,6 +1225,26 @@ __releases(hlist_lock)
}
NOKPROBE_SYMBOL(kretprobe_table_unlock);
+struct kprobe kprobe_busy = {
+ .addr = (void *) get_kprobe,
+};
+
+void kprobe_busy_begin(void)
+{
+ struct kprobe_ctlblk *kcb;
+
+ preempt_disable();
+ __this_cpu_write(current_kprobe, &kprobe_busy);
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+}
+
+void kprobe_busy_end(void)
+{
+ __this_cpu_write(current_kprobe, NULL);
+ preempt_enable();
+}
+
/*
* This function is called from finish_task_switch when task tk becomes dead,
* so that we can recycle any function-return probe instances associated
@@ -1208,6 +1262,8 @@ void kprobe_flush_task(struct task_struct *tk)
/* Early boot. kretprobe_table_locks not yet initialized. */
return;
+ kprobe_busy_begin();
+
INIT_HLIST_HEAD(&empty_rp);
hash = hash_ptr(tk, KPROBE_HASH_BITS);
head = &kretprobe_inst_table[hash];
@@ -1221,6 +1277,8 @@ void kprobe_flush_task(struct task_struct *tk)
hlist_del(&ri->hlist);
kfree(ri);
}
+
+ kprobe_busy_end();
}
NOKPROBE_SYMBOL(kprobe_flush_task);
@@ -2033,6 +2091,13 @@ static void kill_kprobe(struct kprobe *p)
* the original probed function (which will be freed soon) any more.
*/
arch_remove_kprobe(p);
+
+ /*
+ * The module is going away. We should disarm the kprobe which
+ * is using ftrace.
+ */
+ if (kprobe_ftrace(p))
+ disarm_kprobe_ftrace(p);
}
/* Disable one kprobe */
@@ -2290,7 +2355,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p,
else
kprobe_type = "k";
- if (!kallsyms_show_value())
+ if (!kallsyms_show_value(pi->file->f_cred))
addr = NULL;
if (sym)
@@ -2391,7 +2456,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
* If /proc/kallsyms is not showing kernel address, we won't
* show them here either.
*/
- if (!kallsyms_show_value())
+ if (!kallsyms_show_value(m->file->f_cred))
seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL,
(void *)ent->start_addr);
else
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4fc8dc30cec0..98bdba3161d7 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -421,13 +421,6 @@ static int verbose(struct lock_class *class)
return 0;
}
-/*
- * Stack-trace: tightly packed array of stack backtrace
- * addresses. Protected by the graph_lock.
- */
-unsigned long nr_stack_trace_entries;
-static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
-
static void print_lockdep_off(const char *bug_msg)
{
printk(KERN_DEBUG "%s\n", bug_msg);
@@ -437,6 +430,15 @@ static void print_lockdep_off(const char *bug_msg)
#endif
}
+unsigned long nr_stack_trace_entries;
+
+#ifdef CONFIG_PROVE_LOCKING
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the graph_lock.
+ */
+static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+
static int save_trace(struct lock_trace *trace)
{
unsigned long *entries = stack_trace + nr_stack_trace_entries;
@@ -459,6 +461,7 @@ static int save_trace(struct lock_trace *trace)
return 1;
}
+#endif
unsigned int nr_hardirq_chains;
unsigned int nr_softirq_chains;
@@ -472,6 +475,7 @@ unsigned int max_lockdep_depth;
DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
#endif
+#ifdef CONFIG_PROVE_LOCKING
/*
* Locking printouts:
*/
@@ -489,6 +493,7 @@ static const char *usage_str[] =
#undef LOCKDEP_STATE
[LOCK_USED] = "INITIAL USE",
};
+#endif
const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
{
@@ -1606,9 +1611,11 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
this.class = class;
raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
arch_spin_lock(&lockdep_lock);
ret = __lockdep_count_forward_deps(&this);
arch_spin_unlock(&lockdep_lock);
+ current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
return ret;
@@ -1633,9 +1640,11 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
this.class = class;
raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
arch_spin_lock(&lockdep_lock);
ret = __lockdep_count_backward_deps(&this);
arch_spin_unlock(&lockdep_lock);
+ current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
return ret;
@@ -2819,10 +2828,6 @@ static inline int validate_chain(struct task_struct *curr,
{
return 1;
}
-
-static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
-{
-}
#endif
/*
@@ -2876,12 +2881,10 @@ static void check_chain_key(struct task_struct *curr)
#endif
}
+#ifdef CONFIG_PROVE_LOCKING
static int mark_lock(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit);
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
-
-
static void
print_usage_bug_scenario(struct held_lock *lock)
{
@@ -3369,8 +3372,12 @@ void trace_softirqs_off(unsigned long ip)
debug_atomic_inc(redundant_softirqs_off);
}
-static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
+static int
+mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
{
+ if (!check)
+ goto lock_used;
+
/*
* If non-trylock use in a hardirq or softirq context, then
* mark the lock as used in these contexts:
@@ -3414,6 +3421,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
}
}
+lock_used:
+ /* mark it as used: */
+ if (!mark_lock(curr, hlock, LOCK_USED))
+ return 0;
+
return 1;
}
@@ -3445,35 +3457,6 @@ static int separate_irq_context(struct task_struct *curr,
return 0;
}
-#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
-
-static inline
-int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit)
-{
- WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
- return 1;
-}
-
-static inline int mark_irqflags(struct task_struct *curr,
- struct held_lock *hlock)
-{
- return 1;
-}
-
-static inline unsigned int task_irq_context(struct task_struct *task)
-{
- return 0;
-}
-
-static inline int separate_irq_context(struct task_struct *curr,
- struct held_lock *hlock)
-{
- return 0;
-}
-
-#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
-
/*
* Mark a lock with a usage bit, and validate the state transition:
*/
@@ -3541,6 +3524,27 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return ret;
}
+#else /* CONFIG_PROVE_LOCKING */
+
+static inline int
+mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
+{
+ return 1;
+}
+
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 0;
+}
+
+static inline int separate_irq_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ return 0;
+}
+
+#endif /* CONFIG_PROVE_LOCKING */
+
/*
* Initialize a lock instance's lock-class mapping info:
*/
@@ -3744,11 +3748,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
hlock->pin_count = pin_count;
- if (check && !mark_irqflags(curr, hlock))
- return 0;
-
- /* mark it as used: */
- if (!mark_lock(curr, hlock, LOCK_USED))
+ /* Initialize the lock usage bit */
+ if (!mark_usage(curr, hlock, check))
return 0;
/*
@@ -4230,8 +4231,7 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie
*/
static void check_flags(unsigned long flags)
{
-#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
- defined(CONFIG_TRACE_IRQFLAGS)
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP)
if (!debug_locks)
return;
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 80a463d31a8d..a9768d89e521 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -697,10 +697,10 @@ static void __torture_print_stats(char *page,
if (statp[i].n_lock_fail)
fail = true;
sum += statp[i].n_lock_acquired;
- if (max < statp[i].n_lock_fail)
- max = statp[i].n_lock_fail;
- if (min > statp[i].n_lock_fail)
- min = statp[i].n_lock_fail;
+ if (max < statp[i].n_lock_acquired)
+ max = statp[i].n_lock_acquired;
+ if (min > statp[i].n_lock_acquired)
+ min = statp[i].n_lock_acquired;
}
page += sprintf(page,
"%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 399669f7eba8..472dd462a40c 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -51,19 +51,19 @@ EXPORT_SYMBOL(__rwlock_init);
static void spin_dump(raw_spinlock_t *lock, const char *msg)
{
- struct task_struct *owner = NULL;
+ struct task_struct *owner = READ_ONCE(lock->owner);
- if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
- owner = lock->owner;
+ if (owner == SPINLOCK_OWNER_INIT)
+ owner = NULL;
printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
msg, raw_smp_processor_id(),
current->comm, task_pid_nr(current));
printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
".owner_cpu: %d\n",
- lock, lock->magic,
+ lock, READ_ONCE(lock->magic),
owner ? owner->comm : "<none>",
owner ? task_pid_nr(owner) : -1,
- lock->owner_cpu);
+ READ_ONCE(lock->owner_cpu));
dump_stack();
}
@@ -80,16 +80,16 @@ static void spin_bug(raw_spinlock_t *lock, const char *msg)
static inline void
debug_spin_lock_before(raw_spinlock_t *lock)
{
- SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
- SPIN_BUG_ON(lock->owner == current, lock, "recursion");
- SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ SPIN_BUG_ON(READ_ONCE(lock->magic) != SPINLOCK_MAGIC, lock, "bad magic");
+ SPIN_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
+ SPIN_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
lock, "cpu recursion");
}
static inline void debug_spin_lock_after(raw_spinlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_spin_unlock(raw_spinlock_t *lock)
@@ -99,8 +99,8 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock)
SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
/*
@@ -187,8 +187,8 @@ static inline void debug_write_lock_before(rwlock_t *lock)
static inline void debug_write_lock_after(rwlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_write_unlock(rwlock_t *lock)
@@ -197,8 +197,8 @@ static inline void debug_write_unlock(rwlock_t *lock)
RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
void do_raw_write_lock(rwlock_t *lock)
diff --git a/kernel/module.c b/kernel/module.c
index dcf2cc656e7c..9f35e1927807 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -208,7 +208,8 @@ static struct module *mod_find(unsigned long addr)
{
struct module *mod;
- list_for_each_entry_rcu(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
if (within_module(addr, mod))
return mod;
}
@@ -442,7 +443,8 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
return true;
- list_for_each_entry_rcu(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
struct symsearch arr[] = {
{ mod->syms, mod->syms + mod->num_syms, mod->crcs,
NOT_GPL_ONLY, false },
@@ -602,7 +604,8 @@ static struct module *find_module_all(const char *name, size_t len,
module_assert_mutex_or_preempt();
- list_for_each_entry_rcu(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
continue;
if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
@@ -1019,6 +1022,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
free_module(mod);
+ /* someone could wait for the module in add_unformed_module() */
+ wake_up_all(&module_wq);
return 0;
out:
mutex_unlock(&module_mutex);
@@ -1447,8 +1452,7 @@ static inline bool sect_empty(const Elf_Shdr *sect)
}
struct module_sect_attr {
- struct module_attribute mattr;
- char *name;
+ struct bin_attribute battr;
unsigned long address;
};
@@ -1458,13 +1462,34 @@ struct module_sect_attrs {
struct module_sect_attr attrs[0];
};
-static ssize_t module_sect_show(struct module_attribute *mattr,
- struct module_kobject *mk, char *buf)
+#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
+static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *battr,
+ char *buf, loff_t pos, size_t count)
{
struct module_sect_attr *sattr =
- container_of(mattr, struct module_sect_attr, mattr);
- return sprintf(buf, "0x%px\n", kptr_restrict < 2 ?
- (void *)sattr->address : NULL);
+ container_of(battr, struct module_sect_attr, battr);
+ char bounce[MODULE_SECT_READ_SIZE + 1];
+ size_t wrote;
+
+ if (pos != 0)
+ return -EINVAL;
+
+ /*
+ * Since we're a binary read handler, we must account for the
+ * trailing NUL byte that sprintf will write: if "buf" is
+ * too small to hold the NUL, or the NUL is exactly the last
+ * byte, the read will look like it got truncated by one byte.
+ * Since there is no way to ask sprintf nicely to not write
+ * the NUL, we have to use a bounce buffer.
+ */
+ wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
+ kallsyms_show_value(file->f_cred)
+ ? (void *)sattr->address : NULL);
+ count = min(count, wrote);
+ memcpy(buf, bounce, count);
+
+ return count;
}
static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -1472,7 +1497,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
unsigned int section;
for (section = 0; section < sect_attrs->nsections; section++)
- kfree(sect_attrs->attrs[section].name);
+ kfree(sect_attrs->attrs[section].battr.attr.name);
kfree(sect_attrs);
}
@@ -1481,7 +1506,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
unsigned int nloaded = 0, i, size[2];
struct module_sect_attrs *sect_attrs;
struct module_sect_attr *sattr;
- struct attribute **gattr;
+ struct bin_attribute **gattr;
/* Count loaded sections and allocate structures */
for (i = 0; i < info->hdr->e_shnum; i++)
@@ -1489,35 +1514,34 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
nloaded++;
size[0] = ALIGN(sizeof(*sect_attrs)
+ nloaded * sizeof(sect_attrs->attrs[0]),
- sizeof(sect_attrs->grp.attrs[0]));
- size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
+ sizeof(sect_attrs->grp.bin_attrs[0]));
+ size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
if (sect_attrs == NULL)
return;
/* Setup section attributes. */
sect_attrs->grp.name = "sections";
- sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
+ sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
sect_attrs->nsections = 0;
sattr = &sect_attrs->attrs[0];
- gattr = &sect_attrs->grp.attrs[0];
+ gattr = &sect_attrs->grp.bin_attrs[0];
for (i = 0; i < info->hdr->e_shnum; i++) {
Elf_Shdr *sec = &info->sechdrs[i];
if (sect_empty(sec))
continue;
+ sysfs_bin_attr_init(&sattr->battr);
sattr->address = sec->sh_addr;
- sattr->name = kstrdup(info->secstrings + sec->sh_name,
- GFP_KERNEL);
- if (sattr->name == NULL)
+ sattr->battr.attr.name =
+ kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
+ if (sattr->battr.attr.name == NULL)
goto out;
sect_attrs->nsections++;
- sysfs_attr_init(&sattr->mattr.attr);
- sattr->mattr.show = module_sect_show;
- sattr->mattr.store = NULL;
- sattr->mattr.attr.name = sattr->name;
- sattr->mattr.attr.mode = S_IRUSR;
- *(gattr++) = &(sattr++)->mattr.attr;
+ sattr->battr.read = module_sect_read;
+ sattr->battr.size = MODULE_SECT_READ_SIZE;
+ sattr->battr.attr.mode = 0400;
+ *(gattr++) = &(sattr++)->battr;
}
*gattr = NULL;
@@ -1607,7 +1631,7 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info)
continue;
if (info->sechdrs[i].sh_type == SHT_NOTE) {
sysfs_bin_attr_init(nattr);
- nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
+ nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;
nattr->attr.mode = S_IRUGO;
nattr->size = info->sechdrs[i].sh_size;
nattr->private = (void *) info->sechdrs[i].sh_addr;
@@ -1725,6 +1749,8 @@ static int module_add_modinfo_attrs(struct module *mod)
error_out:
if (i > 0)
module_remove_modinfo_attrs(mod, --i);
+ else
+ kfree(mod->modinfo_attrs);
return error;
}
@@ -2957,9 +2983,7 @@ static int setup_load_info(struct load_info *info, int flags)
/* Try to find a name early so we can log errors with a module name */
info->index.info = find_sec(info, ".modinfo");
- if (!info->index.info)
- info->name = "(missing .modinfo section)";
- else
+ if (info->index.info)
info->name = get_modinfo(info, "name");
/* Find internal symbols and strings. */
@@ -2974,14 +2998,15 @@ static int setup_load_info(struct load_info *info, int flags)
}
if (info->index.sym == 0) {
- pr_warn("%s: module has no symbols (stripped?)\n", info->name);
+ pr_warn("%s: module has no symbols (stripped?)\n",
+ info->name ?: "(missing .modinfo section or name field)");
return -ENOEXEC;
}
info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
if (!info->index.mod) {
pr_warn("%s: No module found in object\n",
- info->name ?: "(missing .modinfo name field)");
+ info->name ?: "(missing .modinfo section or name field)");
return -ENOEXEC;
}
/* This is temporary: point mod into copy of data. */
@@ -4285,7 +4310,7 @@ static int modules_open(struct inode *inode, struct file *file)
if (!err) {
struct seq_file *m = file->private_data;
- m->private = kallsyms_show_value() ? NULL : (void *)8ul;
+ m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
}
return err;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index d9f5081d578d..157d7c29f720 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -554,7 +554,7 @@ NOKPROBE_SYMBOL(notify_die);
int register_die_notifier(struct notifier_block *nb)
{
- vmalloc_sync_all();
+ vmalloc_sync_mappings();
return atomic_notifier_chain_register(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(register_die_notifier);
diff --git a/kernel/padata.c b/kernel/padata.c
index 15a8ad63f4ff..4f39a5e7a853 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -35,6 +35,8 @@
#define MAX_OBJ_NUM 1000
+static void padata_free_pd(struct parallel_data *pd);
+
static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
{
int cpu, target_cpu;
@@ -334,6 +336,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
struct padata_serial_queue *squeue;
struct parallel_data *pd;
LIST_HEAD(local_list);
+ int cnt;
local_bh_disable();
squeue = container_of(serial_work, struct padata_serial_queue, work);
@@ -343,6 +346,8 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_replace_init(&squeue->serial.list, &local_list);
spin_unlock(&squeue->serial.lock);
+ cnt = 0;
+
while (!list_empty(&local_list)) {
struct padata_priv *padata;
@@ -352,9 +357,12 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_del_init(&padata->list);
padata->serial(padata);
- atomic_dec(&pd->refcnt);
+ cnt++;
}
local_bh_enable();
+
+ if (atomic_sub_and_test(cnt, &pd->refcnt))
+ padata_free_pd(pd);
}
/**
@@ -501,7 +509,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
timer_setup(&pd->timer, padata_reorder_timer, 0);
atomic_set(&pd->seq_nr, -1);
atomic_set(&pd->reorder_objects, 0);
- atomic_set(&pd->refcnt, 0);
+ atomic_set(&pd->refcnt, 1);
pd->pinst = pinst;
spin_lock_init(&pd->lock);
@@ -526,31 +534,6 @@ static void padata_free_pd(struct parallel_data *pd)
kfree(pd);
}
-/* Flush all objects out of the padata queues. */
-static void padata_flush_queues(struct parallel_data *pd)
-{
- int cpu;
- struct padata_parallel_queue *pqueue;
- struct padata_serial_queue *squeue;
-
- for_each_cpu(cpu, pd->cpumask.pcpu) {
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
- flush_work(&pqueue->work);
- }
-
- del_timer_sync(&pd->timer);
-
- if (atomic_read(&pd->reorder_objects))
- padata_reorder(pd);
-
- for_each_cpu(cpu, pd->cpumask.cbcpu) {
- squeue = per_cpu_ptr(pd->squeue, cpu);
- flush_work(&squeue->work);
- }
-
- BUG_ON(atomic_read(&pd->refcnt) != 0);
-}
-
static void __padata_start(struct padata_instance *pinst)
{
pinst->flags |= PADATA_INIT;
@@ -564,10 +547,6 @@ static void __padata_stop(struct padata_instance *pinst)
pinst->flags &= ~PADATA_INIT;
synchronize_rcu();
-
- get_online_cpus();
- padata_flush_queues(pinst->pd);
- put_online_cpus();
}
/* Replace the internal control structure with a new one. */
@@ -588,8 +567,8 @@ static void padata_replace(struct padata_instance *pinst,
if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
notification_mask |= PADATA_CPU_SERIAL;
- padata_flush_queues(pd_old);
- padata_free_pd(pd_old);
+ if (atomic_dec_and_test(&pd_old->refcnt))
+ padata_free_pd(pd_old);
if (notification_mask)
blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
@@ -692,8 +671,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
struct cpumask *serial_mask, *parallel_mask;
int err = -EINVAL;
- mutex_lock(&pinst->lock);
get_online_cpus();
+ mutex_lock(&pinst->lock);
switch (cpumask_type) {
case PADATA_CPU_PARALLEL:
@@ -711,8 +690,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
out:
- put_online_cpus();
mutex_unlock(&pinst->lock);
+ put_online_cpus();
return err;
}
diff --git a/kernel/panic.c b/kernel/panic.c
index 3406a7a31449..d5be4272ab18 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -592,37 +592,25 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
add_taint(taint, LOCKDEP_STILL_OK);
}
-#ifdef WANT_WARN_ON_SLOWPATH
-void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
+#ifndef __WARN_FLAGS
+void warn_slowpath_fmt(const char *file, int line, unsigned taint,
+ const char *fmt, ...)
{
struct warn_args args;
- args.fmt = fmt;
- va_start(args.args, fmt);
- __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL,
- &args);
- va_end(args.args);
-}
-EXPORT_SYMBOL(warn_slowpath_fmt);
-
-void warn_slowpath_fmt_taint(const char *file, int line,
- unsigned taint, const char *fmt, ...)
-{
- struct warn_args args;
+ if (!fmt) {
+ pr_warn(CUT_HERE);
+ __warn(file, line, __builtin_return_address(0), taint,
+ NULL, NULL);
+ return;
+ }
args.fmt = fmt;
va_start(args.args, fmt);
__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
}
-EXPORT_SYMBOL(warn_slowpath_fmt_taint);
-
-void warn_slowpath_null(const char *file, int line)
-{
- pr_warn(CUT_HERE);
- __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
-}
-EXPORT_SYMBOL(warn_slowpath_null);
+EXPORT_SYMBOL(warn_slowpath_fmt);
#else
void __warn_printk(const char *fmt, ...)
{
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index cd7434e6000d..9a67082bcbd8 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -897,6 +897,13 @@ static int software_resume(void)
error = freeze_processes();
if (error)
goto Close_Finish;
+
+ error = freeze_kernel_threads();
+ if (error) {
+ thaw_processes();
+ goto Close_Finish;
+ }
+
error = load_image_and_restore();
thaw_processes();
Finish:
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 83105874f255..26b9168321e7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -734,8 +734,15 @@ zone_found:
* We have found the zone. Now walk the radix tree to find the leaf node
* for our PFN.
*/
+
+ /*
+ * If the zone we wish to scan is the the current zone and the
+ * pfn falls into the current node then we do not need to walk
+ * the tree.
+ */
node = bm->cur.node;
- if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
+ if (zone == bm->cur.zone &&
+ ((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
goto node_found;
node = zone->rtree;
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index c8e6ab689d42..b2b0f526f249 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -23,6 +23,9 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args);
void __printk_safe_enter(void);
void __printk_safe_exit(void);
+void printk_safe_init(void);
+bool printk_percpu_data_ready(void);
+
#define printk_safe_enter_irqsave(flags) \
do { \
local_irq_save(flags); \
@@ -64,4 +67,6 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
#define printk_safe_enter_irq() local_irq_disable()
#define printk_safe_exit_irq() local_irq_enable()
+static inline void printk_safe_init(void) { }
+static inline bool printk_percpu_data_ready(void) { return false; }
#endif /* CONFIG_PRINTK */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 424abf802f02..89c6936dee11 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -450,6 +450,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
+/*
+ * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
+ * per_cpu_areas are initialised. This variable is set to true when
+ * it's safe to access per-CPU data.
+ */
+static bool __printk_percpu_data_ready __read_mostly;
+
+bool printk_percpu_data_ready(void)
+{
+ return __printk_percpu_data_ready;
+}
+
/* Return log buffer address */
char *log_buf_addr_get(void)
{
@@ -1136,12 +1148,28 @@ static void __init log_buf_add_cpu(void)
static inline void log_buf_add_cpu(void) {}
#endif /* CONFIG_SMP */
+static void __init set_percpu_data_ready(void)
+{
+ printk_safe_init();
+ /* Make sure we set this flag only after printk_safe() init is done */
+ barrier();
+ __printk_percpu_data_ready = true;
+}
+
void __init setup_log_buf(int early)
{
unsigned long flags;
char *new_log_buf;
unsigned int free;
+ /*
+ * Some archs call setup_log_buf() multiple times - first is very
+ * early, e.g. from setup_arch(), and second - when percpu_areas
+ * are initialised.
+ */
+ if (!early)
+ set_percpu_data_ready();
+
if (log_buf != __log_buf)
return;
@@ -2760,8 +2788,6 @@ void register_console(struct console *newcon)
* for us.
*/
logbuf_lock_irqsave(flags);
- console_seq = syslog_seq;
- console_idx = syslog_idx;
/*
* We're about to replay the log buffer. Only do this to the
* just-registered console to avoid excessive message spam to
@@ -2773,6 +2799,8 @@ void register_console(struct console *newcon)
*/
exclusive_console = newcon;
exclusive_console_stop_seq = console_seq;
+ console_seq = syslog_seq;
+ console_idx = syslog_idx;
logbuf_unlock_irqrestore(flags);
}
console_unlock();
@@ -2956,6 +2984,9 @@ static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
void wake_up_klogd(void)
{
+ if (!printk_percpu_data_ready())
+ return;
+
preempt_disable();
if (waitqueue_active(&log_wait)) {
this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
@@ -2966,6 +2997,9 @@ void wake_up_klogd(void)
void defer_console_output(void)
{
+ if (!printk_percpu_data_ready())
+ return;
+
preempt_disable();
__this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index b4045e782743..d9a659a686f3 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -27,7 +27,6 @@
* There are situations when we want to make sure that all buffers
* were handled or when IRQs are blocked.
*/
-static int printk_safe_irq_ready __read_mostly;
#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
sizeof(atomic_t) - \
@@ -51,7 +50,7 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
/* Get flushed in a more safe context. */
static void queue_flush_work(struct printk_safe_seq_buf *s)
{
- if (printk_safe_irq_ready)
+ if (printk_percpu_data_ready())
irq_work_queue(&s->work);
}
@@ -402,14 +401,6 @@ void __init printk_safe_init(void)
#endif
}
- /*
- * In the highly unlikely event that a NMI were to trigger at
- * this moment. Make sure IRQ work is set up before this
- * variable is set.
- */
- barrier();
- printk_safe_irq_ready = 1;
-
/* Flush pending messages that did not have scheduled IRQ works. */
printk_safe_flush();
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 705887f63288..f7db0123a06b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -259,12 +259,17 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
return ret;
}
-static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
+static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns,
+ unsigned int mode)
{
+ int ret;
+
if (mode & PTRACE_MODE_NOAUDIT)
- return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
+ ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT);
else
- return has_ns_capability(current, ns, CAP_SYS_PTRACE);
+ ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE);
+
+ return ret == 0;
}
/* Returns 0 on success, -errno on denial. */
@@ -316,7 +321,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
gid_eq(caller_gid, tcred->sgid) &&
gid_eq(caller_gid, tcred->gid))
goto ok;
- if (ptrace_has_cap(tcred->user_ns, mode))
+ if (ptrace_has_cap(cred, tcred->user_ns, mode))
goto ok;
rcu_read_unlock();
return -EPERM;
@@ -335,7 +340,7 @@ ok:
mm = task->mm;
if (mm &&
((get_dumpable(mm) != SUID_DUMP_USER) &&
- !ptrace_has_cap(mm->user_ns, mode)))
+ !ptrace_has_cap(cred, mm->user_ns, mode)))
return -EPERM;
return security_ptrace_access_check(task, mode);
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 5ec3ea4028e2..4aa02eee8f6c 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -8,6 +8,17 @@ menu "RCU Debugging"
config PROVE_RCU
def_bool PROVE_LOCKING
+config PROVE_RCU_LIST
+ bool "RCU list lockdep debugging"
+ depends on PROVE_RCU && RCU_EXPERT
+ default n
+ help
+ Enable RCU lockdep checking for list usages. By default it is
+ turned off since there are several list RCU users that still
+ need to be converted to pass a lockdep expression. To prevent
+ false-positive splats, we keep it default disabled but once all
+ users are converted, we can remove this config option.
+
config TORTURE_TEST
tristate
default n
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 9b761e546de8..58c178e2b394 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -530,7 +530,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
idx = rcu_seq_state(ssp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
cbdelay = srcu_get_delay(ssp);
- ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
rcu_seq_end(&ssp->srcu_gp_seq);
gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
@@ -762,6 +762,7 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
unsigned long flags;
struct srcu_data *sdp;
unsigned long t;
+ unsigned long tlast;
/* If the local srcu_data structure has callbacks, not idle. */
local_irq_save(flags);
@@ -780,9 +781,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
/* First, see if enough time has passed since the last GP. */
t = ktime_get_mono_fast_ns();
+ tlast = READ_ONCE(ssp->srcu_last_gp_end);
if (exp_holdoff == 0 ||
- time_in_range_open(t, ssp->srcu_last_gp_end,
- ssp->srcu_last_gp_end + exp_holdoff))
+ time_in_range_open(t, tlast, tlast + exp_holdoff))
return false; /* Too soon after last GP. */
/* Next, check for probable idleness. */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index affa7aae758f..58cdad75ddac 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -547,7 +547,7 @@ static void rcu_eqs_enter(bool user)
}
lockdep_assert_irqs_disabled();
- trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks);
+ trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
rdp = this_cpu_ptr(&rcu_data);
do_nocb_deferred_wakeup(rdp);
@@ -620,14 +620,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
* leave it in non-RCU-idle state.
*/
if (rdp->dynticks_nmi_nesting != 1) {
- trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks);
+ trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
+ atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
rdp->dynticks_nmi_nesting - 2);
return;
}
/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
- trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks);
+ trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
if (irq)
@@ -714,7 +715,7 @@ static void rcu_eqs_exit(bool user)
rcu_dynticks_task_exit();
rcu_dynticks_eqs_exit();
rcu_cleanup_after_idle();
- trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks);
+ trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
WRITE_ONCE(rdp->dynticks_nesting, 1);
WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
@@ -798,7 +799,7 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
}
trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
rdp->dynticks_nmi_nesting,
- rdp->dynticks_nmi_nesting + incby, rdp->dynticks);
+ rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
rdp->dynticks_nmi_nesting + incby);
barrier();
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 9c990df880d1..91357562f168 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -134,7 +134,7 @@ static void __maybe_unused sync_exp_reset_tree(void)
rcu_for_each_node_breadth_first(rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WARN_ON_ONCE(rnp->expmask);
- rnp->expmask = rnp->expmaskinit;
+ WRITE_ONCE(rnp->expmask, rnp->expmaskinit);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -211,7 +211,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
rnp = rnp->parent;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
WARN_ON_ONCE(!(rnp->expmask & mask));
- rnp->expmask &= ~mask;
+ WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
}
}
@@ -241,7 +241,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
- rnp->expmask &= ~mask;
+ WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
__rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */
}
@@ -373,12 +373,10 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* IPI the remaining CPUs for expedited quiescent state. */
- for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) {
unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- if (!(mask_ofl_ipi & mask))
- continue;
retry_ipi:
if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) {
mask_ofl_test |= mask;
@@ -487,7 +485,7 @@ static void synchronize_sched_expedited_wait(void)
struct rcu_data *rdp;
mask = leaf_node_cpu_bit(rnp, cpu);
- if (!(rnp->expmask & mask))
+ if (!(READ_ONCE(rnp->expmask) & mask))
continue;
ndetected++;
rdp = per_cpu_ptr(&rcu_data, cpu);
@@ -499,7 +497,8 @@ static void synchronize_sched_expedited_wait(void)
}
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
jiffies - jiffies_start, rcu_state.expedited_sequence,
- rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
+ READ_ONCE(rnp_root->expmask),
+ ".T"[!!rnp_root->exp_tasks]);
if (ndetected) {
pr_err("blocking rcu_node structures:");
rcu_for_each_node_breadth_first(rnp) {
@@ -509,7 +508,7 @@ static void synchronize_sched_expedited_wait(void)
continue;
pr_cont(" l=%u:%d-%d:%#lx/%c",
rnp->level, rnp->grplo, rnp->grphi,
- rnp->expmask,
+ READ_ONCE(rnp->expmask),
".T"[!!rnp->exp_tasks]);
}
pr_cont("\n");
@@ -517,7 +516,7 @@ static void synchronize_sched_expedited_wait(void)
rcu_for_each_leaf_node(rnp) {
for_each_leaf_node_possible_cpu(rnp, cpu) {
mask = leaf_node_cpu_bit(rnp, cpu);
- if (!(rnp->expmask & mask))
+ if (!(READ_ONCE(rnp->expmask) & mask))
continue;
dump_cpu_task(cpu);
}
@@ -537,14 +536,13 @@ static void rcu_exp_wait_wake(unsigned long s)
struct rcu_node *rnp;
synchronize_sched_expedited_wait();
- rcu_exp_gp_seq_end();
- trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end"));
- /*
- * Switch over to wakeup mode, allowing the next GP, but -only- the
- * next GP, to proceed.
- */
+ // Switch over to wakeup mode, allowing the next GP to proceed.
+ // End the previous grace period only after acquiring the mutex
+ // to ensure that only one GP runs concurrently with wakeups.
mutex_lock(&rcu_state.exp_wake_mutex);
+ rcu_exp_gp_seq_end();
+ trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end"));
rcu_for_each_node_breadth_first(rnp) {
if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
@@ -555,7 +553,7 @@ static void rcu_exp_wait_wake(unsigned long s)
spin_unlock(&rnp->exp_lock);
}
smp_mb(); /* All above changes before wakeup. */
- wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]);
+ wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]);
}
trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake"));
mutex_unlock(&rcu_state.exp_wake_mutex);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 1102765f91fd..8df9df6e018c 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -240,7 +240,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* blocked tasks.
*/
if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
- rnp->gp_tasks = &t->rcu_node_entry;
+ WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry);
WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
}
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
@@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
*/
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
- return rnp->gp_tasks != NULL;
+ return READ_ONCE(rnp->gp_tasks) != NULL;
}
/* Bias and limit values for ->rcu_read_lock_nesting. */
@@ -523,7 +523,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
rnp->gp_seq, t->pid);
if (&t->rcu_node_entry == rnp->gp_tasks)
- rnp->gp_tasks = np;
+ WRITE_ONCE(rnp->gp_tasks, np);
if (&t->rcu_node_entry == rnp->exp_tasks)
rnp->exp_tasks = np;
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
@@ -661,7 +661,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
dump_blkd_tasks(rnp, 10);
if (rcu_preempt_has_tasks(rnp) &&
(rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
- rnp->gp_tasks = rnp->blkd_tasks.next;
+ WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next);
t = container_of(rnp->gp_tasks, struct task_struct,
rcu_node_entry);
trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
@@ -755,7 +755,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n",
- __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks);
+ __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks,
+ rnp->exp_tasks);
pr_info("%s: ->blkd_tasks", __func__);
i = 0;
list_for_each(lhp, &rnp->blkd_tasks) {
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c3bf44ba42e5..eb5e63b6841f 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -61,9 +61,15 @@ module_param(rcu_normal_after_boot, int, 0);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/**
- * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
+ * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section?
+ * @ret: Best guess answer if lockdep cannot be relied on
*
- * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
+ * Returns true if lockdep must be ignored, in which case *ret contains
+ * the best guess described below. Otherwise returns false, in which
+ * case *ret tells the caller nothing and the caller should instead
+ * consult lockdep.
+ *
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an
* RCU-sched read-side critical section. In absence of
* CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
* critical section unless it can prove otherwise. Note that disabling
@@ -75,35 +81,45 @@ module_param(rcu_normal_after_boot, int, 0);
* Check debug_lockdep_rcu_enabled() to prevent false positives during boot
* and while lockdep is disabled.
*
- * Note that if the CPU is in the idle loop from an RCU point of
- * view (ie: that we are in the section between rcu_idle_enter() and
- * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
- * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs
- * that are in such a section, considering these as in extended quiescent
- * state, so such a CPU is effectively never in an RCU read-side critical
- * section regardless of what RCU primitives it invokes. This state of
- * affairs is required --- we need to keep an RCU-free window in idle
- * where the CPU may possibly enter into low power mode. This way we can
- * notice an extended quiescent state to other CPUs that started a grace
- * period. Otherwise we would delay any grace period as long as we run in
- * the idle task.
+ * Note that if the CPU is in the idle loop from an RCU point of view (ie:
+ * that we are in the section between rcu_idle_enter() and rcu_idle_exit())
+ * then rcu_read_lock_held() sets *ret to false even if the CPU did an
+ * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are
+ * in such a section, considering these as in extended quiescent state,
+ * so such a CPU is effectively never in an RCU read-side critical section
+ * regardless of what RCU primitives it invokes. This state of affairs is
+ * required --- we need to keep an RCU-free window in idle where the CPU may
+ * possibly enter into low power mode. This way we can notice an extended
+ * quiescent state to other CPUs that started a grace period. Otherwise
+ * we would delay any grace period as long as we run in the idle task.
*
- * Similarly, we avoid claiming an SRCU read lock held if the current
+ * Similarly, we avoid claiming an RCU read lock held if the current
* CPU is offline.
*/
+static bool rcu_read_lock_held_common(bool *ret)
+{
+ if (!debug_lockdep_rcu_enabled()) {
+ *ret = 1;
+ return true;
+ }
+ if (!rcu_is_watching()) {
+ *ret = 0;
+ return true;
+ }
+ if (!rcu_lockdep_current_cpu_online()) {
+ *ret = 0;
+ return true;
+ }
+ return false;
+}
+
int rcu_read_lock_sched_held(void)
{
- int lockdep_opinion = 0;
+ bool ret;
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
- if (debug_locks)
- lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
- return lockdep_opinion || !preemptible();
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
+ return lock_is_held(&rcu_sched_lock_map) || !preemptible();
}
EXPORT_SYMBOL(rcu_read_lock_sched_held);
#endif
@@ -136,8 +152,7 @@ static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
*/
bool rcu_gp_is_expedited(void)
{
- return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
- rcu_scheduler_active == RCU_SCHEDULER_INIT;
+ return rcu_expedited || atomic_read(&rcu_expedited_nesting);
}
EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
@@ -261,12 +276,10 @@ NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled);
*/
int rcu_read_lock_held(void)
{
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
return lock_is_held(&rcu_lock_map);
}
EXPORT_SYMBOL_GPL(rcu_read_lock_held);
@@ -288,16 +301,28 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held);
*/
int rcu_read_lock_bh_held(void)
{
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
return in_softirq() || irqs_disabled();
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
+int rcu_read_lock_any_held(void)
+{
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
+ if (lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map))
+ return 1;
+ return !preemptible();
+}
+EXPORT_SYMBOL_GPL(rcu_read_lock_any_held);
+
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/**
diff --git a/kernel/relay.c b/kernel/relay.c
index ade14fb7ce2e..4b760ec16342 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -581,6 +581,11 @@ struct rchan *relay_open(const char *base_filename,
return NULL;
chan->buf = alloc_percpu(struct rchan_buf *);
+ if (!chan->buf) {
+ kfree(chan);
+ return NULL;
+ }
+
chan->version = RELAYFS_CHANNEL_VERSION;
chan->n_subbufs = n_subbufs;
chan->subbuf_size = subbuf_size;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index be7c380c07e6..dcc54d44fcde 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2338,6 +2338,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ rseq_migrate(p);
/*
* We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
@@ -2402,6 +2403,7 @@ void wake_up_new_task(struct task_struct *p)
* as we're not fully set-up yet.
*/
p->recent_used_cpu = task_cpu(p);
+ rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
@@ -3102,28 +3104,31 @@ static void sched_tick_remote(struct work_struct *work)
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
- if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+ if (!tick_nohz_tick_stopped_cpu(cpu))
goto out_requeue;
rq_lock_irq(rq, &rf);
curr = rq->curr;
- if (is_idle_task(curr) || cpu_is_offline(cpu))
+ if (cpu_is_offline(cpu))
goto out_unlock;
update_rq_clock(rq);
- delta = rq_clock_task(rq) - curr->se.exec_start;
- /*
- * Make sure the next tick runs within a reasonable
- * amount of time.
- */
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ if (!is_idle_task(curr)) {
+ /*
+ * Make sure the next tick runs within a reasonable
+ * amount of time.
+ */
+ delta = rq_clock_task(rq) - curr->se.exec_start;
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ }
curr->sched_class->task_tick(rq, curr, 0);
+ calc_load_nohz_remote(rq);
out_unlock:
rq_unlock_irq(rq, &rf);
-
out_requeue:
+
/*
* Run the remote tick once per second (1Hz). This arbitrary
* frequency is large enough to avoid overload but short enough
@@ -3857,7 +3862,8 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
*/
if (dl_prio(prio)) {
if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
queue_flag |= ENQUEUE_REPLENISH;
} else
@@ -5405,10 +5411,11 @@ void init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ __sched_fork(0, idle);
+
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_lock(&rq->lock);
- __sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
idle->flags |= PF_IDLE;
@@ -5562,13 +5569,14 @@ void idle_task_exit(void)
struct mm_struct *mm = current->active_mm;
BUG_ON(cpu_online(smp_processor_id()));
+ BUG_ON(current != this_rq()->idle);
if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
- current->active_mm = &init_mm;
finish_arch_post_lock_switch();
}
- mmdrop(mm);
+
+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}
/*
@@ -6429,8 +6437,15 @@ void sched_move_task(struct task_struct *tsk)
if (queued)
enqueue_task(rq, tsk, queue_flags);
- if (running)
+ if (running) {
set_curr_task(rq, tsk);
+ /*
+ * After changing group, the running task may have joined a
+ * throttled one but it's still the running task. Trigger a
+ * resched to make sure that task can still run.
+ */
+ resched_curr(rq);
+ }
task_rq_unlock(rq, tsk, &rf);
}
@@ -6565,6 +6580,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
@@ -6593,6 +6610,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
return -EINVAL;
/*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+ return -EINVAL;
+
+ /*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index b5dcd1d83c7f..7c2fe50fd76d 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,6 +5,8 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
+#include <linux/cpufreq.h>
+
#include "sched.h"
DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
@@ -57,3 +59,19 @@ void cpufreq_remove_update_util_hook(int cpu)
rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
}
EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
+
+/**
+ * cpufreq_this_cpu_can_update - Check if cpufreq policy can be updated.
+ * @policy: cpufreq policy to check.
+ *
+ * Return 'true' if:
+ * - the local and remote CPUs share @policy,
+ * - dvfs_possible_from_any_cpu is set in @policy and the local CPU is not going
+ * offline (in which case it is not expected to run cpufreq updates any more).
+ */
+bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy)
+{
+ return cpumask_test_cpu(smp_processor_id(), policy->cpus) ||
+ (policy->dvfs_possible_from_any_cpu &&
+ rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)));
+}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e139b54716b4..dd44eb122514 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -82,12 +82,10 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
* by the hardware, as calculating the frequency is pointless if
* we cannot in fact act on it.
*
- * For the slow switching platforms, the kthread is always scheduled on
- * the right set of CPUs and any CPU can find the next frequency and
- * schedule the kthread.
+ * This is needed on the slow switching platforms too to prevent CPUs
+ * going offline from leaving stale IRQ work items behind.
*/
- if (sg_policy->policy->fast_switch_enabled &&
- !cpufreq_this_cpu_can_update(sg_policy->policy))
+ if (!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
if (unlikely(sg_policy->limits_changed)) {
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2305ce89a26c..46ed4e1383e2 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -740,7 +740,7 @@ void vtime_account_system(struct task_struct *tsk)
write_seqcount_begin(&vtime->seqcount);
/* We might have scheduled out from guest path */
- if (current->flags & PF_VCPU)
+ if (tsk->flags & PF_VCPU)
vtime_account_guest(tsk, vtime);
else
__vtime_account_system(tsk, vtime);
@@ -783,7 +783,7 @@ void vtime_guest_enter(struct task_struct *tsk)
*/
write_seqcount_begin(&vtime->seqcount);
__vtime_account_system(tsk, vtime);
- current->flags |= PF_VCPU;
+ tsk->flags |= PF_VCPU;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
@@ -794,7 +794,7 @@ void vtime_guest_exit(struct task_struct *tsk)
write_seqcount_begin(&vtime->seqcount);
vtime_account_guest(tsk, vtime);
- current->flags &= ~PF_VCPU;
+ tsk->flags &= ~PF_VCPU;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fcdafdcb129c..3daa9b4cf6d9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2679,6 +2679,7 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_bw = 0;
dl_se->dl_density = 0;
+ dl_se->dl_boosted = 0;
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 14c6a8716ba1..0650be5f5bc7 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -272,7 +272,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0444, proc_dointvec_minmax, false);
set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
/* &table[13] is terminator */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5a312c030b8d..50f4b19a5387 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2659,7 +2659,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
/*
* We don't care about NUMA placement if we don't have memory.
*/
- if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
return;
/*
@@ -3488,9 +3488,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (decayed)
- cfs_rq_util_change(cfs_rq, 0);
-
return decayed;
}
@@ -3596,8 +3593,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
update_tg_load_avg(cfs_rq, 0);
- } else if (decayed && (flags & UPDATE_TG))
- update_tg_load_avg(cfs_rq, 0);
+ } else if (decayed) {
+ cfs_rq_util_change(cfs_rq, 0);
+
+ if (flags & UPDATE_TG)
+ update_tg_load_avg(cfs_rq, 0);
+ }
}
#ifndef CONFIG_64BIT
@@ -3805,7 +3806,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
return;
}
- rq->misfit_task_load = task_h_load(p);
+ /*
+ * Make sure that misfit_task_load will not be null even if
+ * task_h_load() returns 0.
+ */
+ rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
}
#else /* CONFIG_SMP */
@@ -3908,6 +3913,7 @@ static inline void check_schedstat_required(void)
#endif
}
+static inline bool cfs_bandwidth_used(void);
/*
* MIGRATION
@@ -3986,10 +3992,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- if (cfs_rq->nr_running == 1) {
+ /*
+ * When bandwidth control is enabled, cfs might have been removed
+ * because of a parent been throttled but cfs->nr_running > 1. Try to
+ * add it unconditionnally.
+ */
+ if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
list_add_leaf_cfs_rq(cfs_rq);
+
+ if (cfs_rq->nr_running == 1)
check_enqueue_throttle(cfs_rq);
- }
}
static void __clear_buddies_last(struct sched_entity *se)
@@ -4334,23 +4346,16 @@ static inline u64 sched_cfs_bandwidth_slice(void)
}
/*
- * Replenish runtime according to assigned quota and update expiration time.
- * We use sched_clock_cpu directly instead of rq->clock to avoid adding
- * additional synchronization around rq->lock.
+ * Replenish runtime according to assigned quota. We use sched_clock_cpu
+ * directly instead of rq->clock to avoid adding additional synchronization
+ * around rq->lock.
*
* requires cfs_b->lock
*/
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
- u64 now;
-
- if (cfs_b->quota == RUNTIME_INF)
- return;
-
- now = sched_clock_cpu(smp_processor_id());
- cfs_b->runtime = cfs_b->quota;
- cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
- cfs_b->expires_seq++;
+ if (cfs_b->quota != RUNTIME_INF)
+ cfs_b->runtime = cfs_b->quota;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4372,8 +4377,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount, expires;
- int expires_seq;
+ u64 amount = 0, min_amount;
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4390,61 +4394,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- expires_seq = cfs_b->expires_seq;
- expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
- /*
- * we may have advanced our local expiration to account for allowed
- * spread between our sched_clock and the one on which runtime was
- * issued.
- */
- if (cfs_rq->expires_seq != expires_seq) {
- cfs_rq->expires_seq = expires_seq;
- cfs_rq->runtime_expires = expires;
- }
return cfs_rq->runtime_remaining > 0;
}
-/*
- * Note: This depends on the synchronization provided by sched_clock and the
- * fact that rq->clock snapshots this value.
- */
-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
- /* if the deadline is ahead of our clock, nothing to do */
- if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
- return;
-
- if (cfs_rq->runtime_remaining < 0)
- return;
-
- /*
- * If the local deadline has passed we have to consider the
- * possibility that our sched_clock is 'fast' and the global deadline
- * has not truly expired.
- *
- * Fortunately we can check determine whether this the case by checking
- * whether the global deadline(cfs_b->expires_seq) has advanced.
- */
- if (cfs_rq->expires_seq == cfs_b->expires_seq) {
- /* extend local deadline, drift is bounded above by 2 ticks */
- cfs_rq->runtime_expires += TICK_NSEC;
- } else {
- /* global deadline is ahead, expiration has passed */
- cfs_rq->runtime_remaining = 0;
- }
-}
-
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
- expire_cfs_rq_runtime(cfs_rq);
if (likely(cfs_rq->runtime_remaining > 0))
return;
@@ -4535,7 +4495,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, dequeue = 1;
+ long task_delta, idle_task_delta, dequeue = 1;
bool empty;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -4546,6 +4506,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
rcu_read_unlock();
task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
@@ -4555,6 +4516,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (dequeue)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
qcfs_rq->h_nr_running -= task_delta;
+ qcfs_rq->idle_h_nr_running -= idle_task_delta;
if (qcfs_rq->load.weight)
dequeue = 0;
@@ -4593,8 +4555,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- int enqueue = 1;
- long task_delta;
+ long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -4614,31 +4575,64 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
return;
task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
if (se->on_rq)
- enqueue = 0;
+ break;
+ cfs_rq = cfs_rq_of(se);
+ enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+
+ cfs_rq->h_nr_running += task_delta;
+ cfs_rq->idle_h_nr_running += idle_task_delta;
+
+ /* end evaluation on encountering a throttled cfs_rq */
+ if (cfs_rq_throttled(cfs_rq))
+ goto unthrottle_throttle;
+ }
+ for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- if (enqueue)
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+
cfs_rq->h_nr_running += task_delta;
+ cfs_rq->idle_h_nr_running += idle_task_delta;
+
+ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
+ goto unthrottle_throttle;
+
+ /*
+ * One parent has been throttled and cfs_rq removed from the
+ * list. Add it back to not break the leaf list.
+ */
+ if (throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+ }
+
+ /* At this point se is NULL and we are at root level*/
+ add_nr_running(rq, task_delta);
+
+unthrottle_throttle:
+ /*
+ * The cfs_rq_throttled() breaks in the above iteration can result in
+ * incomplete leaf list maintenance, resulting in triggering the
+ * assertion below.
+ */
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (list_add_leaf_cfs_rq(cfs_rq))
break;
}
assert_list_leaf_cfs_rq(rq);
- if (!se)
- add_nr_running(rq, task_delta);
-
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
}
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
- u64 remaining, u64 expires)
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
{
struct cfs_rq *cfs_rq;
u64 runtime;
@@ -4663,7 +4657,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
remaining -= runtime;
cfs_rq->runtime_remaining += runtime;
- cfs_rq->runtime_expires = expires;
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0)
@@ -4688,7 +4681,7 @@ next:
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
- u64 runtime, runtime_expires;
+ u64 runtime;
int throttled;
/* no need to continue the timer with no bandwidth constraint */
@@ -4716,8 +4709,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
- runtime_expires = cfs_b->runtime_expires;
-
/*
* This check is repeated as we are holding onto the new bandwidth while
* we unthrottle. This can potentially race with an unthrottled group
@@ -4730,8 +4721,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
cfs_b->distribute_running = 1;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- runtime = distribute_cfs_runtime(cfs_b, runtime,
- runtime_expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->distribute_running = 0;
@@ -4808,8 +4798,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
return;
raw_spin_lock(&cfs_b->lock);
- if (cfs_b->quota != RUNTIME_INF &&
- cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+ if (cfs_b->quota != RUNTIME_INF) {
cfs_b->runtime += slack_runtime;
/* we are under rq->lock, defer unthrottling using a timer */
@@ -4842,7 +4831,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
unsigned long flags;
- u64 expires;
/* confirm we're still not at a refresh boundary */
raw_spin_lock_irqsave(&cfs_b->lock, flags);
@@ -4859,7 +4847,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- expires = cfs_b->runtime_expires;
if (runtime)
cfs_b->distribute_running = 1;
@@ -4868,11 +4855,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (!runtime)
return;
- runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- if (expires == cfs_b->runtime_expires)
- lsub_positive(&cfs_b->runtime, runtime);
+ lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
@@ -4965,29 +4951,37 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
- new = (old * 147) / 128; /* ~115% */
- new = min(new, max_cfs_quota_period);
-
- cfs_b->period = ns_to_ktime(new);
-
- /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
- cfs_b->quota *= new;
- cfs_b->quota = div64_u64(cfs_b->quota, old);
-
- pr_warn_ratelimited(
- "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
- smp_processor_id(),
- div_u64(new, NSEC_PER_USEC),
- div_u64(cfs_b->quota, NSEC_PER_USEC));
+ /*
+ * Grow period by a factor of 2 to avoid losing precision.
+ * Precision loss in the quota/period ratio can cause __cfs_schedulable
+ * to fail.
+ */
+ new = old * 2;
+ if (new < max_cfs_quota_period) {
+ cfs_b->period = ns_to_ktime(new);
+ cfs_b->quota *= 2;
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ } else {
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(old, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ }
/* reset count so we don't come right back in here */
count = 0;
}
-
- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;
@@ -5019,17 +5013,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
- u64 overrun;
-
lockdep_assert_held(&cfs_b->lock);
if (cfs_b->period_active)
return;
cfs_b->period_active = 1;
- overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
- cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
- cfs_b->expires_seq++;
+ hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
}
@@ -5229,6 +5219,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ int idle_h_nr_running = task_has_idle_policy(p);
/*
* The code below (indirectly) updates schedutil which looks at
@@ -5252,30 +5243,38 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);
- /*
- * end evaluation on encountering a throttled cfs_rq
- *
- * note: in the case of encountering a throttled cfs_rq we will
- * post the final h_nr_running increment below.
- */
- if (cfs_rq_throttled(cfs_rq))
- break;
cfs_rq->h_nr_running++;
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
+
+ /* end evaluation on encountering a throttled cfs_rq */
+ if (cfs_rq_throttled(cfs_rq))
+ goto enqueue_throttle;
flags = ENQUEUE_WAKEUP;
}
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
+
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_cfs_group(se);
+
cfs_rq->h_nr_running++;
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- break;
+ goto enqueue_throttle;
- update_load_avg(cfs_rq, se, UPDATE_TG);
- update_cfs_group(se);
+ /*
+ * One parent has been throttled and cfs_rq removed from the
+ * list. Add it back to not break the leaf list.
+ */
+ if (throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
}
+enqueue_throttle:
if (!se) {
add_nr_running(rq, 1);
/*
@@ -5329,20 +5328,18 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
+ int idle_h_nr_running = task_has_idle_policy(p);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
- /*
- * end evaluation on encountering a throttled cfs_rq
- *
- * note: in the case of encountering a throttled cfs_rq we will
- * post the final h_nr_running decrement below.
- */
- if (cfs_rq_throttled(cfs_rq))
- break;
cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+
+ /* end evaluation on encountering a throttled cfs_rq */
+ if (cfs_rq_throttled(cfs_rq))
+ goto dequeue_throttle;
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -5361,15 +5358,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
+
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_cfs_group(se);
+
cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- break;
+ goto dequeue_throttle;
- update_load_avg(cfs_rq, se, UPDATE_TG);
- update_cfs_group(se);
}
+dequeue_throttle:
if (!se)
sub_nr_running(rq, 1);
@@ -6243,6 +6245,7 @@ static inline int select_idle_smt(struct task_struct *p, int target)
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
u64 time, cost;
@@ -6273,11 +6276,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = local_clock();
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
return -1;
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
- continue;
if (available_idle_cpu(cpu))
break;
}
@@ -7659,7 +7662,15 @@ static int detach_tasks(struct lb_env *env)
if (!can_migrate_task(p, env))
goto next;
- load = task_h_load(p);
+ /*
+ * Depending of the number of CPUs and tasks and the
+ * cgroup hierarchy, task_h_load() can return a null
+ * value. Make sure that env->imbalance decreases
+ * otherwise detach_tasks() will stop only after
+ * detaching up to loop_max tasks.
+ */
+ load = max_t(unsigned long, task_h_load(p), 1);
+
if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
goto next;
@@ -7754,6 +7765,7 @@ static void attach_tasks(struct lb_env *env)
rq_unlock(env->dst_rq, &rf);
}
+#ifdef CONFIG_NO_HZ_COMMON
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
{
if (cfs_rq->avg.load_avg)
@@ -7781,6 +7793,41 @@ static inline bool others_have_blocked(struct rq *rq)
return false;
}
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
+ rq->last_blocked_load_update_tick = jiffies;
+
+ if (!has_blocked)
+ rq->has_blocked_load = 0;
+}
+#else
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
+static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
+#endif
+
+static bool __update_blocked_others(struct rq *rq, bool *done)
+{
+ const struct sched_class *curr_class;
+ u64 now = rq_clock_pelt(rq);
+ bool decayed;
+
+ /*
+ * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
+ * DL and IRQ signals have been updated before updating CFS.
+ */
+ curr_class = rq->curr->sched_class;
+
+ decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
+ update_irq_load_avg(rq, 0);
+
+ if (others_have_blocked(rq))
+ *done = false;
+
+ return decayed;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7800,16 +7847,11 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
return true;
}
-static void update_blocked_averages(int cpu)
+static bool __update_blocked_fair(struct rq *rq, bool *done)
{
- struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
- const struct sched_class *curr_class;
- struct rq_flags rf;
- bool done = true;
-
- rq_lock_irqsave(rq, &rf);
- update_rq_clock(rq);
+ bool decayed = false;
+ int cpu = cpu_of(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
@@ -7818,9 +7860,13 @@ static void update_blocked_averages(int cpu)
for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
- if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq, 0);
+ if (cfs_rq == &rq->cfs)
+ decayed = true;
+ }
+
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
@@ -7835,23 +7881,10 @@ static void update_blocked_averages(int cpu)
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
- done = false;
+ *done = false;
}
- curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
- /* Don't need periodic decay once load/util_avg are null */
- if (others_have_blocked(rq))
- done = false;
-
-#ifdef CONFIG_NO_HZ_COMMON
- rq->last_blocked_load_update_tick = jiffies;
- if (done)
- rq->has_blocked_load = 0;
-#endif
- rq_unlock_irqrestore(rq, &rf);
+ return decayed;
}
/*
@@ -7901,27 +7934,16 @@ static unsigned long task_h_load(struct task_struct *p)
cfs_rq_load_avg(cfs_rq) + 1);
}
#else
-static inline void update_blocked_averages(int cpu)
+static bool __update_blocked_fair(struct rq *rq, bool *done)
{
- struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
- const struct sched_class *curr_class;
- struct rq_flags rf;
+ bool decayed;
- rq_lock_irqsave(rq, &rf);
- update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+ decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+ if (cfs_rq_has_blocked(cfs_rq))
+ *done = false;
- curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
-#ifdef CONFIG_NO_HZ_COMMON
- rq->last_blocked_load_update_tick = jiffies;
- if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
- rq->has_blocked_load = 0;
-#endif
- rq_unlock_irqrestore(rq, &rf);
+ return decayed;
}
static unsigned long task_h_load(struct task_struct *p)
@@ -7930,6 +7952,24 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif
+static void update_blocked_averages(int cpu)
+{
+ bool decayed = false, done = true;
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+
+ decayed |= __update_blocked_others(rq, &done);
+ decayed |= __update_blocked_fair(rq, &done);
+
+ update_blocked_load_status(rq, !done);
+ if (decayed)
+ cpufreq_update_util(rq, 0);
+ rq_unlock_irqrestore(rq, &rf);
+}
+
/********** Helpers for find_busiest_group ************************/
/*
@@ -9633,7 +9673,12 @@ static void kick_ilb(unsigned int flags)
{
int ilb_cpu;
- nohz.next_balance++;
+ /*
+ * Increase nohz.next_balance only when if full ilb is triggered but
+ * not if we only update stats.
+ */
+ if (flags & NOHZ_BALANCE_KICK)
+ nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb();
@@ -9952,6 +9997,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
}
}
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the CPU is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ nohz.next_balance = next_balance;
+
/* Newly idle CPU doesn't need an update */
if (idle != CPU_NEWLY_IDLE) {
update_blocked_averages(this_cpu);
@@ -9972,14 +10025,6 @@ abort:
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
- /*
- * next_balance will be updated only when there is a need.
- * When the CPU is attached to null domain for ex, it will not be
- * updated.
- */
- if (likely(update_next_balance))
- nohz.next_balance = next_balance;
-
return ret;
}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 28a516575c18..de22da666ac7 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -231,16 +231,11 @@ static inline int calc_load_read_idx(void)
return calc_load_idx & 1;
}
-void calc_load_nohz_start(void)
+static void calc_load_nohz_fold(struct rq *rq)
{
- struct rq *this_rq = this_rq();
long delta;
- /*
- * We're going into NO_HZ mode, if there's any pending delta, fold it
- * into the pending NO_HZ delta.
- */
- delta = calc_load_fold_active(this_rq, 0);
+ delta = calc_load_fold_active(rq, 0);
if (delta) {
int idx = calc_load_write_idx();
@@ -248,6 +243,24 @@ void calc_load_nohz_start(void)
}
}
+void calc_load_nohz_start(void)
+{
+ /*
+ * We're going into NO_HZ mode, if there's any pending delta, fold it
+ * into the pending NO_HZ delta.
+ */
+ calc_load_nohz_fold(this_rq());
+}
+
+/*
+ * Keep track of the load for NOHZ_FULL, must be called between
+ * calc_load_nohz_{start,stop}().
+ */
+void calc_load_nohz_remote(struct rq *rq)
+{
+ calc_load_nohz_fold(rq);
+}
+
void calc_load_nohz_stop(void)
{
struct rq *this_rq = this_rq();
@@ -268,7 +281,7 @@ void calc_load_nohz_stop(void)
this_rq->calc_load_update += LOAD_FREQ;
}
-static long calc_load_nohz_fold(void)
+static long calc_load_nohz_read(void)
{
int idx = calc_load_read_idx();
long delta = 0;
@@ -323,7 +336,7 @@ static void calc_global_nohz(void)
}
#else /* !CONFIG_NO_HZ_COMMON */
-static inline long calc_load_nohz_fold(void) { return 0; }
+static inline long calc_load_nohz_read(void) { return 0; }
static inline void calc_global_nohz(void) { }
#endif /* CONFIG_NO_HZ_COMMON */
@@ -346,7 +359,7 @@ void calc_global_load(unsigned long ticks)
/*
* Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
*/
- delta = calc_load_nohz_fold();
+ delta = calc_load_nohz_read();
if (delta)
atomic_long_add(delta, &calc_load_tasks);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 517e3719027e..9154e745f097 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -185,7 +185,8 @@ static void group_init(struct psi_group *group)
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
- group->avg_next_update = sched_clock() + psi_period;
+ group->avg_last_update = sched_clock();
+ group->avg_next_update = group->avg_last_update + psi_period;
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
@@ -481,7 +482,7 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
u32 remaining;
remaining = win->size - elapsed;
- growth += div_u64(win->prev_growth * remaining, win->size);
+ growth += div64_u64(win->prev_growth * remaining, win->size);
}
return growth;
@@ -1198,6 +1199,9 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
+ if (!nbytes)
+ return -EINVAL;
+
buf_size = min(nbytes, sizeof(buf));
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1e6b909dca36..c4108c93b2cf 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,8 @@
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+/* More than 4 hours if BW_SHIFT equals 20. */
+static const u64 max_rt_runtime = MAX_BW;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -2522,6 +2524,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
if (rt_period == 0)
return -EINVAL;
+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -2643,7 +2651,9 @@ static int sched_rt_global_validate(void)
return -EINVAL;
if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
- (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+ ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
+ ((u64)sysctl_sched_rt_runtime *
+ NSEC_PER_USEC > max_rt_runtime)))
return -EINVAL;
return 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b52ed1ada0be..2efae7800633 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -124,7 +124,13 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
#ifdef CONFIG_64BIT
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
-# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT)
+# define scale_load_down(w) \
+({ \
+ unsigned long __w = (w); \
+ if (__w) \
+ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
+ __w; \
+})
#else
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) (w)
@@ -341,8 +347,6 @@ struct cfs_bandwidth {
u64 quota;
u64 runtime;
s64 hierarchical_quota;
- u64 runtime_expires;
- int expires_seq;
short idle;
short period_active;
@@ -489,7 +493,8 @@ struct cfs_rq {
struct load_weight load;
unsigned long runnable_weight;
unsigned int nr_running;
- unsigned int h_nr_running;
+ unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int idle_h_nr_running; /* SCHED_IDLE */
u64 exec_clock;
u64 min_vruntime;
@@ -562,8 +567,6 @@ struct cfs_rq {
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
- int expires_seq;
- u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_clock;
@@ -1797,6 +1800,8 @@ extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
+#define MAX_BW_BITS (64 - BW_SHIFT)
+#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f53f89df837d..6fb3e636fbe1 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1332,7 +1332,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd_flags = (*tl->sd_flags)();
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
"wrong sd_flags in topology description\n"))
- sd_flags &= ~TOPOLOGY_SD_FLAGS;
+ sd_flags &= TOPOLOGY_SD_FLAGS;
/* Apply detected topology flags */
sd_flags |= dflags;
@@ -1872,6 +1872,42 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
}
/*
+ * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
+ * any two given CPUs at this (non-NUMA) topology level.
+ */
+static bool topology_span_sane(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map, int cpu)
+{
+ int i;
+
+ /* NUMA levels are allowed to overlap */
+ if (tl->flags & SDTL_OVERLAP)
+ return true;
+
+ /*
+ * Non-NUMA levels cannot partially overlap - they must be either
+ * completely equal or completely disjoint. Otherwise we can end up
+ * breaking the sched_group lists - i.e. a later get_group() pass
+ * breaks the linking done for an earlier span.
+ */
+ for_each_cpu(i, cpu_map) {
+ if (i == cpu)
+ continue;
+ /*
+ * We should 'and' all those masks with 'cpu_map' to exactly
+ * match the topology we're about to build, but that can only
+ * remove CPUs, which only lessens our ability to detect
+ * overlaps
+ */
+ if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
+ cpumask_intersects(tl->mask(cpu), tl->mask(i)))
+ return false;
+ }
+
+ return true;
+}
+
+/*
* Find the sched_domain_topology_level where all CPU capacities are visible
* for all CPUs.
*/
@@ -1937,7 +1973,7 @@ next_level:
static int
build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
{
- enum s_alloc alloc_state;
+ enum s_alloc alloc_state = sa_none;
struct sched_domain *sd;
struct s_data d;
struct rq *rq = NULL;
@@ -1945,6 +1981,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
struct sched_domain_topology_level *tl_asym;
bool has_asym = false;
+ if (WARN_ON(cpumask_empty(cpu_map)))
+ goto error;
+
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
@@ -1964,6 +2003,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
has_asym = true;
}
+ if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
+ goto error;
+
sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
if (tl == sched_domain_topology)
@@ -2015,7 +2057,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
rcu_read_unlock();
if (has_asym)
- static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
+ static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
if (rq && sched_debug_enabled) {
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
@@ -2110,8 +2152,12 @@ int sched_init_domains(const struct cpumask *cpu_map)
*/
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
+ unsigned int cpu = cpumask_any(cpu_map);
int i;
+ if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
+ static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
+
rcu_read_lock();
for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 811b4a86cdf6..e914d72b3290 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -42,6 +42,14 @@
#include <linux/uaccess.h>
#include <linux/anon_inodes.h>
+/*
+ * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
+ * wrong direction flag in the ioctl number. This is the broken one,
+ * which the kernel needs to keep supporting until all userspaces stop
+ * using the wrong command number.
+ */
+#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64)
+
enum notify_state {
SECCOMP_NOTIFY_INIT,
SECCOMP_NOTIFY_SENT,
@@ -1161,6 +1169,7 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
return seccomp_notify_recv(filter, buf);
case SECCOMP_IOCTL_NOTIF_SEND:
return seccomp_notify_send(filter, buf);
+ case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
case SECCOMP_IOCTL_NOTIF_ID_VALID:
return seccomp_notify_id_valid(filter, buf);
default:
@@ -1198,6 +1207,7 @@ static const struct file_operations seccomp_notify_ops = {
.poll = seccomp_notify_poll,
.release = seccomp_notify_release,
.unlocked_ioctl = seccomp_notify_ioctl,
+ .compat_ioctl = seccomp_notify_ioctl,
};
static struct file *init_listener(struct seccomp_filter *filter)
diff --git a/kernel/signal.c b/kernel/signal.c
index 07da8def4329..00448edd0282 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -408,27 +408,32 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
{
struct sigqueue *q = NULL;
struct user_struct *user;
+ int sigpending;
/*
* Protect access to @t credentials. This can go away when all
* callers hold rcu read lock.
+ *
+ * NOTE! A pending signal will hold on to the user refcount,
+ * and we get/put the refcount only when the sigpending count
+ * changes from/to zero.
*/
rcu_read_lock();
- user = get_uid(__task_cred(t)->user);
- atomic_inc(&user->sigpending);
+ user = __task_cred(t)->user;
+ sigpending = atomic_inc_return(&user->sigpending);
+ if (sigpending == 1)
+ get_uid(user);
rcu_read_unlock();
- if (override_rlimit ||
- atomic_read(&user->sigpending) <=
- task_rlimit(t, RLIMIT_SIGPENDING)) {
+ if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
q = kmem_cache_alloc(sigqueue_cachep, flags);
} else {
print_dropped_signal(sig);
}
if (unlikely(q == NULL)) {
- atomic_dec(&user->sigpending);
- free_uid(user);
+ if (atomic_dec_and_test(&user->sigpending))
+ free_uid(user);
} else {
INIT_LIST_HEAD(&q->list);
q->flags = 0;
@@ -442,8 +447,8 @@ static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
- atomic_dec(&q->user->sigpending);
- free_uid(q->user);
+ if (atomic_dec_and_test(&q->user->sigpending))
+ free_uid(q->user);
kmem_cache_free(sigqueue_cachep, q);
}
@@ -1486,15 +1491,15 @@ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
unsigned long flags;
int ret = -EINVAL;
+ if (!valid_signal(sig))
+ return ret;
+
clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = errno;
info.si_code = SI_ASYNCIO;
*((sigval_t *)&info.si_pid) = addr;
- if (!valid_signal(sig))
- return ret;
-
rcu_read_lock();
p = pid_task(pid, PIDTYPE_PID);
if (!p) {
@@ -1876,7 +1881,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
* This is only possible if parent == real_parent.
* Check if it has changed security domain.
*/
- if (tsk->parent_exec_id != tsk->parent->self_exec_id)
+ if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id))
sig = SIGCHLD;
}
@@ -1938,8 +1943,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
sig = 0;
}
+ /*
+ * Send with __send_signal as si_pid and si_uid are in the
+ * parent's namespaces.
+ */
if (valid_signal(sig) && sig)
- __group_send_sig_info(sig, &info, tsk->parent);
+ __send_signal(sig, &info, tsk->parent, PIDTYPE_TGID, false);
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);
@@ -2907,80 +2916,49 @@ EXPORT_SYMBOL(sigprocmask);
*
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and
* epoll_pwait where a new sigmask is passed from userland for the syscalls.
+ *
+ * Note that it does set_restore_sigmask() in advance, so it must be always
+ * paired with restore_saved_sigmask_unless() before return from syscall.
*/
-int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
- sigset_t *oldset, size_t sigsetsize)
+int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
- if (!usigmask)
- return 0;
+ sigset_t kmask;
+ if (!umask)
+ return 0;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
- if (copy_from_user(set, usigmask, sizeof(sigset_t)))
+ if (copy_from_user(&kmask, umask, sizeof(sigset_t)))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(&kmask);
return 0;
}
-EXPORT_SYMBOL(set_user_sigmask);
#ifdef CONFIG_COMPAT
-int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
- sigset_t *set, sigset_t *oldset,
+int set_compat_user_sigmask(const compat_sigset_t __user *umask,
size_t sigsetsize)
{
- if (!usigmask)
- return 0;
+ sigset_t kmask;
+ if (!umask)
+ return 0;
if (sigsetsize != sizeof(compat_sigset_t))
return -EINVAL;
- if (get_compat_sigset(set, usigmask))
+ if (get_compat_sigset(&kmask, umask))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(&kmask);
return 0;
}
-EXPORT_SYMBOL(set_compat_user_sigmask);
#endif
-/*
- * restore_user_sigmask:
- * usigmask: sigmask passed in from userland.
- * sigsaved: saved sigmask when the syscall started and changed the sigmask to
- * usigmask.
- *
- * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
- * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
- */
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved,
- bool interrupted)
-{
-
- if (!usigmask)
- return;
- /*
- * When signals are pending, do not restore them here.
- * Restoring sigmask here can lead to delivering signals that the above
- * syscalls are intended to block because of the sigmask passed in.
- */
- if (interrupted) {
- current->saved_sigmask = *sigsaved;
- set_restore_sigmask();
- return;
- }
-
- /*
- * This is needed because the fast syscall return path does not restore
- * saved_sigmask when signals are not pending.
- */
- set_current_blocked(sigsaved);
-}
-EXPORT_SYMBOL(restore_user_sigmask);
-
/**
* sys_rt_sigprocmask - change the list of currently blocked signals
* @how: whether to add, remove, or set signals
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 899b726c9e98..97c44cff0229 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -141,7 +141,8 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store,
struct stacktrace_cookie c = {
.store = store,
.size = size,
- .skip = skipnr + 1,
+ /* skip this function if they are tracing us */
+ .skip = skipnr + !!(current == tsk),
};
if (!try_get_task_stack(tsk))
@@ -306,7 +307,8 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
struct stack_trace trace = {
.entries = store,
.max_entries = size,
- .skip = skipnr + 1,
+ /* skip this function if they are tracing us */
+ .skip = skipnr + !!(current == task),
};
save_stack_trace_tsk(task, &trace);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1beca96fb625..952fc547ea50 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1452,7 +1452,7 @@ static struct ctl_table vm_table[] = {
.procname = "drop_caches",
.data = &sysctl_drop_caches,
.maxlen = sizeof(int),
- .mode = 0644,
+ .mode = 0200,
.proc_handler = drop_caches_sysctl_handler,
.extra1 = &one,
.extra2 = &four,
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 13a0f2e6ebc2..e2ac0e37c4ae 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -554,25 +554,33 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
- struct taskstats *stats;
+ struct taskstats *stats_new, *stats;
- if (sig->stats || thread_group_empty(tsk))
- goto ret;
+ /* Pairs with smp_store_release() below. */
+ stats = smp_load_acquire(&sig->stats);
+ if (stats || thread_group_empty(tsk))
+ return stats;
/* No problem if kmem_cache_zalloc() fails */
- stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
+ stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
spin_lock_irq(&tsk->sighand->siglock);
- if (!sig->stats) {
- sig->stats = stats;
- stats = NULL;
+ stats = sig->stats;
+ if (!stats) {
+ /*
+ * Pairs with smp_store_release() above and order the
+ * kmem_cache_zalloc().
+ */
+ smp_store_release(&sig->stats, stats_new);
+ stats = stats_new;
+ stats_new = NULL;
}
spin_unlock_irq(&tsk->sighand->siglock);
- if (stats)
- kmem_cache_free(taskstats_cache, stats);
-ret:
- return sig->stats;
+ if (stats_new)
+ kmem_cache_free(taskstats_cache, stats_new);
+
+ return stats;
}
/* Send pid data out on exit */
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index bfe0e0656f02..15f20dd30bd8 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -88,6 +88,8 @@ static int alarmtimer_rtc_add_device(struct device *dev,
unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
struct wakeup_source *__ws;
+ struct platform_device *pdev;
+ int ret = 0;
if (rtcdev)
return -EBUSY;
@@ -98,12 +100,14 @@ static int alarmtimer_rtc_add_device(struct device *dev,
return -1;
__ws = wakeup_source_register("alarmtimer");
+ pdev = platform_device_register_data(dev, "alarmtimer",
+ PLATFORM_DEVID_AUTO, NULL, 0);
spin_lock_irqsave(&rtcdev_lock, flags);
- if (!rtcdev) {
+ if (__ws && !IS_ERR(pdev) && !rtcdev) {
if (!try_module_get(rtc->owner)) {
- spin_unlock_irqrestore(&rtcdev_lock, flags);
- return -1;
+ ret = -1;
+ goto unlock;
}
rtcdev = rtc;
@@ -111,12 +115,17 @@ static int alarmtimer_rtc_add_device(struct device *dev,
get_device(dev);
ws = __ws;
__ws = NULL;
+ pdev = NULL;
+ } else {
+ ret = -1;
}
+unlock:
spin_unlock_irqrestore(&rtcdev_lock, flags);
+ platform_device_unregister(pdev);
wakeup_source_unregister(__ws);
- return 0;
+ return ret;
}
static inline void alarmtimer_rtc_timer_init(void)
@@ -861,8 +870,7 @@ static struct platform_driver alarmtimer_driver = {
*/
static int __init alarmtimer_init(void)
{
- struct platform_device *pdev;
- int error = 0;
+ int error;
int i;
alarmtimer_rtc_timer_init();
@@ -885,15 +893,7 @@ static int __init alarmtimer_init(void)
if (error)
goto out_if;
- pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0);
- if (IS_ERR(pdev)) {
- error = PTR_ERR(pdev);
- goto out_drv;
- }
return 0;
-
-out_drv:
- platform_driver_unregister(&alarmtimer_driver);
out_if:
alarmtimer_rtc_interface_remove();
return error;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 3bcc19ceb073..cf74160e79c7 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -293,8 +293,15 @@ static void clocksource_watchdog(struct timer_list *unused)
next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(cpu_online_mask);
- watchdog_timer.expires += WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer, next_cpu);
+
+ /*
+ * Arm timer if not already pending: could race with concurrent
+ * pair clocksource_stop_watchdog() clocksource_start_watchdog().
+ */
+ if (!timer_pending(&watchdog_timer)) {
+ watchdog_timer.expires += WATCHDOG_INTERVAL;
+ add_timer_on(&watchdog_timer, next_cpu);
+ }
out:
spin_unlock(&watchdog_lock);
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 41dfff23c1f9..5f15dad63bce 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -946,7 +946,8 @@ static int enqueue_hrtimer(struct hrtimer *timer,
base->cpu_base->active_bases |= 1 << base->index;
- timer->state = HRTIMER_STATE_ENQUEUED;
+ /* Pairs with the lockless read in hrtimer_is_queued() */
+ WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);
return timerqueue_add(&base->active, &timer->node);
}
@@ -968,7 +969,8 @@ static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
u8 state = timer->state;
- timer->state = newstate;
+ /* Pairs with the lockless read in hrtimer_is_queued() */
+ WRITE_ONCE(timer->state, newstate);
if (!(state & HRTIMER_STATE_ENQUEUED))
return;
@@ -993,8 +995,9 @@ static void __remove_hrtimer(struct hrtimer *timer,
static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
{
- if (hrtimer_is_queued(timer)) {
- u8 state = timer->state;
+ u8 state = timer->state;
+
+ if (state & HRTIMER_STATE_ENQUEUED) {
int reprogram;
/*
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 65eb796610dc..069ca78fb0bf 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -771,7 +771,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts,
/* fill PPS status fields */
pps_fill_timex(txc);
- txc->time.tv_sec = (time_t)ts->tv_sec;
+ txc->time.tv_sec = ts->tv_sec;
txc->time.tv_usec = ts->tv_nsec;
if (!(time_status & STA_NANO))
txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC;
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index ec960bb939fd..200fb2d3be99 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -14,8 +14,6 @@
#include "posix-timers.h"
-static void delete_clock(struct kref *kref);
-
/*
* Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
*/
@@ -125,7 +123,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
err = 0;
if (!err) {
- kref_get(&clk->kref);
+ get_device(clk->dev);
fp->private_data = clk;
}
out:
@@ -141,7 +139,7 @@ static int posix_clock_release(struct inode *inode, struct file *fp)
if (clk->ops.release)
err = clk->ops.release(clk);
- kref_put(&clk->kref, delete_clock);
+ put_device(clk->dev);
fp->private_data = NULL;
@@ -161,38 +159,35 @@ static const struct file_operations posix_clock_file_operations = {
#endif
};
-int posix_clock_register(struct posix_clock *clk, dev_t devid)
+int posix_clock_register(struct posix_clock *clk, struct device *dev)
{
int err;
- kref_init(&clk->kref);
init_rwsem(&clk->rwsem);
cdev_init(&clk->cdev, &posix_clock_file_operations);
+ err = cdev_device_add(&clk->cdev, dev);
+ if (err) {
+ pr_err("%s unable to add device %d:%d\n",
+ dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt));
+ return err;
+ }
clk->cdev.owner = clk->ops.owner;
- err = cdev_add(&clk->cdev, devid, 1);
+ clk->dev = dev;
- return err;
+ return 0;
}
EXPORT_SYMBOL_GPL(posix_clock_register);
-static void delete_clock(struct kref *kref)
-{
- struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
-
- if (clk->release)
- clk->release(clk);
-}
-
void posix_clock_unregister(struct posix_clock *clk)
{
- cdev_del(&clk->cdev);
+ cdev_device_del(&clk->cdev, clk->dev);
down_write(&clk->rwsem);
clk->zombie = true;
up_write(&clk->rwsem);
- kref_put(&clk->kref, delete_clock);
+ put_device(clk->dev);
}
EXPORT_SYMBOL_GPL(posix_clock_unregister);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 142b07619918..3f16750ec7a3 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -205,7 +205,8 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
if (sched_clock_timer.function != NULL) {
/* update timeout for clock wrap */
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt,
+ HRTIMER_MODE_REL_HARD);
}
r = rate;
@@ -249,9 +250,9 @@ void __init generic_sched_clock_init(void)
* Start the timer to keep sched_clock() properly updated and
* sets the initial epoch.
*/
- hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
sched_clock_timer.function = sched_clock_poll;
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
}
/*
@@ -288,7 +289,7 @@ void sched_clock_resume(void)
struct clock_read_data *rd = &cd.read_data[0];
rd->epoch_cyc = cd.actual_read_sched_clock();
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
rd->read_sched_clock = cd.actual_read_sched_clock;
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4ee1a3428ae..e59a34f23aa2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -58,8 +58,9 @@ static void tick_do_update_jiffies64(ktime_t now)
/*
* Do a quick check without holding jiffies_lock:
+ * The READ_ONCE() pairs with two updates done later in this function.
*/
- delta = ktime_sub(now, last_jiffies_update);
+ delta = ktime_sub(now, READ_ONCE(last_jiffies_update));
if (delta < tick_period)
return;
@@ -70,8 +71,9 @@ static void tick_do_update_jiffies64(ktime_t now)
if (delta >= tick_period) {
delta = ktime_sub(delta, tick_period);
- last_jiffies_update = ktime_add(last_jiffies_update,
- tick_period);
+ /* Pairs with the lockless read in this function. */
+ WRITE_ONCE(last_jiffies_update,
+ ktime_add(last_jiffies_update, tick_period));
/* Slow path for long timeouts */
if (unlikely(delta >= tick_period)) {
@@ -79,8 +81,10 @@ static void tick_do_update_jiffies64(ktime_t now)
ticks = ktime_divns(delta, incr);
- last_jiffies_update = ktime_add_ns(last_jiffies_update,
- incr * ticks);
+ /* Pairs with the lockless read in this function. */
+ WRITE_ONCE(last_jiffies_update,
+ ktime_add_ns(last_jiffies_update,
+ incr * ticks));
}
do_timer(++ticks);
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 7f7d6914ddd5..f5a6814c6743 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -877,7 +877,8 @@ int get_timespec64(struct timespec64 *ts,
ts->tv_sec = kts.tv_sec;
/* Zero out the padding for 32 bit systems or in compat mode */
- if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall())
+ if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) ||
+ in_compat_syscall()))
kts.tv_nsec &= 0xFFFFFFFFUL;
ts->tv_nsec = kts.tv_nsec;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 44b726bab4bd..d911c8470149 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -819,7 +819,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
} while (read_seqcount_retry(&tk_core.seq, seq));
- return base + nsecs;
+ return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 7d63b7347066..e70451eb2495 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -43,6 +43,7 @@
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
+#include <linux/random.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -518,8 +519,8 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk)
* Force expire obscene large timeouts to expire at the
* capacity limit of the wheel.
*/
- if (expires >= WHEEL_TIMEOUT_CUTOFF)
- expires = WHEEL_TIMEOUT_MAX;
+ if (delta >= WHEEL_TIMEOUT_CUTOFF)
+ expires = clk + WHEEL_TIMEOUT_MAX;
idx = calc_index(expires, LVL_DEPTH - 1);
}
@@ -581,7 +582,15 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
* Set the next expiry time and kick the CPU so it can reevaluate the
* wheel:
*/
- base->next_expiry = timer->expires;
+ if (time_before(timer->expires, base->clk)) {
+ /*
+ * Prevent from forward_timer_base() moving the base->clk
+ * backward
+ */
+ base->next_expiry = base->clk;
+ } else {
+ base->next_expiry = timer->expires;
+ }
wake_up_nohz_cpu(base->cpu);
}
@@ -893,10 +902,13 @@ static inline void forward_timer_base(struct timer_base *base)
* If the next expiry value is > jiffies, then we fast forward to
* jiffies otherwise we forward to the next expiry value.
*/
- if (time_after(base->next_expiry, jnow))
+ if (time_after(base->next_expiry, jnow)) {
base->clk = jnow;
- else
+ } else {
+ if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
+ return;
base->clk = base->next_expiry;
+ }
#endif
}
@@ -1646,6 +1658,13 @@ void update_process_times(int user_tick)
scheduler_tick();
if (IS_ENABLED(CONFIG_POSIX_TIMERS))
run_posix_cpu_timers(p);
+
+ /* The current CPU might make use of net randoms without receiving IRQs
+ * to renew them often enough. Let's update the net_rand_state from a
+ * non-constant value that's not affine to the number of calls to make
+ * sure it's updated when there's some activity (we don't care in idle).
+ */
+ this_cpu_add(net_rand_state.s1, rol32(jiffies, 24) + user_tick);
}
/**
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 564e5fdb025f..9669aaf0cd76 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -371,7 +371,6 @@ config PROFILE_ANNOTATED_BRANCHES
config PROFILE_ALL_BRANCHES
bool "Profile all if conditionals" if !FORTIFY_SOURCE
select TRACE_BRANCH_PROFILING
- imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives
help
This tracer profiles all branch conditions. Every if ()
taken in the kernel is recorded whether it hit or miss.
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e1c6d79fb4cc..8f18311f9be5 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -3,6 +3,9 @@
* Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
*
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
@@ -336,6 +339,7 @@ static void put_probe_ref(void)
static void blk_trace_cleanup(struct blk_trace *bt)
{
+ synchronize_rcu();
blk_trace_free(bt);
put_probe_ref();
}
@@ -344,7 +348,8 @@ static int __blk_trace_remove(struct request_queue *q)
{
struct blk_trace *bt;
- bt = xchg(&q->blk_trace, NULL);
+ bt = rcu_replace_pointer(q->blk_trace, NULL,
+ lockdep_is_held(&q->blk_trace_mutex));
if (!bt)
return -EINVAL;
@@ -479,6 +484,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
struct dentry *dir = NULL;
int ret;
+ lockdep_assert_held(&q->blk_trace_mutex);
+
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
@@ -494,6 +501,17 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
*/
strreplace(buts->name, '/', '_');
+ /*
+ * bdev can be NULL, as with scsi-generic, this is a helpful as
+ * we can be.
+ */
+ if (rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex))) {
+ pr_warn("Concurrent blktraces are not allowed on %s\n",
+ buts->name);
+ return -EBUSY;
+ }
+
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
return -ENOMEM;
@@ -507,13 +525,31 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!bt->msg_data)
goto err;
- ret = -ENOENT;
-
- dir = debugfs_lookup(buts->name, blk_debugfs_root);
- if (!dir)
+#ifdef CONFIG_BLK_DEBUG_FS
+ /*
+ * When tracing whole make_request drivers (multiqueue) block devices,
+ * reuse the existing debugfs directory created by the block layer on
+ * init. For request-based block devices, all partitions block devices,
+ * and scsi-generic block devices we create a temporary new debugfs
+ * directory that will be removed once the trace ends.
+ */
+ if (queue_is_mq(q) && bdev && bdev == bdev->bd_contains)
+ dir = q->debugfs_dir;
+ else
+#endif
bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
- if (!dir)
+
+ /*
+ * As blktrace relies on debugfs for its interface the debugfs directory
+ * is required, contrary to the usual mantra of not checking for debugfs
+ * files or directories.
+ */
+ if (IS_ERR_OR_NULL(dir)) {
+ pr_warn("debugfs_dir not present for %s so skipping\n",
+ buts->name);
+ ret = -ENOENT;
goto err;
+ }
bt->dev = dev;
atomic_set(&bt->dropped, 0);
@@ -522,12 +558,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
ret = -EIO;
bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
&blk_dropped_fops);
- if (!bt->dropped_file)
- goto err;
bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
- if (!bt->msg_file)
- goto err;
bt->rchan = relay_open("trace", dir, buts->buf_size,
buts->buf_nr, &blk_relay_callbacks, bt);
@@ -549,16 +581,11 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
bt->pid = buts->pid;
bt->trace_state = Blktrace_setup;
- ret = -EBUSY;
- if (cmpxchg(&q->blk_trace, NULL, bt))
- goto err;
-
+ rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
ret = 0;
err:
- if (dir && !bt->dir)
- dput(dir);
if (ret)
blk_trace_free(bt);
return ret;
@@ -636,8 +663,10 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
static int __blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
if (bt == NULL)
return -EINVAL;
@@ -747,8 +776,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
void blk_trace_shutdown(struct request_queue *q)
{
mutex_lock(&q->blk_trace_mutex);
-
- if (q->blk_trace) {
+ if (rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex))) {
__blk_trace_startstop(q, 0);
__blk_trace_remove(q);
}
@@ -760,8 +789,10 @@ void blk_trace_shutdown(struct request_queue *q)
static union kernfs_node_id *
blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ /* We don't use the 'bt' value here except as an optimization... */
+ bt = rcu_dereference_protected(q->blk_trace, 1);
if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
return NULL;
@@ -806,10 +837,14 @@ static void blk_add_trace_rq(struct request *rq, int error,
unsigned int nr_bytes, u32 what,
union kernfs_node_id *cgid)
{
- struct blk_trace *bt = rq->q->blk_trace;
+ struct blk_trace *bt;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(rq->q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
if (blk_rq_is_passthrough(rq))
what |= BLK_TC_ACT(BLK_TC_PC);
@@ -818,6 +853,7 @@ static void blk_add_trace_rq(struct request *rq, int error,
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
rq->cmd_flags, what, error, 0, NULL, cgid);
+ rcu_read_unlock();
}
static void blk_add_trace_rq_insert(void *ignore,
@@ -863,14 +899,19 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
u32 what, int error)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio_op(bio), bio->bi_opf, what, error, 0, NULL,
blk_trace_bio_get_cgid(q, bio));
+ rcu_read_unlock();
}
static void blk_add_trace_bio_bounce(void *ignore,
@@ -915,11 +956,14 @@ static void blk_add_trace_getrq(void *ignore,
if (bio)
blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
else {
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
NULL, NULL);
+ rcu_read_unlock();
}
}
@@ -931,27 +975,35 @@ static void blk_add_trace_sleeprq(void *ignore,
if (bio)
blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
else {
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
0, 0, NULL, NULL);
+ rcu_read_unlock();
}
}
static void blk_add_trace_plug(void *ignore, struct request_queue *q)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL);
+ rcu_read_unlock();
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
unsigned int depth, bool explicit)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(depth);
u32 what;
@@ -963,22 +1015,28 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL);
}
+ rcu_read_unlock();
}
static void blk_add_trace_split(void *ignore,
struct request_queue *q, struct bio *bio,
unsigned int pdu)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(pdu);
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
- BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
- &rpdu, blk_trace_bio_get_cgid(q, bio));
+ BLK_TA_SPLIT,
+ blk_status_to_errno(bio->bi_status),
+ sizeof(rpdu), &rpdu,
+ blk_trace_bio_get_cgid(q, bio));
}
+ rcu_read_unlock();
}
/**
@@ -998,19 +1056,25 @@ static void blk_add_trace_bio_remap(void *ignore,
struct request_queue *q, struct bio *bio,
dev_t dev, sector_t from)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
struct blk_io_trace_remap r;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
r.device_from = cpu_to_be32(dev);
r.device_to = cpu_to_be32(bio_dev(bio));
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
+ bio_op(bio), bio->bi_opf, BLK_TA_REMAP,
+ blk_status_to_errno(bio->bi_status),
sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
+ rcu_read_unlock();
}
/**
@@ -1031,11 +1095,15 @@ static void blk_add_trace_rq_remap(void *ignore,
struct request *rq, dev_t dev,
sector_t from)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
struct blk_io_trace_remap r;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
r.device_from = cpu_to_be32(dev);
r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
@@ -1044,6 +1112,7 @@ static void blk_add_trace_rq_remap(void *ignore,
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
+ rcu_read_unlock();
}
/**
@@ -1061,14 +1130,19 @@ void blk_add_driver_data(struct request_queue *q,
struct request *rq,
void *data, size_t len)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, 0, len, data,
blk_trace_request_get_cgid(q, rq));
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1220,21 +1294,10 @@ static inline __u16 t_error(const struct trace_entry *ent)
static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
{
- const __u64 *val = pdu_start(ent, has_cg);
+ const __be64 *val = pdu_start(ent, has_cg);
return be64_to_cpu(*val);
}
-static void get_pdu_remap(const struct trace_entry *ent,
- struct blk_io_trace_remap *r, bool has_cg)
-{
- const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
- __u64 sector_from = __r->sector_from;
-
- r->device_from = be32_to_cpu(__r->device_from);
- r->device_to = be32_to_cpu(__r->device_to);
- r->sector_from = be64_to_cpu(sector_from);
-}
-
typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
bool has_cg);
@@ -1360,13 +1423,13 @@ static void blk_log_with_error(struct trace_seq *s,
static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
- struct blk_io_trace_remap r = { .device_from = 0, };
+ const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
- get_pdu_remap(ent, &r, has_cg);
trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
t_sector(ent), t_sec(ent),
- MAJOR(r.device_from), MINOR(r.device_from),
- (unsigned long long)r.sector_from);
+ MAJOR(be32_to_cpu(__r->device_from)),
+ MINOR(be32_to_cpu(__r->device_from)),
+ be64_to_cpu(__r->sector_from));
}
static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
@@ -1590,11 +1653,13 @@ static int blk_trace_remove_queue(struct request_queue *q)
{
struct blk_trace *bt;
- bt = xchg(&q->blk_trace, NULL);
+ bt = rcu_replace_pointer(q->blk_trace, NULL,
+ lockdep_is_held(&q->blk_trace_mutex));
if (bt == NULL)
return -EINVAL;
put_probe_ref();
+ synchronize_rcu();
blk_trace_free(bt);
return 0;
}
@@ -1621,10 +1686,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
blk_trace_setup_lba(bt, bdev);
- ret = -EBUSY;
- if (cmpxchg(&q->blk_trace, NULL, bt))
- goto free_bt;
-
+ rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
return 0;
@@ -1756,6 +1818,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct hd_struct *p = dev_to_part(dev);
struct request_queue *q;
struct block_device *bdev;
+ struct blk_trace *bt;
ssize_t ret = -ENXIO;
bdev = bdget(part_devt(p));
@@ -1768,21 +1831,23 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
mutex_lock(&q->blk_trace_mutex);
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
if (attr == &dev_attr_enable) {
- ret = sprintf(buf, "%u\n", !!q->blk_trace);
+ ret = sprintf(buf, "%u\n", !!bt);
goto out_unlock_bdev;
}
- if (q->blk_trace == NULL)
+ if (bt == NULL)
ret = sprintf(buf, "disabled\n");
else if (attr == &dev_attr_act_mask)
- ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
+ ret = blk_trace_mask2str(buf, bt->act_mask);
else if (attr == &dev_attr_pid)
- ret = sprintf(buf, "%u\n", q->blk_trace->pid);
+ ret = sprintf(buf, "%u\n", bt->pid);
else if (attr == &dev_attr_start_lba)
- ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
+ ret = sprintf(buf, "%llu\n", bt->start_lba);
else if (attr == &dev_attr_end_lba)
- ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
+ ret = sprintf(buf, "%llu\n", bt->end_lba);
out_unlock_bdev:
mutex_unlock(&q->blk_trace_mutex);
@@ -1799,6 +1864,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
struct block_device *bdev;
struct request_queue *q;
struct hd_struct *p;
+ struct blk_trace *bt;
u64 value;
ssize_t ret = -EINVAL;
@@ -1829,8 +1895,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
mutex_lock(&q->blk_trace_mutex);
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
if (attr == &dev_attr_enable) {
- if (!!value == !!q->blk_trace) {
+ if (!!value == !!bt) {
ret = 0;
goto out_unlock_bdev;
}
@@ -1842,18 +1910,21 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
}
ret = 0;
- if (q->blk_trace == NULL)
+ if (bt == NULL) {
ret = blk_trace_setup_queue(q, bdev);
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
+ }
if (ret == 0) {
if (attr == &dev_attr_act_mask)
- q->blk_trace->act_mask = value;
+ bt->act_mask = value;
else if (attr == &dev_attr_pid)
- q->blk_trace->pid = value;
+ bt->pid = value;
else if (attr == &dev_attr_start_lba)
- q->blk_trace->start_lba = value;
+ bt->start_lba = value;
else if (attr == &dev_attr_end_lba)
- q->blk_trace->end_lba = value;
+ bt->end_lba = value;
}
out_unlock_bdev:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index aaf66cd9daa6..a2c30e1eb7e4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -155,7 +155,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
+BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
u32, size)
{
/*
@@ -178,10 +178,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
- if (!access_ok(unsafe_ptr, size))
- return -EPERM;
- return probe_kernel_write(unsafe_ptr, src, size);
+ return probe_user_write(unsafe_ptr, src, size);
}
static const struct bpf_func_proto bpf_probe_write_user_proto = {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index baa54a76c6aa..98751023c05a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -523,8 +523,7 @@ static int function_stat_show(struct seq_file *m, void *v)
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- avg = rec->time;
- do_div(avg, rec->counter);
+ avg = div64_ul(rec->time, rec->counter);
if (tracing_thresh && (avg < tracing_thresh))
goto out;
#endif
@@ -550,7 +549,8 @@ static int function_stat_show(struct seq_file *m, void *v)
* Divide only 1000 for ns^2 -> us^2 conversion.
* trace_print_graph_duration will divide 1000 again.
*/
- do_div(stddev, rec->counter * (rec->counter - 1) * 1000);
+ stddev = div64_ul(stddev,
+ rec->counter * (rec->counter - 1) * 1000);
}
trace_seq_init(&s);
@@ -5085,8 +5085,8 @@ static const struct file_operations ftrace_notrace_fops = {
static DEFINE_MUTEX(graph_lock);
-struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH;
-struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH;
+struct ftrace_hash __rcu *ftrace_graph_hash = EMPTY_HASH;
+struct ftrace_hash __rcu *ftrace_graph_notrace_hash = EMPTY_HASH;
enum graph_filter_type {
GRAPH_FILTER_NOTRACE = 0,
@@ -5357,8 +5357,15 @@ ftrace_graph_release(struct inode *inode, struct file *file)
mutex_unlock(&graph_lock);
- /* Wait till all users are no longer using the old hash */
- synchronize_rcu();
+ /*
+ * We need to do a hard force of sched synchronization.
+ * This is because we use preempt_disable() to do RCU, but
+ * the function tracers can be called where RCU is not watching
+ * (like before user_exit()). We can not rely on the RCU
+ * infrastructure to do the synchronization, thus we must do it
+ * ourselves.
+ */
+ schedule_on_each_cpu(ftrace_sync);
free_ftrace_hash(old_hash);
}
@@ -5671,8 +5678,11 @@ static int referenced_filters(struct dyn_ftrace *rec)
int cnt = 0;
for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec))
- cnt++;
+ if (ops_references_rec(ops, rec)) {
+ cnt++;
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+ rec->flags |= FTRACE_FL_REGS;
+ }
}
return cnt;
@@ -5849,8 +5859,8 @@ void ftrace_module_enable(struct module *mod)
if (ftrace_start_up)
cnt += referenced_filters(rec);
- /* This clears FTRACE_FL_DISABLED */
- rec->flags = cnt;
+ rec->flags &= ~FTRACE_FL_DISABLED;
+ rec->flags += cnt;
if (ftrace_start_up && cnt) {
int failed = __ftrace_replace_code(rec, 1);
@@ -6435,12 +6445,12 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
if (enable) {
register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
tr);
- register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
tr);
} else {
unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
tr);
- unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
tr);
}
}
@@ -6513,9 +6523,10 @@ static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
struct trace_array *tr = m->private;
struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids);
- if (v == FTRACE_NO_PIDS)
+ if (v == FTRACE_NO_PIDS) {
+ (*pos)++;
return NULL;
-
+ }
return trace_pid_next(pid_list, v, pos);
}
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 0515a2096f90..0456e0a3dab1 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -6,22 +6,22 @@
/*
* Traverse the ftrace_global_list, invoking all entries. The reason that we
- * can use rcu_dereference_raw_notrace() is that elements removed from this list
+ * can use rcu_dereference_raw_check() is that elements removed from this list
* are simply leaked, so there is no need to interact with a grace-period
- * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle
+ * mechanism. The rcu_dereference_raw_check() calls are needed to handle
* concurrent insertions into the ftrace_global_list.
*
* Silly Alpha and silly pointer-speculation compiler optimizations!
*/
#define do_for_each_ftrace_op(op, list) \
- op = rcu_dereference_raw_notrace(list); \
+ op = rcu_dereference_raw_check(list); \
do
/*
* Optimized for just a single item in the list (as that is the normal case).
*/
#define while_for_each_ftrace_op(op) \
- while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \
+ while (likely(op = rcu_dereference_raw_check((op)->next)) && \
unlikely((op) != &ftrace_list_end))
extern struct ftrace_ops __rcu *ftrace_ops_list;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 05b0b3139ebc..0e4af9941a23 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2366,7 +2366,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
if (unlikely(info->add_timestamp)) {
bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
- event = rb_add_time_stamp(event, info->delta, abs);
+ event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
length -= RB_LEN_TIME_EXTEND;
delta = 0;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1a65ca0a91f6..c4c3daeba5c0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1726,6 +1726,7 @@ static __init int init_trace_selftests(void)
pr_info("Running postponed tracer tests:\n");
+ tracing_selftest_running = true;
list_for_each_entry_safe(p, n, &postponed_selftests, list) {
/* This loop can take minutes when sanitizers are enabled, so
* lets make sure we allow RCU processing.
@@ -1748,6 +1749,7 @@ static __init int init_trace_selftests(void)
list_del(&p->list);
kfree(p);
}
+ tracing_selftest_running = false;
out:
mutex_unlock(&trace_types_lock);
@@ -2642,10 +2644,10 @@ static void ftrace_exports(struct ring_buffer_event *event)
preempt_disable_notrace();
- export = rcu_dereference_raw_notrace(ftrace_exports_list);
+ export = rcu_dereference_raw_check(ftrace_exports_list);
while (export) {
trace_process_export(export, event);
- export = rcu_dereference_raw_notrace(export->next);
+ export = rcu_dereference_raw_check(export->next);
}
preempt_enable_notrace();
@@ -4569,6 +4571,10 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
+ if ((mask == TRACE_ITER_RECORD_TGID) ||
+ (mask == TRACE_ITER_RECORD_CMD))
+ lockdep_assert_held(&event_mutex);
+
/* do nothing if flag is already set */
if (!!(tr->trace_flags & mask) == !!enabled)
return 0;
@@ -4588,7 +4594,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_TGID) {
if (!tgid_map)
- tgid_map = kcalloc(PID_MAX_DEFAULT + 1,
+ tgid_map = kvcalloc(PID_MAX_DEFAULT + 1,
sizeof(*tgid_map),
GFP_KERNEL);
if (!tgid_map) {
@@ -4636,6 +4642,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
cmp += len;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = match_string(trace_options, -1, cmp);
@@ -4646,6 +4653,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
ret = set_tracer_flag(tr, 1 << ret, !neg);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
/*
* If the first trailing whitespace is replaced with '\0' by strstrip,
@@ -6009,6 +6017,7 @@ waitagain:
sizeof(struct trace_iterator) -
offsetof(struct trace_iterator, seq));
cpumask_clear(iter->started);
+ trace_seq_init(&iter->seq);
iter->pos = -1;
trace_event_read_lock();
@@ -7947,9 +7956,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (val != 0 && val != 1)
return -EINVAL;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = set_tracer_flag(tr, 1 << index, val);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
if (ret < 0)
return ret;
@@ -8277,6 +8288,19 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
*/
allocate_snapshot = false;
#endif
+
+ /*
+ * Because of some magic with the way alloc_percpu() works on
+ * x86_64, we need to synchronize the pgd of all the tables,
+ * otherwise the trace events that happen in x86_64 page fault
+ * handlers can't cope with accessing the chance that a
+ * alloc_percpu()'d memory might be touched in the page fault trace
+ * event. Oh, and we need to audit all other alloc_percpu() and vmalloc()
+ * calls in tracing, because something might get triggered within a
+ * page fault trace event!
+ */
+ vmalloc_sync_mappings();
+
return 0;
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 005f08629b8b..08088cafbb88 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -931,22 +931,31 @@ extern void __trace_graph_return(struct trace_array *tr,
unsigned long flags, int pc);
#ifdef CONFIG_DYNAMIC_FTRACE
-extern struct ftrace_hash *ftrace_graph_hash;
-extern struct ftrace_hash *ftrace_graph_notrace_hash;
+extern struct ftrace_hash __rcu *ftrace_graph_hash;
+extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash;
static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
{
unsigned long addr = trace->func;
int ret = 0;
+ struct ftrace_hash *hash;
preempt_disable_notrace();
- if (ftrace_hash_empty(ftrace_graph_hash)) {
+ /*
+ * Have to open code "rcu_dereference_sched()" because the
+ * function graph tracer can be called when RCU is not
+ * "watching".
+ * Protected with schedule_on_each_cpu(ftrace_sync)
+ */
+ hash = rcu_dereference_protected(ftrace_graph_hash, !preemptible());
+
+ if (ftrace_hash_empty(hash)) {
ret = 1;
goto out;
}
- if (ftrace_lookup_ip(ftrace_graph_hash, addr)) {
+ if (ftrace_lookup_ip(hash, addr)) {
/*
* This needs to be cleared on the return functions
@@ -982,10 +991,20 @@ static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
int ret = 0;
+ struct ftrace_hash *notrace_hash;
preempt_disable_notrace();
- if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr))
+ /*
+ * Have to open code "rcu_dereference_sched()" because the
+ * function graph tracer can be called when RCU is not
+ * "watching".
+ * Protected with schedule_on_each_cpu(ftrace_sync)
+ */
+ notrace_hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+ !preemptible());
+
+ if (ftrace_lookup_ip(notrace_hash, addr))
ret = 1;
preempt_enable_notrace();
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0ce3db67f556..42d4afd2fc4f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -327,7 +327,8 @@ void trace_event_enable_cmd_record(bool enable)
struct trace_event_file *file;
struct trace_array *tr;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
+
do_for_each_event_file(tr, file) {
if (!(file->flags & EVENT_FILE_FL_ENABLED))
@@ -341,7 +342,6 @@ void trace_event_enable_cmd_record(bool enable)
clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
} while_for_each_event_file();
- mutex_unlock(&event_mutex);
}
void trace_event_enable_tgid_record(bool enable)
@@ -349,7 +349,8 @@ void trace_event_enable_tgid_record(bool enable)
struct trace_event_file *file;
struct trace_array *tr;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
+
do_for_each_event_file(tr, file) {
if (!(file->flags & EVENT_FILE_FL_ENABLED))
continue;
@@ -363,7 +364,6 @@ void trace_event_enable_tgid_record(bool enable)
&file->flags);
}
} while_for_each_event_file();
- mutex_unlock(&event_mutex);
}
static int __ftrace_event_enable_disable(struct trace_event_file *file,
@@ -534,12 +534,12 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
if (enable) {
register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork,
tr, INT_MIN);
- register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit,
+ register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit,
tr, INT_MAX);
} else {
unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork,
tr);
- unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit,
+ unregister_trace_sched_process_free(event_filter_pid_sched_process_exit,
tr);
}
}
@@ -795,7 +795,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
return ret;
}
-static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
+int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
{
char *event = NULL, *sub = NULL, *match;
int ret;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d5a4bda9ce7b..b878d0ec8181 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1659,7 +1659,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
return -EINVAL;
fail_mem:
- kfree(filter);
+ __free_filter(filter);
/* If any call succeeded, we still need to sync */
if (!fail)
tracepoint_synchronize_unregister();
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index dd310d3b5843..21caeb2052ef 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -111,6 +111,7 @@ struct hist_field {
struct ftrace_event_field *field;
unsigned long flags;
hist_field_fn_t fn;
+ unsigned int ref;
unsigned int size;
unsigned int offset;
unsigned int is_signed;
@@ -464,11 +465,12 @@ struct action_data {
* When a histogram trigger is hit, the values of any
* references to variables, including variables being passed
* as parameters to synthetic events, are collected into a
- * var_ref_vals array. This var_ref_idx is the index of the
- * first param in the array to be passed to the synthetic
- * event invocation.
+ * var_ref_vals array. This var_ref_idx array is an array of
+ * indices into the var_ref_vals array, one for each synthetic
+ * event param, and is passed to the synthetic event
+ * invocation.
*/
- unsigned int var_ref_idx;
+ unsigned int var_ref_idx[TRACING_MAP_VARS_MAX];
struct synth_event *synth_event;
bool use_trace_keyword;
char *synth_event_name;
@@ -674,6 +676,8 @@ static bool synth_field_signed(char *type)
{
if (str_has_prefix(type, "u"))
return false;
+ if (strcmp(type, "gfp_t") == 0)
+ return false;
return true;
}
@@ -852,14 +856,14 @@ static struct trace_event_functions synth_event_funcs = {
static notrace void trace_event_raw_event_synth(void *__data,
u64 *var_ref_vals,
- unsigned int var_ref_idx)
+ unsigned int *var_ref_idx)
{
struct trace_event_file *trace_file = __data;
struct synth_trace_event *entry;
struct trace_event_buffer fbuffer;
struct ring_buffer *buffer;
struct synth_event *event;
- unsigned int i, n_u64;
+ unsigned int i, n_u64, val_idx;
int fields_size = 0;
event = trace_file->event_call->data;
@@ -882,14 +886,34 @@ static notrace void trace_event_raw_event_synth(void *__data,
goto out;
for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+ val_idx = var_ref_idx[i];
if (event->fields[i]->is_string) {
- char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i];
+ char *str_val = (char *)(long)var_ref_vals[val_idx];
char *str_field = (char *)&entry->fields[n_u64];
strscpy(str_field, str_val, STR_VAR_LEN_MAX);
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
} else {
- entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
+ struct synth_field *field = event->fields[i];
+ u64 val = var_ref_vals[val_idx];
+
+ switch (field->size) {
+ case 1:
+ *(u8 *)&entry->fields[n_u64] = (u8)val;
+ break;
+
+ case 2:
+ *(u16 *)&entry->fields[n_u64] = (u16)val;
+ break;
+
+ case 4:
+ *(u32 *)&entry->fields[n_u64] = (u32)val;
+ break;
+
+ default:
+ entry->fields[n_u64] = val;
+ break;
+ }
n_u64++;
}
}
@@ -1071,10 +1095,10 @@ static struct tracepoint *alloc_synth_tracepoint(char *name)
}
typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
- unsigned int var_ref_idx);
+ unsigned int *var_ref_idx);
static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
- unsigned int var_ref_idx)
+ unsigned int *var_ref_idx)
{
struct tracepoint *tp = event->tp;
@@ -1985,12 +2009,6 @@ static int parse_map_size(char *str)
unsigned long size, map_bits;
int ret;
- strsep(&str, "=");
- if (!str) {
- ret = -EINVAL;
- goto out;
- }
-
ret = kstrtoul(str, 0, &size);
if (ret)
goto out;
@@ -2050,25 +2068,25 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs)
static int parse_assignment(struct trace_array *tr,
char *str, struct hist_trigger_attrs *attrs)
{
- int ret = 0;
+ int len, ret = 0;
- if ((str_has_prefix(str, "key=")) ||
- (str_has_prefix(str, "keys="))) {
- attrs->keys_str = kstrdup(str, GFP_KERNEL);
+ if ((len = str_has_prefix(str, "key=")) ||
+ (len = str_has_prefix(str, "keys="))) {
+ attrs->keys_str = kstrdup(str + len, GFP_KERNEL);
if (!attrs->keys_str) {
ret = -ENOMEM;
goto out;
}
- } else if ((str_has_prefix(str, "val=")) ||
- (str_has_prefix(str, "vals=")) ||
- (str_has_prefix(str, "values="))) {
- attrs->vals_str = kstrdup(str, GFP_KERNEL);
+ } else if ((len = str_has_prefix(str, "val=")) ||
+ (len = str_has_prefix(str, "vals=")) ||
+ (len = str_has_prefix(str, "values="))) {
+ attrs->vals_str = kstrdup(str + len, GFP_KERNEL);
if (!attrs->vals_str) {
ret = -ENOMEM;
goto out;
}
- } else if (str_has_prefix(str, "sort=")) {
- attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
+ } else if ((len = str_has_prefix(str, "sort="))) {
+ attrs->sort_key_str = kstrdup(str + len, GFP_KERNEL);
if (!attrs->sort_key_str) {
ret = -ENOMEM;
goto out;
@@ -2079,12 +2097,8 @@ static int parse_assignment(struct trace_array *tr,
ret = -ENOMEM;
goto out;
}
- } else if (str_has_prefix(str, "clock=")) {
- strsep(&str, "=");
- if (!str) {
- ret = -EINVAL;
- goto out;
- }
+ } else if ((len = str_has_prefix(str, "clock="))) {
+ str += len;
str = strstrip(str);
attrs->clock = kstrdup(str, GFP_KERNEL);
@@ -2092,8 +2106,8 @@ static int parse_assignment(struct trace_array *tr,
ret = -ENOMEM;
goto out;
}
- } else if (str_has_prefix(str, "size=")) {
- int map_bits = parse_map_size(str);
+ } else if ((len = str_has_prefix(str, "size="))) {
+ int map_bits = parse_map_size(str + len);
if (map_bits < 0) {
ret = map_bits;
@@ -2133,8 +2147,14 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str)
while (trigger_str) {
char *str = strsep(&trigger_str, ":");
+ char *rhs;
- if (strchr(str, '=')) {
+ rhs = strchr(str, '=');
+ if (rhs) {
+ if (!strlen(++rhs)) {
+ ret = -EINVAL;
+ goto free;
+ }
ret = parse_assignment(tr, str, attrs);
if (ret)
goto free;
@@ -2378,12 +2398,23 @@ static int contains_operator(char *str)
return field_op;
}
+static void get_hist_field(struct hist_field *hist_field)
+{
+ hist_field->ref++;
+}
+
static void __destroy_hist_field(struct hist_field *hist_field)
{
+ if (--hist_field->ref > 1)
+ return;
+
kfree(hist_field->var.name);
kfree(hist_field->name);
kfree(hist_field->type);
+ kfree(hist_field->system);
+ kfree(hist_field->event_name);
+
kfree(hist_field);
}
@@ -2421,6 +2452,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (!hist_field)
return NULL;
+ hist_field->ref = 1;
+
hist_field->hist_data = hist_data;
if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
@@ -2595,6 +2628,22 @@ static int init_var_ref(struct hist_field *ref_field,
goto out;
}
+static int find_var_ref_idx(struct hist_trigger_data *hist_data,
+ struct hist_field *var_field)
+{
+ struct hist_field *ref_field;
+ int i;
+
+ for (i = 0; i < hist_data->n_var_refs; i++) {
+ ref_field = hist_data->var_refs[i];
+ if (ref_field->var.idx == var_field->var.idx &&
+ ref_field->var.hist_data == var_field->hist_data)
+ return i;
+ }
+
+ return -ENOENT;
+}
+
/**
* create_var_ref - Create a variable reference and attach it to trigger
* @hist_data: The trigger that will be referencing the variable
@@ -2616,6 +2665,17 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data,
{
unsigned long flags = HIST_FIELD_FL_VAR_REF;
struct hist_field *ref_field;
+ int i;
+
+ /* Check if the variable already exists */
+ for (i = 0; i < hist_data->n_var_refs; i++) {
+ ref_field = hist_data->var_refs[i];
+ if (ref_field->var.idx == var_field->var.idx &&
+ ref_field->var.hist_data == var_field->hist_data) {
+ get_hist_field(ref_field);
+ return ref_field;
+ }
+ }
ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
if (ref_field) {
@@ -3413,6 +3473,7 @@ static struct hist_field *create_var(struct hist_trigger_data *hist_data,
goto out;
}
+ var->ref = 1;
var->flags = HIST_FIELD_FL_VAR;
var->var.idx = idx;
var->var.hist_data = var->hist_data = hist_data;
@@ -4042,6 +4103,9 @@ static void destroy_field_vars(struct hist_trigger_data *hist_data)
for (i = 0; i < hist_data->n_field_vars; i++)
destroy_field_var(hist_data->field_vars[i]);
+
+ for (i = 0; i < hist_data->n_save_vars; i++)
+ destroy_field_var(hist_data->save_vars[i]);
}
static void save_field_var(struct hist_trigger_data *hist_data,
@@ -4153,11 +4217,11 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
struct trace_array *tr = hist_data->event_file->tr;
char *event_name, *param, *system = NULL;
struct hist_field *hist_field, *var_ref;
- unsigned int i, var_ref_idx;
+ unsigned int i;
unsigned int field_pos = 0;
struct synth_event *event;
char *synth_event_name;
- int ret = 0;
+ int var_ref_idx, ret = 0;
lockdep_assert_held(&event_mutex);
@@ -4174,8 +4238,6 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
event->ref++;
- var_ref_idx = hist_data->n_var_refs;
-
for (i = 0; i < data->n_params; i++) {
char *p;
@@ -4224,6 +4286,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
goto err;
}
+ var_ref_idx = find_var_ref_idx(hist_data, var_ref);
+ if (WARN_ON(var_ref_idx < 0)) {
+ ret = var_ref_idx;
+ goto err;
+ }
+
+ data->var_ref_idx[i] = var_ref_idx;
+
field_pos++;
kfree(p);
continue;
@@ -4242,7 +4312,6 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
}
data->synth_event = event;
- data->var_ref_idx = var_ref_idx;
out:
return ret;
err:
@@ -4461,10 +4530,6 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
if (!fields_str)
goto out;
- strsep(&fields_str, "=");
- if (!fields_str)
- goto out;
-
for (i = 0, j = 1; i < TRACING_MAP_VALS_MAX &&
j < TRACING_MAP_VALS_MAX; i++) {
field_str = strsep(&fields_str, ",");
@@ -4559,10 +4624,6 @@ static int create_key_fields(struct hist_trigger_data *hist_data,
if (!fields_str)
goto out;
- strsep(&fields_str, "=");
- if (!fields_str)
- goto out;
-
for (i = n_vals; i < n_vals + TRACING_MAP_KEYS_MAX; i++) {
field_str = strsep(&fields_str, ",");
if (!field_str)
@@ -4720,12 +4781,6 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
if (!fields_str)
goto out;
- strsep(&fields_str, "=");
- if (!fields_str) {
- ret = -EINVAL;
- goto out;
- }
-
for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
struct hist_field *hist_field;
char *field_str, *field_name;
@@ -4734,9 +4789,11 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
sort_key = &hist_data->sort_keys[i];
field_str = strsep(&fields_str, ",");
- if (!field_str) {
- if (i == 0)
- ret = -EINVAL;
+ if (!field_str)
+ break;
+
+ if (!*field_str) {
+ ret = -EINVAL;
break;
}
@@ -4746,7 +4803,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
}
field_name = strsep(&field_str, ".");
- if (!field_name) {
+ if (!field_name || !*field_name) {
ret = -EINVAL;
break;
}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 2a2912cb4533..586ea94e0db1 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -115,9 +115,10 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
{
struct trace_event_file *event_file = event_file_data(m->private);
- if (t == SHOW_AVAILABLE_TRIGGERS)
+ if (t == SHOW_AVAILABLE_TRIGGERS) {
+ (*pos)++;
return NULL;
-
+ }
return seq_list_next(t, &event_file->triggers, pos);
}
@@ -210,11 +211,17 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
static int trigger_process_regex(struct trace_event_file *file, char *buff)
{
- char *command, *next = buff;
+ char *command, *next;
struct event_command *p;
int ret = -EINVAL;
+ next = buff = skip_spaces(buff);
command = strsep(&next, ": \t");
+ if (next) {
+ next = skip_spaces(next);
+ if (!*next)
+ next = NULL;
+ }
command = (command[0] != '!') ? command : command + 1;
mutex_lock(&trigger_cmd_mutex);
@@ -617,8 +624,14 @@ event_trigger_callback(struct event_command *cmd_ops,
int ret;
/* separate the trigger from the filter (t:n [if filter]) */
- if (param && isdigit(param[0]))
+ if (param && isdigit(param[0])) {
trigger = strsep(&param, " \t");
+ if (param) {
+ param = skip_spaces(param);
+ if (!*param)
+ param = NULL;
+ }
+ }
trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
@@ -1075,14 +1088,10 @@ register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *data,
struct trace_event_file *file)
{
- int ret = register_trigger(glob, ops, data, file);
-
- if (ret > 0 && tracing_alloc_snapshot_instance(file->tr) != 0) {
- unregister_trigger(glob, ops, data, file);
- ret = 0;
- }
+ if (tracing_alloc_snapshot_instance(file->tr) != 0)
+ return 0;
- return ret;
+ return register_trigger(glob, ops, data, file);
}
static int
@@ -1359,6 +1368,11 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
trigger = strsep(&param, " \t");
if (!trigger)
return -EINVAL;
+ if (param) {
+ param = skip_spaces(param);
+ if (!*param)
+ param = NULL;
+ }
system = strsep(&trigger, ":");
if (!trigger)
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 8030e24dbf14..568918fae8d4 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -270,6 +270,7 @@ static bool disable_migrate;
static void move_to_next_cpu(void)
{
struct cpumask *current_mask = &save_cpumask;
+ struct trace_array *tr = hwlat_trace;
int next_cpu;
if (disable_migrate)
@@ -283,7 +284,7 @@ static void move_to_next_cpu(void)
goto disable;
get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
next_cpu = cpumask_next(smp_processor_id(), current_mask);
put_online_cpus();
@@ -360,7 +361,7 @@ static int start_kthread(struct trace_array *tr)
/* Just pick the first CPU on first iteration */
current_mask = &save_cpumask;
get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
put_online_cpus();
next_cpu = cpumask_first(current_mask);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 7d736248a070..4666bceb1383 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -389,11 +389,10 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
#if defined(CONFIG_KPROBES_ON_FTRACE) && \
!defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE)
-static bool within_notrace_func(struct trace_kprobe *tk)
+static bool __within_notrace_func(unsigned long addr)
{
- unsigned long offset, size, addr;
+ unsigned long offset, size;
- addr = trace_kprobe_address(tk);
if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset))
return false;
@@ -406,6 +405,28 @@ static bool within_notrace_func(struct trace_kprobe *tk)
*/
return !ftrace_location_range(addr, addr + size - 1);
}
+
+static bool within_notrace_func(struct trace_kprobe *tk)
+{
+ unsigned long addr = trace_kprobe_address(tk);
+ char symname[KSYM_NAME_LEN], *p;
+
+ if (!__within_notrace_func(addr))
+ return false;
+
+ /* Check if the address is on a suffixed-symbol */
+ if (!lookup_symbol_name(addr, symname)) {
+ p = strchr(symname, '.');
+ if (!p)
+ return true;
+ *p = '\0';
+ addr = (unsigned long)kprobe_lookup_name(symname, 0);
+ if (addr)
+ return __within_notrace_func(addr);
+ }
+
+ return true;
+}
#else
#define within_notrace_func(tk) (false)
#endif
@@ -767,6 +788,8 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev)
int i;
seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
+ if (trace_kprobe_is_return(tk) && tk->rp.maxactive)
+ seq_printf(m, "%d", tk->rp.maxactive);
seq_printf(m, ":%s/%s", tk->tp.call.class->system,
trace_event_name(&tk->tp.call));
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e288168661e1..e304196d7c28 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -89,8 +89,10 @@ static void tracing_sched_unregister(void)
static void tracing_start_sched_switch(int ops)
{
- bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
+ bool sched_register;
+
mutex_lock(&sched_register_mutex);
+ sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
switch (ops) {
case RECORD_CMDLINE:
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 743b2b520d34..7152062d6af6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -631,7 +631,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_migrate_task\n");
- return;
+ goto fail_deprobe_sched_switch;
}
wakeup_reset(tr);
@@ -649,6 +649,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
printk(KERN_ERR "failed to start wakeup tracer\n");
return;
+fail_deprobe_sched_switch:
+ unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
fail_deprobe_wake_new:
unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
fail_deprobe:
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 642a850af81a..5b4f50932478 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -184,6 +184,11 @@ static void check_stack(unsigned long ip, unsigned long *stack)
local_irq_restore(flags);
}
+/* Some archs may not define MCOUNT_INSN_SIZE */
+#ifndef MCOUNT_INSN_SIZE
+# define MCOUNT_INSN_SIZE 0
+#endif
+
static void
stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 75bf1bcb4a8a..92b76f9e25ed 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -278,18 +278,22 @@ static int tracing_stat_init(void)
d_tracing = tracing_init_dentry();
if (IS_ERR(d_tracing))
- return 0;
+ return -ENODEV;
stat_dir = tracefs_create_dir("trace_stat", d_tracing);
- if (!stat_dir)
+ if (!stat_dir) {
pr_warn("Could not create tracefs 'trace_stat' entry\n");
+ return -ENOMEM;
+ }
return 0;
}
static int init_stat_file(struct stat_session *session)
{
- if (!stat_dir && tracing_stat_init())
- return -ENODEV;
+ int ret;
+
+ if (!stat_dir && (ret = tracing_stat_init()))
+ return ret;
session->file = tracefs_create_file(session->ts->name, 0644,
stat_dir,
@@ -302,7 +306,7 @@ static int init_stat_file(struct stat_session *session)
int register_stat_tracer(struct tracer_stat *trace)
{
struct stat_session *session, *node;
- int ret;
+ int ret = -EINVAL;
if (!trace)
return -EINVAL;
@@ -313,17 +317,15 @@ int register_stat_tracer(struct tracer_stat *trace)
/* Already registered? */
mutex_lock(&all_stat_sessions_mutex);
list_for_each_entry(node, &all_stat_sessions, session_list) {
- if (node->ts == trace) {
- mutex_unlock(&all_stat_sessions_mutex);
- return -EINVAL;
- }
+ if (node->ts == trace)
+ goto out;
}
- mutex_unlock(&all_stat_sessions_mutex);
+ ret = -ENOMEM;
/* Init the session */
session = kzalloc(sizeof(*session), GFP_KERNEL);
if (!session)
- return -ENOMEM;
+ goto out;
session->ts = trace;
INIT_LIST_HEAD(&session->session_list);
@@ -332,15 +334,16 @@ int register_stat_tracer(struct tracer_stat *trace)
ret = init_stat_file(session);
if (ret) {
destroy_session(session);
- return ret;
+ goto out;
}
+ ret = 0;
/* Register */
- mutex_lock(&all_stat_sessions_mutex);
list_add_tail(&session->session_list, &all_stat_sessions);
+ out:
mutex_unlock(&all_stat_sessions_mutex);
- return 0;
+ return ret;
}
void unregister_stat_tracer(struct tracer_stat *trace)
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 9a1c22310323..9e31bfc818ff 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -148,8 +148,8 @@ static int tracing_map_cmp_atomic64(void *val_a, void *val_b)
#define DEFINE_TRACING_MAP_CMP_FN(type) \
static int tracing_map_cmp_##type(void *val_a, void *val_b) \
{ \
- type a = *(type *)val_a; \
- type b = *(type *)val_b; \
+ type a = (type)(*(u64 *)val_a); \
+ type b = (type)(*(u64 *)val_b); \
\
return (a > b) ? 1 : ((a < b) ? -1 : 0); \
}
diff --git a/kernel/umh.c b/kernel/umh.c
index 7f255b5a8845..3474d6aa55d8 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -475,6 +475,12 @@ static void umh_clean_and_save_pid(struct subprocess_info *info)
{
struct umh_info *umh_info = info->data;
+ /* cleanup if umh_pipe_setup() was successful but exec failed */
+ if (info->pid && info->retval) {
+ fput(umh_info->pipe_to_umh);
+ fput(umh_info->pipe_from_umh);
+ }
+
argv_free(info->argv);
umh_info->pid = info->pid;
}
@@ -544,6 +550,11 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob);
* Runs a user-space application. The application is started
* asynchronously if wait is not set, and runs as a child of system workqueues.
* (ie. it runs with full root capabilities and optimized affinity).
+ *
+ * Note: successful return value does not guarantee the helper was called at
+ * all. You can't rely on sub_info->{init,cleanup} being called even for
+ * UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers
+ * into a successful no-op.
*/
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7f9e7b9306fe..c1853aea4c14 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -161,6 +161,8 @@ static void lockup_detector_update_enable(void)
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+#define SOFTLOCKUP_RESET ULONG_MAX
+
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
@@ -274,7 +276,7 @@ notrace void touch_softlockup_watchdog_sched(void)
* Preemption can be enabled. It doesn't matter which CPU's timestamp
* gets zeroed here, so use the raw_ operation.
*/
- raw_cpu_write(watchdog_touch_ts, 0);
+ raw_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
}
notrace void touch_softlockup_watchdog(void)
@@ -298,14 +300,14 @@ void touch_all_softlockup_watchdogs(void)
* the softlockup check.
*/
for_each_cpu(cpu, &watchdog_allowed_mask)
- per_cpu(watchdog_touch_ts, cpu) = 0;
+ per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET;
wq_watchdog_touch(-1);
}
void touch_softlockup_watchdog_sync(void)
{
__this_cpu_write(softlockup_touch_sync, true);
- __this_cpu_write(watchdog_touch_ts, 0);
+ __this_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
}
static int is_softlockup(unsigned long touch_ts)
@@ -383,7 +385,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
- if (touch_ts == 0) {
+ if (touch_ts == SOFTLOCKUP_RESET) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
/*
* If the time stamp was touched atomically
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 95aea04ff722..6bf536113bc1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -425,7 +425,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
* ignored.
*/
#define for_each_pwq(pwq, wq) \
- list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \
+ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \
+ lockdep_is_held(&wq->mutex)) \
if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
else
@@ -1416,14 +1417,16 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
return;
rcu_read_lock();
retry:
- if (req_cpu == WORK_CPU_UNBOUND)
- cpu = wq_select_unbound_cpu(raw_smp_processor_id());
-
/* pwq which will be used unless @work is executing elsewhere */
- if (!(wq->flags & WQ_UNBOUND))
- pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
- else
+ if (wq->flags & WQ_UNBOUND) {
+ if (req_cpu == WORK_CPU_UNBOUND)
+ cpu = wq_select_unbound_cpu(raw_smp_processor_id());
pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
+ } else {
+ if (req_cpu == WORK_CPU_UNBOUND)
+ cpu = raw_smp_processor_id();
+ pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
+ }
/*
* If @work was previously on a different pool, it might still be
@@ -2532,8 +2535,14 @@ repeat:
*/
if (need_to_create_worker(pool)) {
spin_lock(&wq_mayday_lock);
- get_pwq(pwq);
- list_move_tail(&pwq->mayday_node, &wq->maydays);
+ /*
+ * Queue iff we aren't racing destruction
+ * and somebody else hasn't queued it already.
+ */
+ if (wq->rescuer && list_empty(&pwq->mayday_node)) {
+ get_pwq(pwq);
+ list_add_tail(&pwq->mayday_node, &wq->maydays);
+ }
spin_unlock(&wq_mayday_lock);
}
}
@@ -3487,7 +3496,6 @@ static void rcu_free_wq(struct rcu_head *rcu)
else
free_workqueue_attrs(wq->unbound_attrs);
- kfree(wq->rescuer);
kfree(wq);
}
@@ -4318,9 +4326,29 @@ void destroy_workqueue(struct workqueue_struct *wq)
struct pool_workqueue *pwq;
int node;
+ /*
+ * Remove it from sysfs first so that sanity check failure doesn't
+ * lead to sysfs name conflicts.
+ */
+ workqueue_sysfs_unregister(wq);
+
/* drain it before proceeding with destruction */
drain_workqueue(wq);
+ /* kill rescuer, if sanity checks fail, leave it w/o rescuer */
+ if (wq->rescuer) {
+ struct worker *rescuer = wq->rescuer;
+
+ /* this prevents new queueing */
+ spin_lock_irq(&wq_mayday_lock);
+ wq->rescuer = NULL;
+ spin_unlock_irq(&wq_mayday_lock);
+
+ /* rescuer will empty maydays list before exiting */
+ kthread_stop(rescuer->task);
+ kfree(rescuer);
+ }
+
/* sanity checks */
mutex_lock(&wq->mutex);
for_each_pwq(pwq, wq) {
@@ -4352,11 +4380,6 @@ void destroy_workqueue(struct workqueue_struct *wq)
list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
- workqueue_sysfs_unregister(wq);
-
- if (wq->rescuer)
- kthread_stop(wq->rescuer->task);
-
if (!(wq->flags & WQ_UNBOUND)) {
wq_unregister_lockdep(wq);
/*
@@ -4631,7 +4654,8 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_info(" pwq %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+ pr_cont(" active=%d/%d refcnt=%d%s\n",
+ pwq->nr_active, pwq->max_active, pwq->refcnt,
!list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
hash_for_each(pool->busy_hash, bkt, worker, hentry) {