Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/arraymap.c                 |   7
-rw-r--r--  kernel/bpf/btf.c                      |   9
-rw-r--r--  kernel/bpf/syscall.c                  |  21
-rw-r--r--  kernel/bpf/sysfs_btf.c                |  11
-rw-r--r--  kernel/bpf/verifier.c                 |  20
-rw-r--r--  kernel/cgroup/rstat.c                 |  16
-rw-r--r--  kernel/events/core.c                  |  23
-rw-r--r--  kernel/events/uprobes.c               |  16
-rw-r--r--  kernel/fork.c                         |  13
-rw-r--r--  kernel/padata.c                       |  14
-rw-r--r--  kernel/power/hibernate.c              |   7
-rw-r--r--  kernel/relay.c                        |   5
-rw-r--r--  kernel/sched/fair.c                   |  52
-rw-r--r--  kernel/trace/Kconfig                  |   1
-rw-r--r--  kernel/trace/bpf_trace.c              | 100
-rw-r--r--  kernel/trace/ftrace.c                 |   1
-rw-r--r--  kernel/trace/ftrace_internal.h        |  22
-rw-r--r--  kernel/trace/preemptirq_delay_test.c  |  38
-rw-r--r--  kernel/trace/trace.c                  |  13
-rw-r--r--  kernel/trace/trace_boot.c             |  20
-rw-r--r--  kernel/trace/trace_events_hist.c      |   7
-rw-r--r--  kernel/trace/trace_kprobe.c           |   8
-rw-r--r--  kernel/umh.c                          |  11
23 files changed, 281 insertions(+), 154 deletions(-)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 95d77770353c..1d6120fd5ba6 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -486,7 +486,12 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
     if (!(map->map_flags & BPF_F_MMAPABLE))
         return -EINVAL;

-    return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff);
+    if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) >
+        PAGE_ALIGN((u64)array->map.max_entries * array->elem_size))
+        return -EINVAL;
+
+    return remap_vmalloc_range(vma, array_map_vmalloc_addr(array),
+                   vma->vm_pgoff + pgoff);
 }

 const struct bpf_map_ops array_map_ops = {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7787bdcb5d68..ff04f60c78d1 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3477,8 +3477,8 @@ errout:
     return ERR_PTR(err);
 }

-extern char __weak _binary__btf_vmlinux_bin_start[];
-extern char __weak _binary__btf_vmlinux_bin_end[];
+extern char __weak __start_BTF[];
+extern char __weak __stop_BTF[];
 extern struct btf *btf_vmlinux;

 #define BPF_MAP_TYPE(_id, _ops)
@@ -3605,9 +3605,8 @@ struct btf *btf_parse_vmlinux(void)
     }
     env->btf = btf;

-    btf->data = _binary__btf_vmlinux_bin_start;
-    btf->data_size = _binary__btf_vmlinux_bin_end -
-        _binary__btf_vmlinux_bin_start;
+    btf->data = __start_BTF;
+    btf->data_size = __stop_BTF - __start_BTF;

     err = btf_parse_hdr(env);
     if (err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3b92aea18ae7..c0ab9bfdf28a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -629,9 +629,20 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)

     mutex_lock(&map->freeze_mutex);

-    if ((vma->vm_flags & VM_WRITE) && map->frozen) {
-        err = -EPERM;
-        goto out;
+    if (vma->vm_flags & VM_WRITE) {
+        if (map->frozen) {
+            err = -EPERM;
+            goto out;
+        }
+        /* map is meant to be read-only, so do not allow mapping as
+         * writable, because it's possible to leak a writable page
+         * reference and allows user-space to still modify it after
+         * freezing, while verifier will assume contents do not change
+         */
+        if (map->map_flags & BPF_F_RDONLY_PROG) {
+            err = -EACCES;
+            goto out;
+        }
     }

     /* set default open/close callbacks */
@@ -1480,8 +1491,10 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
     if (err)
         goto free_value;

-    if (copy_to_user(uvalue, value, value_size) != 0)
+    if (copy_to_user(uvalue, value, value_size) != 0) {
+        err = -EFAULT;
         goto free_value;
+    }

     err = 0;
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index 7ae5dddd1fe6..3b495773de5a 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -9,15 +9,15 @@
 #include <linux/sysfs.h>

 /* See scripts/link-vmlinux.sh, gen_btf() func for details */
-extern char __weak _binary__btf_vmlinux_bin_start[];
-extern char __weak _binary__btf_vmlinux_bin_end[];
+extern char __weak __start_BTF[];
+extern char __weak __stop_BTF[];

 static ssize_t
 btf_vmlinux_read(struct file *file, struct kobject *kobj,
          struct bin_attribute *bin_attr,
          char *buf, loff_t off, size_t len)
 {
-    memcpy(buf, _binary__btf_vmlinux_bin_start + off, len);
+    memcpy(buf, __start_BTF + off, len);
     return len;
 }

@@ -30,15 +30,14 @@ static struct kobject *btf_kobj;

 static int __init btf_vmlinux_init(void)
 {
-    if (!_binary__btf_vmlinux_bin_start)
+    if (!__start_BTF)
         return 0;

     btf_kobj = kobject_create_and_add("btf", kernel_kobj);
     if (!btf_kobj)
         return -ENOMEM;

-    bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end -
-                    _binary__btf_vmlinux_bin_start;
+    bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF;

     return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux);
 }
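The arraymap change above guards mmap() against requests that reach past the map's value area. A minimal standalone sketch of that bounds check (not kernel code; PAGE_SIZE, PAGE_ALIGN and the field names mirror the kernel's but are redefined locally for illustration):

/* Standalone model of the check array_map_mmap() now performs. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Return 0 if a mapping of @len bytes at page offset @pgoff stays inside
 * the map's value area, -1 otherwise (the kernel returns -EINVAL). */
static int check_map_mmap(uint64_t pgoff, uint64_t len,
                          uint32_t max_entries, uint32_t elem_size)
{
    /* the (u64) cast matters: max_entries * elem_size can overflow u32 */
    uint64_t limit = PAGE_ALIGN((uint64_t)max_entries * elem_size);

    if (pgoff * PAGE_SIZE + len > limit)
        return -1;
    return 0;
}

int main(void)
{
    /* 1024 entries of 64 bytes = 64 KiB of value data */
    printf("%d\n", check_map_mmap(0, 65536, 1024, 64)); /* 0: fits */
    printf("%d\n", check_map_mmap(8, 65536, 1024, 64)); /* -1: past end */
    return 0;
}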
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1c53ccbd5b5d..775fca737909 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4113,7 +4113,9 @@ static int do_refine_retval_range(struct bpf_verifier_env *env,

     if (ret_type != RET_INTEGER ||
         (func_id != BPF_FUNC_get_stack &&
-         func_id != BPF_FUNC_probe_read_str))
+         func_id != BPF_FUNC_probe_read_str &&
+         func_id != BPF_FUNC_probe_read_kernel_str &&
+         func_id != BPF_FUNC_probe_read_user_str))
         return 0;

     /* Error case where ret is in interval [S32MIN, -1]. */
@@ -6498,6 +6500,22 @@ static int check_return_code(struct bpf_verifier_env *env)
             return 0;
         range = tnum_const(0);
         break;
+    case BPF_PROG_TYPE_TRACING:
+        switch (env->prog->expected_attach_type) {
+        case BPF_TRACE_FENTRY:
+        case BPF_TRACE_FEXIT:
+            range = tnum_const(0);
+            break;
+        case BPF_TRACE_RAW_TP:
+            return 0;
+        default:
+            return -ENOTSUPP;
+        }
+        break;
+    case BPF_PROG_TYPE_EXT:
+        /* freplace program can return anything as its return value
+         * depends on the to-be-replaced kernel func or bpf program.
+         */
     default:
         return 0;
     }
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 6f87352f8219..41ca996568df 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -33,12 +33,9 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
         return;

     /*
-     * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we
-     * see NULL updated_next or they see our updated stat.
-     */
-    smp_mb();
-
-    /*
+     * Speculative already-on-list test. This may race leading to
+     * temporary inaccuracies, which is fine.
+     *
      * Because @parent's updated_children is terminated with @parent
      * instead of NULL, we can tell whether @cgrp is on the list by
      * testing the next pointer for NULL.
@@ -134,13 +131,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
         *nextp = rstatc->updated_next;
         rstatc->updated_next = NULL;

-        /*
-         * Paired with the one in cgroup_rstat_cpu_updated().
-         * Either they see NULL updated_next or we see their
-         * updated stat.
-         */
-        smp_mb();
-
         return pos;
     }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 29ace472f916..ce9fd7605190 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -93,11 +93,11 @@ static void remote_function(void *data)
  * @info:   the function call argument
  *
  * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
+ * be on the current CPU, which just calls the function directly. This will
+ * retry due to any failures in smp_call_function_single(), such as if the
+ * task_cpu() goes offline concurrently.
  *
- * returns: @func return value, or
- *      -ESRCH  - when the process isn't running
- *      -EAGAIN - when the process moved away
+ * returns @func return value or -ESRCH when the process isn't running
  */
 static int
 task_function_call(struct task_struct *p, remote_function_f func, void *info)
@@ -110,11 +110,16 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
     };
     int ret;

-    do {
-        ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
-        if (!ret)
-            ret = data.ret;
-    } while (ret == -EAGAIN);
+    for (;;) {
+        ret = smp_call_function_single(task_cpu(p), remote_function,
+                           &data, 1);
+        ret = !ret ? data.ret : -EAGAIN;
+
+        if (ret != -EAGAIN)
+            break;
+
+        cond_resched();
+    }

     return ret;
 }
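The events/core.c change above replaces the bare do/while with a loop that yields between attempts. A runnable sketch of that retry shape, where flaky_call() is a stand-in for smp_call_function_single() plus the remote_function() wrapper, and cond_resched() is stubbed out:

#include <stdio.h>
#include <errno.h>

/* simulate an operation that fails with -EAGAIN a few times, the way
 * the cross-CPU call does while the target task keeps migrating */
static int attempts;
static int flaky_call(void)
{
    return (++attempts < 3) ? -EAGAIN : 42;
}

/* kernel: yield to the scheduler if needed; a no-op in this sketch */
static void cond_resched(void) { }

int main(void)
{
    int ret;

    for (;;) {
        ret = flaky_call();
        if (ret != -EAGAIN)
            break;
        cond_resched(); /* avoid a tight spin between retries */
    }
    printf("ret=%d after %d attempts\n", ret, attempts);
    return 0;
}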
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ece7e13f6e4a..cc2095607c74 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -867,10 +867,6 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
     if (ret)
         goto out;

-    /* uprobe_write_opcode() assumes we don't cross page boundary */
-    BUG_ON((uprobe->offset & ~PAGE_MASK) +
-            UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
-
     smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
     set_bit(UPROBE_COPY_INSN, &uprobe->flags);

@@ -1166,6 +1162,15 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
     if (offset > i_size_read(inode))
         return -EINVAL;

+    /*
+     * This ensures that copy_from_page(), copy_to_page() and
+     * __update_ref_ctr() can't cross page boundary.
+     */
+    if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
+        return -EINVAL;
+    if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
+        return -EINVAL;
+
  retry:
     uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
     if (!uprobe)
@@ -2014,6 +2019,9 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
     uprobe_opcode_t opcode;
     int result;

+    if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
+        return -EINVAL;
+
     pagefault_disable();
     result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
     pagefault_enable();
diff --git a/kernel/fork.c b/kernel/fork.c
index 01316a9e226f..9a5984ee435a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2486,11 +2486,11 @@ long do_fork(unsigned long clone_flags,
           int __user *child_tidptr)
 {
     struct kernel_clone_args args = {
-        .flags      = (clone_flags & ~CSIGNAL),
+        .flags      = (lower_32_bits(clone_flags) & ~CSIGNAL),
         .pidfd      = parent_tidptr,
         .child_tid  = child_tidptr,
         .parent_tid = parent_tidptr,
-        .exit_signal    = (clone_flags & CSIGNAL),
+        .exit_signal    = (lower_32_bits(clone_flags) & CSIGNAL),
         .stack      = stack_start,
         .stack_size = stack_size,
     };
@@ -2508,8 +2508,9 @@ long do_fork(unsigned long clone_flags,
 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 {
     struct kernel_clone_args args = {
-        .flags      = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
-        .exit_signal    = (flags & CSIGNAL),
+        .flags      = ((lower_32_bits(flags) | CLONE_VM |
+                    CLONE_UNTRACED) & ~CSIGNAL),
+        .exit_signal    = (lower_32_bits(flags) & CSIGNAL),
         .stack      = (unsigned long)fn,
         .stack_size = (unsigned long)arg,
     };
@@ -2570,11 +2571,11 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 #endif
 {
     struct kernel_clone_args args = {
-        .flags      = (clone_flags & ~CSIGNAL),
+        .flags      = (lower_32_bits(clone_flags) & ~CSIGNAL),
         .pidfd      = parent_tidptr,
         .child_tid  = child_tidptr,
         .parent_tid = parent_tidptr,
-        .exit_signal    = (clone_flags & CSIGNAL),
+        .exit_signal    = (lower_32_bits(clone_flags) & CSIGNAL),
         .stack      = newsp,
         .tls        = tls,
     };
diff --git a/kernel/padata.c b/kernel/padata.c
index 62082597d4a2..fee14ae90d96 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -703,7 +703,7 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node)
     struct padata_instance *pinst;
     int ret;

-    pinst = hlist_entry_safe(node, struct padata_instance, node);
+    pinst = hlist_entry_safe(node, struct padata_instance, cpu_online_node);
     if (!pinst_has_cpu(pinst, cpu))
         return 0;

@@ -718,7 +718,7 @@ static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node)
     struct padata_instance *pinst;
     int ret;

-    pinst = hlist_entry_safe(node, struct padata_instance, node);
+    pinst = hlist_entry_safe(node, struct padata_instance, cpu_dead_node);
     if (!pinst_has_cpu(pinst, cpu))
         return 0;

@@ -734,8 +734,9 @@ static enum cpuhp_state hp_online;
 static void __padata_free(struct padata_instance *pinst)
 {
 #ifdef CONFIG_HOTPLUG_CPU
-    cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, &pinst->node);
-    cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node);
+    cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD,
+                        &pinst->cpu_dead_node);
+    cpuhp_state_remove_instance_nocalls(hp_online, &pinst->cpu_online_node);
 #endif

     WARN_ON(!list_empty(&pinst->pslist));
@@ -939,9 +940,10 @@ static struct padata_instance *padata_alloc(const char *name,
     mutex_init(&pinst->lock);

 #ifdef CONFIG_HOTPLUG_CPU
-    cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node);
+    cpuhp_state_add_instance_nocalls_cpuslocked(hp_online,
+                            &pinst->cpu_online_node);
     cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD,
-                            &pinst->node);
+                            &pinst->cpu_dead_node);
 #endif

     put_online_cpus();
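The fork.c change above masks legacy clone() flags to their defined low 32 bits before they reach struct kernel_clone_args, so garbage upper bits on 64-bit can't leak through. A standalone sketch of the masking, with CSIGNAL and lower_32_bits() redefined locally for illustration:

#include <stdint.h>
#include <stdio.h>

#define CSIGNAL 0x000000ff          /* signal mask to be sent at exit */
#define lower_32_bits(n) ((uint32_t)(n))

int main(void)
{
    /* garbage in the upper 32 bits, SIGCHLD-style value in the low byte */
    unsigned long clone_flags = 0xdeadbeef00000111UL;

    uint64_t flags = lower_32_bits(clone_flags) & ~CSIGNAL;
    int exit_signal = lower_32_bits(clone_flags) & CSIGNAL;

    /* prints flags=0x100 exit_signal=17: the upper bits are gone */
    printf("flags=%#lx exit_signal=%d\n",
           (unsigned long)flags, exit_signal);
    return 0;
}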
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 6dbeedb7354c..daf3ea9d81de 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -898,6 +898,13 @@ static int software_resume(void)
     error = freeze_processes();
     if (error)
         goto Close_Finish;
+
+    error = freeze_kernel_threads();
+    if (error) {
+        thaw_processes();
+        goto Close_Finish;
+    }
+
     error = load_image_and_restore();
     thaw_processes();
  Finish:
diff --git a/kernel/relay.c b/kernel/relay.c
index ade14fb7ce2e..4b760ec16342 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -581,6 +581,11 @@ struct rchan *relay_open(const char *base_filename,
         return NULL;

     chan->buf = alloc_percpu(struct rchan_buf *);
+    if (!chan->buf) {
+        kfree(chan);
+        return NULL;
+    }
+
     chan->version = RELAYFS_CHANNEL_VERSION;
     chan->n_subbufs = n_subbufs;
     chan->subbuf_size = subbuf_size;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c76a20648b72..efb15f0f464b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2682,7 +2682,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
     /*
      * We don't care about NUMA placement if we don't have memory.
      */
-    if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+    if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
         return;

     /*
@@ -5276,32 +5276,38 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         cfs_rq = cfs_rq_of(se);
         enqueue_entity(cfs_rq, se, flags);

-        /*
-         * end evaluation on encountering a throttled cfs_rq
-         *
-         * note: in the case of encountering a throttled cfs_rq we will
-         * post the final h_nr_running increment below.
-         */
-        if (cfs_rq_throttled(cfs_rq))
-            break;
         cfs_rq->h_nr_running++;
         cfs_rq->idle_h_nr_running += idle_h_nr_running;

+        /* end evaluation on encountering a throttled cfs_rq */
+        if (cfs_rq_throttled(cfs_rq))
+            goto enqueue_throttle;
+
         flags = ENQUEUE_WAKEUP;
     }

     for_each_sched_entity(se) {
         cfs_rq = cfs_rq_of(se);
+
+        update_load_avg(cfs_rq, se, UPDATE_TG);
+        update_cfs_group(se);
+
         cfs_rq->h_nr_running++;
         cfs_rq->idle_h_nr_running += idle_h_nr_running;

+        /* end evaluation on encountering a throttled cfs_rq */
         if (cfs_rq_throttled(cfs_rq))
-            break;
+            goto enqueue_throttle;

-        update_load_avg(cfs_rq, se, UPDATE_TG);
-        update_cfs_group(se);
+        /*
+         * One parent has been throttled and cfs_rq removed from the
+         * list. Add it back to not break the leaf list.
+         */
+        if (throttled_hierarchy(cfs_rq))
+            list_add_leaf_cfs_rq(cfs_rq);
     }

+enqueue_throttle:
     if (!se) {
         add_nr_running(rq, 1);
         /*
@@ -5362,17 +5368,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         cfs_rq = cfs_rq_of(se);
         dequeue_entity(cfs_rq, se, flags);

-        /*
-         * end evaluation on encountering a throttled cfs_rq
-         *
-         * note: in the case of encountering a throttled cfs_rq we will
-         * post the final h_nr_running decrement below.
-         */
-        if (cfs_rq_throttled(cfs_rq))
-            break;
         cfs_rq->h_nr_running--;
         cfs_rq->idle_h_nr_running -= idle_h_nr_running;

+        /* end evaluation on encountering a throttled cfs_rq */
+        if (cfs_rq_throttled(cfs_rq))
+            goto dequeue_throttle;
+
         /* Don't dequeue parent if it has other entities besides us */
         if (cfs_rq->load.weight) {
             /* Avoid re-evaluating load for this entity: */
@@ -5390,16 +5392,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)

     for_each_sched_entity(se) {
         cfs_rq = cfs_rq_of(se);
+
+        update_load_avg(cfs_rq, se, UPDATE_TG);
+        update_cfs_group(se);
+
         cfs_rq->h_nr_running--;
         cfs_rq->idle_h_nr_running -= idle_h_nr_running;

+        /* end evaluation on encountering a throttled cfs_rq */
         if (cfs_rq_throttled(cfs_rq))
-            break;
+            goto dequeue_throttle;

-        update_load_avg(cfs_rq, se, UPDATE_TG);
-        update_cfs_group(se);
     }

+dequeue_throttle:
     if (!se)
         sub_nr_running(rq, 1);
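The relay.c change above adds the unwind step that was missing when alloc_percpu() fails: the already-allocated channel must be freed before returning. A standalone sketch of the pattern, with calloc()/free() standing in for kzalloc()/alloc_percpu()/kfree():

#include <stdlib.h>

struct chan {
    void **buf;
};

static struct chan *chan_open(void)
{
    struct chan *chan = calloc(1, sizeof(*chan));
    if (!chan)
        return NULL;

    chan->buf = calloc(16, sizeof(*chan->buf));
    if (!chan->buf) {
        free(chan); /* the fix: don't leak chan on the error path */
        return NULL;
    }
    return chan;
}

int main(void)
{
    struct chan *c = chan_open();
    if (c) {
        free(c->buf);
        free(c);
    }
    return 0;
}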
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 402eef84c859..743647005f64 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -466,7 +466,6 @@ config PROFILE_ANNOTATED_BRANCHES
 config PROFILE_ALL_BRANCHES
     bool "Profile all if conditionals" if !FORTIFY_SOURCE
     select TRACE_BRANCH_PROFILING
-    imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED  # avoid false positives
     help
       This tracer profiles all branch conditions. Every if ()
       taken in the kernel is recorded whether it hit or miss.
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 68250d433bd7..158233a2ab6c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -325,17 +325,15 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)

 /*
  * Only limited trace_printk() conversion specifiers allowed:
- * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s
+ * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s
  */
 BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
        u64, arg2, u64, arg3)
 {
+    int i, mod[3] = {}, fmt_cnt = 0;
+    char buf[64], fmt_ptype;
+    void *unsafe_ptr = NULL;
     bool str_seen = false;
-    int mod[3] = {};
-    int fmt_cnt = 0;
-    u64 unsafe_addr;
-    char buf[64];
-    int i;

     /*
      * bpf_check()->check_func_arg()->check_stack_boundary()
@@ -361,40 +359,71 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
         if (fmt[i] == 'l') {
             mod[fmt_cnt]++;
             i++;
-        } else if (fmt[i] == 'p' || fmt[i] == 's') {
+        } else if (fmt[i] == 'p') {
             mod[fmt_cnt]++;
+            if ((fmt[i + 1] == 'k' ||
+                 fmt[i + 1] == 'u') &&
+                fmt[i + 2] == 's') {
+                fmt_ptype = fmt[i + 1];
+                i += 2;
+                goto fmt_str;
+            }
+
             /* disallow any further format extensions */
             if (fmt[i + 1] != 0 &&
                 !isspace(fmt[i + 1]) &&
                 !ispunct(fmt[i + 1]))
                 return -EINVAL;
-            fmt_cnt++;
-            if (fmt[i] == 's') {
-                if (str_seen)
-                    /* allow only one '%s' per fmt string */
-                    return -EINVAL;
-                str_seen = true;
-
-                switch (fmt_cnt) {
-                case 1:
-                    unsafe_addr = arg1;
-                    arg1 = (long) buf;
-                    break;
-                case 2:
-                    unsafe_addr = arg2;
-                    arg2 = (long) buf;
-                    break;
-                case 3:
-                    unsafe_addr = arg3;
-                    arg3 = (long) buf;
-                    break;
-                }
-                buf[0] = 0;
-                strncpy_from_unsafe(buf,
-                            (void *) (long) unsafe_addr,
+
+            goto fmt_next;
+        } else if (fmt[i] == 's') {
+            mod[fmt_cnt]++;
+            fmt_ptype = fmt[i];
+fmt_str:
+            if (str_seen)
+                /* allow only one '%s' per fmt string */
+                return -EINVAL;
+            str_seen = true;
+
+            if (fmt[i + 1] != 0 &&
+                !isspace(fmt[i + 1]) &&
+                !ispunct(fmt[i + 1]))
+                return -EINVAL;
+
+            switch (fmt_cnt) {
+            case 0:
+                unsafe_ptr = (void *)(long)arg1;
+                arg1 = (long)buf;
+                break;
+            case 1:
+                unsafe_ptr = (void *)(long)arg2;
+                arg2 = (long)buf;
+                break;
+            case 2:
+                unsafe_ptr = (void *)(long)arg3;
+                arg3 = (long)buf;
+                break;
+            }
+
+            buf[0] = 0;
+            switch (fmt_ptype) {
+            case 's':
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+                strncpy_from_unsafe(buf, unsafe_ptr,
                             sizeof(buf));
+                break;
+#endif
+            case 'k':
+                strncpy_from_unsafe_strict(buf, unsafe_ptr,
+                               sizeof(buf));
+                break;
+            case 'u':
+                strncpy_from_unsafe_user(buf,
+                    (__force void __user *)unsafe_ptr,
+                             sizeof(buf));
+                break;
             }
-            continue;
+
+            goto fmt_next;
         }

         if (fmt[i] == 'l') {
@@ -405,6 +434,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
         if (fmt[i] != 'i' && fmt[i] != 'd' &&
             fmt[i] != 'u' && fmt[i] != 'x')
             return -EINVAL;
+fmt_next:
         fmt_cnt++;
     }

@@ -827,14 +857,16 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
         return &bpf_probe_read_user_proto;
     case BPF_FUNC_probe_read_kernel:
         return &bpf_probe_read_kernel_proto;
-    case BPF_FUNC_probe_read:
-        return &bpf_probe_read_compat_proto;
     case BPF_FUNC_probe_read_user_str:
         return &bpf_probe_read_user_str_proto;
     case BPF_FUNC_probe_read_kernel_str:
         return &bpf_probe_read_kernel_str_proto;
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+    case BPF_FUNC_probe_read:
+        return &bpf_probe_read_compat_proto;
     case BPF_FUNC_probe_read_str:
         return &bpf_probe_read_compat_str_proto;
+#endif
 #ifdef CONFIG_CGROUPS
     case BPF_FUNC_get_current_cgroup_id:
         return &bpf_get_current_cgroup_id_proto;
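The bpf_trace.c change above teaches the bpf_trace_printk() format scanner to distinguish %s from the new %pks (kernel string) and %pus (user string) specifiers, so the right unsafe string copy can be chosen. A standalone sketch of just that classification step, which only reports what it finds rather than copying anything:

#include <stdio.h>

static int scan_fmt(const char *fmt)
{
    for (int i = 0; fmt[i]; i++) {
        if (fmt[i] != '%')
            continue;
        i++;
        if (fmt[i] == 'p' &&
            (fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
            fmt[i + 2] == 's') {
            /* %pks -> strict kernel copy, %pus -> user copy */
            printf("%%p%cs -> %s string\n", fmt[i + 1],
                   fmt[i + 1] == 'k' ? "kernel" : "user");
            i += 2;
        } else if (fmt[i] == 's') {
            /* legacy %s: only valid where address spaces don't overlap */
            printf("%%s -> legacy (address-space guess)\n");
        }
    }
    return 0;
}

int main(void)
{
    return scan_fmt("comm=%s kptr=%pks uptr=%pus\n");
}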
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fd81c7de77a7..63089c70adbb 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5155,6 +5155,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
             list_del_rcu(&direct->next);
             synchronize_rcu_tasks();
             kfree(direct);
+            kfree(entry);
             ftrace_direct_func_count--;
         }
     }
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 0456e0a3dab1..382775edf690 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -4,28 +4,6 @@

 #ifdef CONFIG_FUNCTION_TRACER

-/*
- * Traverse the ftrace_global_list, invoking all entries. The reason that we
- * can use rcu_dereference_raw_check() is that elements removed from this list
- * are simply leaked, so there is no need to interact with a grace-period
- * mechanism. The rcu_dereference_raw_check() calls are needed to handle
- * concurrent insertions into the ftrace_global_list.
- *
- * Silly Alpha and silly pointer-speculation compiler optimizations!
- */
-#define do_for_each_ftrace_op(op, list)          \
-    op = rcu_dereference_raw_check(list);        \
-    do
-
-/*
- * Optimized for just a single item in the list (as that is the normal case).
- */
-#define while_for_each_ftrace_op(op)             \
-    while (likely(op = rcu_dereference_raw_check((op)->next)) && \
-           unlikely((op) != &ftrace_list_end))
-
-extern struct ftrace_ops __rcu *ftrace_ops_list;
-extern struct ftrace_ops ftrace_list_end;
 extern struct mutex ftrace_lock;
 extern struct ftrace_ops global_ops;
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index 31c0fad4cb9e..312d1a0ca3b6 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -16,6 +16,7 @@
 #include <linux/printk.h>
 #include <linux/string.h>
 #include <linux/sysfs.h>
+#include <linux/completion.h>

 static ulong delay = 100;
 static char test_mode[12] = "irq";
@@ -28,6 +29,8 @@ MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)");
 MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)");
 MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)");

+static struct completion done;
+
 #define MIN(x, y) ((x) < (y) ? (x) : (y))

 static void busy_wait(ulong time)
@@ -113,22 +116,47 @@ static int preemptirq_delay_run(void *data)

     for (i = 0; i < s; i++)
         (testfuncs[i])(i);
+
+    complete(&done);
+
+    set_current_state(TASK_INTERRUPTIBLE);
+    while (!kthread_should_stop()) {
+        schedule();
+        set_current_state(TASK_INTERRUPTIBLE);
+    }
+
+    __set_current_state(TASK_RUNNING);
+
     return 0;
 }

-static struct task_struct *preemptirq_start_test(void)
+static int preemptirq_run_test(void)
 {
+    struct task_struct *task;
     char task_name[50];

+    init_completion(&done);
+
     snprintf(task_name, sizeof(task_name), "%s_test", test_mode);
-    return kthread_run(preemptirq_delay_run, NULL, task_name);
+    task = kthread_run(preemptirq_delay_run, NULL, task_name);
+    if (IS_ERR(task))
+        return PTR_ERR(task);
+    if (task) {
+        wait_for_completion(&done);
+        kthread_stop(task);
+    }
+    return 0;
 }

 static ssize_t
 trigger_store(struct kobject *kobj, struct kobj_attribute *attr,
           const char *buf, size_t count)
 {
-    preemptirq_start_test();
+    ssize_t ret;
+
+    ret = preemptirq_run_test();
+    if (ret)
+        return ret;
     return count;
 }

@@ -148,11 +176,9 @@ static struct kobject *preemptirq_delay_kobj;

 static int __init preemptirq_delay_init(void)
 {
-    struct task_struct *test_task;
     int retval;

-    test_task = preemptirq_start_test();
-    retval = PTR_ERR_OR_ZERO(test_task);
+    retval = preemptirq_run_test();
     if (retval != 0)
         return retval;
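The preemptirq_delay_test change above makes the worker signal a completion after its payload and then park until kthread_stop(), so the creator can wait and tear it down safely. A runnable userspace analogue of that start/wait/stop handshake, using pthreads and a condition variable in place of kthread_run()/complete()/kthread_stop():

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool done, should_stop;

static void *worker(void *arg)
{
    puts("payload runs once");          /* the delay bursts */

    pthread_mutex_lock(&lock);
    done = true;                        /* complete(&done) */
    pthread_cond_broadcast(&cond);
    while (!should_stop)                /* park until told to stop */
        pthread_cond_wait(&cond, &lock);
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, worker, NULL);

    pthread_mutex_lock(&lock);
    while (!done)                       /* wait_for_completion(&done) */
        pthread_cond_wait(&cond, &lock);
    should_stop = true;                 /* kthread_stop(task) */
    pthread_cond_broadcast(&cond);
    pthread_mutex_unlock(&lock);

    pthread_join(t, NULL);
    return 0;
}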
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6b11e4e2150c..5f0aa5d66e22 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8452,6 +8452,19 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
      */
     allocate_snapshot = false;
 #endif
+
+    /*
+     * Because of some magic with the way alloc_percpu() works on
+     * x86_64, we need to synchronize the pgd of all the tables,
+     * otherwise the trace events that happen in x86_64 page fault
+     * handlers can't cope with accessing the chance that a
+     * alloc_percpu()'d memory might be touched in the page fault trace
+     * event. Oh, and we need to audit all other alloc_percpu() and
+     * vmalloc() calls in tracing, because something might get triggered
+     * within a page fault trace event!
+     */
+    vmalloc_sync_mappings();
+
     return 0;
 }
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 06d7feb5255f..9de29bb45a27 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -95,24 +95,20 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event)
     struct xbc_node *anode;
     char buf[MAX_BUF_LEN];
     const char *val;
-    int ret;
+    int ret = 0;

-    kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN);
+    xbc_node_for_each_array_value(node, "probes", anode, val) {
+        kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN);

-    ret = kprobe_event_gen_cmd_start(&cmd, event, NULL);
-    if (ret)
-        return ret;
+        ret = kprobe_event_gen_cmd_start(&cmd, event, val);
+        if (ret)
+            break;

-    xbc_node_for_each_array_value(node, "probes", anode, val) {
-        ret = kprobe_event_add_field(&cmd, val);
+        ret = kprobe_event_gen_cmd_end(&cmd);
         if (ret)
-            return ret;
+            pr_err("Failed to add probe: %s\n", buf);
     }

-    ret = kprobe_event_gen_cmd_end(&cmd);
-    if (ret)
-        pr_err("Failed to add probe: %s\n", buf);
-
     return ret;
 }

 #else
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5f6834a2bf41..fcab11cc6833 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -3320,6 +3320,9 @@ static void __destroy_hist_field(struct hist_field *hist_field)
     kfree(hist_field->name);
     kfree(hist_field->type);

+    kfree(hist_field->system);
+    kfree(hist_field->event_name);
+
     kfree(hist_field);
 }

@@ -4382,6 +4385,7 @@ static struct hist_field *create_var(struct hist_trigger_data *hist_data,
         goto out;
     }

+    var->ref = 1;
     var->flags = HIST_FIELD_FL_VAR;
     var->var.idx = idx;
     var->var.hist_data = var->hist_data = hist_data;
@@ -5011,6 +5015,9 @@ static void destroy_field_vars(struct hist_trigger_data *hist_data)

     for (i = 0; i < hist_data->n_field_vars; i++)
         destroy_field_var(hist_data->field_vars[i]);
+
+    for (i = 0; i < hist_data->n_save_vars; i++)
+        destroy_field_var(hist_data->save_vars[i]);
 }

 static void save_field_var(struct hist_trigger_data *hist_data,
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d0568af4a0ef..35989383ae11 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -453,7 +453,7 @@ static bool __within_notrace_func(unsigned long addr)

 static bool within_notrace_func(struct trace_kprobe *tk)
 {
-    unsigned long addr = addr = trace_kprobe_address(tk);
+    unsigned long addr = trace_kprobe_address(tk);
     char symname[KSYM_NAME_LEN], *p;

     if (!__within_notrace_func(addr))
@@ -940,6 +940,9 @@ EXPORT_SYMBOL_GPL(kprobe_event_cmd_init);
  * complete command or only the first part of it; in the latter case,
  * kprobe_event_add_fields() can be used to add more fields following this.
  *
+ * Unlike synth_event_gen_cmd_start(), @loc must be specified. This
+ * function returns -EINVAL if @loc == NULL.
+ *
  * Return: 0 if successful, error otherwise.
  */
 int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe,
@@ -953,6 +956,9 @@ int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe,
     if (cmd->type != DYNEVENT_TYPE_KPROBE)
         return -EINVAL;

+    if (!loc)
+        return -EINVAL;
+
     if (kretprobe)
         snprintf(buf, MAX_EVENT_NAME_LEN, "r:kprobes/%s", name);
     else
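The trace_boot.c change above re-initializes and finalizes the command buffer once per probe value instead of accumulating everything into a single command. A loose standalone sketch of that per-iteration reset, with snprintf() standing in for the dynevent cmd helpers (the real API builds and registers a kprobe event; this only formats the command string):

#include <stdio.h>

int main(void)
{
    const char *probes[] = { "vfs_read $arg1", "vfs_write $arg1" };
    char buf[128];

    for (int i = 0; i < 2; i++) {
        /* kprobe_event_cmd_init() + gen_cmd_start() equivalent:
         * start from an empty buffer on every iteration */
        int n = snprintf(buf, sizeof(buf), "p:kprobes/event%d %s",
                         i, probes[i]);
        if (n < 0 || n >= (int)sizeof(buf))
            return 1;   /* the gen_cmd_end() failure path */
        printf("would register: %s\n", buf);
    }
    return 0;
}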
diff --git a/kernel/umh.c b/kernel/umh.c
index 7f255b5a8845..3474d6aa55d8 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -475,6 +475,12 @@ static void umh_clean_and_save_pid(struct subprocess_info *info)
 {
     struct umh_info *umh_info = info->data;

+    /* cleanup if umh_pipe_setup() was successful but exec failed */
+    if (info->pid && info->retval) {
+        fput(umh_info->pipe_to_umh);
+        fput(umh_info->pipe_from_umh);
+    }
+
     argv_free(info->argv);
     umh_info->pid = info->pid;
 }
@@ -544,6 +550,11 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob);
  * Runs a user-space application. The application is started
  * asynchronously if wait is not set, and runs as a child of system workqueues.
  * (ie. it runs with full root capabilities and optimized affinity).
+ *
+ * Note: successful return value does not guarantee the helper was called at
+ * all. You can't rely on sub_info->{init,cleanup} being called even for
+ * UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers
+ * into a successful no-op.
  */
 int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 {
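The umh.c change above releases both pipe files when umh_pipe_setup() succeeded but the exec ultimately failed, since otherwise their references leak. A standalone sketch of that cleanup rule, with FILE * standing in for struct file and fclose() for fput():

#include <stdio.h>

struct umh_info {
    FILE *pipe_to_umh;
    FILE *pipe_from_umh;
};

/* drop both pipe references iff setup succeeded (pid != 0) but the
 * exec failed (retval != 0) -- the case the patch handles */
static void umh_clean(struct umh_info *info, int pid, int retval)
{
    if (pid && retval) {
        fclose(info->pipe_to_umh);
        fclose(info->pipe_from_umh);
    }
}

int main(void)
{
    struct umh_info info = { tmpfile(), tmpfile() };

    if (info.pipe_to_umh && info.pipe_from_umh)
        umh_clean(&info, 1234, -2 /* e.g. exec failed with -ENOENT */);
    return 0;
}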