diff options
Diffstat (limited to 'kernel')
54 files changed, 1329 insertions, 407 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index e5bc66a94b70..2044ef02763b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -134,6 +134,8 @@ KASAN_SANITIZE_stackleak.o := n KCSAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n +obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o + $(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz diff --git a/kernel/acct.c b/kernel/acct.c index b0c5b3a9f5af..f175df8f6aa4 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -25,7 +25,7 @@ * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct(). * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV. * - * Fixed a nasty interaction with with sys_umount(). If the accointing + * Fixed a nasty interaction with sys_umount(). If the accounting * was suspeneded we failed to stop it on umount(). Messy. * Another one: remount to readonly didn't stop accounting. * Question: what should we do if we have CAP_SYS_ADMIN but not @@ -263,12 +263,12 @@ static DEFINE_MUTEX(acct_on_mutex); * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting * - * Returns 0 for success or negative errno values for failure. - * * sys_acct() is the only system call needed to implement process * accounting. It takes the name of the file where accounting records * should be written. If the filename is NULL, accounting will be * shutdown. + * + * Returns: 0 for success or negative errno values for failure. 
*/ SYSCALL_DEFINE1(acct, const char __user *, name) { @@ -586,9 +586,7 @@ static void slow_acct_process(struct pid_namespace *ns) } /** - * acct_process - * - * handles process accounting for an exiting task + * acct_process - handles process accounting for an exiting task */ void acct_process(void) { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 642415b8c3c9..57b5b5d0a5fd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -390,7 +390,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * The top cpuset doesn't have any online cpu as a * consequence of a race between cpuset_hotplug_work * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to to be + * cpuset's effective_cpus is on its way to be * identical to cpu_online_mask. */ cpumask_copy(pmask, cpu_online_mask); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index b16dbc1bf056..1e75a8923a8d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -80,7 +80,7 @@ static int exception_level; struct kgdb_io *dbg_io_ops; static DEFINE_SPINLOCK(kgdb_registration_lock); -/* Action for the reboot notifiter, a global allow kdb to change it */ +/* Action for the reboot notifier, a global allow kdb to change it */ static int kgdbreboot; /* kgdb console driver is loaded */ static int kgdb_con_registered; @@ -94,14 +94,6 @@ int dbg_switch_cpu; /* Use kdb or gdbserver mode */ int dbg_kdb_mode = 1; -static int __init opt_kgdb_con(char *str) -{ - kgdb_use_con = 1; - return 0; -} - -early_param("kgdbcon", opt_kgdb_con); - module_param(kgdb_use_con, int, 0644); module_param(kgdbreboot, int, 0644); @@ -163,7 +155,7 @@ early_param("nokgdbroundup", opt_nokgdbroundup); /* * Weak aliases for breakpoint management, - * can be overriden by architectures when needed: + * can be overridden by architectures when needed: */ int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { @@ -177,17 +169,23 
@@ int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); return err; } +NOKPROBE_SYMBOL(kgdb_arch_set_breakpoint); int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { return copy_to_kernel_nofault((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } +NOKPROBE_SYMBOL(kgdb_arch_remove_breakpoint); int __weak kgdb_validate_break_address(unsigned long addr) { struct kgdb_bkpt tmp; int err; + + if (kgdb_within_blocklist(addr)) + return -EINVAL; + /* Validate setting the breakpoint and then removing it. If the * remove fails, the kernel needs to emit a bad message because we * are deep trouble not being able to put things back the way we @@ -208,6 +206,7 @@ unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) { return instruction_pointer(regs); } +NOKPROBE_SYMBOL(kgdb_arch_pc); int __weak kgdb_arch_init(void) { @@ -218,6 +217,7 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs) { return 0; } +NOKPROBE_SYMBOL(kgdb_skipexception); #ifdef CONFIG_SMP @@ -239,6 +239,7 @@ void __weak kgdb_call_nmi_hook(void *ignored) */ kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); } +NOKPROBE_SYMBOL(kgdb_call_nmi_hook); void __weak kgdb_roundup_cpus(void) { @@ -272,6 +273,7 @@ void __weak kgdb_roundup_cpus(void) kgdb_info[cpu].rounding_up = false; } } +NOKPROBE_SYMBOL(kgdb_roundup_cpus); #endif @@ -298,6 +300,7 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } +NOKPROBE_SYMBOL(kgdb_flush_swbreak_addr); /* * SW breakpoint management: @@ -325,6 +328,7 @@ int dbg_activate_sw_breakpoints(void) } return ret; } +NOKPROBE_SYMBOL(dbg_activate_sw_breakpoints); int dbg_set_sw_break(unsigned long addr) { @@ -388,6 +392,7 @@ int dbg_deactivate_sw_breakpoints(void) } return ret; } +NOKPROBE_SYMBOL(dbg_deactivate_sw_breakpoints); int 
dbg_remove_sw_break(unsigned long addr) { @@ -509,6 +514,7 @@ static int kgdb_io_ready(int print_wait) } return 1; } +NOKPROBE_SYMBOL(kgdb_io_ready); static int kgdb_reenter_check(struct kgdb_state *ks) { @@ -556,6 +562,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) return 1; } +NOKPROBE_SYMBOL(kgdb_reenter_check); static void dbg_touch_watchdogs(void) { @@ -563,6 +570,7 @@ static void dbg_touch_watchdogs(void) clocksource_touch_watchdog(); rcu_cpu_stall_reset(); } +NOKPROBE_SYMBOL(dbg_touch_watchdogs); static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, int exception_state) @@ -752,6 +760,8 @@ cpu_master_loop: } } + dbg_activate_sw_breakpoints(); + /* Call the I/O driver's post_exception routine */ if (dbg_io_ops->post_exception) dbg_io_ops->post_exception(); @@ -794,6 +804,7 @@ kgdb_restore: return kgdb_info[cpu].ret_state; } +NOKPROBE_SYMBOL(kgdb_cpu_enter); /* * kgdb_handle_exception() - main entry point from a kernel exception @@ -838,6 +849,7 @@ out: arch_kgdb_ops.enable_nmi(1); return ret; } +NOKPROBE_SYMBOL(kgdb_handle_exception); /* * GDB places a breakpoint at this function to know dynamically loaded objects. 
@@ -872,6 +884,7 @@ int kgdb_nmicallback(int cpu, void *regs) #endif return 1; } +NOKPROBE_SYMBOL(kgdb_nmicallback); int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code, atomic_t *send_ready) @@ -897,6 +910,7 @@ int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code, #endif return 1; } +NOKPROBE_SYMBOL(kgdb_nmicallin); static void kgdb_console_write(struct console *co, const char *s, unsigned count) @@ -920,6 +934,20 @@ static struct console kgdbcons = { .index = -1, }; +static int __init opt_kgdb_con(char *str) +{ + kgdb_use_con = 1; + + if (kgdb_io_module_registered && !kgdb_con_registered) { + register_console(&kgdbcons); + kgdb_con_registered = 1; + } + + return 0; +} + +early_param("kgdbcon", opt_kgdb_con); + #ifdef CONFIG_MAGIC_SYSRQ static void sysrq_handle_dbg(int key) { diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index cc3c43dfec44..a77df59d9ca5 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -725,7 +725,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) } } - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (i >= ks->thr_query && !finished) { int_to_threadref(thref, p->pid); ptr = pack_threadid(ptr, thref); @@ -735,7 +735,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) finished = 1; } i++; - } while_each_thread(g, p); + } *(--ptr) = '\0'; break; @@ -1061,7 +1061,6 @@ int gdb_serial_stub(struct kgdb_state *ks) error_packet(remcom_out_buffer, -EINVAL); break; } - dbg_activate_sw_breakpoints(); fallthrough; /* to default processing */ default: default_handle: diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index d7ebb2c79cb8..ec4940146612 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -307,6 +307,15 @@ static int kdb_bp(int argc, const char **argv) return KDB_BADINT; /* + * This check is redundant (since the breakpoint machinery should + * be doing the same check during kdb_bp_install) but gives the + * user immediate feedback. 
+ */ + diag = kgdb_validate_break_address(template.bp_addr); + if (diag) + return diag; + + /* * Find an empty bp structure to allocate */ for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 18e03aba2cfc..1f9f0e47aeda 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -149,14 +149,14 @@ kdb_bt(int argc, const char **argv) return 0; } /* Now the inactive tasks */ - kdb_do_each_thread(g, p) { + for_each_process_thread(g, p) { if (KDB_FLAG(CMD_INTERRUPT)) return 0; if (task_curr(p)) continue; if (kdb_bt1(p, mask, btaprompt)) return 0; - } kdb_while_each_thread(g, p); + } } else if (strcmp(argv[0], "btp") == 0) { struct task_struct *p; unsigned long pid; diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 53a0df6e4d92..0220afda3200 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -147,7 +147,6 @@ int kdb_stub(struct kgdb_state *ks) return DBG_PASS_EVENT; } kdb_bp_install(ks->linux_regs); - dbg_activate_sw_breakpoints(); /* Set the exit state to a single step or a continue */ if (KDB_STATE(DOING_SS)) gdbstub_state(ks, "s"); @@ -167,7 +166,6 @@ int kdb_stub(struct kgdb_state *ks) * differently vs the gdbstub */ kgdb_single_step = 0; - dbg_deactivate_sw_breakpoints(); return DBG_SWITCH_CPU_EVENT; } return kgdb_info[ks->cpu].ret_state; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9d847ab851db..6735ac36b718 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -545,18 +545,18 @@ static int kdb_search_string(char *searched, char *searchfor) static void kdb_msg_write(const char *msg, int msg_len) { struct console *c; + const char *cp; + int len; if (msg_len == 0) return; - if (dbg_io_ops) { - const char *cp = msg; - int len = msg_len; + cp = msg; + len = msg_len; - while (len--) { - dbg_io_ops->write_char(*cp); - cp++; - } + while (len--) { + 
dbg_io_ops->write_char(*cp); + cp++; } for_each_console(c) { @@ -706,12 +706,16 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) size_avail = sizeof(kdb_buffer) - len; goto kdb_print_out; } - if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) + if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) { /* * This was a interactive search (using '/' at more - * prompt) and it has completed. Clear the flag. + * prompt) and it has completed. Replace the \0 with + * its original value to ensure multi-line strings + * are handled properly, and return to normal mode. */ + *cphold = replaced_byte; kdb_grepping_flag = 0; + } /* * at this point the string is a full line and * should be printed, up to the null. diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 5c7949061671..930ac1b25ec7 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2299,10 +2299,10 @@ void kdb_ps_suppressed(void) if (kdb_task_state(p, mask_I)) ++idle; } - kdb_do_each_thread(g, p) { + for_each_process_thread(g, p) { if (kdb_task_state(p, mask_M)) ++daemon; - } kdb_while_each_thread(g, p); + } if (idle || daemon) { if (idle) kdb_printf("%d idle process%s (state I)%s\n", @@ -2370,12 +2370,12 @@ static int kdb_ps(int argc, const char **argv) } kdb_printf("\n"); /* Now the real tasks */ - kdb_do_each_thread(g, p) { + for_each_process_thread(g, p) { if (KDB_FLAG(CMD_INTERRUPT)) return 0; if (kdb_task_state(p, mask)) kdb_ps1(p); - } kdb_while_each_thread(g, p); + } return 0; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 2e296e4a234c..a4281fb99299 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -230,10 +230,6 @@ extern struct task_struct *kdb_curr_task(int); #define kdb_task_has_cpu(p) (task_curr(p)) -/* Simplify coexistence with NPTL */ -#define kdb_do_each_thread(g, p) do_each_thread(g, p) -#define kdb_while_each_thread(g, p) while_each_thread(g, p) - #define GFP_KDB 
(in_interrupt() ? GFP_ATOMIC : GFP_KERNEL) extern void *debug_kmalloc(size_t size, gfp_t flags); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index b92d08e65999..06c111544f61 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -16,7 +16,7 @@ #include "direct.h" /* - * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it + * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use * it for entirely different regions. In that case the arch code needs to * override the variable below for dma-direct to work properly. */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 145ab11b8318..0a1e20f8d4e8 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -304,7 +304,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * terminate a grace period, if and only if the timer interrupt is * not nested into another interrupt. * - * Checking for __rcu_is_watching() here would prevent the nesting + * Checking for rcu_is_watching() here would prevent the nesting * interrupt to invoke rcu_irq_enter(). 
If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully * assume that it is the first interupt and eventually claim diff --git a/kernel/exit.c b/kernel/exit.c index 1f51c27bae59..87a2d515de0d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1474,25 +1474,6 @@ end: return retval; } -static struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) -{ - struct fd f; - struct pid *pid; - - f = fdget(fd); - if (!f.file) - return ERR_PTR(-EBADF); - - pid = pidfd_pid(f.file); - if (!IS_ERR(pid)) { - get_pid(pid); - *flags = f.file->f_flags; - } - - fdput(f); - return pid; -} - static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { diff --git a/kernel/fork.c b/kernel/fork.c index 3ca8f1f83fb3..32083db7a2a2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -556,7 +556,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, get_file(file); if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); + put_write_access(inode); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) mapping_allow_writable(mapping); @@ -2189,7 +2189,7 @@ static __latent_entropy struct task_struct *copy_process( /* * Ensure that the cgroup subsystem policies allow the new process to be - * forked. It should be noted the the new process's css_set can be changed + * forked. It should be noted that the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress. */ diff --git a/kernel/futex.c b/kernel/futex.c index a5876694a60e..680854dcf156 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -916,7 +916,7 @@ static inline void exit_pi_state_list(struct task_struct *curr) { } * [10] Found | Found | task | !=taskTID | 0/1 | Invalid * * [1] Indicates that the kernel can acquire the futex atomically. We - * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. 
+ * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. * * [2] Valid, if TID does not belong to a kernel thread. If no matching * thread is found then it indicates that the owner TID has died. diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index e960d7ce7bcc..773b6105c4ae 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -604,7 +604,7 @@ int irq_timings_alloc(int irq) /* * Some platforms can have the same private interrupt per cpu, - * so this function may be be called several times with the + * so this function may be called several times with the * same interrupt number. Just bail out in case the per cpu * stat structure is already allocated. */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index e661c61b3d6b..015ef903ce8c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -19,7 +19,7 @@ #include <linux/cpu.h> #include <asm/sections.h> -/* mutex to protect coming/going of the the jump_label table */ +/* mutex to protect coming/going of the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); void jump_label_lock(void) diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index f03562aaf2eb..1a6db2f797ac 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -32,7 +32,7 @@ * 1. different addresses but with the same encoded address race; * 2. and both map onto the same watchpoint slots; * - * Both these are assumed to be very unlikely. However, in case it still happens + * Both these are assumed to be very unlikely. However, in case it still * happens, the report logic will filter out the false positive (see report.c). */ #define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c5e5e5a11535..8798a8183974 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -109,7 +109,7 @@ EXPORT_SYMBOL_GPL(kexec_crash_loaded); * defined more restrictively in <asm/kexec.h>. 
* * The code for the transition from the current kernel to the - * the new kernel is placed in the control_code_buffer, whose size + * new kernel is placed in the control_code_buffer, whose size * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 84f7316792a7..e21f6b9234f7 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -521,7 +521,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) /* Returning 0 will take to next memory range */ /* Don't use memory that will be detected and handled by a driver. */ - if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED) + if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED) return 0; if (sz < kbuf->memsz) diff --git a/kernel/kthread.c b/kernel/kthread.c index 3edaa380dc7b..e29773c82b70 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -775,7 +775,7 @@ EXPORT_SYMBOL(kthread_create_worker); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it - * it to a given CPU and the associated NUMA node. + * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c index 7ee19476de9d..2565d039ade0 100644 --- a/kernel/livepatch/state.c +++ b/kernel/livepatch/state.c @@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(klp_get_state); * * The function can be called only during transition when a new * livepatch is being enabled or when such a transition is reverted. - * It is typically called only from from pre/post (un)patch + * It is typically called only from pre/post (un)patch * callbacks. 
* * Return: pointer to the latest struct klp_state from already diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 9cfa5e89cff7..62d215b2e39f 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -566,7 +566,7 @@ static struct lock_torture_ops rwsem_lock_ops = { #include <linux/percpu-rwsem.h> static struct percpu_rw_semaphore pcpu_rwsem; -void torture_percpu_rwsem_init(void) +static void torture_percpu_rwsem_init(void) { BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); } diff --git a/kernel/panic.c b/kernel/panic.c index aef8872ba843..396142ee43fd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -589,6 +589,11 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (args) vprintk(args->fmt, args->args); + print_modules(); + + if (regs) + show_regs(regs); + if (panic_on_warn) { /* * This thread may hit another WARN() in the panic path. @@ -600,12 +605,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, panic("panic_on_warn set ...\n"); } - print_modules(); - - if (regs) - show_regs(regs); - else - dump_stack(); + dump_stack(); print_irqtrace_events(current); diff --git a/kernel/pid.c b/kernel/pid.c index 74ddbff1a6ba..a96bc4bf4f86 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -520,6 +520,25 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return idr_get_next(&ns->idr, &nr); } +struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) +{ + struct fd f; + struct pid *pid; + + f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + + pid = pidfd_pid(f.file); + if (!IS_ERR(pid)) { + get_pid(pid); + *flags = f.file->f_flags; + } + + fdput(f); + return pid; +} + /** * pidfd_create() - Create a new pid file descriptor. 
* diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ac135bd600eb..9de21803a8ae 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -233,7 +233,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * to pid_ns->child_reaper. Thus pidns->child_reaper needs to * stay valid until they all go away. * - * The code relies on the the pid_ns->child_reaper ignoring + * The code relies on the pid_ns->child_reaper ignoring * SIGCHILD to cause those EXIT_ZOMBIE processes to be * autoreaped if reparented. * diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d25749bce7cf..46b1804c1ddf 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -735,7 +735,7 @@ zone_found: */ /* - * If the zone we wish to scan is the the current zone and the + * If the zone we wish to scan is the current zone and the * pfn falls into the current node then we do not need to walk * the tree. */ diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 2493348a1631..24a960a89aa8 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -1125,7 +1125,10 @@ static char *data_realloc(struct printk_ringbuffer *rb, /* If the data block does not increase, there is nothing to do. 
*/ if (head_lpos - next_lpos < DATA_SIZE(data_ring)) { - blk = to_block(data_ring, blk_lpos->begin); + if (wrapped) + blk = to_block(data_ring, 0); + else + blk = to_block(data_ring, blk_lpos->begin); return &blk->data[0]; } diff --git a/kernel/range.c b/kernel/range.c index d84de6766472..56435f96da73 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -2,8 +2,9 @@ /* * Range add and subtract */ -#include <linux/kernel.h> #include <linux/init.h> +#include <linux/minmax.h> +#include <linux/printk.h> #include <linux/sort.h> #include <linux/string.h> #include <linux/range.h> diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 0ebe15a84985..b71e21f73c40 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -135,10 +135,12 @@ config RCU_FANOUT config RCU_FANOUT_LEAF int "Tree-based hierarchical RCU leaf-level fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT + range 2 64 if 64BIT && !RCU_STRICT_GRACE_PERIOD + range 2 32 if !64BIT && !RCU_STRICT_GRACE_PERIOD + range 2 3 if RCU_STRICT_GRACE_PERIOD depends on TREE_RCU && RCU_EXPERT - default 16 + default 16 if !RCU_STRICT_GRACE_PERIOD + default 2 if RCU_STRICT_GRACE_PERIOD help This option controls the leaf-level fanout of hierarchical implementations of RCU, and allows trading off cache misses diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 3cf6132a4bb9..1942c1f1bb65 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -23,7 +23,7 @@ config TORTURE_TEST tristate default n -config RCU_PERF_TEST +config RCU_SCALE_TEST tristate "performance tests for RCU" depends on DEBUG_KERNEL select TORTURE_TEST @@ -114,4 +114,19 @@ config RCU_EQS_DEBUG Say N here if you need ultimate kernel/user switch latencies Say Y if you are unsure +config RCU_STRICT_GRACE_PERIOD + bool "Provide debug RCU implementation with short grace periods" + depends on DEBUG_KERNEL && RCU_EXPERT + default n + select PREEMPT_COUNT if PREEMPT=n + help + Select this option to build an RCU variant that 
is strict about + grace periods, making them as short as it can. This limits + scalability, destroys real-time response, degrades battery + lifetime and kills performance. Don't try this on large + machines, as in systems with more than about 10 or 20 CPUs. + But in conjunction with tools like KASAN, it can be helpful + when looking for certain types of RCU usage bugs, for example, + too-short RCU read-side critical sections. + endmenu # "RCU Debugging" diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 95f5117ef8da..0cfb009a99b9 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -11,7 +11,7 @@ obj-y += update.o sync.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o +obj-$(CONFIG_RCU_SCALE_TEST) += rcuscale.o obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 9a0f66133b4b..2d2a6b6b9dfb 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -475,8 +475,16 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * Also advance to the oldest segment of callbacks whose * ->gp_seq[] completion is at or after that passed in via "seq", * skipping any empty segments. + * + * Note that segment "i" (and any lower-numbered segments + * containing older callbacks) will be unaffected, and their + * grace-period numbers remain unchanged. For example, if i == + * WAIT_TAIL, then neither WAIT_TAIL nor DONE_TAIL will be touched. + * Instead, the CBs in NEXT_TAIL will be merged with those in + * NEXT_READY_TAIL and the grace-period number of NEXT_READY_TAIL + * would be updated. NEXT_TAIL would then be empty. 
*/ - if (++i >= RCU_NEXT_TAIL) + if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL) return false; /* diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuscale.c index 21448d3374e2..2819b95479af 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuscale.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Read-Copy Update module-based performance-test facility + * Read-Copy Update module-based scalability-test facility * * Copyright (C) IBM Corporation, 2015 * @@ -44,13 +44,13 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); -#define PERF_FLAG "-perf:" -#define PERFOUT_STRING(s) \ - pr_alert("%s" PERF_FLAG " %s\n", perf_type, s) -#define VERBOSE_PERFOUT_STRING(s) \ - do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) -#define VERBOSE_PERFOUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) +#define SCALE_FLAG "-scale:" +#define SCALEOUT_STRING(s) \ + pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s) +#define VERBOSE_SCALEOUT_STRING(s) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0) +#define VERBOSE_SCALEOUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s); } while (0) /* * The intended use cases for the nreaders and nwriters module parameters @@ -61,25 +61,25 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); * nr_cpus for a mixed reader/writer test. * * 2. Specify the nr_cpus kernel boot parameter, but set - * rcuperf.nreaders to zero. This will set nwriters to the + * rcuscale.nreaders to zero. This will set nwriters to the * value specified by nr_cpus for an update-only test. * * 3. Specify the nr_cpus kernel boot parameter, but set - * rcuperf.nwriters to zero. This will set nreaders to the + * rcuscale.nwriters to zero. This will set nreaders to the * value specified by nr_cpus for a read-only test. 
* * Various other use cases may of course be specified. * * Note that this test's readers are intended only as a test load for - * the writers. The reader performance statistics will be overly + * the writers. The reader scalability statistics will be overly * pessimistic due to the per-critical-section interrupt disabling, * test-end checks, and the pair of calls through pointers. */ #ifdef MODULE -# define RCUPERF_SHUTDOWN 0 +# define RCUSCALE_SHUTDOWN 0 #else -# define RCUPERF_SHUTDOWN 1 +# define RCUSCALE_SHUTDOWN 1 #endif torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); @@ -88,16 +88,16 @@ torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, RCUPERF_SHUTDOWN, - "Shutdown at end of performance tests."); +torture_param(bool, shutdown, RCUSCALE_SHUTDOWN, + "Shutdown at end of scalability tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); -torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); +torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?"); torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); -static char *perf_type = "rcu"; -module_param(perf_type, charp, 0444); -MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)"); +static char *scale_type = "rcu"; +module_param(scale_type, charp, 0444); +MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)"); static int nrealreaders; static int nrealwriters; @@ -107,12 +107,12 @@ static struct task_struct *shutdown_task; static u64 **writer_durations; static int *writer_n_durations; -static atomic_t 
n_rcu_perf_reader_started; -static atomic_t n_rcu_perf_writer_started; -static atomic_t n_rcu_perf_writer_finished; +static atomic_t n_rcu_scale_reader_started; +static atomic_t n_rcu_scale_writer_started; +static atomic_t n_rcu_scale_writer_finished; static wait_queue_head_t shutdown_wq; -static u64 t_rcu_perf_writer_started; -static u64 t_rcu_perf_writer_finished; +static u64 t_rcu_scale_writer_started; +static u64 t_rcu_scale_writer_finished; static unsigned long b_rcu_gp_test_started; static unsigned long b_rcu_gp_test_finished; static DEFINE_PER_CPU(atomic_t, n_async_inflight); @@ -124,7 +124,7 @@ static DEFINE_PER_CPU(atomic_t, n_async_inflight); * Operations vector for selecting different types of tests. */ -struct rcu_perf_ops { +struct rcu_scale_ops { int ptype; void (*init)(void); void (*cleanup)(void); @@ -140,19 +140,19 @@ struct rcu_perf_ops { const char *name; }; -static struct rcu_perf_ops *cur_ops; +static struct rcu_scale_ops *cur_ops; /* - * Definitions for rcu perf testing. + * Definitions for rcu scalability testing. 
*/ -static int rcu_perf_read_lock(void) __acquires(RCU) +static int rcu_scale_read_lock(void) __acquires(RCU) { rcu_read_lock(); return 0; } -static void rcu_perf_read_unlock(int idx) __releases(RCU) +static void rcu_scale_read_unlock(int idx) __releases(RCU) { rcu_read_unlock(); } @@ -162,15 +162,15 @@ static unsigned long __maybe_unused rcu_no_completed(void) return 0; } -static void rcu_sync_perf_init(void) +static void rcu_sync_scale_init(void) { } -static struct rcu_perf_ops rcu_ops = { +static struct rcu_scale_ops rcu_ops = { .ptype = RCU_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = rcu_perf_read_lock, - .readunlock = rcu_perf_read_unlock, + .init = rcu_sync_scale_init, + .readlock = rcu_scale_read_lock, + .readunlock = rcu_scale_read_unlock, .get_gp_seq = rcu_get_gp_seq, .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed, @@ -182,23 +182,23 @@ static struct rcu_perf_ops rcu_ops = { }; /* - * Definitions for srcu perf testing. + * Definitions for srcu scalability testing. 
*/ -DEFINE_STATIC_SRCU(srcu_ctl_perf); -static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf; +DEFINE_STATIC_SRCU(srcu_ctl_scale); +static struct srcu_struct *srcu_ctlp = &srcu_ctl_scale; -static int srcu_perf_read_lock(void) __acquires(srcu_ctlp) +static int srcu_scale_read_lock(void) __acquires(srcu_ctlp) { return srcu_read_lock(srcu_ctlp); } -static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp) +static void srcu_scale_read_unlock(int idx) __releases(srcu_ctlp) { srcu_read_unlock(srcu_ctlp, idx); } -static unsigned long srcu_perf_completed(void) +static unsigned long srcu_scale_completed(void) { return srcu_batches_completed(srcu_ctlp); } @@ -213,78 +213,78 @@ static void srcu_rcu_barrier(void) srcu_barrier(srcu_ctlp); } -static void srcu_perf_synchronize(void) +static void srcu_scale_synchronize(void) { synchronize_srcu(srcu_ctlp); } -static void srcu_perf_synchronize_expedited(void) +static void srcu_scale_synchronize_expedited(void) { synchronize_srcu_expedited(srcu_ctlp); } -static struct rcu_perf_ops srcu_ops = { +static struct rcu_scale_ops srcu_ops = { .ptype = SRCU_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = srcu_perf_read_lock, - .readunlock = srcu_perf_read_unlock, - .get_gp_seq = srcu_perf_completed, + .init = rcu_sync_scale_init, + .readlock = srcu_scale_read_lock, + .readunlock = srcu_scale_read_unlock, + .get_gp_seq = srcu_scale_completed, .gp_diff = rcu_seq_diff, - .exp_completed = srcu_perf_completed, + .exp_completed = srcu_scale_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, - .sync = srcu_perf_synchronize, - .exp_sync = srcu_perf_synchronize_expedited, + .sync = srcu_scale_synchronize, + .exp_sync = srcu_scale_synchronize_expedited, .name = "srcu" }; static struct srcu_struct srcud; -static void srcu_sync_perf_init(void) +static void srcu_sync_scale_init(void) { srcu_ctlp = &srcud; init_srcu_struct(srcu_ctlp); } -static void srcu_sync_perf_cleanup(void) +static void srcu_sync_scale_cleanup(void) { 
cleanup_srcu_struct(srcu_ctlp); } -static struct rcu_perf_ops srcud_ops = { +static struct rcu_scale_ops srcud_ops = { .ptype = SRCU_FLAVOR, - .init = srcu_sync_perf_init, - .cleanup = srcu_sync_perf_cleanup, - .readlock = srcu_perf_read_lock, - .readunlock = srcu_perf_read_unlock, - .get_gp_seq = srcu_perf_completed, + .init = srcu_sync_scale_init, + .cleanup = srcu_sync_scale_cleanup, + .readlock = srcu_scale_read_lock, + .readunlock = srcu_scale_read_unlock, + .get_gp_seq = srcu_scale_completed, .gp_diff = rcu_seq_diff, - .exp_completed = srcu_perf_completed, + .exp_completed = srcu_scale_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, - .sync = srcu_perf_synchronize, - .exp_sync = srcu_perf_synchronize_expedited, + .sync = srcu_scale_synchronize, + .exp_sync = srcu_scale_synchronize_expedited, .name = "srcud" }; /* - * Definitions for RCU-tasks perf testing. + * Definitions for RCU-tasks scalability testing. */ -static int tasks_perf_read_lock(void) +static int tasks_scale_read_lock(void) { return 0; } -static void tasks_perf_read_unlock(int idx) +static void tasks_scale_read_unlock(int idx) { } -static struct rcu_perf_ops tasks_ops = { +static struct rcu_scale_ops tasks_ops = { .ptype = RCU_TASKS_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = tasks_perf_read_lock, - .readunlock = tasks_perf_read_unlock, + .init = rcu_sync_scale_init, + .readlock = tasks_scale_read_lock, + .readunlock = tasks_scale_read_unlock, .get_gp_seq = rcu_no_completed, .gp_diff = rcu_seq_diff, .async = call_rcu_tasks, @@ -294,7 +294,7 @@ static struct rcu_perf_ops tasks_ops = { .name = "tasks" }; -static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) +static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) return new - old; @@ -302,60 +302,60 @@ static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) } /* - * If performance tests complete, wait for shutdown to commence. 
+ * If scalability tests complete, wait for shutdown to commence. */ -static void rcu_perf_wait_shutdown(void) +static void rcu_scale_wait_shutdown(void) { cond_resched_tasks_rcu_qs(); - if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) + if (atomic_read(&n_rcu_scale_writer_finished) < nrealwriters) return; while (!torture_must_stop()) schedule_timeout_uninterruptible(1); } /* - * RCU perf reader kthread. Repeatedly does empty RCU read-side critical - * section, minimizing update-side interference. However, the point of - * this test is not to evaluate reader performance, but instead to serve - * as a test load for update-side performance testing. + * RCU scalability reader kthread. Repeatedly does empty RCU read-side + * critical section, minimizing update-side interference. However, the + * point of this test is not to evaluate reader scalability, but instead + * to serve as a test load for update-side scalability testing. */ static int -rcu_perf_reader(void *arg) +rcu_scale_reader(void *arg) { unsigned long flags; int idx; long me = (long)arg; - VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); + VERBOSE_SCALEOUT_STRING("rcu_scale_reader task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); - atomic_inc(&n_rcu_perf_reader_started); + atomic_inc(&n_rcu_scale_reader_started); do { local_irq_save(flags); idx = cur_ops->readlock(); cur_ops->readunlock(idx); local_irq_restore(flags); - rcu_perf_wait_shutdown(); + rcu_scale_wait_shutdown(); } while (!torture_must_stop()); - torture_kthread_stopping("rcu_perf_reader"); + torture_kthread_stopping("rcu_scale_reader"); return 0; } /* - * Callback function for asynchronous grace periods from rcu_perf_writer(). + * Callback function for asynchronous grace periods from rcu_scale_writer(). 
*/ -static void rcu_perf_async_cb(struct rcu_head *rhp) +static void rcu_scale_async_cb(struct rcu_head *rhp) { atomic_dec(this_cpu_ptr(&n_async_inflight)); kfree(rhp); } /* - * RCU perf writer kthread. Repeatedly does a grace period. + * RCU scale writer kthread. Repeatedly does a grace period. */ static int -rcu_perf_writer(void *arg) +rcu_scale_writer(void *arg) { int i = 0; int i_max; @@ -366,7 +366,7 @@ rcu_perf_writer(void *arg) u64 *wdp; u64 *wdpp = writer_durations[me]; - VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); + VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); sched_set_fifo_low(current); @@ -383,8 +383,8 @@ rcu_perf_writer(void *arg) schedule_timeout_uninterruptible(1); t = ktime_get_mono_fast_ns(); - if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { - t_rcu_perf_writer_started = t; + if (atomic_inc_return(&n_rcu_scale_writer_started) >= nrealwriters) { + t_rcu_scale_writer_started = t; if (gp_exp) { b_rcu_gp_test_started = cur_ops->exp_completed() / 2; @@ -404,7 +404,7 @@ retry: rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { atomic_inc(this_cpu_ptr(&n_async_inflight)); - cur_ops->async(rhp, rcu_perf_async_cb); + cur_ops->async(rhp, rcu_scale_async_cb); rhp = NULL; } else if (!kthread_should_stop()) { cur_ops->gp_barrier(); @@ -421,19 +421,19 @@ retry: *wdp = t - *wdp; i_max = i; if (!started && - atomic_read(&n_rcu_perf_writer_started) >= nrealwriters) + atomic_read(&n_rcu_scale_writer_started) >= nrealwriters) started = true; if (!done && i >= MIN_MEAS) { done = true; sched_set_normal(current, 0); - pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", - perf_type, PERF_FLAG, me, MIN_MEAS); - if (atomic_inc_return(&n_rcu_perf_writer_finished) >= + pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", + scale_type, SCALE_FLAG, me, MIN_MEAS); + if 
(atomic_inc_return(&n_rcu_scale_writer_finished) >= nrealwriters) { schedule_timeout_interruptible(10); rcu_ftrace_dump(DUMP_ALL); - PERFOUT_STRING("Test complete"); - t_rcu_perf_writer_finished = t; + SCALEOUT_STRING("Test complete"); + t_rcu_scale_writer_finished = t; if (gp_exp) { b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; @@ -448,30 +448,30 @@ retry: } } if (done && !alldone && - atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) + atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters) alldone = true; if (started && !alldone && i < MAX_MEAS - 1) i++; - rcu_perf_wait_shutdown(); + rcu_scale_wait_shutdown(); } while (!torture_must_stop()); if (gp_async) { cur_ops->gp_barrier(); } writer_n_durations[me] = i_max; - torture_kthread_stopping("rcu_perf_writer"); + torture_kthread_stopping("rcu_scale_writer"); return 0; } static void -rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) +rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) { - pr_alert("%s" PERF_FLAG + pr_alert("%s" SCALE_FLAG "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", - perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown); + scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown); } static void -rcu_perf_cleanup(void) +rcu_scale_cleanup(void) { int i; int j; @@ -484,11 +484,11 @@ rcu_perf_cleanup(void) * during the mid-boot phase, so have to wait till the end. 
*/ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + VERBOSE_SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); if (rcu_gp_is_normal() && gp_exp) - VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + VERBOSE_SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); if (gp_exp && gp_async) - VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!"); + VERBOSE_SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); if (torture_cleanup_begin()) return; @@ -499,30 +499,30 @@ rcu_perf_cleanup(void) if (reader_tasks) { for (i = 0; i < nrealreaders; i++) - torture_stop_kthread(rcu_perf_reader, + torture_stop_kthread(rcu_scale_reader, reader_tasks[i]); kfree(reader_tasks); } if (writer_tasks) { for (i = 0; i < nrealwriters; i++) { - torture_stop_kthread(rcu_perf_writer, + torture_stop_kthread(rcu_scale_writer, writer_tasks[i]); if (!writer_n_durations) continue; j = writer_n_durations[i]; pr_alert("%s%s writer %d gps: %d\n", - perf_type, PERF_FLAG, i, j); + scale_type, SCALE_FLAG, i, j); ngps += j; } pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", - perf_type, PERF_FLAG, - t_rcu_perf_writer_started, t_rcu_perf_writer_finished, - t_rcu_perf_writer_finished - - t_rcu_perf_writer_started, + scale_type, SCALE_FLAG, + t_rcu_scale_writer_started, t_rcu_scale_writer_finished, + t_rcu_scale_writer_finished - + t_rcu_scale_writer_started, ngps, - rcuperf_seq_diff(b_rcu_gp_test_finished, - b_rcu_gp_test_started)); + rcuscale_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); for (i = 0; i < nrealwriters; i++) { if (!writer_durations) break; @@ -534,7 +534,7 @@ rcu_perf_cleanup(void) for (j = 0; j <= writer_n_durations[i]; j++) { wdp = &wdpp[j]; pr_alert("%s%s %4d writer-duration: %5d %llu\n", - perf_type, PERF_FLAG, + scale_type, 
SCALE_FLAG, i, j, *wdp); if (j % 100 == 0) schedule_timeout_uninterruptible(1); @@ -573,22 +573,22 @@ static int compute_real(int n) } /* - * RCU perf shutdown kthread. Just waits to be awakened, then shuts + * RCU scalability shutdown kthread. Just waits to be awakened, then shuts * down system. */ static int -rcu_perf_shutdown(void *arg) +rcu_scale_shutdown(void *arg) { wait_event(shutdown_wq, - atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters); + atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); smp_mb(); /* Wake before output. */ - rcu_perf_cleanup(); + rcu_scale_cleanup(); kernel_power_off(); return -EINVAL; } /* - * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number + * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number * of iterations and measure total time and number of GP for all iterations to complete. */ @@ -598,8 +598,8 @@ torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num alloc static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; -static atomic_t n_kfree_perf_thread_started; -static atomic_t n_kfree_perf_thread_ended; +static atomic_t n_kfree_scale_thread_started; +static atomic_t n_kfree_scale_thread_ended; struct kfree_obj { char kfree_obj[8]; @@ -607,7 +607,7 @@ struct kfree_obj { }; static int -kfree_perf_thread(void *arg) +kfree_scale_thread(void *arg) { int i, loop = 0; long me = (long)arg; @@ -615,13 +615,13 @@ kfree_perf_thread(void *arg) u64 start_time, end_time; long long mem_begin, mem_during = 0; - VERBOSE_PERFOUT_STRING("kfree_perf_thread task started"); + VERBOSE_SCALEOUT_STRING("kfree_scale_thread task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); start_time = ktime_get_mono_fast_ns(); - if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) { + if (atomic_inc_return(&n_kfree_scale_thread_started) >= kfree_nrealthreads) { if (gp_exp) 
b_rcu_gp_test_started = cur_ops->exp_completed() / 2; else @@ -646,7 +646,7 @@ kfree_perf_thread(void *arg) cond_resched(); } while (!torture_must_stop() && ++loop < kfree_loops); - if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) { + if (atomic_inc_return(&n_kfree_scale_thread_ended) >= kfree_nrealthreads) { end_time = ktime_get_mono_fast_ns(); if (gp_exp) @@ -656,7 +656,7 @@ kfree_perf_thread(void *arg) pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", (unsigned long long)(end_time - start_time), kfree_loops, - rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), + rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); if (shutdown) { @@ -665,12 +665,12 @@ kfree_perf_thread(void *arg) } } - torture_kthread_stopping("kfree_perf_thread"); + torture_kthread_stopping("kfree_scale_thread"); return 0; } static void -kfree_perf_cleanup(void) +kfree_scale_cleanup(void) { int i; @@ -679,7 +679,7 @@ kfree_perf_cleanup(void) if (kfree_reader_tasks) { for (i = 0; i < kfree_nrealthreads; i++) - torture_stop_kthread(kfree_perf_thread, + torture_stop_kthread(kfree_scale_thread, kfree_reader_tasks[i]); kfree(kfree_reader_tasks); } @@ -691,20 +691,20 @@ kfree_perf_cleanup(void) * shutdown kthread. Just waits to be awakened, then shuts down system. */ static int -kfree_perf_shutdown(void *arg) +kfree_scale_shutdown(void *arg) { wait_event(shutdown_wq, - atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads); + atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); smp_mb(); /* Wake before output. */ - kfree_perf_cleanup(); + kfree_scale_cleanup(); kernel_power_off(); return -EINVAL; } static int __init -kfree_perf_init(void) +kfree_scale_init(void) { long i; int firsterr = 0; @@ -713,7 +713,7 @@ kfree_perf_init(void) /* Start up the kthreads. 
*/ if (shutdown) { init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(kfree_perf_shutdown, NULL, + firsterr = torture_create_kthread(kfree_scale_shutdown, NULL, shutdown_task); if (firsterr) goto unwind; @@ -730,13 +730,13 @@ kfree_perf_init(void) } for (i = 0; i < kfree_nrealthreads; i++) { - firsterr = torture_create_kthread(kfree_perf_thread, (void *)i, + firsterr = torture_create_kthread(kfree_scale_thread, (void *)i, kfree_reader_tasks[i]); if (firsterr) goto unwind; } - while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads) + while (atomic_read(&n_kfree_scale_thread_started) < kfree_nrealthreads) schedule_timeout_uninterruptible(1); torture_init_end(); @@ -744,35 +744,35 @@ kfree_perf_init(void) unwind: torture_init_end(); - kfree_perf_cleanup(); + kfree_scale_cleanup(); return firsterr; } static int __init -rcu_perf_init(void) +rcu_scale_init(void) { long i; int firsterr = 0; - static struct rcu_perf_ops *perf_ops[] = { + static struct rcu_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, }; - if (!torture_init_begin(perf_type, verbose)) + if (!torture_init_begin(scale_type, verbose)) return -EBUSY; - /* Process args and tell the world that the perf'er is on the job. */ - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { - cur_ops = perf_ops[i]; - if (strcmp(perf_type, cur_ops->name) == 0) + /* Process args and announce that the scalability'er is on the job. 
*/ + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { + cur_ops = scale_ops[i]; + if (strcmp(scale_type, cur_ops->name) == 0) break; } - if (i == ARRAY_SIZE(perf_ops)) { - pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); - pr_alert("rcu-perf types:"); - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) - pr_cont(" %s", perf_ops[i]->name); + if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); + pr_alert("rcu-scale types:"); + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + pr_cont(" %s", scale_ops[i]->name); pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); + WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST)); firsterr = -EINVAL; cur_ops = NULL; goto unwind; @@ -781,20 +781,20 @@ rcu_perf_init(void) cur_ops->init(); if (kfree_rcu_test) - return kfree_perf_init(); + return kfree_scale_init(); nrealwriters = compute_real(nwriters); nrealreaders = compute_real(nreaders); - atomic_set(&n_rcu_perf_reader_started, 0); - atomic_set(&n_rcu_perf_writer_started, 0); - atomic_set(&n_rcu_perf_writer_finished, 0); - rcu_perf_print_module_parms(cur_ops, "Start of test"); + atomic_set(&n_rcu_scale_reader_started, 0); + atomic_set(&n_rcu_scale_writer_started, 0); + atomic_set(&n_rcu_scale_writer_finished, 0); + rcu_scale_print_module_parms(cur_ops, "Start of test"); /* Start up the kthreads. 
*/ if (shutdown) { init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(rcu_perf_shutdown, NULL, + firsterr = torture_create_kthread(rcu_scale_shutdown, NULL, shutdown_task); if (firsterr) goto unwind; @@ -803,17 +803,17 @@ rcu_perf_init(void) reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nrealreaders; i++) { - firsterr = torture_create_kthread(rcu_perf_reader, (void *)i, + firsterr = torture_create_kthread(rcu_scale_reader, (void *)i, reader_tasks[i]); if (firsterr) goto unwind; } - while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders) + while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) schedule_timeout_uninterruptible(1); writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), GFP_KERNEL); @@ -823,7 +823,7 @@ rcu_perf_init(void) kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); if (!writer_tasks || !writer_durations || !writer_n_durations) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } @@ -835,7 +835,7 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } - firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, + firsterr = torture_create_kthread(rcu_scale_writer, (void *)i, writer_tasks[i]); if (firsterr) goto unwind; @@ -845,9 +845,9 @@ rcu_perf_init(void) unwind: torture_init_end(); - rcu_perf_cleanup(); + rcu_scale_cleanup(); return firsterr; } -module_init(rcu_perf_init); -module_exit(rcu_perf_cleanup); +module_init(rcu_scale_init); +module_exit(rcu_scale_cleanup); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f453bf8d2f1e..916ea4f66e4b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -52,19 +52,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. 
McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ #define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) @@ -100,6 +87,7 @@ torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); +torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); @@ -185,6 +173,7 @@ static long n_barrier_successes; /* did rcu_barrier test succeed? */ static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; +static unsigned long start_gp_seq; static int rcu_torture_writer_state; #define RTWS_FIXED_DELAY 0 @@ -1413,6 +1402,9 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) preempt_enable(); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + // This next splat is expected behavior if leakpointer, especially + // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. + WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); /* If error or close call, record the sequence of reader protections. 
*/ if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) { @@ -1808,6 +1800,7 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; +static DEFINE_MUTEX(rcu_fwd_mutex); static struct rcu_fwd *rcu_fwds; static bool rcu_fwd_emergency_stop; @@ -2074,8 +2067,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { - struct rcu_fwd *rfp = rcu_fwds; + struct rcu_fwd *rfp; + mutex_lock(&rcu_fwd_mutex); + rfp = rcu_fwds; + if (!rfp) { + mutex_unlock(&rcu_fwd_mutex); + return NOTIFY_OK; + } WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); rcu_torture_fwd_cb_hist(rfp); @@ -2093,6 +2092,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ pr_info("%s returning after OOM processing.\n", __func__); + mutex_unlock(&rcu_fwd_mutex); return NOTIFY_OK; } @@ -2114,13 +2114,11 @@ static int rcu_torture_fwd_prog(void *args) do { schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); - register_oom_notifier(&rcutorture_oom_nb); if (!IS_ENABLED(CONFIG_TINY_RCU) || rcu_inkernel_boot_has_ended()) rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); if (rcu_inkernel_boot_has_ended()) rcu_torture_fwd_prog_cr(rfp); - unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. 
*/ stutter_wait("rcu_torture_fwd_prog"); @@ -2160,9 +2158,26 @@ static int __init rcu_torture_fwd_prog_init(void) return -ENOMEM; spin_lock_init(&rfp->rcu_fwd_lock); rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + mutex_lock(&rcu_fwd_mutex); + rcu_fwds = rfp; + mutex_unlock(&rcu_fwd_mutex); + register_oom_notifier(&rcutorture_oom_nb); return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } +static void rcu_torture_fwd_prog_cleanup(void) +{ + struct rcu_fwd *rfp; + + torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); + rfp = rcu_fwds; + mutex_lock(&rcu_fwd_mutex); + rcu_fwds = NULL; + mutex_unlock(&rcu_fwd_mutex); + unregister_oom_notifier(&rcutorture_oom_nb); + kfree(rfp); +} + /* Callback function for RCU barrier testing. */ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) { @@ -2460,7 +2475,7 @@ rcu_torture_cleanup(void) show_rcu_gp_kthreads(); rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); - torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); + rcu_torture_fwd_prog_cleanup(); torture_stop_kthread(rcu_torture_stall, stall_task); torture_stop_kthread(rcu_torture_writer, writer_task); @@ -2482,8 +2497,9 @@ rcu_torture_cleanup(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); - pr_alert("%s: End-test grace-period state: g%lu f%#x\n", - cur_ops->name, gp_seq, flags); + pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n", + cur_ops->name, (long)gp_seq, flags, + rcutorture_seq_diff(gp_seq, start_gp_seq)); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); if (rcu_torture_can_boost()) @@ -2607,6 +2623,8 @@ rcu_torture_init(void) long i; int cpu; int firsterr = 0; + int flags = 0; + unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, @@ -2649,6 
+2667,11 @@ rcu_torture_init(void) nrealreaders = 1; } rcu_torture_print_module_parms(cur_ops, "Start of test"); + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + start_gp_seq = gp_seq; + pr_alert("%s: Start-test grace-period state: g%ld f%#x\n", + cur_ops->name, (long)gp_seq, flags); /* Set up the freelist. */ diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index d9291f883b54..952595c678b3 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -546,9 +546,11 @@ static int main_func(void *arg) // Print the average of all experiments SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); - buf[0] = 0; - strcat(buf, "\n"); - strcat(buf, "Runs\tTime(ns)\n"); + if (!errexit) { + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Runs\tTime(ns)\n"); + } for (exp = 0; exp < nruns; exp++) { u64 avg; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c100acf332ed..c13348ee80a5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -29,19 +29,6 @@ #include "rcu.h" #include "rcu_segcblist.h" -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Holdoff in nanoseconds for auto-expediting. */ #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f78ee759af9c..06895ef85d69 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -70,19 +70,6 @@ #endif #define MODULE_PARAM_PREFIX "rcutree." 
-#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Data structures. */ /* @@ -178,6 +165,12 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +// Add delay to rcu_read_unlock() for strict grace periods. +static int rcu_unlock_delay; +#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD +module_param(rcu_unlock_delay, int, 0444); +#endif + /* * This rcu parameter is runtime-read-only. It reflects * a minimum allowed number of objects which can be cached @@ -468,24 +461,25 @@ static int rcu_is_cpu_rrupt_from_idle(void) return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } -#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */ -#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */ +#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) + // Maximum callbacks per rcu_do_batch ... +#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood. static long blimit = DEFAULT_RCU_BLIMIT; -#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ +#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit. static long qhimark = DEFAULT_RCU_QHIMARK; -#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ +#define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit. static long qlowmark = DEFAULT_RCU_QLOMARK; #define DEFAULT_RCU_QOVLD_MULT 2 #define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK) -static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */ -static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */ +static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS. 
+static long qovld_calc = -1; // No pre-initialization lock acquisitions! module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); module_param(qovld, long, 0444); -static ulong jiffies_till_first_fqs = ULONG_MAX; +static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; static int rcu_divisor = 7; @@ -1092,11 +1086,6 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } } -noinstr bool __rcu_is_watching(void) -{ - return !rcu_dynticks_curr_cpu_in_eqs(); -} - /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * @@ -1229,13 +1218,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 1; } - /* If waiting too long on an offline CPU, complain. */ - if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && - time_after(jiffies, rcu_state.gp_start + HZ)) { + /* + * Complain if a CPU that is considered to be offline from RCU's + * perspective has not yet reported a quiescent state. After all, + * the offline CPU should have reported a quiescent state during + * the CPU-offline process, or, failing that, by rcu_gp_init() + * if it ran concurrently with either the CPU going offline or the + * last task on a leaf rcu_node structure exiting its RCU read-side + * critical section while all CPUs corresponding to that structure + * are offline. This added warning detects bugs in any of these + * code paths. + * + * The rcu_node structure's ->lock is held here, which excludes + * the relevant portions of the CPU-hotplug code, the grace-period + * initialization code, and the rcu_read_unlock() code paths. + * + * For more detail, please refer to the "Hotplug CPU" section + * of RCU's Requirements documentation. + */ + if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) { bool onl; struct rcu_node *rnp1; - WARN_ON(1); /* Offline CPUs are supposed to report QS! 
*/ pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, (long)rnp->gp_seq, (long)rnp->completedqs); @@ -1498,9 +1502,10 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB")); else - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB")); + return ret; } @@ -1576,6 +1581,19 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, } /* + * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a + * quiescent state. This is intended to be invoked when the CPU notices + * a new grace period. + */ +static void rcu_strict_gp_check_qs(void) +{ + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rcu_read_lock(); + rcu_read_unlock(); + } +} + +/* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. The caller must hold the ->lock of the leaf rcu_node * structure corresponding to the current CPU, and must have irqs disabled. @@ -1645,6 +1663,7 @@ static void note_gp_changes(struct rcu_data *rdp) } needwake = __note_gp_changes(rnp, rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + rcu_strict_gp_check_qs(); if (needwake) rcu_gp_kthread_wake(); } @@ -1683,6 +1702,15 @@ static void rcu_gp_torture_wait(void) } /* + * Handler for on_each_cpu() to invoke the target CPU's RCU core + * processing. + */ +static void rcu_strict_gp_boundary(void *unused) +{ + invoke_rcu_core(); +} + +/* * Initialize a new grace period. Return false if no grace period required. 
*/ static bool rcu_gp_init(void) @@ -1720,10 +1748,13 @@ static bool rcu_gp_init(void) raw_spin_unlock_irq_rcu_node(rnp); /* - * Apply per-leaf buffered online and offline operations to the - * rcu_node tree. Note that this new grace period need not wait - * for subsequent online CPUs, and that quiescent-state forcing - * will handle subsequent offline CPUs. + * Apply per-leaf buffered online and offline operations to + * the rcu_node tree. Note that this new grace period need not + * wait for subsequent online CPUs, and that RCU hooks in the CPU + * offlining path, when combined with checks in this function, + * will handle CPUs that are currently going offline or that will + * go offline later. Please also refer to "Hotplug CPU" section + * of RCU's Requirements documentation. */ rcu_state.gp_state = RCU_GP_ONOFF; rcu_for_each_leaf_node(rnp) { @@ -1810,6 +1841,10 @@ static bool rcu_gp_init(void) WRITE_ONCE(rcu_state.gp_activity, jiffies); } + // If strict, make all CPUs aware of new grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); + return true; } @@ -1898,7 +1933,7 @@ static void rcu_gp_fqs_loop(void) break; /* If time for quiescent-state forcing, do it. */ if (!time_after(rcu_state.jiffies_force_qs, jiffies) || - (gf & RCU_GP_FLAG_FQS)) { + (gf & (RCU_GP_FLAG_FQS | RCU_GP_FLAG_OVLD))) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); @@ -2026,6 +2061,10 @@ static void rcu_gp_cleanup(void) rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); + + // If strict, make all CPUs aware of the end of the old grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); } /* @@ -2204,7 +2243,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) * structure. This must be called from the specified CPU. 
*/ static void -rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) +rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; @@ -2213,6 +2252,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_node *rnp; + WARN_ON_ONCE(rdp->cpu != smp_processor_id()); rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || @@ -2229,8 +2269,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; - if (rdp->cpu == smp_processor_id()) - rdp->core_needs_qs = false; + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { @@ -2279,7 +2318,7 @@ rcu_check_quiescent_state(struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rdp); + rcu_report_qs_rdp(rdp); } /* @@ -2376,6 +2415,7 @@ int rcutree_dead_cpu(unsigned int cpu) */ static void rcu_do_batch(struct rcu_data *rdp) { + int div; unsigned long flags; const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rcu_segcblist_is_offloaded(&rdp->cblist); @@ -2404,9 +2444,15 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_lock(rdp); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); pending = rcu_segcblist_n_cbs(&rdp->cblist); - bl = max(rdp->blimit, pending >> rcu_divisor); - if (unlikely(bl > 100)) - tlimit = local_clock() + rcu_resched_ns; + div = READ_ONCE(rcu_divisor); + div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; + bl = max(rdp->blimit, pending >> div); + if (unlikely(bl > 100)) { + long rrn = READ_ONCE(rcu_resched_ns); + + rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? 
NSEC_PER_SEC : rrn; + tlimit = local_clock() + rrn; + } trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); @@ -2547,8 +2593,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || - rcu_preempt_blocked_readers_cgp(rnp)) { + if (rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they * are all zero. But we might need to @@ -2616,6 +2661,14 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +// Workqueue handler for an RCU reader for kernels enforcing strict RCU +// grace periods. +static void strict_work_handler(struct work_struct *work) +{ + rcu_read_lock(); + rcu_read_unlock(); +} + /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(void) { @@ -2660,6 +2713,10 @@ static __latent_entropy void rcu_core(void) /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); trace_rcu_utilization(TPS("End RCU core")); + + // If strict GPs, schedule an RCU reader in a clean environment. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work); } static void rcu_core_si(struct softirq_action *h) @@ -3022,6 +3079,12 @@ struct kfree_rcu_cpu_work { * @monitor_todo: Tracks whether a @monitor_work delayed work is pending * @initialized: The @rcu_work fields have been initialized * @count: Number of objects for which GP not started + * @bkvcache: + * A simple cache list that contains objects for reuse purpose. + * In order to save some per-cpu space the list is singular. + * Even though it is lockless an access has to be protected by the + * per-cpu lock. + * @nr_bkv_objs: number of allocated objects at @bkvcache. * * This is a per-CPU structure. 
The reason that it is not included in * the rcu_data structure is to permit this code to be extracted from @@ -3037,14 +3100,6 @@ struct kfree_rcu_cpu { bool monitor_todo; bool initialized; int count; - - /* - * A simple cache list that contains objects for - * reuse purpose. In order to save some per-cpu - * space the list is singular. Even though it is - * lockless an access has to be protected by the - * per-cpu lock. - */ struct llist_head bkvcache; int nr_bkv_objs; }; @@ -3445,7 +3500,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count = 0; /* Snapshot count of all CPUs */ - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count += READ_ONCE(krcp->count); @@ -3460,7 +3515,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int cpu, freed = 0; unsigned long flags; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { int count; struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); @@ -3493,7 +3548,7 @@ void __init kfree_rcu_scheduler_running(void) int cpu; unsigned long flags; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); raw_spin_lock_irqsave(&krcp->lock, flags); @@ -3857,6 +3912,7 @@ rcu_boot_init_percpu_data(int cpu) /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); + INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(rdp->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; @@ -3975,8 +4031,6 @@ int rcutree_offline_cpu(unsigned int cpu) return 0; } -static DEFINE_PER_CPU(int, rcu_cpu_started); - /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. 
Note that this means that @@ -3996,12 +4050,11 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_node *rnp; bool newcpu; - if (per_cpu(rcu_cpu_started, cpu)) + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->cpu_started) return; + rdp->cpu_started = true; - per_cpu(rcu_cpu_started, cpu) = 1; - - rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -4061,7 +4114,7 @@ void rcu_report_dead(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); raw_spin_unlock(&rcu_state.ofl_lock); - per_cpu(rcu_cpu_started, cpu) = 0; + rdp->cpu_started = false; } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c96ae351688b..e4f66b8f7c47 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -156,6 +156,7 @@ struct rcu_data { bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ + bool cpu_started; /* RCU watching this onlining CPU. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ @@ -164,6 +165,7 @@ struct rcu_data { /* period it is aware of. */ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ bool defer_qs_iw_pending; /* Scheduler attention pending? */ + struct work_struct strict_work; /* Schedule readers for strict GPs. */ /* 2) batch handling */ struct rcu_segcblist cblist; /* Segmented callback list, with */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1888c0eb1216..8760b6ead770 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -732,11 +732,9 @@ static void rcu_exp_need_qs(void) /* Invoked on each online non-idle CPU for expedited quiescent state. 
*/ static void rcu_exp_handler(void *unused) { - struct rcu_data *rdp; - struct rcu_node *rnp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; - rdp = this_cpu_ptr(&rcu_data); - rnp = rdp->mynode; if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) return; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 982fc5be5269..fd8a52e9a887 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -36,6 +36,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) @@ -374,6 +376,8 @@ void __rcu_read_lock(void) rcu_preempt_read_enter(); if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread) + WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true); barrier(); /* critical section after entry code. 
*/ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -455,8 +459,14 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) return; } t->rcu_read_unlock_special.s = 0; - if (special.b.need_qs) - rcu_qs(); + if (special.b.need_qs) { + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rcu_report_qs_rdp(rdp); + udelay(rcu_unlock_delay); + } else { + rcu_qs(); + } + } /* * Respond to a request by an expedited grace period for a @@ -769,6 +779,24 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) #else /* #ifdef CONFIG_PREEMPT_RCU */ /* + * If strict grace periods are enabled, and if the calling + * __rcu_read_unlock() marks the beginning of a quiescent state, immediately + * report that quiescent state and, if requested, spin for a bit. + */ +void rcu_read_unlock_strict(void) +{ + struct rcu_data *rdp; + + if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || + irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + return; + rdp = this_cpu_ptr(&rcu_data); + rcu_report_qs_rdp(rdp); + udelay(rcu_unlock_delay); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_strict); + +/* * Tell them what RCU they are running. */ static void __init rcu_bootup_announce(void) @@ -1926,6 +1954,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * nearest grace period (if any) to wait for next. The CB kthreads * and the global grace-period kthread are awakened if needed. 
*/ + WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); @@ -2411,13 +2440,12 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) return; waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); - wastimer = timer_pending(&rdp->nocb_timer); + wastimer = timer_pending(&rdp->nocb_bypass_timer); wassleep = swait_active(&rdp->nocb_gp_wq); - if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep && - !waslocked && !wastimer && !wassleep) + if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep) return; /* Nothing untowards. */ - pr_info(" !!! %c%c%c%c %c\n", + pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n", "lL"[waslocked], "dD"[!!rdp->nocb_defer_wakeup], "tT"[wastimer], diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b5d3b4794db4..0fde39b8daab 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -158,7 +158,7 @@ static void rcu_stall_kick_kthreads(void) { unsigned long j; - if (!rcu_kick_kthreads) + if (!READ_ONCE(rcu_kick_kthreads)) return; j = READ_ONCE(rcu_state.jiffies_kick_kthreads); if (time_after(jiffies, j) && rcu_state.gp_kthread && @@ -580,7 +580,7 @@ static void check_cpu_stall(struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if ((rcu_stall_is_suppressed() && !rcu_kick_kthreads) || + if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) || !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); @@ -623,7 +623,7 @@ static void check_cpu_stall(struct rcu_data *rdp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); - if (rcu_cpu_stall_ftrace_dump) + if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); } else if (rcu_gp_in_progress() && @@ -632,7 +632,7 @@ static void check_cpu_stall(struct rcu_data *rdp) /* They had a few time units to dump stack, so complain. 
*/ print_other_cpu_stall(gs2, gps); - if (rcu_cpu_stall_ftrace_dump) + if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); } } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3e0f4bcb558f..39334d2d2b37 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -53,19 +53,6 @@ #endif #define MODULE_PARAM_PREFIX "rcupdate." -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); diff --git a/kernel/relay.c b/kernel/relay.c index fb4e0c530c08..b08d936d5fa7 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1002,7 +1002,7 @@ static int relay_file_read_avail(struct rchan_buf *buf) size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t produced = buf->subbufs_produced; - size_t consumed = buf->subbufs_consumed; + size_t consumed; relay_file_read_consume(buf, 0, 0); diff --git a/kernel/resource.c b/kernel/resource.c index db6582b6cf42..6d227a570f38 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1248,7 +1248,6 @@ EXPORT_SYMBOL(__release_region); #ifdef CONFIG_MEMORY_HOTREMOVE /** * release_mem_region_adjustable - release a previously reserved memory region - * @parent: parent resource descriptor * @start: resource start address * @size: resource region size * @@ -1266,21 +1265,28 @@ EXPORT_SYMBOL(__release_region); * assumes that all children remain in the lower address entry for * simplicity. Enhance this logic when necessary. 
*/ -int release_mem_region_adjustable(struct resource *parent, - resource_size_t start, resource_size_t size) +void release_mem_region_adjustable(resource_size_t start, resource_size_t size) { + struct resource *parent = &iomem_resource; + struct resource *new_res = NULL; + bool alloc_nofail = false; struct resource **p; struct resource *res; - struct resource *new_res; resource_size_t end; - int ret = -EINVAL; end = start + size - 1; - if ((start < parent->start) || (end > parent->end)) - return ret; + if (WARN_ON_ONCE((start < parent->start) || (end > parent->end))) + return; - /* The alloc_resource() result gets checked later */ - new_res = alloc_resource(GFP_KERNEL); + /* + * We free up quite a lot of memory on memory hotunplug (esp., memmap), + * just before releasing the region. This is highly unlikely to + * fail - let's play safe and make it never fail as the caller cannot + * perform any error handling (e.g., trying to re-add memory will fail + * similarly). + */ +retry: + new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0)); p = &parent->child; write_lock(&resource_lock); @@ -1306,7 +1312,6 @@ int release_mem_region_adjustable(struct resource *parent, * so if we are dealing with them, let us just back off here. 
*/ if (!(res->flags & IORESOURCE_SYSRAM)) { - ret = 0; break; } @@ -1323,20 +1328,23 @@ int release_mem_region_adjustable(struct resource *parent, /* free the whole entry */ *p = res->sibling; free_resource(res); - ret = 0; } else if (res->start == start && res->end != end) { /* adjust the start */ - ret = __adjust_resource(res, end + 1, - res->end - end); + WARN_ON_ONCE(__adjust_resource(res, end + 1, + res->end - end)); } else if (res->start != start && res->end == end) { /* adjust the end */ - ret = __adjust_resource(res, res->start, - start - res->start); + WARN_ON_ONCE(__adjust_resource(res, res->start, + start - res->start)); } else { - /* split into two entries */ + /* split into two entries - we need a new resource */ if (!new_res) { - ret = -ENOMEM; - break; + new_res = alloc_resource(GFP_ATOMIC); + if (!new_res) { + alloc_nofail = true; + write_unlock(&resource_lock); + goto retry; + } } new_res->name = res->name; new_res->start = end + 1; @@ -1347,9 +1355,8 @@ int release_mem_region_adjustable(struct resource *parent, new_res->sibling = res->sibling; new_res->child = NULL; - ret = __adjust_resource(res, res->start, - start - res->start); - if (ret) + if (WARN_ON_ONCE(__adjust_resource(res, res->start, + start - res->start))) break; res->sibling = new_res; new_res = NULL; @@ -1360,10 +1367,69 @@ int release_mem_region_adjustable(struct resource *parent, write_unlock(&resource_lock); free_resource(new_res); - return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ +#ifdef CONFIG_MEMORY_HOTPLUG +static bool system_ram_resources_mergeable(struct resource *r1, + struct resource *r2) +{ + /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. 
*/ + return r1->flags == r2->flags && r1->end + 1 == r2->start && + r1->name == r2->name && r1->desc == r2->desc && + !r1->child && !r2->child; +} + +/* + * merge_system_ram_resource - mark the System RAM resource mergeable and try to + * merge it with adjacent, mergeable resources + * @res: resource descriptor + * + * This interface is intended for memory hotplug, whereby lots of contiguous + * system ram resources are added (e.g., via add_memory*()) by a driver, and + * the actual resource boundaries are not of interest (e.g., it might be + * relevant for DIMMs). Only resources that are marked mergeable, that have the + * same parent, and that don't have any children are considered. All mergeable + * resources must be immutable during the request. + * + * Note: + * - The caller has to make sure that no pointers to resources that are + * marked mergeable are used anymore after this call - the resource might + * be freed and the pointer might be stale! + * - release_mem_region_adjustable() will split on demand on memory hotunplug + */ +void merge_system_ram_resource(struct resource *res) +{ + const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + struct resource *cur; + + if (WARN_ON_ONCE((res->flags & flags) != flags)) + return; + + write_lock(&resource_lock); + res->flags |= IORESOURCE_SYSRAM_MERGEABLE; + + /* Try to merge with next item in the list. */ + cur = res->sibling; + if (cur && system_ram_resources_mergeable(res, cur)) { + res->end = cur->end; + res->sibling = cur->sibling; + free_resource(cur); + } + + /* Try to merge with previous item in the list. 
*/ + cur = res->parent->child; + while (cur && cur->sibling != res) + cur = cur->sibling; + if (cur && system_ram_resources_mergeable(cur, res)) { + cur->end = res->end; + cur->sibling = res->sibling; + free_resource(res); + } + write_unlock(&resource_lock); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + /* * Managed region resource */ diff --git a/kernel/scftorture.c b/kernel/scftorture.c new file mode 100644 index 000000000000..554a521ee235 --- /dev/null +++ b/kernel/scftorture.c @@ -0,0 +1,575 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Torture test for smp_call_function() and friends. +// +// Copyright (C) Facebook, 2020. +// +// Author: Paul E. McKenney <paulmck@kernel.org> + +#define pr_fmt(fmt) fmt + +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kthread.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/notifier.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/rcupdate_trace.h> +#include <linux/reboot.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/stat.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <linux/torture.h> +#include <linux/types.h> + +#define SCFTORT_STRING "scftorture" +#define SCFTORT_FLAG SCFTORT_STRING ": " + +#define SCFTORTOUT(s, x...) \ + pr_alert(SCFTORT_FLAG s, ## x) + +#define VERBOSE_SCFTORTOUT(s, x...) \ + do { if (verbose) pr_alert(SCFTORT_FLAG s, ## x); } while (0) + +#define VERBOSE_SCFTORTOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert(SCFTORT_FLAG "!!! " s, ## x); } while (0) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); + +// Wait until there are multiple CPUs before starting test. 
+torture_param(int, holdoff, IS_BUILTIN(CONFIG_SCF_TORTURE_TEST) ? 10 : 0, + "Holdoff time before test start (s)"); +torture_param(int, longwait, 0, "Include ridiculously long waits? (seconds)"); +torture_param(int, nthreads, -1, "# threads, defaults to -1 for all CPUs."); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shutdown_secs, 0, "Shutdown time (ms), <= zero to disable."); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s."); +torture_param(int, stutter_cpus, 5, "Number of jiffies to change CPUs under test, 0=disable"); +torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU hotplug."); +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); +torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations."); +torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations."); +torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations."); +torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operations."); +torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations."); +torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations."); + +char *torture_type = ""; + +#ifdef MODULE +# define SCFTORT_SHUTDOWN 0 +#else +# define SCFTORT_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, SCFTORT_SHUTDOWN, "Shutdown at end of torture test."); + +struct scf_statistics { + struct task_struct *task; + int cpu; + long long n_single; + long long n_single_ofl; + long long n_single_wait; + long long n_single_wait_ofl; + long long n_many; + long long n_many_wait; + long long n_all; + long long n_all_wait; +}; + +static struct scf_statistics *scf_stats_p; +static struct task_struct *scf_torture_stats_task; +static DEFINE_PER_CPU(long 
long, scf_invoked_count); + +// Data for random primitive selection +#define SCF_PRIM_SINGLE 0 +#define SCF_PRIM_MANY 1 +#define SCF_PRIM_ALL 2 +#define SCF_NPRIMS (2 * 3) // Need wait and no-wait versions of each. + +static char *scf_prim_name[] = { + "smp_call_function_single", + "smp_call_function_many", + "smp_call_function", +}; + +struct scf_selector { + unsigned long scfs_weight; + int scfs_prim; + bool scfs_wait; +}; +static struct scf_selector scf_sel_array[SCF_NPRIMS]; +static int scf_sel_array_len; +static unsigned long scf_sel_totweight; + +// Communicate between caller and handler. +struct scf_check { + bool scfc_in; + bool scfc_out; + int scfc_cpu; // -1 for not _single(). + bool scfc_wait; +}; + +// Use to wait for all threads to start. +static atomic_t n_started; +static atomic_t n_errs; +static atomic_t n_mb_in_errs; +static atomic_t n_mb_out_errs; +static atomic_t n_alloc_errs; +static bool scfdone; +static char *bangstr = ""; + +static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); + +// Print torture statistics. Caller must ensure serialization. +static void scf_torture_stats_print(void) +{ + int cpu; + int i; + long long invoked_count = 0; + bool isdone = READ_ONCE(scfdone); + struct scf_statistics scfs = {}; + + for_each_possible_cpu(cpu) + invoked_count += data_race(per_cpu(scf_invoked_count, cpu)); + for (i = 0; i < nthreads; i++) { + scfs.n_single += scf_stats_p[i].n_single; + scfs.n_single_ofl += scf_stats_p[i].n_single_ofl; + scfs.n_single_wait += scf_stats_p[i].n_single_wait; + scfs.n_single_wait_ofl += scf_stats_p[i].n_single_wait_ofl; + scfs.n_many += scf_stats_p[i].n_many; + scfs.n_many_wait += scf_stats_p[i].n_many_wait; + scfs.n_all += scf_stats_p[i].n_all; + scfs.n_all_wait += scf_stats_p[i].n_all_wait; + } + if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || + atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs)) + bangstr = "!!! 
"; + pr_alert("%s %sscf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", + SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, + scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl, + scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait); + torture_onoff_stats(); + pr_cont("ste: %d stnmie: %d stnmoe: %d staf: %d\n", atomic_read(&n_errs), + atomic_read(&n_mb_in_errs), atomic_read(&n_mb_out_errs), + atomic_read(&n_alloc_errs)); +} + +// Periodically prints torture statistics, if periodic statistics printing +// was specified via the stat_interval module parameter. +static int +scf_torture_stats(void *arg) +{ + VERBOSE_TOROUT_STRING("scf_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + scf_torture_stats_print(); + torture_shutdown_absorb("scf_torture_stats"); + } while (!torture_must_stop()); + torture_kthread_stopping("scf_torture_stats"); + return 0; +} + +// Add a primitive to the scf_sel_array[]. +static void scf_sel_add(unsigned long weight, int prim, bool wait) +{ + struct scf_selector *scfsp = &scf_sel_array[scf_sel_array_len]; + + // If no weight, if array would overflow, if computing three-place + // percentages would overflow, or if the scf_prim_name[] array would + // overflow, don't bother. In the last three two cases, complain. + if (!weight || + WARN_ON_ONCE(scf_sel_array_len >= ARRAY_SIZE(scf_sel_array)) || + WARN_ON_ONCE(0 - 100000 * weight <= 100000 * scf_sel_totweight) || + WARN_ON_ONCE(prim >= ARRAY_SIZE(scf_prim_name))) + return; + scf_sel_totweight += weight; + scfsp->scfs_weight = scf_sel_totweight; + scfsp->scfs_prim = prim; + scfsp->scfs_wait = wait; + scf_sel_array_len++; +} + +// Dump out weighting percentages for scf_prim_name[] array. 
+static void scf_sel_dump(void) +{ + int i; + unsigned long oldw = 0; + struct scf_selector *scfsp; + unsigned long w; + + for (i = 0; i < scf_sel_array_len; i++) { + scfsp = &scf_sel_array[i]; + w = (scfsp->scfs_weight - oldw) * 100000 / scf_sel_totweight; + pr_info("%s: %3lu.%03lu %s(%s)\n", __func__, w / 1000, w % 1000, + scf_prim_name[scfsp->scfs_prim], + scfsp->scfs_wait ? "wait" : "nowait"); + oldw = scfsp->scfs_weight; + } +} + +// Randomly pick a primitive and wait/nowait, based on weightings. +static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp) +{ + int i; + unsigned long w = torture_random(trsp) % (scf_sel_totweight + 1); + + for (i = 0; i < scf_sel_array_len; i++) + if (scf_sel_array[i].scfs_weight >= w) + return &scf_sel_array[i]; + WARN_ON_ONCE(1); + return &scf_sel_array[0]; +} + +// Update statistics and occasionally burn up mass quantities of CPU time, +// if told to do so via scftorture.longwait. Otherwise, occasionally burn +// a little bit. +static void scf_handler(void *scfc_in) +{ + int i; + int j; + unsigned long r = torture_random(this_cpu_ptr(&scf_torture_rand)); + struct scf_check *scfcp = scfc_in; + + if (likely(scfcp)) { + WRITE_ONCE(scfcp->scfc_out, false); // For multiple receivers. + if (WARN_ON_ONCE(unlikely(!READ_ONCE(scfcp->scfc_in)))) + atomic_inc(&n_mb_in_errs); + } + this_cpu_inc(scf_invoked_count); + if (longwait <= 0) { + if (!(r & 0xffc0)) + udelay(r & 0x3f); + goto out; + } + if (r & 0xfff) + goto out; + r = (r >> 12); + if (longwait <= 0) { + udelay((r & 0xff) + 1); + goto out; + } + r = r % longwait + 1; + for (i = 0; i < r; i++) { + for (j = 0; j < 1000; j++) { + udelay(1000); + cpu_relax(); + } + } +out: + if (unlikely(!scfcp)) + return; + if (scfcp->scfc_wait) + WRITE_ONCE(scfcp->scfc_out, true); + else + kfree(scfcp); +} + +// As above, but check for correct CPU. 
+static void scf_handler_1(void *scfc_in)
+{
+	struct scf_check *scfcp = scfc_in;
+
+	// Count an error if this handler is running on a CPU other than the
+	// one recorded in ->scfc_cpu.  A NULL check structure skips the check.
+	if (likely(scfcp) && WARN_ONCE(smp_processor_id() != scfcp->scfc_cpu, "%s: Wanted CPU %d got CPU %d\n", __func__, scfcp->scfc_cpu, smp_processor_id())) {
+		atomic_inc(&n_errs);
+	}
+	scf_handler(scfcp); // The common handler does the rest.
+}
+
+// Randomly do an smp_call_function*() invocation.
+//
+// @scfp: per-thread statistics, updated with the kind of call made.
+// @trsp: per-thread random state used to pick the primitive and the CPU.
+//
+// Runs with CPU hotplug excluded (cpus_read_lock()) or with preemption
+// disabled, depending on use_cpus_read_lock.  A struct scf_check is
+// allocated for single-CPU calls and for all waiting calls; for waiting
+// calls it is inspected and freed here once the call has returned.
+static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp)
+{
+	uintptr_t cpu;
+	int ret = 0;
+	struct scf_check *scfcp = NULL;
+	struct scf_selector *scfsp = scf_sel_rand(trsp);
+
+	if (use_cpus_read_lock)
+		cpus_read_lock();
+	else
+		preempt_disable();
+	if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) {
+		// GFP_ATOMIC because preemption might be disabled here.
+		scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC);
+		if (WARN_ON_ONCE(!scfcp)) {
+			atomic_inc(&n_alloc_errs); // Carry on without checking.
+		} else {
+			scfcp->scfc_cpu = -1;
+			scfcp->scfc_wait = scfsp->scfs_wait;
+			scfcp->scfc_out = false;
+		}
+	}
+	switch (scfsp->scfs_prim) {
+	case SCF_PRIM_SINGLE:
+		cpu = torture_random(trsp) % nr_cpu_ids;
+		if (scfsp->scfs_wait)
+			scfp->n_single_wait++;
+		else
+			scfp->n_single++;
+		if (scfcp) {
+			scfcp->scfc_cpu = cpu;
+			barrier(); // Prevent race-reduction compiler optimizations.
+			scfcp->scfc_in = true;
+		}
+		ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, scfsp->scfs_wait);
+		if (ret) {
+			// The call was not made (counted as "offline"), so the
+			// handler never saw scfcp: free it here.
+			if (scfsp->scfs_wait)
+				scfp->n_single_wait_ofl++;
+			else
+				scfp->n_single_ofl++;
+			kfree(scfcp);
+			scfcp = NULL;
+		}
+		break;
+	case SCF_PRIM_MANY:
+		if (scfsp->scfs_wait)
+			scfp->n_many_wait++;
+		else
+			scfp->n_many++;
+		if (scfcp) {
+			barrier(); // Prevent race-reduction compiler optimizations.
+			scfcp->scfc_in = true;
+		}
+		smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait);
+		break;
+	case SCF_PRIM_ALL:
+		if (scfsp->scfs_wait)
+			scfp->n_all_wait++;
+		else
+			scfp->n_all++;
+		if (scfcp) {
+			barrier(); // Prevent race-reduction compiler optimizations.
+			scfcp->scfc_in = true;
+		}
+		smp_call_function(scf_handler, scfcp, scfsp->scfs_wait);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		if (scfcp)
+			scfcp->scfc_out = true; // No call made, so nothing to wait for.
+	}
+	// Only waiting calls are checked and freed here.  NOTE(review): the
+	// non-waiting single-CPU case presumably relies on scf_handler() to
+	// free scfcp -- confirm against that function.
+	if (scfcp && scfsp->scfs_wait) {
+		if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim == SCF_PRIM_SINGLE) &&
+				 !scfcp->scfc_out))
+			atomic_inc(&n_mb_out_errs); // Leak rather than trash!
+		else
+			kfree(scfcp);
+		barrier(); // Prevent race-reduction compiler optimizations.
+	}
+	if (use_cpus_read_lock)
+		cpus_read_unlock();
+	else
+		preempt_enable();
+	// Sleep for one jiffy roughly once every 4096 invocations.
+	if (!(torture_random(trsp) & 0xfff))
+		schedule_timeout_uninterruptible(1);
+}
+
+// SCF test kthread.  Repeatedly does calls to members of the
+// smp_call_function() family of functions.
+//
+// @arg: this thread's struct scf_statistics; ->cpu selects the CPU that
+// the thread binds itself to (modulo nr_cpu_ids).
+static int scftorture_invoker(void *arg)
+{
+	int cpu;
+	DEFINE_TORTURE_RANDOM(rand);
+	struct scf_statistics *scfp = (struct scf_statistics *)arg;
+	bool was_offline = false;
+
+	VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu);
+	cpu = scfp->cpu % nr_cpu_ids;
+	set_cpus_allowed_ptr(current, cpumask_of(cpu));
+	set_user_nice(current, MAX_NICE);
+	if (holdoff)
+		schedule_timeout_interruptible(holdoff * HZ);
+
+	VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id());
+
+	// Make sure that the CPU is affinitized appropriately during testing.
+	// Compare against the CPU actually bound to above: ->cpu itself can be
+	// nr_cpu_ids or larger when there are more threads than CPUs, which
+	// would otherwise produce a false-positive warning.
+	WARN_ON_ONCE(smp_processor_id() != cpu);
+
+	// NOTE(review): only the thread that drops n_started to zero enters
+	// this loop, and by then the loop condition is already false, so no
+	// thread appears to wait here -- confirm whether the test on
+	// atomic_dec_return() was meant to be inverted.
+	if (!atomic_dec_return(&n_started))
+		while (atomic_read_acquire(&n_started)) {
+			if (torture_must_stop()) {
+				VERBOSE_SCFTORTOUT("scftorture_invoker %d ended before starting", scfp->cpu);
+				goto end;
+			}
+			schedule_timeout_uninterruptible(1);
+		}
+
+	VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu);
+
+	do {
+		scftorture_invoke_one(scfp, &rand);
+		// While our CPU is offline, poll until it comes back, then
+		// re-apply the affinity that was requested at startup.
+		while (cpu_is_offline(cpu) && !torture_must_stop()) {
+			schedule_timeout_interruptible(HZ / 5);
+			was_offline = true;
+		}
+		if (was_offline) {
+			set_cpus_allowed_ptr(current, cpumask_of(cpu));
+			was_offline = false;
+		}
+		cond_resched();
+	} while (!torture_must_stop());
+
+	VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu);
+end:
+	torture_kthread_stopping("scftorture_invoker");
+	return 0;
+}
+
+// Print the module parameters, prefixed by @tag, on a single line.
+static void
+scftorture_print_module_parms(const char *tag)
+{
+	// The value printed for "shutdown_secs=" is shutdown_secs itself, the
+	// same variable that scf_torture_init() hands to torture_shutdown_init().
+	pr_alert(SCFTORT_FLAG
+		 "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag,
+		 verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown_secs, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait);
+}
+
+// Empty handler for the smp_call_function() issued by scf_torture_cleanup().
+static void scf_cleanup_handler(void *unused)
+{
+}
+
+// Stop the test: halt the invoker and statistics kthreads, print the final
+// statistics and the pass/fail verdict, and free the per-thread statistics.
+// Also used as the unwind path of scf_torture_init(), so it must cope with
+// a partially initialized test.
+static void scf_torture_cleanup(void)
+{
+	int i;
+
+	if (torture_cleanup_begin())
+		return;
+
+	WRITE_ONCE(scfdone, true);
+	// scf_stats_p is still NULL when initialization failed before (or at)
+	// its allocation; there are then no invoker kthreads to stop and no
+	// statistics to print, so skip straight to the end.
+	if (nthreads && scf_stats_p)
+		for (i = 0; i < nthreads; i++)
+			torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task);
+	else
+		goto end;
+	smp_call_function(scf_cleanup_handler, NULL, 0);
+	torture_stop_kthread(scf_torture_stats, scf_torture_stats_task);
+	scf_torture_stats_print(); // -After- the stats thread is stopped!
+	kfree(scf_stats_p); // -After- the last stats print has completed!
+	scf_stats_p = NULL;
+
+	if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs))
+		scftorture_print_module_parms("End of test: FAILURE");
+	else if (torture_onoff_failures())
+		scftorture_print_module_parms("End of test: LOCK_HOTPLUG");
+	else
+		scftorture_print_module_parms("End of test: SUCCESS");
+
+end:
+	torture_cleanup_end();
+}
+
+// Module initialization: resolve the weight_* parameters into a selection
+// table, start the optional CPU-hotplug and shutdown helpers, then create
+// one invoker kthread per requested thread plus the statistics kthread.
+//
+// Returns 0 on success, -EBUSY if another torture test is already running,
+// -EINVAL if every weight is zero, -ENOMEM if the statistics array cannot
+// be allocated, or the error returned by a failing torture_*() helper.
+static int __init scf_torture_init(void)
+{
+	long i;
+	int firsterr = 0;
+	unsigned long weight_single1 = weight_single;
+	unsigned long weight_single_wait1 = weight_single_wait;
+	unsigned long weight_many1 = weight_many;
+	unsigned long weight_many_wait1 = weight_many_wait;
+	unsigned long weight_all1 = weight_all;
+	unsigned long weight_all_wait1 = weight_all_wait;
+
+	if (!torture_init_begin(SCFTORT_STRING, verbose))
+		return -EBUSY;
+
+	scftorture_print_module_parms("Start of test");
+
+	// A weight of -1 means "not specified".  If none was specified, use
+	// the built-in defaults; otherwise every unspecified weight is zero.
+	if (weight_single == -1 && weight_single_wait == -1 &&
+	    weight_many == -1 && weight_many_wait == -1 &&
+	    weight_all == -1 && weight_all_wait == -1) {
+		weight_single1 = 2 * nr_cpu_ids;
+		weight_single_wait1 = 2 * nr_cpu_ids;
+		weight_many1 = 2;
+		weight_many_wait1 = 2;
+		weight_all1 = 1;
+		weight_all_wait1 = 1;
+	} else {
+		if (weight_single == -1)
+			weight_single1 = 0;
+		if (weight_single_wait == -1)
+			weight_single_wait1 = 0;
+		if (weight_many == -1)
+			weight_many1 = 0;
+		if (weight_many_wait == -1)
+			weight_many_wait1 = 0;
+		if (weight_all == -1)
+			weight_all1 = 0;
+		if (weight_all_wait == -1)
+			weight_all_wait1 = 0;
+	}
+	if (weight_single1 == 0 && weight_single_wait1 == 0 &&
+	    weight_many1 == 0 && weight_many_wait1 == 0 &&
+	    weight_all1 == 0 && weight_all_wait1 == 0) {
+		VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense");
+		firsterr = -EINVAL;
+		goto unwind;
+	}
+	scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false);
+	scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true);
+	scf_sel_add(weight_many1, SCF_PRIM_MANY, false);
+	scf_sel_add(weight_many_wait1, SCF_PRIM_MANY, true);
+	scf_sel_add(weight_all1, SCF_PRIM_ALL, false);
+	scf_sel_add(weight_all_wait1, SCF_PRIM_ALL, true);
+	scf_sel_dump();
+
+	if (onoff_interval > 0) {
+		firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, NULL);
+		if (firsterr)
+			goto unwind;
+	}
+	if (shutdown_secs > 0) {
+		firsterr = torture_shutdown_init(shutdown_secs, scf_torture_cleanup);
+		if (firsterr)
+			goto unwind;
+	}
+
+	// Worker tasks invoking smp_call_function().
+	if (nthreads < 0)
+		nthreads = num_online_cpus();
+	scf_stats_p = kcalloc(nthreads, sizeof(scf_stats_p[0]), GFP_KERNEL);
+	if (!scf_stats_p) {
+		VERBOSE_SCFTORTOUT_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+
+	VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads\n", nthreads);
+
+	// Each invoker decrements n_started as it starts up.
+	atomic_set(&n_started, nthreads);
+	for (i = 0; i < nthreads; i++) {
+		scf_stats_p[i].cpu = i;
+		firsterr = torture_create_kthread(scftorture_invoker, (void *)&scf_stats_p[i],
+						  scf_stats_p[i].task);
+		if (firsterr)
+			goto unwind;
+	}
+	if (stat_interval > 0) {
+		firsterr = torture_create_kthread(scf_torture_stats, NULL, scf_torture_stats_task);
+		if (firsterr)
+			goto unwind;
+	}
+
+	torture_init_end();
+	return 0;
+
+unwind:
+	torture_init_end();
+	scf_torture_cleanup();
+	return firsterr;
+}
+
+module_init(scf_torture_init);
+module_exit(scf_torture_cleanup);
diff --git a/kernel/smp.c b/kernel/smp.c
index d0ae8eb6bf8b..4d17501433be 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -20,6 +20,9 @@
 #include <linux/sched.h>
 #include <linux/sched/idle.h>
 #include <linux/hypervisor.h>
+#include <linux/sched/clock.h>
+#include <linux/nmi.h>
+#include <linux/sched/debug.h>
 
 #include "smpboot.h"
 #include "sched/smp.h"
@@ -96,6 +99,103 @@ void __init call_function_init(void)
 	smpcfd_prepare_cpu(smp_processor_id());
 }
 
+#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
+
+static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
+static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
+static
DEFINE_PER_CPU(void *, cur_csd_info);
+
+#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) /* Compared against sched_clock() deltas below. */
+static atomic_t csd_bug_count = ATOMIC_INIT(0); /* Source of the "#%d" ids in the csd: messages. */
+
+/* Record current CSD work for current CPU, NULL to erase. */
+static void csd_lock_record(call_single_data_t *csd)
+{
+	if (!csd) {
+		smp_mb(); /* NULL cur_csd after unlock. */
+		__this_cpu_write(cur_csd, NULL);
+		return;
+	}
+	__this_cpu_write(cur_csd_func, csd->func);
+	__this_cpu_write(cur_csd_info, csd->info);
+	smp_wmb(); /* func and info before csd. */
+	__this_cpu_write(cur_csd, csd);
+	smp_mb(); /* Update cur_csd before function call. */
+		  /* Or before unlock, as the case may be. */
+}
+
+static __always_inline int csd_lock_wait_getcpu(call_single_data_t *csd) /* Destination CPU of @csd, or -1. */
+{
+	unsigned int csd_type;
+
+	csd_type = CSD_TYPE(csd);
+	if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC)
+		return csd->dst; /* Other CSD_TYPE_ values might not have ->dst. */
+	return -1;
+}
+
+/*
+ * Complain if too much time spent waiting.  Note that only
+ * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
+ * so waiting on other types gets much less information.
+ */
+static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
+{
+	int cpu = -1;
+	int cpux; /* Always-valid index for the per_cpu() accesses below. */
+	bool firsttime;
+	u64 ts2, ts_delta;
+	call_single_data_t *cpu_cur_csd;
+	unsigned int flags = READ_ONCE(csd->flags);
+
+	if (!(flags & CSD_FLAG_LOCK)) { /* Lock released: the caller stops waiting on "true". */
+		if (!unlikely(*bug_id)) /* Zero id: no complaint was printed for this wait. */
+			return true;
+		cpu = csd_lock_wait_getcpu(csd);
+		pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
+			 *bug_id, raw_smp_processor_id(), cpu);
+		return true;
+	}
+
+	ts2 = sched_clock();
+	ts_delta = ts2 - *ts1; /* Time since the wait began or since the last complaint. */
+	if (likely(ts_delta <= CSD_LOCK_TIMEOUT))
+		return false; /* Not yet too long: keep spinning. */
+
+	firsttime = !*bug_id;
+	if (firsttime)
+		*bug_id = atomic_inc_return(&csd_bug_count); /* Nonzero from here on for this wait. */
+	cpu = csd_lock_wait_getcpu(csd);
+	if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu))
+		cpux = 0; /* Unknown or out-of-range destination: fall back to CPU 0's record. */
+	else
+		cpux = cpu;
+	cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
+	pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
+		 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0,
+		 cpu, csd->func, csd->info);
+	if (cpu_cur_csd && csd != cpu_cur_csd) { /* That CPU has a different request recorded. */
+		pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
+			 *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
+			 READ_ONCE(per_cpu(cur_csd_info, cpux)));
+	} else {
+		pr_alert("\tcsd: CSD lock (#%d) %s.\n",
+			 *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
+	}
+	if (cpu >= 0) {
+		if (!trigger_single_cpu_backtrace(cpu))
+			dump_cpu_task(cpu); /* Used when no backtrace could be triggered. */
+		if (!cpu_cur_csd) { /* Nothing recorded on that CPU: send the IPI again. */
+			pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
+			arch_send_call_function_single_ipi(cpu);
+		}
+	}
+	dump_stack();
+	*ts1 = ts2; /* Restart the interval so the next complaint needs another full timeout. */
+
+	return false;
+}
+
 /*
  * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
  *
@@ -105,8 +205,28 @@ void __init call_function_init(void)
  */
 static __always_inline void csd_lock_wait(call_single_data_t *csd)
 {
+	int bug_id = 0; /* Zero until csd_lock_wait_toolong() first complains. */
+	u64 ts0, ts1; /* ts0: start of the wait; ts1: start of the current timeout interval. */
+
+	ts1 = ts0 = sched_clock();
+	for (;;) {
+		if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id))
+			break; /* CSD_FLAG_LOCK was seen clear. */
+		cpu_relax();
+	}
+	smp_acquire__after_ctrl_dep();
+}
+
+#else
+static void csd_lock_record(call_single_data_t *csd) /* No-op without CONFIG_CSD_LOCK_WAIT_DEBUG. */
+{
+}
+
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
+{
 	smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
 }
+#endif
 
 static __always_inline void csd_lock(call_single_data_t *csd)
 {
@@ -166,9 +286,11 @@ static int generic_exec_single(int cpu, call_single_data_t *csd)
 		 * We can unlock early even for the synchronous on-stack case,
 		 * since we're doing this from the same CPU..
 		 */
+		csd_lock_record(csd); /* Record before the early unlock. */
 		csd_unlock(csd);
 		local_irq_save(flags);
 		func(info);
+		csd_lock_record(NULL); /* Function has returned: erase the record. */
 		local_irq_restore(flags);
 		return 0;
 	}
@@ -268,8 +390,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 					entry = &csd_next->llist;
 				}
 
+				csd_lock_record(csd); /* Record before calling the function. */
 				func(info);
 				csd_unlock(csd);
+				csd_lock_record(NULL); /* Erase only after the unlock. */
 			} else {
 				prev = &csd->llist;
 			}
@@ -296,8 +420,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 			smp_call_func_t func = csd->func;
 			void *info = csd->info;
 
+			csd_lock_record(csd); /* Record before the unlock that precedes the call. */
 			csd_unlock(csd);
 			func(info);
+			csd_lock_record(NULL); /* Function has returned: erase the record. */
 		} else if (type == CSD_TYPE_IRQ_WORK) {
 			irq_work_single(csd);
 		}
@@ -375,6 +501,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
 
 	csd->func = func;
 	csd->info = info;
+#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
+	csd->src = smp_processor_id(); /* Requesting CPU. */
+	csd->dst = cpu; /* Read back by csd_lock_wait_getcpu(). */
+#endif
 
 	err = generic_exec_single(cpu, csd);
 
@@ -540,6 +670,10 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
 			csd->flags |= CSD_TYPE_SYNC;
 		csd->func = func;
 		csd->info = info;
+#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
+		csd->src = smp_processor_id(); /* Requesting CPU. */
+		csd->dst = cpu; /* Read back by csd_lock_wait_getcpu(). */
+#endif
 		if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
 			__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
 	}
@@ -741,7 +875,7 @@ EXPORT_SYMBOL(on_each_cpu_mask);
  * for all the required CPUs to finish. This may include the local
  * processor.
  * @cond_func:	A callback function that is passed a cpu id and
- *		the the info parameter. The function is called
+ *		the info parameter. The function is called
  *		with preemption disabled. The function should
  *		return a blooean value indicating whether to IPI
  *		the specified CPU.
diff --git a/kernel/sys.c b/kernel/sys.c index ab6c409b1159..6401880dff74 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2034,7 +2034,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * VMAs already unmapped and kernel uses these members for statistics * output in procfs mostly, except * - * - @start_brk/@brk which are used in do_brk but kernel lookups + * - @start_brk/@brk which are used in do_brk_flags but kernel lookups * for VMAs when updating these memvers so anything wrong written * here cause kernel to swear at userspace program but won't lead * to any problem in kernel itself diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c925d1e1777e..f27ac94d5fa7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -280,6 +280,7 @@ COND_SYSCALL(mlockall); COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); +COND_SYSCALL(process_madvise); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f0199a4ba1ad..81632cd5e3b7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -927,7 +927,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (ratelimit < 10 && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - pr_warn("NOHZ: local_softirq_pending %02x\n", + pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", (unsigned int) local_softirq_pending()); ratelimit++; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 29ca60f058ef..47a71f96e5bc 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2498,7 +2498,7 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) * for enabling events at boot. We want to enable events before * the filesystem is initialized. 
*/ -static __init int +static int __trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr) { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 87804e0371fe..e703d5d9cbe8 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -515,7 +515,7 @@ EXPORT_SYMBOL(from_kgid_munged); * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test - * for and handle handle INVALID_PROJID being returned. INVALID_PROJID + * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ac088ce6059b..437935e7a199 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1212,11 +1212,14 @@ out_put: * stable state - idle, on timer or on worklist. * * Return: + * + * ======== ================================================================ * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry * -ENOENT if someone else is canceling @work, this state may persist * for arbitrarily long + * ======== ================================================================ * * Note: * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting |