 arch/alpha/include/asm/spinlock_types.h | 4
 arch/arm/Kconfig | 5
 arch/arm/include/asm/spinlock_types.h | 4
 arch/arm/include/asm/thread_info.h | 6
 arch/arm/kernel/asm-offsets.c | 1
 arch/arm/kernel/entry-armv.S | 19
 arch/arm/kernel/signal.c | 3
 arch/arm/kernel/smp.c | 2
 arch/arm/mm/fault.c | 6
 arch/arm64/Kconfig | 3
 arch/arm64/include/asm/preempt.h | 25
 arch/arm64/include/asm/spinlock_types.h | 4
 arch/arm64/include/asm/thread_info.h | 8
 arch/arm64/kernel/asm-offsets.c | 1
 arch/arm64/kernel/entry.S | 13
 arch/arm64/kernel/fpsimd.c | 14
 arch/arm64/kernel/signal.c | 2
 arch/arm64/kvm/arm.c | 6
 arch/hexagon/include/asm/spinlock_types.h | 4
 arch/ia64/include/asm/spinlock_types.h | 4
 arch/powerpc/Kconfig | 3
 arch/powerpc/include/asm/cmpxchg.h | 2
 arch/powerpc/include/asm/simple_spinlock_types.h | 2
 arch/powerpc/include/asm/smp.h | 1
 arch/powerpc/include/asm/spinlock_types.h | 4
 arch/powerpc/include/asm/stackprotector.h | 4
 arch/powerpc/include/asm/thread_info.h | 7
 arch/powerpc/kernel/asm-offsets.c | 1
 arch/powerpc/kernel/interrupt.c | 10
 arch/powerpc/kernel/irq.c | 4
 arch/powerpc/kernel/kgdb.c | 11
 arch/powerpc/kernel/smp.c | 5
 arch/powerpc/kernel/traps.c | 8
 arch/powerpc/kernel/watchdog.c | 5
 arch/powerpc/kexec/crash.c | 3
 arch/powerpc/kvm/Kconfig | 1
 arch/powerpc/platforms/pseries/iommu.c | 31
 arch/s390/include/asm/spinlock_types.h | 4
 arch/sh/include/asm/spinlock_types.h | 4
 arch/sh/kernel/irq.c | 2
 arch/sparc/kernel/irq_64.c | 2
 arch/x86/Kconfig | 2
 arch/x86/include/asm/fpu/api.h | 1
 arch/x86/include/asm/irq_stack.h | 3
 arch/x86/include/asm/preempt.h | 33
 arch/x86/include/asm/signal.h | 13
 arch/x86/include/asm/stackprotector.h | 8
 arch/x86/include/asm/thread_info.h | 7
 arch/x86/kernel/cpu/mshyperv.c | 3
 arch/x86/kernel/fpu/core.c | 12
 arch/x86/kernel/irq_32.c | 2
 arch/x86/kernel/kgdb.c | 10
 arch/x86/kvm/x86.c | 8
 arch/xtensa/include/asm/spinlock_types.h | 4
 block/blk-mq.c | 6
 crypto/cryptd.c | 19
 drivers/base/node.c | 18
 drivers/block/zram/zram_drv.c | 36
 drivers/block/zram/zram_drv.h | 1
 drivers/char/random.c | 11
 drivers/char/tpm/tpm_tis.c | 29
 drivers/firmware/efi/efi.c | 5
 drivers/gpu/drm/i915/display/intel_crtc.c | 15
 drivers/gpu/drm/i915/gt/intel_engine_pm.c | 8
 drivers/gpu/drm/i915/i915_irq.c | 2
 drivers/gpu/drm/i915/i915_trace.h | 6
 drivers/gpu/drm/radeon/radeon_display.c | 2
 drivers/hv/hyperv_vmbus.h | 1
 drivers/hv/vmbus_drv.c | 5
 drivers/leds/trigger/Kconfig | 1
 drivers/md/raid5.c | 7
 drivers/md/raid5.h | 1
 drivers/scsi/fcoe/fcoe.c | 16
 drivers/scsi/fcoe/fcoe_ctlr.c | 4
 drivers/scsi/libfc/fc_exch.c | 4
 drivers/tty/serial/8250/8250.h | 47
 drivers/tty/serial/8250/8250_core.c | 17
 drivers/tty/serial/8250/8250_fsl.c | 9
 drivers/tty/serial/8250/8250_ingenic.c | 7
 drivers/tty/serial/8250/8250_mtk.c | 29
 drivers/tty/serial/8250/8250_port.c | 92
 drivers/tty/serial/amba-pl011.c | 17
 drivers/tty/serial/omap-serial.c | 12
 fs/afs/dir_silly.c | 2
 fs/cifs/readdir.c | 2
 fs/dcache.c | 39
 fs/fuse/readdir.c | 2
 fs/inode.c | 2
 fs/namei.c | 4
 fs/namespace.c | 8
 fs/nfs/dir.c | 4
 fs/nfs/unlink.c | 4
 fs/proc/base.c | 3
 fs/proc/proc_sysctl.c | 2
 include/asm-generic/softirq_stack.h | 2
 include/linux/console.h | 26
 include/linux/dcache.h | 4
 include/linux/debug_locks.h | 3
 include/linux/entry-common.h | 2
 include/linux/fs.h | 2
 include/linux/hardirq.h | 2
 include/linux/highmem-internal.h | 27
 include/linux/hrtimer.h | 8
 include/linux/irq_work.h | 6
 include/linux/irqdesc.h | 1
 include/linux/irqflags.h | 23
 include/linux/kgdb.h | 3
 include/linux/local_lock_internal.h | 74
 include/linux/mm_types.h | 4
 include/linux/mmzone.h | 31
 include/linux/mutex.h | 165
 include/linux/nfs_xdr.h | 2
 include/linux/notifier.h | 6
 include/linux/pid.h | 1
 include/linux/preempt.h | 79
 include/linux/printk.h | 30
 include/linux/random.h | 2
 include/linux/rbtree.h | 30
 include/linux/rbtree_types.h | 34
 include/linux/rcupdate.h | 7
 include/linux/rtmutex.h | 4
 include/linux/rwbase_rt.h | 37
 include/linux/rwlock_rt.h | 140
 include/linux/rwlock_types.h | 39
 include/linux/rwsem.h | 58
 include/linux/sched.h | 197
 include/linux/sched/mm.h | 11
 include/linux/sched/wake_q.h | 7
 include/linux/serial_8250.h | 5
 include/linux/shmem_fs.h | 2
 include/linux/skbuff.h | 7
 include/linux/slub_def.h | 2
 include/linux/smp.h | 3
 include/linux/spinlock.h | 15
 include/linux/spinlock_api_smp.h | 3
 include/linux/spinlock_rt.h | 151
 include/linux/spinlock_types.h | 45
 include/linux/spinlock_types_raw.h | 65
 include/linux/thread_info.h | 12
 include/linux/trace_events.h | 7
 include/linux/u64_stats_sync.h | 42
 include/linux/vmstat.h | 69
 include/linux/wait.h | 1
 include/linux/ww_mutex.h | 16
 include/net/gen_stats.h | 11
 include/net/net_seq_lock.h | 15
 include/net/sch_generic.h | 27
 init/Kconfig | 5
 init/main.c | 7
 kernel/Kconfig.locks | 2
 kernel/Kconfig.preempt | 6
 kernel/cgroup/cpuset.c | 70
 kernel/cgroup/rstat.c | 5
 kernel/debug/debug_core.c | 45
 kernel/debug/kdb/kdb_io.c | 18
 kernel/entry/common.c | 12
 kernel/fork.c | 25
 kernel/futex.c | 466
 kernel/irq/handle.c | 8
 kernel/irq/manage.c | 12
 kernel/irq/spurious.c | 8
 kernel/irq_work.c | 69
 kernel/kexec_core.c | 1
 kernel/ksysfs.c | 12
 kernel/kthread.c | 16
 kernel/locking/Makefile | 3
 kernel/locking/lockdep.c | 2
 kernel/locking/mutex-debug.c | 25
 kernel/locking/mutex-debug.h | 29
 kernel/locking/mutex.c | 139
 kernel/locking/mutex.h | 35
 kernel/locking/rtmutex.c | 928
 kernel/locking/rtmutex_api.c | 647
 kernel/locking/rtmutex_common.h | 110
 kernel/locking/rwbase_rt.c | 266
 kernel/locking/rwsem.c | 108
 kernel/locking/spinlock.c | 7
 kernel/locking/spinlock_debug.c | 5
 kernel/locking/spinlock_rt.c | 258
 kernel/notifier.c | 12
 kernel/panic.c | 33
 kernel/printk/Makefile | 1
 kernel/printk/internal.h | 69
 kernel/printk/printk.c | 1182
 kernel/printk/printk_safe.c | 414
 kernel/ptrace.c | 36
 kernel/rcu/rcutorture.c | 97
 kernel/rcu/tasks.h | 9
 kernel/sched/core.c | 219
 kernel/sched/fair.c | 16
 kernel/sched/features.h | 8
 kernel/sched/sched.h | 9
 kernel/sched/swait.c | 1
 kernel/sched/topology.c | 3
 kernel/signal.c | 36
 kernel/smp.c | 15
 kernel/time/hrtimer.c | 31
 kernel/time/timer.c | 2
 kernel/trace/trace.c | 58
 kernel/trace/trace_events.c | 2
 kernel/trace/trace_output.c | 19
 lib/Kconfig.debug | 2
 lib/bug.c | 1
 lib/debugobjects.c | 5
 lib/irq_poll.c | 5
 lib/locking-selftest.c | 51
 lib/nmi_backtrace.c | 6
 lib/scatterlist.c | 2
 lib/test_lockup.c | 8
 localversion-rt | 1
 mm/Kconfig | 2
 mm/memcontrol.c | 89
 mm/mempolicy.c | 2
 mm/page_alloc.c | 259
 mm/shmem.c | 31
 mm/slub.c | 756
 mm/vmalloc.c | 13
 mm/vmstat.c | 258
 mm/workingset.c | 5
 mm/zsmalloc.c | 85
 net/Kconfig | 2
 net/core/dev.c | 33
 net/core/gen_estimator.c | 6
 net/core/gen_stats.c | 12
 net/core/sock.c | 6
 net/ipv4/inet_hashtables.c | 19
 net/ipv6/inet6_hashtables.c | 5
 net/sched/sch_api.c | 2
 net/sched/sch_generic.c | 10
 net/sunrpc/svc_xprt.c | 4
 samples/kfifo/bytestream-example.c | 12
 samples/kfifo/inttype-example.c | 12
 samples/kfifo/record-example.c | 12
 233 files changed, 6554 insertions(+), 3201 deletions(-)
diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h
index 1d5716bc060b..6883bc952d22 100644
--- a/arch/alpha/include/asm/spinlock_types.h
+++ b/arch/alpha/include/asm/spinlock_types.h
@@ -2,10 +2,6 @@
#ifndef _ALPHA_SPINLOCK_TYPES_H
#define _ALPHA_SPINLOCK_TYPES_H
-#ifndef __LINUX_SPINLOCK_TYPES_H
-# error "please don't include this file directly"
-#endif
-
typedef struct {
volatile unsigned int lock;
} arch_spinlock_t;
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 24804f11302d..4252af49eecb 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -32,6 +32,7 @@ config ARM
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE
+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_MEMTEST
@@ -69,7 +70,7 @@ config ARM
select HARDIRQS_SW_RESEND
select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT
select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
+ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT
select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL
select HAVE_ARCH_MMAP_RND_BITS if MMU
@@ -110,6 +111,7 @@ config ARM
select HAVE_PERF_EVENTS
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_PREEMPT_LAZY
select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RSEQ
@@ -125,6 +127,7 @@ config ARM
select OLD_SIGSUSPEND3
select PCI_SYSCALL if PCI
select PERF_USE_VMALLOC
+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM
select RTC_LIB
select SET_FS
select SYS_SUPPORTS_APM_EMULATION
diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h
index 5976958647fe..a37c0803954b 100644
--- a/arch/arm/include/asm/spinlock_types.h
+++ b/arch/arm/include/asm/spinlock_types.h
@@ -2,10 +2,6 @@
#ifndef __ASM_SPINLOCK_TYPES_H
#define __ASM_SPINLOCK_TYPES_H
-#ifndef __LINUX_SPINLOCK_TYPES_H
-# error "please don't include this file directly"
-#endif
-
#define TICKET_SHIFT 16
typedef struct {
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 70d4cbc49ae1..b86418b4dfef 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -54,6 +54,7 @@ struct cpu_context_save {
struct thread_info {
unsigned long flags; /* low level flags */
int preempt_count; /* 0 => preemptable, <0 => bug */
+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
mm_segment_t addr_limit; /* address limit */
struct task_struct *task; /* main task structure */
__u32 cpu; /* cpu */
@@ -146,6 +147,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
#define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */
+#define TIF_NEED_RESCHED_LAZY 9
#define TIF_USING_IWMMXT 17
#define TIF_MEMDIE 18 /* is terminating due to OOM killer */
@@ -160,6 +162,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
#define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT)
/* Checks for any syscall work in entry-common.S */
@@ -169,7 +172,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
/*
* Change these and you break ASM code in entry-common.S
*/
-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
+ _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NOTIFY_SIGNAL)
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 70993af22d80..024c65c3a0f2 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -43,6 +43,7 @@ int main(void)
BLANK();
DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
+ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
DEFINE(TI_TASK, offsetof(struct thread_info, task));
DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 0ea8529a4872..fa0d155d21b3 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -206,11 +206,18 @@ __irq_svc:
#ifdef CONFIG_PREEMPTION
ldr r8, [tsk, #TI_PREEMPT] @ get preempt count
- ldr r0, [tsk, #TI_FLAGS] @ get flags
teq r8, #0 @ if preempt count != 0
+ bne 1f @ return from exception
+ ldr r0, [tsk, #TI_FLAGS] @ get flags
+ tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set
+ blne svc_preempt @ preempt!
+
+ ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
+ teq r8, #0 @ if preempt lazy count != 0
movne r0, #0 @ force flags to 0
- tst r0, #_TIF_NEED_RESCHED
+ tst r0, #_TIF_NEED_RESCHED_LAZY
blne svc_preempt
+1:
#endif
svc_exit r5, irq = 1 @ return from exception
@@ -225,8 +232,14 @@ svc_preempt:
1: bl preempt_schedule_irq @ irq en/disable is done inside
ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS
tst r0, #_TIF_NEED_RESCHED
+ bne 1b
+ tst r0, #_TIF_NEED_RESCHED_LAZY
reteq r8 @ go again
- b 1b
+ ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
+ teq r0, #0 @ if preempt lazy count != 0
+ beq 1b
+ ret r8 @ go again
+
#endif
__und_fault:
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index a3a38d0a4c85..f04ccf19ab1f 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -649,7 +649,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
*/
trace_hardirqs_off();
do {
- if (likely(thread_flags & _TIF_NEED_RESCHED)) {
+ if (likely(thread_flags & (_TIF_NEED_RESCHED |
+ _TIF_NEED_RESCHED_LAZY))) {
schedule();
} else {
if (unlikely(!user_mode(regs)))
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index c7bb168b0d97..b943e2df9540 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -667,9 +667,7 @@ static void do_handle_IPI(int ipinr)
break;
case IPI_CPU_BACKTRACE:
- printk_nmi_enter();
nmi_cpu_backtrace(get_irq_regs());
- printk_nmi_exit();
break;
default:
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index da78740faf7b..07f79e533a29 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -400,6 +400,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
if (addr < TASK_SIZE)
return do_page_fault(addr, fsr, regs);
+ if (interrupts_enabled(regs))
+ local_irq_enable();
+
if (user_mode(regs))
goto bad_area;
@@ -479,6 +482,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
static int
do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
+ if (interrupts_enabled(regs))
+ local_irq_enable();
+
do_bad_area(addr, fsr, regs);
return 0;
}
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9f1d8566bbf9..4c36d75edb65 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -87,6 +87,7 @@ config ARM64
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG)
select ARCH_SUPPORTS_NUMA_BALANCING
+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
select ARCH_WANT_DEFAULT_BPF_JIT
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
@@ -192,6 +193,7 @@ config ARM64
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_REGS_AND_STACK_ACCESS_API
+ select HAVE_PREEMPT_LAZY
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUTEX_CMPXCHG if FUTEX
select MMU_GATHER_RCU_TABLE_FREE
@@ -213,6 +215,7 @@ config ARM64
select PCI_DOMAINS_GENERIC if PCI
select PCI_ECAM if (ACPI && PCI)
select PCI_SYSCALL if PCI
+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM
select POWER_RESET
select POWER_SUPPLY
select SPARSE_IRQ
diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h
index e83f0982b99c..2545c17281e1 100644
--- a/arch/arm64/include/asm/preempt.h
+++ b/arch/arm64/include/asm/preempt.h
@@ -70,13 +70,36 @@ static inline bool __preempt_count_dec_and_test(void)
* interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE
* pair.
*/
- return !pc || !READ_ONCE(ti->preempt_count);
+ if (!pc || !READ_ONCE(ti->preempt_count))
+ return true;
+#ifdef CONFIG_PREEMPT_LAZY
+ if ((pc & ~PREEMPT_NEED_RESCHED))
+ return false;
+ if (current_thread_info()->preempt_lazy_count)
+ return false;
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
+ return false;
+#endif
}
static inline bool should_resched(int preempt_offset)
{
+#ifdef CONFIG_PREEMPT_LAZY
+ u64 pc = READ_ONCE(current_thread_info()->preempt_count);
+ if (pc == preempt_offset)
+ return true;
+
+ if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset)
+ return false;
+
+ if (current_thread_info()->preempt_lazy_count)
+ return false;
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
u64 pc = READ_ONCE(current_thread_info()->preempt_count);
return pc == preempt_offset;
+#endif
}
#ifdef CONFIG_PREEMPTION
diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h
index 18782f0c4721..6672b05350b4 100644
--- a/arch/arm64/include/asm/spinlock_types.h
+++ b/arch/arm64/include/asm/spinlock_types.h
@@ -5,10 +5,6 @@
#ifndef __ASM_SPINLOCK_TYPES_H
#define __ASM_SPINLOCK_TYPES_H
-#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H)
-# error "please don't include this file directly"
-#endif
-
#include <asm-generic/qspinlock_types.h>
#include <asm-generic/qrwlock_types.h>
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 6623c99f0984..c55ccec33a5a 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -26,6 +26,7 @@ struct thread_info {
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
u64 ttbr0; /* saved TTBR0_EL1 */
#endif
+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
union {
u64 preempt_count; /* 0 => preemptible, <0 => bug */
struct {
@@ -67,6 +68,7 @@ int arch_dup_task_struct(struct task_struct *dst,
#define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */
#define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */
#define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */
+#define TIF_NEED_RESCHED_LAZY 7
#define TIF_SYSCALL_TRACE 8 /* syscall trace active */
#define TIF_SYSCALL_AUDIT 9 /* syscall auditing */
#define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */
@@ -97,8 +99,10 @@ int arch_dup_task_struct(struct task_struct *dst,
#define _TIF_SVE (1 << TIF_SVE)
#define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
+ _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
_TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \
_TIF_NOTIFY_SIGNAL)
@@ -107,6 +111,8 @@ int arch_dup_task_struct(struct task_struct *dst,
_TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
_TIF_SYSCALL_EMU)
+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
+
#ifdef CONFIG_SHADOW_CALL_STACK
#define INIT_SCS \
.scs_base = init_shadow_call_stack, \
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 0cb34ccb6e73..fe19323aa44a 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -30,6 +30,7 @@ int main(void)
BLANK();
DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count));
+ DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count));
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0));
#endif
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 3513984a88bd..db926701ef59 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -572,9 +572,18 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
mrs x0, daif
orr x24, x24, x0
alternative_else_nop_endif
- cbnz x24, 1f // preempt count != 0 || NMI return path
- bl arm64_preempt_schedule_irq // irq en/disable is done inside
+
+ cbz x24, 1f // (need_resched + count) == 0
+ cbnz w24, 2f // count != 0
+
+ ldr w24, [tsk, #TSK_TI_PREEMPT_LAZY] // get preempt lazy count
+ cbnz w24, 2f // preempt lazy count != 0
+
+ ldr x0, [tsk, #TSK_TI_FLAGS] // get flags
+ tbz x0, #TIF_NEED_RESCHED_LAZY, 2f // needs rescheduling?
1:
+ bl arm64_preempt_schedule_irq // irq en/disable is done inside
+2:
#endif
mov x0, sp
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index ad3dd34a83cf..9bf86cd7b605 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -226,6 +226,16 @@ static void sve_free(struct task_struct *task)
__sve_free(task);
}
+static void *sve_free_atomic(struct task_struct *task)
+{
+ void *sve_state = task->thread.sve_state;
+
+ WARN_ON(test_tsk_thread_flag(task, TIF_SVE));
+
+ task->thread.sve_state = NULL;
+ return sve_state;
+}
+
/*
* TIF_SVE controls whether a task can use SVE without trapping while
* in userspace, and also the way a task's FPSIMD/SVE state is stored
@@ -1031,6 +1041,7 @@ void fpsimd_thread_switch(struct task_struct *next)
void fpsimd_flush_thread(void)
{
int vl, supported_vl;
+ void *mem = NULL;
if (!system_supports_fpsimd())
return;
@@ -1043,7 +1054,7 @@ void fpsimd_flush_thread(void)
if (system_supports_sve()) {
clear_thread_flag(TIF_SVE);
- sve_free(current);
+ mem = sve_free_atomic(current);
/*
* Reset the task vector length as required.
@@ -1077,6 +1088,7 @@ void fpsimd_flush_thread(void)
}
put_cpu_fpsimd_context();
+ kfree(mem);
}
/*
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 22899c86711a..eb7f8fa21e3f 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -916,7 +916,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
unsigned long thread_flags)
{
do {
- if (thread_flags & _TIF_NEED_RESCHED) {
+ if (thread_flags & _TIF_NEED_RESCHED_MASK) {
/* Unmask Debug and SError for the next task */
local_daif_restore(DAIF_PROCCTX_NOIRQ);
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index facf4d41d32a..58c8d904d609 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -750,7 +750,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
* involves poking the GIC, which must be done in a
* non-preemptible context.
*/
- preempt_disable();
+ migrate_disable();
kvm_pmu_flush_hwstate(vcpu);
@@ -799,7 +799,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_timer_sync_user(vcpu);
kvm_vgic_sync_hwstate(vcpu);
local_irq_enable();
- preempt_enable();
+ migrate_enable();
continue;
}
@@ -871,7 +871,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
/* Exit types that need handling before we can be preempted */
handle_exit_early(vcpu, ret);
- preempt_enable();
+ migrate_enable();
/*
* The ARMv8 architecture doesn't give the hypervisor
diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h
index 19d233497ba5..de72fb23016d 100644
--- a/arch/hexagon/include/asm/spinlock_types.h
+++ b/arch/hexagon/include/asm/spinlock_types.h
@@ -8,10 +8,6 @@
#ifndef _ASM_SPINLOCK_TYPES_H
#define _ASM_SPINLOCK_TYPES_H
-#ifndef __LINUX_SPINLOCK_TYPES_H
-# error "please don't include this file directly"
-#endif
-
typedef struct {
volatile unsigned int lock;
} arch_spinlock_t;
diff --git a/arch/ia64/include/asm/spinlock_types.h b/arch/ia64/include/asm/spinlock_types.h
index 6e345fefcdca..681408d6816f 100644
--- a/arch/ia64/include/asm/spinlock_types.h
+++ b/arch/ia64/include/asm/spinlock_types.h
@@ -2,10 +2,6 @@
#ifndef _ASM_IA64_SPINLOCK_TYPES_H
#define _ASM_IA64_SPINLOCK_TYPES_H
-#ifndef __LINUX_SPINLOCK_TYPES_H
-# error "please don't include this file directly"
-#endif
-
typedef struct {
volatile unsigned int lock;
} arch_spinlock_t;
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 088dd2afcfe4..95c4518680ca 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -152,6 +152,7 @@ config PPC
select ARCH_STACKWALK
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC32 || PPC_BOOK3S_64
+ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
select ARCH_USE_MEMTEST
@@ -222,6 +223,7 @@ config PPC
select HAVE_IOREMAP_PROT
select HAVE_IRQ_EXIT_ON_IRQ_STACK
select HAVE_IRQ_TIME_ACCOUNTING
+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZMA if DEFAULT_UIMAGE
select HAVE_KERNEL_LZO if DEFAULT_UIMAGE
@@ -238,6 +240,7 @@ config PPC
select HAVE_PERF_EVENTS_NMI if PPC64
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_PREEMPT_LAZY
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE
select HAVE_RSEQ
diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index cf091c4c22e5..7371f7e23c35 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -5,7 +5,7 @@
#ifdef __KERNEL__
#include <linux/compiler.h>
#include <asm/synch.h>
-#include <linux/bug.h>
+#include <linux/bits.h>
#ifdef __BIG_ENDIAN
#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE)
diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h
index 0f3cdd8faa95..d45561e9e6ba 100644
--- a/arch/powerpc/include/asm/simple_spinlock_types.h
+++ b/arch/powerpc/include/asm/simple_spinlock_types.h
@@ -2,7 +2,7 @@
#ifndef _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H
#define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H
-#ifndef __LINUX_SPINLOCK_TYPES_H
+#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__LINUX_RT_MUTEX_H)
# error "please don't include this file directly"
#endif
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 03b3d010cbab..eec452e647b3 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -58,6 +58,7 @@ struct smp_ops_t {
extern int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us);
extern int smp_send_safe_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us);
+extern void smp_send_debugger_break_cpu(unsigned int cpu);
extern void smp_send_debugger_break(void);
extern void start_secondary_resume(void);
extern void smp_generic_give_timebase(void);
diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
index c5d742f18021..cc6922a011ba 100644
--- a/arch/powerpc/include/asm/spinlock_types.h
+++ b/arch/powerpc/include/asm/spinlock_types.h
@@ -2,10 +2,6 @@
#ifndef _ASM_POWERPC_SPINLOCK_TYPES_H
#define _ASM_POWERPC_SPINLOCK_TYPES_H
-#ifndef __LINUX_SPINLOCK_TYPES_H
-# error "please don't include this file directly"
-#endif
-
#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
#include <asm-generic/qspinlock_types.h>
#include <asm-generic/qrwlock_types.h>
diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h
index 1c8460e23583..b1653c160bab 100644
--- a/arch/powerpc/include/asm/stackprotector.h
+++ b/arch/powerpc/include/asm/stackprotector.h
@@ -24,7 +24,11 @@ static __always_inline void boot_init_stack_canary(void)
unsigned long canary;
/* Try to get a semi random initial value. */
+#ifdef CONFIG_PREEMPT_RT
+ canary = (unsigned long)&canary;
+#else
canary = get_random_canary();
+#endif
canary ^= mftb();
canary ^= LINUX_VERSION_CODE;
canary &= CANARY_MASK;
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index b4ec6c7dd72e..07df83231ec2 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -47,6 +47,8 @@
struct thread_info {
int preempt_count; /* 0 => preemptable,
<0 => BUG */
+ int preempt_lazy_count; /* 0 => preemptable,
+ <0 => BUG */
unsigned long local_flags; /* private flags for thread */
#ifdef CONFIG_LIVEPATCH
unsigned long *livepatch_sp;
@@ -93,6 +95,7 @@ void arch_setup_new_exec(void);
#define TIF_PATCH_PENDING 6 /* pending live patching update */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SINGLESTEP 8 /* singlestepping active */
+#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */
#define TIF_SECCOMP 10 /* secure computing */
#define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */
#define TIF_NOERROR 12 /* Force successful syscall return */
@@ -108,6 +111,7 @@ void arch_setup_new_exec(void);
#define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */
#define TIF_32BIT 20 /* 32 bit binary */
+
/* as above, but as bit values */
#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
@@ -119,6 +123,7 @@ void arch_setup_new_exec(void);
#define _TIF_PATCH_PENDING (1<<TIF_PATCH_PENDING)
#define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
#define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP)
+#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
#define _TIF_SECCOMP (1<<TIF_SECCOMP)
#define _TIF_RESTOREALL (1<<TIF_RESTOREALL)
#define _TIF_NOERROR (1<<TIF_NOERROR)
@@ -132,10 +137,12 @@ void arch_setup_new_exec(void);
_TIF_SYSCALL_EMU)
#define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
+ _TIF_NEED_RESCHED_LAZY | \
_TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_RESTORE_TM | _TIF_PATCH_PENDING | \
_TIF_NOTIFY_SIGNAL)
#define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
/* Bits in local_flags */
/* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index f633f09dc912..4d47c3fac81b 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -188,6 +188,7 @@ int main(void)
OFFSET(TI_FLAGS, thread_info, flags);
OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags);
OFFSET(TI_PREEMPT, thread_info, preempt_count);
+ OFFSET(TI_PREEMPT_LAZY, thread_info, preempt_lazy_count);
#ifdef CONFIG_PPC64
OFFSET(DCACHEL1BLOCKSIZE, ppc64_caches, l1d.block_size);
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index e0938ba298f2..507dca866e1a 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -287,7 +287,7 @@ again:
ti_flags = READ_ONCE(current_thread_info()->flags);
while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
local_irq_enable();
- if (ti_flags & _TIF_NEED_RESCHED) {
+ if (ti_flags & _TIF_NEED_RESCHED_MASK) {
schedule();
} else {
/*
@@ -376,7 +376,7 @@ again:
ti_flags = READ_ONCE(current_thread_info()->flags);
while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
local_irq_enable(); /* returning to user: may enable */
- if (ti_flags & _TIF_NEED_RESCHED) {
+ if (ti_flags & _TIF_NEED_RESCHED_MASK) {
schedule();
} else {
if (ti_flags & _TIF_SIGPENDING)
@@ -461,11 +461,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
/* Returning to a kernel context with local irqs enabled. */
WARN_ON_ONCE(!(regs->msr & MSR_EE));
again:
- if (IS_ENABLED(CONFIG_PREEMPT)) {
+ if (IS_ENABLED(CONFIG_PREEMPTION)) {
/* Return to preemptible kernel context */
if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED)) {
if (preempt_count() == 0)
preempt_schedule_irq();
+ } else if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED_LAZY)) {
+ if ((preempt_count() == 0) &&
+ (current_thread_info()->preempt_lazy_count == 0))
+ preempt_schedule_irq();
}
}
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index ca58ad3b06da..024d5b8acf1b 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -588,6 +588,7 @@ static inline void check_stack_overflow(void)
}
}
+#ifndef CONFIG_PREEMPT_RT
static __always_inline void call_do_softirq(const void *sp)
{
/* Temporarily switch r1 to sp, call __do_softirq() then restore r1. */
@@ -606,6 +607,7 @@ static __always_inline void call_do_softirq(const void *sp)
"r11", "r12"
);
}
+#endif
static __always_inline void call_do_irq(struct pt_regs *regs, void *sp)
{
@@ -718,10 +720,12 @@ void *mcheckirq_ctx[NR_CPUS] __read_mostly;
void *softirq_ctx[NR_CPUS] __read_mostly;
void *hardirq_ctx[NR_CPUS] __read_mostly;
+#ifndef CONFIG_PREEMPT_RT
void do_softirq_own_stack(void)
{
call_do_softirq(softirq_ctx[smp_processor_id()]);
}
+#endif
irq_hw_number_t virq_to_hw(unsigned int virq)
{
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 7dd2ad3603ad..59402cc01eb9 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -20,6 +20,7 @@
#include <linux/signal.h>
#include <linux/ptrace.h>
#include <linux/kdebug.h>
+#include <linux/console.h>
#include <asm/current.h>
#include <asm/processor.h>
#include <asm/machdep.h>
@@ -120,11 +121,19 @@ int kgdb_skipexception(int exception, struct pt_regs *regs)
static int kgdb_debugger_ipi(struct pt_regs *regs)
{
- kgdb_nmicallback(raw_smp_processor_id(), regs);
+ int cpu = raw_smp_processor_id();
+
+ if (!console_atomic_kgdb_cpu_delay(cpu))
+ kgdb_nmicallback(cpu, regs);
return 0;
}
#ifdef CONFIG_SMP
+void kgdb_roundup_cpu(unsigned int cpu)
+{
+ smp_send_debugger_break_cpu(cpu);
+}
+
void kgdb_roundup_cpus(void)
{
smp_send_debugger_break();
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index df6b468976d5..35558e14dfb1 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -582,6 +582,11 @@ static void debugger_ipi_callback(struct pt_regs *regs)
debugger_ipi(regs);
}
+void smp_send_debugger_break_cpu(unsigned int cpu)
+{
+ smp_send_nmi_ipi(cpu, debugger_ipi_callback, 1000000);
+}
+
void smp_send_debugger_break(void)
{
smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, debugger_ipi_callback, 1000000);
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 04090fde27c8..f4ec910aa585 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -170,7 +170,6 @@ extern void panic_flush_kmsg_start(void)
extern void panic_flush_kmsg_end(void)
{
- printk_safe_flush_on_panic();
kmsg_dump(KMSG_DUMP_PANIC);
bust_spinlocks(0);
debug_locks_off();
@@ -260,12 +259,17 @@ static char *get_mmu_str(void)
static int __die(const char *str, struct pt_regs *regs, long err)
{
+ const char *pr = "";
+
printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";
+
printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n",
IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE",
PAGE_SIZE / 1024, get_mmu_str(),
- IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
+ pr,
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index c9a8f4781a10..dc17d8903d4f 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -183,11 +183,6 @@ static void watchdog_smp_panic(int cpu, u64 tb)
wd_smp_unlock(&flags);
- printk_safe_flush();
- /*
- * printk_safe_flush() seems to require another print
- * before anything actually goes out to console.
- */
if (sysctl_hardlockup_all_cpu_backtrace)
trigger_allbutself_cpu_backtrace();
diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c
index 0196d0c211ac..899955be1cfe 100644
--- a/arch/powerpc/kexec/crash.c
+++ b/arch/powerpc/kexec/crash.c
@@ -312,9 +312,6 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
unsigned int i;
int (*old_handler)(struct pt_regs *regs);
- /* Avoid hardlocking with irresponsive CPU holding logbuf_lock */
- printk_nmi_enter();
-
/*
* This function is only called after the system
* has panicked or is otherwise in a critical state.
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index e45644657d49..b826174ce983 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -179,6 +179,7 @@ config KVM_E500MC
config KVM_MPIC
bool "KVM in-kernel MPIC emulation"
depends on KVM && E500
+ depends on !PREEMPT_RT
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 0c55b991f665..b529370fb27a 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -24,6 +24,7 @@
#include <linux/of.h>
#include <linux/iommu.h>
#include <linux/rculist.h>
+#include <linux/local_lock.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/rtas.h>
@@ -190,7 +191,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
return ret;
}
-static DEFINE_PER_CPU(__be64 *, tce_page);
+struct tce_page {
+ __be64 * page;
+ local_lock_t lock;
+};
+static DEFINE_PER_CPU(struct tce_page, tce_page) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
long npages, unsigned long uaddr,
@@ -212,9 +219,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
direction, attrs);
}
- local_irq_save(flags); /* to protect tcep and the page behind it */
+ /* to protect tcep and the page behind it */
+ local_lock_irqsave(&tce_page.lock, flags);
- tcep = __this_cpu_read(tce_page);
+ tcep = __this_cpu_read(tce_page.page);
/* This is safe to do since interrupts are off when we're called
* from iommu_alloc{,_sg}()
@@ -223,12 +231,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
/* If allocation fails, fall back to the loop implementation */
if (!tcep) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&tce_page.lock, flags);
return tce_build_pSeriesLP(tbl->it_index, tcenum,
tbl->it_page_shift,
npages, uaddr, direction, attrs);
}
- __this_cpu_write(tce_page, tcep);
+ __this_cpu_write(tce_page.page, tcep);
}
rpn = __pa(uaddr) >> TCE_SHIFT;
@@ -258,7 +266,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
tcenum += limit;
} while (npages > 0 && !rc);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&tce_page.lock, flags);
if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
ret = (int)rc;
@@ -429,16 +437,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
DMA_BIDIRECTIONAL, 0);
}
- local_irq_disable(); /* to protect tcep and the page behind it */
- tcep = __this_cpu_read(tce_page);
+ /* to protect tcep and the page behind it */
+ local_lock_irq(&tce_page.lock);
+ tcep = __this_cpu_read(tce_page.page);
if (!tcep) {
tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
if (!tcep) {
- local_irq_enable();
+ local_unlock_irq(&tce_page.lock);
return -ENOMEM;
}
- __this_cpu_write(tce_page, tcep);
+ __this_cpu_write(tce_page.page, tcep);
}
proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;
@@ -481,7 +490,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
/* error cleanup: caller will clear whole range */
- local_irq_enable();
+ local_unlock_irq(&tce_page.lock);
return rc;
}
diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h
index a2bbfd7df85f..f059d282e766 100644
--- a/arch/s390/include/asm/spinlock_types.h
+++ b/arch/s390/include/asm/spinlock_types.h
@@ -2,10 +2,6 @@
#ifndef __ASM_SPINLOCK_TYPES_H
#define __ASM_SPINLOCK_TYPES_H
-#ifndef __LINUX_SPINLOCK_TYPES_H
-# error "please don't include this file directly"
-#endif
-
typedef struct {
int lock;
} arch_spinlock_t;
diff --git a/arch/sh/include/asm/spinlock_types.h b/arch/sh/include/asm/spinlock_types.h
index e82369f286a2..22ca9a98bbb8 100644
--- a/arch/sh/include/asm/spinlock_types.h
+++ b/arch/sh/include/asm/spinlock_types.h
@@ -2,10 +2,6 @@
#ifndef __ASM_SH_SPINLOCK_TYPES_H
#define __ASM_SH_SPINLOCK_TYPES_H
-#ifndef __LINUX_SPINLOCK_TYPES_H
-# error "please don't include this file directly"
-#endif
-
typedef struct {
volatile unsigned int lock;
} arch_spinlock_t;
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index ef0f0827cf57..2d3eca8fee01 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -149,6 +149,7 @@ void irq_ctx_exit(int cpu)
hardirq_ctx[cpu] = NULL;
}
+#ifndef CONFIG_PREEMPT_RT
void do_softirq_own_stack(void)
{
struct thread_info *curctx;
@@ -176,6 +177,7 @@ void do_softirq_own_stack(void)
"r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
);
}
+#endif
#else
static inline void handle_one_irq(unsigned int irq)
{
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index c8848bb681a1..41fa1be980a3 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -855,6 +855,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
set_irq_regs(old_regs);
}
+#ifndef CONFIG_PREEMPT_RT
void do_softirq_own_stack(void)
{
void *orig_sp, *sp = softirq_stack[smp_processor_id()];
@@ -869,6 +870,7 @@ void do_softirq_own_stack(void)
__asm__ __volatile__("mov %0, %%sp"
: : "r" (orig_sp));
}
+#endif
#ifdef CONFIG_HOTPLUG_CPU
void fixup_irqs(void)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2db2bf999489..33c4ef85d71e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -105,6 +105,7 @@ config X86
select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
select ARCH_SUPPORTS_LTO_CLANG if X86_64
select ARCH_SUPPORTS_LTO_CLANG_THIN if X86_64
+ select ARCH_SUPPORTS_RT
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_MEMTEST
select ARCH_USE_QUEUED_RWLOCKS
@@ -229,6 +230,7 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_PREEMPT_LAZY
select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 23bef08a8388..62cf3e4c06fb 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -28,6 +28,7 @@ extern void kernel_fpu_begin_mask(unsigned int kfpu_mask);
extern void kernel_fpu_end(void);
extern bool irq_fpu_usable(void);
extern void fpregs_mark_activate(void);
+extern void kernel_fpu_resched(void);
/* Code that is unaware of kernel_fpu_begin_mask() can use this */
static inline void kernel_fpu_begin(void)
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 562854c60808..ea0c5ab31da4 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -185,6 +185,7 @@
IRQ_CONSTRAINTS, regs, vector); \
}
+#ifndef CONFIG_PREEMPT_RT
#define ASM_CALL_SOFTIRQ \
"call %P[__func] \n"
@@ -201,6 +202,8 @@
__this_cpu_write(hardirq_stack_inuse, false); \
}
+#endif
+
#else /* CONFIG_X86_64 */
/* System vector handlers always run on the stack they interrupted. */
#define run_sysvec_on_irqstack_cond(func, regs) \
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index fe5efbcba824..ab8cb5fc2329 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -90,17 +90,48 @@ static __always_inline void __preempt_count_sub(int val)
* a decrement which hits zero means we have no preempt_count and should
* reschedule.
*/
-static __always_inline bool __preempt_count_dec_and_test(void)
+static __always_inline bool ____preempt_count_dec_and_test(void)
{
return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var]));
}
+static __always_inline bool __preempt_count_dec_and_test(void)
+{
+ if (____preempt_count_dec_and_test())
+ return true;
+#ifdef CONFIG_PREEMPT_LAZY
+ if (preempt_count())
+ return false;
+ if (current_thread_info()->preempt_lazy_count)
+ return false;
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
+ return false;
+#endif
+}
+
/*
* Returns true when we need to resched and can (barring IRQ state).
*/
static __always_inline bool should_resched(int preempt_offset)
{
+#ifdef CONFIG_PREEMPT_LAZY
+ u32 tmp;
+ tmp = raw_cpu_read_4(__preempt_count);
+ if (tmp == preempt_offset)
+ return true;
+
+ /* preempt count == 0 ? */
+ tmp &= ~PREEMPT_NEED_RESCHED;
+ if (tmp != preempt_offset)
+ return false;
+ /* XXX PREEMPT_LOCK_OFFSET */
+ if (current_thread_info()->preempt_lazy_count)
+ return false;
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+#else
return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
+#endif
}
#ifdef CONFIG_PREEMPTION
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 6fd8410a3910..f3bf2f515edb 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -28,6 +28,19 @@ typedef struct {
#define SA_IA32_ABI 0x02000000u
#define SA_X32_ABI 0x01000000u
+/*
+ * Because some traps use the IST stack, we must keep preemption
+ * disabled while calling do_trap(), but do_trap() may call
+ * force_sig_info() which will grab the signal spin_locks for the
+ * task, which in PREEMPT_RT are mutexes. By defining
+ * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
+ * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
+ * trap.
+ */
+#if defined(CONFIG_PREEMPT_RT)
+#define ARCH_RT_DELAYS_SIGNAL_SEND
+#endif
+
#ifndef CONFIG_COMPAT
typedef sigset_t compat_sigset_t;
#endif
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index b6ffe58c70fa..e79e75ede951 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -50,7 +50,7 @@
*/
static __always_inline void boot_init_stack_canary(void)
{
- u64 canary;
+ u64 canary = 0;
u64 tsc;
#ifdef CONFIG_X86_64
@@ -61,8 +61,14 @@ static __always_inline void boot_init_stack_canary(void)
* of randomness. The TSC only matters for very early init,
* there it already has some randomness on most systems. Later
* on during the bootup the random pool has true entropy too.
+ * For preempt-rt we need to weaken the randomness a bit, as
+ * we can't call into the random generator from atomic context
+ * due to locking constraints. We just leave canary
+ * uninitialized and use the TSC based randomness on top of it.
*/
+#ifndef CONFIG_PREEMPT_RT
get_random_bytes(&canary, sizeof(canary));
+#endif
tsc = rdtsc();
canary += tsc + (tsc << 32UL);
canary &= CANARY_MASK;
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index de406d93b515..730d86e28f46 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -57,11 +57,14 @@ struct thread_info {
unsigned long flags; /* low level flags */
unsigned long syscall_work; /* SYSCALL_WORK_ flags */
u32 status; /* thread synchronous flags */
+ int preempt_lazy_count; /* 0 => lazy preemptable
+ <0 => BUG */
};
#define INIT_THREAD_INFO(tsk) \
{ \
.flags = 0, \
+ .preempt_lazy_count = 0, \
}
#else /* !__ASSEMBLY__ */
@@ -90,6 +93,7 @@ struct thread_info {
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */
#define TIF_SLD 18 /* Restore split lock detection on context switch */
+#define TIF_NEED_RESCHED_LAZY 19 /* lazy rescheduling necessary */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
@@ -113,6 +117,7 @@ struct thread_info {
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
#define _TIF_SLD (1 << TIF_SLD)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
@@ -143,6 +148,8 @@ struct thread_info {
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
+
#define STACK_WARN (THREAD_SIZE/8)
/*
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 4fa0a4280895..6b59b0a7d7d8 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -80,11 +80,12 @@ EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler);
DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
{
struct pt_regs *old_regs = set_irq_regs(regs);
+ u64 ip = regs ? instruction_pointer(regs) : 0;
inc_irq_stat(hyperv_stimer0_count);
if (hv_stimer0_handler)
hv_stimer0_handler();
- add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0);
+ add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0, ip);
ack_APIC_irq();
set_irq_regs(old_regs);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 571220ac8bea..d315d45b64fa 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -159,6 +159,18 @@ void kernel_fpu_end(void)
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);
+void kernel_fpu_resched(void)
+{
+ WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
+
+ if (should_resched(PREEMPT_OFFSET)) {
+ kernel_fpu_end();
+ cond_resched();
+ kernel_fpu_begin();
+ }
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_resched);
+
/*
* Save the FPU state (mark it for reload if necessary):
*
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 044902d5a3c4..e5dd6da78713 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -132,6 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu)
return 0;
}
+#ifndef CONFIG_PREEMPT_RT
void do_softirq_own_stack(void)
{
struct irq_stack *irqstk;
@@ -148,6 +149,7 @@ void do_softirq_own_stack(void)
call_on_stack(__do_softirq, isp);
}
+#endif
void __handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 3a43a2dee658..55c446dc0d9b 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -32,6 +32,7 @@
#include <linux/kgdb.h>
#include <linux/smp.h>
#include <linux/nmi.h>
+#include <linux/console.h>
#include <linux/hw_breakpoint.h>
#include <linux/uaccess.h>
#include <linux/memory.h>
@@ -502,9 +503,12 @@ static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs)
if (atomic_read(&kgdb_active) != -1) {
/* KGDB CPU roundup */
cpu = raw_smp_processor_id();
- kgdb_nmicallback(cpu, regs);
- set_bit(cpu, was_in_debug_nmi);
- touch_nmi_watchdog();
+
+ if (!console_atomic_kgdb_cpu_delay(cpu)) {
+ kgdb_nmicallback(cpu, regs);
+ set_bit(cpu, was_in_debug_nmi);
+ touch_nmi_watchdog();
+ }
return NMI_HANDLED;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1e11198f8993..b6c3394f2519 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8193,6 +8193,14 @@ int kvm_arch_init(void *opaque)
goto out;
}
+#ifdef CONFIG_PREEMPT_RT
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+#endif
+
r = -ENOMEM;
x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
__alignof__(struct fpu), SLAB_ACCOUNT,
diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h
index 64c9389254f1..dc846323b1cd 100644
--- a/arch/xtensa/include/asm/spinlock_types.h
+++ b/arch/xtensa/include/asm/spinlock_types.h
@@ -2,10 +2,6 @@
#ifndef __ASM_SPINLOCK_TYPES_H
#define __ASM_SPINLOCK_TYPES_H
-#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H)
-# error "please don't include this file directly"
-#endif
-
#include <asm-generic/qspinlock_types.h>
#include <asm-generic/qrwlock_types.h>
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6dfa572ac1fc..3b63fb5b481f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1552,14 +1552,14 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
return;
if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
- int cpu = get_cpu();
+ int cpu = get_cpu_light();
if (cpumask_test_cpu(cpu, hctx->cpumask)) {
__blk_mq_run_hw_queue(hctx);
- put_cpu();
+ put_cpu_light();
return;
}
- put_cpu();
+ put_cpu_light();
}
kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
diff --git a/crypto/cryptd.c b/crypto/cryptd.c
index a1bea0f4baa8..5f8ca8c1f59c 100644
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
@@ -36,6 +36,7 @@ static struct workqueue_struct *cryptd_wq;
struct cryptd_cpu_queue {
struct crypto_queue queue;
struct work_struct work;
+ spinlock_t qlock;
};
struct cryptd_queue {
@@ -105,6 +106,7 @@ static int cryptd_init_queue(struct cryptd_queue *queue,
cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu);
crypto_init_queue(&cpu_queue->queue, max_cpu_qlen);
INIT_WORK(&cpu_queue->work, cryptd_queue_worker);
+ spin_lock_init(&cpu_queue->qlock);
}
pr_info("cryptd: max_cpu_qlen set to %d\n", max_cpu_qlen);
return 0;
@@ -129,8 +131,10 @@ static int cryptd_enqueue_request(struct cryptd_queue *queue,
struct cryptd_cpu_queue *cpu_queue;
refcount_t *refcnt;
- cpu = get_cpu();
- cpu_queue = this_cpu_ptr(queue->cpu_queue);
+ cpu_queue = raw_cpu_ptr(queue->cpu_queue);
+ spin_lock_bh(&cpu_queue->qlock);
+ cpu = smp_processor_id();
+
err = crypto_enqueue_request(&cpu_queue->queue, request);
refcnt = crypto_tfm_ctx(request->tfm);
@@ -146,7 +150,7 @@ static int cryptd_enqueue_request(struct cryptd_queue *queue,
refcount_inc(refcnt);
out_put_cpu:
- put_cpu();
+ spin_unlock_bh(&cpu_queue->qlock);
return err;
}
@@ -162,16 +166,11 @@ static void cryptd_queue_worker(struct work_struct *work)
cpu_queue = container_of(work, struct cryptd_cpu_queue, work);
/*
* Only handle one request at a time to avoid hogging crypto workqueue.
- * preempt_disable/enable is used to prevent being preempted by
- * cryptd_enqueue_request(). local_bh_disable/enable is used to prevent
- * cryptd_enqueue_request() being accessed from software interrupts.
*/
- local_bh_disable();
- preempt_disable();
+ spin_lock_bh(&cpu_queue->qlock);
backlog = crypto_get_backlog(&cpu_queue->queue);
req = crypto_dequeue_request(&cpu_queue->queue);
- preempt_enable();
- local_bh_enable();
+ spin_unlock_bh(&cpu_queue->qlock);
if (!req)
return;
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 2c36f61d30bc..9db297431b97 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -482,6 +482,7 @@ static DEVICE_ATTR(meminfo, 0444, node_read_meminfo, NULL);
static ssize_t node_read_numastat(struct device *dev,
struct device_attribute *attr, char *buf)
{
+ fold_vm_numa_events();
return sysfs_emit(buf,
"numa_hit %lu\n"
"numa_miss %lu\n"
@@ -489,12 +490,12 @@ static ssize_t node_read_numastat(struct device *dev,
"interleave_hit %lu\n"
"local_node %lu\n"
"other_node %lu\n",
- sum_zone_numa_state(dev->id, NUMA_HIT),
- sum_zone_numa_state(dev->id, NUMA_MISS),
- sum_zone_numa_state(dev->id, NUMA_FOREIGN),
- sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
- sum_zone_numa_state(dev->id, NUMA_LOCAL),
- sum_zone_numa_state(dev->id, NUMA_OTHER));
+ sum_zone_numa_event_state(dev->id, NUMA_HIT),
+ sum_zone_numa_event_state(dev->id, NUMA_MISS),
+ sum_zone_numa_event_state(dev->id, NUMA_FOREIGN),
+ sum_zone_numa_event_state(dev->id, NUMA_INTERLEAVE_HIT),
+ sum_zone_numa_event_state(dev->id, NUMA_LOCAL),
+ sum_zone_numa_event_state(dev->id, NUMA_OTHER));
}
static DEVICE_ATTR(numastat, 0444, node_read_numastat, NULL);
@@ -512,10 +513,11 @@ static ssize_t node_read_vmstat(struct device *dev,
sum_zone_node_page_state(nid, i));
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+ fold_vm_numa_events();
+ for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
len += sysfs_emit_at(buf, len, "%s %lu\n",
numa_stat_name(i),
- sum_zone_numa_state(nid, i));
+ sum_zone_numa_event_state(nid, i));
#endif
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index cf8deecc39ef..5c7999ebc4e0 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -59,6 +59,40 @@ static void zram_free_page(struct zram *zram, size_t index);
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
u32 index, int offset, struct bio *bio);
+#ifdef CONFIG_PREEMPT_RT
+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages)
+{
+ size_t index;
+
+ for (index = 0; index < num_pages; index++)
+ spin_lock_init(&zram->table[index].lock);
+}
+
+static int zram_slot_trylock(struct zram *zram, u32 index)
+{
+ int ret;
+
+ ret = spin_trylock(&zram->table[index].lock);
+ if (ret)
+ __set_bit(ZRAM_LOCK, &zram->table[index].flags);
+ return ret;
+}
+
+static void zram_slot_lock(struct zram *zram, u32 index)
+{
+ spin_lock(&zram->table[index].lock);
+ __set_bit(ZRAM_LOCK, &zram->table[index].flags);
+}
+
+static void zram_slot_unlock(struct zram *zram, u32 index)
+{
+ __clear_bit(ZRAM_LOCK, &zram->table[index].flags);
+ spin_unlock(&zram->table[index].lock);
+}
+
+#else
+
+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { }
static int zram_slot_trylock(struct zram *zram, u32 index)
{
@@ -74,6 +108,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index)
{
bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags);
}
+#endif
static inline bool init_done(struct zram *zram)
{
@@ -1169,6 +1204,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
if (!huge_class_size)
huge_class_size = zs_huge_class_size(zram->mem_pool);
+ zram_meta_init_table_locks(zram, num_pages);
return true;
}
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 419a7e8281ee..561c7ba1421f 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -63,6 +63,7 @@ struct zram_table_entry {
unsigned long element;
};
unsigned long flags;
+ spinlock_t lock;
#ifdef CONFIG_ZRAM_MEMORY_TRACKING
ktime_t ac_time;
#endif
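
On PREEMPT_RT a bit_spin_lock() is awkward: it disables preemption for as long as the bit is held and offers no owner for priority inheritance. The two zram hunks therefore give every table entry a real spinlock_t and keep ZRAM_LOCK only as a flag that is set and cleared under that lock. For comparison, the !PREEMPT_RT variants kept by the #else branch above (their bodies are not shown in the hunk) presumably look like the mainline bit-lock helpers; the reconstruction below is shown only to make the contrast explicit and is not part of this diff:

    /* !CONFIG_PREEMPT_RT: the lock is a single bit in the entry's flags word */
    static int zram_slot_trylock(struct zram *zram, u32 index)
    {
            return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags);
    }

    static void zram_slot_lock(struct zram *zram, u32 index)
    {
            bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags);
    }
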
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 605969ed0f96..9bcaa6c99f69 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1242,26 +1242,25 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
return *ptr;
}
-void add_interrupt_randomness(int irq, int irq_flags)
+void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
{
struct entropy_store *r;
struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
- struct pt_regs *regs = get_irq_regs();
unsigned long now = jiffies;
cycles_t cycles = random_get_entropy();
__u32 c_high, j_high;
- __u64 ip;
if (cycles == 0)
- cycles = get_reg(fast_pool, regs);
+ cycles = get_reg(fast_pool, NULL);
c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
j_high = (sizeof(now) > 4) ? now >> 32 : 0;
fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
fast_pool->pool[1] ^= now ^ c_high;
- ip = regs ? instruction_pointer(regs) : _RET_IP_;
+ if (!ip)
+ ip = _RET_IP_;
fast_pool->pool[2] ^= ip;
fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
- get_reg(fast_pool, regs);
+ get_reg(fast_pool, NULL);
fast_mix(fast_pool);
add_interrupt_bench(cycles);
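
add_interrupt_randomness() gains an explicit instruction-pointer argument because with forced-threaded interrupts the function may no longer run in the hard-irq context that get_irq_regs() describes; callers capture the ip while still in hard-irq context and pass it down, and a zero value makes the function fall back to _RET_IP_ (the vmbus_isr() hunk further below does exactly this). A hedged caller-side sketch, with my_hardirq_path() being a hypothetical handler:

    #include <linux/irq.h>
    #include <linux/random.h>

    static void my_hardirq_path(int irq)
    {
            struct pt_regs *regs = get_irq_regs();
            u64 ip = regs ? instruction_pointer(regs) : 0;

            /* ... hand the bulk of the work off to a thread or tasklet ... */

            /* ip == 0 lets add_interrupt_randomness() fall back to _RET_IP_ */
            add_interrupt_randomness(irq, 0, ip);
    }
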
diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c
index 4ed6e660273a..c2bd0d40b5fc 100644
--- a/drivers/char/tpm/tpm_tis.c
+++ b/drivers/char/tpm/tpm_tis.c
@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da
return container_of(data, struct tpm_tis_tcg_phy, priv);
}
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Flushes previous write operations to the chip so that subsequent
+ * ioread*()s won't stall a CPU.
+ */
+static inline void tpm_tis_flush(void __iomem *iobase)
+{
+ ioread8(iobase + TPM_ACCESS(0));
+}
+#else
+#define tpm_tis_flush(iobase) do { } while (0)
+#endif
+
+static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr)
+{
+ iowrite8(b, iobase + addr);
+ tpm_tis_flush(iobase);
+}
+
+static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr)
+{
+ iowrite32(b, iobase + addr);
+ tpm_tis_flush(iobase);
+}
+
static int interrupts = -1;
module_param(interrupts, int, 0444);
MODULE_PARM_DESC(interrupts, "Enable interrupts");
@@ -169,7 +194,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len,
struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
while (len--)
- iowrite8(*value++, phy->iobase + addr);
+ tpm_tis_iowrite8(*value++, phy->iobase, addr);
return 0;
}
@@ -196,7 +221,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value)
{
struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
- iowrite32(value, phy->iobase + addr);
+ tpm_tis_iowrite32(value, phy->iobase, addr);
return 0;
}
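
The TPM writes are posted MMIO; without a read-back, a later ioread*() can stall the CPU for a long time, which matters once this code can run with interrupts enabled on PREEMPT_RT, so every write is flushed by reading TPM_ACCESS(0). The read-back idiom in isolation, with hypothetical register names:

    #include <linux/io.h>

    static inline void my_posted_write8(u8 val, void __iomem *base,
                                        u32 reg, u32 flush_reg)
    {
            iowrite8(val, base + reg);
            /* the read-back forces the posted write out to the device */
            (void)ioread8(base + flush_reg);
    }
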
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 847f33ffc4ae..ae79c3300129 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -66,7 +66,7 @@ struct mm_struct efi_mm = {
struct workqueue_struct *efi_rts_wq;
-static bool disable_runtime;
+static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT);
static int __init setup_noefi(char *arg)
{
disable_runtime = true;
@@ -97,6 +97,9 @@ static int __init parse_efi_cmdline(char *str)
if (parse_option_str(str, "noruntime"))
disable_runtime = true;
+ if (parse_option_str(str, "runtime"))
+ disable_runtime = false;
+
if (parse_option_str(str, "nosoftreserve"))
set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags);
diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c
index 39358076c05b..54d62d343f1a 100644
--- a/drivers/gpu/drm/i915/display/intel_crtc.c
+++ b/drivers/gpu/drm/i915/display/intel_crtc.c
@@ -425,7 +425,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state)
*/
intel_psr_wait_for_idle(new_crtc_state);
- local_irq_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
crtc->debug.min_vbl = min;
crtc->debug.max_vbl = max;
@@ -450,11 +451,13 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state)
break;
}
- local_irq_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
timeout = schedule_timeout(timeout);
- local_irq_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
}
finish_wait(wq, &wait);
@@ -487,7 +490,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state)
return;
irq_disable:
- local_irq_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
}
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE)
@@ -566,7 +570,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state)
new_crtc_state->uapi.event = NULL;
}
- local_irq_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
/* Send VRR Push to terminate Vblank */
intel_vrr_send_push(new_crtc_state);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
index 7c9af86fdb1e..4008080ebd9a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
@@ -84,9 +84,10 @@ static int __engine_unpark(struct intel_wakeref *wf)
static unsigned long __timeline_mark_lock(struct intel_context *ce)
{
- unsigned long flags;
+ unsigned long flags = 0;
- local_irq_save(flags);
+ if (!force_irqthreads)
+ local_irq_save(flags);
mutex_acquire(&ce->timeline->mutex.dep_map, 2, 0, _THIS_IP_);
return flags;
@@ -96,7 +97,8 @@ static void __timeline_mark_unlock(struct intel_context *ce,
unsigned long flags)
{
mutex_release(&ce->timeline->mutex.dep_map, _THIS_IP_);
- local_irq_restore(flags);
+ if (!force_irqthreads)
+ local_irq_restore(flags);
}
#else
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 783f25920d00..50478356591f 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -887,6 +887,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc,
spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
/* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
+ preempt_disable_rt();
/* Get optional system timestamp before query. */
if (stime)
@@ -951,6 +952,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc,
*etime = ktime_get();
/* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
+ preempt_enable_rt();
spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index a4addcc64978..396b6598694d 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -2,6 +2,10 @@
#if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ)
#define _I915_TRACE_H_
+#ifdef CONFIG_PREEMPT_RT
+#define NOTRACE
+#endif
+
#include <linux/stringify.h>
#include <linux/types.h>
#include <linux/tracepoint.h>
@@ -778,7 +782,7 @@ DEFINE_EVENT(i915_request, i915_request_add,
TP_ARGS(rq)
);
-#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS)
+#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE)
DEFINE_EVENT(i915_request, i915_request_submit,
TP_PROTO(struct i915_request *rq),
TP_ARGS(rq)
diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
index 1d03ec763604..c6f1cb30f94a 100644
--- a/drivers/gpu/drm/radeon/radeon_display.c
+++ b/drivers/gpu/drm/radeon/radeon_display.c
@@ -1814,6 +1814,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
struct radeon_device *rdev = dev->dev_private;
/* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
+ preempt_disable_rt();
/* Get optional system timestamp before query. */
if (stime)
@@ -1906,6 +1907,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
*etime = ktime_get();
/* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
+ preempt_enable_rt();
/* Decode into vertical and horizontal scanout position. */
*vpos = position & 0x1fff;
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 9416e09ebd58..4a5767a15544 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -18,6 +18,7 @@
#include <linux/atomic.h>
#include <linux/hyperv.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include "hv_trace.h"
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 92cb3f7d21d9..3fc06e5f2785 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -22,6 +22,7 @@
#include <linux/clockchips.h>
#include <linux/cpu.h>
#include <linux/sched/task_stack.h>
+#include <linux/irq.h>
#include <linux/delay.h>
#include <linux/notifier.h>
@@ -1339,6 +1340,8 @@ static void vmbus_isr(void)
void *page_addr = hv_cpu->synic_event_page;
struct hv_message *msg;
union hv_synic_event_flags *event;
+ struct pt_regs *regs = get_irq_regs();
+ u64 ip = regs ? instruction_pointer(regs) : 0;
bool handled = false;
if (unlikely(page_addr == NULL))
@@ -1383,7 +1386,7 @@ static void vmbus_isr(void)
tasklet_schedule(&hv_cpu->msg_dpc);
}
- add_interrupt_randomness(vmbus_interrupt, 0);
+ add_interrupt_randomness(vmbus_interrupt, 0, ip);
}
static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
index b77a01bd27f4..aa74e2a05798 100644
--- a/drivers/leds/trigger/Kconfig
+++ b/drivers/leds/trigger/Kconfig
@@ -64,6 +64,7 @@ config LEDS_TRIGGER_BACKLIGHT
config LEDS_TRIGGER_CPU
bool "LED CPU Trigger"
+ depends on !PREEMPT_RT
help
This allows LEDs to be controlled by active CPUs. This shows
the active CPUs across an array of LEDs so you can see which
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7d4ff8a5c55e..29055cdc9446 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2217,8 +2217,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
struct raid5_percpu *percpu;
unsigned long cpu;
- cpu = get_cpu();
+ cpu = get_cpu_light();
percpu = per_cpu_ptr(conf->percpu, cpu);
+ spin_lock(&percpu->lock);
if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
ops_run_biofill(sh);
overlap_clear++;
@@ -2277,7 +2278,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
if (test_and_clear_bit(R5_Overlap, &dev->flags))
wake_up(&sh->raid_conf->wait_for_overlap);
}
- put_cpu();
+ spin_unlock(&percpu->lock);
+ put_cpu_light();
}
static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
@@ -7078,6 +7080,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
__func__, cpu);
return -ENOMEM;
}
+ spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
return 0;
}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 5c05acf20e1f..665fe138ab4f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -635,6 +635,7 @@ struct r5conf {
int recovery_disabled;
/* per cpu variables */
struct raid5_percpu {
+ spinlock_t lock; /* Protection for -RT */
struct page *spare_page; /* Used when checking P/Q in raid6 */
void *scribble; /* space for constructing buffer
* lists and performing address
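
raid_run_ops() previously kept its per-CPU scratch buffers (spare_page, scribble) stable by holding the CPU with get_cpu(); on RT that window must stay preemptible, so the series pairs the RT helper get_cpu_light() with a new per-CPU spinlock, initialized in the CPU hotplug callback. The resulting access pattern, sketched with illustrative names rather than the raid5 structures:

    struct my_percpu_scratch {
            spinlock_t lock;        /* protects the scratch data on PREEMPT_RT */
            void *scribble;
    };

    static void my_use_scratch(struct my_percpu_scratch __percpu *pcpu)
    {
            int cpu = get_cpu_light();      /* RT: only pins the task to a CPU */
            struct my_percpu_scratch *s = per_cpu_ptr(pcpu, cpu);

            spin_lock(&s->lock);
            /* ... operate on s->scribble ... */
            spin_unlock(&s->lock);
            put_cpu_light();
    }
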
diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index 89ec735929c3..9c1dc0767bb9 100644
--- a/drivers/scsi/fcoe/fcoe.c
+++ b/drivers/scsi/fcoe/fcoe.c
@@ -1452,11 +1452,11 @@ err2:
static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
{
struct fcoe_percpu_s *fps;
- int rc;
+ int rc, cpu = get_cpu_light();
- fps = &get_cpu_var(fcoe_percpu);
+ fps = &per_cpu(fcoe_percpu, cpu);
rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
- put_cpu_var(fcoe_percpu);
+ put_cpu_light();
return rc;
}
@@ -1641,11 +1641,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
return 0;
}
- stats = per_cpu_ptr(lport->stats, get_cpu());
+ stats = per_cpu_ptr(lport->stats, get_cpu_light());
stats->InvalidCRCCount++;
if (stats->InvalidCRCCount < 5)
printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
- put_cpu();
+ put_cpu_light();
return -EINVAL;
}
@@ -1686,7 +1686,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
*/
hp = (struct fcoe_hdr *) skb_network_header(skb);
- stats = per_cpu_ptr(lport->stats, get_cpu());
+ stats = per_cpu_ptr(lport->stats, get_cpu_light());
if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
if (stats->ErrorFrames < 5)
printk(KERN_WARNING "fcoe: FCoE version "
@@ -1718,13 +1718,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
goto drop;
if (!fcoe_filter_frames(lport, fp)) {
- put_cpu();
+ put_cpu_light();
fc_exch_recv(lport, fp);
return;
}
drop:
stats->ErrorFrames++;
- put_cpu();
+ put_cpu_light();
kfree_skb(skb);
}
diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
index 1756a0ac6f08..3a2cbf35ea3d 100644
--- a/drivers/scsi/fcoe/fcoe_ctlr.c
+++ b/drivers/scsi/fcoe/fcoe_ctlr.c
@@ -828,7 +828,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
INIT_LIST_HEAD(&del_list);
- stats = per_cpu_ptr(fip->lp->stats, get_cpu());
+ stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
@@ -864,7 +864,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
sel_time = fcf->time;
}
}
- put_cpu();
+ put_cpu_light();
list_for_each_entry_safe(fcf, next, &del_list, list) {
/* Removes fcf from current list */
diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
index 841000445b9a..26d661ddc950 100644
--- a/drivers/scsi/libfc/fc_exch.c
+++ b/drivers/scsi/libfc/fc_exch.c
@@ -825,10 +825,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
}
memset(ep, 0, sizeof(*ep));
- cpu = get_cpu();
+ cpu = get_cpu_light();
pool = per_cpu_ptr(mp->pool, cpu);
spin_lock_bh(&pool->lock);
- put_cpu();
+ put_cpu_light();
/* peek cache of free slot */
if (pool->left != FC_XID_UNKNOWN) {
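
The fcoe and libfc hunks are a single substitution repeated: get_cpu()/put_cpu() around per-CPU statistics and pools only needs to keep the task on one CPU, not to disable preemption, so the RT helpers get_cpu_light()/put_cpu_light() are used instead. As an assumption based on how the RT series usually defines them (the definitions are not part of this diff), the helpers map roughly as follows:

    /*
     * Assumed mapping of the RT helpers used above; on !RT they behave like
     * get_cpu()/put_cpu(), on RT they only disable migration.
     */
    #ifndef CONFIG_PREEMPT_RT
    # define get_cpu_light()        get_cpu()
    # define put_cpu_light()        put_cpu()
    #else
    # define get_cpu_light()        ({ migrate_disable(); smp_processor_id(); })
    # define put_cpu_light()        migrate_enable()
    #endif
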
diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h
index 6473361525d1..b52ba054a4da 100644
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -132,12 +132,55 @@ static inline void serial_dl_write(struct uart_8250_port *up, int value)
up->dl_write(up, value);
}
+static inline void serial8250_set_IER(struct uart_8250_port *up,
+ unsigned char ier)
+{
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+ bool is_console;
+
+ is_console = uart_console(port);
+
+ if (is_console)
+ console_atomic_lock(&flags);
+
+ serial_out(up, UART_IER, ier);
+
+ if (is_console)
+ console_atomic_unlock(flags);
+}
+
+static inline unsigned char serial8250_clear_IER(struct uart_8250_port *up)
+{
+ struct uart_port *port = &up->port;
+ unsigned int clearval = 0;
+ unsigned int prior;
+ unsigned int flags;
+ bool is_console;
+
+ is_console = uart_console(port);
+
+ if (up->capabilities & UART_CAP_UUE)
+ clearval = UART_IER_UUE;
+
+ if (is_console)
+ console_atomic_lock(&flags);
+
+ prior = serial_port_in(port, UART_IER);
+ serial_port_out(port, UART_IER, clearval);
+
+ if (is_console)
+ console_atomic_unlock(flags);
+
+ return prior;
+}
+
static inline bool serial8250_set_THRI(struct uart_8250_port *up)
{
if (up->ier & UART_IER_THRI)
return false;
up->ier |= UART_IER_THRI;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
return true;
}
@@ -146,7 +189,7 @@ static inline bool serial8250_clear_THRI(struct uart_8250_port *up)
if (!(up->ier & UART_IER_THRI))
return false;
up->ier &= ~UART_IER_THRI;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
return true;
}
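
Every direct UART_IER access in the 8250 code is funneled through serial8250_set_IER()/serial8250_clear_IER() so that, on a console port, IER updates serialize against the new atomic console write path, which runs under the same console_atomic_lock(). A hedged usage sketch from a caller's point of view (my_8250_quiesce() is hypothetical; the helpers come from the hunk above):

    static void my_8250_quiesce(struct uart_8250_port *up)
    {
            unsigned char saved;

            /* save and mask IER; takes the console atomic lock on console ports */
            saved = serial8250_clear_IER(up);

            /* ... poke the hardware with its interrupts masked ... */

            /* restore the previous interrupt mask */
            serial8250_set_IER(up, saved);
    }
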
diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index cae61d1ebec5..47dd23056271 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -274,10 +274,8 @@ static void serial8250_backup_timeout(struct timer_list *t)
* Must disable interrupts or else we risk racing with the interrupt
* based handler.
*/
- if (up->port.irq) {
- ier = serial_in(up, UART_IER);
- serial_out(up, UART_IER, 0);
- }
+ if (up->port.irq)
+ ier = serial8250_clear_IER(up);
iir = serial_in(up, UART_IIR);
@@ -300,7 +298,7 @@ static void serial8250_backup_timeout(struct timer_list *t)
serial8250_tx_chars(up);
if (up->port.irq)
- serial_out(up, UART_IER, ier);
+ serial8250_set_IER(up, ier);
spin_unlock_irqrestore(&up->port.lock, flags);
@@ -578,6 +576,14 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev)
#ifdef CONFIG_SERIAL_8250_CONSOLE
+static void univ8250_console_write_atomic(struct console *co, const char *s,
+ unsigned int count)
+{
+ struct uart_8250_port *up = &serial8250_ports[co->index];
+
+ serial8250_console_write_atomic(up, s, count);
+}
+
static void univ8250_console_write(struct console *co, const char *s,
unsigned int count)
{
@@ -671,6 +677,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx,
static struct console univ8250_console = {
.name = "ttyS",
+ .write_atomic = univ8250_console_write_atomic,
.write = univ8250_console_write,
.device = uart_console_device,
.setup = univ8250_console_setup,
diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c
index fc65a2293ce9..d659e05c2e60 100644
--- a/drivers/tty/serial/8250/8250_fsl.c
+++ b/drivers/tty/serial/8250/8250_fsl.c
@@ -60,9 +60,18 @@ int fsl8250_handle_irq(struct uart_port *port)
/* Stop processing interrupts on input overrun */
if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) {
+ unsigned int ca_flags;
unsigned long delay;
+ bool is_console;
+ is_console = uart_console(port);
+
+ if (is_console)
+ console_atomic_lock(&ca_flags);
up->ier = port->serial_in(port, UART_IER);
+ if (is_console)
+ console_atomic_unlock(ca_flags);
+
if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) {
port->ops->stop_rx(port);
} else {
diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c
index 988bf6bcce42..bcd26d672539 100644
--- a/drivers/tty/serial/8250/8250_ingenic.c
+++ b/drivers/tty/serial/8250/8250_ingenic.c
@@ -146,6 +146,8 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart",
static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value)
{
+ unsigned int flags;
+ bool is_console;
int ier;
switch (offset) {
@@ -167,7 +169,12 @@ static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value)
* If we have enabled modem status IRQs we should enable
* modem mode.
*/
+ is_console = uart_console(p);
+ if (is_console)
+ console_atomic_lock(&flags);
ier = p->serial_in(p, UART_IER);
+ if (is_console)
+ console_atomic_unlock(flags);
if (ier & UART_IER_MSI)
value |= UART_MCR_MDCE | UART_MCR_FCM;
diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c
index fb65dc601b23..5bc734c70da4 100644
--- a/drivers/tty/serial/8250/8250_mtk.c
+++ b/drivers/tty/serial/8250/8250_mtk.c
@@ -218,12 +218,37 @@ static void mtk8250_shutdown(struct uart_port *port)
static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask)
{
- serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask));
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+ unsigned int ier;
+ bool is_console;
+
+ is_console = uart_console(port);
+
+ if (is_console)
+ console_atomic_lock(&flags);
+
+ ier = serial_in(up, UART_IER);
+ serial_out(up, UART_IER, ier & (~mask));
+
+ if (is_console)
+ console_atomic_unlock(flags);
}
static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask)
{
- serial_out(up, UART_IER, serial_in(up, UART_IER) | mask);
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+ unsigned int ier;
+
+ if (uart_console(port))
+ console_atomic_lock(&flags);
+
+ ier = serial_in(up, UART_IER);
+ serial_out(up, UART_IER, ier | mask);
+
+ if (uart_console(port))
+ console_atomic_unlock(flags);
}
static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode)
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 9422284bb3f3..253470fbd3c4 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -761,7 +761,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep)
serial_out(p, UART_EFR, UART_EFR_ECB);
serial_out(p, UART_LCR, 0);
}
- serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0);
+ serial8250_set_IER(p, sleep ? UART_IERX_SLEEP : 0);
if (p->capabilities & UART_CAP_EFR) {
serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
serial_out(p, UART_EFR, efr);
@@ -1435,7 +1435,7 @@ static void serial8250_stop_rx(struct uart_port *port)
up->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
up->port.read_status_mask &= ~UART_LSR_DR;
- serial_port_out(port, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
serial8250_rpm_put(up);
}
@@ -1465,7 +1465,7 @@ void serial8250_em485_stop_tx(struct uart_8250_port *p)
serial8250_clear_and_reinit_fifos(p);
p->ier |= UART_IER_RLSI | UART_IER_RDI;
- serial_port_out(&p->port, UART_IER, p->ier);
+ serial8250_set_IER(p, p->ier);
}
}
EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx);
@@ -1687,7 +1687,7 @@ static void serial8250_disable_ms(struct uart_port *port)
mctrl_gpio_disable_ms(up->gpios);
up->ier &= ~UART_IER_MSI;
- serial_port_out(port, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
}
static void serial8250_enable_ms(struct uart_port *port)
@@ -1703,7 +1703,7 @@ static void serial8250_enable_ms(struct uart_port *port)
up->ier |= UART_IER_MSI;
serial8250_rpm_get(up);
- serial_port_out(port, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
serial8250_rpm_put(up);
}
@@ -2131,14 +2131,7 @@ static void serial8250_put_poll_char(struct uart_port *port,
struct uart_8250_port *up = up_to_u8250p(port);
serial8250_rpm_get(up);
- /*
- * First save the IER then disable the interrupts
- */
- ier = serial_port_in(port, UART_IER);
- if (up->capabilities & UART_CAP_UUE)
- serial_port_out(port, UART_IER, UART_IER_UUE);
- else
- serial_port_out(port, UART_IER, 0);
+ ier = serial8250_clear_IER(up);
wait_for_xmitr(up, BOTH_EMPTY);
/*
@@ -2151,7 +2144,7 @@ static void serial8250_put_poll_char(struct uart_port *port,
* and restore the IER
*/
wait_for_xmitr(up, BOTH_EMPTY);
- serial_port_out(port, UART_IER, ier);
+ serial8250_set_IER(up, ier);
serial8250_rpm_put(up);
}
@@ -2454,7 +2447,7 @@ void serial8250_do_shutdown(struct uart_port *port)
*/
spin_lock_irqsave(&port->lock, flags);
up->ier = 0;
- serial_port_out(port, UART_IER, 0);
+ serial8250_set_IER(up, 0);
spin_unlock_irqrestore(&port->lock, flags);
synchronize_irq(port->irq);
@@ -2797,7 +2790,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios,
if (up->capabilities & UART_CAP_RTOIE)
up->ier |= UART_IER_RTOIE;
- serial_port_out(port, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
if (up->capabilities & UART_CAP_EFR) {
unsigned char efr = 0;
@@ -3263,7 +3256,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults);
#ifdef CONFIG_SERIAL_8250_CONSOLE
-static void serial8250_console_putchar(struct uart_port *port, int ch)
+static void serial8250_console_putchar_locked(struct uart_port *port, int ch)
{
struct uart_8250_port *up = up_to_u8250p(port);
@@ -3271,6 +3264,18 @@ static void serial8250_console_putchar(struct uart_port *port, int ch)
serial_port_out(port, UART_TX, ch);
}
+static void serial8250_console_putchar(struct uart_port *port, int ch)
+{
+ struct uart_8250_port *up = up_to_u8250p(port);
+ unsigned int flags;
+
+ wait_for_xmitr(up, UART_LSR_THRE);
+
+ console_atomic_lock(&flags);
+ serial8250_console_putchar_locked(port, ch);
+ console_atomic_unlock(flags);
+}
+
/*
* Restore serial console when h/w power-off detected
*/
@@ -3292,6 +3297,32 @@ static void serial8250_console_restore(struct uart_8250_port *up)
serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS);
}
+void serial8250_console_write_atomic(struct uart_8250_port *up,
+ const char *s, unsigned int count)
+{
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+ unsigned int ier;
+
+ console_atomic_lock(&flags);
+
+ touch_nmi_watchdog();
+
+ ier = serial8250_clear_IER(up);
+
+ if (atomic_fetch_inc(&up->console_printing)) {
+ uart_console_write(port, "\n", 1,
+ serial8250_console_putchar_locked);
+ }
+ uart_console_write(port, s, count, serial8250_console_putchar_locked);
+ atomic_dec(&up->console_printing);
+
+ wait_for_xmitr(up, BOTH_EMPTY);
+ serial8250_set_IER(up, ier);
+
+ console_atomic_unlock(flags);
+}
+
/*
* Print a string to the serial port trying not to disturb
* any possible real use of the port...
@@ -3308,24 +3339,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
struct uart_port *port = &up->port;
unsigned long flags;
unsigned int ier;
- int locked = 1;
touch_nmi_watchdog();
- if (oops_in_progress)
- locked = spin_trylock_irqsave(&port->lock, flags);
- else
- spin_lock_irqsave(&port->lock, flags);
-
- /*
- * First save the IER then disable the interrupts
- */
- ier = serial_port_in(port, UART_IER);
+ spin_lock_irqsave(&port->lock, flags);
- if (up->capabilities & UART_CAP_UUE)
- serial_port_out(port, UART_IER, UART_IER_UUE);
- else
- serial_port_out(port, UART_IER, 0);
+ ier = serial8250_clear_IER(up);
/* check scratch reg to see if port powered off during system sleep */
if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) {
@@ -3339,7 +3358,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
mdelay(port->rs485.delay_rts_before_send);
}
+ atomic_inc(&up->console_printing);
uart_console_write(port, s, count, serial8250_console_putchar);
+ atomic_dec(&up->console_printing);
/*
* Finally, wait for transmitter to become empty
@@ -3352,8 +3373,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
if (em485->tx_stopped)
up->rs485_stop_tx(up);
}
-
- serial_port_out(port, UART_IER, ier);
+ serial8250_set_IER(up, ier);
/*
* The receive handling will happen properly because the
@@ -3365,8 +3385,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
if (up->msr_saved_flags)
serial8250_modem_status(up);
- if (locked)
- spin_unlock_irqrestore(&port->lock, flags);
+ spin_unlock_irqrestore(&port->lock, flags);
}
static unsigned int probe_baud(struct uart_port *port)
@@ -3386,6 +3405,7 @@ static unsigned int probe_baud(struct uart_port *port)
int serial8250_console_setup(struct uart_port *port, char *options, bool probe)
{
+ struct uart_8250_port *up = up_to_u8250p(port);
int baud = 9600;
int bits = 8;
int parity = 'n';
@@ -3395,6 +3415,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe)
if (!port->iobase && !port->membase)
return -ENODEV;
+ atomic_set(&up->console_printing, 0);
+
if (options)
uart_parse_options(options, &baud, &parity, &bits, &flow);
else if (probe)
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index 78682c12156a..401513bb0bb6 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -2199,18 +2199,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
{
struct uart_amba_port *uap = amba_ports[co->index];
unsigned int old_cr = 0, new_cr;
- unsigned long flags;
+ unsigned long flags = 0;
int locked = 1;
clk_enable(uap->clk);
- local_irq_save(flags);
+ /*
+ * local_irq_save(flags);
+ *
+ * This local_irq_save() is nonsense. If we come in via sysrq
+ * handling then interrupts are already disabled. Aside from
+ * that, the port.sysrq check is racy on SMP regardless.
+ */
if (uap->port.sysrq)
locked = 0;
else if (oops_in_progress)
- locked = spin_trylock(&uap->port.lock);
+ locked = spin_trylock_irqsave(&uap->port.lock, flags);
else
- spin_lock(&uap->port.lock);
+ spin_lock_irqsave(&uap->port.lock, flags);
/*
* First save the CR then disable the interrupts
@@ -2236,8 +2242,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
pl011_write(old_cr, uap, REG_CR);
if (locked)
- spin_unlock(&uap->port.lock);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&uap->port.lock, flags);
clk_disable(uap->clk);
}
diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
index 84e8158088cd..342005ed5ebf 100644
--- a/drivers/tty/serial/omap-serial.c
+++ b/drivers/tty/serial/omap-serial.c
@@ -1311,13 +1311,10 @@ serial_omap_console_write(struct console *co, const char *s,
pm_runtime_get_sync(up->dev);
- local_irq_save(flags);
- if (up->port.sysrq)
- locked = 0;
- else if (oops_in_progress)
- locked = spin_trylock(&up->port.lock);
+ if (up->port.sysrq || oops_in_progress)
+ locked = spin_trylock_irqsave(&up->port.lock, flags);
else
- spin_lock(&up->port.lock);
+ spin_lock_irqsave(&up->port.lock, flags);
/*
* First save the IER then disable the interrupts
@@ -1346,8 +1343,7 @@ serial_omap_console_write(struct console *co, const char *s,
pm_runtime_mark_last_busy(up->dev);
pm_runtime_put_autosuspend(up->dev);
if (locked)
- spin_unlock(&up->port.lock);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&up->port.lock, flags);
}
static int __init
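
Both console write paths above drop the separate local_irq_save() around the port lock and use the irqsave lock variants instead. On PREEMPT_RT the port lock is a sleeping lock, so hard-disabling interrupts around it would be wrong, while on !RT spin_lock_irqsave() keeps the old behaviour. The general shape of the conversion, on a hypothetical driver structure:

    struct my_port {
            spinlock_t lock;
            unsigned int sysrq;
    };

    static void my_console_write(struct my_port *p, const char *s, unsigned int n)
    {
            unsigned long flags = 0;
            int locked = 1;

            if (p->sysrq || oops_in_progress)
                    locked = spin_trylock_irqsave(&p->lock, flags);
            else
                    spin_lock_irqsave(&p->lock, flags);

            /* ... emit the characters ... */

            if (locked)
                    spin_unlock_irqrestore(&p->lock, flags);
    }
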
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index dae9a57d7ec0..9a6a0ec4d1fb 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -239,7 +239,7 @@ int afs_silly_iput(struct dentry *dentry, struct inode *inode)
struct dentry *alias;
int ret;
- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
_enter("%p{%pd},%llx", dentry, dentry, vnode->fid.vnode);
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 63bfc533c9fb..a749570a3142 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -82,7 +82,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
struct inode *inode;
struct super_block *sb = parent->d_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
diff --git a/fs/dcache.c b/fs/dcache.c
index 9508bd57a3bc..654b4c9d4f8b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2539,9 +2539,10 @@ EXPORT_SYMBOL(d_rehash);
static inline unsigned start_dir_add(struct inode *dir)
{
+ preempt_disable_rt();
for (;;) {
- unsigned n = dir->i_dir_seq;
- if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
+ unsigned n = dir->__i_dir_seq;
+ if (!(n & 1) && cmpxchg(&dir->__i_dir_seq, n, n + 1) == n)
return n;
cpu_relax();
}
@@ -2549,26 +2550,30 @@ static inline unsigned start_dir_add(struct inode *dir)
static inline void end_dir_add(struct inode *dir, unsigned n)
{
- smp_store_release(&dir->i_dir_seq, n + 2);
+ smp_store_release(&dir->__i_dir_seq, n + 2);
+ preempt_enable_rt();
}
static void d_wait_lookup(struct dentry *dentry)
{
- if (d_in_lookup(dentry)) {
- DECLARE_WAITQUEUE(wait, current);
- add_wait_queue(dentry->d_wait, &wait);
- do {
- set_current_state(TASK_UNINTERRUPTIBLE);
- spin_unlock(&dentry->d_lock);
- schedule();
- spin_lock(&dentry->d_lock);
- } while (d_in_lookup(dentry));
- }
+ struct swait_queue __wait;
+
+ if (!d_in_lookup(dentry))
+ return;
+
+ INIT_LIST_HEAD(&__wait.task_list);
+ do {
+ prepare_to_swait_exclusive(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&dentry->d_lock);
+ schedule();
+ spin_lock(&dentry->d_lock);
+ } while (d_in_lookup(dentry));
+ finish_swait(dentry->d_wait, &__wait);
}
struct dentry *d_alloc_parallel(struct dentry *parent,
const struct qstr *name,
- wait_queue_head_t *wq)
+ struct swait_queue_head *wq)
{
unsigned int hash = name->hash;
struct hlist_bl_head *b = in_lookup_hash(parent, hash);
@@ -2582,7 +2587,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
retry:
rcu_read_lock();
- seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
+ seq = smp_load_acquire(&parent->d_inode->__i_dir_seq);
r_seq = read_seqbegin(&rename_lock);
dentry = __d_lookup_rcu(parent, name, &d_seq);
if (unlikely(dentry)) {
@@ -2610,7 +2615,7 @@ retry:
}
hlist_bl_lock(b);
- if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
+ if (unlikely(READ_ONCE(parent->d_inode->__i_dir_seq) != seq)) {
hlist_bl_unlock(b);
rcu_read_unlock();
goto retry;
@@ -2683,7 +2688,7 @@ void __d_lookup_done(struct dentry *dentry)
hlist_bl_lock(b);
dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
__hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
- wake_up_all(dentry->d_wait);
+ swake_up_all(dentry->d_wait);
dentry->d_wait = NULL;
hlist_bl_unlock(b);
INIT_HLIST_NODE(&dentry->d_u.d_alias);
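
d_wait_lookup() and d_alloc_parallel() switch from wait_queue_head_t to the simple waitqueue (swait) API; the usual RT motivation is that swait is backed by a raw lock and can be used where the regular waitqueue's lock, a sleeping lock on RT, cannot. The filesystem hunks that follow only have to change the on-stack declaration; a sketch of the caller-side pattern (my_lookup_child() is hypothetical):

    #include <linux/dcache.h>
    #include <linux/swait.h>

    static struct dentry *my_lookup_child(struct dentry *parent,
                                          const struct qstr *name)
    {
            DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);

            /* same call as before, only the type of the waitqueue head changed */
            return d_alloc_parallel(parent, name, &wq);
    }
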
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index bc267832310c..3176913fae6c 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -158,7 +158,7 @@ static int fuse_direntplus_link(struct file *file,
struct inode *dir = d_inode(parent);
struct fuse_conn *fc;
struct inode *inode;
- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
if (!o->nodeid) {
/*
diff --git a/fs/inode.c b/fs/inode.c
index 78de5b5dc084..646fe09ba5ec 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -157,7 +157,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_pipe = NULL;
inode->i_cdev = NULL;
inode->i_link = NULL;
- inode->i_dir_seq = 0;
+ inode->__i_dir_seq = 0;
inode->i_rdev = 0;
inode->dirtied_when = 0;
diff --git a/fs/namei.c b/fs/namei.c
index 79b0ff9b151e..4699d4a6038f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1605,7 +1605,7 @@ static struct dentry *__lookup_slow(const struct qstr *name,
{
struct dentry *dentry, *old;
struct inode *inode = dir->d_inode;
- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
/* Don't go there if it's already dead */
if (unlikely(IS_DEADDIR(inode)))
@@ -3127,7 +3127,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
struct dentry *dentry;
int error, create_error = 0;
umode_t mode = op->mode;
- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
if (unlikely(IS_DEADDIR(dir_inode)))
return ERR_PTR(-ENOENT);
diff --git a/fs/namespace.c b/fs/namespace.c
index 670ba5250137..7336bb1037b8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -14,6 +14,7 @@
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
+#include <linux/hrtimer.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/idr.h>
@@ -342,8 +343,11 @@ int __mnt_want_write(struct vfsmount *m)
* incremented count after it has set MNT_WRITE_HOLD.
*/
smp_mb();
- while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
- cpu_relax();
+ while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
+ preempt_enable();
+ cpu_chill();
+ preempt_disable();
+ }
/*
* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
* be set to match its requirements. So we must not load that until
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 1a6d2867fba4..2e67080475c5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -636,7 +636,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry,
unsigned long dir_verifier)
{
struct qstr filename = QSTR_INIT(entry->name, entry->len);
- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
struct dentry *dentry;
struct dentry *alias;
struct inode *inode;
@@ -1876,7 +1876,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned open_flags,
umode_t mode)
{
- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
struct nfs_open_context *ctx;
struct dentry *res;
struct iattr attr = { .ia_valid = ATTR_OPEN };
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 5fa11e1aca4c..984f26eb888c 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -13,7 +13,7 @@
#include <linux/sunrpc/clnt.h>
#include <linux/nfs_fs.h>
#include <linux/sched.h>
-#include <linux/wait.h>
+#include <linux/swait.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
@@ -180,7 +180,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
data->cred = get_current_cred();
data->res.dir_attr = &data->dir_attr;
- init_waitqueue_head(&data->wq);
+ init_swait_queue_head(&data->wq);
status = -EBUSY;
spin_lock(&dentry->d_lock);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 06150eb56a60..8c45129eca02 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -95,6 +95,7 @@
#include <linux/posix-timers.h>
#include <linux/time_namespace.h>
#include <linux/resctrl.h>
+#include <linux/swait.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
@@ -2037,7 +2038,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
child = d_hash_and_lookup(dir, &qname);
if (!child) {
- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
child = d_alloc_parallel(dir, &qname, &wq);
if (IS_ERR(child))
goto end_instantiate;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index dea0f5ee540c..8872fbf4803b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -678,7 +678,7 @@ static bool proc_sys_fill_cache(struct file *file,
child = d_lookup(dir, &qname);
if (!child) {
- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
child = d_alloc_parallel(dir, &qname, &wq);
if (IS_ERR(child))
return false;
diff --git a/include/asm-generic/softirq_stack.h b/include/asm-generic/softirq_stack.h
index eceeecf6a5bd..d3e2d81656e0 100644
--- a/include/asm-generic/softirq_stack.h
+++ b/include/asm-generic/softirq_stack.h
@@ -2,7 +2,7 @@
#ifndef __ASM_GENERIC_SOFTIRQ_STACK_H
#define __ASM_GENERIC_SOFTIRQ_STACK_H
-#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK
+#if defined(CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK) && !defined(CONFIG_PREEMPT_RT)
void do_softirq_own_stack(void);
#else
static inline void do_softirq_own_stack(void)
diff --git a/include/linux/console.h b/include/linux/console.h
index 20874db50bc8..851daf13de0a 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -16,6 +16,19 @@
#include <linux/atomic.h>
#include <linux/types.h>
+#include <linux/printk.h>
+#include <linux/seqlock.h>
+
+#ifdef CONFIG_PRINTK_NMI
+#define PRINTK_CTX_NUM 2
+#else
+#define PRINTK_CTX_NUM 1
+#endif
+
+struct latched_seq {
+ seqcount_latch_t latch;
+ u64 val[2];
+};
struct vc_data;
struct console_font_op;
@@ -136,10 +149,12 @@ static inline int con_debug_leave(void)
#define CON_ANYTIME (16) /* Safe to call when cpu is offline */
#define CON_BRL (32) /* Used for a braille device */
#define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */
+#define CON_HANDOVER (128) /* Device was previously a boot console. */
struct console {
char name[16];
void (*write)(struct console *, const char *, unsigned);
+ void (*write_atomic)(struct console *co, const char *s, unsigned int count);
int (*read)(struct console *, char *, unsigned);
struct tty_driver *(*device)(struct console *, int *);
void (*unblank)(void);
@@ -149,6 +164,13 @@ struct console {
short flags;
short index;
int cflag;
+#ifdef CONFIG_PRINTK
+ char sync_buf[CONSOLE_LOG_MAX];
+#endif
+ struct latched_seq printk_seq;
+ struct latched_seq printk_sync_seq[PRINTK_CTX_NUM];
+
+ struct task_struct *thread;
void *data;
struct console *next;
};
@@ -229,4 +251,8 @@ extern void console_init(void);
void dummycon_register_output_notifier(struct notifier_block *nb);
void dummycon_unregister_output_notifier(struct notifier_block *nb);
+extern void console_atomic_lock(unsigned int *flags);
+extern void console_atomic_unlock(unsigned int flags);
+extern bool console_atomic_kgdb_cpu_delay(unsigned int cpu);
+
#endif /* _LINUX_CONSOLE_H */
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 9e23d33bb6f1..9f89d4887e35 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -108,7 +108,7 @@ struct dentry {
union {
struct list_head d_lru; /* LRU list */
- wait_queue_head_t *d_wait; /* in-lookup ones only */
+ struct swait_queue_head *d_wait; /* in-lookup ones only */
};
struct list_head d_child; /* child of parent list */
struct list_head d_subdirs; /* our children */
@@ -240,7 +240,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op
extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
extern struct dentry * d_alloc_anon(struct super_block *);
extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
- wait_queue_head_t *);
+ struct swait_queue_head *);
extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index edb5c186b0b7..3f49e65169c6 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -3,8 +3,7 @@
#define __LINUX_DEBUG_LOCKING_H
#include <linux/atomic.h>
-#include <linux/bug.h>
-#include <linux/printk.h>
+#include <linux/cache.h>
struct task_struct;
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 2e2b8d6140ed..572fac3dd288 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -59,7 +59,7 @@
#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
+ _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
ARCH_EXIT_TO_USER_MODE_WORK)
/**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 93eb43e002d9..a47dcfa53b83 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -699,7 +699,7 @@ struct inode {
struct pipe_inode_info *i_pipe;
struct cdev *i_cdev;
char *i_link;
- unsigned i_dir_seq;
+ unsigned __i_dir_seq;
};
__u32 i_generation;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 69bc86ea382c..76878b357ffa 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -116,7 +116,6 @@ extern void rcu_nmi_exit(void);
do { \
lockdep_off(); \
arch_nmi_enter(); \
- printk_nmi_enter(); \
BUG_ON(in_nmi() == NMI_MASK); \
__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
} while (0)
@@ -135,7 +134,6 @@ extern void rcu_nmi_exit(void);
do { \
BUG_ON(!in_nmi()); \
__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \
- printk_nmi_exit(); \
arch_nmi_exit(); \
lockdep_on(); \
} while (0)
diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h
index 7902c7d8b55f..4aa1031d3e4c 100644
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -90,7 +90,11 @@ static inline void __kunmap_local(void *vaddr)
static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
- preempt_disable();
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ migrate_disable();
+ else
+ preempt_disable();
+
pagefault_disable();
return __kmap_local_page_prot(page, prot);
}
@@ -102,7 +106,11 @@ static inline void *kmap_atomic(struct page *page)
static inline void *kmap_atomic_pfn(unsigned long pfn)
{
- preempt_disable();
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ migrate_disable();
+ else
+ preempt_disable();
+
pagefault_disable();
return __kmap_local_pfn_prot(pfn, kmap_prot);
}
@@ -111,7 +119,10 @@ static inline void __kunmap_atomic(void *addr)
{
kunmap_local_indexed(addr);
pagefault_enable();
- preempt_enable();
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ migrate_enable();
+ else
+ preempt_enable();
}
unsigned int __nr_free_highpages(void);
@@ -179,7 +190,10 @@ static inline void __kunmap_local(void *addr)
static inline void *kmap_atomic(struct page *page)
{
- preempt_disable();
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ migrate_disable();
+ else
+ preempt_disable();
pagefault_disable();
return page_address(page);
}
@@ -200,7 +214,10 @@ static inline void __kunmap_atomic(void *addr)
kunmap_flush_on_unmap(addr);
#endif
pagefault_enable();
- preempt_enable();
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ migrate_enable();
+ else
+ preempt_enable();
}
static inline unsigned int nr_free_highpages(void) { return 0; }
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index bb5e7b0a4274..7d4768c1ae3d 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -42,6 +42,7 @@ enum hrtimer_mode {
HRTIMER_MODE_PINNED = 0x02,
HRTIMER_MODE_SOFT = 0x04,
HRTIMER_MODE_HARD = 0x08,
+ HRTIMER_MODE_CHILL = 0x10,
HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,
@@ -124,6 +125,7 @@ struct hrtimer {
u8 is_rel;
u8 is_soft;
u8 is_hard;
+ u8 is_chill;
};
/**
@@ -540,4 +542,10 @@ int hrtimers_dead_cpu(unsigned int cpu);
#define hrtimers_dead_cpu NULL
#endif
+#ifdef CONFIG_PREEMPT_RT
+extern void cpu_chill(void);
+#else
+# define cpu_chill() cpu_relax()
+#endif
+
#endif
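
cpu_chill() is the RT replacement for cpu_relax() in retry loops such as the __mnt_want_write() hunk earlier: instead of spinning, and possibly starving the preempted holder of the bit or lock being waited on, the task sleeps briefly. The header only declares it and adds the HRTIMER_MODE_CHILL/is_chill plumbing; the body below is an assumption about what such an implementation could look like (simplified, and it does not use the new CHILL mode bit), not code from this series:

    #ifdef CONFIG_PREEMPT_RT
    /* Sleep for roughly a millisecond so the preempted owner can make progress. */
    void cpu_chill(void)
    {
            ktime_t chill_time = ktime_set(0, NSEC_PER_MSEC);

            set_current_state(TASK_UNINTERRUPTIBLE);
            schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD);
    }
    #endif
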
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index ec2a47a81e42..dbbef9089789 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -64,4 +64,10 @@ static inline void irq_work_run(void) { }
static inline void irq_work_single(void *arg) { }
#endif
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT)
+void irq_work_tick_soft(void);
+#else
+static inline void irq_work_tick_soft(void) { }
+#endif
+
#endif /* _LINUX_IRQ_WORK_H */
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index df4651250785..aabd79e8e19a 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -68,6 +68,7 @@ struct irq_desc {
unsigned int irqs_unhandled;
atomic_t threads_handled;
int threads_handled_last;
+ u64 random_ip;
raw_spinlock_t lock;
struct cpumask *percpu_enabled;
const struct cpumask *percpu_affinity;
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 600c10da321a..4b140938b03e 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -71,14 +71,6 @@ do { \
do { \
__this_cpu_dec(hardirq_context); \
} while (0)
-# define lockdep_softirq_enter() \
-do { \
- current->softirq_context++; \
-} while (0)
-# define lockdep_softirq_exit() \
-do { \
- current->softirq_context--; \
-} while (0)
# define lockdep_hrtimer_enter(__hrtimer) \
({ \
@@ -140,6 +132,21 @@ do { \
# define lockdep_irq_work_exit(__work) do { } while (0)
#endif
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT)
+# define lockdep_softirq_enter() \
+do { \
+ current->softirq_context++; \
+} while (0)
+# define lockdep_softirq_exit() \
+do { \
+ current->softirq_context--; \
+} while (0)
+
+#else
+# define lockdep_softirq_enter() do { } while (0)
+# define lockdep_softirq_exit() do { } while (0)
+#endif
+
#if defined(CONFIG_IRQSOFF_TRACER) || \
defined(CONFIG_PREEMPT_TRACER)
extern void stop_critical_timings(void);
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 392a3670944c..67197bbdcba8 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -212,6 +212,8 @@ extern void kgdb_call_nmi_hook(void *ignored);
*/
extern void kgdb_roundup_cpus(void);
+extern void kgdb_roundup_cpu(unsigned int cpu);
+
/**
* kgdb_arch_set_pc - Generic call back to the program counter
* @regs: Current &struct pt_regs.
@@ -365,5 +367,6 @@ extern void kgdb_free_init_mem(void);
#define dbg_late_init()
static inline void kgdb_panic(const char *msg) {}
static inline void kgdb_free_init_mem(void) { }
+static inline void kgdb_roundup_cpu(unsigned int cpu) {}
#endif /* ! CONFIG_KGDB */
#endif /* _KGDB_H_ */
diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
index ded90b097e6e..f39fb2806164 100644
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -6,6 +6,8 @@
#include <linux/percpu-defs.h>
#include <linux/lockdep.h>
+#ifndef CONFIG_PREEMPT_RT
+
typedef struct {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
@@ -56,38 +58,98 @@ static inline void local_lock_acquire(local_lock_t *l) { }
static inline void local_lock_release(local_lock_t *l) { }
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+#define ll_preempt_disable() preempt_disable()
+#define ll_preempt_enable() preempt_enable()
+#define ll_local_irq_disable() local_irq_disable()
+#define ll_local_irq_enable() local_irq_enable()
+#define ll_local_irq_save(flags) local_irq_save(flags)
+#define ll_local_irq_restore(flags) local_irq_restore(flags)
+
+#else /* !CONFIG_PREEMPT_RT */
+
+/*
+ * The preempt RT mapping of local locks: a spinlock.
+ */
+typedef struct {
+ spinlock_t lock;
+} local_lock_t;
+
+#define INIT_LOCAL_LOCK(lockname) { \
+ __SPIN_LOCK_UNLOCKED((lockname).lock), \
+ }
+
+#define __local_lock_init(l) \
+do { \
+ spin_lock_init(&(l)->lock); \
+} while (0)
+
+static inline void local_lock_acquire(local_lock_t *l)
+{
+ spin_lock(&l->lock);
+}
+
+static inline void local_lock_release(local_lock_t *l)
+{
+ spin_unlock(&l->lock);
+}
+
+/*
+ * On RT enabled kernels the serialization is guaranteed by the spinlock in
+ * local_lock_t, so the only guarantee to make is to not leave the CPU.
+ */
+#define ll_preempt_disable() migrate_disable()
+#define ll_preempt_enable() migrate_enable()
+#define ll_local_irq_disable() migrate_disable()
+#define ll_local_irq_enable() migrate_enable()
+
+#define ll_local_irq_save(flags) \
+ do { \
+ typecheck(unsigned long, flags); \
+ flags = 0; \
+ migrate_disable(); \
+ } while (0)
+
+#define ll_local_irq_restore(flags) \
+ do { \
+ typecheck(unsigned long, flags); \
+ (void)flags; \
+ migrate_enable(); \
+ } while (0)
+
+#endif /* CONFIG_PREEMPT_RT */
+
#define __local_lock(lock) \
do { \
- preempt_disable(); \
+ ll_preempt_disable(); \
local_lock_acquire(this_cpu_ptr(lock)); \
} while (0)
#define __local_lock_irq(lock) \
do { \
- local_irq_disable(); \
+ ll_local_irq_disable(); \
local_lock_acquire(this_cpu_ptr(lock)); \
} while (0)
#define __local_lock_irqsave(lock, flags) \
do { \
- local_irq_save(flags); \
+ ll_local_irq_save(flags); \
local_lock_acquire(this_cpu_ptr(lock)); \
} while (0)
#define __local_unlock(lock) \
do { \
local_lock_release(this_cpu_ptr(lock)); \
- preempt_enable(); \
+ ll_preempt_enable(); \
} while (0)
#define __local_unlock_irq(lock) \
do { \
local_lock_release(this_cpu_ptr(lock)); \
- local_irq_enable(); \
+ ll_local_irq_enable(); \
} while (0)
#define __local_unlock_irqrestore(lock, flags) \
do { \
local_lock_release(this_cpu_ptr(lock)); \
- local_irq_restore(flags); \
+ ll_local_irq_restore(flags); \
} while (0)
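
The local_lock rework is what several of the earlier driver changes build on: with !PREEMPT_RT a local_lock_t is still just preempt or irq disabling plus lockdep annotations, while on RT it becomes a per-CPU spinlock_t and the irq-disabling flavours collapse into migrate_disable(). Call sites do not change. A hedged usage sketch of per-CPU data guarded by a local lock, in the style of mm/page_alloc.c but with illustrative names:

    #include <linux/local_lock.h>
    #include <linux/percpu.h>

    struct my_pcpu_cache {
            local_lock_t lock;
            unsigned long hits;
    };

    static DEFINE_PER_CPU(struct my_pcpu_cache, my_cache) = {
            .lock = INIT_LOCAL_LOCK(lock),
    };

    static void my_cache_hit(void)
    {
            unsigned long flags;

            /* !RT: local_irq_save(); RT: migrate_disable() + per-CPU spinlock */
            local_lock_irqsave(&my_cache.lock, flags);
            this_cpu_inc(my_cache.hits);
            local_unlock_irqrestore(&my_cache.lock, flags);
    }
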
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5043c0702c10..409376d4d7fa 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,7 @@
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/uprobes.h>
+#include <linux/rcupdate.h>
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
@@ -569,6 +570,9 @@ struct mm_struct {
bool tlb_flush_batched;
#endif
struct uprobes_state uprobes_state;
+#ifdef CONFIG_PREEMPT_RT
+ struct rcu_head delayed_drop;
+#endif
#ifdef CONFIG_HUGETLB_PAGE
atomic_long_t hugetlb_usage;
#endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0d53eba1c383..d7740c97b87e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -20,6 +20,7 @@
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
+#include <linux/local_lock.h>
#include <asm/page.h>
/* Free memory management - zoned buddy allocator. */
@@ -134,10 +135,10 @@ enum numa_stat_item {
NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */
NUMA_LOCAL, /* allocation from local node */
NUMA_OTHER, /* allocation from other node */
- NR_VM_NUMA_STAT_ITEMS
+ NR_VM_NUMA_EVENT_ITEMS
};
#else
-#define NR_VM_NUMA_STAT_ITEMS 0
+#define NR_VM_NUMA_EVENT_ITEMS 0
#endif
enum zone_stat_item {
@@ -337,24 +338,31 @@ enum zone_watermarks {
#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
+/* Fields and list protected by pagesets local_lock in page_alloc.c */
struct per_cpu_pages {
int count; /* number of pages in the list */
int high; /* high watermark, emptying needed */
int batch; /* chunk size for buddy add/remove */
+#ifdef CONFIG_NUMA
+ int expire; /* When 0, remote pagesets are drained */
+#endif
/* Lists of pages, one per migrate type stored on the pcp-lists */
struct list_head lists[MIGRATE_PCPTYPES];
};
-struct per_cpu_pageset {
- struct per_cpu_pages pcp;
-#ifdef CONFIG_NUMA
- s8 expire;
- u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
-#endif
+struct per_cpu_zonestat {
#ifdef CONFIG_SMP
- s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
+ s8 stat_threshold;
+#endif
+#ifdef CONFIG_NUMA
+ /*
+ * Low priority inaccurate counters that are only folded
+ * on demand. Use a large type to avoid the overhead of
+ * folding during refresh_cpu_vm_stats.
+ */
+ unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
#endif
};
@@ -484,7 +492,8 @@ struct zone {
int node;
#endif
struct pglist_data *zone_pgdat;
- struct per_cpu_pageset __percpu *pageset;
+ struct per_cpu_pages __percpu *per_cpu_pageset;
+ struct per_cpu_zonestat __percpu *per_cpu_zonestats;
/*
* the high and batch values are copied to individual pagesets for
* faster access
@@ -619,7 +628,7 @@ struct zone {
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
- atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
+ atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
} ____cacheline_internodealigned_in_smp;
enum pgdat_flags {
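
The mmzone.h change splits the old per_cpu_pageset into per_cpu_pages (the pcp lists, protected by the pagesets local_lock in page_alloc.c) and per_cpu_zonestat, and turns the NUMA_* counters into plain unsigned long events that are only folded into the zone-wide atomics on demand; that is what the fold_vm_numa_events() calls added to drivers/base/node.c are for. A simplified sketch of the fold-on-read idea (this is not the mm/vmstat.c implementation):

    /* fold one CPU's pending NUMA events into the zone-wide totals */
    static void my_fold_cpu_numa_events(struct zone *zone, int cpu)
    {
            struct per_cpu_zonestat *pzs =
                    per_cpu_ptr(zone->per_cpu_zonestats, cpu);
            int i;

            for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
                    unsigned long v = xchg(&pzs->vm_numa_event[i], 0);

                    atomic_long_add(v, &zone->vm_numa_event[i]);
            }
    }
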
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index e19323521f9c..2cfc234a786d 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -20,9 +20,6 @@
#include <linux/osq_lock.h>
#include <linux/debug_locks.h>
-struct ww_class;
-struct ww_acquire_ctx;
-
/*
* Simple, straightforward mutexes with strict semantics:
*
@@ -51,9 +48,30 @@ struct ww_acquire_ctx;
* - detects multi-task circular deadlocks and prints out all affected
* locks and tasks (and only those tasks)
*/
-struct mutex {
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
+ , .dep_map = { \
+ .name = #lockname, \
+ .wait_type_inner = LD_WAIT_SLEEP, \
+ }
+#else
+# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
+#endif
+
+/*
+ * Typedef _mutex_t for ww_mutex and core code to allow ww_mutex being
+ * built on the regular mutex code in RT kernels while mutex itself is
+ * substituted by an rt_mutex.
+ */
+#ifndef CONFIG_PREEMPT_RT
+typedef struct mutex
+#else
+typedef struct __mutex
+#endif
+{
atomic_long_t owner;
- spinlock_t wait_lock;
+ raw_spinlock_t wait_lock;
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
struct optimistic_spin_queue osq; /* Spinner MCS lock */
#endif
@@ -64,44 +82,24 @@ struct mutex {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
-};
+} _mutex_t;
-struct ww_mutex {
- struct mutex base;
- struct ww_acquire_ctx *ctx;
-#ifdef CONFIG_DEBUG_MUTEXES
- struct ww_class *ww_class;
-#endif
-};
-
-/*
- * This is the control structure for tasks blocked on mutex,
- * which resides on the blocked task's kernel stack:
- */
-struct mutex_waiter {
- struct list_head list;
- struct task_struct *task;
- struct ww_acquire_ctx *ww_ctx;
-#ifdef CONFIG_DEBUG_MUTEXES
- void *magic;
-#endif
-};
+extern void __mutex_t_init(_mutex_t *lock, const char *name,
+ struct lock_class_key *key);
+extern int _mutex_t_trylock(_mutex_t *lock);
+extern bool _mutex_t_is_locked(_mutex_t *lock);
#ifdef CONFIG_DEBUG_MUTEXES
-
-#define __DEBUG_MUTEX_INITIALIZER(lockname) \
+# define __DEBUG_MUTEX_INITIALIZER(lockname) \
, .magic = &lockname
-extern void mutex_destroy(struct mutex *lock);
-
+extern void _mutex_t_destroy(_mutex_t *lock);
#else
-
# define __DEBUG_MUTEX_INITIALIZER(lockname)
-
-static inline void mutex_destroy(struct mutex *lock) {}
-
+static inline void _mutex_t_destroy(_mutex_t *lock) {}
#endif
+#ifndef CONFIG_PREEMPT_RT
/**
* mutex_init - initialize the mutex
* @mutex: the mutex to be initialized
@@ -114,22 +112,17 @@ static inline void mutex_destroy(struct mutex *lock) {}
do { \
static struct lock_class_key __key; \
\
- __mutex_init((mutex), #mutex, &__key); \
+ __mutex_t_init((mutex), #mutex, &__key); \
} while (0)
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
- , .dep_map = { \
- .name = #lockname, \
- .wait_type_inner = LD_WAIT_SLEEP, \
- }
-#else
-# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
-#endif
+#define __mutex_init(mutex, name, key) \
+do { \
+ __mutex_t_init((mutex), name, key); \
+} while (0)
#define __MUTEX_INITIALIZER(lockname) \
{ .owner = ATOMIC_LONG_INIT(0) \
- , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
+ , .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
, .wait_list = LIST_HEAD_INIT(lockname.wait_list) \
__DEBUG_MUTEX_INITIALIZER(lockname) \
__DEP_MAP_MUTEX_INITIALIZER(lockname) }
@@ -137,8 +130,10 @@ do { \
#define DEFINE_MUTEX(mutexname) \
struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
-extern void __mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
+static __always_inline void mutex_destroy(struct mutex *lock)
+{
+ _mutex_t_destroy(lock);
+}
/**
* mutex_is_locked - is the mutex locked
@@ -146,7 +141,74 @@ extern void __mutex_init(struct mutex *lock, const char *name,
*
* Returns true if the mutex is locked, false if unlocked.
*/
-extern bool mutex_is_locked(struct mutex *lock);
+static __always_inline bool mutex_is_locked(struct mutex *lock)
+{
+ return _mutex_t_is_locked(lock);
+}
+
+/**
+ * mutex_trylock - try to acquire the mutex, without waiting
+ * @lock: the mutex to be acquired
+ *
+ * Try to acquire the mutex atomically. Returns 1 if the mutex
+ * has been acquired successfully, and 0 on contention.
+ *
+ * NOTE: this function follows the spin_trylock() convention, so
+ * it is negated from the down_trylock() return values! Be careful
+ * about this when converting semaphore users to mutexes.
+ *
+ * This function must not be used in interrupt context. The
+ * mutex must be released by the same task that acquired it.
+ */
+static __always_inline int mutex_trylock(struct mutex *lock)
+{
+ return _mutex_t_trylock(lock);
+}
+
+#else /* !CONFIG_PREEMPT_RT */
+/*
+ * Preempt-RT variant based on rtmutexes.
+ */
+#include <linux/rtmutex.h>
+
+struct mutex {
+ struct rt_mutex rtmutex;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+#define __MUTEX_INITIALIZER(mutexname) \
+ { \
+ .rtmutex = __RT_MUTEX_INITIALIZER(mutexname.rtmutex) \
+ __DEP_MAP_MUTEX_INITIALIZER(mutexname) \
+ }
+
+#define DEFINE_MUTEX(mutexname) \
+ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
+
+extern void __mutex_rt_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key);
+extern int mutex_trylock(struct mutex *lock);
+
+static inline void mutex_destroy(struct mutex *lock) { }
+
+#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->rtmutex)
+
+#define mutex_init(mutex) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ rt_mutex_init(&(mutex)->rtmutex); \
+ __mutex_rt_init((mutex), #mutex, &__key); \
+} while (0)
+
+#define __mutex_init(mutex, name, key) \
+do { \
+ rt_mutex_init(&(mutex)->rtmutex); \
+ __mutex_rt_init((mutex), name, key); \
+} while (0)
+#endif /* CONFIG_PREEMPT_RT */
/*
* See kernel/locking/mutex.c for detailed documentation of these APIs.
@@ -186,13 +248,6 @@ extern void mutex_lock_io(struct mutex *lock);
# define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock)
#endif
-/*
- * NOTE: mutex_trylock() follows the spin_trylock() convention,
- * not the down_trylock() convention!
- *
- * Returns 1 if the mutex has been acquired successfully, and 0 on contention.
- */
-extern int mutex_trylock(struct mutex *lock);
extern void mutex_unlock(struct mutex *lock);
extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
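As a quick illustration of the mutex_trylock() convention documented above (non-zero on success, the opposite of down_trylock()), here is a minimal caller sketch. The demo_* names are invented and the snippet is not part of the patch; on PREEMPT_RT the same calls end up in the _mutex_t helpers declared above.

#include <linux/mutex.h>

static DEFINE_MUTEX(demo_cache_lock);		/* hypothetical lock */

static void demo_cache_shrink_opportunistic(void)
{
	/* 1 == acquired, 0 == contended; note the inverted down_trylock() sense */
	if (!mutex_trylock(&demo_cache_lock))
		return;				/* contended, try again later */

	/* ... shrink work, with mutex_is_locked(&demo_cache_lock) now true ... */

	mutex_unlock(&demo_cache_lock);
}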
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 5c73a91a8105..31d24f44acb5 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1691,7 +1691,7 @@ struct nfs_unlinkdata {
struct nfs_removeargs args;
struct nfs_removeres res;
struct dentry *dentry;
- wait_queue_head_t wq;
+ struct swait_queue_head wq;
const struct cred *cred;
struct nfs_fattr dir_attr;
long timeout;
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 2fb373a5c1ed..723bc2df6388 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -58,7 +58,7 @@ struct notifier_block {
};
struct atomic_notifier_head {
- spinlock_t lock;
+ raw_spinlock_t lock;
struct notifier_block __rcu *head;
};
@@ -78,7 +78,7 @@ struct srcu_notifier_head {
};
#define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \
- spin_lock_init(&(name)->lock); \
+ raw_spin_lock_init(&(name)->lock); \
(name)->head = NULL; \
} while (0)
#define BLOCKING_INIT_NOTIFIER_HEAD(name) do { \
@@ -95,7 +95,7 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
cleanup_srcu_struct(&(name)->srcu);
#define ATOMIC_NOTIFIER_INIT(name) { \
- .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
.head = NULL }
#define BLOCKING_NOTIFIER_INIT(name) { \
.rwsem = __RWSEM_INITIALIZER((name).rwsem), \
diff --git a/include/linux/pid.h b/include/linux/pid.h
index fa10acb8d6a4..2f86f84e9fc1 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -3,6 +3,7 @@
#define _LINUX_PID_H
#include <linux/rculist.h>
+#include <linux/atomic.h>
#include <linux/wait.h>
#include <linux/refcount.h>
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 9881eac0698f..af39859f02ee 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -121,7 +121,11 @@
/*
* The preempt_count offset after spin_lock()
*/
+#if !defined(CONFIG_PREEMPT_RT)
#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET
+#else
+#define PREEMPT_LOCK_OFFSET 0
+#endif
/*
* The preempt_count offset needed for things like:
@@ -170,6 +174,20 @@ extern void preempt_count_sub(int val);
#define preempt_count_inc() preempt_count_add(1)
#define preempt_count_dec() preempt_count_sub(1)
+#ifdef CONFIG_PREEMPT_LAZY
+#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0)
+#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0)
+#define inc_preempt_lazy_count() add_preempt_lazy_count(1)
+#define dec_preempt_lazy_count() sub_preempt_lazy_count(1)
+#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count)
+#else
+#define add_preempt_lazy_count(val) do { } while (0)
+#define sub_preempt_lazy_count(val) do { } while (0)
+#define inc_preempt_lazy_count() do { } while (0)
+#define dec_preempt_lazy_count() do { } while (0)
+#define preempt_lazy_count() (0)
+#endif
+
#ifdef CONFIG_PREEMPT_COUNT
#define preempt_disable() \
@@ -178,13 +196,25 @@ do { \
barrier(); \
} while (0)
+#define preempt_lazy_disable() \
+do { \
+ inc_preempt_lazy_count(); \
+ barrier(); \
+} while (0)
+
#define sched_preempt_enable_no_resched() \
do { \
barrier(); \
preempt_count_dec(); \
} while (0)
-#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
+#ifdef CONFIG_PREEMPT_RT
+# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
+# define preempt_check_resched_rt() preempt_check_resched()
+#else
+# define preempt_enable_no_resched() preempt_enable()
+# define preempt_check_resched_rt() barrier();
+#endif
#define preemptible() (preempt_count() == 0 && !irqs_disabled())
@@ -209,6 +239,18 @@ do { \
__preempt_schedule(); \
} while (0)
+/*
+ * Open-code preempt_check_resched() here: it is not exported to modules, but
+ * preempt_lazy_enable() is used by local_unlock() and bpf_enable_instrumentation().
+ */
+#define preempt_lazy_enable() \
+do { \
+ dec_preempt_lazy_count(); \
+ barrier(); \
+ if (should_resched(0)) \
+ __preempt_schedule(); \
+} while (0)
+
#else /* !CONFIG_PREEMPTION */
#define preempt_enable() \
do { \
@@ -216,6 +258,12 @@ do { \
preempt_count_dec(); \
} while (0)
+#define preempt_lazy_enable() \
+do { \
+ dec_preempt_lazy_count(); \
+ barrier(); \
+} while (0)
+
#define preempt_enable_notrace() \
do { \
barrier(); \
@@ -254,8 +302,12 @@ do { \
#define preempt_disable_notrace() barrier()
#define preempt_enable_no_resched_notrace() barrier()
#define preempt_enable_notrace() barrier()
+#define preempt_check_resched_rt() barrier()
#define preemptible() 0
+#define preempt_lazy_disable() barrier()
+#define preempt_lazy_enable() barrier()
+
#endif /* CONFIG_PREEMPT_COUNT */
#ifdef MODULE
@@ -274,10 +326,22 @@ do { \
} while (0)
#define preempt_fold_need_resched() \
do { \
- if (tif_need_resched()) \
+ if (tif_need_resched_now()) \
set_preempt_need_resched(); \
} while (0)
+#ifdef CONFIG_PREEMPT_RT
+# define preempt_disable_rt() preempt_disable()
+# define preempt_enable_rt() preempt_enable()
+# define preempt_disable_nort() barrier()
+# define preempt_enable_nort() barrier()
+#else
+# define preempt_disable_rt() barrier()
+# define preempt_enable_rt() barrier()
+# define preempt_disable_nort() preempt_disable()
+# define preempt_enable_nort() preempt_enable()
+#endif
+
#ifdef CONFIG_PREEMPT_NOTIFIERS
struct preempt_notifier;
@@ -390,8 +454,15 @@ extern void migrate_enable(void);
#else
-static inline void migrate_disable(void) { }
-static inline void migrate_enable(void) { }
+static inline void migrate_disable(void)
+{
+ preempt_lazy_disable();
+}
+
+static inline void migrate_enable(void)
+{
+ preempt_lazy_enable();
+}
#endif /* CONFIG_SMP */
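A minimal sketch of how the preempt_disable_rt()/preempt_enable_rt() pair added above is meant to be used, mirroring the __count_vm_event() change later in this patch. The demo_events counter is invented for illustration.

#include <linux/preempt.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_events);	/* hypothetical counter */

static void demo_count_event(void)
{
	/*
	 * On PREEMPT_RT the raw_cpu_*() update needs explicit preemption
	 * protection, because interrupt/preempt disabling is no longer
	 * implied by the surrounding locks; on !RT both calls are plain
	 * barrier()s and the code is unchanged.
	 */
	preempt_disable_rt();
	raw_cpu_inc(demo_events);
	preempt_enable_rt();
}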
diff --git a/include/linux/printk.h b/include/linux/printk.h
index fe7eb2351610..0cdd25b49fcc 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -46,6 +46,12 @@ static inline const char *printk_skip_headers(const char *buffer)
#define CONSOLE_EXT_LOG_MAX 8192
+/*
+ * The maximum size of a record formatted for console printing
+ * (i.e. with the prefix prepended to every line).
+ */
+#define CONSOLE_LOG_MAX 1024
+
/* printk's without a loglevel use this.. */
#define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT
@@ -149,18 +155,6 @@ static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif
-#ifdef CONFIG_PRINTK_NMI
-extern void printk_nmi_enter(void);
-extern void printk_nmi_exit(void);
-extern void printk_nmi_direct_enter(void);
-extern void printk_nmi_direct_exit(void);
-#else
-static inline void printk_nmi_enter(void) { }
-static inline void printk_nmi_exit(void) { }
-static inline void printk_nmi_direct_enter(void) { }
-static inline void printk_nmi_direct_exit(void) { }
-#endif /* PRINTK_NMI */
-
struct dev_printk_info;
#ifdef CONFIG_PRINTK
@@ -207,8 +201,6 @@ __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
void dump_stack_print_info(const char *log_lvl);
void show_regs_print_info(const char *log_lvl);
extern asmlinkage void dump_stack(void) __cold;
-extern void printk_safe_flush(void);
-extern void printk_safe_flush_on_panic(void);
#else
static inline __printf(1, 0)
int vprintk(const char *s, va_list args)
@@ -272,14 +264,6 @@ static inline void show_regs_print_info(const char *log_lvl)
static inline void dump_stack(void)
{
}
-
-static inline void printk_safe_flush(void)
-{
-}
-
-static inline void printk_safe_flush_on_panic(void)
-{
-}
#endif
extern int kptr_restrict;
@@ -497,6 +481,8 @@ extern int kptr_restrict;
no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif
+bool pr_flush(int timeout_ms, bool reset_on_progress);
+
/*
* ratelimited messages with local ratelimit_state,
* no local ratelimit_state used in the !PRINTK case
diff --git a/include/linux/random.h b/include/linux/random.h
index f45b8be3e3c4..0e41d0527809 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -35,7 +35,7 @@ static inline void add_latent_entropy(void) {}
extern void add_input_randomness(unsigned int type, unsigned int code,
unsigned int value) __latent_entropy;
-extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
+extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
extern void get_random_bytes(void *buf, int nbytes);
extern int wait_for_random_bytes(void);
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index d31ecaf4fdd3..36a0e7226ec5 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -19,22 +19,11 @@
#include <linux/kernel.h>
#include <linux/stddef.h>
+#include <linux/rbtree_types.h>
#include <linux/rcupdate.h>
-struct rb_node {
- unsigned long __rb_parent_color;
- struct rb_node *rb_right;
- struct rb_node *rb_left;
-} __attribute__((aligned(sizeof(long))));
- /* The alignment might seem pointless, but allegedly CRIS needs it */
-
-struct rb_root {
- struct rb_node *rb_node;
-};
-
#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
-#define RB_ROOT (struct rb_root) { NULL, }
#define rb_entry(ptr, type, member) container_of(ptr, type, member)
#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL)
@@ -112,23 +101,6 @@ static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent
typeof(*pos), field); 1; }); \
pos = n)
-/*
- * Leftmost-cached rbtrees.
- *
- * We do not cache the rightmost node based on footprint
- * size vs number of potential users that could benefit
- * from O(1) rb_last(). Just not worth it, users that want
- * this feature can always implement the logic explicitly.
- * Furthermore, users that want to cache both pointers may
- * find it a bit asymmetric, but that's ok.
- */
-struct rb_root_cached {
- struct rb_root rb_root;
- struct rb_node *rb_leftmost;
-};
-
-#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
-
/* Same as rb_first(), but O(1) */
#define rb_first_cached(root) (root)->rb_leftmost
diff --git a/include/linux/rbtree_types.h b/include/linux/rbtree_types.h
new file mode 100644
index 000000000000..45b6ecde3665
--- /dev/null
+++ b/include/linux/rbtree_types.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_RBTREE_TYPES_H
+#define _LINUX_RBTREE_TYPES_H
+
+struct rb_node {
+ unsigned long __rb_parent_color;
+ struct rb_node *rb_right;
+ struct rb_node *rb_left;
+} __attribute__((aligned(sizeof(long))));
+/* The alignment might seem pointless, but allegedly CRIS needs it */
+
+struct rb_root {
+ struct rb_node *rb_node;
+};
+
+/*
+ * Leftmost-cached rbtrees.
+ *
+ * We do not cache the rightmost node based on footprint
+ * size vs number of potential users that could benefit
+ * from O(1) rb_last(). Just not worth it, users that want
+ * this feature can always implement the logic explicitly.
+ * Furthermore, users that want to cache both pointers may
+ * find it a bit asymmetric, but that's ok.
+ */
+struct rb_root_cached {
+ struct rb_root rb_root;
+ struct rb_node *rb_leftmost;
+};
+
+#define RB_ROOT (struct rb_root) { NULL, }
+#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
+
+#endif
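The leftmost-cached comment above explains why only rb_first() is O(1). A small, self-contained sketch of the standard caching pattern follows; the demo_* names are invented, and the calls are the long-standing rbtree API rather than anything added by this patch.

#include <linux/rbtree.h>

struct demo_item {				/* hypothetical payload */
	struct rb_node node;
	u64 key;
};

static struct rb_root_cached demo_tree = RB_ROOT_CACHED;

static void demo_insert(struct demo_item *new)
{
	struct rb_node **link = &demo_tree.rb_root.rb_node, *parent = NULL;
	bool leftmost = true;

	while (*link) {
		struct demo_item *cur = rb_entry(*link, struct demo_item, node);

		parent = *link;
		if (new->key < cur->key) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;	/* not the new minimum */
		}
	}
	rb_link_node(&new->node, parent, link);
	rb_insert_color_cached(&new->node, &demo_tree, leftmost);
}

/* O(1) smallest key, courtesy of the cached leftmost pointer */
static struct demo_item *demo_first(void)
{
	struct rb_node *n = rb_first_cached(&demo_tree);

	return n ? rb_entry(n, struct demo_item, node) : NULL;
}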
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 1199ffd305d1..a70da3f37df3 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -54,6 +54,11 @@ void __rcu_read_unlock(void);
* types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
*/
#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
+#ifndef CONFIG_PREEMPT_RT
+#define sched_rcu_preempt_depth() rcu_preempt_depth()
+#else
+static inline int sched_rcu_preempt_depth(void) { return 0; }
+#endif
#else /* #ifdef CONFIG_PREEMPT_RCU */
@@ -79,6 +84,8 @@ static inline int rcu_preempt_depth(void)
return 0;
}
+#define sched_rcu_preempt_depth() rcu_preempt_depth()
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/* Internal to kernel */
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index d1672de9ca89..7bbee67720dc 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -14,8 +14,8 @@
#define __LINUX_RT_MUTEX_H
#include <linux/linkage.h>
-#include <linux/rbtree.h>
-#include <linux/spinlock_types.h>
+#include <linux/rbtree_types.h>
+#include <linux/spinlock_types_raw.h>
extern int max_lock_depth; /* for sysctl */
diff --git a/include/linux/rwbase_rt.h b/include/linux/rwbase_rt.h
new file mode 100644
index 000000000000..78b62a01e301
--- /dev/null
+++ b/include/linux/rwbase_rt.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef _LINUX_RW_BASE_RT_H
+#define _LINUX_RW_BASE_RT_H
+
+#include <linux/rtmutex.h>
+#include <linux/atomic.h>
+
+#define READER_BIAS (1U << 31)
+#define WRITER_BIAS (1U << 30)
+
+struct rwbase_rt {
+ atomic_t readers;
+ struct rt_mutex rtmutex;
+};
+
+#define __RWBASE_INITIALIZER(name) \
+{ \
+ .readers = ATOMIC_INIT(READER_BIAS), \
+ .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex), \
+}
+
+#define init_rwbase_rt(rwbase) \
+ do { \
+ rt_mutex_init(&(rwbase)->rtmutex); \
+ atomic_set(&(rwbase)->readers, READER_BIAS); \
+} while (0)
+
+static __always_inline bool rw_base_is_locked(struct rwbase_rt *rwb)
+{
+ return atomic_read(&rwb->readers) != READER_BIAS;
+}
+
+static __always_inline bool rw_base_is_contended(struct rwbase_rt *rwb)
+{
+ return atomic_read(&rwb->readers) > 0;
+}
+#endif
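A brief sketch of what the two state predicates above report right after initialisation. rwbase_rt is normally embedded in rwlock_t or rw_semaphore by the headers that follow, so instantiating it directly here is purely illustrative.

#include <linux/rwbase_rt.h>
#include <linux/bug.h>

static struct rwbase_rt demo_rwb;	/* hypothetical, illustration only */

static void demo_rwbase_check(void)
{
	init_rwbase_rt(&demo_rwb);

	/* Unlocked state: readers == READER_BIAS, so neither predicate fires */
	WARN_ON(rw_base_is_locked(&demo_rwb));
	WARN_ON(rw_base_is_contended(&demo_rwb));
}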
diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
new file mode 100644
index 000000000000..38048bf7d063
--- /dev/null
+++ b/include/linux/rwlock_rt.h
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef __LINUX_RWLOCK_RT_H
+#define __LINUX_RWLOCK_RT_H
+
+#ifndef __LINUX_SPINLOCK_RT_H
+#error Do not include directly. Use spinlock.h
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern void __rt_rwlock_init(rwlock_t *rwlock, const char *name,
+ struct lock_class_key *key);
+#else
+static inline void __rt_rwlock_init(rwlock_t *rwlock, char *name,
+ struct lock_class_key *key)
+{
+}
+#endif
+
+#define rwlock_init(rwl) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ init_rwbase_rt(&(rwl)->rwbase); \
+ __rt_rwlock_init(rwl, #rwl, &__key); \
+} while (0)
+
+extern void rt_read_lock(rwlock_t *rwlock);
+extern int rt_read_trylock(rwlock_t *rwlock);
+extern void rt_read_unlock(rwlock_t *rwlock);
+extern void rt_write_lock(rwlock_t *rwlock);
+extern int rt_write_trylock(rwlock_t *rwlock);
+extern void rt_write_unlock(rwlock_t *rwlock);
+
+static __always_inline void read_lock(rwlock_t *rwlock)
+{
+ rt_read_lock(rwlock);
+}
+
+static __always_inline void read_lock_bh(rwlock_t *rwlock)
+{
+ local_bh_disable();
+ rt_read_lock(rwlock);
+}
+
+static __always_inline void read_lock_irq(rwlock_t *rwlock)
+{
+ rt_read_lock(rwlock);
+}
+
+#define read_lock_irqsave(lock, flags) \
+ do { \
+ typecheck(unsigned long, flags); \
+ rt_read_lock(lock); \
+ flags = 0; \
+ } while (0)
+
+#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock))
+
+static __always_inline void read_unlock(rwlock_t *rwlock)
+{
+ rt_read_unlock(rwlock);
+}
+
+static __always_inline void read_unlock_bh(rwlock_t *rwlock)
+{
+ rt_read_unlock(rwlock);
+ local_bh_enable();
+}
+
+static __always_inline void read_unlock_irq(rwlock_t *rwlock)
+{
+ rt_read_unlock(rwlock);
+}
+
+static __always_inline void read_unlock_irqrestore(rwlock_t *rwlock,
+ unsigned long flags)
+{
+ rt_read_unlock(rwlock);
+}
+
+static __always_inline void write_lock(rwlock_t *rwlock)
+{
+ rt_write_lock(rwlock);
+}
+
+static __always_inline void write_lock_bh(rwlock_t *rwlock)
+{
+ local_bh_disable();
+ rt_write_lock(rwlock);
+}
+
+static __always_inline void write_lock_irq(rwlock_t *rwlock)
+{
+ rt_write_lock(rwlock);
+}
+
+#define write_lock_irqsave(lock, flags) \
+ do { \
+ typecheck(unsigned long, flags); \
+ rt_write_lock(lock); \
+ flags = 0; \
+ } while (0)
+
+#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock))
+
+#define write_trylock_irqsave(lock, flags) \
+({ \
+ int __locked; \
+ \
+ typecheck(unsigned long, flags); \
+ flags = 0; \
+ __locked = write_trylock(lock); \
+ __locked; \
+})
+
+static __always_inline void write_unlock(rwlock_t *rwlock)
+{
+ rt_write_unlock(rwlock);
+}
+
+static __always_inline void write_unlock_bh(rwlock_t *rwlock)
+{
+ rt_write_unlock(rwlock);
+ local_bh_enable();
+}
+
+static __always_inline void write_unlock_irq(rwlock_t *rwlock)
+{
+ rt_write_unlock(rwlock);
+}
+
+static __always_inline void write_unlock_irqrestore(rwlock_t *rwlock,
+ unsigned long flags)
+{
+ rt_write_unlock(rwlock);
+}
+
+#define rwlock_is_contended(lock) (((void)(lock), 0))
+
+#endif
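Callers keep using the generic rwlock API; only the backing implementation changes on RT. A minimal sketch with invented demo_* names, keeping in mind that read_lock_irqsave() above intentionally leaves interrupts enabled and just zeroes flags.

#include <linux/spinlock.h>	/* brings in the RT rwlock wrappers on PREEMPT_RT */

static DEFINE_RWLOCK(demo_rwlock);	/* hypothetical lock */
static int demo_value;

static int demo_read(void)
{
	int v;

	read_lock(&demo_rwlock);	/* may sleep on RT, spins on !RT */
	v = demo_value;
	read_unlock(&demo_rwlock);
	return v;
}

static void demo_write(int v)
{
	write_lock(&demo_rwlock);
	demo_value = v;
	write_unlock(&demo_rwlock);
}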
diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
index 3bd03e18061c..243339e3e27c 100644
--- a/include/linux/rwlock_types.h
+++ b/include/linux/rwlock_types.h
@@ -1,9 +1,13 @@
#ifndef __LINUX_RWLOCK_TYPES_H
#define __LINUX_RWLOCK_TYPES_H
+#if !defined(__LINUX_SPINLOCK_TYPES_H)
+# error "Do not include directly, include spinlock_types.h"
+#endif
+
+#ifndef CONFIG_PREEMPT_RT
/*
- * include/linux/rwlock_types.h - generic rwlock type definitions
- * and initializers
+ * generic rwlock type definitions and initializers
*
* portions Copyright 2005, Red Hat, Inc., Ingo Molnar
* Released under the General Public License (GPL).
@@ -46,4 +50,35 @@ typedef struct {
#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x)
+#else /* !CONFIG_PREEMPT_RT */
+
+#include <linux/rwbase_rt.h>
+
+typedef struct {
+ struct rwbase_rt rwbase;
+ atomic_t readers;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+} rwlock_t;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
+#else
+# define RW_DEP_MAP_INIT(lockname)
+#endif
+
+#define __RW_LOCK_UNLOCKED(name) __RWLOCK_RT_INITIALIZER(name)
+
+#define DEFINE_RWLOCK(name) \
+ rwlock_t name = __RW_LOCK_UNLOCKED(name)
+
+#define __RWLOCK_RT_INITIALIZER(name) \
+{ \
+ .rwbase = __RWBASE_INITIALIZER(name), \
+ RW_DEP_MAP_INIT(name) \
+}
+
+#endif /* CONFIG_PREEMPT_RT */
+
#endif /* __LINUX_RWLOCK_TYPES_H */
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index a66038d88878..67a3face07b1 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -16,6 +16,9 @@
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/err.h>
+
+#ifndef CONFIG_PREEMPT_RT
+
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#include <linux/osq_lock.h>
#endif
@@ -119,6 +122,61 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem)
return !list_empty(&sem->wait_list);
}
+#else /* !CONFIG_PREEMPT_RT */
+
+#include <linux/rwbase_rt.h>
+
+struct rw_semaphore {
+ struct rwbase_rt rwbase;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+#define __RWSEM_INITIALIZER(name) \
+ { \
+ .rwbase = __RWBASE_INITIALIZER(name), \
+ RW_DEP_MAP_INIT(name) \
+}
+
+#define DECLARE_RWSEM(lockname) \
+ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name,
+ struct lock_class_key *key);
+#else
+static inline void __rwsem_init(struct rw_semaphore *rwsem, const char *name,
+ struct lock_class_key *key)
+{
+}
+#endif
+
+#define init_rwsem(sem) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ init_rwbase_rt(&(sem)->rwbase); \
+ __rwsem_init((sem), #sem, &__key); \
+} while (0)
+
+static __always_inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+ return rw_base_is_locked(&sem->rwbase);
+}
+
+static __always_inline int rwsem_is_contended(struct rw_semaphore *sem)
+{
+ return rw_base_is_contended(&sem->rwbase);
+}
+
+#endif /* CONFIG_PREEMPT_RT */
+
+/*
+ * The functions below are the same for all rwsem implementations including
+ * the RT specific variant.
+ */
+
/*
* lock for reading
*/
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 32813c345115..efdbdf654876 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -95,7 +95,9 @@ struct task_group;
#define TASK_WAKING 0x0200
#define TASK_NOLOAD 0x0400
#define TASK_NEW 0x0800
-#define TASK_STATE_MAX 0x1000
+/* RT specific auxiliary flag to mark RT lock waiters */
+#define TASK_RTLOCK_WAIT 0x1000
+#define TASK_STATE_MAX 0x2000
/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
@@ -113,12 +115,8 @@ struct task_group;
__TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
TASK_PARKED)
-#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
-
#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
-#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
-
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
/*
@@ -151,6 +149,27 @@ struct task_group;
current->state = (state_value); \
raw_spin_unlock_irqrestore(&current->pi_lock, flags); \
} while (0)
+
+
+#define current_save_and_set_rtlock_wait_state() \
+ do { \
+ raw_spin_lock(&current->pi_lock); \
+ current->saved_state = current->state; \
+ current->saved_state_change = current->task_state_change;\
+ current->task_state_change = _THIS_IP_; \
+ current->state = TASK_RTLOCK_WAIT; \
+ raw_spin_unlock(&current->pi_lock); \
+ } while (0);
+
+#define current_restore_rtlock_saved_state() \
+ do { \
+ raw_spin_lock(&current->pi_lock); \
+ current->task_state_change = current->saved_state_change;\
+ current->state = current->saved_state; \
+ current->saved_state = TASK_RUNNING; \
+ raw_spin_unlock(&current->pi_lock); \
+ } while (0);
+
#else
/*
* set_current_state() includes a barrier so that the write of current->state
@@ -209,6 +228,47 @@ struct task_group;
raw_spin_unlock_irqrestore(&current->pi_lock, flags); \
} while (0)
+/*
+ * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
+ *
+ * RT's spin/rwlock substitutions are state preserving. The state of the
+ * task when blocking on the lock is saved in task_struct::saved_state and
+ * restored after the lock has been acquired. These operations are
+ * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
+ * lock related wakeups while the task is blocked on the lock are
+ * redirected to operate on task_struct::saved_state to ensure that these
+ * are not dropped. On restore task_struct::saved_state is set to
+ * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
+ *
+ * The lock operation looks like this:
+ *
+ * current_save_and_set_rtlock_wait_state();
+ * for (;;) {
+ * if (try_lock())
+ * break;
+ * raw_spin_unlock_irq(&lock->wait_lock);
+ * schedule_rtlock();
+ * raw_spin_lock_irq(&lock->wait_lock);
+ * set_current_state(TASK_RTLOCK_WAIT);
+ * }
+ * current_restore_rtlock_saved_state();
+ */
+#define current_save_and_set_rtlock_wait_state() \
+ do { \
+ raw_spin_lock(&current->pi_lock); \
+ current->saved_state = current->state; \
+ current->state = TASK_RTLOCK_WAIT; \
+ raw_spin_unlock(&current->pi_lock); \
+ } while (0);
+
+#define current_restore_rtlock_saved_state() \
+ do { \
+ raw_spin_lock(&current->pi_lock); \
+ current->state = current->saved_state; \
+ current->saved_state = TASK_RUNNING; \
+ raw_spin_unlock(&current->pi_lock); \
+ } while (0);
+
#endif
/* Task command name length: */
@@ -226,6 +286,9 @@ extern long schedule_timeout_idle(long timeout);
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);
+#ifdef CONFIG_PREEMPT_RT
+ extern void schedule_rtlock(void);
+#endif
extern int __must_check io_schedule_prepare(void);
extern void io_schedule_finish(int token);
@@ -665,6 +728,11 @@ struct task_struct {
/* -1 unrunnable, 0 runnable, >0 stopped: */
volatile long state;
+#ifdef CONFIG_PREEMPT_RT
+ /* saved state for "spinlock sleepers" */
+ long saved_state;
+#endif
+
/*
* This begins the randomizable portion of task_struct. Only
* scheduling-critical items should be added above here.
@@ -1002,6 +1070,10 @@ struct task_struct {
/* Restored if set_restore_sigmask() was used: */
sigset_t saved_sigmask;
struct sigpending pending;
+#ifdef CONFIG_PREEMPT_RT
+ /* TODO: move me into ->restart_block ? */
+ struct kernel_siginfo forced_info;
+#endif
unsigned long sas_ss_sp;
size_t sas_ss_size;
unsigned int sas_ss_flags;
@@ -1347,6 +1419,9 @@ struct task_struct {
struct kmap_ctrl kmap_ctrl;
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long task_state_change;
+# ifdef CONFIG_PREEMPT_RT
+ unsigned long saved_state_change;
+# endif
#endif
int pagefault_disabled;
#ifdef CONFIG_MMU
@@ -1887,6 +1962,118 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}
+#ifdef CONFIG_PREEMPT_LAZY
+static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
+{
+ set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
+}
+
+static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
+{
+ clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
+}
+
+static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
+{
+ return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
+}
+
+static inline int need_resched_lazy(void)
+{
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
+}
+
+static inline int need_resched_now(void)
+{
+ return test_thread_flag(TIF_NEED_RESCHED);
+}
+
+#else
+static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
+static inline int need_resched_lazy(void) { return 0; }
+
+static inline int need_resched_now(void)
+{
+ return test_thread_flag(TIF_NEED_RESCHED);
+}
+
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+static inline bool task_match_saved_state(struct task_struct *p, long match_state)
+{
+ return p->saved_state == match_state;
+}
+
+static inline bool task_is_traced(struct task_struct *task)
+{
+ bool traced = false;
+
+ /* in case the task is sleeping on tasklist_lock */
+ raw_spin_lock_irq(&task->pi_lock);
+ if (task->state & __TASK_TRACED)
+ traced = true;
+ else if (task->saved_state & __TASK_TRACED)
+ traced = true;
+ raw_spin_unlock_irq(&task->pi_lock);
+ return traced;
+}
+
+static inline bool task_is_stopped_or_traced(struct task_struct *task)
+{
+ bool traced_stopped = false;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ if (task->state & (__TASK_STOPPED | __TASK_TRACED))
+ traced_stopped = true;
+ else if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
+ traced_stopped = true;
+
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ return traced_stopped;
+}
+
+#else
+
+static inline bool task_match_saved_state(struct task_struct *p, long match_state)
+{
+ return false;
+}
+
+static inline bool task_is_traced(struct task_struct *task)
+{
+ return task->state & __TASK_TRACED;
+}
+
+static inline bool task_is_stopped_or_traced(struct task_struct *task)
+{
+ return task->state & (__TASK_STOPPED | __TASK_TRACED);
+}
+#endif
+
+static inline bool task_match_state_or_saved(struct task_struct *p,
+ long match_state)
+{
+ if (p->state == match_state)
+ return true;
+
+ return task_match_saved_state(p, match_state);
+}
+
+static inline bool task_match_state_lock(struct task_struct *p,
+ long match_state)
+{
+ bool match;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ match = task_match_state_or_saved(p, match_state);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return match;
+}
+
/*
* cond_resched() and cond_resched_lock(): latency reduction via
* explicit rescheduling in places that are safe. The return
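One plausible, purely illustrative way a long-running loop could consult the two PREEMPT_LAZY helpers added above: bail out immediately for a "real" reschedule request and yield politely for a lazy one. The demo_* names and the item type are invented; this is not taken from the patch.

#include <linux/sched.h>

struct demo_item;					/* hypothetical */
void demo_handle_one(struct demo_item *item);		/* hypothetical */

static void demo_process_batch(struct demo_item **items, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		demo_handle_one(items[i]);

		if (need_resched_now())
			break;		/* TIF_NEED_RESCHED set: stop right away */
		if (need_resched_lazy())
			cond_resched();	/* only the lazy flag: yield when convenient */
	}
}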
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index e24b1fe348e3..52ecb2c11c18 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,17 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
}
+#ifdef CONFIG_PREEMPT_RT
+extern void __mmdrop_delayed(struct rcu_head *rhp);
+static inline void mmdrop_delayed(struct mm_struct *mm)
+{
+ if (atomic_dec_and_test(&mm->mm_count))
+ call_rcu(&mm->delayed_drop, __mmdrop_delayed);
+}
+#else
+# define mmdrop_delayed(mm) mmdrop(mm)
+#endif
+
/**
* mmget() - Pin the address space associated with a &struct mm_struct.
* @mm: The address space to pin.
diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
index 26a2013ac39c..06cd8fb2f409 100644
--- a/include/linux/sched/wake_q.h
+++ b/include/linux/sched/wake_q.h
@@ -42,8 +42,11 @@ struct wake_q_head {
#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
-#define DEFINE_WAKE_Q(name) \
- struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
+#define WAKE_Q_HEAD_INITIALIZER(name) \
+ { WAKE_Q_TAIL, &name.first }
+
+#define DEFINE_WAKE_Q(name) \
+ struct wake_q_head name = WAKE_Q_HEAD_INITIALIZER(name)
static inline void wake_q_init(struct wake_q_head *head)
{
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 9e655055112d..ffef674deda7 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -7,6 +7,7 @@
#ifndef _LINUX_SERIAL_8250_H
#define _LINUX_SERIAL_8250_H
+#include <linux/atomic.h>
#include <linux/serial_core.h>
#include <linux/serial_reg.h>
#include <linux/platform_device.h>
@@ -125,6 +126,8 @@ struct uart_8250_port {
#define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA
unsigned char msr_saved_flags;
+ atomic_t console_printing;
+
struct uart_8250_dma *dma;
const struct uart_8250_ops *ops;
@@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_8250_port *up);
void serial8250_set_defaults(struct uart_8250_port *up);
void serial8250_console_write(struct uart_8250_port *up, const char *s,
unsigned int count);
+void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s,
+ unsigned int count);
int serial8250_console_setup(struct uart_port *port, char *options, bool probe);
int serial8250_console_exit(struct uart_port *port);
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index d82b6f396588..12b2e41d8f47 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -31,7 +31,7 @@ struct shmem_sb_info {
struct percpu_counter used_blocks; /* How many are allocated */
unsigned long max_inodes; /* How many inodes are allowed */
unsigned long free_inodes; /* How many are left for allocation */
- spinlock_t stat_lock; /* Serialize shmem_sb_info changes */
+ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */
umode_t mode; /* Mount mode for root directory */
unsigned char huge; /* Whether to try for hugepages */
kuid_t uid; /* Mount uid for root directory */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index dbf820a50a39..7be803493474 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -296,6 +296,7 @@ struct sk_buff_head {
__u32 qlen;
spinlock_t lock;
+ raw_spinlock_t raw_lock;
};
struct sk_buff;
@@ -1906,6 +1907,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
__skb_queue_head_init(list);
}
+static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
+{
+ raw_spin_lock_init(&list->raw_lock);
+ __skb_queue_head_init(list);
+}
+
static inline void skb_queue_head_init_class(struct sk_buff_head *list,
struct lock_class_key *class)
{
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index dcde82a4434c..b5bcac29b979 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -10,6 +10,7 @@
#include <linux/kfence.h>
#include <linux/kobject.h>
#include <linux/reciprocal_div.h>
+#include <linux/local_lock.h>
enum stat_item {
ALLOC_FASTPATH, /* Allocation from cpu slab */
@@ -41,6 +42,7 @@ enum stat_item {
NR_SLUB_STAT_ITEMS };
struct kmem_cache_cpu {
+ local_lock_t lock; /* Protects the fields below except stat */
void **freelist; /* Pointer to next available object */
unsigned long tid; /* Globally unique transaction id */
struct page *page; /* The slab from which we are allocating */
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 510519e8a1eb..7ac9fdb5ad09 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -268,6 +268,9 @@ static inline int get_boot_cpu_id(void)
#define get_cpu() ({ preempt_disable(); __smp_processor_id(); })
#define put_cpu() preempt_enable()
+#define get_cpu_light() ({ migrate_disable(); __smp_processor_id(); })
+#define put_cpu_light() migrate_enable()
+
/*
* Callback to arch code if there's nosmp or maxcpus=0 on the
* boot command line:
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 79897841a2cc..23925a6c489b 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -12,6 +12,8 @@
* asm/spinlock_types.h: contains the arch_spinlock_t/arch_rwlock_t and the
* initializers
*
+ * linux/spinlock_types_raw:
+ * The raw types and initializers
* linux/spinlock_types.h:
* defines the generic type and initializers
*
@@ -31,6 +33,8 @@
* contains the generic, simplified UP spinlock type.
* (which is an empty structure on non-debug builds)
*
+ * linux/spinlock_types_raw:
+ * The raw types and initializers
* linux/spinlock_types.h:
* defines the generic type and initializers
*
@@ -308,8 +312,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
1 : ({ local_irq_restore(flags); 0; }); \
})
-/* Include rwlock functions */
+#ifndef CONFIG_PREEMPT_RT
+/* Include rwlock functions for !RT */
#include <linux/rwlock.h>
+#endif
/*
* Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
@@ -320,6 +326,9 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
# include <linux/spinlock_api_up.h>
#endif
+/* Non PREEMPT_RT kernels map spinlocks to raw spinlocks */
+#ifndef CONFIG_PREEMPT_RT
+
/*
* Map the spin_lock functions to the raw variants for PREEMPT_RT=n
*/
@@ -454,6 +463,10 @@ static __always_inline int spin_is_contended(spinlock_t *lock)
#define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock)
+#else /* !CONFIG_PREEMPT_RT */
+# include <linux/spinlock_rt.h>
+#endif /* CONFIG_PREEMPT_RT */
+
/*
* Pull the atomic_t declaration:
* (asm-mips/atomic.h needs above definitions)
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 19a9be9d97ee..51bf88c84133 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -187,6 +187,9 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
return 0;
}
+/* PREEMPT_RT has its own rwlock implementation */
+#ifndef CONFIG_PREEMPT_RT
#include <linux/rwlock_api_smp.h>
+#endif
#endif /* __LINUX_SPINLOCK_API_SMP_H */
diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
new file mode 100644
index 000000000000..8ce04c07e944
--- /dev/null
+++ b/include/linux/spinlock_rt.h
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef __LINUX_SPINLOCK_RT_H
+#define __LINUX_SPINLOCK_RT_H
+
+#ifndef __LINUX_SPINLOCK_H
+#error Do not include directly. Use spinlock.h
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern void __rt_spin_lock_init(spinlock_t *lock, const char *name,
+ struct lock_class_key *key);
+#else
+static inline void __rt_spin_lock_init(spinlock_t *lock, const char *name,
+ struct lock_class_key *key)
+{
+}
+#endif
+
+#define spin_lock_init(slock) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ rt_mutex_init(&(slock)->lock); \
+ __rt_spin_lock_init(slock, #slock, &__key); \
+} while (0)
+
+extern void rt_spin_lock(spinlock_t *lock);
+extern void rt_spin_lock_nested(spinlock_t *lock, int subclass);
+extern void rt_spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *nest_lock);
+extern void rt_spin_unlock(spinlock_t *lock);
+extern void rt_spin_lock_unlock(spinlock_t *lock);
+extern int rt_spin_trylock_bh(spinlock_t *lock);
+extern int rt_spin_trylock(spinlock_t *lock);
+
+static __always_inline void spin_lock(spinlock_t *lock)
+{
+ rt_spin_lock(lock);
+}
+
+#ifdef CONFIG_LOCKDEP
+# define __spin_lock_nested(lock, subclass) \
+ rt_spin_lock_nested(lock, subclass)
+
+# define __spin_lock_nest_lock(lock, nest_lock) \
+ do { \
+ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \
+ rt_spin_lock_nest_lock(lock, &(nest_lock)->dep_map); \
+ } while (0)
+# define __spin_lock_irqsave_nested(lock, flags, subclass) \
+ do { \
+ typecheck(unsigned long, flags); \
+ flags = 0; \
+ __spin_lock_nested(lock, subclass); \
+ } while (0)
+
+#else
+ /*
+	 * Always evaluate the 'subclass' argument so that the compiler does not
+	 * warn about set-but-not-used variables when building with
+ * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1.
+ */
+# define __spin_lock_nested(lock, subclass) spin_lock(((void)(subclass), (lock)))
+# define __spin_lock_nest_lock(lock, subclass) spin_lock(((void)(subclass), (lock)))
+# define __spin_lock_irqsave_nested(lock, flags, subclass) \
+ spin_lock_irqsave(((void)(subclass), (lock)), flags)
+#endif
+
+#define spin_lock_nested(lock, subclass) \
+ __spin_lock_nested(lock, subclass)
+
+#define spin_lock_nest_lock(lock, nest_lock) \
+ __spin_lock_nest_lock(lock, nest_lock)
+
+#define spin_lock_irqsave_nested(lock, flags, subclass) \
+ __spin_lock_irqsave_nested(lock, flags, subclass)
+
+static __always_inline void spin_lock_bh(spinlock_t *lock)
+{
+ /* Investigate: Drop bh when blocking ? */
+ local_bh_disable();
+ rt_spin_lock(lock);
+}
+
+static __always_inline void spin_lock_irq(spinlock_t *lock)
+{
+ rt_spin_lock(lock);
+}
+
+#define spin_lock_irqsave(lock, flags) \
+ do { \
+ typecheck(unsigned long, flags); \
+ flags = 0; \
+ spin_lock(lock); \
+ } while (0)
+
+static __always_inline void spin_unlock(spinlock_t *lock)
+{
+ rt_spin_unlock(lock);
+}
+
+static __always_inline void spin_unlock_bh(spinlock_t *lock)
+{
+ rt_spin_unlock(lock);
+ local_bh_enable();
+}
+
+static __always_inline void spin_unlock_irq(spinlock_t *lock)
+{
+ rt_spin_unlock(lock);
+}
+
+static __always_inline void spin_unlock_irqrestore(spinlock_t *lock,
+ unsigned long flags)
+{
+ spin_unlock(lock);
+}
+
+#define spin_trylock(lock) \
+ __cond_lock(lock, rt_spin_trylock(lock))
+
+#define spin_trylock_bh(lock) \
+ __cond_lock(lock, rt_spin_trylock_bh(lock))
+
+#define spin_trylock_irq(lock) \
+ __cond_lock(lock, rt_spin_trylock(lock))
+
+#define __spin_trylock_irqsave(lock, flags) \
+({ \
+ int __locked; \
+ \
+ typecheck(unsigned long, flags); \
+ flags = 0; \
+ __locked = spin_trylock(lock); \
+ __locked; \
+})
+
+#define spin_trylock_irqsave(lock, flags) \
+ __cond_lock(lock, __spin_trylock_irqsave(lock, flags))
+
+#define spin_is_contended(lock) (((void)(lock), 0))
+
+static inline int spin_is_locked(spinlock_t *lock)
+{
+ return rt_mutex_is_locked(&lock->lock);
+}
+
+#define assert_spin_locked(lock) BUG_ON(!spin_is_locked(lock))
+
+#include <linux/rwlock_rt.h>
+
+#endif
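The key behavioural point of the header above: on PREEMPT_RT a spinlock_t is a sleeping rt_mutex-based lock, and spin_lock_irqsave() neither disables interrupts nor saves any state (flags is simply set to 0). A hedged sketch of the resulting rule of thumb; the demo_* names are invented.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);		/* sleeps on RT, spins on !RT */
static DEFINE_RAW_SPINLOCK(demo_hw_lock);	/* always a real spinning lock */

static void demo_update(void)
{
	unsigned long flags;

	/*
	 * On PREEMPT_RT this does NOT disable interrupts: flags is set to 0
	 * and the lock may sleep. That is fine for data shared with threaded
	 * interrupt handlers, which is the common case on RT.
	 */
	spin_lock_irqsave(&demo_lock, flags);
	/* ... modify data shared with (threaded) IRQ handlers ... */
	spin_unlock_irqrestore(&demo_lock, flags);

	/*
	 * Data touched from genuine hard-IRQ context, or with IRQs off,
	 * still needs a raw_spinlock_t, which keeps the classic semantics.
	 */
	raw_spin_lock_irqsave(&demo_hw_lock, flags);
	/* ... */
	raw_spin_unlock_irqrestore(&demo_hw_lock, flags);
}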
diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
index b981caafe8bf..98d498f9e4fc 100644
--- a/include/linux/spinlock_types.h
+++ b/include/linux/spinlock_types.h
@@ -9,24 +9,7 @@
* Released under the General Public License (GPL).
*/
-#if defined(CONFIG_SMP)
-# include <asm/spinlock_types.h>
-#else
-# include <linux/spinlock_types_up.h>
-#endif
-
-#include <linux/lockdep_types.h>
-
-typedef struct raw_spinlock {
- arch_spinlock_t raw_lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
- unsigned int magic, owner_cpu;
- void *owner;
-#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-} raw_spinlock_t;
+#include <linux/spinlock_types_raw.h>
#define SPINLOCK_MAGIC 0xdead4ead
@@ -68,6 +51,9 @@ typedef struct raw_spinlock {
#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
+#ifndef CONFIG_PREEMPT_RT
+
+/* Non PREEMPT_RT kernels map spinlock to raw_spinlock */
typedef struct spinlock {
union {
struct raw_spinlock rlock;
@@ -96,6 +82,29 @@ typedef struct spinlock {
#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
+#else /* !CONFIG_PREEMPT_RT */
+
+/* PREEMPT_RT kernels map spinlock to rt_mutex */
+#include <linux/rtmutex.h>
+
+typedef struct spinlock {
+ struct rt_mutex lock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+} spinlock_t;
+
+#define __SPIN_LOCK_UNLOCKED(name) \
+ { \
+ .lock = __RT_MUTEX_INITIALIZER(name.lock), \
+ SPIN_DEP_MAP_INIT(name) \
+ }
+
+#define DEFINE_SPINLOCK(name) \
+ spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
+
+#endif /* CONFIG_PREEMPT_RT */
+
#include <linux/rwlock_types.h>
#endif /* __LINUX_SPINLOCK_TYPES_H */
diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
new file mode 100644
index 000000000000..1d4a180e983d
--- /dev/null
+++ b/include/linux/spinlock_types_raw.h
@@ -0,0 +1,65 @@
+#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
+#define __LINUX_SPINLOCK_TYPES_RAW_H
+
+#include <linux/types.h>
+
+#if defined(CONFIG_SMP)
+# include <asm/spinlock_types.h>
+#else
+# include <linux/spinlock_types_up.h>
+#endif
+
+#include <linux/lockdep_types.h>
+
+typedef struct raw_spinlock {
+ arch_spinlock_t raw_lock;
+#ifdef CONFIG_DEBUG_SPINLOCK
+ unsigned int magic, owner_cpu;
+ void *owner;
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+} raw_spinlock_t;
+
+#define SPINLOCK_MAGIC 0xdead4ead
+
+#define SPINLOCK_OWNER_INIT ((void *)-1L)
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define RAW_SPIN_DEP_MAP_INIT(lockname) \
+ .dep_map = { \
+ .name = #lockname, \
+ .wait_type_inner = LD_WAIT_SPIN, \
+ }
+# define SPIN_DEP_MAP_INIT(lockname) \
+ .dep_map = { \
+ .name = #lockname, \
+ .wait_type_inner = LD_WAIT_CONFIG, \
+ }
+#else
+# define RAW_SPIN_DEP_MAP_INIT(lockname)
+# define SPIN_DEP_MAP_INIT(lockname)
+#endif
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+# define SPIN_DEBUG_INIT(lockname) \
+ .magic = SPINLOCK_MAGIC, \
+ .owner_cpu = -1, \
+ .owner = SPINLOCK_OWNER_INIT,
+#else
+# define SPIN_DEBUG_INIT(lockname)
+#endif
+
+#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
+{ \
+ .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
+ SPIN_DEBUG_INIT(lockname) \
+ RAW_SPIN_DEP_MAP_INIT(lockname) }
+
+#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
+ (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
+
+#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
+
+#endif
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 157762db9d4b..6cf0fcc3e126 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -162,7 +162,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
clear_ti_thread_flag(task_thread_info(t), TIF_##fl)
#endif /* !CONFIG_GENERIC_ENTRY */
-#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
+#ifdef CONFIG_PREEMPT_LAZY
+#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \
+ test_thread_flag(TIF_NEED_RESCHED_LAZY))
+#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
+#define tif_need_resched_lazy() test_thread_flag(TIF_NEED_RESCHED_LAZY)
+
+#else
+#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
+#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
+#define tif_need_resched_lazy() 0
+#endif
#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
static inline int arch_within_stack_frames(const void * const stack,
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index ad413b382a3c..415fb02dd13f 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -69,6 +69,8 @@ struct trace_entry {
unsigned char flags;
unsigned char preempt_count;
int pid;
+ unsigned char migrate_disable;
+ unsigned char preempt_lazy_count;
};
#define TRACE_EVENT_TYPE_MAX \
@@ -157,9 +159,11 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry,
unsigned int trace_ctx)
{
entry->preempt_count = trace_ctx & 0xff;
+ entry->migrate_disable = (trace_ctx >> 8) & 0xff;
+ entry->preempt_lazy_count = (trace_ctx >> 16) & 0xff;
entry->pid = current->pid;
entry->type = type;
- entry->flags = trace_ctx >> 16;
+ entry->flags = trace_ctx >> 24;
}
unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status);
@@ -172,6 +176,7 @@ enum trace_flag_type {
TRACE_FLAG_SOFTIRQ = 0x10,
TRACE_FLAG_PREEMPT_RESCHED = 0x20,
TRACE_FLAG_NMI = 0x40,
+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x80,
};
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
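For reference, the repacked trace_ctx layout implied by the tracing_generic_entry_update() change above can be decoded as follows; this sketch simply mirrors the shifts in the diff.

static inline void demo_unpack_trace_ctx(unsigned int trace_ctx)
{
	unsigned char preempt = trace_ctx & 0xff;		/* bits  0..7  */
	unsigned char migrate = (trace_ctx >> 8) & 0xff;	/* bits  8..15 */
	unsigned char lazy    = (trace_ctx >> 16) & 0xff;	/* bits 16..23 */
	unsigned char flags   = trace_ctx >> 24;		/* bits 24..31: trace_flag_type */

	(void)preempt; (void)migrate; (void)lazy; (void)flags;
}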
diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index e81856c0ba13..66eb968a09d4 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -66,7 +66,7 @@
#include <linux/seqlock.h>
struct u64_stats_sync {
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#if BITS_PER_LONG==32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
seqcount_t seq;
#endif
};
@@ -115,7 +115,7 @@ static inline void u64_stats_inc(u64_stats_t *p)
}
#endif
-#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
#define u64_stats_init(syncp) seqcount_init(&(syncp)->seq)
#else
static inline void u64_stats_init(struct u64_stats_sync *syncp)
@@ -125,15 +125,19 @@ static inline void u64_stats_init(struct u64_stats_sync *syncp)
static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
{
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
write_seqcount_begin(&syncp->seq);
#endif
}
static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
{
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
write_seqcount_end(&syncp->seq);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
#endif
}
@@ -142,8 +146,11 @@ u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp)
{
unsigned long flags = 0;
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
- local_irq_save(flags);
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+ else
+ local_irq_save(flags);
write_seqcount_begin(&syncp->seq);
#endif
return flags;
@@ -153,15 +160,18 @@ static inline void
u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp,
unsigned long flags)
{
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
write_seqcount_end(&syncp->seq);
- local_irq_restore(flags);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+ else
+ local_irq_restore(flags);
#endif
}
static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
return read_seqcount_begin(&syncp->seq);
#else
return 0;
@@ -170,7 +180,7 @@ static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *
static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT))
preempt_disable();
#endif
return __u64_stats_fetch_begin(syncp);
@@ -179,7 +189,7 @@ static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *sy
static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start)
{
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
return read_seqcount_retry(&syncp->seq, start);
#else
return false;
@@ -189,7 +199,7 @@ static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start)
{
-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT))
preempt_enable();
#endif
return __u64_stats_fetch_retry(syncp, start);
@@ -203,7 +213,9 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
*/
static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp)
{
-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT)
+ preempt_disable();
+#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP)
local_irq_disable();
#endif
return __u64_stats_fetch_begin(syncp);
@@ -212,7 +224,9 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync
static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp,
unsigned int start)
{
-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
+#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT)
+ preempt_enable();
+#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP)
local_irq_enable();
#endif
return __u64_stats_fetch_retry(syncp, start);
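The calling convention is unchanged by the RT additions above; only the internal protection on 32-bit differs (preempt_disable() instead of local_irq_save()). A minimal writer/reader sketch with an invented stats structure; u64_stats_init() on the syncp is assumed to have been done at allocation time.

#include <linux/u64_stats_sync.h>

struct demo_stats {				/* hypothetical per-CPU stats */
	u64_stats_t packets;
	u64_stats_t bytes;
	struct u64_stats_sync syncp;
};

static void demo_stats_add(struct demo_stats *s, unsigned int len)
{
	u64_stats_update_begin(&s->syncp);	/* writer side */
	u64_stats_inc(&s->packets);
	u64_stats_add(&s->bytes, len);
	u64_stats_update_end(&s->syncp);
}

static void demo_stats_read(struct demo_stats *s, u64 *packets, u64 *bytes)
{
	unsigned int start;

	do {					/* reader retry loop */
		start = u64_stats_fetch_begin(&s->syncp);
		*packets = u64_stats_read(&s->packets);
		*bytes = u64_stats_read(&s->bytes);
	} while (u64_stats_fetch_retry(&s->syncp, start));
}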
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 3299cd69e4ca..81f001e71a31 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -63,7 +63,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
*/
static inline void __count_vm_event(enum vm_event_item item)
{
+ preempt_disable_rt();
raw_cpu_inc(vm_event_states.event[item]);
+ preempt_enable_rt();
}
static inline void count_vm_event(enum vm_event_item item)
@@ -73,7 +75,9 @@ static inline void count_vm_event(enum vm_event_item item)
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
+ preempt_disable_rt();
raw_cpu_add(vm_event_states.event[item], delta);
+ preempt_enable_rt();
}
static inline void count_vm_events(enum vm_event_item item, long delta)
@@ -138,34 +142,27 @@ static inline void vm_events_fold_cpu(int cpu)
* Zone and node-based page accounting with per cpu differentials.
*/
extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
-extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
+extern atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
#ifdef CONFIG_NUMA
-static inline void zone_numa_state_add(long x, struct zone *zone,
- enum numa_stat_item item)
+static inline void zone_numa_event_add(long x, struct zone *zone,
+ enum numa_stat_item item)
{
- atomic_long_add(x, &zone->vm_numa_stat[item]);
- atomic_long_add(x, &vm_numa_stat[item]);
+ atomic_long_add(x, &zone->vm_numa_event[item]);
+ atomic_long_add(x, &vm_numa_event[item]);
}
-static inline unsigned long global_numa_state(enum numa_stat_item item)
+static inline unsigned long zone_numa_event_state(struct zone *zone,
+ enum numa_stat_item item)
{
- long x = atomic_long_read(&vm_numa_stat[item]);
-
- return x;
+ return atomic_long_read(&zone->vm_numa_event[item]);
}
-static inline unsigned long zone_numa_state_snapshot(struct zone *zone,
- enum numa_stat_item item)
+static inline unsigned long
+global_numa_event_state(enum numa_stat_item item)
{
- long x = atomic_long_read(&zone->vm_numa_stat[item]);
- int cpu;
-
- for_each_online_cpu(cpu)
- x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];
-
- return x;
+ return atomic_long_read(&vm_numa_event[item]);
}
#endif /* CONFIG_NUMA */
@@ -236,7 +233,7 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
#ifdef CONFIG_SMP
int cpu;
for_each_online_cpu(cpu)
- x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
+ x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_stat_diff[item];
if (x < 0)
x = 0;
@@ -245,18 +242,38 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
}
#ifdef CONFIG_NUMA
-extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item);
+/* See __count_vm_event comment on why raw_cpu_inc is used. */
+static inline void
+__count_numa_event(struct zone *zone, enum numa_stat_item item)
+{
+ struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+
+ raw_cpu_inc(pzstats->vm_numa_event[item]);
+}
+
+static inline void
+__count_numa_events(struct zone *zone, enum numa_stat_item item, long delta)
+{
+ struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+
+ raw_cpu_add(pzstats->vm_numa_event[item], delta);
+}
+
extern unsigned long sum_zone_node_page_state(int node,
enum zone_stat_item item);
-extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
+extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item);
extern unsigned long node_page_state(struct pglist_data *pgdat,
enum node_stat_item item);
extern unsigned long node_page_state_pages(struct pglist_data *pgdat,
enum node_stat_item item);
+extern void fold_vm_numa_events(void);
#else
#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
#define node_page_state(node, item) global_node_page_state(item)
#define node_page_state_pages(node, item) global_node_page_state_pages(item)
+static inline void fold_vm_numa_events(void)
+{
+}
#endif /* CONFIG_NUMA */
#ifdef CONFIG_SMP
@@ -291,7 +308,7 @@ struct ctl_table;
int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp,
loff_t *ppos);
-void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
+void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *);
int calculate_pressure_threshold(struct zone *zone);
int calculate_normal_threshold(struct zone *zone);
@@ -399,7 +416,7 @@ static inline void cpu_vm_stats_fold(int cpu) { }
static inline void quiet_vmstat(void) { }
static inline void drain_zonestat(struct zone *zone,
- struct per_cpu_pageset *pset) { }
+ struct per_cpu_zonestat *pzstats) { }
#endif /* CONFIG_SMP */
static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
@@ -428,7 +445,7 @@ static inline const char *numa_stat_name(enum numa_stat_item item)
static inline const char *node_stat_name(enum node_stat_item item)
{
return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
- NR_VM_NUMA_STAT_ITEMS +
+ NR_VM_NUMA_EVENT_ITEMS +
item];
}
@@ -440,7 +457,7 @@ static inline const char *lru_list_name(enum lru_list lru)
static inline const char *writeback_stat_name(enum writeback_stat_item item)
{
return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
- NR_VM_NUMA_STAT_ITEMS +
+ NR_VM_NUMA_EVENT_ITEMS +
NR_VM_NODE_STAT_ITEMS +
item];
}
@@ -449,7 +466,7 @@ static inline const char *writeback_stat_name(enum writeback_stat_item item)
static inline const char *vm_event_name(enum vm_event_item item)
{
return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
- NR_VM_NUMA_STAT_ITEMS +
+ NR_VM_NUMA_EVENT_ITEMS +
NR_VM_NODE_STAT_ITEMS +
NR_VM_WRITEBACK_STAT_ITEMS +
item];
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 6598ae35e1b5..172efca327e9 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -10,6 +10,7 @@
#include <asm/current.h>
#include <uapi/linux/wait.h>
+#include <linux/atomic.h>
typedef struct wait_queue_entry wait_queue_entry_t;
diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index b77f39f319ad..4f56ec47c698 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -28,6 +28,14 @@ struct ww_class {
unsigned int is_wait_die;
};
+struct ww_mutex {
+ _mutex_t base;
+ struct ww_acquire_ctx *ctx;
+#ifdef CONFIG_DEBUG_MUTEXES
+ struct ww_class *ww_class;
+#endif
+};
+
struct ww_acquire_ctx {
struct task_struct *task;
unsigned long stamp;
@@ -74,7 +82,7 @@ struct ww_acquire_ctx {
static inline void ww_mutex_init(struct ww_mutex *lock,
struct ww_class *ww_class)
{
- __mutex_init(&lock->base, ww_class->mutex_name, &ww_class->mutex_key);
+ __mutex_t_init(&lock->base, ww_class->mutex_name, &ww_class->mutex_key);
lock->ctx = NULL;
#ifdef CONFIG_DEBUG_MUTEXES
lock->ww_class = ww_class;
@@ -322,7 +330,7 @@ extern void ww_mutex_unlock(struct ww_mutex *lock);
*/
static inline int __must_check ww_mutex_trylock(struct ww_mutex *lock)
{
- return mutex_trylock(&lock->base);
+ return _mutex_t_trylock(&lock->base);
}
/***
@@ -335,7 +343,7 @@ static inline int __must_check ww_mutex_trylock(struct ww_mutex *lock)
*/
static inline void ww_mutex_destroy(struct ww_mutex *lock)
{
- mutex_destroy(&lock->base);
+ _mutex_t_destroy(&lock->base);
}
/**
@@ -346,7 +354,7 @@ static inline void ww_mutex_destroy(struct ww_mutex *lock)
*/
static inline bool ww_mutex_is_locked(struct ww_mutex *lock)
{
- return mutex_is_locked(&lock->base);
+ return _mutex_t_is_locked(&lock->base);
}
#endif
diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 1424e02cef90..163f8415e5db 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -6,6 +6,7 @@
#include <linux/socket.h>
#include <linux/rtnetlink.h>
#include <linux/pkt_sched.h>
+#include <net/net_seq_lock.h>
/* Note: this used to be in include/uapi/linux/gen_stats.h */
struct gnet_stats_basic_packed {
@@ -42,15 +43,15 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
spinlock_t *lock, struct gnet_dump *d,
int padattr);
-int gnet_stats_copy_basic(const seqcount_t *running,
+int gnet_stats_copy_basic(net_seqlock_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b);
-void __gnet_stats_copy_basic(const seqcount_t *running,
+void __gnet_stats_copy_basic(net_seqlock_t *running,
struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b);
-int gnet_stats_copy_basic_hw(const seqcount_t *running,
+int gnet_stats_copy_basic_hw(net_seqlock_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b);
@@ -70,13 +71,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
spinlock_t *lock,
- seqcount_t *running, struct nlattr *opt);
+ net_seqlock_t *running, struct nlattr *opt);
void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **ptr,
spinlock_t *lock,
- seqcount_t *running, struct nlattr *opt);
+ net_seqlock_t *running, struct nlattr *opt);
bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
struct gnet_stats_rate_est64 *sample);
diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h
new file mode 100644
index 000000000000..67710bace741
--- /dev/null
+++ b/include/net/net_seq_lock.h
@@ -0,0 +1,15 @@
+#ifndef __NET_NET_SEQ_LOCK_H__
+#define __NET_NET_SEQ_LOCK_H__
+
+#ifdef CONFIG_PREEMPT_RT
+# define net_seqlock_t seqlock_t
+# define net_seq_begin(__r) read_seqbegin(__r)
+# define net_seq_retry(__r, __s) read_seqretry(__r, __s)
+
+#else
+# define net_seqlock_t seqcount_t
+# define net_seq_begin(__r) read_seqcount_begin(__r)
+# define net_seq_retry(__r, __s) read_seqcount_retry(__r, __s)
+#endif
+
+#endif
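
The new header only switches the type and the read-side accessors; the reader pattern itself is the same on both configurations. A hedged sketch of how a stats reader is expected to use it (demo_read_bstats() is illustrative, not the actual gen_stats code):

/* Sketch: lockless snapshot of qdisc byte/packet counters guarded by the
 * Qdisc::running sequence primitive.  net_seq_begin()/net_seq_retry()
 * resolve to the seqcount_t helpers on !RT and to the seqlock_t helpers
 * on PREEMPT_RT.
 */
static void demo_read_bstats(net_seqlock_t *running,
			     const struct gnet_stats_basic_packed *src,
			     struct gnet_stats_basic_packed *dst)
{
	unsigned int seq;

	do {
		seq = net_seq_begin(running);
		dst->bytes   = src->bytes;
		dst->packets = src->packets;
	} while (net_seq_retry(running, seq));
}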
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 57710303908c..c6fec32c661d 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -10,6 +10,7 @@
#include <linux/percpu.h>
#include <linux/dynamic_queue_limits.h>
#include <linux/list.h>
+#include <net/net_seq_lock.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
@@ -101,7 +102,7 @@ struct Qdisc {
struct sk_buff_head gso_skb ____cacheline_aligned_in_smp;
struct qdisc_skb_head q;
struct gnet_stats_basic_packed bstats;
- seqcount_t running;
+ net_seqlock_t running;
struct gnet_stats_queue qstats;
unsigned long state;
struct Qdisc *next_sched;
@@ -142,7 +143,11 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc)
{
if (qdisc->flags & TCQ_F_NOLOCK)
return spin_is_locked(&qdisc->seqlock);
+#ifdef CONFIG_PREEMPT_RT
+ return spin_is_locked(&qdisc->running.lock) ? true : false;
+#else
return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
+#endif
}
static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
@@ -203,17 +208,35 @@ nolock_empty:
} else if (qdisc_is_running(qdisc)) {
return false;
}
+#ifdef CONFIG_PREEMPT_RT
+ if (spin_trylock(&qdisc->running.lock)) {
+ seqcount_t *s = &qdisc->running.seqcount.seqcount;
+ /*
+ * Variant of write_seqcount_t_begin() telling lockdep that a
+ * trylock was attempted.
+ */
+ do_raw_write_seqcount_begin(s);
+ seqcount_acquire(&s->dep_map, 0, 1, _RET_IP_);
+ return true;
+ }
+ return false;
+#else
/* Variant of write_seqcount_begin() telling lockdep a trylock
* was attempted.
*/
raw_write_seqcount_begin(&qdisc->running);
seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
+#endif
return true;
}
static inline void qdisc_run_end(struct Qdisc *qdisc)
{
+#ifdef CONFIG_PREEMPT_RT
+ write_sequnlock(&qdisc->running);
+#else
write_seqcount_end(&qdisc->running);
+#endif
if (qdisc->flags & TCQ_F_NOLOCK) {
spin_unlock(&qdisc->seqlock);
@@ -585,7 +608,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
return qdisc_lock(root);
}
-static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
+static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
{
struct Qdisc *root = qdisc_root_sleeping(qdisc);
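
For context, qdisc_run_begin()/qdisc_run_end() changed above implement owner exclusion around qdisc execution; a minimal sketch of the usual caller pattern (the __qdisc_run() dequeue loop is assumed from mainline net/sched):

/* Sketch: only one CPU runs a given qdisc at a time.  qdisc_run_begin()
 * returns false when another CPU already owns Qdisc::running (a seqcount
 * on !RT, a seqlock on PREEMPT_RT per the hunk above); the caller then
 * simply leaves its packets queued for the current owner to drain.
 */
static void demo_qdisc_run(struct Qdisc *q)
{
	if (qdisc_run_begin(q)) {
		__qdisc_run(q);		/* dequeue/transmit loop */
		qdisc_run_end(q);
	}
}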
diff --git a/init/Kconfig b/init/Kconfig
index a61c92066c2e..6ca28e2268ba 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -977,6 +977,7 @@ config CFS_BANDWIDTH
config RT_GROUP_SCHED
bool "Group scheduling for SCHED_RR/FIFO"
depends on CGROUP_SCHED
+ depends on !PREEMPT_RT
default n
help
This feature lets you explicitly allocate real CPU bandwidth
@@ -1870,6 +1871,7 @@ choice
config SLAB
bool "SLAB"
+ depends on !PREEMPT_RT
select HAVE_HARDENED_USERCOPY_ALLOCATOR
help
The regular slab allocator that is established and known to work
@@ -1890,6 +1892,7 @@ config SLUB
config SLOB
depends on EXPERT
bool "SLOB (Simple Allocator)"
+ depends on !PREEMPT_RT
help
SLOB replaces the stock allocator with a drastically simpler
allocator. SLOB is generally more space efficient but
@@ -1955,7 +1958,7 @@ config SHUFFLE_PAGE_ALLOCATOR
Say Y if unsure.
config SLUB_CPU_PARTIAL
- default y
+ default y if !PREEMPT_RT
depends on SLUB && SMP
bool "SLUB per cpu partial cache"
help
diff --git a/init/main.c b/init/main.c
index a1cbc0ffe9e1..bdc5545570cd 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1531,6 +1531,12 @@ void __init console_on_rootfs(void)
fput(file);
}
+#ifdef CONFIG_PROVE_RCU
+void rcu_tasks_initiate_self_tests(void);
+#else
+static inline void rcu_tasks_initiate_self_tests(void) {}
+#endif
+
static noinline void __init kernel_init_freeable(void)
{
/*
@@ -1556,6 +1562,7 @@ static noinline void __init kernel_init_freeable(void)
rcu_init_tasks_generic();
do_pre_smp_initcalls();
+ rcu_tasks_initiate_self_tests();
lockup_detector_init();
smp_init();
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 3de8fd11873b..4198f0273ecd 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -251,7 +251,7 @@ config ARCH_USE_QUEUED_RWLOCKS
config QUEUED_RWLOCKS
def_bool y if ARCH_USE_QUEUED_RWLOCKS
- depends on SMP
+ depends on SMP && !PREEMPT_RT
config ARCH_HAS_MMIOWB
bool
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 416017301660..c3ccb459ed97 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,5 +1,11 @@
# SPDX-License-Identifier: GPL-2.0-only
+config HAVE_PREEMPT_LAZY
+ bool
+
+config PREEMPT_LAZY
+ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT
+
choice
prompt "Preemption Model"
default PREEMPT_NONE
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index adb5190c4429..920ff974f662 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -345,7 +345,7 @@ void cpuset_read_unlock(void)
percpu_up_read(&cpuset_rwsem);
}
-static DEFINE_SPINLOCK(callback_lock);
+static DEFINE_RAW_SPINLOCK(callback_lock);
static struct workqueue_struct *cpuset_migrate_mm_wq;
@@ -1280,7 +1280,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
* Newly added CPUs will be removed from effective_cpus and
* newly deleted ones will be added back to effective_cpus.
*/
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
if (adding) {
cpumask_or(parent->subparts_cpus,
parent->subparts_cpus, tmp->addmask);
@@ -1299,7 +1299,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
}
parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
return cmd == partcmd_update;
}
@@ -1404,7 +1404,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
continue;
rcu_read_unlock();
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
if (cp->nr_subparts_cpus &&
@@ -1435,7 +1435,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
= cpumask_weight(cp->subparts_cpus);
}
}
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -1553,7 +1553,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
return -EINVAL;
}
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
/*
@@ -1564,7 +1564,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
cs->cpus_allowed);
cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
}
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
update_cpumasks_hier(cs, &tmp);
@@ -1758,9 +1758,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
continue;
rcu_read_unlock();
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cp->effective_mems = *new_mems;
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
WARN_ON(!is_in_v2_mode() &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1828,9 +1828,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
/* use trialcs->mems_allowed as a temp variable */
update_nodemasks_hier(cs, &trialcs->mems_allowed);
@@ -1921,9 +1921,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
|| (is_spread_page(cs) != is_spread_page(trialcs)));
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cs->flags = trialcs->flags;
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
rebuild_sched_domains_locked();
@@ -2432,7 +2432,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
cpuset_filetype_t type = seq_cft(sf)->private;
int ret = 0;
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
switch (type) {
case FILE_CPULIST:
@@ -2454,7 +2454,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
ret = -EINVAL;
}
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
return ret;
}
@@ -2767,14 +2767,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
}
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
@@ -2801,12 +2801,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
}
rcu_read_unlock();
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cs->mems_allowed = parent->mems_allowed;
cs->effective_mems = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
out_unlock:
percpu_up_write(&cpuset_rwsem);
put_online_cpus();
@@ -2862,7 +2862,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
percpu_down_write(&cpuset_rwsem);
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2873,7 +2873,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
top_cpuset.mems_allowed = top_cpuset.effective_mems;
}
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
percpu_up_write(&cpuset_rwsem);
}
@@ -2970,12 +2970,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
{
bool is_empty;
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, new_cpus);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->mems_allowed = *new_mems;
cs->effective_mems = *new_mems;
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -3012,10 +3012,10 @@ hotplug_update_tasks(struct cpuset *cs,
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->effective_mems = *new_mems;
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
if (cpus_updated)
update_tasks_cpumask(cs);
@@ -3170,7 +3170,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
/*
@@ -3190,17 +3190,17 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
}
}
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
- spin_lock_irq(&callback_lock);
+ raw_spin_lock_irq(&callback_lock);
if (!on_dfl)
top_cpuset.mems_allowed = new_mems;
top_cpuset.effective_mems = new_mems;
- spin_unlock_irq(&callback_lock);
+ raw_spin_unlock_irq(&callback_lock);
update_tasks_nodemask(&top_cpuset);
}
@@ -3301,11 +3301,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
unsigned long flags;
- spin_lock_irqsave(&callback_lock, flags);
+ raw_spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_cpus(task_cs(tsk), pmask);
rcu_read_unlock();
- spin_unlock_irqrestore(&callback_lock, flags);
+ raw_spin_unlock_irqrestore(&callback_lock, flags);
}
/**
@@ -3366,11 +3366,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
nodemask_t mask;
unsigned long flags;
- spin_lock_irqsave(&callback_lock, flags);
+ raw_spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_mems(task_cs(tsk), &mask);
rcu_read_unlock();
- spin_unlock_irqrestore(&callback_lock, flags);
+ raw_spin_unlock_irqrestore(&callback_lock, flags);
return mask;
}
@@ -3462,14 +3462,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
return true;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
- spin_lock_irqsave(&callback_lock, flags);
+ raw_spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
rcu_read_unlock();
- spin_unlock_irqrestore(&callback_lock, flags);
+ raw_spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
}
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 03527f518dfb..a04c91b91cf5 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -156,8 +156,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
cpu);
struct cgroup *pos = NULL;
+ unsigned long flags;
- raw_spin_lock(cpu_lock);
+ raw_spin_lock_irqsave(cpu_lock, flags);
while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
struct cgroup_subsys_state *css;
@@ -169,7 +170,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock(cpu_lock);
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
/* if @may_sleep, play nice and yield if necessary */
if (may_sleep && (need_resched() ||
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 4708aec492df..8a073198c4e8 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -241,35 +241,42 @@ NOKPROBE_SYMBOL(kgdb_call_nmi_hook);
static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) =
CSD_INIT(kgdb_call_nmi_hook, NULL);
-void __weak kgdb_roundup_cpus(void)
+void __weak kgdb_roundup_cpu(unsigned int cpu)
{
call_single_data_t *csd;
+ int ret;
+
+ csd = &per_cpu(kgdb_roundup_csd, cpu);
+
+ /*
+ * If it didn't round up last time, don't try again
+ * since smp_call_function_single_async() will block.
+ *
+ * If rounding_up is false then we know that the
+ * previous call must have at least started and that
+ * means smp_call_function_single_async() won't block.
+ */
+ if (kgdb_info[cpu].rounding_up)
+ return;
+ kgdb_info[cpu].rounding_up = true;
+
+ ret = smp_call_function_single_async(cpu, csd);
+ if (ret)
+ kgdb_info[cpu].rounding_up = false;
+}
+NOKPROBE_SYMBOL(kgdb_roundup_cpu);
+
+void __weak kgdb_roundup_cpus(void)
+{
int this_cpu = raw_smp_processor_id();
int cpu;
- int ret;
for_each_online_cpu(cpu) {
/* No need to roundup ourselves */
if (cpu == this_cpu)
continue;
- csd = &per_cpu(kgdb_roundup_csd, cpu);
-
- /*
- * If it didn't round up last time, don't try again
- * since smp_call_function_single_async() will block.
- *
- * If rounding_up is false then we know that the
- * previous call must have at least started and that
- * means smp_call_function_single_async() won't block.
- */
- if (kgdb_info[cpu].rounding_up)
- continue;
- kgdb_info[cpu].rounding_up = true;
-
- ret = smp_call_function_single_async(cpu, csd);
- if (ret)
- kgdb_info[cpu].rounding_up = false;
+ kgdb_roundup_cpu(cpu);
}
}
NOKPROBE_SYMBOL(kgdb_roundup_cpus);
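
Splitting the single-CPU roundup out as a weak symbol lets an architecture replace just that step, for example with an NMI, while keeping the generic loop above. A hedged sketch of such an override; arch_send_debug_nmi() is a made-up placeholder, not a real kernel API:

/* Hypothetical arch override (sketch only): round a CPU up with a debug
 * NMI instead of the default smp_call_function_single_async() path.
 */
void kgdb_roundup_cpu(unsigned int cpu)
{
	arch_send_debug_nmi(cpu);	/* placeholder helper, not real */
}
NOKPROBE_SYMBOL(kgdb_roundup_cpu);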
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 6735ac36b718..539a2f0dc89d 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -559,23 +559,17 @@ static void kdb_msg_write(const char *msg, int msg_len)
cp++;
}
+ /* mirror output on atomic consoles */
for_each_console(c) {
if (!(c->flags & CON_ENABLED))
continue;
if (c == dbg_io_ops->cons)
continue;
- /*
- * Set oops_in_progress to encourage the console drivers to
- * disregard their internal spin locks: in the current calling
- * context the risk of deadlock is a bigger problem than risks
- * due to re-entering the console driver. We operate directly on
- * oops_in_progress rather than using bust_spinlocks() because
- * the calls bust_spinlocks() makes on exit are not appropriate
- * for this calling context.
- */
- ++oops_in_progress;
- c->write(c, msg, msg_len);
- --oops_in_progress;
+
+ if (!c->write_atomic)
+ continue;
+ c->write_atomic(c, msg, msg_len);
+
touch_nmi_watchdog();
}
}
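
kdb output is now mirrored only to consoles that provide the RT printk rework's ->write_atomic() callback, which must be callable from any context without taking the driver's usual locks. A hedged sketch of what the driver side could look like; the demo_* names and the poll helper are illustrative, and the write_atomic() prototype is assumed from the RT series:

/* Sketch: a console advertising an atomic write path that polls the
 * hardware directly instead of taking the port lock.
 */
static void demo_write_atomic(struct console *con, const char *s,
			      unsigned int count)
{
	while (count--)
		demo_uart_poll_put_char(*s++);	/* hypothetical helper */
}

static void demo_write(struct console *con, const char *s, unsigned int count)
{
	demo_write_atomic(con, s, count);	/* sketch: share one path */
}

static struct console demo_console = {
	.name		= "demo",
	.write		= demo_write,
	.write_atomic	= demo_write_atomic,	/* used by kdb_msg_write() */
	.flags		= CON_PRINTBUFFER,
	.index		= -1,
};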
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bf16395b9e13..d8bedab935b2 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -159,9 +159,17 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
local_irq_enable_exit_to_user(ti_work);
- if (ti_work & _TIF_NEED_RESCHED)
+ if (ti_work & _TIF_NEED_RESCHED_MASK)
schedule();
+#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
+ if (unlikely(current->forced_info.si_signo)) {
+ struct task_struct *t = current;
+ force_sig_info(&t->forced_info);
+ t->forced_info.si_signo = 0;
+ }
+#endif
+
if (ti_work & _TIF_UPROBE)
uprobe_notify_resume(regs);
@@ -389,7 +397,7 @@ void irqentry_exit_cond_resched(void)
rcu_irq_exit_check_preempt();
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
WARN_ON_ONCE(!on_thread_stack());
- if (need_resched())
+ if (should_resched(0))
preempt_schedule_irq();
}
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 5ffbfa7ddcfa..e8fd80d51564 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -42,6 +42,7 @@
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/kprobes.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
@@ -289,7 +290,7 @@ static inline void free_thread_stack(struct task_struct *tsk)
return;
}
- vfree_atomic(tsk->stack);
+ vfree(tsk->stack);
return;
}
#endif
@@ -693,6 +694,19 @@ void __mmdrop(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(__mmdrop);
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * RCU callback for delayed mm drop. Not strictly rcu, but we don't
+ * want another facility to make this work.
+ */
+void __mmdrop_delayed(struct rcu_head *rhp)
+{
+ struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
+
+ __mmdrop(mm);
+}
+#endif
+
static void mmdrop_async_fn(struct work_struct *work)
{
struct mm_struct *mm;
@@ -734,6 +748,15 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
+ /*
+ * Remove function-return probe instances associated with this
+ * task and put them back on the free list.
+ */
+ kprobe_flush_task(tsk);
+
+ /* Task is done with its stack. */
+ put_task_stack(tsk);
+
io_uring_free(tsk);
cgroup_free(tsk);
task_numa_free(tsk, true);
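
The RT-only __mmdrop_delayed() added above is the RCU callback half of the change; the other half is a drop helper that defers the final free instead of doing it from a context that must not sleep. A hedged sketch of that caller, reconstructed from the RT series rather than from this hunk (mmdrop_delayed() and the mm_struct::delayed_drop rcu_head are assumptions here):

/* Sketch: RT variant of mmdrop().  Rather than freeing the mm_struct
 * synchronously, punt the final drop to the __mmdrop_delayed() RCU
 * callback added above.
 */
static inline void mmdrop_delayed(struct mm_struct *mm)
{
	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
		mmdrop(mm);
		return;
	}
	if (atomic_dec_and_test(&mm->mm_count))
		call_rcu(&mm->delayed_drop, __mmdrop_delayed);
}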
diff --git a/kernel/futex.c b/kernel/futex.c
index 408cad5e8968..c2565c3dddcd 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -197,6 +197,8 @@ struct futex_pi_state {
* @rt_waiter: rt_waiter storage for use with requeue_pi
* @requeue_pi_key: the requeue_pi target futex key
* @bitset: bitset for the optional bitmasked wakeup
+ * @requeue_state: State field for futex_requeue_pi()
+ * @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
*
* We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
* we can wake only the relevant ones (hashed queues may be shared).
@@ -219,6 +221,10 @@ struct futex_q {
struct rt_mutex_waiter *rt_waiter;
union futex_key *requeue_pi_key;
u32 bitset;
+ atomic_t requeue_state;
+#ifdef CONFIG_PREEMPT_RT
+ struct rcuwait requeue_wait;
+#endif
} __randomize_layout;
static const struct futex_q futex_q_init = {
@@ -1354,7 +1360,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
* - 1 - acquired the lock;
* - <0 - error
*
- * The hb->lock and futex_key refs shall be held by the caller.
+ * The hb->lock must be held by the caller.
*
* @exiting is only set when the return value is -EBUSY. If so, this holds
* a refcount on the exiting task on return and the caller needs to drop it
@@ -1493,11 +1499,11 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
*/
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
+ DEFINE_RT_MUTEX_WAKE_Q_HEAD(wqh);
u32 curval, newval;
struct rt_mutex_waiter *top_waiter;
struct task_struct *new_owner;
bool postunlock = false;
- DEFINE_WAKE_Q(wake_q);
int ret = 0;
top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
@@ -1549,14 +1555,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
* not fail.
*/
pi_state_update_owner(pi_state, new_owner);
- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
}
out_unlock:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
if (postunlock)
- rt_mutex_postunlock(&wake_q);
+ rt_mutex_postunlock(&wqh);
return ret;
}
@@ -1796,6 +1802,158 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
q->key = *key2;
}
+/*
+ * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an
+ * underlying rtmutex. The task which is about to be requeued could have
+ * just woken up (timeout, signal). After the wake up the task has to
+ * acquire hash bucket lock, which is held by the requeue code. As a task
+ * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking
+ * and the hash bucket lock blocking would collide and corrupt state.
+ *
+ * On !PREEMPT_RT this is not a problem and everything could be serialized
+ * on the hash bucket lock, but aside from the benefit of common code,
+ * this also avoids doing the requeue when the task is already on the
+ * way out, and avoids taking the hash bucket lock of the original uaddr1
+ * once the requeue has completed.
+ *
+ * The following state transitions are valid:
+ *
+ * On the waiter side:
+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT
+ *
+ * On the requeue side:
+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IN_PROGRESS
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed)
+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED
+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed)
+ *
+ * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this
+ * signals that the waiter is already on the way out. It also means that
+ * the waiter is still on the 'wait' futex, i.e. uaddr1.
+ *
+ * The waiter side signals early wakeup to the requeue side either through
+ * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending
+ * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately
+ * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT,
+ * which means the wakeup is interleaving with a requeue in progress, it has
+ * to wait for the requeue side to change the state, either to DONE/LOCKED
+ * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex
+ * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by
+ * the requeue side when the requeue attempt failed via deadlock detection
+ * and therefore the waiter q is still on the uaddr1 futex.
+ */
+enum {
+ Q_REQUEUE_PI_NONE = 0,
+ Q_REQUEUE_PI_IGNORE,
+ Q_REQUEUE_PI_IN_PROGRESS,
+ Q_REQUEUE_PI_WAIT,
+ Q_REQUEUE_PI_DONE,
+ Q_REQUEUE_PI_LOCKED,
+};
+
+static inline bool futex_requeue_pi_prepare(struct futex_q *q,
+ struct futex_pi_state *pi_state)
+{
+ int cur, res, new;
+
+ /*
+ * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
+ * already set Q_REQUEUE_PI_IGNORE to signal that requeue should
+ * ignore the waiter.
+ */
+ for (cur = atomic_read(&q->requeue_state);; cur = res) {
+ if (cur == Q_REQUEUE_PI_IGNORE)
+ return false;
+
+ /*
+ * futex_proxy_trylock_atomic() might have set it to
+ * IN_PROGRESS and an interleaved early wake to WAIT.
+ *
+ * It was considered to have an extra state for that
+ * trylock, but that would just add more conditionals
+ * all over the place for a dubious value.
+ */
+ if (cur != Q_REQUEUE_PI_NONE)
+ break;
+
+ new = Q_REQUEUE_PI_IN_PROGRESS;
+ res = atomic_cmpxchg(&q->requeue_state, cur, new);
+ if (likely(cur == res))
+ break;
+ }
+ q->pi_state = pi_state;
+ return true;
+}
+
+static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
+{
+ int cur, res, new;
+
+ for (cur = atomic_read(&q->requeue_state);; cur = res) {
+ if (locked >= 0) {
+ /* Requeue succeeded. Set DONE or LOCKED */
+ new = Q_REQUEUE_PI_DONE + locked;
+ } else if (cur == Q_REQUEUE_PI_IN_PROGRESS) {
+ /* Deadlock, no early wakeup interleave */
+ new = Q_REQUEUE_PI_NONE;
+ } else {
+ /* Deadlock, early wakeup interleave. */
+ new = Q_REQUEUE_PI_IGNORE;
+ }
+
+ res = atomic_cmpxchg(&q->requeue_state, cur, new);
+ if (likely(cur == res))
+ break;
+ }
+
+#ifdef CONFIG_PREEMPT_RT
+ /* If the waiter interleaved with the requeue let it know */
+ if (unlikely(cur == Q_REQUEUE_PI_WAIT))
+ rcuwait_wake_up(&q->requeue_wait);
+#endif
+}
+
+static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
+{
+ int cur, new, res;
+
+ for (cur = atomic_read(&q->requeue_state);; cur = res) {
+ /* Is requeue done already? */
+ if (cur >= Q_REQUEUE_PI_DONE)
+ break;
+
+ /*
+ * If not done, then tell the requeue code to either ignore
+ * the waiter or to wake it up once the requeue is done.
+ */
+ new = !cur ? Q_REQUEUE_PI_IGNORE : Q_REQUEUE_PI_WAIT;
+ res = atomic_cmpxchg(&q->requeue_state, cur, new);
+ if (likely(cur == res))
+ break;
+ }
+
+ /* If the requeue was in progress, wait for it to complete */
+ if (cur == Q_REQUEUE_PI_IN_PROGRESS) {
+#ifdef CONFIG_PREEMPT_RT
+ rcuwait_wait_event(&q->requeue_wait,
+ atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
+ TASK_UNINTERRUPTIBLE);
+#else
+ while (atomic_read(&q->requeue_state) == Q_REQUEUE_PI_WAIT)
+ cpu_relax();
+#endif
+ }
+
+ /*
+ * Requeue is now either prohibited or complete. Reread state
+ * because during the wait above it might have changed. Nothing
+ * will modify q->requeue_state after this point.
+ */
+ return atomic_read(&q->requeue_state);
+}
+
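
All three helpers above share one lock-free idiom: compute the next state from the observed one and install it with atomic_cmpxchg(), retrying whenever the other side moved the state underneath. A stripped-down, generic sketch of just that loop (not futex-specific; demo_transition() is illustrative):

/* Sketch: generic cmpxchg() state-transition loop.  compute_next() maps
 * the observed state to the desired one; the loop retries until the value
 * it computed against is still the value in memory.
 */
static int demo_transition(atomic_t *state, int (*compute_next)(int cur))
{
	int cur, new, res;

	for (cur = atomic_read(state);; cur = res) {
		new = compute_next(cur);
		if (new == cur)
			break;			/* nothing to change */

		res = atomic_cmpxchg(state, cur, new);
		if (res == cur)
			break;			/* transition installed */
	}
	return cur;				/* state the transition was based on */
}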
/**
* requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
* @q: the futex_q
@@ -1823,6 +1981,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
q->lock_ptr = &hb->lock;
+ /* Signal locked state to the waiter */
+ futex_requeue_pi_complete(q, 1);
wake_up_state(q->task, TASK_NORMAL);
}
@@ -1882,10 +2042,18 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
if (!top_waiter)
return 0;
+ /* Ensure that this is a waiter sitting in futex_wait_requeue_pi() */
+ if (!top_waiter->rt_waiter || top_waiter->pi_state)
+ return -EINVAL;
+
/* Ensure we requeue to the expected futex. */
if (!match_futex(top_waiter->requeue_pi_key, key2))
return -EINVAL;
+ /* Ensure that this does not race against an early wakeup */
+ if (!futex_requeue_pi_prepare(top_waiter, NULL))
+ return -EAGAIN;
+
/*
* Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
* the contended case or if set_waiters is 1. The pi_state is returned
@@ -1895,8 +2063,22 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
exiting, set_waiters);
if (ret == 1) {
+ /* Dequeue, wake up and update top_waiter::requeue_state */
requeue_pi_wake_futex(top_waiter, key2, hb2);
return vpid;
+ } else if (ret < 0) {
+ /* Rewind top_waiter::requeue_state */
+ futex_requeue_pi_complete(top_waiter, ret);
+ } else {
+ /*
+ * futex_lock_pi_atomic() did not acquire the user space
+ * futex, but managed to establish the proxy lock and pi
+ * state. top_waiter::requeue_state cannot be fixed up here
+ * because the waiter is not enqueued on the rtmutex
+ * yet. This is handled at the callsite depending on the
+ * result of rt_mutex_start_proxy_lock() which is
+ * guaranteed to be reached with this function returning 0.
+ */
}
return ret;
}
@@ -1956,15 +2138,27 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
*/
if (refill_pi_state_cache())
return -ENOMEM;
+
/*
- * requeue_pi must wake as many tasks as it can, up to nr_wake
- * + nr_requeue, since it acquires the rt_mutex prior to
- * returning to userspace, so as to not leave the rt_mutex with
- * waiters and no owner. However, second and third wake-ups
- * cannot be predicted as they involve race conditions with the
- * first wake and a fault while looking up the pi_state. Both
- * pthread_cond_signal() and pthread_cond_broadcast() should
- * use nr_wake=1.
+ * futex_requeue() allows the caller to define the number
+ * of waiters to wake up via the @nr_wake argument. With
+ * REQUEUE_PI waking up more than one waiter is creating
+ * more problems than it solves. Waking up a waiter makes
+ * only sense if the PI futex @uaddr2 is uncontended as
+ * this allows the requeue code to acquire the futex
+ * @uaddr2 before waking the waiter. The waiter can then
+ * return to user space without further action. A secondary
+ * wakeup would just make the futex_wait_requeue_pi()
+ * handling more complex because that code would have to
+ * look up pi_state and do more or less all the handling
+ * which the requeue code has to do for the to-be-requeued
+ * waiters. So restrict the number of waiters to wake to
+ * one and only wake it up when the PI futex is
+ * uncontended. Otherwise requeue it and let the unlock of
+ * the PI futex handle the wakeup.
+ *
+ * All REQUEUE_PI users, e.g. pthread_cond_signal() and
+ * pthread_cond_broadcast() must use nr_wake=1.
*/
if (nr_wake != 1)
return -EINVAL;
@@ -2025,6 +2219,8 @@ retry_private:
* intend to requeue waiters, force setting the FUTEX_WAITERS
* bit. We force this here where we are able to easily handle
* faults rather in the requeue loop below.
+ *
+ * Updates topwaiter::requeue_state if a top waiter exists.
*/
ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
&key2, &pi_state,
@@ -2038,6 +2234,27 @@ retry_private:
* vpid of the top waiter task.
* If the lock was not taken, we have pi_state and an initial
* refcount on it. In case of an error we have nothing.
+ *
+ * The top waiter's requeue_state is up to date:
+ *
+ * - If the lock was acquired atomically (ret > 0), then
+ * the state is Q_REQUEUE_PI_LOCKED. No matter whether
+ * the below lookup_pi_state() fails or not, requeue_state
+ * is correct because that waiter is dequeued and woken
+ * up and nothing can hold it up.
+ *
+ * - If the trylock failed with an error (ret < 0) then
+ * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
+ * happened", or Q_REQUEUE_PI_IGNORE when there was an
+ * interleaved early wakeup.
+ *
+ * - If the trylock did not succeed (ret == 0) then the
+ * state is either Q_REQUEUE_PI_IN_PROGRESS or
+ * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
+ * This will be cleaned up in the loop below, which
+ * cannot fail because futex_proxy_trylock_atomic() did
+ * the same sanity checks for requeue_pi as the loop
+ * below does.
*/
if (ret > 0) {
WARN_ON(pi_state);
@@ -2063,7 +2280,10 @@ retry_private:
/* We hold a reference on the pi state. */
break;
- /* If the above failed, then pi_state is NULL */
+ /*
+ * If the above failed, then pi_state is NULL and
+ * waiter::requeue_state is correct.
+ */
case -EFAULT:
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
@@ -2115,18 +2335,17 @@ retry_private:
break;
}
- /*
- * Wake nr_wake waiters. For requeue_pi, if we acquired the
- * lock, we already woke the top_waiter. If not, it will be
- * woken by futex_unlock_pi().
- */
- if (++task_count <= nr_wake && !requeue_pi) {
- mark_wake_futex(&wake_q, this);
+ /* Plain futexes just wake or requeue and are done */
+ if (!requeue_pi) {
+ if (++task_count <= nr_wake)
+ mark_wake_futex(&wake_q, this);
+ else
+ requeue_futex(this, hb1, hb2, &key2);
continue;
}
/* Ensure we requeue to the expected futex for requeue_pi. */
- if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
+ if (!match_futex(this->requeue_pi_key, &key2)) {
ret = -EINVAL;
break;
}
@@ -2134,48 +2353,61 @@ retry_private:
/*
* Requeue nr_requeue waiters and possibly one more in the case
* of requeue_pi if we couldn't acquire the lock atomically.
+ *
+ * Prepare the waiter to take the rt_mutex. Take a refcount
+ * on the pi_state and store the pointer in the futex_q
+ * object of the waiter.
*/
- if (requeue_pi) {
+ get_pi_state(pi_state);
+
+ /* Don't requeue when the waiter is already on the way out. */
+ if (!futex_requeue_pi_prepare(this, pi_state)) {
/*
- * Prepare the waiter to take the rt_mutex. Take a
- * refcount on the pi_state and store the pointer in
- * the futex_q object of the waiter.
+ * Early woken waiter signaled that it is on the
+ * way out. Drop the pi_state reference and try the
+ * next waiter. @this->pi_state is still NULL.
*/
- get_pi_state(pi_state);
- this->pi_state = pi_state;
- ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
- this->rt_waiter,
- this->task);
- if (ret == 1) {
- /*
- * We got the lock. We do neither drop the
- * refcount on pi_state nor clear
- * this->pi_state because the waiter needs the
- * pi_state for cleaning up the user space
- * value. It will drop the refcount after
- * doing so.
- */
- requeue_pi_wake_futex(this, &key2, hb2);
- continue;
- } else if (ret) {
- /*
- * rt_mutex_start_proxy_lock() detected a
- * potential deadlock when we tried to queue
- * that waiter. Drop the pi_state reference
- * which we took above and remove the pointer
- * to the state from the waiters futex_q
- * object.
- */
- this->pi_state = NULL;
- put_pi_state(pi_state);
- /*
- * We stop queueing more waiters and let user
- * space deal with the mess.
- */
- break;
- }
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
+ this->task);
+
+ if (ret == 1) {
+ /*
+ * We got the lock. We do neither drop the refcount
+ * on pi_state nor clear this->pi_state because the
+ * waiter needs the pi_state for cleaning up the
+ * user space value. It will drop the refcount
+ * after doing so. this::requeue_state is updated
+ * in the wakeup as well.
+ */
+ requeue_pi_wake_futex(this, &key2, hb2);
+ task_count++;
+ } else if (!ret) {
+ /* Waiter is queued, move it to hb2 */
+ requeue_futex(this, hb1, hb2, &key2);
+ futex_requeue_pi_complete(this, 0);
+ task_count++;
+ } else {
+ /*
+ * rt_mutex_start_proxy_lock() detected a potential
+ * deadlock when we tried to queue that waiter.
+ * Drop the pi_state reference which we took above
+ * and remove the pointer to the state from the
+ * waiters futex_q object.
+ */
+ this->pi_state = NULL;
+ put_pi_state(pi_state);
+ futex_requeue_pi_complete(this, ret);
+ /*
+ * We stop queueing more waiters and let user space
+ * deal with the mess.
+ */
+ break;
}
- requeue_futex(this, hb1, hb2, &key2);
}
/*
@@ -2617,8 +2849,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
*
* Setup the futex_q and locate the hash_bucket. Get the futex value and
* compare it with the expected value. Handle atomic faults internally.
- * Return with the hb lock held and a q.key reference on success, and unlocked
- * with no q.key reference on failure.
+ * Return with the hb lock held on success, and unlocked on failure.
*
* Return:
* - 0 - uaddr contains val and hb has been locked;
@@ -3075,27 +3306,22 @@ pi_faulted:
}
/**
- * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
+ * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex
* @hb: the hash_bucket futex_q was original enqueued on
* @q: the futex_q woken while waiting to be requeued
- * @key2: the futex_key of the requeue target futex
* @timeout: the timeout associated with the wait (NULL if none)
*
- * Detect if the task was woken on the initial futex as opposed to the requeue
- * target futex. If so, determine if it was a timeout or a signal that caused
- * the wakeup and return the appropriate error code to the caller. Must be
- * called with the hb lock held.
+ * Determine the cause for the early wakeup.
*
* Return:
- * - 0 = no early wakeup detected;
- * - <0 = -ETIMEDOUT or -ERESTARTNOINTR
+ * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR
*/
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
- struct futex_q *q, union futex_key *key2,
+ struct futex_q *q,
struct hrtimer_sleeper *timeout)
{
- int ret = 0;
+ int ret;
/*
* With the hb lock held, we avoid races while we process the wakeup.
@@ -3104,22 +3330,21 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* It can't be requeued from uaddr2 to something else since we don't
* support a PI aware source futex for requeue.
*/
- if (!match_futex(&q->key, key2)) {
- WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
- /*
- * We were woken prior to requeue by a timeout or a signal.
- * Unqueue the futex_q and determine which it was.
- */
- plist_del(&q->list, &hb->chain);
- hb_waiters_dec(hb);
+ WARN_ON_ONCE(&hb->lock != q->lock_ptr);
- /* Handle spurious wakeups gracefully */
- ret = -EWOULDBLOCK;
- if (timeout && !timeout->task)
- ret = -ETIMEDOUT;
- else if (signal_pending(current))
- ret = -ERESTARTNOINTR;
- }
+ /*
+ * We were woken prior to requeue by a timeout or a signal.
+ * Unqueue the futex_q and determine which it was.
+ */
+ plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
+
+ /* Handle spurious wakeups gracefully */
+ ret = -EWOULDBLOCK;
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ else if (signal_pending(current))
+ ret = -ERESTARTNOINTR;
return ret;
}
@@ -3172,6 +3397,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT;
struct futex_q q = futex_q_init;
+ struct rt_mutex *pi_mutex;
int res, ret;
if (!IS_ENABLED(CONFIG_FUTEX_PI))
@@ -3221,32 +3447,22 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to);
- spin_lock(&hb->lock);
- ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
- spin_unlock(&hb->lock);
- if (ret)
- goto out;
-
- /*
- * In order for us to be here, we know our q.key == key2, and since
- * we took the hb->lock above, we also know that futex_requeue() has
- * completed and we no longer have to concern ourselves with a wakeup
- * race with the atomic proxy lock acquisition by the requeue code. The
- * futex_requeue dropped our key1 reference and incremented our key2
- * reference count.
- */
+ switch (futex_requeue_pi_wakeup_sync(&q)) {
+ case Q_REQUEUE_PI_IGNORE:
+ /* The waiter is still on uaddr1 */
+ spin_lock(&hb->lock);
+ ret = handle_early_requeue_pi_wakeup(hb, &q, to);
+ spin_unlock(&hb->lock);
+ break;
- /*
- * Check if the requeue code acquired the second futex for us and do
- * any pertinent fixup.
- */
- if (!q.rt_waiter) {
+ case Q_REQUEUE_PI_LOCKED:
+ /* The requeue acquired the lock */
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_owner(uaddr2, &q, true);
/*
- * Drop the reference to the pi state which
- * the requeue_pi() code acquired for us.
+ * Drop the reference to the pi state which the
+ * requeue_pi() code acquired for us.
*/
put_pi_state(q.pi_state);
spin_unlock(q.lock_ptr);
@@ -3256,18 +3472,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
ret = ret < 0 ? ret : 0;
}
- } else {
- struct rt_mutex *pi_mutex;
+ break;
- /*
- * We have been woken up by futex_unlock_pi(), a timeout, or a
- * signal. futex_unlock_pi() will not destroy the lock_ptr nor
- * the pi_state.
- */
- WARN_ON(!q.pi_state);
+ case Q_REQUEUE_PI_DONE:
+ /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */
pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
+ /* Current is no longer pi_blocked_on */
spin_lock(q.lock_ptr);
if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
ret = 0;
@@ -3287,17 +3499,21 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
unqueue_me_pi(&q);
spin_unlock(q.lock_ptr);
- }
- if (ret == -EINTR) {
- /*
- * We've already been requeued, but cannot restart by calling
- * futex_lock_pi() directly. We could restart this syscall, but
- * it would detect that the user space "val" changed and return
- * -EWOULDBLOCK. Save the overhead of the restart and return
- * -EWOULDBLOCK directly.
- */
- ret = -EWOULDBLOCK;
+ if (ret == -EINTR) {
+ /*
+ * We've already been requeued, but cannot restart
+ * by calling futex_lock_pi() directly. We could
+ * restart this syscall, but it would detect that
+ * the user space "val" changed and return
+ * -EWOULDBLOCK. Save the overhead of the restart
+ * and return -EWOULDBLOCK directly.
+ */
+ ret = -EWOULDBLOCK;
+ }
+ break;
+ default:
+ BUG();
}
out:
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 762a928e18f9..7929fcdb7817 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -192,10 +192,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval;
unsigned int flags = 0;
+ struct pt_regs *regs = get_irq_regs();
+ u64 ip = regs ? instruction_pointer(regs) : 0;
retval = __handle_irq_event_percpu(desc, &flags);
- add_interrupt_randomness(desc->irq_data.irq, flags);
+#ifdef CONFIG_PREEMPT_RT
+ desc->random_ip = ip;
+#else
+ add_interrupt_randomness(desc->irq_data.irq, flags, ip);
+#endif
if (!noirqdebug)
note_interrupt(desc, retval);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4c14356543d9..5770a50ced1e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1229,6 +1229,8 @@ static int irq_thread(void *data)
irqreturn_t (*handler_fn)(struct irq_desc *desc,
struct irqaction *action);
+ sched_set_fifo(current);
+
if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
&action->thread_flags))
handler_fn = irq_forced_thread_fn;
@@ -1249,6 +1251,12 @@ static int irq_thread(void *data)
if (action_ret == IRQ_WAKE_THREAD)
irq_wake_secondary(desc, action);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ migrate_disable();
+ add_interrupt_randomness(action->irq, 0,
+ desc->random_ip ^ (unsigned long) action);
+ migrate_enable();
+ }
wake_threads_waitq(desc);
}
@@ -1394,8 +1402,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
if (IS_ERR(t))
return PTR_ERR(t);
- sched_set_fifo(t);
-
/*
* We keep the reference to the task struct even if
* the thread dies to avoid that the interrupt code
@@ -2792,7 +2798,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
* This call sets the internal irqchip state of an interrupt,
* depending on the value of @which.
*
- * This function should be called with preemption disabled if the
+ * This function should be called with migration disabled if the
* interrupt controller has per-cpu registers.
*/
int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index c481d8458325..ca4bdc53d6c7 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -447,6 +447,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
static int __init irqfixup_setup(char *str)
{
+#ifdef CONFIG_PREEMPT_RT
+ pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT\n");
+ return 1;
+#endif
irqfixup = 1;
printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
printk(KERN_WARNING "This may impact system performance.\n");
@@ -459,6 +463,10 @@ module_param(irqfixup, int, 0644);
static int __init irqpoll_setup(char *str)
{
+#ifdef CONFIG_PREEMPT_RT
+ pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT\n");
+ return 1;
+#endif
irqfixup = 2;
printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
"enabled\n");
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index db8c248ebc8c..0ec825dbe9f0 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -18,6 +18,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
+#include <linux/interrupt.h>
#include <asm/processor.h>
#include <linux/kasan.h>
@@ -52,13 +53,27 @@ void __weak arch_irq_work_raise(void)
/* Enqueue on current CPU, work must already be claimed and preempt disabled */
static void __irq_work_queue_local(struct irq_work *work)
{
- /* If the work is "lazy", handle it from next tick if any */
- if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) {
- if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) &&
- tick_nohz_tick_stopped())
- arch_irq_work_raise();
- } else {
- if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list)))
+ struct llist_head *list;
+ bool lazy_work;
+ int work_flags;
+
+ work_flags = atomic_read(&work->node.a_flags);
+ if (work_flags & IRQ_WORK_LAZY)
+ lazy_work = true;
+ else if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ !(work_flags & IRQ_WORK_HARD_IRQ))
+ lazy_work = true;
+ else
+ lazy_work = false;
+
+ if (lazy_work)
+ list = this_cpu_ptr(&lazy_list);
+ else
+ list = this_cpu_ptr(&raised_list);
+
+ if (llist_add(&work->node.llist, list)) {
+ /* If the work is "lazy", handle it from next tick if any */
+ if (!lazy_work || tick_nohz_tick_stopped())
arch_irq_work_raise();
}
}
@@ -104,7 +119,14 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (cpu != smp_processor_id()) {
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
- __smp_call_single_queue(cpu, &work->node.llist);
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) {
+ if (llist_add(&work->node.llist, &per_cpu(lazy_list, cpu)))
+ /* && tick_nohz_tick_stopped_cpu(cpu) */
+ arch_send_call_function_single_ipi(cpu);
+ } else {
+ __smp_call_single_queue(cpu, &work->node.llist);
+ }
} else {
__irq_work_queue_local(work);
}
@@ -122,9 +144,8 @@ bool irq_work_needs_cpu(void)
raised = this_cpu_ptr(&raised_list);
lazy = this_cpu_ptr(&lazy_list);
- if (llist_empty(raised) || arch_irq_work_has_interrupt())
- if (llist_empty(lazy))
- return false;
+ if (llist_empty(raised) && llist_empty(lazy))
+ return false;
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
@@ -167,8 +188,12 @@ static void irq_work_run_list(struct llist_head *list)
struct irq_work *work, *tmp;
struct llist_node *llnode;
+#ifndef CONFIG_PREEMPT_RT
+ /*
+ * nort: On RT IRQ-work may run in SOFTIRQ context.
+ */
BUG_ON(!irqs_disabled());
-
+#endif
if (llist_empty(list))
return;
@@ -184,7 +209,16 @@ static void irq_work_run_list(struct llist_head *list)
void irq_work_run(void)
{
irq_work_run_list(this_cpu_ptr(&raised_list));
- irq_work_run_list(this_cpu_ptr(&lazy_list));
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /*
+ * NOTE: we raise softirq via IPI for safety,
+ * and execute in irq_work_tick() to move the
+ * overhead from hard to soft irq context.
+ */
+ if (!llist_empty(this_cpu_ptr(&lazy_list)))
+ raise_softirq(TIMER_SOFTIRQ);
+ } else
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
}
EXPORT_SYMBOL_GPL(irq_work_run);
@@ -194,8 +228,17 @@ void irq_work_tick(void)
if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
irq_work_run_list(raised);
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+}
+
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT)
+void irq_work_tick_soft(void)
+{
irq_work_run_list(this_cpu_ptr(&lazy_list));
}
+#endif
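
On PREEMPT_RT only work explicitly flagged IRQ_WORK_HARD_IRQ stays on raised_list and runs from hard interrupt context; everything else is parked on lazy_list and drained from the timer softirq via irq_work_tick_soft() above. A hedged usage sketch; IRQ_WORK_INIT()/IRQ_WORK_INIT_HARD() are assumed from mainline irq_work.h and the callbacks are illustrative:

/* Sketch: queueing irq_work under PREEMPT_RT.  hard_work keeps running
 * in hard-irq context even on RT; soft_work is deferred to lazy_list and
 * executed from irq_work_tick_soft() in the (threaded) timer softirq.
 */
static void demo_hard_fn(struct irq_work *w)
{
	/* must remain hard-irq/NMI safe */
}

static void demo_soft_fn(struct irq_work *w)
{
	/* runs from softirq context, threaded and preemptible on RT */
}

static struct irq_work hard_work = IRQ_WORK_INIT_HARD(demo_hard_fn);
static struct irq_work soft_work = IRQ_WORK_INIT(demo_soft_fn);

static void demo_queue_both(void)
{
	irq_work_queue(&hard_work);
	irq_work_queue(&soft_work);
}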
/*
* Synchronize against the irq_work @entry, ensures the entry is not
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index f099baee3578..69c6e9b7761c 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -978,7 +978,6 @@ void crash_kexec(struct pt_regs *regs)
old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
if (old_cpu == PANIC_CPU_INVALID) {
/* This is the 1st CPU which comes here, so go ahead. */
- printk_safe_flush_on_panic();
__crash_kexec(regs);
/*
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 35859da8bd4f..dfff31ed644a 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -138,6 +138,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
#endif /* CONFIG_CRASH_CORE */
+#if defined(CONFIG_PREEMPT_RT)
+static ssize_t realtime_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", 1);
+}
+KERNEL_ATTR_RO(realtime);
+#endif
+
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -229,6 +238,9 @@ static struct attribute * kernel_attrs[] = {
&rcu_expedited_attr.attr,
&rcu_normal_attr.attr,
#endif
+#ifdef CONFIG_PREEMPT_RT
+ &realtime_attr.attr,
+#endif
NULL
};
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 08931e525dd9..424d5c34b9ff 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -270,6 +270,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme);
static int kthread(void *_create)
{
+ static const struct sched_param param = { .sched_priority = 0 };
/* Copy data: it's on kthread's stack */
struct kthread_create_info *create = _create;
int (*threadfn)(void *data) = create->threadfn;
@@ -300,6 +301,13 @@ static int kthread(void *_create)
init_completion(&self->parked);
current->vfork_done = &self->exited;
+ /*
+ * The new thread inherited kthreadd's priority and CPU mask. Reset
+ * back to default in case they have been changed.
+ */
+ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
+ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));
+
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
@@ -397,7 +405,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
}
task = create->result;
if (!IS_ERR(task)) {
- static const struct sched_param param = { .sched_priority = 0 };
char name[TASK_COMM_LEN];
/*
@@ -406,13 +413,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
*/
vsnprintf(name, sizeof(name), namefmt, args);
set_task_comm(task, name);
- /*
- * root may have changed our (kthreadd's) priority or CPU mask.
- * The kernel thread should not inherit these properties.
- */
- sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(task,
- housekeeping_cpumask(HK_FLAG_KTHREAD));
}
kfree(create);
return task;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 3572808223e4..683f0b7fbacc 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -24,7 +24,8 @@ obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
-obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o
+obj-$(CONFIG_PREEMPT_RT) += spinlock_rt.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 38b2a5a512eb..f48b1a374f26 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -5478,6 +5478,7 @@ static noinstr void check_flags(unsigned long flags)
}
}
+#ifndef CONFIG_PREEMPT_RT
/*
* We dont accurately track softirq state in e.g.
* hardirq contexts (such as on 4KSTACKS), so only
@@ -5492,6 +5493,7 @@ static noinstr void check_flags(unsigned long flags)
DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
}
}
+#endif
if (!debug_locks)
print_irqtrace_events(current);

diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index db9301591e3f..fa61e6b24513 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -1,6 +1,4 @@
/*
- * kernel/mutex-debug.c
- *
* Debugging code for mutexes
*
* Started by Ingo Molnar:
@@ -22,19 +20,19 @@
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
-#include "mutex-debug.h"
+#include "mutex.h"
/*
* Must be called with lock->wait_lock held.
*/
-void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
+void debug_mutex_lock_common(_mutex_t *lock, struct mutex_waiter *waiter)
{
memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
waiter->magic = waiter;
INIT_LIST_HEAD(&waiter->list);
}
-void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
+void debug_mutex_wake_waiter(_mutex_t *lock, struct mutex_waiter *waiter)
{
lockdep_assert_held(&lock->wait_lock);
DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
@@ -48,7 +46,7 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter)
memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
}
-void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+void debug_mutex_add_waiter(_mutex_t *lock, struct mutex_waiter *waiter,
struct task_struct *task)
{
lockdep_assert_held(&lock->wait_lock);
@@ -57,7 +55,7 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
task->blocked_on = waiter;
}
-void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+void debug_mutex_remove_waiter(_mutex_t *lock, struct mutex_waiter *waiter,
struct task_struct *task)
{
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
@@ -69,7 +67,7 @@ void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
waiter->task = NULL;
}
-void debug_mutex_unlock(struct mutex *lock)
+void debug_mutex_unlock(_mutex_t *lock)
{
if (likely(debug_locks)) {
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
@@ -77,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock)
}
}
-void debug_mutex_init(struct mutex *lock, const char *name,
+void debug_mutex_init(_mutex_t *lock, const char *name,
struct lock_class_key *key)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -91,17 +89,16 @@ void debug_mutex_init(struct mutex *lock, const char *name,
}
/***
- * mutex_destroy - mark a mutex unusable
+ * _mutex_t_destroy - mark a mutex unusable
* @lock: the mutex to be destroyed
*
* This function marks the mutex uninitialized, and any subsequent
* use of the mutex is forbidden. The mutex must not be locked when
* this function is called.
*/
-void mutex_destroy(struct mutex *lock)
+void _mutex_t_destroy(_mutex_t *lock)
{
- DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock));
+ DEBUG_LOCKS_WARN_ON(_mutex_t_is_locked(lock));
lock->magic = NULL;
}
-
-EXPORT_SYMBOL_GPL(mutex_destroy);
+EXPORT_SYMBOL_GPL(_mutex_t_destroy);
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
deleted file mode 100644
index 53e631e1d76d..000000000000
--- a/kernel/locking/mutex-debug.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Mutexes: blocking mutual exclusion locks
- *
- * started by Ingo Molnar:
- *
- * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * This file contains mutex debugging related internal declarations,
- * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
- * More details are in kernel/mutex-debug.c.
- */
-
-/*
- * This must be called with lock->wait_lock held.
- */
-extern void debug_mutex_lock_common(struct mutex *lock,
- struct mutex_waiter *waiter);
-extern void debug_mutex_wake_waiter(struct mutex *lock,
- struct mutex_waiter *waiter);
-extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
-extern void debug_mutex_add_waiter(struct mutex *lock,
- struct mutex_waiter *waiter,
- struct task_struct *task);
-extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct task_struct *task);
-extern void debug_mutex_unlock(struct mutex *lock);
-extern void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 013e1b08a1bf..3f4359154267 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -30,17 +30,13 @@
#include <linux/debug_locks.h>
#include <linux/osq_lock.h>
-#ifdef CONFIG_DEBUG_MUTEXES
-# include "mutex-debug.h"
-#else
-# include "mutex.h"
-#endif
+#include "mutex.h"
void
-__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
+__mutex_t_init(_mutex_t *lock, const char *name, struct lock_class_key *key)
{
atomic_long_set(&lock->owner, 0);
- spin_lock_init(&lock->wait_lock);
+ raw_spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
osq_lock_init(&lock->osq);
@@ -48,7 +44,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
debug_mutex_init(lock, name, key);
}
-EXPORT_SYMBOL(__mutex_init);
+EXPORT_SYMBOL(__mutex_t_init);
/*
* @owner: contains: 'struct task_struct *' to the current lock owner,
@@ -70,7 +66,7 @@ EXPORT_SYMBOL(__mutex_init);
*
* DO NOT USE (outside of mutex code).
*/
-static inline struct task_struct *__mutex_owner(struct mutex *lock)
+static inline struct task_struct *__mutex_owner(_mutex_t *lock)
{
return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS);
}
@@ -80,11 +76,11 @@ static inline struct task_struct *__owner_task(unsigned long owner)
return (struct task_struct *)(owner & ~MUTEX_FLAGS);
}
-bool mutex_is_locked(struct mutex *lock)
+bool _mutex_t_is_locked(_mutex_t *lock)
{
return __mutex_owner(lock) != NULL;
}
-EXPORT_SYMBOL(mutex_is_locked);
+EXPORT_SYMBOL(_mutex_t_is_locked);
static inline unsigned long __owner_flags(unsigned long owner)
{
@@ -94,7 +90,7 @@ static inline unsigned long __owner_flags(unsigned long owner)
/*
* Trylock variant that returns the owning task on failure.
*/
-static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
+static inline struct task_struct *__mutex_trylock_or_owner(_mutex_t *lock)
{
unsigned long owner, curr = (unsigned long)current;
@@ -137,7 +133,7 @@ static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
/*
* Actual trylock that will work on any unlocked state.
*/
-static inline bool __mutex_trylock(struct mutex *lock)
+static inline bool __mutex_trylock(_mutex_t *lock)
{
return !__mutex_trylock_or_owner(lock);
}
@@ -153,7 +149,7 @@ static inline bool __mutex_trylock(struct mutex *lock)
* Optimistic trylock that only works in the uncontended case. Make sure to
* follow with a __mutex_trylock() before failing.
*/
-static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
+static __always_inline bool __mutex_trylock_fast(_mutex_t *lock)
{
unsigned long curr = (unsigned long)current;
unsigned long zero = 0UL;
@@ -164,7 +160,7 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
return false;
}
-static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
+static __always_inline bool __mutex_unlock_fast(_mutex_t *lock)
{
unsigned long curr = (unsigned long)current;
@@ -175,17 +171,17 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
}
#endif
-static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag)
+static inline void __mutex_set_flag(_mutex_t *lock, unsigned long flag)
{
atomic_long_or(flag, &lock->owner);
}
-static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag)
+static inline void __mutex_clear_flag(_mutex_t *lock, unsigned long flag)
{
atomic_long_andnot(flag, &lock->owner);
}
-static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter)
+static inline bool __mutex_waiter_is_first(_mutex_t *lock, struct mutex_waiter *waiter)
{
return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter;
}
@@ -195,7 +191,7 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter)
* FLAG_WAITERS flag if it's the first waiter.
*/
static void
-__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+__mutex_add_waiter(_mutex_t *lock, struct mutex_waiter *waiter,
struct list_head *list)
{
debug_mutex_add_waiter(lock, waiter, current);
@@ -206,7 +202,7 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
}
static void
-__mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter)
+__mutex_remove_waiter(_mutex_t *lock, struct mutex_waiter *waiter)
{
list_del(&waiter->list);
if (likely(list_empty(&lock->wait_list)))
@@ -221,7 +217,7 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter)
* WAITERS. Provides RELEASE semantics like a regular unlock, the
* __mutex_trylock() provides a matching ACQUIRE semantics for the handoff.
*/
-static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
+static void __mutex_handoff(_mutex_t *lock, struct task_struct *task)
{
unsigned long owner = atomic_long_read(&lock->owner);
@@ -246,7 +242,7 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
}
}
-#ifndef CONFIG_DEBUG_LOCK_ALLOC
+#if !defined(CONFIG_DEBUG_LOCK_ALLOC) && !defined(CONFIG_PREEMPT_RT)
/*
* We split the mutex lock/unlock logic into separate fastpath and
* slowpath functions, to reduce the register pressure on the fastpath.
@@ -284,7 +280,7 @@ void __sched mutex_lock(struct mutex *lock)
__mutex_lock_slowpath(lock);
}
EXPORT_SYMBOL(mutex_lock);
-#endif
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC && !CONFIG_PREEMPT_RT */
/*
* Wait-Die:
@@ -364,7 +360,7 @@ __ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
* __ww_mutex_check_kill() wake any but the earliest context.
*/
static bool __sched
-__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter,
+__ww_mutex_die(_mutex_t *lock, struct mutex_waiter *waiter,
struct ww_acquire_ctx *ww_ctx)
{
if (!ww_ctx->is_wait_die)
@@ -386,7 +382,7 @@ __ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter,
* the lock holders. Even if multiple waiters may wound the lock holder,
* it's sufficient that only one does.
*/
-static bool __ww_mutex_wound(struct mutex *lock,
+static bool __ww_mutex_wound(_mutex_t *lock,
struct ww_acquire_ctx *ww_ctx,
struct ww_acquire_ctx *hold_ctx)
{
@@ -441,7 +437,7 @@ static bool __ww_mutex_wound(struct mutex *lock,
* The current task must not be on the wait list.
*/
static void __sched
-__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
+__ww_mutex_check_waiters(_mutex_t *lock, struct ww_acquire_ctx *ww_ctx)
{
struct mutex_waiter *cur;
@@ -491,15 +487,15 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
* Uh oh, we raced in fastpath, check if any of the waiters need to
* die or wound us.
*/
- spin_lock(&lock->base.wait_lock);
+ raw_spin_lock(&lock->base.wait_lock);
__ww_mutex_check_waiters(&lock->base, ctx);
- spin_unlock(&lock->base.wait_lock);
+ raw_spin_unlock(&lock->base.wait_lock);
}
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
static inline
-bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
+bool ww_mutex_spin_on_owner(_mutex_t *lock, struct ww_acquire_ctx *ww_ctx,
struct mutex_waiter *waiter)
{
struct ww_mutex *ww;
@@ -547,7 +543,7 @@ bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
* "noinline" so that this function shows up on perf profiles.
*/
static noinline
-bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
+bool mutex_spin_on_owner(_mutex_t *lock, struct task_struct *owner,
struct ww_acquire_ctx *ww_ctx, struct mutex_waiter *waiter)
{
bool ret = true;
@@ -586,7 +582,7 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
/*
* Initial check for entering the mutex spinning loop
*/
-static inline int mutex_can_spin_on_owner(struct mutex *lock)
+static inline int mutex_can_spin_on_owner(_mutex_t *lock)
{
struct task_struct *owner;
int retval = 1;
@@ -635,7 +631,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
* changed to itself.
*/
static __always_inline bool
-mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
+mutex_optimistic_spin(_mutex_t *lock, struct ww_acquire_ctx *ww_ctx,
struct mutex_waiter *waiter)
{
if (!waiter) {
@@ -709,17 +705,27 @@ fail:
return false;
}
-#else
+#else /* CONFIG_MUTEX_SPIN_ON_OWNER */
static __always_inline bool
-mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
+mutex_optimistic_spin(_mutex_t *lock, struct ww_acquire_ctx *ww_ctx,
struct mutex_waiter *waiter)
{
return false;
}
-#endif
+#endif /* !CONFIG_MUTEX_SPIN_ON_OWNER */
+
-static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip);
+static noinline void __sched __mutex_unlock_slowpath(_mutex_t *lock, unsigned long ip);
+static __always_inline void __mutex_unlock(_mutex_t *lock)
+{
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+ if (__mutex_unlock_fast(lock))
+ return;
+#endif
+ __mutex_unlock_slowpath(lock, _RET_IP_);
+}
+#ifndef CONFIG_PREEMPT_RT
/**
* mutex_unlock - release the mutex
* @lock: the mutex to be released
@@ -733,13 +739,10 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip);
*/
void __sched mutex_unlock(struct mutex *lock)
{
-#ifndef CONFIG_DEBUG_LOCK_ALLOC
- if (__mutex_unlock_fast(lock))
- return;
-#endif
- __mutex_unlock_slowpath(lock, _RET_IP_);
+ __mutex_unlock(lock);
}
EXPORT_SYMBOL(mutex_unlock);
+#endif /* !CONFIG_PREEMPT_RT */
/**
* ww_mutex_unlock - release the w/w mutex
@@ -767,13 +770,13 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock)
lock->ctx = NULL;
}
- mutex_unlock(&lock->base);
+ __mutex_unlock(&lock->base);
}
EXPORT_SYMBOL(ww_mutex_unlock);
static __always_inline int __sched
-__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
+__ww_mutex_kill(_mutex_t *lock, struct ww_acquire_ctx *ww_ctx)
{
if (ww_ctx->acquired > 0) {
#ifdef CONFIG_DEBUG_MUTEXES
@@ -802,7 +805,7 @@ __ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
* look at waiters before us in the wait-list.
*/
static inline int __sched
-__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter,
+__ww_mutex_check_kill(_mutex_t *lock, struct mutex_waiter *waiter,
struct ww_acquire_ctx *ctx)
{
struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
@@ -850,7 +853,7 @@ __ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter,
*/
static inline int __sched
__ww_mutex_add_waiter(struct mutex_waiter *waiter,
- struct mutex *lock,
+ _mutex_t *lock,
struct ww_acquire_ctx *ww_ctx)
{
struct mutex_waiter *cur;
@@ -923,7 +926,7 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter,
* Lock a mutex (possibly interruptible), slowpath:
*/
static __always_inline int __sched
-__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
+__mutex_lock_common(_mutex_t *lock, long state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip,
struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
{
@@ -968,7 +971,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
return 0;
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
/*
* After waiting to acquire the wait_lock, try again.
*/
@@ -1032,7 +1035,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
goto err;
}
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
schedule_preempt_disabled();
/*
@@ -1055,9 +1058,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
(first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
break;
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
acquired:
__set_current_state(TASK_RUNNING);
@@ -1082,7 +1085,7 @@ skip_wait:
if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
preempt_enable();
return 0;
@@ -1090,22 +1093,24 @@ err:
__set_current_state(TASK_RUNNING);
__mutex_remove_waiter(lock, &waiter);
err_early_kill:
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, ip);
preempt_enable();
return ret;
}
+#ifndef CONFIG_PREEMPT_RT
static int __sched
__mutex_lock(struct mutex *lock, long state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip)
{
return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false);
}
+#endif /* !CONFIG_PREEMPT_RT */
static int __sched
-__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass,
+__ww_mutex_lock(_mutex_t *lock, long state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip,
struct ww_acquire_ctx *ww_ctx)
{
@@ -1113,6 +1118,7 @@ __ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass,
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# ifndef CONFIG_PREEMPT_RT
void __sched
mutex_lock_nested(struct mutex *lock, unsigned int subclass)
{
@@ -1155,6 +1161,7 @@ mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
io_schedule_finish(token);
}
EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+# endif /* !CONFIG_PREEMPT_RT */
static inline int
ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
@@ -1220,7 +1227,7 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
/*
* Release the lock, slowpath:
*/
-static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
+static noinline void __sched __mutex_unlock_slowpath(_mutex_t *lock, unsigned long ip)
{
struct task_struct *next = NULL;
DEFINE_WAKE_Q(wake_q);
@@ -1259,7 +1266,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
owner = old;
}
- spin_lock(&lock->wait_lock);
+ raw_spin_lock(&lock->wait_lock);
debug_mutex_unlock(lock);
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
@@ -1276,12 +1283,13 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
if (owner & MUTEX_FLAG_HANDOFF)
__mutex_handoff(lock, next);
- spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&lock->wait_lock);
wake_up_q(&wake_q);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
+#ifndef CONFIG_PREEMPT_RT
/*
* Here come the less common (and hence less performance-critical) APIs:
* mutex_lock_interruptible() and mutex_trylock().
@@ -1376,6 +1384,7 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock)
{
return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
}
+#endif /* !CONFIG_PREEMPT_RT */
static noinline int __sched
__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
@@ -1394,21 +1403,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
#endif
-/**
- * mutex_trylock - try to acquire the mutex, without waiting
- * @lock: the mutex to be acquired
- *
- * Try to acquire the mutex atomically. Returns 1 if the mutex
- * has been acquired successfully, and 0 on contention.
- *
- * NOTE: this function follows the spin_trylock() convention, so
- * it is negated from the down_trylock() return values! Be careful
- * about this when converting semaphore users to mutexes.
- *
- * This function must not be used in interrupt context. The
- * mutex must be released by the same task that acquired it.
- */
-int __sched mutex_trylock(struct mutex *lock)
+int __sched _mutex_t_trylock(_mutex_t *lock)
{
bool locked;
@@ -1422,7 +1417,7 @@ int __sched mutex_trylock(struct mutex *lock)
return locked;
}
-EXPORT_SYMBOL(mutex_trylock);
+EXPORT_SYMBOL(_mutex_t_trylock);
#ifndef CONFIG_DEBUG_LOCK_ALLOC
int __sched
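Note on the mutex.c hunks above: the core now operates on _mutex_t, wait_lock becomes a raw spinlock, and the public struct mutex wrappers are compiled out under CONFIG_PREEMPT_RT so the RT build can substitute an rtmutex-based mutex. The untouched fastpath is still a single cmpxchg on the owner word, which packs the owning task pointer with low-order flag bits. A standalone C11 sketch of that idea (names are illustrative; the 0x07 mask mirrors the kernel's MUTEX_FLAGS but nothing here is kernel code):

#include <stdatomic.h>
#include <stdbool.h>

#define SKETCH_MUTEX_FLAGS 0x07UL	/* low bits: WAITERS/HANDOFF/PICKUP */

struct sketch_mutex {
	atomic_ulong owner;	/* "task pointer" | flag bits, 0 == unlocked */
};

static bool sketch_trylock_fast(struct sketch_mutex *lock, unsigned long curr)
{
	unsigned long zero = 0UL;

	/* Acquire succeeds only from the fully unlocked, flag-free state. */
	return atomic_compare_exchange_strong_explicit(&lock->owner, &zero, curr,
						       memory_order_acquire,
						       memory_order_relaxed);
}

static bool sketch_unlock_fast(struct sketch_mutex *lock, unsigned long curr)
{
	unsigned long expected = curr;

	/* Any set flag bit makes this CAS fail, forcing the slowpath. */
	return atomic_compare_exchange_strong_explicit(&lock->owner, &expected, 0UL,
						       memory_order_release,
						       memory_order_relaxed);
}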
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index f0c710b1d192..226e5c4f2a51 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -5,11 +5,37 @@
* started by Ingo Molnar:
*
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * This file contains mutex debugging related internal prototypes, for the
- * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
*/
+/*
+ * This is the control structure for tasks blocked on mutex, which resides
+ * on the blocked task's kernel stack:
+ */
+struct mutex_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ struct ww_acquire_ctx *ww_ctx;
+#ifdef CONFIG_DEBUG_MUTEXES
+ void *magic;
+#endif
+};
+
+#ifdef CONFIG_DEBUG_MUTEXES
+extern void debug_mutex_lock_common(_mutex_t *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_wake_waiter(_mutex_t *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
+extern void debug_mutex_add_waiter(_mutex_t *lock,
+ struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_remove_waiter(_mutex_t *lock, struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_unlock(_mutex_t *lock);
+extern void debug_mutex_init(_mutex_t *lock, const char *name,
+ struct lock_class_key *key);
+#else /* CONFIG_DEBUG_MUTEXES */
+
#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
#define debug_mutex_free_waiter(waiter) do { } while (0)
#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
@@ -18,6 +44,7 @@
#define debug_mutex_init(lock, name, key) do { } while (0)
static inline void
-debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
+debug_mutex_lock_common(_mutex_t *lock, struct mutex_waiter *waiter)
{
}
+#endif /* !CONFIG_DEBUG_MUTEXES */
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3c20afbc19e1..f7bb9f6fae93 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -8,17 +8,20 @@
* Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
+ * Adaptive Spinlocks:
+ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
+ * and Peter Morreale,
+ * Adaptive Spinlocks simplification:
+ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
*
* See Documentation/locking/rt-mutex-design.rst for details.
*/
-#include <linux/spinlock.h>
-#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/deadline.h>
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
-#include <linux/sched/deadline.h>
#include <linux/sched/wake_q.h>
-#include <linux/sched/debug.h>
-#include <linux/timer.h>
#include "rtmutex_common.h"
@@ -141,8 +144,19 @@ static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex *lock)
* set up.
*/
#ifndef CONFIG_DEBUG_RT_MUTEXES
-# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
-# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
+static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return cmpxchg_acquire(&lock->owner, old, new) == old;
+}
+
+static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return cmpxchg_release(&lock->owner, old, new) == old;
+}
/*
* Callers must hold the ->wait_lock -- which is the whole purpose as we force
@@ -201,8 +215,20 @@ static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
}
#else
-# define rt_mutex_cmpxchg_acquire(l,c,n) (0)
-# define rt_mutex_cmpxchg_release(l,c,n) (0)
+static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return false;
+}
+
+static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return false;
+}
static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
{
@@ -265,6 +291,26 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
return 1;
}
+static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
+ struct rt_mutex_waiter *top_waiter)
+{
+ if (rt_mutex_waiter_less(waiter, top_waiter))
+ return true;
+
+#ifdef RT_MUTEX_BUILD_SPINLOCKS
+ /*
+ * Note that RT tasks are excluded from same priority (lateral)
+ * steals to prevent the introduction of an unbounded latency.
+ */
+ if (rt_prio(waiter->prio) || dl_prio(waiter->prio))
+ return false;
+
+ return rt_mutex_waiter_equal(waiter, top_waiter);
+#else
+ return false;
+#endif
+}
+
#define __node_2_waiter(node) \
rb_entry((node), struct rt_mutex_waiter, tree_entry)
@@ -326,6 +372,33 @@ static __always_inline void rt_mutex_adjust_prio(struct task_struct *p)
rt_mutex_setprio(p, pi_task);
}
+/* RT mutex specific wake_q wrappers */
+static __always_inline void rt_mutex_wake_q_add(struct rt_mutex_wake_q_head *wqh,
+ struct rt_mutex_waiter *w)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && w->wake_state != TASK_NORMAL) {
+ get_task_struct(w->task);
+ wqh->rtlock_task = w->task;
+ } else {
+ wake_q_add(&wqh->head, w->task);
+ }
+}
+
+static __always_inline void rt_mutex_wake_up_q(struct rt_mutex_wake_q_head *wqh)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) {
+ wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT);
+ put_task_struct(wqh->rtlock_task);
+ wqh->rtlock_task = NULL;
+ }
+
+ if (!wake_q_empty(&wqh->head))
+ wake_up_q(&wqh->head);
+
+ /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
+ preempt_enable();
+}
+
/*
* Deadlock detection is conditional:
*
@@ -348,11 +421,6 @@ rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
return chwalk == RT_MUTEX_FULL_CHAINWALK;
}
-/*
- * Max number of times we'll walk the boosting chain:
- */
-int max_lock_depth = 1024;
-
static __always_inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
{
return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
@@ -676,7 +744,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* to get the lock.
*/
if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
- wake_up_process(rt_mutex_top_waiter(lock)->task);
+ wake_up_state(waiter->task, waiter->wake_state);
raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
@@ -815,19 +883,21 @@ try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* trylock attempt.
*/
if (waiter) {
- /*
- * If waiter is not the highest priority waiter of
- * @lock, give up.
- */
- if (waiter != rt_mutex_top_waiter(lock))
- return 0;
+ struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
/*
- * We can acquire the lock. Remove the waiter from the
- * lock waiters tree.
+ * If waiter is the highest priority waiter of @lock,
+ * or allowed to steal it, take it over.
*/
- rt_mutex_dequeue(lock, waiter);
-
+ if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) {
+ /*
+ * We can acquire the lock. Remove the waiter from the
+ * lock waiters tree.
+ */
+ rt_mutex_dequeue(lock, waiter);
+ } else {
+ return 0;
+ }
} else {
/*
* If the lock has waiters already we check whether @task is
@@ -838,13 +908,9 @@ try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* not need to be dequeued.
*/
if (rt_mutex_has_waiters(lock)) {
- /*
- * If @task->prio is greater than or equal to
- * the top waiter priority (kernel view),
- * @task lost.
- */
- if (!rt_mutex_waiter_less(task_to_waiter(task),
- rt_mutex_top_waiter(lock)))
+ /* Check whether the trylock can steal it. */
+ if (!rt_mutex_steal(task_to_waiter(task),
+ rt_mutex_top_waiter(lock)))
return 0;
/*
@@ -986,7 +1052,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock,
*
* Called with lock->wait_lock held and interrupts disabled.
*/
-static void __sched mark_wakeup_next_waiter(struct wake_q_head *wake_q,
+static void __sched mark_wakeup_next_waiter(struct rt_mutex_wake_q_head *wqh,
struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
@@ -1023,17 +1089,146 @@ static void __sched mark_wakeup_next_waiter(struct wake_q_head *wake_q,
* deboost but before waking our donor task, hence the preempt_disable()
* before unlock.
*
- * Pairs with preempt_enable() in rt_mutex_postunlock();
+ * Pairs with preempt_enable() in rt_mutex_wake_up_q();
*/
preempt_disable();
- wake_q_add(wake_q, waiter->task);
+ rt_mutex_wake_q_add(wqh, waiter);
raw_spin_unlock(&current->pi_lock);
}
+static int __sched __rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+ int ret = try_to_take_rt_mutex(lock, current, NULL);
+
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ return ret;
+}
+
+/*
+ * Slow path try-lock function:
+ */
+static int __sched rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+ unsigned long flags;
+ int ret;
+
+ /*
+ * If the lock already has an owner we fail to get the lock.
+ * This can be done without taking the @lock->wait_lock as
+ * it is only being read, and this is a trylock anyway.
+ */
+ if (rt_mutex_owner(lock))
+ return 0;
+
+ /*
+ * The mutex has currently no owner. Lock the wait lock and try to
+ * acquire the lock. We use irqsave here to support early boot calls.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+
+ ret = __rt_mutex_slowtrylock(lock);
+
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ return ret;
+}
+
+static __always_inline int __rt_mutex_trylock(struct rt_mutex *lock)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 1;
+
+ return rt_mutex_slowtrylock(lock);
+}
+
+/*
+ * Slow path to release a rt-mutex.
+ */
+static void __sched rt_mutex_slowunlock(struct rt_mutex *lock)
+{
+ DEFINE_RT_MUTEX_WAKE_Q_HEAD(wqh);
+ unsigned long flags;
+
+ /* irqsave required to support early boot calls */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+
+ debug_rt_mutex_unlock(lock);
+
+ /*
+ * We must be careful here if the fast path is enabled. If we
+ * have no waiters queued we cannot set owner to NULL here
+ * because of:
+ *
+ * foo->lock->owner = NULL;
+ * rtmutex_lock(foo->lock); <- fast path
+ * free = atomic_dec_and_test(foo->refcnt);
+ * rtmutex_unlock(foo->lock); <- fast path
+ * if (free)
+ * kfree(foo);
+ * raw_spin_unlock(foo->lock->wait_lock);
+ *
+ * So for the fastpath enabled kernel:
+ *
+ * Nothing can set the waiters bit as long as we hold
+ * lock->wait_lock. So we do the following sequence:
+ *
+ * owner = rt_mutex_owner(lock);
+ * clear_rt_mutex_waiters(lock);
+ * raw_spin_unlock(&lock->wait_lock);
+ * if (cmpxchg(&lock->owner, owner, 0) == owner)
+ * return;
+ * goto retry;
+ *
+ * The fastpath disabled variant is simple as all access to
+ * lock->owner is serialized by lock->wait_lock:
+ *
+ * lock->owner = NULL;
+ * raw_spin_unlock(&lock->wait_lock);
+ */
+ while (!rt_mutex_has_waiters(lock)) {
+ /* Drops lock->wait_lock ! */
+ if (unlock_rt_mutex_safe(lock, flags) == true)
+ return;
+ /* Relock the rtmutex and try again */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ }
+
+ /*
+ * The wakeup next waiter path does not suffer from the above
+ * race. See the comments there.
+ *
+ * Queue the next waiter for wakeup once we release the wait_lock.
+ */
+ mark_wakeup_next_waiter(&wqh, lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ rt_mutex_wake_up_q(&wqh);
+}
+
+static __always_inline void __rt_mutex_unlock(struct rt_mutex *lock)
+{
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
+
+ rt_mutex_slowunlock(lock);
+}
+
+#ifdef RT_MUTEX_BUILD_MUTEX
+/*
+ * Functions required for:
+ * - rtmutex, futex on all kernels
+ * - mutex and rwsem substitutions on RT kernels
+ */
+
/*
* Remove a waiter from a lock and give up
*
- * Must be called with lock->wait_lock held and interrupts disabled. I must
+ * Must be called with lock->wait_lock held and interrupts disabled. It must
* have just failed to try_to_take_rt_mutex().
*/
static void __sched remove_waiter(struct rt_mutex *lock,
@@ -1089,44 +1284,8 @@ static void __sched remove_waiter(struct rt_mutex *lock,
raw_spin_lock_irq(&lock->wait_lock);
}
-/*
- * Recheck the pi chain, in case we got a priority setting
- *
- * Called from sched_setscheduler
- */
-void __sched rt_mutex_adjust_pi(struct task_struct *task)
-{
- struct rt_mutex_waiter *waiter;
- struct rt_mutex *next_lock;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&task->pi_lock, flags);
-
- waiter = task->pi_blocked_on;
- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- return;
- }
- next_lock = waiter->lock;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(task);
-
- rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
- next_lock, NULL, task);
-}
-
-void __sched rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
-{
- debug_rt_mutex_init_waiter(waiter);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
- RB_CLEAR_NODE(&waiter->tree_entry);
- waiter->task = NULL;
-}
-
/**
- * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
* @state: the state the task should block in (TASK_INTERRUPTIBLE
* or TASK_UNINTERRUPTIBLE)
@@ -1135,9 +1294,10 @@ void __sched rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
*
* Must be called with lock->wait_lock held and interrupts disabled
*/
-static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter)
+static int __sched rt_mutex_slowlock_block(struct rt_mutex *lock,
+ unsigned int state,
+ struct hrtimer_sleeper *timeout,
+ struct rt_mutex_waiter *waiter)
{
int ret = 0;
@@ -1187,51 +1347,37 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
}
}
-/*
- * Slow path lock function:
+/**
+ * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held
+ * @lock: The rtmutex to block lock
+ * @state: The task state for sleeping
+ * @chwalk: Indicator whether full or partial chainwalk is requested
+ * @waiter: Initializer waiter for blocking
*/
-static int __sched rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk)
+static int __sched __rt_mutex_slowlock(struct rt_mutex *lock,
+ unsigned int state,
+ enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *waiter)
{
- struct rt_mutex_waiter waiter;
- unsigned long flags;
- int ret = 0;
-
- rt_mutex_init_waiter(&waiter);
+ int ret;
- /*
- * Technically we could use raw_spin_[un]lock_irq() here, but this can
- * be called in early boot if the cmpxchg() fast path is disabled
- * (debug, no architecture support). In this case we will acquire the
- * rtmutex with lock->wait_lock held. But we cannot unconditionally
- * enable interrupts in that early boot case. So we need to use the
- * irqsave/restore variants.
- */
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ lockdep_assert_held(&lock->wait_lock);
/* Try to acquire the lock again: */
- if (try_to_take_rt_mutex(lock, current, NULL)) {
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ if (try_to_take_rt_mutex(lock, current, NULL))
return 0;
- }
set_current_state(state);
- /* Setup the timer, when timeout != NULL */
- if (unlikely(timeout))
- hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-
- ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
+ ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk);
if (likely(!ret))
- /* sleep on the mutex */
- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
+ ret = rt_mutex_slowlock_block(lock, state, NULL, waiter);
if (unlikely(ret)) {
__set_current_state(TASK_RUNNING);
- remove_waiter(lock, &waiter);
- rt_mutex_handle_deadlock(ret, chwalk, &waiter);
+ remove_waiter(lock, waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, waiter);
}
/*
@@ -1239,547 +1385,153 @@ static int __sched rt_mutex_slowlock(struct rt_mutex *lock, int state,
* unconditionally. We might have to fix that up.
*/
fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-
- /* Remove pending timer: */
- if (unlikely(timeout))
- hrtimer_cancel(&timeout->timer);
-
- debug_rt_mutex_free_waiter(&waiter);
-
return ret;
}
-static int __sched __rt_mutex_slowtrylock(struct rt_mutex *lock)
+static inline int __rt_mutex_slowlock_locked(struct rt_mutex *lock,
+ unsigned int state)
{
- int ret = try_to_take_rt_mutex(lock, current, NULL);
+ struct rt_mutex_waiter waiter;
- /*
- * try_to_take_rt_mutex() sets the lock waiters bit
- * unconditionally. Clean this up.
- */
- fixup_rt_mutex_waiters(lock);
+ rt_mutex_init_waiter(&waiter);
- return ret;
+ return __rt_mutex_slowlock(lock, state, RT_MUTEX_MIN_CHAINWALK, &waiter);
}
/*
- * Slow path try-lock function:
+ * rt_mutex_slowlock - Locking slowpath invoked when fast path fails
+ * @lock: The rtmutex to block lock
+ * @state: The task state for sleeping
*/
-static int __sched rt_mutex_slowtrylock(struct rt_mutex *lock)
+static int __sched rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state)
{
unsigned long flags;
int ret;
/*
- * If the lock already has an owner we fail to get the lock.
- * This can be done without taking the @lock->wait_lock as
- * it is only being read, and this is a trylock anyway.
- */
- if (rt_mutex_owner(lock))
- return 0;
-
- /*
- * The mutex has currently no owner. Lock the wait lock and try to
- * acquire the lock. We use irqsave here to support early boot calls.
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);
-
- ret = __rt_mutex_slowtrylock(lock);
-
+ ret = __rt_mutex_slowlock_locked(lock, state);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return ret;
}
-/*
- * Performs the wakeup of the top-waiter and re-enables preemption.
- */
-void __sched rt_mutex_postunlock(struct wake_q_head *wake_q)
+static __always_inline int __rt_mutex_lock(struct rt_mutex *lock,
+ unsigned int state)
{
- wake_up_q(wake_q);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 0;
- /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
- preempt_enable();
+ return rt_mutex_slowlock(lock, state);
}
+#endif /* RT_MUTEX_BUILD_MUTEX */
+#ifdef RT_MUTEX_BUILD_SPINLOCKS
/*
- * Slow path to release a rt-mutex.
- *
- * Return whether the current task needs to call rt_mutex_postunlock().
+ * Functions required for spin/rw_lock substitution on RT kernels
*/
-static void __sched rt_mutex_slowunlock(struct rt_mutex *lock)
-{
- DEFINE_WAKE_Q(wake_q);
- unsigned long flags;
-
- /* irqsave required to support early boot calls */
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
-
- debug_rt_mutex_unlock(lock);
-
- /*
- * We must be careful here if the fast path is enabled. If we
- * have no waiters queued we cannot set owner to NULL here
- * because of:
- *
- * foo->lock->owner = NULL;
- * rtmutex_lock(foo->lock); <- fast path
- * free = atomic_dec_and_test(foo->refcnt);
- * rtmutex_unlock(foo->lock); <- fast path
- * if (free)
- * kfree(foo);
- * raw_spin_unlock(foo->lock->wait_lock);
- *
- * So for the fastpath enabled kernel:
- *
- * Nothing can set the waiters bit as long as we hold
- * lock->wait_lock. So we do the following sequence:
- *
- * owner = rt_mutex_owner(lock);
- * clear_rt_mutex_waiters(lock);
- * raw_spin_unlock(&lock->wait_lock);
- * if (cmpxchg(&lock->owner, owner, 0) == owner)
- * return;
- * goto retry;
- *
- * The fastpath disabled variant is simple as all access to
- * lock->owner is serialized by lock->wait_lock:
- *
- * lock->owner = NULL;
- * raw_spin_unlock(&lock->wait_lock);
- */
- while (!rt_mutex_has_waiters(lock)) {
- /* Drops lock->wait_lock ! */
- if (unlock_rt_mutex_safe(lock, flags) == true)
- return;
- /* Relock the rtmutex and try again */
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
- }
-
- /*
- * The wakeup next waiter path does not suffer from the above
- * race. See the comments there.
- *
- * Queue the next waiter for wakeup once we release the wait_lock.
- */
- mark_wakeup_next_waiter(&wake_q, lock);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-
- rt_mutex_postunlock(&wake_q);
-}
+#ifdef CONFIG_SMP
/*
- * debug aware fast / slowpath lock,trylock,unlock
- *
- * The atomic acquire/release ops are compiled away, when either the
- * architecture does not support cmpxchg or when debugging is enabled.
+ * Note that owner is a speculative pointer and dereferencing relies
+ * on rcu_read_lock() and the check against the lock owner.
*/
-static __always_inline int __rt_mutex_lock(struct rt_mutex *lock, long state,
- unsigned int subclass)
+static bool rtlock_adaptive_spinwait(struct rt_mutex *lock,
+ struct task_struct *owner)
{
- int ret;
-
- might_sleep();
- mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
-
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 0;
-
- ret = rt_mutex_slowlock(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
- if (ret)
- mutex_release(&lock->dep_map, _RET_IP_);
- return ret;
-}
+ bool res = true;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-/**
- * rt_mutex_lock_nested - lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- * @subclass: the lockdep subclass
- */
-void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
-{
- __rt_mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass);
+ rcu_read_lock();
+ for (;;) {
+ /* Owner changed. Trylock again */
+ if (owner != rt_mutex_owner(lock))
+ break;
+ /*
+ * Ensure that owner->on_cpu is dereferenced _after_
+ * checking the above to be valid.
+ */
+ barrier();
+ if (!owner->on_cpu) {
+ res = false;
+ break;
+ }
+ cpu_relax();
+ }
+ rcu_read_unlock();
+ return res;
}
-EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
-
-#else /* !CONFIG_DEBUG_LOCK_ALLOC */
-
-/**
- * rt_mutex_lock - lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- */
-void __sched rt_mutex_lock(struct rt_mutex *lock)
+#else
+static bool rtlock_adaptive_spinwait(struct rt_mutex *lock,
+ struct task_struct *owner)
{
- __rt_mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0);
+ return false;
}
-EXPORT_SYMBOL_GPL(rt_mutex_lock);
#endif
/**
- * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
- *
- * @lock: the rt_mutex to be locked
- *
- * Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
+ * rtlock_slowlock_locked - Slow path lock acquisition for RT locks
+ * @lock: The underlying rt mutex
*/
-int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
+static void __sched rtlock_slowlock_locked(struct rt_mutex *lock)
{
- return __rt_mutex_lock(lock, TASK_INTERRUPTIBLE, 0);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
-
-/**
- * rt_mutex_trylock - try to lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- *
- * This function can only be called in thread context. It's safe to call it
- * from atomic regions, but not from hard or soft interrupt context.
- *
- * Returns:
- * 1 on success
- * 0 on contention
- */
-int __sched rt_mutex_trylock(struct rt_mutex *lock)
-{
- int ret;
-
- if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
- return 0;
-
- /*
- * No lockdep annotation required because lockdep disables the fast
- * path.
- */
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 1;
-
- ret = rt_mutex_slowtrylock(lock);
- if (ret)
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(rt_mutex_trylock);
-
-/**
- * rt_mutex_unlock - unlock a rt_mutex
- *
- * @lock: the rt_mutex to be unlocked
- */
-void __sched rt_mutex_unlock(struct rt_mutex *lock)
-{
- mutex_release(&lock->dep_map, _RET_IP_);
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
- return;
-
- rt_mutex_slowunlock(lock);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_unlock);
-
-/*
- * Futex variants, must not use fastpath.
- */
-int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
-{
- return rt_mutex_slowtrylock(lock);
-}
-
-int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
-{
- return __rt_mutex_slowtrylock(lock);
-}
+ struct rt_mutex_waiter waiter;
+ struct task_struct *owner;
-/**
- * __rt_mutex_futex_unlock - Futex variant, that since futex variants
- * do not use the fast-path, can be simple and will not need to retry.
- *
- * @lock: The rt_mutex to be unlocked
- * @wake_q: The wake queue head from which to get the next lock waiter
- */
-bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wake_q)
-{
lockdep_assert_held(&lock->wait_lock);
- debug_rt_mutex_unlock(lock);
-
- if (!rt_mutex_has_waiters(lock)) {
- lock->owner = NULL;
- return false; /* done */
- }
-
- /*
- * We've already deboosted, mark_wakeup_next_waiter() will
- * retain preempt_disabled when we drop the wait_lock, to
- * avoid inversion prior to the wakeup. preempt_disable()
- * therein pairs with rt_mutex_postunlock().
- */
- mark_wakeup_next_waiter(wake_q, lock);
-
- return true; /* call postunlock() */
-}
-
-void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
-{
- DEFINE_WAKE_Q(wake_q);
- unsigned long flags;
- bool postunlock;
-
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
- postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-
- if (postunlock)
- rt_mutex_postunlock(&wake_q);
-}
-
-/**
- * __rt_mutex_init - initialize the rt_mutex
- *
- * @lock: The rt_mutex to be initialized
- * @name: The lock name used for debugging
- * @key: The lock class key used for debugging
- *
- * Initialize the rt_mutex to unlocked state.
- *
- * Initializing of a locked rt_mutex is not allowed
- */
-void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name,
- struct lock_class_key *key)
-{
- debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
+ if (try_to_take_rt_mutex(lock, current, NULL))
+ return;
- __rt_mutex_basic_init(lock);
-}
-EXPORT_SYMBOL_GPL(__rt_mutex_init);
+ rt_mutex_init_rtlock_waiter(&waiter);
-/**
- * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
- * proxy owner
- *
- * @lock: the rt_mutex to be locked
- * @proxy_owner:the task to set as owner
- *
- * No locking. Caller has to do serializing itself
- *
- * Special API call for PI-futex support. This initializes the rtmutex and
- * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
- * possible at this point because the pi_state which contains the rtmutex
- * is not yet visible to other tasks.
- */
-void __sched rt_mutex_init_proxy_locked(struct rt_mutex *lock,
- struct task_struct *proxy_owner)
-{
- __rt_mutex_basic_init(lock);
- rt_mutex_set_owner(lock, proxy_owner);
-}
+ /* Save current state and set state to TASK_RTLOCK_WAIT */
+ current_save_and_set_rtlock_wait_state();
-/**
- * rt_mutex_proxy_unlock - release a lock on behalf of owner
- *
- * @lock: the rt_mutex to be locked
- *
- * No locking. Caller has to do serializing itself
- *
- * Special API call for PI-futex support. This merrily cleans up the rtmutex
- * (debugging) state. Concurrent operations on this rt_mutex are not
- * possible because it belongs to the pi_state which is about to be freed
- * and it is not longer visible to other tasks.
- */
-void __sched rt_mutex_proxy_unlock(struct rt_mutex *lock)
-{
- debug_rt_mutex_proxy_unlock(lock);
- rt_mutex_set_owner(lock, NULL);
-}
+ task_blocks_on_rt_mutex(lock, &waiter, current, RT_MUTEX_MIN_CHAINWALK);
-/**
- * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
- * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
- *
- * NOTE: does _NOT_ remove the @waiter on failure; must either call
- * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for PI-futex support.
- */
-int __sched __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task)
-{
- int ret;
-
- lockdep_assert_held(&lock->wait_lock);
+ for (;;) {
+ /* Try to acquire the lock again. */
+ if (try_to_take_rt_mutex(lock, current, &waiter))
+ break;
- if (try_to_take_rt_mutex(lock, task, NULL))
- return 1;
+ if (&waiter == rt_mutex_top_waiter(lock))
+ owner = rt_mutex_owner(lock);
+ else
+ owner = NULL;
+ raw_spin_unlock_irq(&lock->wait_lock);
- /* We enforce deadlock detection for futexes */
- ret = task_blocks_on_rt_mutex(lock, waiter, task,
- RT_MUTEX_FULL_CHAINWALK);
+ if (!owner || !rtlock_adaptive_spinwait(lock, owner))
+ schedule_rtlock();
- if (ret && !rt_mutex_owner(lock)) {
- /*
- * Reset the return value. We might have
- * returned with -EDEADLK and the owner
- * released the lock while we were walking the
- * pi chain. Let the waiter sort it out.
- */
- ret = 0;
+ raw_spin_lock_irq(&lock->wait_lock);
+ set_current_state(TASK_RTLOCK_WAIT);
}
- return ret;
-}
-
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
- * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
- *
- * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
- * on failure.
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for PI-futex support.
- */
-int __sched rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task)
-{
- int ret;
-
- raw_spin_lock_irq(&lock->wait_lock);
- ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
- if (unlikely(ret))
- remove_waiter(lock, waiter);
- raw_spin_unlock_irq(&lock->wait_lock);
-
- return ret;
-}
-
-/**
- * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
- * @lock: the rt_mutex we were woken on
- * @to: the timeout, null if none. hrtimer should already have
- * been started.
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Wait for the lock acquisition started on our behalf by
- * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
- * rt_mutex_cleanup_proxy_lock().
- *
- * Returns:
- * 0 - success
- * <0 - error, one of -EINTR, -ETIMEDOUT
- *
- * Special API call for PI-futex support
- */
-int __sched rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter)
-{
- int ret;
+ /* Restore the task state */
+ current_restore_rtlock_saved_state();
- raw_spin_lock_irq(&lock->wait_lock);
- /* sleep on the mutex */
- set_current_state(TASK_INTERRUPTIBLE);
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
/*
- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
- * have to fix that up.
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We
+ * might have to fix that up:
*/
fixup_rt_mutex_waiters(lock);
- raw_spin_unlock_irq(&lock->wait_lock);
-
- return ret;
+ debug_rt_mutex_free_waiter(&waiter);
}
-/**
- * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
- * @lock: the rt_mutex we were woken on
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
- * rt_mutex_wait_proxy_lock().
- *
- * Unless we acquired the lock; we're still enqueued on the wait-list and can
- * in fact still be granted ownership until we're removed. Therefore we can
- * find we are in fact the owner and must disregard the
- * rt_mutex_wait_proxy_lock() failure.
- *
- * Returns:
- * true - did the cleanup, we done.
- * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
- * caller should disregards its return value.
- *
- * Special API call for PI-futex support
- */
-bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
+static __always_inline void __sched rtlock_slowlock(struct rt_mutex *lock)
{
- bool cleanup = false;
-
- raw_spin_lock_irq(&lock->wait_lock);
- /*
- * Do an unconditional try-lock, this deals with the lock stealing
- * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
- * sets a NULL owner.
- *
- * We're not interested in the return value, because the subsequent
- * test on rt_mutex_owner() will infer that. If the trylock succeeded,
- * we will own the lock and it will have removed the waiter. If we
- * failed the trylock, we're still not owner and we need to remove
- * ourselves.
- */
- try_to_take_rt_mutex(lock, current, waiter);
- /*
- * Unless we're the owner; we're still enqueued on the wait_list.
- * So check if we became owner, if not, take us off the wait_list.
- */
- if (rt_mutex_owner(lock) != current) {
- remove_waiter(lock, waiter);
- cleanup = true;
- }
- /*
- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
- * have to fix that up.
- */
- fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock_irq(&lock->wait_lock);
+ unsigned long flags;
- return cleanup;
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ rtlock_slowlock_locked(lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
}
-#ifdef CONFIG_DEBUG_RT_MUTEXES
-void rt_mutex_debug_task_free(struct task_struct *task)
-{
- DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
- DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
-}
-#endif
+#endif /* RT_MUTEX_BUILD_SPINLOCKS */
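Note on the rtmutex.c hunks above: rt_mutex_steal() replaces the open-coded top-waiter checks and, only in the RT_MUTEX_BUILD_SPINLOCKS build, permits lateral (equal-priority) steals for non-RT tasks, while RT and deadline tasks never steal laterally so they cannot inflict unbounded latency on the top waiter. A standalone sketch of that decision with a simplified waiter carrying just a kernel-view priority (lower value means higher priority); deadline handling is omitted and all names are illustrative:

#include <stdbool.h>

#define BUILD_SPINLOCKS	1	/* stands in for RT_MUTEX_BUILD_SPINLOCKS */
#define MAX_RT_PRIO	100	/* kernel-view prios below this are RT */

struct sketch_waiter {
	int prio;
};

static bool waiter_less(const struct sketch_waiter *a, const struct sketch_waiter *b)
{
	return a->prio < b->prio;
}

static bool waiter_equal(const struct sketch_waiter *a, const struct sketch_waiter *b)
{
	return a->prio == b->prio;
}

static bool sketch_steal(const struct sketch_waiter *waiter,
			 const struct sketch_waiter *top_waiter)
{
	/* A strictly higher-priority waiter may always take the lock. */
	if (waiter_less(waiter, top_waiter))
		return true;

#if BUILD_SPINLOCKS
	/* Lateral steals are reserved for non-RT tasks. */
	if (waiter->prio < MAX_RT_PRIO)
		return false;

	return waiter_equal(waiter, top_waiter);
#else
	return false;
#endif
}

With this, a SCHED_OTHER trylock of equal priority to the top waiter succeeds, while an RT task of equal priority still queues behind it.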
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
new file mode 100644
index 000000000000..e4eed067873c
--- /dev/null
+++ b/kernel/locking/rtmutex_api.c
@@ -0,0 +1,647 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * rtmutex API
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_MUTEX
+#include "rtmutex.c"
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+/*
+ * Debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock,
+ unsigned int state,
+ unsigned int subclass)
+{
+ int ret;
+
+ might_sleep();
+ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ ret = __rt_mutex_lock(lock, state);
+ if (ret)
+ mutex_release(&lock->dep_map, _RET_IP_);
+ return ret;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/**
+ * rt_mutex_lock_nested - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ * @subclass: the lockdep subclass
+ */
+void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
+
+#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+#endif
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
+{
+ return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * This function can only be called in thread context. It's safe to call it
+ * from atomic regions, but not from hard or soft interrupt context.
+ *
+ * Returns:
+ * 1 on success
+ * 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(lock);
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(lock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
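A minimal caller-side sketch of the API exported above. This is illustrative only; DEFINE_RT_MUTEX() is assumed to be the usual static initializer from the rtmutex header and is not shown in this patch.

	#include <linux/rtmutex.h>

	static DEFINE_RT_MUTEX(example_lock);	/* assumed initializer macro */

	static int example_update(bool may_block)
	{
		if (!may_block) {
			/* Thread context only, see the rt_mutex_trylock() comment. */
			if (!rt_mutex_trylock(&example_lock))
				return -EBUSY;
		} else {
			rt_mutex_lock(&example_lock);	/* PI-boosts the owner if needed */
		}

		/* ... critical section ... */

		rt_mutex_unlock(&example_lock);
		return 0;
	}
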
+/*
+ * Futex variants, must not use fastpath.
+ */
+int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+ return rt_mutex_slowtrylock(lock);
+}
+
+int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
+}
+
+/**
+ * __rt_mutex_futex_unlock - Futex variant which, since futex variants
+ * do not use the fast-path, can be simple and will not need to retry.
+ *
+ * @lock: The rt_mutex to be unlocked
+ * @wqh: The wake queue head from which to get the next lock waiter
+ */
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct rt_mutex_wake_q_head *wqh)
+{
+ lockdep_assert_held(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ return false; /* done */
+ }
+
+ /*
+ * We've already deboosted, mark_wakeup_next_waiter() will
+ * retain preempt_disabled when we drop the wait_lock, to
+ * avoid inversion prior to the wakeup. preempt_disable()
+ * therein pairs with rt_mutex_postunlock().
+ */
+ mark_wakeup_next_waiter(wqh, lock);
+
+ return true; /* call postunlock() */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+{
+ DEFINE_RT_MUTEX_WAKE_Q_HEAD(wqh);
+ unsigned long flags;
+ bool postunlock;
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ postunlock = __rt_mutex_futex_unlock(lock, &wqh);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wqh);
+}
+
+/**
+ * __rt_mutex_init - initialize the rt_mutex
+ *
+ * @lock: The rt_mutex to be initialized
+ * @name: The lock name used for debugging
+ * @key: The lock class key used for debugging
+ *
+ * Initialize the rt_mutex to unlocked state.
+ *
+ * Initializing of a locked rt_mutex is not allowed
+ */
+void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+
+ __rt_mutex_basic_init(lock);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ * proxy owner
+ *
+ * @lock: the rt_mutex to be locked
+ * @proxy_owner: the task to set as owner
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This initializes the rtmutex and
+ * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
+ * possible at this point because the pi_state which contains the rtmutex
+ * is not yet visible to other tasks.
+ */
+void __sched rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ struct task_struct *proxy_owner)
+{
+ static struct lock_class_key pi_futex_key;
+
+ __rt_mutex_basic_init(lock);
+ /*
+ * On PREEMPT_RT the futex hashbucket spinlock becomes 'sleeping'
+ * and rtmutex based. That causes a lockdep false positive because
+ * some of the futex functions invoke spin_unlock(&hb->lock) with
+ * the wait_lock of the rtmutex associated to the pi_futex held.
+ * spin_unlock() in turn takes wait_lock of the rtmutex on which
+ * the spinlock is based which makes lockdep notice a lock
+ * recursion. Give the futex/rtmutex wait_lock a separate key.
+ */
+ lockdep_set_class(&lock->wait_lock, &pi_futex_key);
+ rt_mutex_set_owner(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This just cleans up the rtmutex
+ * (debugging) state. Concurrent operations on this rt_mutex are not
+ * possible because it belongs to the pi_state which is about to be freed
+ * and it is no longer visible to other tasks.
+ */
+void __sched rt_mutex_proxy_unlock(struct rt_mutex *lock)
+{
+ debug_rt_mutex_proxy_unlock(lock);
+ rt_mutex_set_owner(lock, NULL);
+}
+
+/**
+ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: does _NOT_ remove the @waiter on failure; must either call
+ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __sched __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (try_to_take_rt_mutex(lock, task, NULL))
+ return 1;
+
+ /* We enforce deadlock detection for futexes */
+ ret = task_blocks_on_rt_mutex(lock, waiter, task,
+ RT_MUTEX_FULL_CHAINWALK);
+
+ if (ret && !rt_mutex_owner(lock)) {
+ /*
+ * Reset the return value. We might have
+ * returned with -EDEADLK and the owner
+ * released the lock while we were walking the
+ * pi chain. Let the waiter sort it out.
+ */
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
+ * on failure.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __sched rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @to: the timeout, null if none. hrtimer should already have
+ * been started.
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Wait for the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
+ *
+ * Returns:
+ * 0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT
+ *
+ * Special API call for PI-futex support
+ */
+int __sched rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /* sleep on the mutex */
+ set_current_state(TASK_INTERRUPTIBLE);
+ ret = rt_mutex_slowlock_block(lock, TASK_INTERRUPTIBLE, to, waiter);
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
+ * rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ * true - we did the cleanup and are done.
+ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ * caller should disregard its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool cleanup = false;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Do an unconditional try-lock, this deals with the lock stealing
+ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
+ * sets a NULL owner.
+ *
+ * We're not interested in the return value, because the subsequent
+ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
+ * we will own the lock and it will have removed the waiter. If we
+ * failed the trylock, we're still not owner and we need to remove
+ * ourselves.
+ */
+ try_to_take_rt_mutex(lock, current, waiter);
+ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
+ remove_waiter(lock, waiter);
+ cleanup = true;
+ }
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return cleanup;
+}
+
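The three proxy-lock calls above form a small protocol used by the PI-futex code. The outline below is a hedged illustration of the expected call sequence only: the real futex glue (hash-bucket locking, requeueing) is elided, and the "start" and "wait" steps are collapsed into one thread for readability, whereas in the real code the start step is performed by the requeueing task on behalf of the waiter. The waiter is assumed to have been set up with rt_mutex_init_waiter() (see rtmutex_common.h below).

	/* Illustrative outline, not the real futex code. */
	static int proxy_lock_outline(struct rt_mutex *pi_mutex,
				      struct rt_mutex_waiter *waiter,
				      struct hrtimer_sleeper *timeout)
	{
		int ret;

		/* 1) Enqueue as waiter with deadlock detection; may acquire. */
		ret = rt_mutex_start_proxy_lock(pi_mutex, waiter, current);
		if (ret > 0)
			return 0;	/* lock acquired for the task already */
		if (ret < 0)
			return ret;	/* e.g. -EDEADLK */

		/* 2) Block until woken, interrupted or timed out. */
		ret = rt_mutex_wait_proxy_lock(pi_mutex, timeout, waiter);

		/* 3) On failure we may still have become owner meanwhile. */
		if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, waiter))
			ret = 0;	/* we own the lock after all */

		return ret;
	}
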
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void __sched rt_mutex_adjust_pi(struct task_struct *task)
+{
+ struct rt_mutex_waiter *waiter;
+ struct rt_mutex *next_lock;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ return;
+ }
+ next_lock = waiter->lock;
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(task);
+
+ rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
+ next_lock, NULL, task);
+}
+
+/*
+ * Performs the wakeup of the top-waiter and re-enables preemption.
+ */
+void __sched rt_mutex_postunlock(struct rt_mutex_wake_q_head *wqh)
+{
+ rt_mutex_wake_up_q(wqh);
+}
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+ DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
+ DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
+}
+#endif
+
+/* Interfaces for PREEMPT_RT lock substitutions */
+#ifdef CONFIG_PREEMPT_RT
+/**
+ * rwsem_rt_mutex_slowlock_locked - Lock slowpath invoked with @lock::wait_lock held
+ * @lock: The rtmutex to acquire
+ * @state: The task state for blocking
+ */
+int __sched rwsem_rt_mutex_slowlock_locked(struct rt_mutex *lock,
+ unsigned int state)
+{
+ return __rt_mutex_slowlock_locked(lock, state);
+}
+
+/**
+ * rwsem_rt_mutex_lock_state - Lock a rt_mutex with a given state
+ * @lock: The rt_mutex to be locked
+ * @state: The state to set when blocking on the rt_mutex
+ *
+ * The function does no lockdep operations on @lock. The lockdep state
+ * changes have to be done on the callsite related to the locking primitive
+ * which embeds the rtmutex. Otherwise lockdep has double tracking.
+ */
+int __sched rwsem_rt_mutex_lock_state(struct rt_mutex *lock, unsigned int state)
+{
+ return __rt_mutex_lock(lock, state);
+}
+
+/**
+ * rwsem_rt_mutex_trylock - Try to lock a rt_mutex
+ * @lock: The rt_mutex to be locked
+ *
+ * The function does no lockdep operations on @lock. The lockdep state
+ * changes have to be done on the callsite related to the locking primitive
+ * which embeds the rtmutex. Otherwise lockdep has double tracking.
+ */
+int __sched rwsem_rt_mutex_trylock(struct rt_mutex *lock)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) &&
+ WARN_ON_ONCE(in_nmi() | in_hardirq()))
+ return 0;
+
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 1;
+
+ return rt_mutex_slowtrylock(lock);
+}
+
+/**
+ * rwsem_rt_mutex_unlock - Unlock a rt_mutex
+ * @lock: The rt_mutex to be unlocked
+ *
+ * The function does no lockdep operations on @lock. The lockdep state
+ * changes have to be done on the callsite related to the locking primitive
+ * which embeds the rtmutex. Otherwise lockdep has double tracking.
+ */
+void rwsem_rt_mutex_unlock(struct rt_mutex *lock)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(lock, current, NULL)))
+ return;
+
+ rt_mutex_slowunlock(lock);
+}
+
+/* Mutexes */
+void __mutex_rt_init(struct mutex *mutex, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
+ lockdep_init_map(&mutex->dep_map, name, key, 0);
+}
+EXPORT_SYMBOL(__mutex_rt_init);
+
+static __always_inline int __mutex_lock_common(struct mutex *lock,
+ unsigned int state,
+ unsigned int subclass,
+ struct lockdep_map *nest_lock,
+ unsigned long ip)
+{
+ int ret;
+
+ might_sleep();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+ ret = __rt_mutex_lock(&lock->rtmutex, state);
+ if (ret)
+ mutex_release(&lock->dep_map, ip);
+ else
+ lock_acquired(&lock->dep_map, ip);
+ return ret;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+void __sched _mutex_lock_nest_lock(struct mutex *lock,
+ struct lockdep_map *nest_lock)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest_lock, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
+
+int __sched mutex_lock_interruptible_nested(struct mutex *lock,
+ unsigned int subclass)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+
+int __sched mutex_lock_killable_nested(struct mutex *lock,
+ unsigned int subclass)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+
+void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
+{
+ int token;
+
+ might_sleep();
+
+ token = io_schedule_prepare();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+
+#else /* CONFIG_DEBUG_LOCK_ALLOC */
+
+void __sched mutex_lock(struct mutex *lock)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock);
+
+int __sched mutex_lock_interruptible(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock_interruptible);
+
+int __sched mutex_lock_killable(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock_killable);
+
+void __sched mutex_lock_io(struct mutex *lock)
+{
+ int token = io_schedule_prepare();
+
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL(mutex_lock_io);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+int __sched mutex_trylock(struct mutex *lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL(mutex_trylock);
+
+void __sched mutex_unlock(struct mutex *lock)
+{
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->rtmutex);
+}
+EXPORT_SYMBOL(mutex_unlock);
+
+#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index a90c22abdbca..fecc839cf082 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -25,6 +25,7 @@
* @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
* @task: task reference to the blocked task
* @lock: Pointer to the rt_mutex on which the waiter blocks
+ * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
* @prio: Priority of the waiter
* @deadline: Deadline of the waiter if applicable
*/
@@ -33,10 +34,61 @@ struct rt_mutex_waiter {
struct rb_node pi_tree_entry;
struct task_struct *task;
struct rt_mutex *lock;
+ int wake_state;
int prio;
u64 deadline;
};
+/**
+ * rt_mutex_wake_q_head - Wrapper around regular wake_q_head to support
+ * "sleeping" spinlocks on RT
+ * @head: The regular wake_q_head for sleeping lock variants
+ * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups
+ */
+struct rt_mutex_wake_q_head {
+ struct wake_q_head head;
+ struct task_struct *rtlock_task;
+};
+
+#define DEFINE_RT_MUTEX_WAKE_Q_HEAD(name) \
+ struct rt_mutex_wake_q_head name = { \
+ .head = WAKE_Q_HEAD_INITIALIZER(name.head), \
+ .rtlock_task = NULL, \
+ }
+
+/*
+ * PI-futex support (proxy locking functions, etc.):
+ */
+extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter);
+
+extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct rt_mutex_wake_q_head *wqh);
+
+extern void rt_mutex_postunlock(struct rt_mutex_wake_q_head *wqh);
+
+/* Special interfaces for RT lock substitutions */
+int rwsem_rt_mutex_slowlock_locked(struct rt_mutex *lock, unsigned int state);
+int rwsem_rt_mutex_lock_state(struct rt_mutex *lock, unsigned int state);
+int rwsem_rt_mutex_trylock(struct rt_mutex *lock);
+void rwsem_rt_mutex_unlock(struct rt_mutex *lock);
+
/*
* Must be guarded because this header is included from rcu/tree_plugin.h
* unconditionally.
@@ -78,13 +130,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS);
}
-#else /* CONFIG_RT_MUTEXES */
-/* Used in rcu/tree_plugin.h */
-static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
-{
- return NULL;
-}
-#endif /* !CONFIG_RT_MUTEXES */
/*
* Constants for rt mutex functions which have a selectable deadlock
@@ -108,34 +153,6 @@ static inline void __rt_mutex_basic_init(struct rt_mutex *lock)
lock->waiters = RB_ROOT_CACHED;
}
-/*
- * PI-futex support (proxy locking functions, etc.):
- */
-extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
- struct task_struct *proxy_owner);
-extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
-extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task);
-extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task);
-extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter);
-extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter);
-
-extern int rt_mutex_futex_trylock(struct rt_mutex *l);
-extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
-
-extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
-extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh);
-
-extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
-
/* Debug functions */
static inline void debug_rt_mutex_unlock(struct rt_mutex *lock)
{
@@ -161,4 +178,27 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
memset(waiter, 0x22, sizeof(*waiter));
}
+static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+ waiter->wake_state = TASK_NORMAL;
+ waiter->task = NULL;
+}
+
+static inline void rt_mutex_init_rtlock_waiter(struct rt_mutex_waiter *waiter)
+{
+ rt_mutex_init_waiter(waiter);
+ waiter->wake_state = TASK_RTLOCK_WAIT;
+}
+
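The only difference between the two initializers above is @wake_state, which the wakeup side is expected to pass when waking the blocked task. The sketch below shows that intended use; since rt_mutex_wake_up_q() is not part of this hunk, the exact consumer is an assumption here.

	/* Illustrative sketch of how @wake_state is meant to be consumed. */
	static void wake_waiter_sketch(struct rt_mutex_waiter *w)
	{
		/*
		 * Sleeping-lock waiters (mutex/rwsem) were set up with
		 * rt_mutex_init_waiter() and wake as TASK_NORMAL.  rtlock
		 * (spinlock_t/rwlock_t substitution) waiters were set up with
		 * rt_mutex_init_rtlock_waiter() and wake as TASK_RTLOCK_WAIT,
		 * so only the saved rtlock wait state is targeted.
		 */
		wake_up_state(w->task, w->wake_state);
	}
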
+#else /* CONFIG_RT_MUTEXES */
+/* Used in rcu/tree_plugin.h */
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+{
+ return NULL;
+}
+#endif /* !CONFIG_RT_MUTEXES */
+
#endif
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
new file mode 100644
index 000000000000..c8d5c77954c4
--- /dev/null
+++ b/kernel/locking/rwbase_rt.c
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * RT-specific reader/writer semaphores and reader/writer locks
+ *
+ * down_write/write_lock()
+ * 1) Lock rtmutex
+ * 2) Remove the reader BIAS to force readers into the slow path
+ * 3) Wait until all readers have left the critical region
+ * 4) Mark it write locked
+ *
+ * up_write/write_unlock()
+ * 1) Remove the write locked marker
+ * 2) Set the reader BIAS so readers can use the fast path again
+ * 3) Unlock rtmutex to release blocked readers
+ *
+ * down_read/read_lock()
+ * 1) Try fast path acquisition (reader BIAS is set)
+ * 2) Take rtmutex::wait_lock which protects the writelocked flag
+ * 3) If !writelocked, acquire it for read
+ * 4) If writelocked, block on tmutex
+ * 5) unlock rtmutex, goto 1)
+ *
+ * up_read/read_unlock()
+ * 1) Try fast path release (reader count != 1)
+ * 2) Wake the writer waiting in down_write()/write_lock() #3
+ *
+ * down_read/read_lock()#3 has the consequence, that rw semaphores and rw
+ * locks on RT are not writer fair, but writers, which should be avoided in
+ * RT tasks (think mmap_sem), are subject to the rtmutex priority/DL
+ * inheritance mechanism.
+ *
+ * It's possible to make the rw primitives writer fair by keeping a list of
+ * active readers. A blocked writer would force all newly incoming readers
+ * to block on the rtmutex, but the rtmutex would have to be proxy locked
+ * for one reader after the other. We can't use multi-reader inheritance
+ * because there is no way to support that with SCHED_DEADLINE.
+ * Implementing the one by one reader boosting/handover mechanism is a
+ * major surgery for a very dubious value.
+ *
+ * The risk of writer starvation is there, but the pathological use cases
+ * which trigger it are not necessarily the typical RT workloads.
+ *
+ * Common code shared between RT rw_semaphore and rwlock
+ */
+
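To make the BIAS bookkeeping described above concrete, here is a standalone userspace toy model of the rwb->readers encoding. It is illustrative only: the real READER_BIAS/WRITER_BIAS constants live in the rwbase_rt header, which is not part of this hunk, so scaled-down stand-in values are used, and all blocking/wakeup machinery is omitted.

	/* rwbase_bias_demo.c - toy model of the reader-BIAS bookkeeping. */
	#include <assert.h>
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define READER_BIAS	(-(1 << 16))	/* stand-in: reader fast path open */
	#define WRITER_BIAS	(1 << 8)	/* stand-in: write locked marker */

	static bool read_trylock(atomic_int *readers)
	{
		int r = atomic_load(readers);

		/* Readers may enter only while the (negative) BIAS is set. */
		while (r < 0) {
			if (atomic_compare_exchange_weak(readers, &r, r + 1))
				return true;
		}
		return false;
	}

	int main(void)
	{
		atomic_int readers = READER_BIAS;	/* unlocked */

		assert(read_trylock(&readers));		/* reader #1 */
		assert(read_trylock(&readers));		/* reader #2 */
		assert(atomic_load(&readers) == READER_BIAS + 2);

		/* down_write() step 2: remove the BIAS; the remaining value
		 * counts the readers the writer still has to wait for. */
		atomic_fetch_sub(&readers, READER_BIAS);
		assert(atomic_load(&readers) == 2);
		assert(!read_trylock(&readers));	/* fast path closed */

		/* up_read(): the reader leaving last wakes the writer. */
		atomic_fetch_sub(&readers, 1);
		if (atomic_fetch_sub(&readers, 1) == 1)
			puts("last reader left, writer proceeds");

		/* down_write() step 4: mark it write locked. */
		atomic_store(&readers, WRITER_BIAS);

		/* up_write(): restore the BIAS, reader fast path reopens. */
		atomic_fetch_add(&readers, READER_BIAS - WRITER_BIAS);
		assert(atomic_load(&readers) == READER_BIAS);
		return 0;
	}
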
+static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb)
+{
+ int r, old;
+
+ /*
+ * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
+ * set.
+ */
+ for (r = atomic_read(&rwb->readers); r < 0;) {
+ old = atomic_cmpxchg(&rwb->readers, r, r + 1);
+ if (likely(old == r))
+ return 1;
+ r = old;
+ }
+ return 0;
+}
+
+static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex *rtm = &rwb->rtmutex;
+ int ret;
+
+ raw_spin_lock_irq(&rtm->wait_lock);
+ /*
+ * Allow readers as long as the writer has not completely
+ * acquired the semaphore for write.
+ */
+ if (atomic_read(&rwb->readers) != WRITER_BIAS) {
+ atomic_inc(&rwb->readers);
+ raw_spin_unlock_irq(&rtm->wait_lock);
+ return 0;
+ }
+
+ /*
+ * Call into the slow lock path with the rtmutex->wait_lock
+ * held, so this can't result in the following race:
+ *
+ * Reader1 Reader2 Writer
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * wait()
+ * down_read()
+ * unlock(m->wait_lock)
+ * up_read()
+ * wake(Writer)
+ * lock(m->wait_lock)
+ * sem->writelocked=true
+ * unlock(m->wait_lock)
+ *
+ * up_write()
+ * sem->writelocked=false
+ * rtmutex_unlock(m)
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * wait()
+ * rtmutex_lock(m)
+ *
+ * That would put Reader1 behind the writer waiting on
+ * Reader2 to call up_read() which might be unbound.
+ */
+
+ /*
+ * For rwlocks this returns 0 unconditionally, so the below
+ * !ret conditionals are optimized out.
+ */
+ ret = rwbase_rtmutex_slowlock_locked(rtm, state);
+
+ /*
+ * On success the rtmutex is held, so there can't be a writer
+ * active. Increment the reader count and immediately drop the
+ * rtmutex again.
+ *
+ * rtmutex->wait_lock has to be unlocked in any case of course.
+ */
+ if (!ret)
+ atomic_inc(&rwb->readers);
+ raw_spin_unlock_irq(&rtm->wait_lock);
+ if (!ret)
+ rwbase_rtmutex_unlock(rtm);
+ return ret;
+}
+
+static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ if (rwbase_read_trylock(rwb))
+ return 0;
+
+ return __rwbase_read_lock(rwb, state);
+}
+
+static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex *rtm = &rwb->rtmutex;
+ struct task_struct *owner;
+
+ raw_spin_lock_irq(&rtm->wait_lock);
+ /*
+ * Wake the writer, i.e. the rtmutex owner. It might release the
+ * rtmutex concurrently in the fast path (due to a signal), but to
+ * clean up rwb->readers it needs to acquire rtm->wait_lock. The
+ * worst case which can happen is a spurious wakeup.
+ */
+ owner = rt_mutex_owner(rtm);
+ if (owner)
+ wake_up_state(owner, state);
+
+ raw_spin_unlock_irq(&rtm->wait_lock);
+}
+
+static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ /*
+ * rwb->readers can only hit 0 when a writer is waiting for the
+ * active readers to leave the critical region.
+ */
+ if (unlikely(atomic_dec_and_test(&rwb->readers)))
+ __rwbase_read_unlock(rwb, state);
+}
+
+static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias,
+ unsigned long flags)
+{
+ struct rt_mutex *rtm = &rwb->rtmutex;
+
+ atomic_add(READER_BIAS - bias, &rwb->readers);
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_rtmutex_unlock(rtm);
+}
+
+static inline void rwbase_write_unlock(struct rwbase_rt *rwb)
+{
+ struct rt_mutex *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ __rwbase_write_unlock(rwb, WRITER_BIAS, flags);
+}
+
+static inline void rwbase_write_downgrade(struct rwbase_rt *rwb)
+{
+ struct rt_mutex *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ /* Release it and account current as reader */
+ __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags);
+
+}
+
+static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ /* Take the rtmutex as a first step */
+ if (rwbase_rtmutex_lock_state(rtm, state))
+ return -EINTR;
+
+ /* Force readers into slow path */
+ atomic_sub(READER_BIAS, &rwb->readers);
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ /*
+ * set_current_state() for rw_semaphore
+ * current_save_and_set_rtlock_wait_state() for rwlock
+ */
+ rwbase_set_and_save_current_state(state);
+
+ /* Block until all readers have left the critical region. */
+ for (; atomic_read(&rwb->readers);) {
+ /* Optimized out for rwlocks */
+ if (rwbase_signal_pending_state(state, current)) {
+ __set_current_state(TASK_RUNNING);
+ __rwbase_write_unlock(rwb, 0, flags);
+ return -EINTR;
+ }
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+
+ /*
+ * Schedule and wait for the readers to leave the critical
+ * section. The last reader leaving it wakes the waiter.
+ */
+ if (atomic_read(&rwb->readers) != 0)
+ rwbase_schedule();
+ set_current_state(state);
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ }
+
+ atomic_set(&rwb->readers, WRITER_BIAS);
+ rwbase_restore_current_state();
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ return 0;
+}
+
+static inline int rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ struct rt_mutex *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ if (!rwbase_rtmutex_trylock(rtm))
+ return 0;
+
+ atomic_sub(READER_BIAS, &rwb->readers);
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (!atomic_read(&rwb->readers)) {
+ atomic_set(&rwb->readers, WRITER_BIAS);
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ return 1;
+ }
+ __rwbase_write_unlock(rwb, 0, flags);
+ return 0;
+}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 809b0016d344..5870cddddb62 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -28,6 +28,7 @@
#include <linux/rwsem.h>
#include <linux/atomic.h>
+#ifndef CONFIG_PREEMPT_RT
#include "lock_events.h"
/*
@@ -1344,6 +1345,113 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
rwsem_downgrade_wake(sem);
}
+#else /* !CONFIG_PREEMPT_RT */
+
+#include "rtmutex_common.h"
+
+#define rwbase_set_and_save_current_state(state) \
+ set_current_state(state)
+
+#define rwbase_restore_current_state() \
+ __set_current_state(TASK_RUNNING)
+
+#define rwbase_rtmutex_lock_state(rtm, state) \
+ rwsem_rt_mutex_lock_state(rtm, state)
+
+#define rwbase_rtmutex_slowlock_locked(rtm, state) \
+ rwsem_rt_mutex_slowlock_locked(rtm, state)
+
+#define rwbase_rtmutex_unlock(rtm) \
+ rwsem_rt_mutex_unlock(rtm)
+
+#define rwbase_rtmutex_trylock(rtm) \
+ rwsem_rt_mutex_trylock(rtm)
+
+#define rwbase_signal_pending_state(state, current) \
+ signal_pending_state(state, current)
+
+#define rwbase_schedule() \
+ schedule()
+
+#include "rwbase_rt.c"
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rwsem_init(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+}
+EXPORT_SYMBOL(__rwsem_init);
+#endif
+
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __down_read_interruptible(struct rw_semaphore *sem)
+{
+ return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
+}
+
+static inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ return rwbase_read_trylock(&sem->rwbase);
+}
+
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
+}
+
+static inline void __sched __down_write(struct rw_semaphore *sem)
+{
+ rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __sched __down_write_killable(struct rw_semaphore *sem)
+{
+ return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ return rwbase_write_trylock(&sem->rwbase);
+}
+
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ rwbase_write_unlock(&sem->rwbase);
+}
+
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ rwbase_write_downgrade(&sem->rwbase);
+}
+
+/* Debug stubs for the common API */
+#define DEBUG_RWSEMS_WARN_ON(c, sem)
+
+static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+}
+
+static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+{
+ int count = atomic_read(&sem->rwbase.readers);
+
+ return count < 0 && count != READER_BIAS;
+}
+
+#endif /* CONFIG_PREEMPT_RT */
+
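rwsem.c above consumes rwbase_rt.c as a template: it binds the rwbase_*() hooks to the rwsem_rt_mutex_*() helpers and then textually #includes the shared .c file; spinlock_rt.c further down does the same with the rtlock variants, so one algorithm is compiled twice against different blocking primitives. Below is a minimal standalone illustration of that technique with invented names, not code from the series.

	/* counter_template.c - shared algorithm written only against hooks
	 * that the including translation unit must provide. */
	static long counter;

	static long guarded_increment(void)
	{
		if (!hook_trylock())
			hook_lock_slow();
		counter++;
		hook_unlock();
		return counter;
	}

	/* mutex_user.c - binds the hooks to a pthread mutex, then includes
	 * the template, just as rwsem.c binds rwbase_* to rwsem_rt_mutex_*. */
	#include <pthread.h>

	static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

	#define hook_trylock()		(pthread_mutex_trylock(&m) == 0)
	#define hook_lock_slow()	pthread_mutex_lock(&m)
	#define hook_unlock()		pthread_mutex_unlock(&m)

	#include "counter_template.c"
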
/*
* lock for reading
*/
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index c8d7ad9fb9b2..c5830cfa379a 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
* __[spin|read|write]_lock_bh()
*/
BUILD_LOCK_OPS(spin, raw_spinlock);
+
+#ifndef CONFIG_PREEMPT_RT
BUILD_LOCK_OPS(read, rwlock);
BUILD_LOCK_OPS(write, rwlock);
+#endif
#endif
@@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
EXPORT_SYMBOL(_raw_spin_unlock_bh);
#endif
+#ifndef CONFIG_PREEMPT_RT
+
#ifndef CONFIG_INLINE_READ_TRYLOCK
int __lockfunc _raw_read_trylock(rwlock_t *lock)
{
@@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
EXPORT_SYMBOL(_raw_write_unlock_bh);
#endif
+#endif /* !CONFIG_PREEMPT_RT */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index b9d93087ee66..14235671a1a7 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
EXPORT_SYMBOL(__raw_spin_lock_init);
+#ifndef CONFIG_PREEMPT_RT
void __rwlock_init(rwlock_t *lock, const char *name,
struct lock_class_key *key)
{
@@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
}
EXPORT_SYMBOL(__rwlock_init);
+#endif
static void spin_dump(raw_spinlock_t *lock, const char *msg)
{
@@ -139,6 +141,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
arch_spin_unlock(&lock->raw_lock);
}
+#ifndef CONFIG_PREEMPT_RT
static void rwlock_bug(rwlock_t *lock, const char *msg)
{
if (!debug_locks_off())
@@ -228,3 +231,5 @@ void do_raw_write_unlock(rwlock_t *lock)
debug_write_unlock(lock);
arch_write_unlock(&lock->raw_lock);
}
+
+#endif /* !CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
new file mode 100644
index 000000000000..85b19e41e26d
--- /dev/null
+++ b/kernel/locking/spinlock_rt.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PREEMPT_RT substitution for spin/rw_locks
+ *
+ * spin_lock and rw_lock on RT are based on rtmutex with a few twists to
+ * resemble the non RT semantics
+ *
+ * - Contrary to a plain rtmutex, spin_lock and rw_lock are state
+ * preserving. The task state is saved before blocking on the underlying
+ * rtmutex and restored when the lock has been acquired. Regular wakeups
+ * during that time are redirected to the saved state so no wake up is
+ * missed.
+ *
+ * - Non RT spin/rw_locks disable preemption and, for the irq variants, interrupts.
+ * Disabling preemption has the side effect of disabling migration and
+ * preventing RCU grace periods.
+ *
+ * The RT substitutions explicitly disable migration and take
+ * rcu_read_lock() across the lock held section.
+ */
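A short caller-side sketch of what the substitution above means in practice; the struct and function names are invented for illustration. Code under spin_lock() on PREEMPT_RT may block on contention, but it still runs migrate-disabled and inside an RCU read-side critical section, so per-CPU data and RCU-protected accesses behave as they do on !RT.

	#include <linux/spinlock.h>

	struct example_stats {
		unsigned long updates;
	};

	static DEFINE_SPINLOCK(example_lock);	/* rtmutex-backed on PREEMPT_RT */
	static struct example_stats example_stats;

	static void example_account(void)
	{
		spin_lock(&example_lock);
		/*
		 * On RT this may have blocked in rtlock_slowlock(); the task
		 * state was saved and restored around the block.  From here on
		 * the task cannot migrate and holds rcu_read_lock(), courtesy
		 * of __rt_spin_lock() above.
		 */
		example_stats.updates++;
		spin_unlock(&example_lock);
	}
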
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_SPINLOCKS
+#include "rtmutex.c"
+
+static __always_inline void rtlock_lock(struct rt_mutex *rtm)
+{
+ if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ rtlock_slowlock(rtm);
+}
+
+static __always_inline void __rt_spin_lock(spinlock_t *lock)
+{
+ ___might_sleep(__FILE__, __LINE__, 0);
+ rtlock_lock(&lock->lock);
+ rcu_read_lock();
+ migrate_disable();
+}
+
+void __sched rt_spin_lock(spinlock_t *lock)
+{
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass)
+{
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nested);
+
+void __sched rt_spin_lock_nest_lock(spinlock_t *lock,
+ struct lockdep_map *nest_lock)
+{
+ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nest_lock);
+#endif
+
+void __sched rt_spin_unlock(spinlock_t *lock)
+{
+ spin_release(&lock->dep_map, _RET_IP_);
+ migrate_enable();
+ rcu_read_unlock();
+
+ if (unlikely(!rt_mutex_cmpxchg_release(&lock->lock, current, NULL)))
+ rt_mutex_slowunlock(&lock->lock);
+}
+EXPORT_SYMBOL(rt_spin_unlock);
+
+/*
+ * Wait for the lock to get unlocked: instead of polling for an unlock
+ * (like raw spinlocks do), lock and unlock, to force the kernel to
+ * schedule if there's contention:
+ */
+void __sched rt_spin_lock_unlock(spinlock_t *lock)
+{
+ spin_lock(lock);
+ spin_unlock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_unlock);
+
+static __always_inline int __rt_spin_trylock(spinlock_t *lock)
+{
+ int ret = 1;
+
+ if (unlikely(!rt_mutex_cmpxchg_acquire(&lock->lock, NULL, current)))
+ ret = rt_mutex_slowtrylock(&lock->lock);
+
+ if (ret) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+
+int __sched rt_spin_trylock(spinlock_t *lock)
+{
+ return __rt_spin_trylock(lock);
+}
+EXPORT_SYMBOL(rt_spin_trylock);
+
+int __sched rt_spin_trylock_bh(spinlock_t *lock)
+{
+ int ret;
+
+ local_bh_disable();
+ ret = __rt_spin_trylock(lock);
+ if (!ret)
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_bh);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rt_spin_lock_init(spinlock_t *lock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+}
+EXPORT_SYMBOL(__rt_spin_lock_init);
+#endif
+
+/*
+ * RT-specific reader/writer locks
+ */
+#define rwbase_set_and_save_current_state(state) \
+ current_save_and_set_rtlock_wait_state()
+
+#define rwbase_restore_current_state() \
+ current_restore_rtlock_saved_state()
+
+static __always_inline int rwbase_rtmutex_lock_state(struct rt_mutex *rtm,
+ unsigned int state)
+{
+ if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ rtlock_slowlock(rtm);
+ return 0;
+}
+
+static __always_inline int rwbase_rtmutex_slowlock_locked(struct rt_mutex *rtm,
+ unsigned int state)
+{
+ rtlock_slowlock_locked(rtm);
+ return 0;
+}
+
+static __always_inline void rwbase_rtmutex_unlock(struct rt_mutex *rtm)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(rtm, current, NULL)))
+ return;
+
+ rt_mutex_slowunlock(rtm);
+}
+
+static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex *rtm)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ return 1;
+
+ return rt_mutex_slowtrylock(rtm);
+}
+
+#define rwbase_signal_pending_state(state, current) (0)
+
+#define rwbase_schedule() \
+ schedule_rtlock()
+
+#include "rwbase_rt.c"
+/*
+ * The common functions which get wrapped into the rwlock API.
+ */
+int __sched rt_read_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ ret = rwbase_read_trylock(&rwlock->rwbase);
+ if (ret) {
+ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_read_trylock);
+
+int __sched rt_write_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ ret = rwbase_write_trylock(&rwlock->rwbase);
+ if (ret) {
+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_write_trylock);
+
+void __sched rt_read_lock(rwlock_t *rwlock)
+{
+ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
+ rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_read_lock);
+
+void __sched rt_write_lock(rwlock_t *rwlock)
+{
+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
+ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_write_lock);
+
+void __sched rt_read_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ migrate_enable();
+ rcu_read_unlock();
+ rwbase_read_unlock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+}
+EXPORT_SYMBOL(rt_read_unlock);
+
+void __sched rt_write_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ rcu_read_unlock();
+ migrate_enable();
+ rwbase_write_unlock(&rwlock->rwbase);
+}
+EXPORT_SYMBOL(rt_write_unlock);
+
+int __sched rt_rwlock_is_contended(rwlock_t *rwlock)
+{
+ return rw_base_is_contended(&rwlock->rwbase);
+}
+EXPORT_SYMBOL(rt_rwlock_is_contended);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rt_rwlock_init(rwlock_t *rwlock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
+ lockdep_init_map(&rwlock->dep_map, name, key, 0);
+}
+EXPORT_SYMBOL(__rt_rwlock_init);
+#endif
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 1b019cbca594..c20782f07643 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -142,9 +142,9 @@ int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
unsigned long flags;
int ret;
- spin_lock_irqsave(&nh->lock, flags);
+ raw_spin_lock_irqsave(&nh->lock, flags);
ret = notifier_chain_register(&nh->head, n);
- spin_unlock_irqrestore(&nh->lock, flags);
+ raw_spin_unlock_irqrestore(&nh->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
@@ -164,9 +164,9 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
unsigned long flags;
int ret;
- spin_lock_irqsave(&nh->lock, flags);
+ raw_spin_lock_irqsave(&nh->lock, flags);
ret = notifier_chain_unregister(&nh->head, n);
- spin_unlock_irqrestore(&nh->lock, flags);
+ raw_spin_unlock_irqrestore(&nh->lock, flags);
synchronize_rcu();
return ret;
}
@@ -182,9 +182,9 @@ int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh,
* Musn't use RCU; because then the notifier list can
* change between the up and down traversal.
*/
- spin_lock_irqsave(&nh->lock, flags);
+ raw_spin_lock_irqsave(&nh->lock, flags);
ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
- spin_unlock_irqrestore(&nh->lock, flags);
+ raw_spin_unlock_irqrestore(&nh->lock, flags);
return ret;
}
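The conversion above is needed because, with the substitution earlier in this series, spinlock_t becomes a sleeping lock on PREEMPT_RT, while atomic notifier chains may be invoked from contexts that must not block. A declaration-level sketch of the distinction, with invented lock names:

	#include <linux/spinlock.h>

	/* Always a true spinning lock, usable from hard-IRQ and similar
	 * atomic contexts - which is why the notifier head now uses it. */
	static DEFINE_RAW_SPINLOCK(hw_state_lock);

	/* On PREEMPT_RT this is rtmutex-backed and may sleep, so it is only
	 * suitable for preemptible task context. */
	static DEFINE_SPINLOCK(soft_state_lock);
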
diff --git a/kernel/panic.c b/kernel/panic.c
index 21d59a508272..14017d1b77b0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -177,12 +177,28 @@ static void panic_print_sys_info(void)
void panic(const char *fmt, ...)
{
static char buf[1024];
+ va_list args2;
va_list args;
long i, i_next = 0, len;
int state = 0;
int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
+ console_verbose();
+ pr_emerg("Kernel panic - not syncing:\n");
+ va_start(args2, fmt);
+ va_copy(args, args2);
+ vprintk(fmt, args2);
+ va_end(args2);
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+ /*
+ * Avoid nested stack-dumping if a panic occurs during oops processing
+ */
+ if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
+ dump_stack();
+#endif
+ pr_flush(1000, true);
+
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
@@ -213,24 +229,13 @@ void panic(const char *fmt, ...)
if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
panic_smp_self_stop();
- console_verbose();
bust_spinlocks(1);
- va_start(args, fmt);
len = vscnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
if (len && buf[len - 1] == '\n')
buf[len - 1] = '\0';
- pr_emerg("Kernel panic - not syncing: %s\n", buf);
-#ifdef CONFIG_DEBUG_BUGVERBOSE
- /*
- * Avoid nested stack-dumping if a panic occurs during oops processing
- */
- if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
- dump_stack();
-#endif
-
/*
* If kgdb is enabled, give it a chance to run before we stop all
* the other CPUs or else we won't be able to debug processes left
@@ -247,7 +252,6 @@ void panic(const char *fmt, ...)
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (!_crash_kexec_post_notifiers) {
- printk_safe_flush_on_panic();
__crash_kexec(NULL);
/*
@@ -271,8 +275,6 @@ void panic(const char *fmt, ...)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
- /* Call flush even twice. It tries harder with a single online CPU */
- printk_safe_flush_on_panic();
kmsg_dump(KMSG_DUMP_PANIC);
/*
@@ -542,9 +544,11 @@ static u64 oops_id;
static int init_oops_id(void)
{
+#ifndef CONFIG_PREEMPT_RT
if (!oops_id)
get_random_bytes(&oops_id, sizeof(oops_id));
else
+#endif
oops_id++;
return 0;
@@ -555,6 +559,7 @@ static void print_oops_end_marker(void)
{
init_oops_id();
pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
+ pr_flush(1000, true);
}
/*
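The reordered panic() prologue above has to consume the same variadic arguments twice: once for the early vprintk() and once when formatting buf for the notifiers and kmsg dump. That is why it now starts args2 and va_copy()s it into args. A standalone userspace reminder of that C pattern, illustrative only:

	#include <stdarg.h>
	#include <stdio.h>

	/* A va_list may only be consumed once; a second consumer needs its
	 * own copy, taken before the first v*printf()/va_arg() call. */
	static void report(const char *fmt, ...)
	{
		char buf[128];
		va_list args, args2;

		va_start(args2, fmt);
		va_copy(args, args2);

		vprintf(fmt, args2);			/* first consumer */
		va_end(args2);

		vsnprintf(buf, sizeof(buf), fmt, args);	/* second consumer */
		va_end(args);

		fprintf(stderr, "also logged: %s", buf);
	}

	int main(void)
	{
		report("panic demo: %d\n", 42);
		return 0;
	}
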
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index eee3dc9b60a9..59cb24e25f00 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y = printk.o
-obj-$(CONFIG_PRINTK) += printk_safe.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
obj-$(CONFIG_PRINTK) += printk_ringbuffer.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
deleted file mode 100644
index 51615c909b2f..000000000000
--- a/kernel/printk/internal.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * internal.h - printk internal definitions
- */
-#include <linux/percpu.h>
-
-#ifdef CONFIG_PRINTK
-
-#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff
-#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000
-#define PRINTK_NMI_CONTEXT_MASK 0xff0000000
-
-#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000
-
-__printf(4, 0)
-int vprintk_store(int facility, int level,
- const struct dev_printk_info *dev_info,
- const char *fmt, va_list args);
-
-__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
-__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
-void __printk_safe_enter(void);
-void __printk_safe_exit(void);
-
-void printk_safe_init(void);
-bool printk_percpu_data_ready(void);
-
-#define printk_safe_enter_irqsave(flags) \
- do { \
- local_irq_save(flags); \
- __printk_safe_enter(); \
- } while (0)
-
-#define printk_safe_exit_irqrestore(flags) \
- do { \
- __printk_safe_exit(); \
- local_irq_restore(flags); \
- } while (0)
-
-#define printk_safe_enter_irq() \
- do { \
- local_irq_disable(); \
- __printk_safe_enter(); \
- } while (0)
-
-#define printk_safe_exit_irq() \
- do { \
- __printk_safe_exit(); \
- local_irq_enable(); \
- } while (0)
-
-void defer_console_output(void);
-
-#else
-
-/*
- * In !PRINTK builds we still export console_sem
- * semaphore and some of console functions (console_unlock()/etc.), so
- * printk-safe must preserve the existing local IRQ guarantees.
- */
-#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
-#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
-
-#define printk_safe_enter_irq() local_irq_disable()
-#define printk_safe_exit_irq() local_irq_enable()
-
-static inline void printk_safe_init(void) { }
-static inline bool printk_percpu_data_ready(void) { return false; }
-#endif /* CONFIG_PRINTK */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 421c35571797..209d2392f0d8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -44,6 +44,10 @@
#include <linux/irq_work.h>
#include <linux/ctype.h>
#include <linux/uio.h>
+#include <linux/kdb.h>
+#include <linux/kgdb.h>
+#include <linux/kthread.h>
+#include <linux/clocksource.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
@@ -58,7 +62,6 @@
#include "printk_ringbuffer.h"
#include "console_cmdline.h"
#include "braille.h"
-#include "internal.h"
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
@@ -225,19 +228,7 @@ static int nr_ext_console_drivers;
static int __down_trylock_console_sem(unsigned long ip)
{
- int lock_failed;
- unsigned long flags;
-
- /*
- * Here and in __up_console_sem() we need to be in safe mode,
- * because spindump/WARN/etc from under console ->lock will
- * deadlock in printk()->down_trylock_console_sem() otherwise.
- */
- printk_safe_enter_irqsave(flags);
- lock_failed = down_trylock(&console_sem);
- printk_safe_exit_irqrestore(flags);
-
- if (lock_failed)
+ if (down_trylock(&console_sem))
return 1;
mutex_acquire(&console_lock_dep_map, 0, 1, ip);
return 0;
@@ -246,13 +237,9 @@ static int __down_trylock_console_sem(unsigned long ip)
static void __up_console_sem(unsigned long ip)
{
- unsigned long flags;
-
mutex_release(&console_lock_dep_map, ip);
- printk_safe_enter_irqsave(flags);
up(&console_sem);
- printk_safe_exit_irqrestore(flags);
}
#define up_console_sem() __up_console_sem(_RET_IP_)
@@ -267,11 +254,6 @@ static void __up_console_sem(unsigned long ip)
static int console_locked, console_suspended;
/*
- * If exclusive_console is non-NULL then only this console is to be printed to.
- */
-static struct console *exclusive_console;
-
-/*
* Array of consoles built from command line options (console=)
*/
@@ -355,10 +337,13 @@ enum log_flags {
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
+#ifdef CONFIG_PRINTK
/* syslog_lock protects syslog_* variables and write access to clear_seq. */
-static DEFINE_RAW_SPINLOCK(syslog_lock);
+static DEFINE_SPINLOCK(syslog_lock);
+
+/* Set to enable sync mode. Once set, it is never cleared. */
+static bool sync_mode;
-#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -366,17 +351,6 @@ static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;
-/* All 3 protected by @console_sem. */
-/* the next printk record to write to the console */
-static u64 console_seq;
-static u64 exclusive_console_stop_seq;
-static unsigned long console_dropped;
-
-struct latched_seq {
- seqcount_latch_t latch;
- u64 val[2];
-};
-
/*
* The next printk record to read after the last 'clear' command. There are
* two copies (updated with seqcount_latch) so that reads can locklessly
@@ -394,9 +368,6 @@ static struct latched_seq clear_seq = {
#define PREFIX_MAX 32
#endif
-/* the maximum size of a formatted record (i.e. with prefix added per line) */
-#define CONSOLE_LOG_MAX 1024
-
/* the maximum size allowed to be reserved for a record */
#define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX)
@@ -435,12 +406,12 @@ static struct printk_ringbuffer *prb = &printk_rb_static;
*/
static bool __printk_percpu_data_ready __read_mostly;
-bool printk_percpu_data_ready(void)
+static bool printk_percpu_data_ready(void)
{
return __printk_percpu_data_ready;
}
-/* Must be called under syslog_lock. */
+/* Must be called under associated write-protection lock. */
static void latched_seq_write(struct latched_seq *ls, u64 val)
{
raw_write_seqcount_latch(&ls->latch);
@@ -732,27 +703,22 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
if (ret)
return ret;
- printk_safe_enter_irq();
if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
- printk_safe_exit_irq();
goto out;
}
- printk_safe_exit_irq();
ret = wait_event_interruptible(log_wait,
prb_read_valid(prb, atomic64_read(&user->seq), r));
if (ret)
goto out;
- printk_safe_enter_irq();
}
if (r->info->seq != atomic64_read(&user->seq)) {
/* our last seen message is gone, return error and reset */
atomic64_set(&user->seq, r->info->seq);
ret = -EPIPE;
- printk_safe_exit_irq();
goto out;
}
@@ -762,7 +728,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
&r->info->dev_info);
atomic64_set(&user->seq, r->info->seq + 1);
- printk_safe_exit_irq();
if (len > count) {
ret = -EINVAL;
@@ -797,7 +762,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
if (offset)
return -ESPIPE;
- printk_safe_enter_irq();
switch (whence) {
case SEEK_SET:
/* the first record */
@@ -818,7 +782,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
default:
ret = -EINVAL;
}
- printk_safe_exit_irq();
return ret;
}
@@ -833,7 +796,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
poll_wait(file, &log_wait, wait);
- printk_safe_enter_irq();
if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) {
/* return error when data has vanished underneath us */
if (info.seq != atomic64_read(&user->seq))
@@ -841,7 +803,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
else
ret = EPOLLIN|EPOLLRDNORM;
}
- printk_safe_exit_irq();
return ret;
}
@@ -874,9 +835,7 @@ static int devkmsg_open(struct inode *inode, struct file *file)
prb_rec_init_rd(&user->record, &user->info,
&user->text_buf[0], sizeof(user->text_buf));
- printk_safe_enter_irq();
atomic64_set(&user->seq, prb_first_valid_seq(prb));
- printk_safe_exit_irq();
file->private_data = user;
return 0;
@@ -1042,9 +1001,6 @@ static inline void log_buf_add_cpu(void) {}
static void __init set_percpu_data_ready(void)
{
- printk_safe_init();
- /* Make sure we set this flag only after printk_safe() init is done */
- barrier();
__printk_percpu_data_ready = true;
}
@@ -1084,7 +1040,6 @@ void __init setup_log_buf(int early)
struct printk_record r;
size_t new_descs_size;
size_t new_infos_size;
- unsigned long flags;
char *new_log_buf;
unsigned int free;
u64 seq;
@@ -1142,8 +1097,6 @@ void __init setup_log_buf(int early)
new_descs, ilog2(new_descs_count),
new_infos);
- printk_safe_enter_irqsave(flags);
-
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
@@ -1159,8 +1112,6 @@ void __init setup_log_buf(int early)
*/
prb = &printk_rb_dynamic;
- printk_safe_exit_irqrestore(flags);
-
if (seq != prb_next_seq(&printk_rb_static)) {
pr_err("dropped %llu messages\n",
prb_next_seq(&printk_rb_static) - seq);
@@ -1498,11 +1449,9 @@ static int syslog_print(char __user *buf, int size)
size_t n;
size_t skip;
- printk_safe_enter_irq();
- raw_spin_lock(&syslog_lock);
+ spin_lock_irq(&syslog_lock);
if (!prb_read_valid(prb, syslog_seq, &r)) {
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ spin_unlock_irq(&syslog_lock);
break;
}
if (r.info->seq != syslog_seq) {
@@ -1531,8 +1480,7 @@ static int syslog_print(char __user *buf, int size)
syslog_partial += n;
} else
n = 0;
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ spin_unlock_irq(&syslog_lock);
if (!n)
break;
@@ -1566,7 +1514,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
return -ENOMEM;
time = printk_time;
- printk_safe_enter_irq();
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
@@ -1587,23 +1534,20 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
break;
}
- printk_safe_exit_irq();
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
- printk_safe_enter_irq();
if (len < 0)
break;
}
if (clear) {
- raw_spin_lock(&syslog_lock);
+ spin_lock_irq(&syslog_lock);
latched_seq_write(&clear_seq, seq);
- raw_spin_unlock(&syslog_lock);
+ spin_unlock_irq(&syslog_lock);
}
- printk_safe_exit_irq();
kfree(text);
return len;
@@ -1611,11 +1555,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
static void syslog_clear(void)
{
- printk_safe_enter_irq();
- raw_spin_lock(&syslog_lock);
+ spin_lock_irq(&syslog_lock);
latched_seq_write(&clear_seq, prb_next_seq(prb));
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ spin_unlock_irq(&syslog_lock);
}
/* Return a consistent copy of @syslog_seq. */
@@ -1623,9 +1565,9 @@ static u64 read_syslog_seq_irq(void)
{
u64 seq;
- raw_spin_lock_irq(&syslog_lock);
+ spin_lock_irq(&syslog_lock);
seq = syslog_seq;
- raw_spin_unlock_irq(&syslog_lock);
+ spin_unlock_irq(&syslog_lock);
return seq;
}
@@ -1703,12 +1645,10 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- printk_safe_enter_irq();
- raw_spin_lock(&syslog_lock);
+ spin_lock_irq(&syslog_lock);
if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
/* No unread messages. */
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ spin_unlock_irq(&syslog_lock);
return 0;
}
if (info.seq != syslog_seq) {
@@ -1736,8 +1676,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
}
error -= syslog_partial;
}
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ spin_unlock_irq(&syslog_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -1756,202 +1695,213 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
return do_syslog(type, buf, len, SYSLOG_FROM_READER);
}
-/*
- * Special console_lock variants that help to reduce the risk of soft-lockups.
- * They allow to pass console_lock to another printk() call using a busy wait.
- */
+int printk_delay_msec __read_mostly;
-#ifdef CONFIG_LOCKDEP
-static struct lockdep_map console_owner_dep_map = {
- .name = "console_owner"
-};
-#endif
+static inline void printk_delay(int level)
+{
+ boot_delay_msec(level);
-static DEFINE_RAW_SPINLOCK(console_owner_lock);
-static struct task_struct *console_owner;
-static bool console_waiter;
+ if (unlikely(printk_delay_msec)) {
+ int m = printk_delay_msec;
-/**
- * console_lock_spinning_enable - mark beginning of code where another
- * thread might safely busy wait
- *
- * This basically converts console_lock into a spinlock. This marks
- * the section where the console_lock owner can not sleep, because
- * there may be a waiter spinning (like a spinlock). Also it must be
- * ready to hand over the lock at the end of the section.
- */
-static void console_lock_spinning_enable(void)
+ while (m--) {
+ mdelay(1);
+ touch_nmi_watchdog();
+ }
+ }
+}
+
+static bool kernel_sync_mode(void)
{
- raw_spin_lock(&console_owner_lock);
- console_owner = current;
- raw_spin_unlock(&console_owner_lock);
+ return (oops_in_progress || sync_mode);
+}
- /* The waiter may spin on us after setting console_owner */
- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
+static bool console_may_sync(struct console *con)
+{
+ if (!(con->flags & CON_ENABLED))
+ return false;
+ if (con->write_atomic && kernel_sync_mode())
+ return true;
+ if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread)
+ return true;
+ if (con->write && (con->flags & CON_BOOT) && !con->thread)
+ return true;
+ return false;
}
-/**
- * console_lock_spinning_disable_and_check - mark end of code where another
- * thread was able to busy wait and check if there is a waiter
- *
- * This is called at the end of the section where spinning is allowed.
- * It has two functions. First, it is a signal that it is no longer
- * safe to start busy waiting for the lock. Second, it checks if
- * there is a busy waiter and passes the lock rights to her.
- *
- * Important: Callers lose the lock if there was a busy waiter.
- * They must not touch items synchronized by console_lock
- * in this case.
- *
- * Return: 1 if the lock rights were passed, 0 otherwise.
- */
-static int console_lock_spinning_disable_and_check(void)
+static bool call_sync_console_driver(struct console *con, const char *text, size_t text_len)
{
- int waiter;
+ if (!(con->flags & CON_ENABLED))
+ return false;
+
+ if (con->write_atomic && kernel_sync_mode()) {
+ con->write_atomic(con, text, text_len);
+ return true;
+ }
- raw_spin_lock(&console_owner_lock);
- waiter = READ_ONCE(console_waiter);
- console_owner = NULL;
- raw_spin_unlock(&console_owner_lock);
+ if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) {
+ if (console_trylock()) {
+ con->write_atomic(con, text, text_len);
+ console_unlock();
+ return true;
+ }
- if (!waiter) {
- spin_release(&console_owner_dep_map, _THIS_IP_);
- return 0;
+ } else if (con->write && (con->flags & CON_BOOT) && !con->thread) {
+ if (console_trylock()) {
+ con->write(con, text, text_len);
+ console_unlock();
+ return true;
+ }
}
- /* The waiter is now free to continue */
- WRITE_ONCE(console_waiter, false);
+ return false;
+}
- spin_release(&console_owner_dep_map, _THIS_IP_);
+static bool have_atomic_console(void)
+{
+ struct console *con;
- /*
- * Hand off console_lock to waiter. The waiter will perform
- * the up(). After this, the waiter is the console_lock owner.
- */
- mutex_release(&console_lock_dep_map, _THIS_IP_);
- return 1;
+ for_each_console(con) {
+ if (!(con->flags & CON_ENABLED))
+ continue;
+ if (con->write_atomic)
+ return true;
+ }
+ return false;
}
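
The three helpers above only apply to consoles that implement the write_atomic() callback introduced by this series. Below is a minimal sketch of such a callback for a hypothetical polled UART; foo_uart_putc() and struct foo_port are invented for illustration and are not part of the patch. Note that print_sync_until() already serializes callers via console_atomic_lock(), so the callback itself only has to emit characters without sleeping.

static void foo_console_write_atomic(struct console *con,
				     const char *s, unsigned int count)
{
	struct foo_port *port = con->data;	/* driver-private state */
	unsigned int i;

	/* Busy-poll the hardware; no sleeping locks, no scheduling. */
	for (i = 0; i < count; i++)
		foo_uart_putc(port, s[i]);
}
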
-/**
- * console_trylock_spinning - try to get console_lock by busy waiting
- *
- * This allows to busy wait for the console_lock when the current
- * owner is running in specially marked sections. It means that
- * the current owner is running and cannot reschedule until it
- * is ready to lose the lock.
- *
- * Return: 1 if we got the lock, 0 othrewise
- */
-static int console_trylock_spinning(void)
+static bool print_sync(struct console *con, u64 *seq)
{
- struct task_struct *owner = NULL;
- bool waiter;
- bool spin = false;
- unsigned long flags;
+ struct printk_info info;
+ struct printk_record r;
+ size_t text_len;
- if (console_trylock())
- return 1;
+ prb_rec_init_rd(&r, &info, &con->sync_buf[0], sizeof(con->sync_buf));
- printk_safe_enter_irqsave(flags);
+ if (!prb_read_valid(prb, *seq, &r))
+ return false;
- raw_spin_lock(&console_owner_lock);
- owner = READ_ONCE(console_owner);
- waiter = READ_ONCE(console_waiter);
- if (!waiter && owner && owner != current) {
- WRITE_ONCE(console_waiter, true);
- spin = true;
- }
- raw_spin_unlock(&console_owner_lock);
+ text_len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
- /*
- * If there is an active printk() writing to the
- * consoles, instead of having it write our data too,
- * see if we can offload that load from the active
- * printer, and do some printing ourselves.
- * Go into a spin only if there isn't already a waiter
- * spinning, and there is an active printer, and
- * that active printer isn't us (recursive printk?).
- */
- if (!spin) {
- printk_safe_exit_irqrestore(flags);
- return 0;
- }
+ if (!call_sync_console_driver(con, &con->sync_buf[0], text_len))
+ return false;
- /* We spin waiting for the owner to release us */
- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
- /* Owner will clear console_waiter on hand off */
- while (READ_ONCE(console_waiter))
- cpu_relax();
- spin_release(&console_owner_dep_map, _THIS_IP_);
+ *seq = r.info->seq;
- printk_safe_exit_irqrestore(flags);
- /*
- * The owner passed the console lock to us.
- * Since we did not spin on console lock, annotate
- * this as a trylock. Otherwise lockdep will
- * complain.
- */
- mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+ touch_nmi_watchdog();
- return 1;
+ if (text_len)
+ printk_delay(r.info->level);
+
+ return true;
}
-/*
- * Call the console drivers, asking them to write out
- * log_buf[start] to log_buf[end - 1].
- * The console_lock must be held.
- */
-static void call_console_drivers(const char *ext_text, size_t ext_len,
- const char *text, size_t len)
+static u64 read_console_seq(struct console *con)
{
- static char dropped_text[64];
- size_t dropped_len = 0;
- struct console *con;
+ u64 seq2;
+ u64 seq;
- trace_console_rcuidle(text, len);
+ seq = latched_seq_read_nolock(&con->printk_seq);
+ seq2 = latched_seq_read_nolock(&con->printk_sync_seq[0]);
+ if (seq2 > seq)
+ seq = seq2;
+#if PRINTK_CTX_NUM > 1
+ seq2 = latched_seq_read_nolock(&con->printk_sync_seq[1]);
+ if (seq2 > seq)
+ seq = seq2;
+#endif
+ return seq;
+}
- if (!console_drivers)
- return;
+static void print_sync_until(struct console *con, u64 seq, bool is_locked)
+{
+ unsigned int flags;
+ u64 printk_seq;
- if (console_dropped) {
- dropped_len = snprintf(dropped_text, sizeof(dropped_text),
- "** %lu printk messages dropped **\n",
- console_dropped);
- console_dropped = 0;
- }
+ console_atomic_lock(&flags);
+ for (;;) {
+ printk_seq = read_console_seq(con);
+ if (printk_seq >= seq)
+ break;
+ if (!print_sync(con, &printk_seq))
+ break;
- for_each_console(con) {
- if (exclusive_console && con != exclusive_console)
- continue;
- if (!(con->flags & CON_ENABLED))
- continue;
- if (!con->write)
- continue;
- if (!cpu_online(smp_processor_id()) &&
- !(con->flags & CON_ANYTIME))
- continue;
- if (con->flags & CON_EXTENDED)
- con->write(con, ext_text, ext_len);
- else {
- if (dropped_len)
- con->write(con, dropped_text, dropped_len);
- con->write(con, text, len);
+ if (is_locked) {
+ latched_seq_write(&con->printk_seq, printk_seq + 1);
+ } else {
+ int ctx = 0;
+
+#ifdef CONFIG_PRINTK_NMI
+ if (in_nmi())
+ ctx = 1;
+#endif
+ latched_seq_write(&con->printk_sync_seq[ctx], printk_seq + 1);
}
}
+ console_atomic_unlock(flags);
}
-int printk_delay_msec __read_mostly;
+/*
+ * Recursion is tracked separately on each CPU. If NMIs are supported, an
+ * additional NMI context per CPU is also separately tracked. Until per-CPU
+ * data is available, a separate "early tracking" is performed.
+ */
+static DEFINE_PER_CPU(char [PRINTK_CTX_NUM], printk_count);
+static char printk_count_early[PRINTK_CTX_NUM];
-static inline void printk_delay(void)
+/*
+ * Recursion is limited to keep the output sane. printk() should not require
+ * more than 1 level of recursion (allowing, for example, printk() to trigger
+ * a WARN), but a higher value is used in case some printk-internal errors
+ * exist, such as the ringbuffer validation checks failing.
+ */
+#define PRINTK_MAX_RECURSION 3
+
+/* Return a pointer to the dedicated counter for the CPU+context of the caller. */
+static char *printk_recursion_counter(void)
{
- if (unlikely(printk_delay_msec)) {
- int m = printk_delay_msec;
+ int ctx = 0;
- while (m--) {
- mdelay(1);
- touch_nmi_watchdog();
- }
+#ifdef CONFIG_PRINTK_NMI
+ if (in_nmi())
+ ctx = 1;
+#endif
+ if (!printk_percpu_data_ready())
+ return &printk_count_early[ctx];
+ return &((*this_cpu_ptr(&printk_count))[ctx]);
+}
+
+/*
+ * Enter recursion tracking. Interrupts are disabled to simplify tracking.
+ * The caller must check the return value to see if the recursion is allowed.
+ * On failure, interrupts are not disabled.
+ */
+static bool printk_enter_irqsave(unsigned long *flags)
+{
+ char *count;
+
+ local_irq_save(*flags);
+ count = printk_recursion_counter();
+ if (*count > PRINTK_MAX_RECURSION) {
+ local_irq_restore(*flags);
+ return false;
}
+ (*count)++;
+
+ return true;
+}
+
+/* Exit recursion tracking, restoring interrupts. */
+static void printk_exit_irqrestore(unsigned long flags)
+{
+ char *count;
+
+ count = printk_recursion_counter();
+ (*count)--;
+ local_irq_restore(flags);
}
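
A usage sketch of the pair above, mirroring how vprintk_store() uses it further down; my_store_message() is a placeholder for whatever work must not recurse more than PRINTK_MAX_RECURSION times:

static int my_printk_path(const char *fmt, va_list args)
{
	unsigned long irqflags;
	int ret;

	/* Refuse to run (and drop the message) once the limit is hit. */
	if (!printk_enter_irqsave(&irqflags))
		return 0;

	ret = my_store_message(fmt, args);

	printk_exit_irqrestore(irqflags);
	return ret;
}
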
static inline u32 printk_caller_id(void)
@@ -2032,20 +1982,24 @@ static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lfl
}
__printf(4, 0)
-int vprintk_store(int facility, int level,
- const struct dev_printk_info *dev_info,
- const char *fmt, va_list args)
+static int vprintk_store(int facility, int level,
+ const struct dev_printk_info *dev_info,
+ const char *fmt, va_list args)
{
const u32 caller_id = printk_caller_id();
struct prb_reserved_entry e;
enum log_flags lflags = 0;
+ bool final_commit = false;
struct printk_record r;
+ unsigned long irqflags;
u16 trunc_msg_len = 0;
char prefix_buf[8];
u16 reserve_size;
va_list args2;
u16 text_len;
+ int ret = 0;
u64 ts_nsec;
+ u64 seq;
/*
* Since the duration of printk() can vary depending on the message
@@ -2055,6 +2009,9 @@ int vprintk_store(int facility, int level,
*/
ts_nsec = local_clock();
+ if (!printk_enter_irqsave(&irqflags))
+ return 0;
+
/*
* The sprintf needs to come first since the syslog prefix might be
* passed in as a parameter. An extra byte must be reserved so that
@@ -2081,6 +2038,7 @@ int vprintk_store(int facility, int level,
if (lflags & LOG_CONT) {
prb_rec_init_wr(&r, reserve_size);
if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) {
+ seq = r.info->seq;
text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
facility, &lflags, fmt, args);
r.info->text_len += text_len;
@@ -2088,11 +2046,13 @@ int vprintk_store(int facility, int level,
if (lflags & LOG_NEWLINE) {
r.info->flags |= LOG_NEWLINE;
prb_final_commit(&e);
+ final_commit = true;
} else {
prb_commit(&e);
}
- return text_len;
+ ret = text_len;
+ goto out;
}
}
@@ -2108,9 +2068,11 @@ int vprintk_store(int facility, int level,
prb_rec_init_wr(&r, reserve_size + trunc_msg_len);
if (!prb_reserve(&e, prb, &r))
- return 0;
+ goto out;
}
+ seq = r.info->seq;
+
/* fill message */
text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &lflags, fmt, args);
if (trunc_msg_len)
@@ -2125,12 +2087,27 @@ int vprintk_store(int facility, int level,
memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
/* A message without a trailing newline can be continued. */
- if (!(lflags & LOG_NEWLINE))
+ if (!(lflags & LOG_NEWLINE)) {
prb_commit(&e);
- else
+ } else {
prb_final_commit(&e);
+ final_commit = true;
+ }
+
+ ret = text_len + trunc_msg_len;
+out:
+ /* only the kernel may perform synchronous printing */
+ if (facility == 0 && final_commit) {
+ struct console *con;
+
+ for_each_console(con) {
+ if (console_may_sync(con))
+ print_sync_until(con, seq + 1, false);
+ }
+ }
- return (text_len + trunc_msg_len);
+ printk_exit_irqrestore(irqflags);
+ return ret;
}
asmlinkage int vprintk_emit(int facility, int level,
@@ -2138,53 +2115,43 @@ asmlinkage int vprintk_emit(int facility, int level,
const char *fmt, va_list args)
{
int printed_len;
- bool in_sched = false;
- unsigned long flags;
/* Suppress unimportant messages after panic happens */
if (unlikely(suppress_printk))
return 0;
- if (level == LOGLEVEL_SCHED) {
+ if (level == LOGLEVEL_SCHED)
level = LOGLEVEL_DEFAULT;
- in_sched = true;
- }
-
- boot_delay_msec(level);
- printk_delay();
- printk_safe_enter_irqsave(flags);
printed_len = vprintk_store(facility, level, dev_info, fmt, args);
- printk_safe_exit_irqrestore(flags);
-
- /* If called from the scheduler, we can not call up(). */
- if (!in_sched) {
- /*
- * Disable preemption to avoid being preempted while holding
- * console_sem which would prevent anyone from printing to
- * console
- */
- preempt_disable();
- /*
- * Try to acquire and then immediately release the console
- * semaphore. The release will print out buffers and wake up
- * /dev/kmsg and syslog() users.
- */
- if (console_trylock_spinning())
- console_unlock();
- preempt_enable();
- }
wake_up_klogd();
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
-int vprintk_default(const char *fmt, va_list args)
+__printf(1, 0)
+static int vprintk_default(const char *fmt, va_list args)
{
return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
}
-EXPORT_SYMBOL_GPL(vprintk_default);
+
+__printf(1, 0)
+static int vprintk_func(const char *fmt, va_list args)
+{
+#ifdef CONFIG_KGDB_KDB
+	/* Allow printk() to be passed to kdb, but avoid recursion. */
+ if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
+ return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
+#endif
+ return vprintk_default(fmt, args);
+}
+
+asmlinkage int vprintk(const char *fmt, va_list args)
+{
+ return vprintk_func(fmt, args);
+}
+EXPORT_SYMBOL(vprintk);
/**
* printk - print a kernel message
@@ -2220,37 +2187,164 @@ asmlinkage __visible int printk(const char *fmt, ...)
}
EXPORT_SYMBOL(printk);
-#else /* CONFIG_PRINTK */
+static int printk_kthread_func(void *data)
+{
+ struct console *con = data;
+ unsigned long dropped = 0;
+ char *dropped_text = NULL;
+ struct printk_info info;
+ struct printk_record r;
+ char *ext_text = NULL;
+ size_t dropped_len;
+ int ret = -ENOMEM;
+ char *text = NULL;
+ char *write_text;
+ size_t len;
+ int error;
+ u64 seq;
-#define CONSOLE_LOG_MAX 0
-#define printk_time false
+ if (con->flags & CON_EXTENDED) {
+ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
+ if (!ext_text)
+ goto out;
+ }
+ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ dropped_text = kmalloc(64, GFP_KERNEL);
+ if (!text || !dropped_text)
+ goto out;
-#define prb_read_valid(rb, seq, r) false
-#define prb_first_valid_seq(rb) 0
+ if (con->flags & CON_EXTENDED)
+ write_text = ext_text;
+ else
+ write_text = text;
-static u64 syslog_seq;
-static u64 console_seq;
-static u64 exclusive_console_stop_seq;
-static unsigned long console_dropped;
+ seq = read_console_seq(con);
+
+ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
+
+ for (;;) {
+ error = wait_event_interruptible(log_wait,
+ prb_read_valid(prb, seq, &r) || kthread_should_stop());
+
+ if (kthread_should_stop())
+ break;
+
+ if (error)
+ continue;
+
+ if (seq != r.info->seq) {
+ dropped += r.info->seq - seq;
+ seq = r.info->seq;
+ }
+
+ seq++;
+
+ if (!(con->flags & CON_ENABLED))
+ continue;
+
+ if (suppress_message_printing(r.info->level))
+ continue;
+
+ if (con->flags & CON_EXTENDED) {
+ len = info_print_ext_header(ext_text,
+ CONSOLE_EXT_LOG_MAX,
+ r.info);
+ len += msg_print_ext_body(ext_text + len,
+ CONSOLE_EXT_LOG_MAX - len,
+ &r.text_buf[0], r.info->text_len,
+ &r.info->dev_info);
+ } else {
+ len = record_print_text(&r,
+ console_msg_format & MSG_FORMAT_SYSLOG,
+ printk_time);
+ }
+
+ console_lock();
+
+ /*
+ * Even though the printk kthread is always preemptible, it is
+ * still not allowed to call cond_resched() from within
+ * console drivers. The task may become non-preemptible in the
+ * console driver call chain. For example, vt_console_print()
+ * takes a spinlock and then can call into fbcon_redraw(),
+ * which can conditionally invoke cond_resched().
+ */
+ console_may_schedule = 0;
+
+ if (kernel_sync_mode() && con->write_atomic) {
+ console_unlock();
+ break;
+ }
+
+ if (!(con->flags & CON_EXTENDED) && dropped) {
+ dropped_len = snprintf(dropped_text, 64,
+ "** %lu printk messages dropped **\n",
+ dropped);
+ dropped = 0;
+
+ con->write(con, dropped_text, dropped_len);
+ printk_delay(r.info->level);
+ }
+
+ con->write(con, write_text, len);
+ if (len)
+ printk_delay(r.info->level);
+
+ latched_seq_write(&con->printk_seq, seq);
+
+ console_unlock();
+ }
+out:
+ kfree(dropped_text);
+ kfree(text);
+ kfree(ext_text);
+ pr_info("%sconsole [%s%d]: printing thread stopped\n",
+ (con->flags & CON_BOOT) ? "boot" : "",
+ con->name, con->index);
+ return ret;
+}
-static size_t record_print_text(const struct printk_record *r,
- bool syslog, bool time)
+/* Must be called within console_lock(). */
+static void start_printk_kthread(struct console *con)
{
- return 0;
+ con->thread = kthread_run(printk_kthread_func, con,
+ "pr/%s%d", con->name, con->index);
+ if (IS_ERR(con->thread)) {
+ pr_err("%sconsole [%s%d]: unable to start printing thread\n",
+ (con->flags & CON_BOOT) ? "boot" : "",
+ con->name, con->index);
+ return;
+ }
+ pr_info("%sconsole [%s%d]: printing thread started\n",
+ (con->flags & CON_BOOT) ? "boot" : "",
+ con->name, con->index);
}
-static ssize_t info_print_ext_header(char *buf, size_t size,
- struct printk_info *info)
+
+/* protected by console_lock */
+static bool kthreads_started;
+
+/* Must be called within console_lock(). */
+static void console_try_thread(struct console *con)
{
- return 0;
+ if (kthreads_started) {
+ start_printk_kthread(con);
+ return;
+ }
+
+ /*
+ * The printing threads have not been started yet. If this console
+ * can print synchronously, print all unprinted messages.
+ */
+ if (console_may_sync(con))
+ print_sync_until(con, prb_next_seq(prb), true);
}
-static ssize_t msg_print_ext_body(char *buf, size_t size,
- char *text, size_t text_len,
- struct dev_printk_info *dev_info) { return 0; }
-static void console_lock_spinning_enable(void) { }
-static int console_lock_spinning_disable_and_check(void) { return 0; }
-static void call_console_drivers(const char *ext_text, size_t ext_len,
- const char *text, size_t len) {}
-static bool suppress_message_printing(int level) { return false; }
+
+#else /* CONFIG_PRINTK */
+
+#define prb_first_valid_seq(rb) 0
+#define prb_next_seq(rb) 0
+
+#define console_try_thread(con)
#endif /* CONFIG_PRINTK */
@@ -2495,34 +2589,6 @@ int is_console_locked(void)
}
EXPORT_SYMBOL(is_console_locked);
-/*
- * Check if we have any console that is capable of printing while cpu is
- * booting or shutting down. Requires console_sem.
- */
-static int have_callable_console(void)
-{
- struct console *con;
-
- for_each_console(con)
- if ((con->flags & CON_ENABLED) &&
- (con->flags & CON_ANYTIME))
- return 1;
-
- return 0;
-}
-
-/*
- * Can we actually use the console at this time on this cpu?
- *
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
- */
-static inline int can_use_console(void)
-{
- return cpu_online(raw_smp_processor_id()) || have_callable_console();
-}
-
/**
* console_unlock - unlock the console system
*
@@ -2539,136 +2605,14 @@ static inline int can_use_console(void)
*/
void console_unlock(void)
{
- static char ext_text[CONSOLE_EXT_LOG_MAX];
- static char text[CONSOLE_LOG_MAX];
- unsigned long flags;
- bool do_cond_resched, retry;
- struct printk_info info;
- struct printk_record r;
-
if (console_suspended) {
up_console_sem();
return;
}
- prb_rec_init_rd(&r, &info, text, sizeof(text));
-
- /*
- * Console drivers are called with interrupts disabled, so
- * @console_may_schedule should be cleared before; however, we may
- * end up dumping a lot of lines, for example, if called from
- * console registration path, and should invoke cond_resched()
- * between lines if allowable. Not doing so can cause a very long
- * scheduling stall on a slow console leading to RCU stall and
- * softlockup warnings which exacerbate the issue with more
- * messages practically incapacitating the system.
- *
- * console_trylock() is not able to detect the preemptive
- * context reliably. Therefore the value must be stored before
- * and cleared after the "again" goto label.
- */
- do_cond_resched = console_may_schedule;
-again:
- console_may_schedule = 0;
-
- /*
- * We released the console_sem lock, so we need to recheck if
- * cpu is online and (if not) is there at least one CON_ANYTIME
- * console.
- */
- if (!can_use_console()) {
- console_locked = 0;
- up_console_sem();
- return;
- }
-
- for (;;) {
- size_t ext_len = 0;
- size_t len;
-
- printk_safe_enter_irqsave(flags);
-skip:
- if (!prb_read_valid(prb, console_seq, &r))
- break;
-
- if (console_seq != r.info->seq) {
- console_dropped += r.info->seq - console_seq;
- console_seq = r.info->seq;
- }
-
- if (suppress_message_printing(r.info->level)) {
- /*
- * Skip record we have buffered and already printed
- * directly to the console when we received it, and
- * record that has level above the console loglevel.
- */
- console_seq++;
- goto skip;
- }
-
- /* Output to all consoles once old messages replayed. */
- if (unlikely(exclusive_console &&
- console_seq >= exclusive_console_stop_seq)) {
- exclusive_console = NULL;
- }
-
- /*
- * Handle extended console text first because later
- * record_print_text() will modify the record buffer in-place.
- */
- if (nr_ext_console_drivers) {
- ext_len = info_print_ext_header(ext_text,
- sizeof(ext_text),
- r.info);
- ext_len += msg_print_ext_body(ext_text + ext_len,
- sizeof(ext_text) - ext_len,
- &r.text_buf[0],
- r.info->text_len,
- &r.info->dev_info);
- }
- len = record_print_text(&r,
- console_msg_format & MSG_FORMAT_SYSLOG,
- printk_time);
- console_seq++;
-
- /*
- * While actively printing out messages, if another printk()
- * were to occur on another CPU, it may wait for this one to
- * finish. This task can not be preempted if there is a
- * waiter waiting to take over.
- */
- console_lock_spinning_enable();
-
- stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(ext_text, ext_len, text, len);
- start_critical_timings();
-
- if (console_lock_spinning_disable_and_check()) {
- printk_safe_exit_irqrestore(flags);
- return;
- }
-
- printk_safe_exit_irqrestore(flags);
-
- if (do_cond_resched)
- cond_resched();
- }
-
console_locked = 0;
up_console_sem();
-
- /*
- * Someone could have filled up the buffer again, so re-check if there's
- * something to flush. In case we cannot trylock the console_sem again,
- * there's a new owner and the console_unlock() from them will do the
- * flush, no worries.
- */
- retry = prb_read_valid(prb, console_seq, NULL);
- printk_safe_exit_irqrestore(flags);
-
- if (retry && console_trylock())
- goto again;
}
EXPORT_SYMBOL(console_unlock);
@@ -2718,23 +2662,18 @@ void console_unblank(void)
*/
void console_flush_on_panic(enum con_flush_mode mode)
{
- /*
- * If someone else is holding the console lock, trylock will fail
- * and may_schedule may be set. Ignore and proceed to unlock so
- * that messages are flushed out. As this can be called from any
- * context and we don't want to get preempted while flushing,
- * ensure may_schedule is cleared.
- */
- console_trylock();
- console_may_schedule = 0;
+ struct console *c;
+ u64 seq;
- if (mode == CONSOLE_REPLAY_ALL) {
- unsigned long flags;
+ if (!console_trylock())
+ return;
- printk_safe_enter_irqsave(flags);
- console_seq = prb_first_valid_seq(prb);
- printk_safe_exit_irqrestore(flags);
+ if (mode == CONSOLE_REPLAY_ALL) {
+ seq = prb_first_valid_seq(prb);
+ for_each_console(c)
+ latched_seq_write(&c->printk_seq, seq);
}
+
console_unlock();
}
@@ -2869,8 +2808,8 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
*/
void register_console(struct console *newcon)
{
- unsigned long flags;
struct console *bcon = NULL;
+ u64 seq = 0;
int err;
for_each_console(bcon) {
@@ -2893,6 +2832,8 @@ void register_console(struct console *newcon)
}
}
+ newcon->thread = NULL;
+
if (console_drivers && console_drivers->flags & CON_BOOT)
bcon = console_drivers;
@@ -2934,8 +2875,10 @@ void register_console(struct console *newcon)
* the real console are the same physical device, it's annoying to
* see the beginning boot messages twice
*/
- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
+ if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
newcon->flags &= ~CON_PRINTBUFFER;
+ newcon->flags |= CON_HANDOVER;
+ }
/*
* Put this console in the list - keep the
@@ -2957,27 +2900,19 @@ void register_console(struct console *newcon)
if (newcon->flags & CON_EXTENDED)
nr_ext_console_drivers++;
- if (newcon->flags & CON_PRINTBUFFER) {
- /*
- * console_unlock(); will print out the buffered messages
- * for us.
- *
- * We're about to replay the log buffer. Only do this to the
- * just-registered console to avoid excessive message spam to
- * the already-registered consoles.
- *
- * Set exclusive_console with disabled interrupts to reduce
- * race window with eventual console_flush_on_panic() that
- * ignores console_lock.
- */
- exclusive_console = newcon;
- exclusive_console_stop_seq = console_seq;
+ if (!(newcon->flags & CON_PRINTBUFFER))
+ seq = prb_next_seq(prb);
- /* Get a consistent copy of @syslog_seq. */
- raw_spin_lock_irqsave(&syslog_lock, flags);
- console_seq = syslog_seq;
- raw_spin_unlock_irqrestore(&syslog_lock, flags);
- }
+ seqcount_latch_init(&newcon->printk_seq.latch);
+ latched_seq_write(&newcon->printk_seq, seq);
+ seqcount_latch_init(&newcon->printk_sync_seq[0].latch);
+ latched_seq_write(&newcon->printk_sync_seq[0], seq);
+#if PRINTK_CTX_NUM > 1
+ seqcount_latch_init(&newcon->printk_sync_seq[1].latch);
+ latched_seq_write(&newcon->printk_sync_seq[1], seq);
+#endif
+
+ console_try_thread(newcon);
console_unlock();
console_sysfs_notify();
@@ -3051,6 +2986,9 @@ int unregister_console(struct console *console)
console_unlock();
console_sysfs_notify();
+ if (console->thread && !IS_ERR(console->thread))
+ kthread_stop(console->thread);
+
if (console->exit)
res = console->exit(console);
@@ -3133,6 +3071,15 @@ static int __init printk_late_init(void)
unregister_console(con);
}
}
+
+#ifdef CONFIG_PRINTK
+ console_lock();
+ for_each_console(con)
+ start_printk_kthread(con);
+ kthreads_started = true;
+ console_unlock();
+#endif
+
ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
console_cpu_notify);
WARN_ON(ret < 0);
@@ -3148,7 +3095,6 @@ late_initcall(printk_late_init);
* Delayed printk version, for scheduler-internal messages:
*/
#define PRINTK_PENDING_WAKEUP 0x01
-#define PRINTK_PENDING_OUTPUT 0x02
static DEFINE_PER_CPU(int, printk_pending);
@@ -3156,14 +3102,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
int pending = __this_cpu_xchg(printk_pending, 0);
- if (pending & PRINTK_PENDING_OUTPUT) {
- /* If trylock fails, someone else is doing the printing */
- if (console_trylock())
- console_unlock();
- }
-
if (pending & PRINTK_PENDING_WAKEUP)
- wake_up_interruptible(&log_wait);
+ wake_up_interruptible_all(&log_wait);
}
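
With PRINTK_PENDING_OUTPUT gone, the irq_work is only used to wake sleeping log readers and printing threads. For reference, a generic sketch of the defer-to-irq_work pattern this code relies on is shown below; the names are hypothetical and each per-CPU irq_work would still need init_irq_work() during setup.

static DEFINE_PER_CPU(int, my_pending);
static DEFINE_PER_CPU(struct irq_work, my_work);

static void my_work_func(struct irq_work *work)
{
	/* Runs in a safe (IRQ) context, acting on the accumulated bits. */
	int pending = __this_cpu_xchg(my_pending, 0);

	if (pending & PRINTK_PENDING_WAKEUP)
		wake_up_interruptible_all(&log_wait);
}

static void my_kick(void)
{
	/* Callers may be in atomic context; just mark and queue. */
	preempt_disable();
	__this_cpu_or(my_pending, PRINTK_PENDING_WAKEUP);
	irq_work_queue(this_cpu_ptr(&my_work));
	preempt_enable();
}
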
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
@@ -3182,25 +3122,10 @@ void wake_up_klogd(void)
preempt_enable();
}
-void defer_console_output(void)
+__printf(1, 0)
+static int vprintk_deferred(const char *fmt, va_list args)
{
- if (!printk_percpu_data_ready())
- return;
-
- preempt_disable();
- __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
- preempt_enable();
-}
-
-int vprintk_deferred(const char *fmt, va_list args)
-{
- int r;
-
- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
- defer_console_output();
-
- return r;
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
}
int printk_deferred(const char *fmt, ...)
@@ -3341,6 +3266,24 @@ void kmsg_dump(enum kmsg_dump_reason reason)
{
struct kmsg_dumper *dumper;
+ if (!oops_in_progress) {
+ /*
+ * If atomic consoles are available, activate kernel sync mode
+ * to make sure any final messages are visible. The trailing
+ * printk message is important to flush any pending messages.
+ */
+ if (have_atomic_console()) {
+ sync_mode = true;
+ pr_info("enabled sync mode\n");
+ }
+
+ /*
+ * Give the printing threads time to flush, allowing up to
+ * 1s of no printing forward progress before giving up.
+ */
+ pr_flush(1000, true);
+ }
+
rcu_read_lock();
list_for_each_entry_rcu(dumper, &dump_list, list) {
enum kmsg_dump_reason max_reason = dumper->max_reason;
@@ -3386,14 +3329,12 @@ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
struct printk_info info;
unsigned int line_count;
struct printk_record r;
- unsigned long flags;
size_t l = 0;
bool ret = false;
if (iter->cur_seq < min_seq)
iter->cur_seq = min_seq;
- printk_safe_enter_irqsave(flags);
prb_rec_init_rd(&r, &info, line, size);
/* Read text or count text lines? */
@@ -3414,7 +3355,6 @@ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
iter->cur_seq = r.info->seq + 1;
ret = true;
out:
- printk_safe_exit_irqrestore(flags);
if (len)
*len = l;
return ret;
@@ -3446,7 +3386,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
u64 min_seq = latched_seq_read_nolock(&clear_seq);
struct printk_info info;
struct printk_record r;
- unsigned long flags;
u64 seq;
u64 next_seq;
size_t len = 0;
@@ -3459,7 +3398,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
if (iter->cur_seq < min_seq)
iter->cur_seq = min_seq;
- printk_safe_enter_irqsave(flags);
if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
if (info.seq != iter->cur_seq) {
/* messages are gone, move to first available one */
@@ -3468,10 +3406,8 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
}
/* last entry */
- if (iter->cur_seq >= iter->next_seq) {
- printk_safe_exit_irqrestore(flags);
+ if (iter->cur_seq >= iter->next_seq)
goto out;
- }
/*
* Find first record that fits, including all following records,
@@ -3503,7 +3439,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
iter->next_seq = next_seq;
ret = true;
- printk_safe_exit_irqrestore(flags);
out:
if (len_out)
*len_out = len;
@@ -3521,13 +3456,204 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
*/
void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
{
- unsigned long flags;
-
- printk_safe_enter_irqsave(flags);
iter->cur_seq = latched_seq_read_nolock(&clear_seq);
iter->next_seq = prb_next_seq(prb);
- printk_safe_exit_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
#endif
+
+struct prb_cpulock {
+ atomic_t owner;
+ unsigned long __percpu *irqflags;
+};
+
+#define DECLARE_STATIC_PRINTKRB_CPULOCK(name) \
+static DEFINE_PER_CPU(unsigned long, _##name##_percpu_irqflags); \
+static struct prb_cpulock name = { \
+ .owner = ATOMIC_INIT(-1), \
+ .irqflags = &_##name##_percpu_irqflags, \
+}
+
+static unsigned int kgdb_cpu = -1;
+
+static bool __prb_trylock(struct prb_cpulock *cpu_lock,
+ unsigned int *cpu_store)
+{
+ unsigned long *flags;
+ unsigned int cpu;
+
+ cpu = get_cpu();
+
+ *cpu_store = atomic_read(&cpu_lock->owner);
+ /* memory barrier to ensure the current lock owner is visible */
+ smp_rmb();
+ if (*cpu_store == -1) {
+ flags = per_cpu_ptr(cpu_lock->irqflags, cpu);
+ local_irq_save(*flags);
+ if (atomic_try_cmpxchg_acquire(&cpu_lock->owner,
+ cpu_store, cpu)) {
+ return true;
+ }
+ local_irq_restore(*flags);
+ } else if (*cpu_store == cpu) {
+ return true;
+ }
+
+ put_cpu();
+ return false;
+}
+
+/*
+ * prb_lock: Perform a processor-reentrant spin lock.
+ * @cpu_lock: A pointer to the lock object.
+ * @cpu_store: A "flags" pointer to store lock status information.
+ *
+ * If no processor has the lock, the calling processor takes the lock and
+ * becomes the owner. If the calling processor is already the owner of the
+ * lock, this function succeeds immediately. If the lock is held by another
+ * processor, this function spins until the calling processor becomes the
+ * owner.
+ *
+ * It is safe to call this function from any context and state.
+ */
+static void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store)
+{
+ for (;;) {
+ if (__prb_trylock(cpu_lock, cpu_store))
+ break;
+ cpu_relax();
+ }
+}
+
+/*
+ * prb_unlock: Perform a processor-reentrant spin unlock.
+ * @cpu_lock: A pointer to the lock object.
+ * @cpu_store: A "flags" object storing lock status information.
+ *
+ * Release the lock. The calling processor must be the owner of the lock.
+ *
+ * It is safe to call this function from any context and state.
+ */
+static void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store)
+{
+ bool trigger_kgdb = false;
+ unsigned long *flags;
+ unsigned int cpu;
+
+ cpu = atomic_read(&cpu_lock->owner);
+ if (cpu == kgdb_cpu && cpu_store == -1) {
+ trigger_kgdb = true;
+ kgdb_cpu = -1;
+ }
+ atomic_set_release(&cpu_lock->owner, cpu_store);
+
+ if (cpu_store == -1) {
+ flags = per_cpu_ptr(cpu_lock->irqflags, cpu);
+ local_irq_restore(*flags);
+ }
+
+ put_cpu();
+
+ if (trigger_kgdb) {
+ pr_warn("re-triggering kgdb roundup for CPU#%d\n", cpu);
+ kgdb_roundup_cpu(cpu);
+ }
+}
+
+DECLARE_STATIC_PRINTKRB_CPULOCK(printk_cpulock);
+
+void console_atomic_lock(unsigned int *flags)
+{
+ prb_lock(&printk_cpulock, flags);
+}
+EXPORT_SYMBOL(console_atomic_lock);
+
+void console_atomic_unlock(unsigned int flags)
+{
+ prb_unlock(&printk_cpulock, flags);
+}
+EXPORT_SYMBOL(console_atomic_unlock);
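
Because prb_cpulock is processor-reentrant, console_atomic_lock() may be taken again on the CPU that already owns it; the saved owner value acts as the nesting flag and only the outermost unlock restores interrupts. A small sketch of that nesting (emit_chars() is a placeholder):

static void emit_two_lines(void)
{
	unsigned int outer, inner;

	console_atomic_lock(&outer);	/* disables IRQs, takes ownership */
	emit_chars("first line\n");

	console_atomic_lock(&inner);	/* same CPU: succeeds immediately */
	emit_chars("second line\n");
	console_atomic_unlock(inner);	/* still owned, IRQs still off */

	console_atomic_unlock(outer);	/* releases ownership, restores IRQs */
}
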
+
+bool console_atomic_kgdb_cpu_delay(unsigned int cpu)
+{
+ if (cpu != atomic_read(&printk_cpulock.owner))
+ return false;
+
+ kgdb_cpu = cpu;
+ return true;
+}
+EXPORT_SYMBOL(console_atomic_kgdb_cpu_delay);
+
+static void pr_msleep(bool may_sleep, int ms)
+{
+ if (may_sleep) {
+ msleep(ms);
+ } else {
+ while (ms--)
+ udelay(1000);
+ }
+}
+
+/**
+ * pr_flush() - Wait for printing threads to catch up.
+ *
+ * @timeout_ms: The maximum time (in ms) to wait.
+ * @reset_on_progress: Reset the timeout if forward progress is seen.
+ *
+ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1
+ * represents infinite waiting.
+ *
+ * If @reset_on_progress is true, the timeout will be reset whenever any
+ * printer has been seen to make some forward progress.
+ *
+ * Context: Any context.
+ * Return: true if all enabled printers are caught up.
+ */
+bool pr_flush(int timeout_ms, bool reset_on_progress)
+{
+ int remaining = timeout_ms;
+ struct console *con;
+ u64 last_diff = 0;
+ bool may_sleep;
+ u64 printk_seq;
+ u64 diff;
+ u64 seq;
+
+ may_sleep = (preemptible() && !in_softirq());
+
+ seq = prb_next_seq(prb);
+
+ for (;;) {
+ diff = 0;
+
+ for_each_console(con) {
+ if (!(con->flags & CON_ENABLED))
+ continue;
+ printk_seq = read_console_seq(con);
+ if (printk_seq < seq)
+ diff += seq - printk_seq;
+ }
+
+ if (diff != last_diff && reset_on_progress)
+ remaining = timeout_ms;
+
+ if (diff == 0 || remaining == 0)
+ break;
+
+ if (remaining < 0) {
+ pr_msleep(may_sleep, 100);
+ } else if (remaining < 100) {
+ pr_msleep(may_sleep, remaining);
+ remaining = 0;
+ } else {
+ pr_msleep(may_sleep, 100);
+ remaining -= 100;
+ }
+
+ last_diff = diff;
+ }
+
+ return (diff == 0);
+}
+EXPORT_SYMBOL(pr_flush);
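
A typical call site, matching the kmsg_dump() hunk above: give the printing threads up to one second of no-progress time before continuing with an operation that may stop them. Treat this as a usage sketch rather than code from the series; my_shutdown_notifier() is invented.

static void my_shutdown_notifier(void)
{
	pr_info("shutting down, draining console output\n");

	/* Reset the 1s budget whenever any console makes forward progress. */
	if (!pr_flush(1000, true))
		pr_warn("console output may not be fully flushed\n");
}
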
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
deleted file mode 100644
index 94232186fccb..000000000000
--- a/kernel/printk/printk_safe.c
+++ /dev/null
@@ -1,414 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * printk_safe.c - Safe printk for printk-deadlock-prone contexts
- */
-
-#include <linux/preempt.h>
-#include <linux/spinlock.h>
-#include <linux/debug_locks.h>
-#include <linux/kdb.h>
-#include <linux/smp.h>
-#include <linux/cpumask.h>
-#include <linux/irq_work.h>
-#include <linux/printk.h>
-#include <linux/kprobes.h>
-
-#include "internal.h"
-
-/*
- * In NMI and safe mode, printk() avoids taking locks. Instead,
- * it uses an alternative implementation that temporary stores
- * the strings into a per-CPU buffer. The content of the buffer
- * is later flushed into the main ring buffer via IRQ work.
- *
- * The alternative implementation is chosen transparently
- * by examining current printk() context mask stored in @printk_context
- * per-CPU variable.
- *
- * The implementation allows to flush the strings also from another CPU.
- * There are situations when we want to make sure that all buffers
- * were handled or when IRQs are blocked.
- */
-
-#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
- sizeof(atomic_t) - \
- sizeof(atomic_t) - \
- sizeof(struct irq_work))
-
-struct printk_safe_seq_buf {
- atomic_t len; /* length of written data */
- atomic_t message_lost;
- struct irq_work work; /* IRQ work that flushes the buffer */
- unsigned char buffer[SAFE_LOG_BUF_LEN];
-};
-
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq);
-static DEFINE_PER_CPU(int, printk_context);
-
-static DEFINE_RAW_SPINLOCK(safe_read_lock);
-
-#ifdef CONFIG_PRINTK_NMI
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
-#endif
-
-/* Get flushed in a more safe context. */
-static void queue_flush_work(struct printk_safe_seq_buf *s)
-{
- if (printk_percpu_data_ready())
- irq_work_queue(&s->work);
-}
-
-/*
- * Add a message to per-CPU context-dependent buffer. NMI and printk-safe
- * have dedicated buffers, because otherwise printk-safe preempted by
- * NMI-printk would have overwritten the NMI messages.
- *
- * The messages are flushed from irq work (or from panic()), possibly,
- * from other CPU, concurrently with printk_safe_log_store(). Should this
- * happen, printk_safe_log_store() will notice the buffer->len mismatch
- * and repeat the write.
- */
-static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
- const char *fmt, va_list args)
-{
- int add;
- size_t len;
- va_list ap;
-
-again:
- len = atomic_read(&s->len);
-
- /* The trailing '\0' is not counted into len. */
- if (len >= sizeof(s->buffer) - 1) {
- atomic_inc(&s->message_lost);
- queue_flush_work(s);
- return 0;
- }
-
- /*
- * Make sure that all old data have been read before the buffer
- * was reset. This is not needed when we just append data.
- */
- if (!len)
- smp_rmb();
-
- va_copy(ap, args);
- add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap);
- va_end(ap);
- if (!add)
- return 0;
-
- /*
- * Do it once again if the buffer has been flushed in the meantime.
- * Note that atomic_cmpxchg() is an implicit memory barrier that
- * makes sure that the data were written before updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, len + add) != len)
- goto again;
-
- queue_flush_work(s);
- return add;
-}
-
-static inline void printk_safe_flush_line(const char *text, int len)
-{
- /*
- * Avoid any console drivers calls from here, because we may be
- * in NMI or printk_safe context (when in panic). The messages
- * must go only into the ring buffer at this stage. Consoles will
- * get explicitly called later when a crashdump is not generated.
- */
- printk_deferred("%.*s", len, text);
-}
-
-/* printk part of the temporary buffer line by line */
-static int printk_safe_flush_buffer(const char *start, size_t len)
-{
- const char *c, *end;
- bool header;
-
- c = start;
- end = start + len;
- header = true;
-
- /* Print line by line. */
- while (c < end) {
- if (*c == '\n') {
- printk_safe_flush_line(start, c - start + 1);
- start = ++c;
- header = true;
- continue;
- }
-
- /* Handle continuous lines or missing new line. */
- if ((c + 1 < end) && printk_get_level(c)) {
- if (header) {
- c = printk_skip_level(c);
- continue;
- }
-
- printk_safe_flush_line(start, c - start);
- start = c++;
- header = true;
- continue;
- }
-
- header = false;
- c++;
- }
-
- /* Check if there was a partial line. Ignore pure header. */
- if (start < end && !header) {
- static const char newline[] = KERN_CONT "\n";
-
- printk_safe_flush_line(start, end - start);
- printk_safe_flush_line(newline, strlen(newline));
- }
-
- return len;
-}
-
-static void report_message_lost(struct printk_safe_seq_buf *s)
-{
- int lost = atomic_xchg(&s->message_lost, 0);
-
- if (lost)
- printk_deferred("Lost %d message(s)!\n", lost);
-}
-
-/*
- * Flush data from the associated per-CPU buffer. The function
- * can be called either via IRQ work or independently.
- */
-static void __printk_safe_flush(struct irq_work *work)
-{
- struct printk_safe_seq_buf *s =
- container_of(work, struct printk_safe_seq_buf, work);
- unsigned long flags;
- size_t len;
- int i;
-
- /*
- * The lock has two functions. First, one reader has to flush all
- * available message to make the lockless synchronization with
- * writers easier. Second, we do not want to mix messages from
- * different CPUs. This is especially important when printing
- * a backtrace.
- */
- raw_spin_lock_irqsave(&safe_read_lock, flags);
-
- i = 0;
-more:
- len = atomic_read(&s->len);
-
- /*
- * This is just a paranoid check that nobody has manipulated
- * the buffer an unexpected way. If we printed something then
- * @len must only increase. Also it should never overflow the
- * buffer size.
- */
- if ((i && i >= len) || len > sizeof(s->buffer)) {
- const char *msg = "printk_safe_flush: internal error\n";
-
- printk_safe_flush_line(msg, strlen(msg));
- len = 0;
- }
-
- if (!len)
- goto out; /* Someone else has already flushed the buffer. */
-
- /* Make sure that data has been written up to the @len */
- smp_rmb();
- i += printk_safe_flush_buffer(s->buffer + i, len - i);
-
- /*
- * Check that nothing has got added in the meantime and truncate
- * the buffer. Note that atomic_cmpxchg() is an implicit memory
- * barrier that makes sure that the data were copied before
- * updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, 0) != len)
- goto more;
-
-out:
- report_message_lost(s);
- raw_spin_unlock_irqrestore(&safe_read_lock, flags);
-}
-
-/**
- * printk_safe_flush - flush all per-cpu nmi buffers.
- *
- * The buffers are flushed automatically via IRQ work. This function
- * is useful only when someone wants to be sure that all buffers have
- * been flushed at some point.
- */
-void printk_safe_flush(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
-#ifdef CONFIG_PRINTK_NMI
- __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work);
-#endif
- __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work);
- }
-}
-
-/**
- * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system
- * goes down.
- *
- * Similar to printk_safe_flush() but it can be called even in NMI context when
- * the system goes down. It does the best effort to get NMI messages into
- * the main ring buffer.
- *
- * Note that it could try harder when there is only one CPU online.
- */
-void printk_safe_flush_on_panic(void)
-{
- /*
- * Make sure that we could access the safe buffers.
- * Do not risk a double release when more CPUs are up.
- */
- if (raw_spin_is_locked(&safe_read_lock)) {
- if (num_online_cpus() > 1)
- return;
-
- debug_locks_off();
- raw_spin_lock_init(&safe_read_lock);
- }
-
- printk_safe_flush();
-}
-
-#ifdef CONFIG_PRINTK_NMI
-/*
- * Safe printk() for NMI context. It uses a per-CPU buffer to
- * store the message. NMIs are not nested, so there is always only
- * one writer running. But the buffer might get flushed from another
- * CPU, so we need to be careful.
- */
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
-void noinstr printk_nmi_enter(void)
-{
- this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
-}
-
-void noinstr printk_nmi_exit(void)
-{
- this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
-}
-
-/*
- * Marks a code that might produce many messages in NMI context
- * and the risk of losing them is more critical than eventual
- * reordering.
- *
- * It has effect only when called in NMI context. Then printk()
- * will store the messages into the main logbuf directly.
- */
-void printk_nmi_direct_enter(void)
-{
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-void printk_nmi_direct_exit(void)
-{
- this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-#else
-
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- return 0;
-}
-
-#endif /* CONFIG_PRINTK_NMI */
-
-/*
- * Lock-less printk(), to avoid deadlocks should the printk() recurse
- * into itself. It uses a per-CPU buffer to store the message, just like
- * NMI.
- */
-static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
-/* Can be preempted by NMI. */
-void __printk_safe_enter(void)
-{
- this_cpu_inc(printk_context);
-}
-
-/* Can be preempted by NMI. */
-void __printk_safe_exit(void)
-{
- this_cpu_dec(printk_context);
-}
-
-asmlinkage int vprintk(const char *fmt, va_list args)
-{
-#ifdef CONFIG_KGDB_KDB
- /* Allow to pass printk() to kdb but avoid a recursion. */
- if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
- return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
-#endif
-
- /*
- * Use the main logbuf even in NMI. But avoid calling console
- * drivers that might have their own locks.
- */
- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) {
- unsigned long flags;
- int len;
-
- printk_safe_enter_irqsave(flags);
- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
- printk_safe_exit_irqrestore(flags);
- defer_console_output();
- return len;
- }
-
- /* Use extra buffer in NMI. */
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- return vprintk_nmi(fmt, args);
-
- /* Use extra buffer to prevent a recursion deadlock in safe mode. */
- if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
- return vprintk_safe(fmt, args);
-
- /* No obstacles. */
- return vprintk_default(fmt, args);
-}
-EXPORT_SYMBOL(vprintk);
-
-void __init printk_safe_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct printk_safe_seq_buf *s;
-
- s = &per_cpu(safe_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-
-#ifdef CONFIG_PRINTK_NMI
- s = &per_cpu(nmi_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-#endif
- }
-
- /* Flush pending messages that did not have scheduled IRQ works. */
- printk_safe_flush();
-}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2997ca600d18..3ed6598357ce 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -197,7 +197,18 @@ static bool ptrace_freeze_traced(struct task_struct *task)
spin_lock_irq(&task->sighand->siglock);
if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
!__fatal_signal_pending(task)) {
+#ifdef CONFIG_PREEMPT_RT
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+ if (task->state & __TASK_TRACED)
+ task->state = __TASK_TRACED;
+ else
+ task->saved_state = __TASK_TRACED;
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+#else
task->state = __TASK_TRACED;
+#endif
ret = true;
}
spin_unlock_irq(&task->sighand->siglock);
@@ -207,8 +218,8 @@ static bool ptrace_freeze_traced(struct task_struct *task)
static void ptrace_unfreeze_traced(struct task_struct *task)
{
- if (task->state != __TASK_TRACED)
- return;
+ unsigned long flags;
+ bool frozen = true;
WARN_ON(!task->ptrace || task->parent != current);
@@ -217,12 +228,21 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
* Recheck state under the lock to close this race.
*/
spin_lock_irq(&task->sighand->siglock);
- if (task->state == __TASK_TRACED) {
- if (__fatal_signal_pending(task))
- wake_up_state(task, __TASK_TRACED);
- else
- task->state = TASK_TRACED;
- }
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+ if (task->state == __TASK_TRACED)
+ task->state = TASK_TRACED;
+#ifdef CONFIG_PREEMPT_RT
+ else if (task->saved_state == __TASK_TRACED)
+ task->saved_state = TASK_TRACED;
+#endif
+ else
+ frozen = false;
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ if (frozen && __fatal_signal_pending(task))
+ wake_up_state(task, __TASK_TRACED);
+
spin_unlock_irq(&task->sighand->siglock);
}
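
On PREEMPT_RT a task blocked on a spinlock-turned-sleeping-lock keeps its original state in p->saved_state while p->state is owned by the lock code, which is why both hunks above inspect and update the two fields under p->pi_lock. A minimal sketch of the resulting state test follows; the helper name is hypothetical (the series provides its own variants, such as task_match_state_lock() used later in sched/core.c):

static bool task_state_matches(struct task_struct *p, long state)
{
	unsigned long flags;
	bool match;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	match = (p->state == state);
#ifdef CONFIG_PREEMPT_RT
	/* The task may be blocked on an RT lock with its state saved. */
	if (!match)
		match = (p->saved_state == state);
#endif
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	return match;
}
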
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 29d2f4c647d3..6096a7d14342 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -61,10 +61,13 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@
#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */
#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */
#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */
-#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */
+#define RCUTORTURE_RDR_ATOM_BH 0x40 /* ... disabling bh while atomic */
+#define RCUTORTURE_RDR_ATOM_RBH 0x80 /* ... RBH while atomic */
+#define RCUTORTURE_RDR_NBITS 8 /* Number of bits defined above. */
#define RCUTORTURE_MAX_EXTEND \
(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
- RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
+ RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED | \
+ RCUTORTURE_RDR_ATOM_BH | RCUTORTURE_RDR_ATOM_RBH)
#define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */
/* Must be power of two minus one. */
#define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3)
@@ -1418,31 +1421,53 @@ static void rcutorture_one_extend(int *readstate, int newstate,
WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1);
rtrsp->rt_readstate = newstate;
- /* First, put new protection in place to avoid critical-section gap. */
+ /*
+ * First, put new protection in place to avoid critical-section gap.
+ * Disable preemption around the ATOM disables to ensure that
+ * in_atomic() is true.
+ */
if (statesnew & RCUTORTURE_RDR_BH)
local_bh_disable();
+ if (statesnew & RCUTORTURE_RDR_RBH)
+ rcu_read_lock_bh();
if (statesnew & RCUTORTURE_RDR_IRQ)
local_irq_disable();
if (statesnew & RCUTORTURE_RDR_PREEMPT)
preempt_disable();
- if (statesnew & RCUTORTURE_RDR_RBH)
- rcu_read_lock_bh();
if (statesnew & RCUTORTURE_RDR_SCHED)
rcu_read_lock_sched();
+ preempt_disable();
+ if (statesnew & RCUTORTURE_RDR_ATOM_BH)
+ local_bh_disable();
+ if (statesnew & RCUTORTURE_RDR_ATOM_RBH)
+ rcu_read_lock_bh();
+ preempt_enable();
if (statesnew & RCUTORTURE_RDR_RCU)
idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT;
- /* Next, remove old protection, irq first due to bh conflict. */
+ /*
+ * Next, remove old protection, in decreasing order of strength
+ * to avoid unlock paths that aren't safe in the stronger
+ * context. Disable preemption around the ATOM enables in
+ * case the context was only atomic due to IRQ disabling.
+ */
+ preempt_disable();
if (statesold & RCUTORTURE_RDR_IRQ)
local_irq_enable();
- if (statesold & RCUTORTURE_RDR_BH)
+ if (statesold & RCUTORTURE_RDR_ATOM_BH)
local_bh_enable();
+ if (statesold & RCUTORTURE_RDR_ATOM_RBH)
+ rcu_read_unlock_bh();
+ preempt_enable();
if (statesold & RCUTORTURE_RDR_PREEMPT)
preempt_enable();
- if (statesold & RCUTORTURE_RDR_RBH)
- rcu_read_unlock_bh();
if (statesold & RCUTORTURE_RDR_SCHED)
rcu_read_unlock_sched();
+ if (statesold & RCUTORTURE_RDR_BH)
+ local_bh_enable();
+ if (statesold & RCUTORTURE_RDR_RBH)
+ rcu_read_unlock_bh();
+
if (statesold & RCUTORTURE_RDR_RCU) {
bool lockit = !statesnew && !(torture_random(trsp) & 0xffff);
@@ -1485,6 +1510,12 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
int mask = rcutorture_extend_mask_max();
unsigned long randmask1 = torture_random(trsp) >> 8;
unsigned long randmask2 = randmask1 >> 3;
+ unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
+ unsigned long nonatomic_bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
+ unsigned long atomic_bhs = RCUTORTURE_RDR_ATOM_BH |
+ RCUTORTURE_RDR_ATOM_RBH;
+ unsigned long tmp;
WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
/* Mostly only one bit (need preemption!), sometimes lots of bits. */
@@ -1492,11 +1523,49 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
mask = mask & randmask2;
else
mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS));
- /* Can't enable bh w/irq disabled. */
- if ((mask & RCUTORTURE_RDR_IRQ) &&
- ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||
- (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))
- mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
+
+ /*
+ * Can't enable bh w/irq disabled.
+ */
+ tmp = atomic_bhs | nonatomic_bhs;
+ if (mask & RCUTORTURE_RDR_IRQ)
+ mask |= oldmask & tmp;
+
+ /*
+ * Ideally these sequences would be detected in debug builds
+ * (regardless of RT), but until then don't stop testing
+ * them on non-RT.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /*
+ * Can't release the outermost rcu lock in an irq disabled
+ * section without preemption also being disabled, if irqs
+ * had ever been enabled during this RCU critical section
+ * (could leak a special flag and delay reporting the qs).
+ */
+ if ((oldmask & RCUTORTURE_RDR_RCU) &&
+ (mask & RCUTORTURE_RDR_IRQ) &&
+ !(mask & preempts))
+ mask |= RCUTORTURE_RDR_RCU;
+
+ /* Can't modify atomic bh in non-atomic context */
+ if ((oldmask & atomic_bhs) && (mask & atomic_bhs) &&
+ !(mask & preempts_irq)) {
+ mask |= oldmask & preempts_irq;
+ if (mask & RCUTORTURE_RDR_IRQ)
+ mask |= oldmask & tmp;
+ }
+ if ((mask & atomic_bhs) && !(mask & preempts_irq))
+ mask |= RCUTORTURE_RDR_PREEMPT;
+
+ /* Can't modify non-atomic bh in atomic context */
+ tmp = nonatomic_bhs;
+ if (oldmask & preempts_irq)
+ mask &= ~tmp;
+ if ((oldmask | mask) & preempts_irq)
+ mask |= oldmask & tmp;
+ }
+
return mask ?: RCUTORTURE_RDR_RCU;
}
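
The RT-specific adjustments above can be summarized as an invariant on the final mask: the atomic-BH bits may only be toggled while the reader is atomic for some other reason. A sketch of that invariant as a standalone check, assuming only the constants defined earlier in this file:

static bool rt_extend_mask_is_sane(int mask)
{
	int atomic_bhs = RCUTORTURE_RDR_ATOM_BH | RCUTORTURE_RDR_ATOM_RBH;
	int preempts_irq = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED |
			   RCUTORTURE_RDR_IRQ;

	/* Atomic-BH manipulation requires preemption or IRQs to be off. */
	if ((mask & atomic_bhs) && !(mask & preempts_irq))
		return false;
	return true;
}
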
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index fcef5f0c60b8..d212815f99a9 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1293,7 +1293,7 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
rttd->notrun = true;
}
-static void rcu_tasks_initiate_self_tests(void)
+void rcu_tasks_initiate_self_tests(void)
{
pr_info("Running RCU-tasks wait API self tests\n");
#ifdef CONFIG_TASKS_RCU
@@ -1330,9 +1330,7 @@ static int rcu_tasks_verify_self_tests(void)
return ret;
}
late_initcall(rcu_tasks_verify_self_tests);
-#else /* #ifdef CONFIG_PROVE_RCU */
-static void rcu_tasks_initiate_self_tests(void) { }
-#endif /* #else #ifdef CONFIG_PROVE_RCU */
+#endif /* #ifdef CONFIG_PROVE_RCU */
void __init rcu_init_tasks_generic(void)
{
@@ -1347,9 +1345,6 @@ void __init rcu_init_tasks_generic(void)
#ifdef CONFIG_TASKS_TRACE_RCU
rcu_spawn_tasks_trace_kthread();
#endif
-
- // Run the self-tests.
- rcu_tasks_initiate_self_tests();
}
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 15b4d2fb6be3..c58c517e1197 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,7 +74,11 @@ __read_mostly int sysctl_resched_latency_warn_once = 1;
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
+#ifdef CONFIG_PREEMPT_RT
+const_debug unsigned int sysctl_sched_nr_migrate = 8;
+#else
const_debug unsigned int sysctl_sched_nr_migrate = 32;
+#endif
/*
* period over which we measure -rt task CPU usage in us.
@@ -630,6 +634,48 @@ void resched_curr(struct rq *rq)
trace_sched_wake_idle_without_ipi(cpu);
}
+#ifdef CONFIG_PREEMPT_LAZY
+
+static int tsk_is_polling(struct task_struct *p)
+{
+#ifdef TIF_POLLING_NRFLAG
+ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
+#else
+ return 0;
+#endif
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ int cpu;
+
+ if (!sched_feat(PREEMPT_LAZY)) {
+ resched_curr(rq);
+ return;
+ }
+
+ lockdep_assert_held(&rq->lock);
+
+ if (test_tsk_need_resched(curr))
+ return;
+
+ if (test_tsk_need_resched_lazy(curr))
+ return;
+
+ set_tsk_need_resched_lazy(curr);
+
+ cpu = cpu_of(rq);
+ if (cpu == smp_processor_id())
+ return;
+
+ /* NEED_RESCHED_LAZY must be visible before we test polling */
+ smp_mb();
+ if (!tsk_is_polling(curr))
+ smp_send_reschedule(cpu);
+}
+#endif
+
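
resched_curr_lazy() only marks the task; the actual reschedule happens at the next preemption point or return to user space, where both flags are consulted. A rough sketch of that check is below; the real wiring lives in the arch/entry patches of this series, not in this hunk.

static void maybe_resched_on_exit(struct task_struct *curr)
{
	/* NEED_RESCHED preempts immediately; the lazy flag defers to here. */
	if (test_tsk_need_resched(curr) || test_tsk_need_resched_lazy(curr))
		schedule();
}
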
void resched_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -1759,6 +1805,7 @@ void migrate_disable(void)
preempt_disable();
this_rq()->nr_pinned++;
p->migration_disabled = 1;
+ preempt_lazy_disable();
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_disable);
@@ -1787,6 +1834,7 @@ void migrate_enable(void)
barrier();
p->migration_disabled = 0;
this_rq()->nr_pinned--;
+ preempt_lazy_enable();
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
@@ -2634,7 +2682,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* is actually now running somewhere else!
*/
while (task_running(rq, p)) {
- if (match_state && unlikely(p->state != match_state))
+ if (match_state && !task_match_state_lock(p, match_state))
return 0;
cpu_relax();
}
@@ -2649,7 +2697,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
running = task_running(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
- if (!match_state || p->state == match_state)
+ if (!match_state || task_match_state_or_saved(p, match_state))
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, p, &rf);
@@ -3209,6 +3257,54 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
}
/*
+ * Invoked from try_to_wake_up() to check whether the task can be woken up.
+ *
+ * The caller holds p::pi_lock if p != current or has preemption
+ * disabled when p == current.
+ *
+ * The rules of PREEMPT_RT saved_state:
+ *
+ * The related locking code always holds p::pi_lock when updating
+ * p::saved_state, which means the code is fully serialized in both cases.
+ *
+ * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
+ * bits are set. This makes it possible to distinguish all wakeup scenarios.
+ */
+static __always_inline
+bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT))
+ WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
+ (state & TASK_RTLOCK_WAIT) != TASK_RTLOCK_WAIT);
+
+ if (p->state & state) {
+ *success = 1;
+ return true;
+ }
+
+#ifdef CONFIG_PREEMPT_RT
+ /*
+	 * Saved state preserves the task state across blocking on
+	 * an RT lock. If the state matches, set p::saved_state to
+ * TASK_RUNNING, but do not wake the task because it waits
+ * for a lock wakeup. Also indicate success because from
+ * the regular waker's point of view this has succeeded.
+ *
+ * After acquiring the lock the task will restore p::state
+ * from p::saved_state which ensures that the regular
+ * wakeup is not lost. The restore will also set
+ * p::saved_state to TASK_RUNNING so any further tests will
+ * not result in false positives vs. @success
+ */
+ if (p->saved_state & state) {
+ p->saved_state = TASK_RUNNING;
+ *success = 1;
+ }
+#endif
+ return false;
+}
+
+/*
* Notes on Program-Order guarantees on SMP systems.
*
* MIGRATION
@@ -3347,10 +3443,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
- if (!(p->state & state))
+ if (!ttwu_state_match(p, state, &success))
goto out;
- success = 1;
trace_sched_waking(p);
p->state = TASK_RUNNING;
trace_sched_wakeup(p);
@@ -3365,14 +3460,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
- if (!(p->state & state))
+
+ if (!ttwu_state_match(p, state, &success))
goto unlock;
trace_sched_waking(p);
- /* We're going to change ->state: */
- success = 1;
-
/*
* Ensure we load p->on_rq _after_ p->state, otherwise it would
* be possible to, falsely, observe p->on_rq == 0 and get stuck
@@ -3802,6 +3895,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->on_cpu = 0;
#endif
init_task_preempt_count(p);
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+ task_thread_info(p)->preempt_lazy_count = 0;
+#endif
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
@@ -4234,23 +4330,18 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* provided by mmdrop(),
* - a sync_core for SYNC_CORE.
*/
+ /*
+ * We use mmdrop_delayed() here so we don't have to do the
+ * full __mmdrop() when we are the last user.
+ */
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
- mmdrop(mm);
+ mmdrop_delayed(mm);
}
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
- /*
- * Remove function-return probe instances associated with this
- * task and put them back on the free list.
- */
- kprobe_flush_task(prev);
-
- /* Task is done with its stack. */
- put_task_stack(prev);
-
put_task_struct_rcu_user(prev);
}
@@ -4986,6 +5077,26 @@ restart:
}
/*
+ * Constants for the sched_mode argument of __schedule().
+ *
+ * The mode argument allows RT enabled kernels to differentiate a
+ * preemption from blocking on a 'sleeping' spin/rwlock by having separate
+ * mask values for SM_MASK_PREEMPT and SM_MASK_STATE, while on a non-RT
+ * enabled kernel the masks have all bits set, which allows the compiler to
+ * optimize the AND operation out.
+ */
+#define SM_NONE 0x0
+#define SM_PREEMPT 0x1
+#ifndef CONFIG_PREEMPT_RT
+# define SM_MASK_PREEMPT UINT_MAX
+# define SM_MASK_STATE SM_MASK_PREEMPT
+#else
+# define SM_RTLOCK_WAIT 0x2
+# define SM_MASK_PREEMPT SM_PREEMPT
+# define SM_MASK_STATE (SM_PREEMPT | SM_RTLOCK_WAIT)
+#endif
+
+/*
* __schedule() is the main scheduler function.
*
* The main means of driving the scheduler and thus entering this function are:
@@ -5024,7 +5135,7 @@ restart:
*
* WARNING: must be called with preemption disabled!
*/
-static void __sched notrace __schedule(bool preempt)
+static void __sched notrace __schedule(unsigned int sched_mode)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
@@ -5037,13 +5148,13 @@ static void __sched notrace __schedule(bool preempt)
rq = cpu_rq(cpu);
prev = rq->curr;
- schedule_debug(prev, preempt);
+ schedule_debug(prev, sched_mode & SM_MASK_STATE);
if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
- rcu_note_context_switch(preempt);
+ rcu_note_context_switch(sched_mode & SM_MASK_STATE);
/*
* Make sure that signal_pending_state()->signal_pending() below
@@ -5077,7 +5188,7 @@ static void __sched notrace __schedule(bool preempt)
* - ptrace_{,un}freeze_traced() can change ->state underneath us.
*/
prev_state = prev->state;
- if (!preempt && prev_state) {
+ if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
if (signal_pending_state(prev_state, prev)) {
prev->state = TASK_RUNNING;
} else {
@@ -5112,6 +5223,7 @@ static void __sched notrace __schedule(bool preempt)
next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
+ clear_tsk_need_resched_lazy(prev);
clear_preempt_need_resched();
#ifdef CONFIG_SCHED_DEBUG
rq->last_seen_need_resched_ns = 0;
@@ -5143,7 +5255,7 @@ static void __sched notrace __schedule(bool preempt)
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
- trace_sched_switch(preempt, prev, next);
+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
@@ -5164,7 +5276,7 @@ void __noreturn do_task_dead(void)
/* Tell freezer to ignore us: */
current->flags |= PF_NOFREEZE;
- __schedule(false);
+ __schedule(SM_NONE);
BUG();
/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
@@ -5225,7 +5337,7 @@ asmlinkage __visible void __sched schedule(void)
sched_submit_work(tsk);
do {
preempt_disable();
- __schedule(false);
+ __schedule(SM_NONE);
sched_preempt_enable_no_resched();
} while (need_resched());
sched_update_worker(tsk);
@@ -5253,7 +5365,7 @@ void __sched schedule_idle(void)
*/
WARN_ON_ONCE(current->state);
do {
- __schedule(false);
+ __schedule(SM_NONE);
} while (need_resched());
}
@@ -5288,6 +5400,18 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
+#ifdef CONFIG_PREEMPT_RT
+void __sched notrace schedule_rtlock(void)
+{
+ do {
+ preempt_disable();
+ __schedule(SM_RTLOCK_WAIT);
+ sched_preempt_enable_no_resched();
+ } while (need_resched());
+}
+NOKPROBE_SYMBOL(schedule_rtlock);
+#endif
+
static void __sched notrace preempt_schedule_common(void)
{
do {
@@ -5306,7 +5430,7 @@ static void __sched notrace preempt_schedule_common(void)
*/
preempt_disable_notrace();
preempt_latency_start(1);
- __schedule(true);
+ __schedule(SM_PREEMPT);
preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
@@ -5317,6 +5441,30 @@ static void __sched notrace preempt_schedule_common(void)
} while (need_resched());
}
+#ifdef CONFIG_PREEMPT_LAZY
+/*
+ * If TIF_NEED_RESCHED is set, we allow being scheduled away, since that flag
+ * is set by an RT task. Otherwise we try to avoid being scheduled out as long
+ * as the preempt_lazy_count counter is > 0.
+ */
+static __always_inline int preemptible_lazy(void)
+{
+ if (test_thread_flag(TIF_NEED_RESCHED))
+ return 1;
+ if (current_thread_info()->preempt_lazy_count)
+ return 0;
+ return 1;
+}
+
+#else
+
+static inline int preemptible_lazy(void)
+{
+ return 1;
+}
+
+#endif
+
#ifdef CONFIG_PREEMPTION
/*
* This is the entry point to schedule() from in-kernel preemption
@@ -5330,7 +5478,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
*/
if (likely(!preemptible()))
return;
-
+ if (!preemptible_lazy())
+ return;
preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
@@ -5363,6 +5512,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
if (likely(!preemptible()))
return;
+ if (!preemptible_lazy())
+ return;
+
do {
/*
* Because the function tracer can trace preempt_count_sub()
@@ -5385,7 +5537,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
* an infinite recursion.
*/
prev_ctx = exception_enter();
- __schedule(true);
+ __schedule(SM_PREEMPT);
exception_exit(prev_ctx);
preempt_latency_stop(1);
@@ -5534,7 +5686,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
do {
preempt_disable();
local_irq_enable();
- __schedule(true);
+ __schedule(SM_PREEMPT);
local_irq_disable();
sched_preempt_enable_no_resched();
} while (need_resched());
@@ -7481,7 +7633,9 @@ void __init init_idle(struct task_struct *idle, int cpu)
/* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu);
-
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+ task_thread_info(idle)->preempt_lazy_count = 0;
+#endif
/*
* The idle tasks have their own, simple scheduling class:
*/
@@ -7586,6 +7740,7 @@ void sched_setnuma(struct task_struct *p, int nid)
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_HOTPLUG_CPU
+
/*
* Ensure that the idle task is using init_mm right before its CPU goes
* offline.
@@ -8262,7 +8417,7 @@ void __init sched_init(void)
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline int preempt_count_equals(int preempt_offset)
{
- int nested = preempt_count() + rcu_preempt_depth();
+ int nested = preempt_count() + sched_rcu_preempt_depth();
return (nested == preempt_offset);
}
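
The SM_MASK_* construction introduced in the core.c hunk above relies on the
non-RT masks being all-ones so the compiler can drop the AND against
sched_mode entirely. A minimal user-space sketch of that behaviour — this is
not kernel code; DEMO_PREEMPT_RT is a stand-in for CONFIG_PREEMPT_RT:

	#include <limits.h>
	#include <stdio.h>

	#define SM_NONE        0x0u
	#define SM_PREEMPT     0x1u
	#define SM_RTLOCK_WAIT 0x2u

	#ifdef DEMO_PREEMPT_RT
	# define SM_MASK_PREEMPT SM_PREEMPT
	# define SM_MASK_STATE   (SM_PREEMPT | SM_RTLOCK_WAIT)
	#else
	# define SM_MASK_PREEMPT UINT_MAX   /* ANDing with this is a no-op */
	# define SM_MASK_STATE   SM_MASK_PREEMPT
	#endif

	int main(void)
	{
		unsigned int modes[] = { SM_NONE, SM_PREEMPT, SM_RTLOCK_WAIT };

		for (int i = 0; i < 3; i++)
			printf("mode %#x: preempt-path=%u state-check=%u\n",
			       modes[i],
			       !!(modes[i] & SM_MASK_PREEMPT),
			       !!(modes[i] & SM_MASK_STATE));
		return 0;
	}

Built with -DDEMO_PREEMPT_RT, SM_RTLOCK_WAIT counts for the state check but
not for the preemption path; without it, both checks collapse to "mode != 0".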
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f60ef0b4ec33..4541a379a0f3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4446,7 +4446,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
@@ -4470,7 +4470,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
return;
if (delta > ideal_runtime)
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
}
static void
@@ -4613,7 +4613,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* validating it and just reschedule.
*/
if (queued) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
return;
}
/*
@@ -4750,7 +4750,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
* hierarchy can be throttled
*/
if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
}
static __always_inline
@@ -5494,7 +5494,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
if (delta < 0) {
if (task_current(rq, p))
- resched_curr(rq);
+ resched_curr_lazy(rq);
return;
}
hrtick_start(rq, delta);
@@ -7123,7 +7123,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
preempt:
- resched_curr(rq);
+ resched_curr_lazy(rq);
/*
* Only set the backward buddy when the current task is still
* on the rq. This can happen when a wakeup gets interleaved
@@ -10854,7 +10854,7 @@ static void task_fork_fair(struct task_struct *p)
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
+ resched_curr_lazy(rq);
}
se->vruntime -= cfs_rq->min_vruntime;
@@ -10881,7 +10881,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
*/
if (task_current(rq, p)) {
if (p->prio > oldprio)
- resched_curr(rq);
+ resched_curr_lazy(rq);
} else
check_preempt_curr(rq, p, 0);
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 7f8dace0964c..d5cee51819bf 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,11 +46,19 @@ SCHED_FEAT(DOUBLE_TICK, false)
*/
SCHED_FEAT(NONTASK_CAPACITY, true)
+#ifdef CONFIG_PREEMPT_RT
+SCHED_FEAT(TTWU_QUEUE, false)
+# ifdef CONFIG_PREEMPT_LAZY
+SCHED_FEAT(PREEMPT_LAZY, true)
+# endif
+#else
+
/*
* Queue remote wakeups on the target CPU and process them
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
SCHED_FEAT(TTWU_QUEUE, true)
+#endif
/*
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f2bc99ca01e5..9bfcf75ad240 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2028,6 +2028,15 @@ extern void reweight_task(struct task_struct *p, int prio);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
+#ifdef CONFIG_PREEMPT_LAZY
+extern void resched_curr_lazy(struct rq *rq);
+#else
+static inline void resched_curr_lazy(struct rq *rq)
+{
+ resched_curr(rq);
+}
+#endif
+
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index e1c655f928c7..f230b1ac7f91 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -64,6 +64,7 @@ void swake_up_all(struct swait_queue_head *q)
struct swait_queue *curr;
LIST_HEAD(tmp);
+ WARN_ON(irqs_disabled());
raw_spin_lock_irq(&q->lock);
list_splice_init(&q->task_list, &tmp);
while (!list_empty(&tmp)) {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 55a0a243e871..3e2e6238ee8b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -526,7 +526,8 @@ static int init_rootdomain(struct root_domain *rd)
#ifdef HAVE_RT_PUSH_IPI
rd->rto_cpu = -1;
raw_spin_lock_init(&rd->rto_lock);
- init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+// init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+ rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);
#endif
rd->visit_gen = 0;
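
IRQ_WORK_INIT_HARD() above replaces init_irq_work() so that on PREEMPT_RT the
RT push IPI work keeps running from hard interrupt context instead of being
deferred to the irq_work thread. A hedged sketch of the same pattern; the
function and variable names here are illustrative, not part of the patch:

	#include <linux/irq_work.h>

	static void example_push_fn(struct irq_work *work)
	{
		/* On PREEMPT_RT this still runs in hard IRQ context. */
	}

	/* Static initialisation, as done for rto_push_work above: */
	static struct irq_work example_work = IRQ_WORK_INIT_HARD(example_push_fn);

	/* Queued from IRQ-safe context, e.g. with a raw spinlock held: */
	/*	irq_work_queue(&example_work);	*/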
diff --git a/kernel/signal.c b/kernel/signal.c
index 30a0bee5ff9b..f722dea57a12 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1316,6 +1316,34 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t)
struct k_sigaction *action;
int sig = info->si_signo;
+ /*
+ * On some archs, PREEMPT_RT has to delay sending a signal from a trap
+	 * since it cannot enable preemption, and the signal code's spin_locks
+	 * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME, which will
+	 * send the signal on exit from the trap.
+ */
+#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
+ if (in_atomic()) {
+ struct task_struct *t = current;
+
+ if (WARN_ON_ONCE(t->forced_info.si_signo))
+ return 0;
+
+ if (is_si_special(info)) {
+ WARN_ON_ONCE(info != SEND_SIG_PRIV);
+ t->forced_info.si_signo = info->si_signo;
+ t->forced_info.si_errno = 0;
+ t->forced_info.si_code = SI_KERNEL;
+ t->forced_info.si_pid = 0;
+ t->forced_info.si_uid = 0;
+ } else {
+ t->forced_info = *info;
+ }
+
+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ return 0;
+ }
+#endif
spin_lock_irqsave(&t->sighand->siglock, flags);
action = &t->sighand->action[sig-1];
ignored = action->sa.sa_handler == SIG_IGN;
@@ -2215,16 +2243,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
if (gstop_done && ptrace_reparented(current))
do_notify_parent_cldstop(current, false, why);
- /*
- * Don't want to allow preemption here, because
- * sys_ptrace() needs this task to be inactive.
- *
- * XXX: implement read_unlock_no_resched().
- */
- preempt_disable();
read_unlock(&tasklist_lock);
cgroup_enter_frozen();
- preempt_enable_no_resched();
freezable_schedule();
cgroup_leave_frozen(true);
} else {
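
For completeness, a hedged sketch of where the siginfo stored by the
ARCH_RT_DELAYS_SIGNAL_SEND path above would be consumed. The real hook lives
in the architecture's signal-delivery path (see the arch/* parts of this
series); the helper name here is illustrative and not copied from the patch:

	/* Illustrative; modelled on the arch-side handling, not taken from it. */
	static void deliver_delayed_signal(void)
	{
		struct task_struct *t = current;

		if (unlikely(t->forced_info.si_signo)) {
			force_sig_info(&t->forced_info);
			t->forced_info.si_signo = 0;
		}
	}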
diff --git a/kernel/smp.c b/kernel/smp.c
index 52bf159ec400..86955bacbae6 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -690,10 +690,21 @@ void flush_smp_call_function_from_idle(void)
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU,
smp_processor_id(), CFD_SEQ_IDLE);
+
local_irq_save(flags);
flush_smp_call_function_queue(true);
- if (local_softirq_pending())
- do_softirq();
+
+ if (local_softirq_pending()) {
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ do_softirq();
+ } else {
+ struct task_struct *ksoftirqd = this_cpu_ksoftirqd();
+
+ if (ksoftirqd && ksoftirqd->state != TASK_RUNNING)
+ wake_up_process(ksoftirqd);
+ }
+ }
local_irq_restore(flags);
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 4a66725b1d4a..b0ad29f80711 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1422,6 +1422,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
base += hrtimer_clockid_to_base(clock_id);
timer->is_soft = softtimer;
timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
+ timer->is_chill = !!(mode & HRTIMER_MODE_CHILL);
timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);
}
@@ -1788,7 +1789,7 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
t->task = NULL;
if (task)
- wake_up_process(task);
+ wake_up_state(task, timer->is_chill ? TASK_RTLOCK_WAIT : TASK_NORMAL);
return HRTIMER_NORESTART;
}
@@ -2006,6 +2007,34 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
}
#endif
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
+ */
+void cpu_chill(void)
+{
+ unsigned int freeze_flag = current->flags & PF_NOFREEZE;
+ ktime_t chill_time;
+
+ local_irq_disable();
+ current_save_and_set_rtlock_wait_state();
+ local_irq_enable();
+
+ chill_time = ktime_set(0, NSEC_PER_MSEC);
+
+ current->flags |= PF_NOFREEZE;
+ schedule_hrtimeout(&chill_time,
+ HRTIMER_MODE_REL_HARD| HRTIMER_MODE_CHILL);
+ if (!freeze_flag)
+ current->flags &= ~PF_NOFREEZE;
+
+ local_irq_disable();
+ current_restore_rtlock_saved_state();
+ local_irq_enable();
+}
+EXPORT_SYMBOL(cpu_chill);
+#endif
+
/*
* Functions related to boot-time initialization:
*/
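
cpu_chill(), added above, is typically used where !RT code would spin with
cpu_relax() on a condition that only a possibly-preempted owner can clear.
A minimal sketch of that retry pattern; the lock and helper name are
placeholders, not taken from this series:

	static void wait_for_owner(spinlock_t *lock)
	{
		while (!spin_trylock(lock)) {
	#ifdef CONFIG_PREEMPT_RT
			cpu_chill();	/* sleep ~1 ms so the holder can run */
	#else
			cpu_relax();
	#endif
		}
		/* lock is held here */
	}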
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2870a7a51638..9c0f56ae3927 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1758,6 +1758,8 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ irq_work_tick_soft();
+
__run_timers(base);
if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fa617a0a9eed..20696524214b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2595,6 +2595,15 @@ enum print_line_t trace_handle_return(struct trace_seq *s)
}
EXPORT_SYMBOL_GPL(trace_handle_return);
+static unsigned short migration_disable_value(void)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+ return current->migration_disabled;
+#else
+ return 0;
+#endif
+}
+
unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
{
unsigned int trace_flags = irqs_status;
@@ -2613,7 +2622,16 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
trace_flags |= TRACE_FLAG_NEED_RESCHED;
if (test_preempt_need_resched())
trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
- return (trace_flags << 16) | (pc & 0xff);
+
+#ifdef CONFIG_PREEMPT_LAZY
+ if (need_resched_lazy())
+ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY;
+#endif
+
+ return (pc & 0xff) |
+ (migration_disable_value() & 0xff) << 8 |
+ (preempt_lazy_count() & 0xff) << 16 |
+ (trace_flags << 24);
}
struct ring_buffer_event *
@@ -4140,14 +4158,17 @@ unsigned long trace_total_entries(struct trace_array *tr)
static void print_lat_help_header(struct seq_file *m)
{
- seq_puts(m, "# _------=> CPU# \n"
- "# / _-----=> irqs-off \n"
- "# | / _----=> need-resched \n"
- "# || / _---=> hardirq/softirq \n"
- "# ||| / _--=> preempt-depth \n"
- "# |||| / delay \n"
- "# cmd pid ||||| time | caller \n"
- "# \\ / ||||| \\ | / \n");
+ seq_puts(m, "# _--------=> CPU# \n"
+ "# / _-------=> irqs-off \n"
+ "# | / _------=> need-resched \n"
+ "# || / _-----=> need-resched-lazy\n"
+ "# ||| / _----=> hardirq/softirq \n"
+ "# |||| / _---=> preempt-depth \n"
+ "# ||||| / _--=> preempt-lazy-depth\n"
+ "# |||||| / _-=> migrate-disable \n"
+ "# ||||||| / delay \n"
+ "# cmd pid |||||||| time | caller \n"
+ "# \\ / |||||||| \\ | / \n");
}
static void print_event_info(struct array_buffer *buf, struct seq_file *m)
@@ -4181,13 +4202,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file
print_event_info(buf, m);
- seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space);
- seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
- seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
- seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
- seq_printf(m, "# %.*s||| / delay\n", prec, space);
- seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID ");
- seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | ");
+ seq_printf(m, "# %.*s _-------=> irqs-off\n", prec, space);
+ seq_printf(m, "# %.*s / _------=> need-resched\n", prec, space);
+ seq_printf(m, "# %.*s| / _-----=> need-resched-lazy\n", prec, space);
+ seq_printf(m, "# %.*s|| / _----=> hardirq/softirq\n", prec, space);
+ seq_printf(m, "# %.*s||| / _---=> preempt-depth\n", prec, space);
+ seq_printf(m, "# %.*s|||| / _--=> preempt-lazy-depth\n", prec, space);
+ seq_printf(m, "# %.*s||||| / _-=> migrate-disable\n", prec, space);
+ seq_printf(m, "# %.*s|||||| / delay\n", prec, space);
+ seq_printf(m, "# TASK-PID %.*s CPU# ||||||| TIMESTAMP FUNCTION\n", prec, " TGID ");
+ seq_printf(m, "# | | %.*s | ||||||| | |\n", prec, " | ");
}
void
@@ -9684,7 +9708,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
tracing_off();
local_irq_save(flags);
- printk_nmi_direct_enter();
/* Simulate the iterator */
trace_init_global_iter(&iter);
@@ -9766,7 +9789,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
}
atomic_dec(&dump_running);
- printk_nmi_direct_exit();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ftrace_dump);
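
The packed context word built in tracing_gen_ctx_irq_test() above now carries
four byte-wide fields. A small sketch of how it decomposes; the struct and
helper are illustrative, not part of the patch:

	struct trace_ctx_fields {		/* illustrative only */
		unsigned char preempt_count;	/* bits  0-7  */
		unsigned char migrate_disable;	/* bits  8-15 */
		unsigned char preempt_lazy;	/* bits 16-23 */
		unsigned char flags;		/* bits 24-31: irqs-off, need-resched, ... */
	};

	static inline struct trace_ctx_fields unpack_trace_ctx(unsigned int ctx)
	{
		struct trace_ctx_fields f = {
			.preempt_count   = ctx & 0xff,
			.migrate_disable = (ctx >> 8) & 0xff,
			.preempt_lazy    = (ctx >> 16) & 0xff,
			.flags           = ctx >> 24,
		};
		return f;
	}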
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 80e96989770e..0e6704248425 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -183,6 +183,8 @@ static int trace_define_common_fields(void)
__common_field(unsigned char, flags);
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
+ __common_field(unsigned char, migrate_disable);
+ __common_field(unsigned char, preempt_lazy_count);
return ret;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d0368a569bfa..321e18fb6907 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -451,6 +451,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
{
char hardsoft_irq;
char need_resched;
+ char need_resched_lazy;
char irqs_off;
int hardirq;
int softirq;
@@ -481,6 +482,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
break;
}
+ need_resched_lazy =
+ (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
+
hardsoft_irq =
(nmi && hardirq) ? 'Z' :
nmi ? 'z' :
@@ -489,14 +493,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
softirq ? 's' :
'.' ;
- trace_seq_printf(s, "%c%c%c",
- irqs_off, need_resched, hardsoft_irq);
+ trace_seq_printf(s, "%c%c%c%c",
+ irqs_off, need_resched, need_resched_lazy,
+ hardsoft_irq);
if (entry->preempt_count)
trace_seq_printf(s, "%x", entry->preempt_count);
else
trace_seq_putc(s, '.');
+ if (entry->preempt_lazy_count)
+ trace_seq_printf(s, "%x", entry->preempt_lazy_count);
+ else
+ trace_seq_putc(s, '.');
+
+ if (entry->migrate_disable)
+ trace_seq_printf(s, "%x", entry->migrate_disable);
+ else
+ trace_seq_putc(s, '.');
+
return !trace_seq_has_overflowed(s);
}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1e1bd6f4a13d..10c15a68588c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1440,7 +1440,7 @@ config DEBUG_ATOMIC_SLEEP
config DEBUG_LOCKING_API_SELFTESTS
bool "Locking API boot-time self-tests"
- depends on DEBUG_KERNEL
+ depends on DEBUG_KERNEL && !PREEMPT_RT
help
Say Y here if you want the kernel to run a short self-test during
bootup. The self-test checks whether common types of locking bugs
diff --git a/lib/bug.c b/lib/bug.c
index 45a0584f6541..03a87df69ed2 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -206,6 +206,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
else
pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n",
(void *)bugaddr);
+ pr_flush(1000, true);
return BUG_TRAP_TYPE_BUG;
}
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 9e14ae02306b..083882a3cf2f 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -557,7 +557,10 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
struct debug_obj *obj;
unsigned long flags;
- fill_pool();
+#ifdef CONFIG_PREEMPT_RT
+ if (preempt_count() == 0 && !irqs_disabled())
+#endif
+ fill_pool();
db = get_bucket((unsigned long) addr);
diff --git a/lib/irq_poll.c b/lib/irq_poll.c
index 2f17b488d58e..7557bf7ecf1f 100644
--- a/lib/irq_poll.c
+++ b/lib/irq_poll.c
@@ -37,6 +37,7 @@ void irq_poll_sched(struct irq_poll *iop)
list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
local_irq_restore(flags);
+ preempt_check_resched_rt();
}
EXPORT_SYMBOL(irq_poll_sched);
@@ -72,6 +73,7 @@ void irq_poll_complete(struct irq_poll *iop)
local_irq_save(flags);
__irq_poll_complete(iop);
local_irq_restore(flags);
+ preempt_check_resched_rt();
}
EXPORT_SYMBOL(irq_poll_complete);
@@ -96,6 +98,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
}
local_irq_enable();
+ preempt_check_resched_rt();
/* Even though interrupts have been re-enabled, this
* access is safe because interrupts can only add new
@@ -133,6 +136,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
__raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
local_irq_enable();
+ preempt_check_resched_rt();
}
/**
@@ -196,6 +200,7 @@ static int irq_poll_cpu_dead(unsigned int cpu)
this_cpu_ptr(&blk_cpu_iopoll));
__raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
local_irq_enable();
+ preempt_check_resched_rt();
return 0;
}
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 0f6b262e0964..2b70ae9f90d3 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -795,6 +795,8 @@ GENERATE_TESTCASE(init_held_rtmutex);
#include "locking-selftest-spin-hardirq.h"
GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
+#ifndef CONFIG_PREEMPT_RT
+
#include "locking-selftest-rlock-hardirq.h"
GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
@@ -810,9 +812,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
#include "locking-selftest-wlock-softirq.h"
GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
+#endif
+
#undef E1
#undef E2
+#ifndef CONFIG_PREEMPT_RT
/*
* Enabling hardirqs with a softirq-safe lock held:
*/
@@ -845,6 +850,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
#undef E1
#undef E2
+#endif
+
/*
* Enabling irqs with an irq-safe lock held:
*/
@@ -868,6 +875,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
#include "locking-selftest-spin-hardirq.h"
GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
+#ifndef CONFIG_PREEMPT_RT
+
#include "locking-selftest-rlock-hardirq.h"
GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
@@ -883,6 +892,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
#include "locking-selftest-wlock-softirq.h"
GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
+#endif
+
#undef E1
#undef E2
@@ -914,6 +925,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
#include "locking-selftest-spin-hardirq.h"
GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
+#ifndef CONFIG_PREEMPT_RT
+
#include "locking-selftest-rlock-hardirq.h"
GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
@@ -929,6 +942,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
#include "locking-selftest-wlock-softirq.h"
GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
+#endif
+
#undef E1
#undef E2
#undef E3
@@ -962,6 +977,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
#include "locking-selftest-spin-hardirq.h"
GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
+#ifndef CONFIG_PREEMPT_RT
+
#include "locking-selftest-rlock-hardirq.h"
GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
@@ -977,10 +994,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
#include "locking-selftest-wlock-softirq.h"
GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
+#endif
+
#undef E1
#undef E2
#undef E3
+#ifndef CONFIG_PREEMPT_RT
+
/*
* read-lock / write-lock irq inversion.
*
@@ -1170,6 +1191,11 @@ GENERATE_PERMUTATIONS_3_EVENTS(W1W2_R2R3_R3W1)
#undef E1
#undef E2
#undef E3
+
+#endif
+
+#ifndef CONFIG_PREEMPT_RT
+
/*
* read-lock / write-lock recursion that is actually safe.
*/
@@ -1216,6 +1242,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_wlock)
#undef E2
#undef E3
+#endif
+
/*
* read-lock / write-lock recursion that is unsafe.
*/
@@ -2842,6 +2870,7 @@ void locking_selftest(void)
printk(" --------------------------------------------------------------------------\n");
+#ifndef CONFIG_PREEMPT_RT
/*
* irq-context testcases:
*/
@@ -2856,6 +2885,28 @@ void locking_selftest(void)
DO_TESTCASE_6x2x2RW("irq read-recursion #2", irq_read_recursion2);
DO_TESTCASE_6x2x2RW("irq read-recursion #3", irq_read_recursion3);
+#else
+ /* On -rt, we only do hardirq context test for raw spinlock */
+ DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
+ DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
+
+ DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
+ DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
+
+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
+
+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
+#endif
ww_tests();
force_read_lock_recursive = 0;
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 8abe1870dba4..b09a490f5f70 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -75,12 +75,6 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
touch_softlockup_watchdog();
}
- /*
- * Force flush any remote buffers that might be stuck in IRQ context
- * and therefore could not run their irq_work.
- */
- printk_safe_flush();
-
clear_bit_unlock(0, &backtrace_flag);
put_cpu();
}
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index a59778946404..907f59045998 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -892,7 +892,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
flush_kernel_dcache_page(miter->page);
if (miter->__flags & SG_MITER_ATOMIC) {
- WARN_ON_ONCE(preemptible());
+ WARN_ON_ONCE(!pagefault_disabled());
kunmap_atomic(miter->addr);
} else
kunmap(miter->page);
diff --git a/lib/test_lockup.c b/lib/test_lockup.c
index 864554e76973..906b598740a7 100644
--- a/lib/test_lockup.c
+++ b/lib/test_lockup.c
@@ -485,13 +485,13 @@ static int __init test_lockup_init(void)
offsetof(spinlock_t, lock.wait_lock.magic),
SPINLOCK_MAGIC) ||
test_magic(lock_rwlock_ptr,
- offsetof(rwlock_t, rtmutex.wait_lock.magic),
+ offsetof(rwlock_t, rwbase.rtmutex.wait_lock.magic),
SPINLOCK_MAGIC) ||
test_magic(lock_mutex_ptr,
- offsetof(struct mutex, lock.wait_lock.magic),
+ offsetof(struct mutex, rtmutex.wait_lock.magic),
SPINLOCK_MAGIC) ||
test_magic(lock_rwsem_ptr,
- offsetof(struct rw_semaphore, rtmutex.wait_lock.magic),
+ offsetof(struct rw_semaphore, rwbase.rtmutex.wait_lock.magic),
SPINLOCK_MAGIC))
return -EINVAL;
#else
@@ -502,7 +502,7 @@ static int __init test_lockup_init(void)
offsetof(rwlock_t, magic),
RWLOCK_MAGIC) ||
test_magic(lock_mutex_ptr,
- offsetof(struct mutex, wait_lock.rlock.magic),
+ offsetof(struct mutex, wait_lock.magic),
SPINLOCK_MAGIC) ||
test_magic(lock_rwsem_ptr,
offsetof(struct rw_semaphore, wait_lock.magic),
diff --git a/localversion-rt b/localversion-rt
new file mode 100644
index 000000000000..6f206be67cd2
--- /dev/null
+++ b/localversion-rt
@@ -0,0 +1 @@
+-rt1
diff --git a/mm/Kconfig b/mm/Kconfig
index 02d44e3420f5..453ce5f4811f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -403,7 +403,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
select COMPACTION
select XARRAY_MULTI
help
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 946d84815693..a9b294463487 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -66,6 +66,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
+#include <linux/local_lock.h>
#include <linux/uaccess.h>
@@ -96,6 +97,13 @@ bool cgroup_memory_noswap __read_mostly;
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
+struct event_lock {
+ local_lock_t l;
+};
+static DEFINE_PER_CPU(struct event_lock, event_lock) = {
+ .l = INIT_LOCAL_LOCK(l),
+};
+
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
@@ -256,7 +264,8 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
extern spinlock_t css_set_lock;
static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
- unsigned int nr_pages);
+ unsigned int nr_pages,
+ bool lock_memcg_stock);
static void obj_cgroup_release(struct percpu_ref *ref)
{
@@ -293,7 +302,7 @@ static void obj_cgroup_release(struct percpu_ref *ref)
spin_lock_irqsave(&css_set_lock, flags);
memcg = obj_cgroup_memcg(objcg);
if (nr_pages)
- obj_cgroup_uncharge_pages(objcg, nr_pages);
+ obj_cgroup_uncharge_pages(objcg, nr_pages, false);
list_del(&objcg->list);
mem_cgroup_put(memcg);
spin_unlock_irqrestore(&css_set_lock, flags);
@@ -693,6 +702,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
+ preempt_disable_rt();
/* Update memcg */
__mod_memcg_state(memcg, idx, val);
@@ -712,6 +722,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
x = 0;
}
__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+ preempt_enable_rt();
}
/**
@@ -2041,6 +2052,7 @@ void unlock_page_memcg(struct page *page)
EXPORT_SYMBOL(unlock_page_memcg);
struct memcg_stock_pcp {
+ local_lock_t lock;
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
@@ -2092,7 +2104,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
@@ -2100,7 +2112,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
ret = true;
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
return ret;
}
@@ -2135,26 +2147,28 @@ static void drain_local_stock(struct work_struct *dummy)
* The only protection from memory hotplug vs. drain_stock races is
* that we always operate on local CPU stock here with IRQ disabled
*/
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.lock, flags);
stock = this_cpu_ptr(&memcg_stock);
drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
}
/*
* Cache charges(val) to local per_cpu area.
* This will be consumed by consume_stock() function, later.
*/
-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
+ bool lock_memcg_stock)
{
struct memcg_stock_pcp *stock;
unsigned long flags;
- local_irq_save(flags);
+ if (lock_memcg_stock)
+ local_lock_irqsave(&memcg_stock.lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
@@ -2167,7 +2181,8 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
if (stock->nr_pages > MEMCG_CHARGE_BATCH)
drain_stock(stock);
- local_irq_restore(flags);
+ if (lock_memcg_stock)
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
}
/*
@@ -2187,7 +2202,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
* as well as workers from this path always operate on the local
* per-cpu data. CPU up doesn't touch memcg_stock at all.
*/
- curcpu = get_cpu();
+ curcpu = get_cpu_light();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
@@ -2210,7 +2225,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
schedule_work_on(cpu, &stock->work);
}
}
- put_cpu();
+ put_cpu_light();
mutex_unlock(&percpu_charge_mutex);
}
@@ -2651,7 +2666,7 @@ force:
done_restock:
if (batch > nr_pages)
- refill_stock(memcg, batch - nr_pages);
+ refill_stock(memcg, batch - nr_pages, true);
/*
* If the hierarchy is above the normal consumption range, schedule
@@ -2903,7 +2918,8 @@ static void memcg_free_cache_id(int id)
* @nr_pages: number of pages to uncharge
*/
static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
- unsigned int nr_pages)
+ unsigned int nr_pages,
+ bool lock_memcg_stock)
{
struct mem_cgroup *memcg;
@@ -2911,7 +2927,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->kmem, nr_pages);
- refill_stock(memcg, nr_pages);
+ refill_stock(memcg, nr_pages, lock_memcg_stock);
css_put(&memcg->css);
}
@@ -2998,7 +3014,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
return;
objcg = __page_objcg(page);
- obj_cgroup_uncharge_pages(objcg, nr_pages);
+ obj_cgroup_uncharge_pages(objcg, nr_pages, true);
page->memcg_data = 0;
obj_cgroup_put(objcg);
}
@@ -3009,7 +3025,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
unsigned long flags;
bool ret = false;
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
@@ -3017,7 +3033,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
ret = true;
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
return ret;
}
@@ -3034,7 +3050,7 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
if (nr_pages)
- obj_cgroup_uncharge_pages(old, nr_pages);
+ obj_cgroup_uncharge_pages(old, nr_pages, false);
/*
* The leftover is flushed to the centralized per-memcg value.
@@ -3073,7 +3089,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
struct memcg_stock_pcp *stock;
unsigned long flags;
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached_objcg != objcg) { /* reset if necessary */
@@ -3087,7 +3103,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
if (stock->nr_bytes > PAGE_SIZE)
drain_obj_stock(stock);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
}
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
@@ -5512,12 +5528,12 @@ static int mem_cgroup_move_account(struct page *page,
ret = 0;
- local_irq_disable();
+ local_lock_irq(&event_lock.l);
mem_cgroup_charge_statistics(to, page, nr_pages);
memcg_check_events(to, page);
mem_cgroup_charge_statistics(from, page, -nr_pages);
memcg_check_events(from, page);
- local_irq_enable();
+ local_unlock_irq(&event_lock.l);
out_unlock:
unlock_page(page);
out:
@@ -6535,10 +6551,10 @@ static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
css_get(&memcg->css);
commit_charge(page, memcg);
- local_irq_disable();
+ local_lock_irq(&event_lock.l);
mem_cgroup_charge_statistics(memcg, page, nr_pages);
memcg_check_events(memcg, page);
- local_irq_enable();
+ local_unlock_irq(&event_lock.l);
out:
return ret;
}
@@ -6665,11 +6681,11 @@ static void uncharge_batch(const struct uncharge_gather *ug)
memcg_oom_recover(ug->memcg);
}
- local_irq_save(flags);
+ local_lock_irqsave(&event_lock.l, flags);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
memcg_check_events(ug->memcg, ug->dummy_page);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&event_lock.l, flags);
/* drop reference from uncharge_page */
css_put(&ug->memcg->css);
@@ -6822,10 +6838,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
css_get(&memcg->css);
commit_charge(newpage, memcg);
- local_irq_save(flags);
+ local_lock_irqsave(&event_lock.l, flags);
mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
memcg_check_events(memcg, newpage);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&event_lock.l, flags);
}
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
@@ -6911,7 +6927,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
- refill_stock(memcg, nr_pages);
+ refill_stock(memcg, nr_pages, true);
}
static int __init cgroup_memory(char *s)
@@ -6953,9 +6969,13 @@ static int __init mem_cgroup_init(void)
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
- for_each_possible_cpu(cpu)
- INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
- drain_local_stock);
+ for_each_possible_cpu(cpu) {
+ struct memcg_stock_pcp *stock;
+
+ stock = per_cpu_ptr(&memcg_stock, cpu);
+ INIT_WORK(&stock->work, drain_local_stock);
+ local_lock_init(&stock->lock);
+ }
for_each_node(node) {
struct mem_cgroup_tree_per_node *rtpn;
@@ -7004,6 +7024,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
struct mem_cgroup *memcg, *swap_memcg;
unsigned int nr_entries;
unsigned short oldid;
+ unsigned long flags;
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
@@ -7052,9 +7073,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* important here to have the interrupts disabled because it is the
* only synchronisation we have for updating the per-CPU variables.
*/
+ local_lock_irqsave(&event_lock.l, flags);
+#ifndef CONFIG_PREEMPT_RT
VM_BUG_ON(!irqs_disabled());
+#endif
mem_cgroup_charge_statistics(memcg, page, -nr_entries);
memcg_check_events(memcg, page);
+ local_unlock_irqrestore(&event_lock.l, flags);
css_put(&memcg->css);
}
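
The memcg_stock and event_lock changes above all follow the same local_lock
conversion: per-CPU data that used to be protected by a bare local_irq_save()
gets an explicit lock object, which maps to interrupt disabling on !RT and to
a per-CPU spinlock on RT. A minimal, self-contained sketch of the pattern;
all names here are illustrative:

	#include <linux/local_lock.h>
	#include <linux/percpu.h>

	struct demo_stock {			/* illustrative per-CPU cache */
		local_lock_t lock;
		unsigned long count;
	};

	static DEFINE_PER_CPU(struct demo_stock, demo_stock) = {
		.lock = INIT_LOCAL_LOCK(lock),
	};

	static void demo_stock_add(unsigned long nr)
	{
		unsigned long flags;

		/* !RT: disables interrupts; RT: takes the per-CPU spinlock. */
		local_lock_irqsave(&demo_stock.lock, flags);
		this_cpu_add(demo_stock.count, nr);
		local_unlock_irqrestore(&demo_stock.lock, flags);
	}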
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d79fa299b70c..9e4b406c79f8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2150,7 +2150,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
return page;
if (page && page_to_nid(page) == nid) {
preempt_disable();
- __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
+ __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
preempt_enable();
}
return page;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index de392908bf64..a379dcdfa7de 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -122,6 +122,13 @@ typedef int __bitwise fpi_t;
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION (8)
+struct pagesets {
+ local_lock_t lock;
+};
+static DEFINE_PER_CPU(struct pagesets, pagesets) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
+
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -1456,6 +1463,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
} while (--count && --batch_free && !list_empty(list));
}
+ /*
+ * local_lock_irq held so equivalent to spin_lock_irqsave for
+ * both PREEMPT_RT and non-PREEMPT_RT configurations.
+ */
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
@@ -1482,13 +1493,15 @@ static void free_one_page(struct zone *zone,
unsigned int order,
int migratetype, fpi_t fpi_flags)
{
- spin_lock(&zone->lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->lock, flags);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
- spin_unlock(&zone->lock);
+ spin_unlock_irqrestore(&zone->lock, flags);
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1571,16 +1584,22 @@ static void __free_pages_ok(struct page *page, unsigned int order,
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
+ struct zone *zone = page_zone(page);
if (!free_pages_prepare(page, order, true, fpi_flags))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
- local_irq_save(flags);
+
+ spin_lock_irqsave(&zone->lock, flags);
+ if (unlikely(has_isolate_pageblock(zone) ||
+ is_migrate_isolate(migratetype))) {
+ migratetype = get_pfnblock_migratetype(page, pfn);
+ }
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
__count_vm_events(PGFREE, 1 << order);
- free_one_page(page_zone(page), page, pfn, order, migratetype,
- fpi_flags);
- local_irq_restore(flags);
}
void __free_pages_core(struct page *page, unsigned int order)
@@ -2958,6 +2977,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
{
int i, allocated = 0;
+ /*
+ * local_lock_irq held so equivalent to spin_lock_irqsave for
+ * both PREEMPT_RT and non-PREEMPT_RT configurations.
+ */
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
@@ -3010,12 +3033,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
unsigned long flags;
int to_drain, batch;
- local_irq_save(flags);
+ local_lock_irqsave(&pagesets.lock, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
free_pcppages_bulk(zone, to_drain, pcp);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&pagesets.lock, flags);
}
#endif
@@ -3029,16 +3052,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
unsigned long flags;
- struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- local_irq_save(flags);
- pset = per_cpu_ptr(zone->pageset, cpu);
+ local_lock_irqsave(&pagesets.lock, flags);
- pcp = &pset->pcp;
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
free_pcppages_bulk(zone, pcp->count, pcp);
- local_irq_restore(flags);
+
+ local_unlock_irqrestore(&pagesets.lock, flags);
}
/*
@@ -3086,9 +3108,9 @@ static void drain_local_pages_wq(struct work_struct *work)
* cpu which is alright but we also have to make sure to not move to
* a different one.
*/
- preempt_disable();
+ migrate_disable();
drain_local_pages(drain->zone);
- preempt_enable();
+ migrate_enable();
}
/*
@@ -3136,7 +3158,7 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
* disables preemption as part of its processing
*/
for_each_online_cpu(cpu) {
- struct per_cpu_pageset *pcp;
+ struct per_cpu_pages *pcp;
struct zone *z;
bool has_pcps = false;
@@ -3147,13 +3169,13 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
*/
has_pcps = true;
} else if (zone) {
- pcp = per_cpu_ptr(zone->pageset, cpu);
- if (pcp->pcp.count)
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ if (pcp->count)
has_pcps = true;
} else {
for_each_populated_zone(z) {
- pcp = per_cpu_ptr(z->pageset, cpu);
- if (pcp->pcp.count) {
+ pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
+ if (pcp->count) {
has_pcps = true;
break;
}
@@ -3258,32 +3280,14 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
return true;
}
-static void free_unref_page_commit(struct page *page, unsigned long pfn)
+static void free_unref_page_commit(struct page *page, unsigned long pfn,
+ int migratetype)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
- int migratetype;
- migratetype = get_pcppage_migratetype(page);
__count_vm_event(PGFREE);
-
- /*
- * We only track unmovable, reclaimable and movable on pcp lists.
- * Free ISOLATE pages back to the allocator because they are being
- * offlined but treat HIGHATOMIC as movable pages so we can get those
- * areas back if necessary. Otherwise, we may have to free
- * excessively into the page allocator
- */
- if (migratetype >= MIGRATE_PCPTYPES) {
- if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, pfn, 0, migratetype,
- FPI_NONE);
- return;
- }
- migratetype = MIGRATE_MOVABLE;
- }
-
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ pcp = this_cpu_ptr(zone->per_cpu_pageset);
list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= READ_ONCE(pcp->high))
@@ -3297,13 +3301,30 @@ void free_unref_page(struct page *page)
{
unsigned long flags;
unsigned long pfn = page_to_pfn(page);
+ int migratetype;
if (!free_unref_page_prepare(page, pfn))
return;
- local_irq_save(flags);
- free_unref_page_commit(page, pfn);
- local_irq_restore(flags);
+ /*
+ * We only track unmovable, reclaimable and movable on pcp lists.
+ * Place ISOLATE pages on the isolated list because they are being
+ * offlined but treat HIGHATOMIC as movable pages so we can get those
+ * areas back if necessary. Otherwise, we may have to free
+ * excessively into the page allocator
+ */
+ migratetype = get_pcppage_migratetype(page);
+ if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
+ if (unlikely(is_migrate_isolate(migratetype))) {
+ free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
+ return;
+ }
+ migratetype = MIGRATE_MOVABLE;
+ }
+
+ local_lock_irqsave(&pagesets.lock, flags);
+ free_unref_page_commit(page, pfn, migratetype);
+ local_unlock_irqrestore(&pagesets.lock, flags);
}
/*
@@ -3314,34 +3335,56 @@ void free_unref_page_list(struct list_head *list)
struct page *page, *next;
unsigned long flags, pfn;
int batch_count = 0;
+ int migratetype;
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
pfn = page_to_pfn(page);
if (!free_unref_page_prepare(page, pfn))
list_del(&page->lru);
+
+ /*
+ * Free isolated pages directly to the allocator, see
+ * comment in free_unref_page.
+ */
+ migratetype = get_pcppage_migratetype(page);
+ if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
+ if (unlikely(is_migrate_isolate(migratetype))) {
+ list_del(&page->lru);
+ free_one_page(page_zone(page), page, pfn, 0,
+ migratetype, FPI_NONE);
+ continue;
+ }
+
+ /*
+ * Non-isolated types over MIGRATE_PCPTYPES get added
+ * to the MIGRATE_MOVABLE pcp list.
+ */
+ set_pcppage_migratetype(page, MIGRATE_MOVABLE);
+ }
+
set_page_private(page, pfn);
}
- local_irq_save(flags);
+ local_lock_irqsave(&pagesets.lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
- unsigned long pfn = page_private(page);
-
+ pfn = page_private(page);
set_page_private(page, 0);
+ migratetype = get_pcppage_migratetype(page);
trace_mm_page_free_batched(page);
- free_unref_page_commit(page, pfn);
+ free_unref_page_commit(page, pfn, migratetype);
/*
* Guard against excessive IRQ disabled times when we get
* a large list of pages to free.
*/
if (++batch_count == SWAP_CLUSTER_MAX) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&pagesets.lock, flags);
batch_count = 0;
- local_irq_save(flags);
+ local_lock_irqsave(&pagesets.lock, flags);
}
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&pagesets.lock, flags);
}
/*
@@ -3440,7 +3483,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt)
*
* Must be called with interrupts disabled.
*/
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+ long nr_account)
{
#ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
@@ -3453,12 +3497,12 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
local_stat = NUMA_OTHER;
if (zone_to_nid(z) == zone_to_nid(preferred_zone))
- __inc_numa_state(z, NUMA_HIT);
+ __count_numa_events(z, NUMA_HIT, nr_account);
else {
- __inc_numa_state(z, NUMA_MISS);
- __inc_numa_state(preferred_zone, NUMA_FOREIGN);
+ __count_numa_events(z, NUMA_MISS, nr_account);
+ __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
}
- __inc_numa_state(z, local_stat);
+ __count_numa_events(z, local_stat, nr_account);
#endif
}
@@ -3498,15 +3542,15 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct page *page;
unsigned long flags;
- local_irq_save(flags);
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ local_lock_irqsave(&pagesets.lock, flags);
+ pcp = this_cpu_ptr(zone->per_cpu_pageset);
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
+ local_unlock_irqrestore(&pagesets.lock, flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
- zone_statistics(preferred_zone, zone);
+ zone_statistics(preferred_zone, zone, 1);
}
- local_irq_restore(flags);
return page;
}
@@ -3558,15 +3602,15 @@ struct page *rmqueue(struct zone *preferred_zone,
if (!page)
page = __rmqueue(zone, order, migratetype, alloc_flags);
} while (page && check_new_pages(page, order));
- spin_unlock(&zone->lock);
if (!page)
goto failed;
+
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
+ spin_unlock_irqrestore(&zone->lock, flags);
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
- zone_statistics(preferred_zone, zone);
- local_irq_restore(flags);
+ zone_statistics(preferred_zone, zone, 1);
out:
/* Separate test+clear to avoid unnecessary atomics */
@@ -3579,7 +3623,7 @@ out:
return page;
failed:
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&zone->lock, flags);
return NULL;
}
@@ -5047,7 +5091,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
struct alloc_context ac;
gfp_t alloc_gfp;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
- int nr_populated = 0;
+ int nr_populated = 0, nr_account = 0;
if (unlikely(nr_pages <= 0))
return 0;
@@ -5104,8 +5148,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
goto failed;
/* Attempt the batch allocation */
- local_irq_save(flags);
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ local_lock_irqsave(&pagesets.lock, flags);
+ pcp = this_cpu_ptr(zone->per_cpu_pageset);
pcp_list = &pcp->lists[ac.migratetype];
while (nr_populated < nr_pages) {
@@ -5124,15 +5168,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
goto failed_irq;
break;
}
-
- /*
- * Ideally this would be batched but the best way to do
- * that cheaply is to first convert zone_statistics to
- * be inaccurate per-cpu counter like vm_events to avoid
- * a RMW cycle then do the accounting with IRQs enabled.
- */
- __count_zid_vm_events(PGALLOC, zone_idx(zone), 1);
- zone_statistics(ac.preferred_zoneref->zone, zone);
+ nr_account++;
prep_new_page(page, 0, gfp, 0);
if (page_list)
@@ -5142,12 +5178,15 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
nr_populated++;
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&pagesets.lock, flags);
+
+ __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
+ zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
return nr_populated;
failed_irq:
- local_irq_restore(flags);
+ local_unlock_irqrestore(&pagesets.lock, flags);
failed:
page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
@@ -5720,7 +5759,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
continue;
for_each_online_cpu(cpu)
- free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+ free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
}
printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
@@ -5812,7 +5851,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
free_pcp = 0;
for_each_online_cpu(cpu)
- free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+ free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
show_node(zone);
printk(KERN_CONT
@@ -5853,7 +5892,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone_page_state(zone, NR_MLOCK)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
- K(this_cpu_read(zone->pageset->pcp.count)),
+ K(this_cpu_read(zone->per_cpu_pageset->count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
@@ -6180,11 +6219,12 @@ static void build_zonelists(pg_data_t *pgdat)
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
*/
-static void pageset_init(struct per_cpu_pageset *p);
+static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
/* These effectively disable the pcplists in the boot pageset completely */
#define BOOT_PAGESET_HIGH 0
#define BOOT_PAGESET_BATCH 1
-static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
+static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
static void __build_all_zonelists(void *data)
@@ -6251,7 +6291,7 @@ build_all_zonelists_init(void)
* (a chicken-egg dilemma).
*/
for_each_possible_cpu(cpu)
- pageset_init(&per_cpu(boot_pageset, cpu));
+ per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));
mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
@@ -6650,14 +6690,13 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
WRITE_ONCE(pcp->high, high);
}
-static void pageset_init(struct per_cpu_pageset *p)
+static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
{
- struct per_cpu_pages *pcp;
int migratetype;
- memset(p, 0, sizeof(*p));
+ memset(pcp, 0, sizeof(*pcp));
+ memset(pzstats, 0, sizeof(*pzstats));
- pcp = &p->pcp;
for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
INIT_LIST_HEAD(&pcp->lists[migratetype]);
@@ -6674,12 +6713,12 @@ static void pageset_init(struct per_cpu_pageset *p)
static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
unsigned long batch)
{
- struct per_cpu_pageset *p;
+ struct per_cpu_pages *pcp;
int cpu;
for_each_possible_cpu(cpu) {
- p = per_cpu_ptr(zone->pageset, cpu);
- pageset_update(&p->pcp, high, batch);
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ pageset_update(pcp, high, batch);
}
}
@@ -6714,13 +6753,20 @@ static void zone_set_pageset_high_and_batch(struct zone *zone)
void __meminit setup_zone_pageset(struct zone *zone)
{
- struct per_cpu_pageset *p;
int cpu;
- zone->pageset = alloc_percpu(struct per_cpu_pageset);
+ /* Size may be 0 on !SMP && !NUMA */
+ if (sizeof(struct per_cpu_zonestat) > 0)
+ zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
+
+ zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
for_each_possible_cpu(cpu) {
- p = per_cpu_ptr(zone->pageset, cpu);
- pageset_init(p);
+ struct per_cpu_pages *pcp;
+ struct per_cpu_zonestat *pzstats;
+
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+ per_cpu_pages_init(pcp, pzstats);
}
zone_set_pageset_high_and_batch(zone);
@@ -6747,9 +6793,9 @@ void __init setup_per_cpu_pageset(void)
* the nodes these zones are associated with.
*/
for_each_possible_cpu(cpu) {
- struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
- memset(pcp->vm_numa_stat_diff, 0,
- sizeof(pcp->vm_numa_stat_diff));
+ struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
+ memset(pzstats->vm_numa_event, 0,
+ sizeof(pzstats->vm_numa_event));
}
#endif
@@ -6765,7 +6811,8 @@ static __meminit void zone_pcp_init(struct zone *zone)
* relies on the ability of the linker to provide the
* offset of a (static) per cpu variable into the per cpu area.
*/
- zone->pageset = &boot_pageset;
+ zone->per_cpu_pageset = &boot_pageset;
+ zone->per_cpu_zonestats = &boot_zonestats;
zone->pageset_high = BOOT_PAGESET_HIGH;
zone->pageset_batch = BOOT_PAGESET_BATCH;
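These hunks rely on struct per_cpu_pageset being split into per_cpu_pages (the pcp lists proper) and per_cpu_zonestat (the counters); the real definitions live in include/linux/mmzone.h and are not part of this excerpt. A rough, abridged sketch built from the fields the hunks reference (exact layout assumed):

struct per_cpu_pages {
	int count;		/* number of pages in the lists */
	int high;		/* high watermark, emptying needed */
	int batch;		/* chunk size for buddy add/remove */
#ifdef CONFIG_NUMA
	short expire;		/* when to drain remote pagesets */
#endif
	struct list_head lists[MIGRATE_PCPTYPES];
};

struct per_cpu_zonestat {
#ifdef CONFIG_SMP
	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
	s8 stat_threshold;
#endif
#ifdef CONFIG_NUMA
	unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
#endif
};

zone->per_cpu_pageset and zone->per_cpu_zonestats then point at two separate percpu allocations, which is why setup_zone_pageset() above allocates both and zone_pcp_reset() later frees both.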
@@ -9050,15 +9097,17 @@ void zone_pcp_enable(struct zone *zone)
void zone_pcp_reset(struct zone *zone)
{
int cpu;
- struct per_cpu_pageset *pset;
+ struct per_cpu_zonestat *pzstats;
- if (zone->pageset != &boot_pageset) {
+ if (zone->per_cpu_pageset != &boot_pageset) {
for_each_online_cpu(cpu) {
- pset = per_cpu_ptr(zone->pageset, cpu);
- drain_zonestat(zone, pset);
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+ drain_zonestat(zone, pzstats);
}
- free_percpu(zone->pageset);
- zone->pageset = &boot_pageset;
+ free_percpu(zone->per_cpu_pageset);
+ free_percpu(zone->per_cpu_zonestats);
+ zone->per_cpu_pageset = &boot_pageset;
+ zone->per_cpu_zonestats = &boot_zonestats;
}
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 680d83cab077..2d2296786db3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -278,10 +278,10 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
ino_t ino;
if (!(sb->s_flags & SB_KERNMOUNT)) {
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
if (sbinfo->max_inodes) {
if (!sbinfo->free_inodes) {
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
return -ENOSPC;
}
sbinfo->free_inodes--;
@@ -304,7 +304,7 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
}
*inop = ino;
}
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
} else if (inop) {
/*
* __shmem_file_setup, one of our callers, is lock-free: it
@@ -319,13 +319,14 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
* to worry about things like glibc compatibility.
*/
ino_t *next_ino;
+
next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
ino = *next_ino;
if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
ino = sbinfo->next_ino;
sbinfo->next_ino += SHMEM_INO_BATCH;
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
if (unlikely(is_zero_ino(ino)))
ino++;
}
@@ -341,9 +342,9 @@ static void shmem_free_inode(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
}
}
@@ -1453,10 +1454,10 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
struct mempolicy *mpol = NULL;
if (sbinfo->mpol) {
- spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
+ raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
mpol = sbinfo->mpol;
mpol_get(mpol);
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
}
return mpol;
}
@@ -3532,9 +3533,10 @@ static int shmem_reconfigure(struct fs_context *fc)
struct shmem_options *ctx = fc->fs_private;
struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
unsigned long inodes;
+ struct mempolicy *mpol = NULL;
const char *err;
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
if (!sbinfo->max_blocks) {
@@ -3579,14 +3581,15 @@ static int shmem_reconfigure(struct fs_context *fc)
* Preserve previous mempolicy unless mpol remount option was specified.
*/
if (ctx->mpol) {
- mpol_put(sbinfo->mpol);
+ mpol = sbinfo->mpol;
sbinfo->mpol = ctx->mpol; /* transfers initial ref */
ctx->mpol = NULL;
}
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
+ mpol_put(mpol);
return 0;
out:
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
return invalfc(fc, "%s", err);
}
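Because freeing memory is not something to do while holding a raw_spinlock_t on PREEMPT_RT, the hunk above only unlinks the old mempolicy under the lock and drops the reference once the lock is released. A userspace analogue of that unlink-then-release pattern (types and names invented):

#include <pthread.h>
#include <stdlib.h>

struct cfg { int value; };

static pthread_spinlock_t cfg_lock;
static struct cfg *current_cfg;

static void cfg_init(void)
{
	pthread_spin_init(&cfg_lock, PTHREAD_PROCESS_PRIVATE);
}

static void cfg_replace(struct cfg *new_cfg)
{
	struct cfg *old;

	pthread_spin_lock(&cfg_lock);
	old = current_cfg;		/* only unlink under the lock */
	current_cfg = new_cfg;
	pthread_spin_unlock(&cfg_lock);

	free(old);			/* the heavy release happens unlocked */
}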
@@ -3703,7 +3706,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sbinfo->mpol = ctx->mpol;
ctx->mpol = NULL;
- spin_lock_init(&sbinfo->stat_lock);
+ raw_spin_lock_init(&sbinfo->stat_lock);
if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
goto failed;
spin_lock_init(&sbinfo->shrinklist_lock);
diff --git a/mm/slub.c b/mm/slub.c
index e32ded30506e..27660a93d456 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -44,13 +44,22 @@
/*
* Lock order:
* 1. slab_mutex (Global Mutex)
- * 2. node->list_lock
- * 3. slab_lock(page) (Only on some arches and for debugging)
+ * 2. node->list_lock (Spinlock)
+ * OR
+ * kmem_cache->cpu_slab->lock (Local lock)
+ * 3. slab_lock(page) (Only on some arches or for debugging)
+ * 4. object_map_lock (Only for debugging)
*
* slab_mutex
*
* The role of the slab_mutex is to protect the list of all the slabs
* and to synchronize major metadata changes to slab cache structures.
+ * Also synchronizes memory hotplug callbacks.
+ *
+ * slab_lock
+ *
+ * The slab_lock is a wrapper around the page lock, thus it is a bit
+ * spinlock.
*
* The slab_lock is only used for debugging and on arches that do not
* have the ability to do a cmpxchg_double. It only protects:
@@ -59,6 +68,8 @@
* C. page->objects -> Number of objects in page
* D. page->frozen -> frozen state
*
+ * Frozen slabs
+ *
* If a slab is frozen then it is exempt from list management. It is not
* on any list except per cpu partial list. The processor that froze the
* slab is the one who can perform list operations on the page. Other
@@ -66,6 +77,8 @@
* froze the slab is the only one that can retrieve the objects from the
* page's freelist.
*
+ * list_lock
+ *
* The list_lock protects the partial and full list on each node and
* the partial slab counter. If taken then no new slabs may be added or
* removed from the lists nor make the number of partial slabs be modified.
@@ -77,10 +90,36 @@
* slabs, operations can continue without any centralized lock. F.e.
* allocating a long series of objects that fill up slabs does not require
* the list lock.
- * Interrupts are disabled during allocation and deallocation in order to
- * make the slab allocator safe to use in the context of an irq. In addition
- * interrupts are disabled to ensure that the processor does not change
- * while handling per_cpu slabs, due to kernel preemption.
+ *
+ * cpu_slab->lock local lock
+ *
+ * This lock protects slowpath manipulation of all kmem_cache_cpu fields
+ * except the stat counters. This is a percpu structure manipulated only by
+ * the local cpu, so the lock protects against being preempted or interrupted
+ * by an irq. Fast path operations rely on lockless operations instead.
+ * On PREEMPT_RT, the local lock does not actually disable irqs (and thus
+ * prevent the lockless operations), so fastpath operations also need to take
+ * the lock and are no longer lockless.
+ *
+ * lockless fastpaths
+ *
+ * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
+ * are fully lockless when satisfied from the percpu slab (and when
+ * cmpxchg_double is possible to use, otherwise slab_lock is taken).
+ * They also don't disable preemption or migration or irqs. They rely on
+ * the transaction id (tid) field to detect being preempted or moved to
+ * another cpu.
+ *
+ * irq, preemption, migration considerations
+ *
+ * Interrupts are disabled as part of list_lock or local_lock operations, or
+ * around the slab_lock operation, in order to make the slab allocator safe
+ * to use in the context of an irq.
+ *
+ * In addition, preemption (or migration on PREEMPT_RT) is disabled in the
+ * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
+ * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
+ * doesn't have to be revalidated in each section protected by the local lock.
*
* SLUB assigns one slab for allocation to each processor.
* Allocations only occur from these slabs called cpu slabs.
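The "cpu_slab->lock local lock" paragraph above refers to a local_lock_t embedded in struct kmem_cache_cpu (initialised in init_kmem_cache_cpus() further down in this patch). The definition lives in include/linux/slub_def.h and is not shown here; an abridged sketch with the layout assumed:

struct kmem_cache_cpu {
	void **freelist;	/* pointer to next available object */
	unsigned long tid;	/* globally unique transaction id */
	struct page *page;	/* slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	struct page *partial;	/* partially allocated frozen slabs */
#endif
	local_lock_t lock;	/* protects the fields above in the slow paths */
};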
@@ -116,6 +155,26 @@
* the fast path and disables lockless freelists.
*/
+/*
+ * We could simply use migrate_disable()/enable(), but as long as that remains
+ * a function call even on !PREEMPT_RT, use the inline preempt_disable() there.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define slub_get_cpu_ptr(var) get_cpu_ptr(var)
+#define slub_put_cpu_ptr(var) put_cpu_ptr(var)
+#else
+#define slub_get_cpu_ptr(var) \
+({ \
+ migrate_disable(); \
+ this_cpu_ptr(var); \
+})
+#define slub_put_cpu_ptr(var) \
+do { \
+ (void)(var); \
+ migrate_enable(); \
+} while (0)
+#endif
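A minimal usage sketch of the pair above (the function is hypothetical, not from the patch): between get and put the returned pointer stays usable because either preemption (!PREEMPT_RT) or migration (PREEMPT_RT) is disabled.

static unsigned long peek_local_tid(struct kmem_cache *s)
{
	struct kmem_cache_cpu *c;
	unsigned long tid;

	c = slub_get_cpu_ptr(s->cpu_slab);	/* pins us to this CPU's data */
	tid = READ_ONCE(c->tid);
	slub_put_cpu_ptr(s->cpu_slab);

	return tid;
}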
+
#ifdef CONFIG_SLUB_DEBUG
#ifdef CONFIG_SLUB_DEBUG_ON
DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
@@ -354,25 +413,42 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
/*
* Per slab locking using the pagelock
*/
-static __always_inline void slab_lock(struct page *page)
+static __always_inline void
+__slab_lock(struct page *page, unsigned long *flags, bool disable_irqs)
{
VM_BUG_ON_PAGE(PageTail(page), page);
+ if (disable_irqs)
+ local_irq_save(*flags);
bit_spin_lock(PG_locked, &page->flags);
}
-static __always_inline void slab_unlock(struct page *page)
+static __always_inline void
+__slab_unlock(struct page *page, unsigned long *flags, bool disable_irqs)
{
VM_BUG_ON_PAGE(PageTail(page), page);
__bit_spin_unlock(PG_locked, &page->flags);
+ if (disable_irqs)
+ local_irq_restore(*flags);
}
-/* Interrupts must be disabled (for the fallback code to work right) */
-static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
+static __always_inline void
+slab_lock(struct page *page, unsigned long *flags)
+{
+ __slab_lock(page, flags, IS_ENABLED(CONFIG_PREEMPT_RT));
+}
+
+static __always_inline void slab_unlock(struct page *page, unsigned long *flags)
+{
+ __slab_unlock(page, flags, IS_ENABLED(CONFIG_PREEMPT_RT));
+}
+
+static inline bool ___cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
- const char *n)
+ const char *n, bool disable_irqs)
{
- VM_BUG_ON(!irqs_disabled());
+ if (!disable_irqs)
+ lockdep_assert_irqs_disabled();
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
if (s->flags & __CMPXCHG_DOUBLE) {
@@ -383,15 +459,17 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
} else
#endif
{
- slab_lock(page);
+ unsigned long flags;
+
+ __slab_lock(page, &flags, disable_irqs);
if (page->freelist == freelist_old &&
page->counters == counters_old) {
page->freelist = freelist_new;
page->counters = counters_new;
- slab_unlock(page);
+ __slab_unlock(page, &flags, disable_irqs);
return true;
}
- slab_unlock(page);
+ __slab_unlock(page, &flags, disable_irqs);
}
cpu_relax();
@@ -404,51 +482,46 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
return false;
}
+/*
+ * Interrupts must be disabled (for the fallback code to work right), typically
+ * by an _irqsave() lock variant. On PREEMPT_RT those locks do not disable
+ * interrupts, so we disable them explicitly here.
+ */
+static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new,
+ const char *n)
+{
+ return ___cmpxchg_double_slab(s, page, freelist_old, counters_old,
+ freelist_new, counters_new, n,
+ IS_ENABLED(CONFIG_PREEMPT_RT));
+}
+
static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
- if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&page->freelist, &page->counters,
- freelist_old, counters_old,
- freelist_new, counters_new))
- return true;
- } else
-#endif
- {
- unsigned long flags;
+ return ___cmpxchg_double_slab(s, page, freelist_old, counters_old,
+ freelist_new, counters_new, n, true);
+}
- local_irq_save(flags);
- slab_lock(page);
- if (page->freelist == freelist_old &&
- page->counters == counters_old) {
- page->freelist = freelist_new;
- page->counters = counters_new;
- slab_unlock(page);
- local_irq_restore(flags);
- return true;
- }
- slab_unlock(page);
- local_irq_restore(flags);
- }
+#ifdef CONFIG_SLUB_DEBUG
+static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
+static DEFINE_RAW_SPINLOCK(object_map_lock);
- cpu_relax();
- stat(s, CMPXCHG_DOUBLE_FAIL);
+static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
+ struct page *page)
+{
+ void *addr = page_address(page);
+ void *p;
-#ifdef SLUB_DEBUG_CMPXCHG
- pr_info("%s %s: cmpxchg double redo ", n, s->name);
-#endif
+ bitmap_zero(obj_map, page->objects);
- return false;
+ for (p = page->freelist; p; p = get_freepointer(s, p))
+ set_bit(__obj_to_index(s, addr, p), obj_map);
}
-#ifdef CONFIG_SLUB_DEBUG
-static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
-static DEFINE_SPINLOCK(object_map_lock);
-
/*
* Determine a map of object in use on a page.
*
@@ -458,17 +531,11 @@ static DEFINE_SPINLOCK(object_map_lock);
static unsigned long *get_map(struct kmem_cache *s, struct page *page)
__acquires(&object_map_lock)
{
- void *p;
- void *addr = page_address(page);
-
VM_BUG_ON(!irqs_disabled());
- spin_lock(&object_map_lock);
+ raw_spin_lock(&object_map_lock);
- bitmap_zero(object_map, page->objects);
-
- for (p = page->freelist; p; p = get_freepointer(s, p))
- set_bit(__obj_to_index(s, addr, p), object_map);
+ __fill_map(object_map, s, page);
return object_map;
}
@@ -476,7 +543,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
static void put_map(unsigned long *map) __releases(&object_map_lock)
{
VM_BUG_ON(map != object_map);
- spin_unlock(&object_map_lock);
+ raw_spin_unlock(&object_map_lock);
}
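__fill_map() lets later hunks (validate_slab_cache(), list_locations()) use a caller-provided bitmap instead of the shared static object_map. The resulting usage shape, pieced together from those hunks as a sketch (the real callers run this with the page stabilised under n->list_lock and/or slab_lock):

unsigned long *obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);

if (obj_map) {
	void *addr = page_address(page);
	void *p;

	__fill_map(obj_map, s, page);		/* set bits for objects on the freelist */
	for_each_object(p, s, addr, page->objects)
		if (!test_bit(__obj_to_index(s, addr, p), obj_map))
			pr_debug("object %p is allocated\n", p);
	bitmap_free(obj_map);
}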
static inline unsigned int size_from_object(struct kmem_cache *s)
@@ -963,8 +1030,6 @@ static int check_slab(struct kmem_cache *s, struct page *page)
{
int maxobj;
- VM_BUG_ON(!irqs_disabled());
-
if (!PageSlab(page)) {
slab_err(s, page, "Not a valid slab page");
return 0;
@@ -1225,11 +1290,11 @@ static noinline int free_debug_processing(
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
void *object = head;
int cnt = 0;
- unsigned long flags;
+ unsigned long flags, flags2;
int ret = 0;
spin_lock_irqsave(&n->list_lock, flags);
- slab_lock(page);
+ slab_lock(page, &flags2);
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
if (!check_slab(s, page))
@@ -1262,7 +1327,7 @@ out:
slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
bulk_cnt, cnt);
- slab_unlock(page);
+ slab_unlock(page, &flags2);
spin_unlock_irqrestore(&n->list_lock, flags);
if (!ret)
slab_fix(s, "Object at 0x%p not freed", object);
@@ -1540,20 +1605,8 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s,
{
kmemleak_free_recursive(x, s->flags);
- /*
- * Trouble is that we may no longer disable interrupts in the fast path
- * So in order to make the debug calls that expect irqs to be
- * disabled we need to disable interrupts temporarily.
- */
-#ifdef CONFIG_LOCKDEP
- {
- unsigned long flags;
+ debug_check_no_locks_freed(x, s->object_size);
- local_irq_save(flags);
- debug_check_no_locks_freed(x, s->object_size);
- local_irq_restore(flags);
- }
-#endif
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(x, s->object_size);
@@ -1770,9 +1823,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
flags &= gfp_allowed_mask;
- if (gfpflags_allow_blocking(flags))
- local_irq_enable();
-
flags |= s->allocflags;
/*
@@ -1831,8 +1881,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
page->frozen = 1;
out:
- if (gfpflags_allow_blocking(flags))
- local_irq_disable();
if (!page)
return NULL;
@@ -1846,6 +1894,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
if (unlikely(flags & GFP_SLAB_BUG_MASK))
flags = kmalloc_fix_flags(flags);
+ WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
+
return allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}
@@ -1976,11 +2026,12 @@ static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
* Try to allocate a partial slab from a specific node.
*/
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
- struct kmem_cache_cpu *c, gfp_t flags)
+ struct page **ret_page, gfp_t gfpflags)
{
struct page *page, *page2;
void *object = NULL;
unsigned int available = 0;
+ unsigned long flags;
int objects;
/*
@@ -1992,11 +2043,11 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
if (!n || !n->nr_partial)
return NULL;
- spin_lock(&n->list_lock);
+ spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
void *t;
- if (!pfmemalloc_match(page, flags))
+ if (!pfmemalloc_match(page, gfpflags))
continue;
t = acquire_slab(s, n, page, object == NULL, &objects);
@@ -2005,7 +2056,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
available += objects;
if (!object) {
- c->page = page;
+ *ret_page = page;
stat(s, ALLOC_FROM_PARTIAL);
object = t;
} else {
@@ -2017,7 +2068,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
break;
}
- spin_unlock(&n->list_lock);
+ spin_unlock_irqrestore(&n->list_lock, flags);
return object;
}
@@ -2025,7 +2076,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
* Get a page from somewhere. Search in increasing NUMA distances.
*/
static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
- struct kmem_cache_cpu *c)
+ struct page **ret_page)
{
#ifdef CONFIG_NUMA
struct zonelist *zonelist;
@@ -2067,7 +2118,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
if (n && cpuset_zone_allowed(zone, flags) &&
n->nr_partial > s->min_partial) {
- object = get_partial_node(s, n, c, flags);
+ object = get_partial_node(s, n, ret_page, flags);
if (object) {
/*
* Don't check read_mems_allowed_retry()
@@ -2089,7 +2140,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
* Get a partial page, lock it and return it.
*/
static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
- struct kmem_cache_cpu *c)
+ struct page **ret_page)
{
void *object;
int searchnode = node;
@@ -2097,11 +2148,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();
- object = get_partial_node(s, get_node(s, searchnode), c, flags);
+ object = get_partial_node(s, get_node(s, searchnode), ret_page, flags);
if (object || node != NUMA_NO_NODE)
return object;
- return get_any_partial(s, flags, c);
+ return get_any_partial(s, flags, ret_page);
}
#ifdef CONFIG_PREEMPTION
@@ -2168,16 +2219,23 @@ static inline void note_cmpxchg_failure(const char *n,
static void init_kmem_cache_cpus(struct kmem_cache *s)
{
int cpu;
+ struct kmem_cache_cpu *c;
- for_each_possible_cpu(cpu)
- per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(s->cpu_slab, cpu);
+ local_lock_init(&c->lock);
+ c->tid = init_tid(cpu);
+ }
}
/*
- * Remove the cpu slab
+ * Finishes removing the cpu slab. Merges cpu's freelist with page's freelist,
+ * unfreezes the slab and puts it on the proper list.
+ * Assumes the slab has already been safely taken away from kmem_cache_cpu
+ * by the caller.
*/
static void deactivate_slab(struct kmem_cache *s, struct page *page,
- void *freelist, struct kmem_cache_cpu *c)
+ void *freelist)
{
enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
@@ -2185,6 +2243,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
enum slab_modes l = M_NONE, m = M_NONE;
void *nextfree, *freelist_iter, *freelist_tail;
int tail = DEACTIVATE_TO_HEAD;
+ unsigned long flags = 0;
struct page new;
struct page old;
@@ -2260,7 +2319,7 @@ redo:
* that acquire_slab() will see a slab page that
* is frozen
*/
- spin_lock(&n->list_lock);
+ spin_lock_irqsave(&n->list_lock, flags);
}
} else {
m = M_FULL;
@@ -2271,7 +2330,7 @@ redo:
* slabs from diagnostic functions will not see
* any frozen slabs.
*/
- spin_lock(&n->list_lock);
+ spin_lock_irqsave(&n->list_lock, flags);
}
}
@@ -2288,14 +2347,14 @@ redo:
}
l = m;
- if (!__cmpxchg_double_slab(s, page,
+ if (!cmpxchg_double_slab(s, page,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab"))
goto redo;
if (lock)
- spin_unlock(&n->list_lock);
+ spin_unlock_irqrestore(&n->list_lock, flags);
if (m == M_PARTIAL)
stat(s, tail);
@@ -2306,38 +2365,29 @@ redo:
discard_slab(s, page);
stat(s, FREE_SLAB);
}
-
- c->page = NULL;
- c->freelist = NULL;
}
-/*
- * Unfreeze all the cpu partial slabs.
- *
- * This function must be called with interrupts disabled
- * for the cpu using c (or some other guarantee must be there
- * to guarantee no concurrent accesses).
- */
-static void unfreeze_partials(struct kmem_cache *s,
- struct kmem_cache_cpu *c)
-{
#ifdef CONFIG_SLUB_CPU_PARTIAL
+static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page)
+{
struct kmem_cache_node *n = NULL, *n2 = NULL;
struct page *page, *discard_page = NULL;
+ unsigned long flags = 0;
- while ((page = slub_percpu_partial(c))) {
+ while (partial_page) {
struct page new;
struct page old;
- slub_set_percpu_partial(c, page);
+ page = partial_page;
+ partial_page = page->next;
n2 = get_node(s, page_to_nid(page));
if (n != n2) {
if (n)
- spin_unlock(&n->list_lock);
+ spin_unlock_irqrestore(&n->list_lock, flags);
n = n2;
- spin_lock(&n->list_lock);
+ spin_lock_irqsave(&n->list_lock, flags);
}
do {
@@ -2366,7 +2416,7 @@ static void unfreeze_partials(struct kmem_cache *s,
}
if (n)
- spin_unlock(&n->list_lock);
+ spin_unlock_irqrestore(&n->list_lock, flags);
while (discard_page) {
page = discard_page;
@@ -2376,10 +2426,50 @@ static void unfreeze_partials(struct kmem_cache *s,
discard_slab(s, page);
stat(s, FREE_SLAB);
}
-#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
/*
+ * Unfreeze all the cpu partial slabs.
+ *
+ * This function must be called with preemption or migration
+ * disabled.
+ */
+static void unfreeze_partials(struct kmem_cache *s)
+{
+ struct page *partial_page;
+
+ do {
+ partial_page = this_cpu_read(s->cpu_slab->partial);
+
+ } while (partial_page &&
+ this_cpu_cmpxchg(s->cpu_slab->partial, partial_page, NULL)
+ != partial_page);
+
+ if (partial_page)
+ __unfreeze_partials(s, partial_page);
+}
+
+static void unfreeze_partials_cpu(struct kmem_cache *s,
+ struct kmem_cache_cpu *c)
+{
+ struct page *partial_page;
+
+ partial_page = slub_percpu_partial(c);
+ c->partial = NULL;
+
+ if (partial_page)
+ __unfreeze_partials(s, partial_page);
+}
+
+#else /* CONFIG_SLUB_CPU_PARTIAL */
+
+static void unfreeze_partials(struct kmem_cache *s) { }
+static void unfreeze_partials_cpu(struct kmem_cache *s,
+ struct kmem_cache_cpu *c) { }
+
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
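unfreeze_partials() above detaches the whole percpu partial list with a this_cpu_cmpxchg() retry loop and only then walks it, so the walk itself needs no irq or preemption disabling. A compact userspace analogue of the detach step, using a plain atomic exchange instead of a cmpxchg loop (names invented):

#include <stdatomic.h>
#include <stddef.h>

struct pnode { struct pnode *next; };

static _Atomic(struct pnode *) partial_head;

/* Grab the entire list in one shot; concurrent producers see an empty list. */
static struct pnode *detach_all(void)
{
	return atomic_exchange(&partial_head, NULL);
}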
+
+/*
* Put a page that was just frozen (in __slab_free|get_partial_node) into a
* partial page slot if available.
*
@@ -2393,7 +2483,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
int pages;
int pobjects;
- preempt_disable();
+ slub_get_cpu_ptr(s->cpu_slab);
do {
pages = 0;
pobjects = 0;
@@ -2403,14 +2493,11 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
pobjects = oldpage->pobjects;
pages = oldpage->pages;
if (drain && pobjects > slub_cpu_partial(s)) {
- unsigned long flags;
/*
* partial array is full. Move the existing
* set to the per node partial list.
*/
- local_irq_save(flags);
- unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
- local_irq_restore(flags);
+ unfreeze_partials(s);
oldpage = NULL;
pobjects = 0;
pages = 0;
@@ -2427,58 +2514,113 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
!= oldpage);
- if (unlikely(!slub_cpu_partial(s))) {
- unsigned long flags;
-
- local_irq_save(flags);
- unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
- local_irq_restore(flags);
- }
- preempt_enable();
+ slub_put_cpu_ptr(s->cpu_slab);
#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
-static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c,
+ bool lock)
{
- stat(s, CPUSLAB_FLUSH);
- deactivate_slab(s, c->page, c->freelist, c);
+ unsigned long flags;
+ void *freelist;
+ struct page *page;
+
+ if (lock)
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+
+ freelist = c->freelist;
+ page = c->page;
+ c->page = NULL;
+ c->freelist = NULL;
c->tid = next_tid(c->tid);
+
+ if (lock)
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+
+ if (page)
+ deactivate_slab(s, page, freelist);
+
+ stat(s, CPUSLAB_FLUSH);
}
-/*
- * Flush cpu slab.
- *
- * Called from IPI handler with interrupts disabled.
- */
static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
if (c->page)
- flush_slab(s, c);
+ flush_slab(s, c, false);
- unfreeze_partials(s, c);
+ unfreeze_partials_cpu(s, c);
}
-static void flush_cpu_slab(void *d)
+struct slub_flush_work {
+ struct work_struct work;
+ struct kmem_cache *s;
+ bool skip;
+};
+
+/*
+ * Flush cpu slab.
+ *
+ * Called from CPU work handler with migration disabled.
+ */
+static void flush_cpu_slab(struct work_struct *w)
{
- struct kmem_cache *s = d;
+ struct kmem_cache *s;
+ struct kmem_cache_cpu *c;
+ struct slub_flush_work *sfw;
- __flush_cpu_slab(s, smp_processor_id());
+ sfw = container_of(w, struct slub_flush_work, work);
+
+ s = sfw->s;
+ c = this_cpu_ptr(s->cpu_slab);
+
+ if (c->page)
+ flush_slab(s, c, true);
+
+ unfreeze_partials(s);
}
-static bool has_cpu_slab(int cpu, void *info)
+static bool has_cpu_slab(int cpu, struct kmem_cache *s)
{
- struct kmem_cache *s = info;
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
return c->page || slub_percpu_partial(c);
}
+static DEFINE_MUTEX(flush_lock);
+static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
+
static void flush_all(struct kmem_cache *s)
{
- on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
+ struct slub_flush_work *sfw;
+ unsigned int cpu;
+
+ cpus_read_lock();
+ mutex_lock(&flush_lock);
+
+ for_each_online_cpu(cpu) {
+ sfw = &per_cpu(slub_flush, cpu);
+ if (!has_cpu_slab(cpu, s)) {
+ sfw->skip = true;
+ continue;
+ }
+ INIT_WORK(&sfw->work, flush_cpu_slab);
+ sfw->skip = false;
+ sfw->s = s;
+ schedule_work_on(cpu, &sfw->work);
+ }
+
+ for_each_online_cpu(cpu) {
+ sfw = &per_cpu(slub_flush, cpu);
+ if (sfw->skip)
+ continue;
+ flush_work(&sfw->work);
+ }
+
+ mutex_unlock(&flush_lock);
+ cpus_read_unlock();
}
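flush_all() now queues per-CPU work items and waits for them instead of sending IPIs, so the flush runs in preemptible process context. For comparison only: when no per-cache skip logic is needed, the workqueue API already offers this shape in one call; a minimal sketch (do_local_flush() is an invented handler, and this is not what the patch uses):

#include <linux/workqueue.h>

static void do_local_flush(struct work_struct *work)
{
	/* runs in process context, one invocation per online CPU */
}

static int flush_everything(void)
{
	/* schedules the work on every online CPU and waits for completion */
	return schedule_on_each_cpu(do_local_flush);
}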
/*
@@ -2488,14 +2630,10 @@ static void flush_all(struct kmem_cache *s)
static int slub_cpu_dead(unsigned int cpu)
{
struct kmem_cache *s;
- unsigned long flags;
mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list) {
- local_irq_save(flags);
+ list_for_each_entry(s, &slab_caches, list)
__flush_cpu_slab(s, cpu);
- local_irq_restore(flags);
- }
mutex_unlock(&slab_mutex);
return 0;
}
@@ -2578,41 +2716,6 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
#endif
}
-static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
- int node, struct kmem_cache_cpu **pc)
-{
- void *freelist;
- struct kmem_cache_cpu *c = *pc;
- struct page *page;
-
- WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
-
- freelist = get_partial(s, flags, node, c);
-
- if (freelist)
- return freelist;
-
- page = new_slab(s, flags, node);
- if (page) {
- c = raw_cpu_ptr(s->cpu_slab);
- if (c->page)
- flush_slab(s, c);
-
- /*
- * No other reference to the page yet so we can
- * muck around with it freely without cmpxchg
- */
- freelist = page->freelist;
- page->freelist = NULL;
-
- stat(s, ALLOC_SLAB);
- c->page = page;
- *pc = c;
- }
-
- return freelist;
-}
-
static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
{
if (unlikely(PageSlabPfmemalloc(page)))
@@ -2671,7 +2774,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
* we need to allocate a new slab. This is the slowest path since it involves
* a call to the page allocator and the setup of a new slab.
*
- * Version of __slab_alloc to use when we know that interrupts are
+ * Version of __slab_alloc to use when we know that preemption is
* already disabled (which is the case for bulk allocation).
*/
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
@@ -2679,10 +2782,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
{
void *freelist;
struct page *page;
+ unsigned long flags;
stat(s, ALLOC_SLOWPATH);
- page = c->page;
+reread_page:
+
+ page = READ_ONCE(c->page);
if (!page) {
/*
* if the node is not online or has no normal memory, just
@@ -2705,8 +2811,7 @@ redo:
goto redo;
} else {
stat(s, ALLOC_NODE_MISMATCH);
- deactivate_slab(s, page, c->freelist, c);
- goto new_slab;
+ goto deactivate_slab;
}
}
@@ -2715,12 +2820,15 @@ redo:
* PFMEMALLOC but right now, we are losing the pfmemalloc
* information when the page leaves the per-cpu allocator
*/
- if (unlikely(!pfmemalloc_match(page, gfpflags))) {
- deactivate_slab(s, page, c->freelist, c);
- goto new_slab;
+ if (unlikely(!pfmemalloc_match(page, gfpflags)))
+ goto deactivate_slab;
+
+ /* must check again c->page in case we got preempted and it changed */
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ if (unlikely(page != c->page)) {
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ goto reread_page;
}
-
- /* must check again c->freelist in case of cpu migration or IRQ */
freelist = c->freelist;
if (freelist)
goto load_freelist;
@@ -2729,6 +2837,7 @@ redo:
if (!freelist) {
c->page = NULL;
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}
@@ -2736,6 +2845,13 @@ redo:
stat(s, ALLOC_REFILL);
load_freelist:
+
+#ifdef CONFIG_PREEMPT_RT
+ lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock.lock));
+#else
+ lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
+#endif
+
/*
* freelist is pointing to the list of objects to be used.
* page is pointing to the page from which the objects are obtained.
@@ -2744,59 +2860,141 @@ load_freelist:
VM_BUG_ON(!c->page->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
return freelist;
+deactivate_slab:
+
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ if (page != c->page) {
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ goto reread_page;
+ }
+ freelist = c->freelist;
+ c->page = NULL;
+ c->freelist = NULL;
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ deactivate_slab(s, page, freelist);
+
new_slab:
if (slub_percpu_partial(c)) {
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ if (unlikely(c->page)) {
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ goto reread_page;
+ }
+ if (unlikely(!slub_percpu_partial(c))) {
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ /* we were preempted and the partial list became empty */
+ goto new_objects;
+ }
+
page = c->page = slub_percpu_partial(c);
slub_set_percpu_partial(c, page);
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, CPU_PARTIAL_ALLOC);
goto redo;
}
- freelist = new_slab_objects(s, gfpflags, node, &c);
+new_objects:
+
+ freelist = get_partial(s, gfpflags, node, &page);
+ if (freelist)
+ goto check_new_page;
- if (unlikely(!freelist)) {
+ slub_put_cpu_ptr(s->cpu_slab);
+ page = new_slab(s, gfpflags, node);
+ c = slub_get_cpu_ptr(s->cpu_slab);
+
+ if (unlikely(!page)) {
slab_out_of_memory(s, gfpflags, node);
return NULL;
}
- page = c->page;
- if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
- goto load_freelist;
+ /*
+ * No other reference to the page yet so we can
+ * muck around with it freely without cmpxchg
+ */
+ freelist = page->freelist;
+ page->freelist = NULL;
- /* Only entered in the debug case */
- if (kmem_cache_debug(s) &&
- !alloc_debug_processing(s, page, freelist, addr))
- goto new_slab; /* Slab failed checks. Next slab needed */
+ stat(s, ALLOC_SLAB);
+
+check_new_page:
+
+ if (kmem_cache_debug(s)) {
+ if (!alloc_debug_processing(s, page, freelist, addr)) {
+ /* Slab failed checks. Next slab needed */
+ goto new_slab;
+ } else {
+ /*
+ * For debug case, we don't load freelist so that all
+ * allocations go through alloc_debug_processing()
+ */
+ goto return_single;
+ }
+ }
+
+ if (unlikely(!pfmemalloc_match(page, gfpflags)))
+ /*
+ * For !pfmemalloc_match() case we don't load freelist so that
+ * we don't make further mismatched allocations easier.
+ */
+ goto return_single;
+
+retry_load_page:
+
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ if (unlikely(c->page)) {
+ void *flush_freelist = c->freelist;
+ struct page *flush_page = c->page;
+
+ c->page = NULL;
+ c->freelist = NULL;
+ c->tid = next_tid(c->tid);
+
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+
+ deactivate_slab(s, flush_page, flush_freelist);
- deactivate_slab(s, page, get_freepointer(s, freelist), c);
+ stat(s, CPUSLAB_FLUSH);
+
+ goto retry_load_page;
+ }
+ c->page = page;
+
+ goto load_freelist;
+
+return_single:
+
+ deactivate_slab(s, page, get_freepointer(s, freelist));
return freelist;
}
/*
- * Another one that disabled interrupt and compensates for possible
- * cpu changes by refetching the per cpu area pointer.
+ * A wrapper for ___slab_alloc() for contexts where preemption is not yet
+ * disabled. Compensates for possible cpu changes by refetching the per cpu area
+ * pointer.
*/
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void *p;
- unsigned long flags;
- local_irq_save(flags);
-#ifdef CONFIG_PREEMPTION
+#ifdef CONFIG_PREEMPT_COUNT
/*
* We may have been preempted and rescheduled on a different
- * cpu before disabling interrupts. Need to reload cpu area
+ * cpu before disabling preemption. Need to reload cpu area
* pointer.
*/
- c = this_cpu_ptr(s->cpu_slab);
+ c = slub_get_cpu_ptr(s->cpu_slab);
#endif
p = ___slab_alloc(s, gfpflags, node, addr, c);
- local_irq_restore(flags);
+#ifdef CONFIG_PREEMPT_COUNT
+ slub_put_cpu_ptr(s->cpu_slab);
+#endif
return p;
}
@@ -2847,15 +3045,14 @@ redo:
* reading from one cpu area. That does not matter as long
* as we end up on the original cpu again when doing the cmpxchg.
*
- * We should guarantee that tid and kmem_cache are retrieved on
- * the same cpu. It could be different if CONFIG_PREEMPTION so we need
- * to check if it is matched or not.
+ * We must guarantee that tid and kmem_cache_cpu are retrieved on the
+ * same cpu. We read first the kmem_cache_cpu pointer and use it to read
+ * the tid. If we are preempted and switched to another cpu between the
+ * two reads, it's OK as the two are still associated with the same cpu
+ * and cmpxchg later will validate the cpu.
*/
- do {
- tid = this_cpu_read(s->cpu_slab->tid);
- c = raw_cpu_ptr(s->cpu_slab);
- } while (IS_ENABLED(CONFIG_PREEMPTION) &&
- unlikely(tid != READ_ONCE(c->tid)));
+ c = raw_cpu_ptr(s->cpu_slab);
+ tid = READ_ONCE(c->tid);
/*
* Irqless object alloc/free algorithm used here depends on sequence
@@ -2876,7 +3073,15 @@ redo:
object = c->freelist;
page = c->page;
- if (unlikely(!object || !page || !node_match(page, node))) {
+ /*
+ * We cannot use the lockless fastpath on PREEMPT_RT because if a
+ * slowpath has taken the local_lock_irqsave(), it is not protected
+ * against a fast path operation in an irq handler. So we need to take
+ * the slow path which uses local_lock. It is still relatively fast if
+ * there is a suitable cpu freelist.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
+ unlikely(!object || !page || !node_match(page, node))) {
object = __slab_alloc(s, gfpflags, node, addr, c);
} else {
void *next_object = get_freepointer_safe(s, object);
@@ -3129,16 +3334,14 @@ redo:
* data is retrieved via this pointer. If we are on the same cpu
* during the cmpxchg then the free will succeed.
*/
- do {
- tid = this_cpu_read(s->cpu_slab->tid);
- c = raw_cpu_ptr(s->cpu_slab);
- } while (IS_ENABLED(CONFIG_PREEMPTION) &&
- unlikely(tid != READ_ONCE(c->tid)));
+ c = raw_cpu_ptr(s->cpu_slab);
+ tid = READ_ONCE(c->tid);
/* Same with comment on barrier() in slab_alloc_node() */
barrier();
if (likely(page == c->page)) {
+#ifndef CONFIG_PREEMPT_RT
void **freelist = READ_ONCE(c->freelist);
set_freepointer(s, tail_obj, freelist);
@@ -3151,6 +3354,32 @@ redo:
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
+#else /* CONFIG_PREEMPT_RT */
+ /*
+ * We cannot use the lockless fastpath on PREEMPT_RT because if
+ * a slowpath has taken the local_lock_irqsave(), it is not
+ * protected against a fast path operation in an irq handler. So
+ * we need to take the local_lock. We shouldn't simply defer to
+ * __slab_free() as that wouldn't use the cpu freelist at all.
+ */
+ unsigned long flags;
+ void **freelist;
+
+ local_lock_irqsave(&s->cpu_slab->lock, flags);
+ c = this_cpu_ptr(s->cpu_slab);
+ if (unlikely(page != c->page)) {
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ goto redo;
+ }
+ tid = c->tid;
+ freelist = c->freelist;
+
+ set_freepointer(s, tail_obj, freelist);
+ c->freelist = head;
+ c->tid = next_tid(tid);
+
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+#endif
stat(s, FREE_FASTPATH);
} else
__slab_free(s, page, head, tail_obj, cnt, addr);
@@ -3320,8 +3549,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
* IRQs, which protects against PREEMPT and interrupts
* handlers invoking normal fastpath.
*/
- local_irq_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ c = slub_get_cpu_ptr(s->cpu_slab);
+ local_lock_irq(&s->cpu_slab->lock);
for (i = 0; i < size; i++) {
void *object = kfence_alloc(s, s->object_size, flags);
@@ -3342,6 +3571,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
*/
c->tid = next_tid(c->tid);
+ local_unlock_irq(&s->cpu_slab->lock);
+
/*
* Invoking slow path likely have side-effect
* of re-populating per CPU c->freelist
@@ -3354,6 +3585,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
c = this_cpu_ptr(s->cpu_slab);
maybe_wipe_obj_freeptr(s, p[i]);
+ local_lock_irq(&s->cpu_slab->lock);
+
continue; /* goto for-loop */
}
c->freelist = get_freepointer(s, object);
@@ -3361,7 +3594,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
maybe_wipe_obj_freeptr(s, p[i]);
}
c->tid = next_tid(c->tid);
- local_irq_enable();
+ local_unlock_irq(&s->cpu_slab->lock);
+ slub_put_cpu_ptr(s->cpu_slab);
/*
* memcg and kmem_cache debug support and memory initialization.
@@ -3371,7 +3605,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
slab_want_init_on_alloc(flags, s));
return i;
error:
- local_irq_enable();
+ local_unlock_irq(&s->cpu_slab->lock);
slab_post_alloc_hook(s, objcg, flags, i, p, false);
__kmem_cache_free_bulk(s, i, p);
return 0;
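kmem_cache_alloc_bulk() now holds the local lock only for the per-object fast steps and drops it around the ___slab_alloc() slow path, retaking it afterwards. The general shape, as a stand-alone userspace sketch (fast_grab() and slow_alloc() are invented stand-ins for the percpu freelist hit and the slow path):

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

extern bool fast_grab(void **out);	/* assumed helper: lockless-style hit */
extern void *slow_alloc(void);		/* assumed helper: may block */

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;

static size_t alloc_bulk(void **objs, size_t n)
{
	size_t i;

	pthread_mutex_lock(&lk);
	for (i = 0; i < n; i++) {
		if (fast_grab(&objs[i]))
			continue;	/* fast path, lock stays held */

		pthread_mutex_unlock(&lk);
		objs[i] = slow_alloc();	/* may sleep or take other locks */
		if (!objs[i])
			return i;	/* lock already dropped */
		pthread_mutex_lock(&lk);
	}
	pthread_mutex_unlock(&lk);

	return n;
}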
@@ -3887,9 +4121,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
void *addr = page_address(page);
unsigned long *map;
void *p;
+ unsigned long flags;
slab_err(s, page, text, s->name);
- slab_lock(page);
+ slab_lock(page, &flags);
map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
@@ -3900,7 +4135,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
}
}
put_map(map);
- slab_unlock(page);
+ slab_unlock(page, &flags);
#endif
}
@@ -4611,33 +4846,33 @@ static int count_total(struct page *page)
#endif
#ifdef CONFIG_SLUB_DEBUG
-static void validate_slab(struct kmem_cache *s, struct page *page)
+static void validate_slab(struct kmem_cache *s, struct page *page,
+ unsigned long *obj_map)
{
void *p;
void *addr = page_address(page);
- unsigned long *map;
+ unsigned long flags;
- slab_lock(page);
+ slab_lock(page, &flags);
if (!check_slab(s, page) || !on_freelist(s, page, NULL))
goto unlock;
/* Now we know that a valid freelist exists */
- map = get_map(s, page);
+ __fill_map(obj_map, s, page);
for_each_object(p, s, addr, page->objects) {
- u8 val = test_bit(__obj_to_index(s, addr, p), map) ?
+ u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
if (!check_object(s, page, p, val))
break;
}
- put_map(map);
unlock:
- slab_unlock(page);
+ slab_unlock(page, &flags);
}
static int validate_slab_node(struct kmem_cache *s,
- struct kmem_cache_node *n)
+ struct kmem_cache_node *n, unsigned long *obj_map)
{
unsigned long count = 0;
struct page *page;
@@ -4646,7 +4881,7 @@ static int validate_slab_node(struct kmem_cache *s,
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list) {
- validate_slab(s, page);
+ validate_slab(s, page, obj_map);
count++;
}
if (count != n->nr_partial)
@@ -4657,7 +4892,7 @@ static int validate_slab_node(struct kmem_cache *s,
goto out;
list_for_each_entry(page, &n->full, slab_list) {
- validate_slab(s, page);
+ validate_slab(s, page, obj_map);
count++;
}
if (count != atomic_long_read(&n->nr_slabs))
@@ -4674,10 +4909,17 @@ static long validate_slab_cache(struct kmem_cache *s)
int node;
unsigned long count = 0;
struct kmem_cache_node *n;
+ unsigned long *obj_map;
+
+ obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
+ if (!obj_map)
+ return -ENOMEM;
flush_all(s);
for_each_kmem_cache_node(s, node, n)
- count += validate_slab_node(s, n);
+ count += validate_slab_node(s, n, obj_map);
+
+ bitmap_free(obj_map);
return count;
}
@@ -4808,17 +5050,17 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
}
static void process_slab(struct loc_track *t, struct kmem_cache *s,
- struct page *page, enum track_item alloc)
+ struct page *page, enum track_item alloc,
+ unsigned long *obj_map)
{
void *addr = page_address(page);
void *p;
- unsigned long *map;
- map = get_map(s, page);
+ __fill_map(obj_map, s, page);
+
for_each_object(p, s, addr, page->objects)
- if (!test_bit(__obj_to_index(s, addr, p), map))
+ if (!test_bit(__obj_to_index(s, addr, p), obj_map))
add_location(t, s, get_track(s, p, alloc));
- put_map(map);
}
static int list_locations(struct kmem_cache *s, char *buf,
@@ -4829,13 +5071,17 @@ static int list_locations(struct kmem_cache *s, char *buf,
struct loc_track t = { 0, 0, NULL };
int node;
struct kmem_cache_node *n;
+ unsigned long *obj_map;
+
+ obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
+ if (!obj_map)
+ return sysfs_emit(buf, "Out of memory\n");
if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
GFP_KERNEL)) {
+ bitmap_free(obj_map);
return sysfs_emit(buf, "Out of memory\n");
}
- /* Push back cpu slabs */
- flush_all(s);
for_each_kmem_cache_node(s, node, n) {
unsigned long flags;
@@ -4846,12 +5092,14 @@ static int list_locations(struct kmem_cache *s, char *buf,
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list)
- process_slab(&t, s, page, alloc);
+ process_slab(&t, s, page, alloc, obj_map);
list_for_each_entry(page, &n->full, slab_list)
- process_slab(&t, s, page, alloc);
+ process_slab(&t, s, page, alloc, obj_map);
spin_unlock_irqrestore(&n->list_lock, flags);
}
+ bitmap_free(obj_map);
+
for (i = 0; i < t.count; i++) {
struct location *l = &t.loc[i];
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d0a7d89be091..62f73c4ada41 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1829,7 +1829,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
struct vmap_block *vb;
struct vmap_area *va;
unsigned long vb_idx;
- int node, err;
+ int node, err, cpu;
void *vaddr;
node = numa_node_id();
@@ -1866,11 +1866,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_PTR(err);
}
- vbq = &get_cpu_var(vmap_block_queue);
+ cpu = get_cpu_light();
+ vbq = this_cpu_ptr(&vmap_block_queue);
spin_lock(&vbq->lock);
list_add_tail_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
- put_cpu_var(vmap_block_queue);
+ put_cpu_light();
return vaddr;
}
@@ -1935,6 +1936,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
struct vmap_block *vb;
void *vaddr = NULL;
unsigned int order;
+ int cpu;
BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -1949,7 +1951,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
order = get_order(size);
rcu_read_lock();
- vbq = &get_cpu_var(vmap_block_queue);
+ cpu = get_cpu_light();
+ vbq = this_cpu_ptr(&vmap_block_queue);
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
unsigned long pages_off;
@@ -1972,7 +1975,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
break;
}
- put_cpu_var(vmap_block_queue);
+ put_cpu_light();
rcu_read_unlock();
/* Allocate new block if nothing was found */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index cccee36b289c..d06332c221b1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -31,8 +31,6 @@
#include "internal.h"
-#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
-
#ifdef CONFIG_NUMA
int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
@@ -41,11 +39,12 @@ static void zero_zone_numa_counters(struct zone *zone)
{
int item, cpu;
- for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
- atomic_long_set(&zone->vm_numa_stat[item], 0);
- for_each_online_cpu(cpu)
- per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
+ atomic_long_set(&zone->vm_numa_event[item], 0);
+ for_each_online_cpu(cpu) {
+ per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
= 0;
+ }
}
}
@@ -63,8 +62,8 @@ static void zero_global_numa_counters(void)
{
int item;
- for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
- atomic_long_set(&vm_numa_stat[item], 0);
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+ atomic_long_set(&vm_numa_event[item], 0);
}
static void invalid_numa_statistics(void)
@@ -161,10 +160,9 @@ void vm_events_fold_cpu(int cpu)
* vm_stat contains the global counters
*/
atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
-atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
-EXPORT_SYMBOL(vm_numa_stat);
EXPORT_SYMBOL(vm_node_stat);
#ifdef CONFIG_SMP
@@ -266,7 +264,7 @@ void refresh_zone_stat_thresholds(void)
for_each_online_cpu(cpu) {
int pgdat_threshold;
- per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
= threshold;
/* Base nodestat threshold on the largest populated zone. */
@@ -303,7 +301,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
threshold = (*calculate_pressure)(zone);
for_each_online_cpu(cpu)
- per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
= threshold;
}
}
@@ -316,11 +314,12 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
long delta)
{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
s8 __percpu *p = pcp->vm_stat_diff + item;
long x;
long t;
+ preempt_disable_rt();
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -330,6 +329,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
x = 0;
}
__this_cpu_write(*p, x);
+ preempt_enable_rt();
}
EXPORT_SYMBOL(__mod_zone_page_state);
@@ -352,6 +352,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
delta >>= PAGE_SHIFT;
}
+ preempt_disable_rt();
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -361,6 +362,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
x = 0;
}
__this_cpu_write(*p, x);
+ preempt_enable_rt();
}
EXPORT_SYMBOL(__mod_node_page_state);
@@ -389,10 +391,11 @@ EXPORT_SYMBOL(__mod_node_page_state);
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
+ preempt_disable_rt();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -401,6 +404,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
zone_page_state_add(v + overstep, zone, item);
__this_cpu_write(*p, -overstep);
}
+ preempt_enable_rt();
}
void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -411,6 +415,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+ preempt_disable_rt();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -419,6 +424,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
node_page_state_add(v + overstep, pgdat, item);
__this_cpu_write(*p, -overstep);
}
+ preempt_enable_rt();
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -435,10 +441,11 @@ EXPORT_SYMBOL(__inc_node_page_state);
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
+ preempt_disable_rt();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -447,6 +454,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
zone_page_state_add(v - overstep, zone, item);
__this_cpu_write(*p, overstep);
}
+ preempt_enable_rt();
}
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -457,6 +465,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+ preempt_disable_rt();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -465,6 +474,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
node_page_state_add(v - overstep, pgdat, item);
__this_cpu_write(*p, overstep);
}
+ preempt_enable_rt();
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -495,7 +505,7 @@ EXPORT_SYMBOL(__dec_node_page_state);
static inline void mod_zone_state(struct zone *zone,
enum zone_stat_item item, long delta, int overstep_mode)
{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
s8 __percpu *p = pcp->vm_stat_diff + item;
long o, n, t, z;
@@ -706,8 +716,7 @@ EXPORT_SYMBOL(dec_node_page_state);
* Fold a differential into the global counters.
* Returns the number of counters updated.
*/
-#ifdef CONFIG_NUMA
-static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
+static int fold_diff(int *zone_diff, int *node_diff)
{
int i;
int changes = 0;
@@ -718,12 +727,6 @@ static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
changes++;
}
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
- if (numa_diff[i]) {
- atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
- changes++;
- }
-
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
if (node_diff[i]) {
atomic_long_add(node_diff[i], &vm_node_stat[i]);
@@ -731,26 +734,34 @@ static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
}
return changes;
}
-#else
-static int fold_diff(int *zone_diff, int *node_diff)
+
+#ifdef CONFIG_NUMA
+static void fold_vm_zone_numa_events(struct zone *zone)
{
- int i;
- int changes = 0;
+ unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
+ int cpu;
+ enum numa_stat_item item;
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (zone_diff[i]) {
- atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
- changes++;
- }
+ for_each_online_cpu(cpu) {
+ struct per_cpu_zonestat *pzstats;
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- if (node_diff[i]) {
- atomic_long_add(node_diff[i], &vm_node_stat[i]);
- changes++;
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+ zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
}
- return changes;
+
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+ zone_numa_event_add(zone_numa_events[item], zone, item);
+}
+
+void fold_vm_numa_events(void)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone)
+ fold_vm_zone_numa_events(zone);
}
-#endif /* CONFIG_NUMA */
+#endif
/*
* Update the zone counters for the current cpu.
@@ -774,41 +785,30 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
struct zone *zone;
int i;
int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
-#ifdef CONFIG_NUMA
- int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
-#endif
int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
int changes = 0;
for_each_populated_zone(zone) {
- struct per_cpu_pageset __percpu *p = zone->pageset;
+ struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+#ifdef CONFIG_NUMA
+ struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
+#endif
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
int v;
- v = this_cpu_xchg(p->vm_stat_diff[i], 0);
+ v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
if (v) {
atomic_long_add(v, &zone->vm_stat[i]);
global_zone_diff[i] += v;
#ifdef CONFIG_NUMA
/* 3 seconds idle till flush */
- __this_cpu_write(p->expire, 3);
+ __this_cpu_write(pcp->expire, 3);
#endif
}
}
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
- int v;
-
- v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
- if (v) {
-
- atomic_long_add(v, &zone->vm_numa_stat[i]);
- global_numa_diff[i] += v;
- __this_cpu_write(p->expire, 3);
- }
- }
if (do_pagesets) {
cond_resched();
@@ -819,23 +819,23 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
* Check if there are pages remaining in this pageset
* if not then there is nothing to expire.
*/
- if (!__this_cpu_read(p->expire) ||
- !__this_cpu_read(p->pcp.count))
+ if (!__this_cpu_read(pcp->expire) ||
+ !__this_cpu_read(pcp->count))
continue;
/*
* We never drain zones local to this processor.
*/
if (zone_to_nid(zone) == numa_node_id()) {
- __this_cpu_write(p->expire, 0);
+ __this_cpu_write(pcp->expire, 0);
continue;
}
- if (__this_cpu_dec_return(p->expire))
+ if (__this_cpu_dec_return(pcp->expire))
continue;
- if (__this_cpu_read(p->pcp.count)) {
- drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+ if (__this_cpu_read(pcp->count)) {
+ drain_zone_pages(zone, this_cpu_ptr(pcp));
changes++;
}
}
@@ -856,12 +856,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
}
}
-#ifdef CONFIG_NUMA
- changes += fold_diff(global_zone_diff, global_numa_diff,
- global_node_diff);
-#else
changes += fold_diff(global_zone_diff, global_node_diff);
-#endif
return changes;
}
@@ -876,36 +871,33 @@ void cpu_vm_stats_fold(int cpu)
struct zone *zone;
int i;
int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
-#ifdef CONFIG_NUMA
- int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
-#endif
int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
for_each_populated_zone(zone) {
- struct per_cpu_pageset *p;
+ struct per_cpu_zonestat *pzstats;
- p = per_cpu_ptr(zone->pageset, cpu);
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (p->vm_stat_diff[i]) {
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ if (pzstats->vm_stat_diff[i]) {
int v;
- v = p->vm_stat_diff[i];
- p->vm_stat_diff[i] = 0;
+ v = pzstats->vm_stat_diff[i];
+ pzstats->vm_stat_diff[i] = 0;
atomic_long_add(v, &zone->vm_stat[i]);
global_zone_diff[i] += v;
}
-
+ }
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
- if (p->vm_numa_stat_diff[i]) {
- int v;
+ for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
+ if (pzstats->vm_numa_event[i]) {
+ unsigned long v;
- v = p->vm_numa_stat_diff[i];
- p->vm_numa_stat_diff[i] = 0;
- atomic_long_add(v, &zone->vm_numa_stat[i]);
- global_numa_diff[i] += v;
+ v = pzstats->vm_numa_event[i];
+ pzstats->vm_numa_event[i] = 0;
+ zone_numa_event_add(v, zone, i);
}
+ }
#endif
}
@@ -925,58 +917,39 @@ void cpu_vm_stats_fold(int cpu)
}
}
-#ifdef CONFIG_NUMA
- fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
-#else
fold_diff(global_zone_diff, global_node_diff);
-#endif
}
/*
* this is only called if !populated_zone(zone), which implies no other users of
* pset->vm_stat_diff[] exist.
*/
-void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
+void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
{
+ unsigned long v;
int i;
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (pset->vm_stat_diff[i]) {
- int v = pset->vm_stat_diff[i];
- pset->vm_stat_diff[i] = 0;
- atomic_long_add(v, &zone->vm_stat[i]);
- atomic_long_add(v, &vm_zone_stat[i]);
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ if (pzstats->vm_stat_diff[i]) {
+ v = pzstats->vm_stat_diff[i];
+ pzstats->vm_stat_diff[i] = 0;
+ zone_page_state_add(v, zone, i);
}
+ }
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
- if (pset->vm_numa_stat_diff[i]) {
- int v = pset->vm_numa_stat_diff[i];
-
- pset->vm_numa_stat_diff[i] = 0;
- atomic_long_add(v, &zone->vm_numa_stat[i]);
- atomic_long_add(v, &vm_numa_stat[i]);
+ for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
+ if (pzstats->vm_numa_event[i]) {
+ v = pzstats->vm_numa_event[i];
+ pzstats->vm_numa_event[i] = 0;
+ zone_numa_event_add(v, zone, i);
}
+ }
#endif
}
#endif
#ifdef CONFIG_NUMA
-void __inc_numa_state(struct zone *zone,
- enum numa_stat_item item)
-{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
- u16 __percpu *p = pcp->vm_numa_stat_diff + item;
- u16 v;
-
- v = __this_cpu_inc_return(*p);
-
- if (unlikely(v > NUMA_STATS_THRESHOLD)) {
- zone_numa_state_add(v, zone, item);
- __this_cpu_write(*p, 0);
- }
-}
-
/*
* Determine the per node value of a stat item. This function
* is called frequently in a NUMA machine, so try to be as
@@ -995,19 +968,16 @@ unsigned long sum_zone_node_page_state(int node,
return count;
}
-/*
- * Determine the per node value of a numa stat item. To avoid deviation,
- * the per cpu stat number in vm_numa_stat_diff[] is also included.
- */
-unsigned long sum_zone_numa_state(int node,
+/* Determine the per node value of a numa stat item. */
+unsigned long sum_zone_numa_event_state(int node,
enum numa_stat_item item)
{
struct zone *zones = NODE_DATA(node)->node_zones;
- int i;
unsigned long count = 0;
+ int i;
for (i = 0; i < MAX_NR_ZONES; i++)
- count += zone_numa_state_snapshot(zones + i, item);
+ count += zone_numa_event_state(zones + i, item);
return count;
}
@@ -1686,28 +1656,30 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
zone_page_state(zone, i));
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+ for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
- zone_numa_state_snapshot(zone, i));
+ zone_numa_event_state(zone, i));
#endif
seq_printf(m, "\n pagesets");
for_each_online_cpu(i) {
- struct per_cpu_pageset *pageset;
+ struct per_cpu_pages *pcp;
+ struct per_cpu_zonestat __maybe_unused *pzstats;
- pageset = per_cpu_ptr(zone->pageset, i);
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
seq_printf(m,
"\n cpu: %i"
"\n count: %i"
"\n high: %i"
"\n batch: %i",
i,
- pageset->pcp.count,
- pageset->pcp.high,
- pageset->pcp.batch);
+ pcp->count,
+ pcp->high,
+ pcp->batch);
#ifdef CONFIG_SMP
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
seq_printf(m, "\n vm stats threshold: %d",
- pageset->stat_threshold);
+ pzstats->stat_threshold);
#endif
}
seq_printf(m,
@@ -1740,7 +1712,7 @@ static const struct seq_operations zoneinfo_op = {
};
#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
- NR_VM_NUMA_STAT_ITEMS + \
+ NR_VM_NUMA_EVENT_ITEMS + \
NR_VM_NODE_STAT_ITEMS + \
NR_VM_WRITEBACK_STAT_ITEMS + \
(IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
@@ -1755,6 +1727,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
return NULL;
BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
+ fold_vm_numa_events();
v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
m->private = v;
if (!v)
@@ -1764,9 +1737,9 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
v += NR_VM_ZONE_STAT_ITEMS;
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
- v[i] = global_numa_state(i);
- v += NR_VM_NUMA_STAT_ITEMS;
+ for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
+ v[i] = global_numa_event_state(i);
+ v += NR_VM_NUMA_EVENT_ITEMS;
#endif
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
@@ -1927,19 +1900,16 @@ static bool need_update(int cpu)
struct zone *zone;
for_each_populated_zone(zone) {
- struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
+ struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
struct per_cpu_nodestat *n;
+
/*
* The fast way of checking if there are any vmstat diffs.
*/
- if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
- sizeof(p->vm_stat_diff[0])))
- return true;
-#ifdef CONFIG_NUMA
- if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
- sizeof(p->vm_numa_stat_diff[0])))
+ if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
+ sizeof(pzstats->vm_stat_diff[0])))
return true;
-#endif
+
if (last_pgdat == zone->zone_pgdat)
continue;
last_pgdat = zone->zone_pgdat;
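The mm/vmstat.c hunks above split the old per_cpu_pageset into per_cpu_pages (the pcp free lists) and per_cpu_zonestat (the counters), wrap the counter/overstep updates in preempt_disable_rt()/preempt_enable_rt(), which only does anything on RT where the callers' interrupt-disabling assumptions no longer hold, and turn the NUMA counters into plain events that are folded on demand. The folding step is the same everywhere: steal the per-CPU delta atomically and add it to a global counter. A minimal sketch of that pattern, with hypothetical names (pcpu_delta, global_count):

#include <linux/percpu.h>
#include <linux/atomic.h>
#include <linux/cpumask.h>

static DEFINE_PER_CPU(long, pcpu_delta);	/* hypothetical per-CPU diff */
static atomic_long_t global_count;		/* hypothetical global counter */

/* Fast path on the local CPU: no atomics, no shared cache line. */
static void count_event(void)
{
	this_cpu_inc(pcpu_delta);
}

/* Fold every CPU's delta into the global counter, the way
 * fold_vm_zone_numa_events() drains vm_numa_event[] above. */
static void fold_deltas(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		long v = xchg(per_cpu_ptr(&pcpu_delta, cpu), 0);

		if (v)
			atomic_long_add(v, &global_count);
	}
}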
diff --git a/mm/workingset.c b/mm/workingset.c
index b7cdeca5a76d..47bbc0a7b153 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -430,6 +430,8 @@ static struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
+ struct address_space *mapping;
+
/*
* Track non-empty nodes that contain only shadow entries;
* unlink those that contain pages or are being freed.
@@ -438,7 +440,8 @@ void workingset_update_node(struct xa_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
+ mapping = container_of(node->array, struct address_space, i_pages);
+ lockdep_assert_held(&mapping->i_pages.xa_lock);
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
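workingset_update_node() used to assert irqs_disabled() as a proxy for holding the i_pages lock; the hunk above derives the address_space from the node and asserts the xa_lock itself, which is the condition that actually protects node->private_list and the one that still holds on RT, where taking the lock does not disable interrupts. The same lockdep idiom in a minimal sketch (structure and helper names hypothetical):

#include <linux/spinlock.h>
#include <linux/lockdep.h>
#include <linux/list.h>

struct tracked_cache {			/* hypothetical */
	spinlock_t lock;
	struct list_head items;
};

/* Callers must hold cache->lock; lockdep verifies this on
 * CONFIG_PROVE_LOCKING kernels and compiles to nothing otherwise. */
static void tracked_cache_add(struct tracked_cache *cache,
			      struct list_head *item)
{
	lockdep_assert_held(&cache->lock);

	if (list_empty(item))
		list_add(item, &cache->items);
}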
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 19b563bc6c48..4ebdb55841f0 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -57,6 +57,7 @@
#include <linux/wait.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
+#include <linux/local_lock.h>
#define ZSPAGE_MAGIC 0x58
@@ -77,6 +78,20 @@
#define ZS_HANDLE_SIZE (sizeof(unsigned long))
+#ifdef CONFIG_PREEMPT_RT
+
+struct zsmalloc_handle {
+ unsigned long addr;
+ struct mutex lock;
+};
+
+#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
+
+#else
+
+#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
+#endif
+
/*
* Object location (<PFN>, <obj_idx>) is encoded as
* a single (unsigned long) handle value.
@@ -293,6 +308,7 @@ struct zspage {
};
struct mapping_area {
+ local_lock_t lock;
char *vm_buf; /* copy buffer for objects that span pages */
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
@@ -322,7 +338,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
static int create_cache(struct zs_pool *pool)
{
- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
0, 0, NULL);
if (!pool->handle_cachep)
return 1;
@@ -346,9 +362,26 @@ static void destroy_cache(struct zs_pool *pool)
static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
{
- return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ void *p;
+
+ p = kmem_cache_alloc(pool->handle_cachep,
+ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+#ifdef CONFIG_PREEMPT_RT
+ if (p) {
+ struct zsmalloc_handle *zh = p;
+
+ mutex_init(&zh->lock);
+ }
+#endif
+ return (unsigned long)p;
+}
+
+#ifdef CONFIG_PREEMPT_RT
+static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
+{
+ return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
}
+#endif
static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
{
@@ -368,12 +401,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
static void record_obj(unsigned long handle, unsigned long obj)
{
+#ifdef CONFIG_PREEMPT_RT
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ WRITE_ONCE(zh->addr, obj);
+#else
/*
* lsb of @obj represents handle lock while other bits
* represent object value the handle is pointing so
* updating shouldn't do store tearing.
*/
WRITE_ONCE(*(unsigned long *)handle, obj);
+#endif
}
/* zpool driver */
@@ -455,7 +494,10 @@ MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
-static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
+static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = {
+ /* XXX remove this and use a spin_lock_t in pin_tag() */
+ .lock = INIT_LOCAL_LOCK(lock),
+};
static bool is_zspage_isolated(struct zspage *zspage)
{
@@ -862,7 +904,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
static unsigned long handle_to_obj(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return zh->addr;
+#else
return *(unsigned long *)handle;
+#endif
}
static unsigned long obj_to_head(struct page *page, void *obj)
@@ -876,22 +924,46 @@ static unsigned long obj_to_head(struct page *page, void *obj)
static inline int testpin_tag(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return mutex_is_locked(&zh->lock);
+#else
return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
}
static inline int trypin_tag(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return mutex_trylock(&zh->lock);
+#else
return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
}
static void pin_tag(unsigned long handle) __acquires(bitlock)
{
+#ifdef CONFIG_PREEMPT_RT
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return mutex_lock(&zh->lock);
+#else
bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
}
static void unpin_tag(unsigned long handle) __releases(bitlock)
{
+#ifdef CONFIG_PREEMPT_RT
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return mutex_unlock(&zh->lock);
+#else
bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
}
static void reset_page(struct page *page)
@@ -1274,7 +1346,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
class = pool->size_class[class_idx];
off = (class->size * obj_idx) & ~PAGE_MASK;
- area = &get_cpu_var(zs_map_area);
+ local_lock(&zs_map_area.lock);
+ area = this_cpu_ptr(&zs_map_area);
area->vm_mm = mm;
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
@@ -1328,7 +1401,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
__zs_unmap_object(area, pages, off, class->size);
}
- put_cpu_var(zs_map_area);
+ local_unlock(&zs_map_area.lock);
migrate_read_unlock(zspage);
unpin_tag(handle);
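Two RT conversions run through the zsmalloc hunks: on PREEMPT_RT each handle allocation carries a struct with an embedded mutex so pin_tag() sleeps instead of spinning on HANDLE_PIN_BIT, and zs_map_object() takes a local_lock on the per-CPU mapping area instead of get_cpu_var(), which only disabled preemption. A minimal sketch of the local_lock pattern, using a hypothetical per-CPU scratch buffer:

#include <linux/local_lock.h>
#include <linux/percpu.h>
#include <linux/string.h>

struct scratch_area {			/* hypothetical */
	local_lock_t lock;
	char buf[128];
};

static DEFINE_PER_CPU(struct scratch_area, scratch_areas) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void scratch_fill(char c)
{
	struct scratch_area *area;

	/* !RT: disables preemption; RT: takes a per-CPU spinlock, so the
	 * section stays preemptible yet serialized on this CPU. */
	local_lock(&scratch_areas.lock);
	area = this_cpu_ptr(&scratch_areas);
	memset(area->buf, c, sizeof(area->buf));
	local_unlock(&scratch_areas.lock);
}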
diff --git a/net/Kconfig b/net/Kconfig
index c7392c449b25..d254b5143761 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -294,7 +294,7 @@ config CGROUP_NET_CLASSID
config NET_RX_BUSY_POLL
bool
- default y
+ default y if !PREEMPT_RT
config BQL
bool
diff --git a/net/core/dev.c b/net/core/dev.c
index 04c4e236952f..acc0d4a801b1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -223,14 +223,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
- spin_lock(&sd->input_pkt_queue.lock);
+ raw_spin_lock(&sd->input_pkt_queue.raw_lock);
#endif
}
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
- spin_unlock(&sd->input_pkt_queue.lock);
+ raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
#endif
}
@@ -3121,6 +3121,7 @@ static void __netif_reschedule(struct Qdisc *q)
sd->output_queue_tailp = &q->next_sched;
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
+ preempt_check_resched_rt();
}
void __netif_schedule(struct Qdisc *q)
@@ -3183,6 +3184,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
__this_cpu_write(softnet_data.completion_queue, skb);
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
+ preempt_check_resched_rt();
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
@@ -3867,7 +3869,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
* This permits qdisc->running owner to get the lock more
* often and dequeue packets faster.
*/
+#ifdef CONFIG_PREEMPT_RT
+ contended = true;
+#else
contended = qdisc_is_running(q);
+#endif
if (unlikely(contended))
spin_lock(&q->busylock);
@@ -4688,6 +4694,7 @@ drop:
rps_unlock(sd);
local_irq_restore(flags);
+ preempt_check_resched_rt();
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
@@ -4905,7 +4912,7 @@ static int netif_rx_internal(struct sk_buff *skb)
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
- preempt_disable();
+ migrate_disable();
rcu_read_lock();
cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -4915,14 +4922,14 @@ static int netif_rx_internal(struct sk_buff *skb)
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
- preempt_enable();
+ migrate_enable();
} else
#endif
{
unsigned int qtail;
- ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
- put_cpu();
+ ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
+ put_cpu_light();
}
return ret;
}
@@ -4961,11 +4968,9 @@ int netif_rx_ni(struct sk_buff *skb)
trace_netif_rx_ni_entry(skb);
- preempt_disable();
+ local_bh_disable();
err = netif_rx_internal(skb);
- if (local_softirq_pending())
- do_softirq();
- preempt_enable();
+ local_bh_enable();
trace_netif_rx_ni_exit(err);
return err;
@@ -6413,12 +6418,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
sd->rps_ipi_list = NULL;
local_irq_enable();
+ preempt_check_resched_rt();
/* Send pending IPI's to kick RPS processing on remote cpus. */
net_rps_send_ipi(remsd);
} else
#endif
local_irq_enable();
+ preempt_check_resched_rt();
}
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
@@ -6496,6 +6503,7 @@ void __napi_schedule(struct napi_struct *n)
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
+ preempt_check_resched_rt();
}
EXPORT_SYMBOL(__napi_schedule);
@@ -11303,6 +11311,7 @@ static int dev_cpu_dead(unsigned int oldcpu)
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
+ preempt_check_resched_rt();
#ifdef CONFIG_RPS
remsd = oldsd->rps_ipi_list;
@@ -11316,7 +11325,7 @@ static int dev_cpu_dead(unsigned int oldcpu)
netif_rx_ni(skb);
input_queue_head_incr(oldsd);
}
- while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
+ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
netif_rx_ni(skb);
input_queue_head_incr(oldsd);
}
@@ -11632,7 +11641,7 @@ static int __init net_dev_init(void)
INIT_WORK(flush, flush_backlog);
- skb_queue_head_init(&sd->input_pkt_queue);
+ skb_queue_head_init_raw(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
skb_queue_head_init(&sd->xfrm_backlog);
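The net/core/dev.c changes follow a few recurring RT patterns: raise_softirq_irqoff() under local_irq_save() is followed by preempt_check_resched_rt() so a woken softirq thread can run once interrupts are back on, the backlog queue lock becomes a raw spinlock (skb_queue_head_init_raw()), preempt_disable()/get_cpu() sections that only need CPU stability become migrate_disable()/get_cpu_light(), and netif_rx_ni() drops its hand-rolled preempt_disable() plus do_softirq() in favour of local_bh_disable()/local_bh_enable(), which already guarantees that softirqs raised inside the section run on the way out. A hedged sketch of that last shape:

#include <linux/bottom_half.h>
#include <linux/interrupt.h>

/* Hypothetical analogue of netif_rx_ni(): queue work for softirq context
 * and make sure that softirq gets to run before we return. */
static void inject_and_flush(void)
{
	local_bh_disable();
	/* Stand-in for enqueue_to_backlog(); running the softirq is
	 * deferred here because bottom halves are disabled. */
	raise_softirq(NET_TX_SOFTIRQ);
	local_bh_enable();	/* pending softirqs are processed here */
}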
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 8e582e29a41e..e51f4854d8b2 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -42,7 +42,7 @@
struct net_rate_estimator {
struct gnet_stats_basic_packed *bstats;
spinlock_t *stats_lock;
- seqcount_t *running;
+ net_seqlock_t *running;
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
u8 ewma_log;
u8 intvl_log; /* period : (250ms << intvl_log) */
@@ -125,7 +125,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
spinlock_t *lock,
- seqcount_t *running,
+ net_seqlock_t *running,
struct nlattr *opt)
{
struct gnet_estimator *parm = nla_data(opt);
@@ -226,7 +226,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
spinlock_t *lock,
- seqcount_t *running, struct nlattr *opt)
+ net_seqlock_t *running, struct nlattr *opt)
{
return gen_new_estimator(bstats, cpu_bstats, rate_est,
lock, running, opt);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index e491b083b348..ef432cea2e10 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -137,7 +137,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
}
void
-__gnet_stats_copy_basic(const seqcount_t *running,
+__gnet_stats_copy_basic(net_seqlock_t *running,
struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
@@ -150,15 +150,15 @@ __gnet_stats_copy_basic(const seqcount_t *running,
}
do {
if (running)
- seq = read_seqcount_begin(running);
+ seq = net_seq_begin(running);
bstats->bytes = b->bytes;
bstats->packets = b->packets;
- } while (running && read_seqcount_retry(running, seq));
+ } while (running && net_seq_retry(running, seq));
}
EXPORT_SYMBOL(__gnet_stats_copy_basic);
static int
-___gnet_stats_copy_basic(const seqcount_t *running,
+___gnet_stats_copy_basic(net_seqlock_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b,
@@ -204,7 +204,7 @@ ___gnet_stats_copy_basic(const seqcount_t *running,
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_basic(const seqcount_t *running,
+gnet_stats_copy_basic(net_seqlock_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
@@ -228,7 +228,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_basic_hw(const seqcount_t *running,
+gnet_stats_copy_basic_hw(net_seqlock_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
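The gen_estimator/gen_stats hunks retype the qdisc ->running argument from seqcount_t to net_seqlock_t, which the series makes a full seqlock_t on PREEMPT_RT (so the writer side has a real lock to sleep on) and leaves as a seqcount_t otherwise; net_seq_begin()/net_seq_retry() wrap the reader side. The underlying seqlock retry loop that those wrappers fall back to looks like this minimal sketch (the stats structure is hypothetical):

#include <linux/seqlock.h>
#include <linux/types.h>

struct byte_stats {			/* hypothetical */
	seqlock_t lock;
	u64 bytes;
	u64 packets;
};

static struct byte_stats stats = {
	.lock = __SEQLOCK_UNLOCKED(stats.lock),
};

static void byte_stats_update(u64 len)
{
	write_seqlock(&stats.lock);	/* serializes writers, bumps the sequence */
	stats.bytes += len;
	stats.packets++;
	write_sequnlock(&stats.lock);
}

static void byte_stats_read(u64 *bytes, u64 *packets)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&stats.lock);
		*bytes = stats.bytes;
		*packets = stats.packets;
	} while (read_seqretry(&stats.lock, seq));
}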
diff --git a/net/core/sock.c b/net/core/sock.c
index 37d732fe3fcf..f67ca9b54875 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3073,12 +3073,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
if (sk->sk_lock.owned)
__lock_sock(sk);
sk->sk_lock.owned = 1;
- spin_unlock(&sk->sk_lock.slock);
+ spin_unlock_bh(&sk->sk_lock.slock);
/*
* The sk_lock has mutex_lock() semantics here:
*/
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
- local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);
@@ -3127,13 +3126,12 @@ bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
__lock_sock(sk);
sk->sk_lock.owned = 1;
- spin_unlock(&sk->sk_lock.slock);
+ spin_unlock_bh(&sk->sk_lock.slock);
/*
* The sk_lock has mutex_lock() semantics here:
*/
mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
__acquire(&sk->sk_lock.slock);
- local_bh_enable();
return true;
}
EXPORT_SYMBOL(lock_sock_fast);
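In lock_sock_nested() and lock_sock_fast() the split spin_unlock(&slock) followed later by local_bh_enable() becomes a single spin_unlock_bh(), so the unlock and the BH re-enable are no longer separated by the lockdep annotation. The balanced form, with a hypothetical lock:

#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_SPINLOCK(owner_lock);	/* hypothetical */
static bool owner_flag;

static void mark_owned(void)
{
	spin_lock_bh(&owner_lock);	/* take the lock with BH disabled */
	owner_flag = true;
	spin_unlock_bh(&owner_lock);	/* release and re-enable BH together */
}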
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index c96866a53a66..388e3ebb7f57 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -635,7 +635,9 @@ int __inet_hash(struct sock *sk, struct sock *osk)
int err = 0;
if (sk->sk_state != TCP_LISTEN) {
+ local_bh_disable();
inet_ehash_nolisten(sk, osk, NULL);
+ local_bh_enable();
return 0;
}
WARN_ON(!sk_unhashed(sk));
@@ -667,11 +669,8 @@ int inet_hash(struct sock *sk)
{
int err = 0;
- if (sk->sk_state != TCP_CLOSE) {
- local_bh_disable();
+ if (sk->sk_state != TCP_CLOSE)
err = __inet_hash(sk, NULL);
- local_bh_enable();
- }
return err;
}
@@ -682,17 +681,20 @@ void inet_unhash(struct sock *sk)
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb = NULL;
spinlock_t *lock;
+ bool state_listen;
if (sk_unhashed(sk))
return;
if (sk->sk_state == TCP_LISTEN) {
+ state_listen = true;
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
- lock = &ilb->lock;
+ spin_lock(&ilb->lock);
} else {
+ state_listen = false;
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+ spin_lock_bh(lock);
}
- spin_lock_bh(lock);
if (sk_unhashed(sk))
goto unlock;
@@ -705,7 +707,10 @@ void inet_unhash(struct sock *sk)
__sk_nulls_del_node_init_rcu(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
unlock:
- spin_unlock_bh(lock);
+ if (state_listen)
+ spin_unlock(&ilb->lock);
+ else
+ spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
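The inet hashtable changes narrow the BH-disabled region: inet_hash()/inet6_hash() no longer wrap the whole call in local_bh_disable(), __inet_hash() disables BH only around the ehash insertion, and inet_unhash() takes the listening-hash bucket lock without disabling BH at all. The general shape is keeping local_bh_disable() scoped to the code that actually races with softirq context, as in this hedged sketch with a hypothetical table:

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/bottom_half.h>

struct conn_table {			/* hypothetical */
	spinlock_t lock;		/* also taken from softirq context */
	struct list_head entries;
};

static void conn_table_insert(struct conn_table *tbl, struct list_head *e)
{
	INIT_LIST_HEAD(e);		/* setup that softirq never sees */

	/* Only the insertion races with softirq users of tbl->lock,
	 * so BH is disabled just around it. */
	local_bh_disable();
	spin_lock(&tbl->lock);
	list_add(e, &tbl->entries);
	spin_unlock(&tbl->lock);
	local_bh_enable();
}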
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 55c290d55605..9bad345cba9a 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -333,11 +333,8 @@ int inet6_hash(struct sock *sk)
{
int err = 0;
- if (sk->sk_state != TCP_CLOSE) {
- local_bh_disable();
+ if (sk->sk_state != TCP_CLOSE)
err = __inet_hash(sk, NULL);
- local_bh_enable();
- }
return err;
}
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index f87d07736a14..7a627b208393 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1258,7 +1258,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
rcu_assign_pointer(sch->stab, stab);
}
if (tca[TCA_RATE]) {
- seqcount_t *running;
+ net_seqlock_t *running;
err = -EOPNOTSUPP;
if (sch->flags & TCQ_F_MQROOT) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 1ee96a5c5ee0..f7bf683c3fc9 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -578,7 +578,11 @@ struct Qdisc noop_qdisc = {
.ops = &noop_qdisc_ops,
.q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
.dev_queue = &noop_netdev_queue,
+#ifdef CONFIG_PREEMPT_RT
+ .running = __SEQLOCK_UNLOCKED(noop_qdisc.running),
+#else
.running = SEQCNT_ZERO(noop_qdisc.running),
+#endif
.busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
.gso_skb = {
.next = (struct sk_buff *)&noop_qdisc.gso_skb,
@@ -889,9 +893,15 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
lockdep_set_class(&sch->seqlock,
dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
+#ifdef CONFIG_PREEMPT_RT
+ seqlock_init(&sch->running);
+ lockdep_set_class(&sch->running.lock,
+ dev->qdisc_running_key ?: &qdisc_running_key);
+#else
seqcount_init(&sch->running);
lockdep_set_class(&sch->running,
dev->qdisc_running_key ?: &qdisc_running_key);
+#endif
sch->ops = ops;
sch->flags = ops->static_flags;
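sch_generic.c completes the ->running conversion: noop_qdisc initializes the field with __SEQLOCK_UNLOCKED() on RT and SEQCNT_ZERO() otherwise, and qdisc_alloc() uses seqlock_init() with the lockdep class set on the seqlock's internal lock. Call sites stay unchanged because small wrappers pick the right primitive, roughly like this hedged sketch (type and helper names are hypothetical; the series' real reader wrappers are the net_seq_begin()/net_seq_retry() seen in gen_stats.c):

#include <linux/seqlock.h>
#include <linux/types.h>

#ifdef CONFIG_PREEMPT_RT
typedef seqlock_t run_seq_t;		/* writer side carries a real lock */
#else
typedef seqcount_t run_seq_t;		/* writer side is lockless */
#endif

static inline unsigned int run_seq_begin(run_seq_t *s)
{
#ifdef CONFIG_PREEMPT_RT
	return read_seqbegin(s);
#else
	return read_seqcount_begin(s);
#endif
}

static inline bool run_seq_retry(run_seq_t *s, unsigned int seq)
{
#ifdef CONFIG_PREEMPT_RT
	return read_seqretry(s, seq);
#else
	return read_seqcount_retry(s, seq);
#endif
}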
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index dbb41821b1b8..558a45c0f7cb 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -441,7 +441,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
return;
- cpu = get_cpu();
+ cpu = get_cpu_light();
pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
atomic_long_inc(&pool->sp_stats.packets);
@@ -465,7 +465,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
rqstp = NULL;
out_unlock:
rcu_read_unlock();
- put_cpu();
+ put_cpu_light();
trace_svc_xprt_do_enqueue(xprt, rqstp);
}
EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue);
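svc_xprt_do_enqueue() only needs a stable CPU number to pick a pool, not a non-preemptible section, so get_cpu()/put_cpu() become get_cpu_light()/put_cpu_light(). In the RT series these helpers boil down to migrate_disable(); the definitions below are illustrative, not the series' literal ones:

#include <linux/preempt.h>
#include <linux/smp.h>

/* Illustrative definitions of what the RT helpers amount to. */
#define get_cpu_light()		({ migrate_disable(); smp_processor_id(); })
#define put_cpu_light()		migrate_enable()

/* Hypothetical pool selection: the task is pinned to its CPU only long
 * enough to read a stable CPU number, and stays preemptible throughout. */
static unsigned int pick_pool(unsigned int nr_pools)
{
	unsigned int cpu, idx;

	cpu = get_cpu_light();
	idx = cpu % nr_pools;
	put_cpu_light();

	return idx;
}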
diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c
index 5a90aa527877..642d0748c169 100644
--- a/samples/kfifo/bytestream-example.c
+++ b/samples/kfifo/bytestream-example.c
@@ -22,10 +22,10 @@
#define PROC_FIFO "bytestream-fifo"
/* lock for procfs read access */
-static DEFINE_MUTEX(read_lock);
+static DEFINE_MUTEX(read_access);
/* lock for procfs write access */
-static DEFINE_MUTEX(write_lock);
+static DEFINE_MUTEX(write_access);
/*
* define DYNAMIC in this example for a dynamically allocated fifo.
@@ -116,12 +116,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf,
int ret;
unsigned int copied;
- if (mutex_lock_interruptible(&write_lock))
+ if (mutex_lock_interruptible(&write_access))
return -ERESTARTSYS;
ret = kfifo_from_user(&test, buf, count, &copied);
- mutex_unlock(&write_lock);
+ mutex_unlock(&write_access);
if (ret)
return ret;
@@ -134,12 +134,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf,
int ret;
unsigned int copied;
- if (mutex_lock_interruptible(&read_lock))
+ if (mutex_lock_interruptible(&read_access))
return -ERESTARTSYS;
ret = kfifo_to_user(&test, buf, count, &copied);
- mutex_unlock(&read_lock);
+ mutex_unlock(&read_access);
if (ret)
return ret;
diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c
index e5403d8c971a..c61482ba94f4 100644
--- a/samples/kfifo/inttype-example.c
+++ b/samples/kfifo/inttype-example.c
@@ -22,10 +22,10 @@
#define PROC_FIFO "int-fifo"
/* lock for procfs read access */
-static DEFINE_MUTEX(read_lock);
+static DEFINE_MUTEX(read_access);
/* lock for procfs write access */
-static DEFINE_MUTEX(write_lock);
+static DEFINE_MUTEX(write_access);
/*
* define DYNAMIC in this example for a dynamically allocated fifo.
@@ -109,12 +109,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf,
int ret;
unsigned int copied;
- if (mutex_lock_interruptible(&write_lock))
+ if (mutex_lock_interruptible(&write_access))
return -ERESTARTSYS;
ret = kfifo_from_user(&test, buf, count, &copied);
- mutex_unlock(&write_lock);
+ mutex_unlock(&write_access);
if (ret)
return ret;
@@ -127,12 +127,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf,
int ret;
unsigned int copied;
- if (mutex_lock_interruptible(&read_lock))
+ if (mutex_lock_interruptible(&read_access))
return -ERESTARTSYS;
ret = kfifo_to_user(&test, buf, count, &copied);
- mutex_unlock(&read_lock);
+ mutex_unlock(&read_access);
if (ret)
return ret;
diff --git a/samples/kfifo/record-example.c b/samples/kfifo/record-example.c
index f64f3d62d6c2..e4087b2d3fc4 100644
--- a/samples/kfifo/record-example.c
+++ b/samples/kfifo/record-example.c
@@ -22,10 +22,10 @@
#define PROC_FIFO "record-fifo"
/* lock for procfs read access */
-static DEFINE_MUTEX(read_lock);
+static DEFINE_MUTEX(read_access);
/* lock for procfs write access */
-static DEFINE_MUTEX(write_lock);
+static DEFINE_MUTEX(write_access);
/*
* define DYNAMIC in this example for a dynamically allocated fifo.
@@ -123,12 +123,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf,
int ret;
unsigned int copied;
- if (mutex_lock_interruptible(&write_lock))
+ if (mutex_lock_interruptible(&write_access))
return -ERESTARTSYS;
ret = kfifo_from_user(&test, buf, count, &copied);
- mutex_unlock(&write_lock);
+ mutex_unlock(&write_access);
if (ret)
return ret;
@@ -141,12 +141,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf,
int ret;
unsigned int copied;
- if (mutex_lock_interruptible(&read_lock))
+ if (mutex_lock_interruptible(&read_access))
return -ERESTARTSYS;
ret = kfifo_to_user(&test, buf, count, &copied);
- mutex_unlock(&read_lock);
+ mutex_unlock(&read_access);
if (ret)
return ret;
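The three kfifo sample conversions are pure renames: the mutexes read_lock/write_lock become read_access/write_access so they can no longer be confused with the rwlock primitives of the same name, a distinction that matters more once PREEMPT_RT changes what those primitives are. A small hedged sketch of the two namespaces side by side (both locks hypothetical):

#include <linux/mutex.h>
#include <linux/spinlock.h>

static DEFINE_RWLOCK(table_rwlock);	/* rwlock API: read_lock()/write_lock() */
static DEFINE_MUTEX(read_access);	/* renamed sleeping lock from the samples */

static void reader_paths(void)
{
	read_lock(&table_rwlock);	/* the locking primitive... */
	read_unlock(&table_rwlock);

	mutex_lock(&read_access);	/* ...vs. the sample's procfs read mutex */
	mutex_unlock(&read_access);
}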