aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/async.c4
-rw-r--r--kernel/audit.c41
-rw-r--r--kernel/audit_tree.c4
-rw-r--r--kernel/auditsc.c45
-rw-r--r--kernel/backtracetest.c2
-rw-r--r--kernel/cgroup/rstat.c1
-rw-r--r--kernel/crash_core.c1
-rw-r--r--kernel/debug/debug_core.c2
-rw-r--r--kernel/debug/kdb/kdb_io.c2
-rw-r--r--kernel/dma/Kconfig12
-rw-r--r--kernel/dma/Makefile3
-rw-r--r--kernel/dma/contiguous.c31
-rw-r--r--kernel/dma/debug.c12
-rw-r--r--kernel/dma/direct.c74
-rw-r--r--kernel/dma/mapping.c214
-rw-r--r--kernel/entry/Makefile13
-rw-r--r--kernel/entry/common.c374
-rw-r--r--kernel/entry/kvm.c51
-rw-r--r--kernel/events/core.c117
-rw-r--r--kernel/events/uprobes.c2
-rw-r--r--kernel/exit.c28
-rw-r--r--kernel/fork.c114
-rw-r--r--kernel/futex.c128
-rw-r--r--kernel/irq/Kconfig4
-rw-r--r--kernel/irq/chip.c16
-rw-r--r--kernel/irq/debugfs.c5
-rw-r--r--kernel/irq/irqdomain.c3
-rw-r--r--kernel/irq/manage.c6
-rw-r--r--kernel/irq/resend.c2
-rw-r--r--kernel/kallsyms.c42
-rw-r--r--kernel/kcsan/Makefile9
-rw-r--r--kernel/kcsan/atomic.h6
-rw-r--r--kernel/kcsan/core.c37
-rw-r--r--kernel/kcsan/kcsan-test.c1107
-rw-r--r--kernel/kcsan/kcsan.h7
-rw-r--r--kernel/kcsan/report.c12
-rw-r--r--kernel/kcsan/selftest.c (renamed from kernel/kcsan/test.c)0
-rw-r--r--kernel/kprobes.c60
-rw-r--r--kernel/kthread.c6
-rw-r--r--kernel/locking/lockdep.c162
-rw-r--r--kernel/locking/locktorture.c14
-rw-r--r--kernel/locking/osq_lock.c6
-rw-r--r--kernel/nsproxy.c21
-rw-r--r--kernel/padata.c177
-rw-r--r--kernel/pid.c16
-rw-r--r--kernel/pid_namespace.c2
-rw-r--r--kernel/power/energy_model.c290
-rw-r--r--kernel/power/hibernate.c6
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/snapshot.c4
-rw-r--r--kernel/printk/printk.c16
-rw-r--r--kernel/rcu/Kconfig.debug19
-rw-r--r--kernel/rcu/Makefile1
-rw-r--r--kernel/rcu/rcuperf.c25
-rw-r--r--kernel/rcu/rcutorture.c119
-rw-r--r--kernel/rcu/refscale.c717
-rw-r--r--kernel/rcu/srcutree.c16
-rw-r--r--kernel/rcu/tasks.h37
-rw-r--r--kernel/rcu/tiny.c7
-rw-r--r--kernel/rcu/tree.c407
-rw-r--r--kernel/rcu/tree.h15
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_plugin.h4
-rw-r--r--kernel/rcu/tree_stall.h9
-rw-r--r--kernel/rcu/update.c16
-rw-r--r--kernel/reboot.c2
-rw-r--r--kernel/sched/core.c466
-rw-r--r--kernel/sched/cpudeadline.c24
-rw-r--r--kernel/sched/cpufreq_schedutil.c8
-rw-r--r--kernel/sched/cputime.c46
-rw-r--r--kernel/sched/deadline.c118
-rw-r--r--kernel/sched/fair.c95
-rw-r--r--kernel/sched/idle.c11
-rw-r--r--kernel/sched/isolation.c3
-rw-r--r--kernel/sched/loadavg.c2
-rw-r--r--kernel/sched/pelt.c6
-rw-r--r--kernel/sched/pelt.h5
-rw-r--r--kernel/sched/psi.c113
-rw-r--r--kernel/sched/rt.c4
-rw-r--r--kernel/sched/sched.h126
-rw-r--r--kernel/sched/stop_task.c12
-rw-r--r--kernel/sched/topology.c22
-rw-r--r--kernel/sched/wait.c2
-rw-r--r--kernel/seccomp.c376
-rw-r--r--kernel/smp.c3
-rw-r--r--kernel/softirq.c26
-rw-r--r--kernel/stackleak.c16
-rw-r--r--kernel/sys.c13
-rw-r--r--kernel/sysctl.c21
-rw-r--r--kernel/time/namespace.c22
-rw-r--r--kernel/time/sched_clock.c41
-rw-r--r--kernel/time/tick-sched.c22
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/time/timer.c253
-rw-r--r--kernel/torture.c6
-rw-r--r--kernel/trace/blktrace.c86
-rw-r--r--kernel/trace/ftrace.c101
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/umh.c171
-rw-r--r--kernel/usermode_driver.c182
101 files changed, 5456 insertions, 1662 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f3218bc5ec69..bdeb77e27042 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y = fork.o exec_domain.o panic.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o smpboot.o ucount.o
+obj-$(CONFIG_BPFILTER) += usermode_driver.o
obj-$(CONFIG_MODULES) += kmod.o
obj-$(CONFIG_MULTIUSER) += groups.o
@@ -48,6 +49,7 @@ obj-y += irq/
obj-y += rcu/
obj-y += livepatch/
obj-y += dma/
+obj-y += entry/
obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
@@ -125,6 +127,7 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
+CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN)
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
KASAN_SANITIZE_stackleak.o := n
KCSAN_SANITIZE_stackleak.o := n
diff --git a/kernel/async.c b/kernel/async.c
index 4f9c1d614016..33258e6e20f8 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -111,7 +111,7 @@ static void async_run_entry_fn(struct work_struct *work)
struct async_entry *entry =
container_of(work, struct async_entry, work);
unsigned long flags;
- ktime_t uninitialized_var(calltime), delta, rettime;
+ ktime_t calltime, delta, rettime;
/* 1) run (and print duration) */
if (initcall_debug && system_state < SYSTEM_RUNNING) {
@@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
*/
void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
- ktime_t uninitialized_var(starttime), delta, endtime;
+ ktime_t starttime, delta, endtime;
if (initcall_debug && system_state < SYSTEM_RUNNING) {
pr_debug("async_waiting @ %i\n", task_pid_nr(current));
diff --git a/kernel/audit.c b/kernel/audit.c
index b2301bdc9773..7efaece534a9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -136,6 +136,11 @@ u32 audit_sig_sid = 0;
*/
static atomic_t audit_lost = ATOMIC_INIT(0);
+/* Monotonically increasing sum of time the kernel has spent
+ * waiting while the backlog limit is exceeded.
+ */
+static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0);
+
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -1201,17 +1206,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_GET: {
struct audit_status s;
memset(&s, 0, sizeof(s));
- s.enabled = audit_enabled;
- s.failure = audit_failure;
+ s.enabled = audit_enabled;
+ s.failure = audit_failure;
/* NOTE: use pid_vnr() so the PID is relative to the current
* namespace */
- s.pid = auditd_pid_vnr();
- s.rate_limit = audit_rate_limit;
- s.backlog_limit = audit_backlog_limit;
- s.lost = atomic_read(&audit_lost);
- s.backlog = skb_queue_len(&audit_queue);
- s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
- s.backlog_wait_time = audit_backlog_wait_time;
+ s.pid = auditd_pid_vnr();
+ s.rate_limit = audit_rate_limit;
+ s.backlog_limit = audit_backlog_limit;
+ s.lost = atomic_read(&audit_lost);
+ s.backlog = skb_queue_len(&audit_queue);
+ s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
+ s.backlog_wait_time = audit_backlog_wait_time;
+ s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual);
audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
}
@@ -1315,6 +1321,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
audit_log_config_change("lost", 0, lost, 1);
return lost;
}
+ if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) {
+ u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0);
+
+ audit_log_config_change("backlog_wait_time_actual", 0, actual, 1);
+ return actual;
+ }
break;
}
case AUDIT_GET_FEATURE:
@@ -1800,7 +1812,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
{
struct audit_buffer *ab;
struct timespec64 t;
- unsigned int uninitialized_var(serial);
+ unsigned int serial;
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
@@ -1826,12 +1838,15 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
/* sleep if we are allowed and we haven't exhausted our
* backlog wait limit */
if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) {
+ long rtime = stime;
+
DECLARE_WAITQUEUE(wait, current);
add_wait_queue_exclusive(&audit_backlog_wait,
&wait);
set_current_state(TASK_UNINTERRUPTIBLE);
- stime = schedule_timeout(stime);
+ stime = schedule_timeout(rtime);
+ atomic_add(rtime - stime, &audit_backlog_wait_time_actual);
remove_wait_queue(&audit_backlog_wait, &wait);
} else {
if (audit_rate_check() && printk_ratelimit())
@@ -2079,13 +2094,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
/* We will allow 11 spaces for ' (deleted)' to be appended */
pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
if (!pathname) {
- audit_log_string(ab, "<no_memory>");
+ audit_log_format(ab, "\"<no_memory>\"");
return;
}
p = d_path(path, pathname, PATH_MAX+11);
if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
/* FIXME: can we save some information here? */
- audit_log_string(ab, "<too_long>");
+ audit_log_format(ab, "\"<too_long>\"");
} else
audit_log_untrustedstring(ab, p);
kfree(pathname);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e49c912f862d..1b7a2f041793 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -188,11 +188,9 @@ static struct fsnotify_mark *alloc_mark(void)
static struct audit_chunk *alloc_chunk(int count)
{
struct audit_chunk *chunk;
- size_t size;
int i;
- size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
- chunk = kzalloc(size, GFP_KERNEL);
+ chunk = kzalloc(struct_size(chunk, owners, count), GFP_KERNEL);
if (!chunk)
return NULL;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fd840c40abf7..8dba8f0983b5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -75,6 +75,7 @@
#include <linux/uaccess.h>
#include <linux/fsnotify_backend.h>
#include <uapi/linux/limits.h>
+#include <uapi/linux/netfilter/nf_tables.h>
#include "audit.h"
@@ -136,9 +137,26 @@ struct audit_nfcfgop_tab {
};
static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
- { AUDIT_XT_OP_REGISTER, "register" },
- { AUDIT_XT_OP_REPLACE, "replace" },
- { AUDIT_XT_OP_UNREGISTER, "unregister" },
+ { AUDIT_XT_OP_REGISTER, "xt_register" },
+ { AUDIT_XT_OP_REPLACE, "xt_replace" },
+ { AUDIT_XT_OP_UNREGISTER, "xt_unregister" },
+ { AUDIT_NFT_OP_TABLE_REGISTER, "nft_register_table" },
+ { AUDIT_NFT_OP_TABLE_UNREGISTER, "nft_unregister_table" },
+ { AUDIT_NFT_OP_CHAIN_REGISTER, "nft_register_chain" },
+ { AUDIT_NFT_OP_CHAIN_UNREGISTER, "nft_unregister_chain" },
+ { AUDIT_NFT_OP_RULE_REGISTER, "nft_register_rule" },
+ { AUDIT_NFT_OP_RULE_UNREGISTER, "nft_unregister_rule" },
+ { AUDIT_NFT_OP_SET_REGISTER, "nft_register_set" },
+ { AUDIT_NFT_OP_SET_UNREGISTER, "nft_unregister_set" },
+ { AUDIT_NFT_OP_SETELEM_REGISTER, "nft_register_setelem" },
+ { AUDIT_NFT_OP_SETELEM_UNREGISTER, "nft_unregister_setelem" },
+ { AUDIT_NFT_OP_GEN_REGISTER, "nft_register_gen" },
+ { AUDIT_NFT_OP_OBJ_REGISTER, "nft_register_obj" },
+ { AUDIT_NFT_OP_OBJ_UNREGISTER, "nft_unregister_obj" },
+ { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" },
+ { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" },
+ { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" },
+ { AUDIT_NFT_OP_INVALID, "nft_invalid" },
};
static int audit_match_perm(struct audit_context *ctx, int mask)
@@ -1876,6 +1894,20 @@ __audit_reusename(const __user char *uptr)
return NULL;
}
+inline void _audit_getcwd(struct audit_context *context)
+{
+ if (!context->pwd.dentry)
+ get_fs_pwd(current->fs, &context->pwd);
+}
+
+void __audit_getcwd(void)
+{
+ struct audit_context *context = audit_context();
+
+ if (context->in_syscall)
+ _audit_getcwd(context);
+}
+
/**
* __audit_getname - add a name to the list
* @name: name to add
@@ -1900,8 +1932,7 @@ void __audit_getname(struct filename *name)
name->aname = n;
name->refcnt++;
- if (!context->pwd.dentry)
- get_fs_pwd(current->fs, &context->pwd);
+ _audit_getcwd(context);
}
static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2557,12 +2588,12 @@ void __audit_ntp_log(const struct audit_ntp_data *ad)
}
void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
- enum audit_nfcfgop op)
+ enum audit_nfcfgop op, gfp_t gfp)
{
struct audit_buffer *ab;
char comm[sizeof(current->comm)];
- ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG);
+ ab = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG);
if (!ab)
return;
audit_log_format(ab, "table=%s family=%u entries=%u op=%s",
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a2a97fa3071b..370217dd7e39 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -29,7 +29,7 @@ static void backtrace_test_irq_callback(unsigned long data)
complete(&backtrace_work);
}
-static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
+static DECLARE_TASKLET_OLD(backtrace_tasklet, &backtrace_test_irq_callback);
static void backtrace_test_irq(void)
{
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index b6397a186ce9..d51175cedfca 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -64,7 +64,6 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
raw_spin_unlock_irqrestore(cpu_lock, flags);
}
-EXPORT_SYMBOL_GPL(cgroup_rstat_updated);
/**
* cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 9f1557b98468..18175687133a 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -413,6 +413,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
VMCOREINFO_STRUCT_SIZE(mem_section);
VMCOREINFO_OFFSET(mem_section, section_mem_map);
+ VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
#endif
VMCOREINFO_STRUCT_SIZE(page);
VMCOREINFO_STRUCT_SIZE(pglist_data);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 9e5934780f41..b16dbc1bf056 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1068,7 +1068,7 @@ static void kgdb_tasklet_bpt(unsigned long ing)
atomic_set(&kgdb_break_tasklet_var, 0);
}
-static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
+static DECLARE_TASKLET_OLD(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt);
void kgdb_schedule_breakpoint(void)
{
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 683a799618ad..9d847ab851db 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -591,7 +591,7 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
int this_cpu, old_cpu;
char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
char *moreprompt = "more> ";
- unsigned long uninitialized_var(flags);
+ unsigned long flags;
/* Serialize kdb_printf if multiple cpus try to write at once.
* But if any cpu goes recursive in kdb, just print the output,
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 1da3f44f2565..f4770fcfa62b 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -5,6 +5,17 @@ config HAS_DMA
depends on !NO_DMA
default y
+config DMA_OPS
+ bool
+
+#
+# IOMMU drivers that can bypass the IOMMU code and optionally use the direct
+# mapping fast path should select this option and set the dma_ops_bypass
+# flag in struct device where applicable
+#
+config DMA_OPS_BYPASS
+ bool
+
config NEED_SG_DMA_LENGTH
bool
@@ -60,6 +71,7 @@ config DMA_NONCOHERENT_CACHE_SYNC
config DMA_VIRT_OPS
bool
depends on HAS_DMA
+ select DMA_OPS
config SWIOTLB
bool
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 370f63344e9c..32c7c1942bbd 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o
+obj-$(CONFIG_HAS_DMA) += mapping.o direct.o
+obj-$(CONFIG_DMA_OPS) += dummy.o
obj-$(CONFIG_DMA_CMA) += contiguous.o
obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
obj-$(CONFIG_DMA_VIRT_OPS) += virt.o
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 15bc5026c485..cff7e60968b9 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -215,6 +215,13 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
return cma_release(dev_get_cma_area(dev), pages, count);
}
+static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp)
+{
+ unsigned int align = min(get_order(size), CONFIG_CMA_ALIGNMENT);
+
+ return cma_alloc(cma, size >> PAGE_SHIFT, align, gfp & __GFP_NOWARN);
+}
+
/**
* dma_alloc_contiguous() - allocate contiguous pages
* @dev: Pointer to device for which the allocation is performed.
@@ -231,24 +238,14 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
*/
struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
{
- size_t count = size >> PAGE_SHIFT;
- struct page *page = NULL;
- struct cma *cma = NULL;
-
- if (dev && dev->cma_area)
- cma = dev->cma_area;
- else if (count > 1)
- cma = dma_contiguous_default_area;
-
/* CMA can be used only in the context which permits sleeping */
- if (cma && gfpflags_allow_blocking(gfp)) {
- size_t align = get_order(size);
- size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
-
- page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN);
- }
-
- return page;
+ if (!gfpflags_allow_blocking(gfp))
+ return NULL;
+ if (dev->cma_area)
+ return cma_alloc_aligned(dev->cma_area, size, gfp);
+ if (size <= PAGE_SIZE || !dma_contiguous_default_area)
+ return NULL;
+ return cma_alloc_aligned(dma_contiguous_default_area, size, gfp);
}
/**
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 36c962a86bf2..f7f807fb6ebb 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -144,8 +144,12 @@ static const char *type2name[] = {
[dma_debug_resource] = "resource",
};
-static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
- "DMA_FROM_DEVICE", "DMA_NONE" };
+static const char *dir2name[] = {
+ [DMA_BIDIRECTIONAL] = "DMA_BIDIRECTIONAL",
+ [DMA_TO_DEVICE] = "DMA_TO_DEVICE",
+ [DMA_FROM_DEVICE] = "DMA_FROM_DEVICE",
+ [DMA_NONE] = "DMA_NONE",
+};
/*
* The access to some variables in this macro is racy. We can't use atomic_t
@@ -882,7 +886,7 @@ static int device_dma_allocations(struct device *dev, struct dma_debug_entry **o
static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data)
{
struct device *dev = data;
- struct dma_debug_entry *uninitialized_var(entry);
+ struct dma_debug_entry *entry;
int count;
if (dma_debug_disabled())
@@ -1071,7 +1075,7 @@ static void check_unmap(struct dma_debug_entry *ref)
/*
* Drivers should use dma_mapping_error() to check the returned
* addresses of dma_map_single() and dma_map_page().
- * If not, print this warning message. See Documentation/DMA-API.txt.
+ * If not, print this warning message. See Documentation/core-api/dma-api.rst.
*/
if (entry->map_err_type == MAP_ERR_NOT_CHECKED) {
err_printk(ref->dev, entry,
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 67f060b86a73..bb0041e99659 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -10,11 +10,9 @@
#include <linux/dma-direct.h>
#include <linux/scatterlist.h>
#include <linux/dma-contiguous.h>
-#include <linux/dma-noncoherent.h>
#include <linux/pfn.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
-#include <linux/swiotlb.h>
/*
* Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it
@@ -304,19 +302,6 @@ void dma_direct_free(struct device *dev, size_t size,
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
defined(CONFIG_SWIOTLB)
-void dma_direct_sync_single_for_device(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
-{
- phys_addr_t paddr = dma_to_phys(dev, addr);
-
- if (unlikely(is_swiotlb_buffer(paddr)))
- swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
-
- if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(paddr, size, dir);
-}
-EXPORT_SYMBOL(dma_direct_sync_single_for_device);
-
void dma_direct_sync_sg_for_device(struct device *dev,
struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
@@ -335,27 +320,11 @@ void dma_direct_sync_sg_for_device(struct device *dev,
dir);
}
}
-EXPORT_SYMBOL(dma_direct_sync_sg_for_device);
#endif
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
defined(CONFIG_SWIOTLB)
-void dma_direct_sync_single_for_cpu(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
-{
- phys_addr_t paddr = dma_to_phys(dev, addr);
-
- if (!dev_is_dma_coherent(dev)) {
- arch_sync_dma_for_cpu(paddr, size, dir);
- arch_sync_dma_for_cpu_all();
- }
-
- if (unlikely(is_swiotlb_buffer(paddr)))
- swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
-}
-EXPORT_SYMBOL(dma_direct_sync_single_for_cpu);
-
void dma_direct_sync_sg_for_cpu(struct device *dev,
struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
@@ -376,20 +345,6 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu_all();
}
-EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu);
-
-void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
- phys_addr_t phys = dma_to_phys(dev, addr);
-
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_single_for_cpu(dev, addr, size, dir);
-
- if (unlikely(is_swiotlb_buffer(phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
-}
-EXPORT_SYMBOL(dma_direct_unmap_page);
void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir, unsigned long attrs)
@@ -401,35 +356,8 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
attrs);
}
-EXPORT_SYMBOL(dma_direct_unmap_sg);
#endif
-dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
-{
- phys_addr_t phys = page_to_phys(page) + offset;
- dma_addr_t dma_addr = phys_to_dma(dev, phys);
-
- if (unlikely(swiotlb_force == SWIOTLB_FORCE))
- return swiotlb_map(dev, phys, size, dir, attrs);
-
- if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
- if (swiotlb_force != SWIOTLB_NO_FORCE)
- return swiotlb_map(dev, phys, size, dir, attrs);
-
- dev_WARN_ONCE(dev, 1,
- "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
- &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
- return DMA_MAPPING_ERROR;
- }
-
- if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- arch_sync_dma_for_device(phys, size, dir);
- return dma_addr;
-}
-EXPORT_SYMBOL(dma_direct_map_page);
-
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
{
@@ -450,7 +378,6 @@ out_unmap:
dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
return 0;
}
-EXPORT_SYMBOL(dma_direct_map_sg);
dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
@@ -467,7 +394,6 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
return dma_addr;
}
-EXPORT_SYMBOL(dma_direct_map_resource);
int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index a8c18c9a796f..0d129421e75f 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -105,6 +105,196 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
}
EXPORT_SYMBOL(dmam_alloc_attrs);
+static bool dma_go_direct(struct device *dev, dma_addr_t mask,
+ const struct dma_map_ops *ops)
+{
+ if (likely(!ops))
+ return true;
+#ifdef CONFIG_DMA_OPS_BYPASS
+ if (dev->dma_ops_bypass)
+ return min_not_zero(mask, dev->bus_dma_limit) >=
+ dma_direct_get_required_mask(dev);
+#endif
+ return false;
+}
+
+
+/*
+ * Check if the devices uses a direct mapping for streaming DMA operations.
+ * This allows IOMMU drivers to set a bypass mode if the DMA mask is large
+ * enough.
+ */
+static inline bool dma_alloc_direct(struct device *dev,
+ const struct dma_map_ops *ops)
+{
+ return dma_go_direct(dev, dev->coherent_dma_mask, ops);
+}
+
+static inline bool dma_map_direct(struct device *dev,
+ const struct dma_map_ops *ops)
+{
+ return dma_go_direct(dev, *dev->dma_mask, ops);
+}
+
+dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
+ size_t offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ dma_addr_t addr;
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
+ else
+ addr = ops->map_page(dev, page, offset, size, dir, attrs);
+ debug_dma_map_page(dev, page, offset, size, dir, addr);
+
+ return addr;
+}
+EXPORT_SYMBOL(dma_map_page_attrs);
+
+void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_unmap_page(dev, addr, size, dir, attrs);
+ else if (ops->unmap_page)
+ ops->unmap_page(dev, addr, size, dir, attrs);
+ debug_dma_unmap_page(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_unmap_page_attrs);
+
+/*
+ * dma_maps_sg_attrs returns 0 on error and > 0 on success.
+ * It should never return a value < 0.
+ */
+int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ int ents;
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
+ else
+ ents = ops->map_sg(dev, sg, nents, dir, attrs);
+ BUG_ON(ents < 0);
+ debug_dma_map_sg(dev, sg, nents, ents, dir);
+
+ return ents;
+}
+EXPORT_SYMBOL(dma_map_sg_attrs);
+
+void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ debug_dma_unmap_sg(dev, sg, nents, dir);
+ if (dma_map_direct(dev, ops))
+ dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
+ else if (ops->unmap_sg)
+ ops->unmap_sg(dev, sg, nents, dir, attrs);
+}
+EXPORT_SYMBOL(dma_unmap_sg_attrs);
+
+dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ dma_addr_t addr = DMA_MAPPING_ERROR;
+
+ BUG_ON(!valid_dma_direction(dir));
+
+ /* Don't allow RAM to be mapped */
+ if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
+ return DMA_MAPPING_ERROR;
+
+ if (dma_map_direct(dev, ops))
+ addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
+ else if (ops->map_resource)
+ addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
+
+ debug_dma_map_resource(dev, phys_addr, size, dir, addr);
+ return addr;
+}
+EXPORT_SYMBOL(dma_map_resource);
+
+void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (!dma_map_direct(dev, ops) && ops->unmap_resource)
+ ops->unmap_resource(dev, addr, size, dir, attrs);
+ debug_dma_unmap_resource(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_unmap_resource);
+
+void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ else if (ops->sync_single_for_cpu)
+ ops->sync_single_for_cpu(dev, addr, size, dir);
+ debug_dma_sync_single_for_cpu(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_sync_single_for_cpu);
+
+void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_single_for_device(dev, addr, size, dir);
+ else if (ops->sync_single_for_device)
+ ops->sync_single_for_device(dev, addr, size, dir);
+ debug_dma_sync_single_for_device(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_sync_single_for_device);
+
+void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
+ else if (ops->sync_sg_for_cpu)
+ ops->sync_sg_for_cpu(dev, sg, nelems, dir);
+ debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
+}
+EXPORT_SYMBOL(dma_sync_sg_for_cpu);
+
+void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
+ else if (ops->sync_sg_for_device)
+ ops->sync_sg_for_device(dev, sg, nelems, dir);
+ debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
+}
+EXPORT_SYMBOL(dma_sync_sg_for_device);
+
/*
* Create scatter-list for the already allocated DMA buffer.
*/
@@ -138,7 +328,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr,
size, attrs);
if (!ops->get_sgtable)
@@ -208,7 +398,7 @@ bool dma_can_mmap(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
return dma_direct_can_mmap(dev);
return ops->mmap != NULL;
}
@@ -233,7 +423,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size,
attrs);
if (!ops->mmap)
@@ -246,7 +436,7 @@ u64 dma_get_required_mask(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
return dma_direct_get_required_mask(dev);
if (ops->get_required_mask)
return ops->get_required_mask(dev);
@@ -277,7 +467,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
/* let the implementation decide on the zone to allocate from: */
flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
else if (ops->alloc)
cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
@@ -309,7 +499,7 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
return;
debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
else if (ops->free)
ops->free(dev, size, cpu_addr, dma_handle, attrs);
@@ -320,7 +510,11 @@ int dma_supported(struct device *dev, u64 mask)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ /*
+ * ->dma_supported sets the bypass flag, so we must always call
+ * into the method here unless the device is truly direct mapped.
+ */
+ if (!ops)
return dma_direct_supported(dev, mask);
if (!ops->dma_supported)
return 1;
@@ -376,7 +570,7 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
BUG_ON(!valid_dma_direction(dir));
- if (dma_is_direct(ops))
+ if (dma_alloc_direct(dev, ops))
arch_dma_cache_sync(dev, vaddr, size, dir);
else if (ops->cache_sync)
ops->cache_sync(dev, vaddr, size, dir);
@@ -388,7 +582,7 @@ size_t dma_max_mapping_size(struct device *dev)
const struct dma_map_ops *ops = get_dma_ops(dev);
size_t size = SIZE_MAX;
- if (dma_is_direct(ops))
+ if (dma_map_direct(dev, ops))
size = dma_direct_max_mapping_size(dev);
else if (ops && ops->max_mapping_size)
size = ops->max_mapping_size(dev);
@@ -401,7 +595,7 @@ bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops))
+ if (dma_map_direct(dev, ops))
return dma_direct_need_sync(dev, dma_addr);
return ops->sync_single_for_cpu || ops->sync_single_for_device;
}
diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile
new file mode 100644
index 000000000000..34c8a3f1c735
--- /dev/null
+++ b/kernel/entry/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Prevent the noinstr section from being pestered by sanitizer and other goodies
+# as long as these things cannot be disabled per function.
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong
+CFLAGS_common.o += -fno-stack-protector
+
+obj-$(CONFIG_GENERIC_ENTRY) += common.o
+obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
new file mode 100644
index 000000000000..9852e0d62d95
--- /dev/null
+++ b/kernel/entry/common.c
@@ -0,0 +1,374 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/context_tracking.h>
+#include <linux/entry-common.h>
+#include <linux/livepatch.h>
+#include <linux/audit.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+/**
+ * enter_from_user_mode - Establish state when coming from user mode
+ *
+ * Syscall/interrupt entry disables interrupts, but user mode is traced as
+ * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
+ *
+ * 1) Tell lockdep that interrupts are disabled
+ * 2) Invoke context tracking if enabled to reactivate RCU
+ * 3) Trace interrupts off state
+ */
+static __always_inline void enter_from_user_mode(struct pt_regs *regs)
+{
+ arch_check_user_regs(regs);
+ lockdep_hardirqs_off(CALLER_ADDR0);
+
+ CT_WARN_ON(ct_state() != CONTEXT_USER);
+ user_exit_irqoff();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+}
+
+static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
+{
+ if (unlikely(audit_context())) {
+ unsigned long args[6];
+
+ syscall_get_arguments(current, regs, args);
+ audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
+ }
+}
+
+static long syscall_trace_enter(struct pt_regs *regs, long syscall,
+ unsigned long ti_work)
+{
+ long ret = 0;
+
+ /* Handle ptrace */
+ if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
+ ret = arch_syscall_enter_tracehook(regs);
+ if (ret || (ti_work & _TIF_SYSCALL_EMU))
+ return -1L;
+ }
+
+ /* Do seccomp after ptrace, to catch any tracer changes. */
+ if (ti_work & _TIF_SECCOMP) {
+ ret = __secure_computing(NULL);
+ if (ret == -1L)
+ return ret;
+ }
+
+ if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT))
+ trace_sys_enter(regs, syscall);
+
+ syscall_enter_audit(regs, syscall);
+
+ return ret ? : syscall;
+}
+
+noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+{
+ unsigned long ti_work;
+
+ enter_from_user_mode(regs);
+ instrumentation_begin();
+
+ local_irq_enable();
+ ti_work = READ_ONCE(current_thread_info()->flags);
+ if (ti_work & SYSCALL_ENTER_WORK)
+ syscall = syscall_trace_enter(regs, syscall, ti_work);
+ instrumentation_end();
+
+ return syscall;
+}
+
+/**
+ * exit_to_user_mode - Fixup state when exiting to user mode
+ *
+ * Syscall/interupt exit enables interrupts, but the kernel state is
+ * interrupts disabled when this is invoked. Also tell RCU about it.
+ *
+ * 1) Trace interrupts on state
+ * 2) Invoke context tracking if enabled to adjust RCU state
+ * 3) Invoke architecture specific last minute exit code, e.g. speculation
+ * mitigations, etc.
+ * 4) Tell lockdep that interrupts are enabled
+ */
+static __always_inline void exit_to_user_mode(void)
+{
+ instrumentation_begin();
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+
+ user_enter_irqoff();
+ arch_exit_to_user_mode();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+}
+
+/* Workaround to allow gradual conversion of architecture code */
+void __weak arch_do_signal(struct pt_regs *regs) { }
+
+static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ /*
+ * Before returning to user space ensure that all pending work
+ * items have been completed.
+ */
+ while (ti_work & EXIT_TO_USER_MODE_WORK) {
+
+ local_irq_enable_exit_to_user(ti_work);
+
+ if (ti_work & _TIF_NEED_RESCHED)
+ schedule();
+
+ if (ti_work & _TIF_UPROBE)
+ uprobe_notify_resume(regs);
+
+ if (ti_work & _TIF_PATCH_PENDING)
+ klp_update_patch_state(current);
+
+ if (ti_work & _TIF_SIGPENDING)
+ arch_do_signal(regs);
+
+ if (ti_work & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(regs);
+ rseq_handle_notify_resume(NULL, regs);
+ }
+
+ /* Architecture specific TIF work */
+ arch_exit_to_user_mode_work(regs, ti_work);
+
+ /*
+ * Disable interrupts and reevaluate the work flags as they
+ * might have changed while interrupts and preemption was
+ * enabled above.
+ */
+ local_irq_disable_exit_to_user();
+ ti_work = READ_ONCE(current_thread_info()->flags);
+ }
+
+ /* Return the latest work state for arch_exit_to_user_mode() */
+ return ti_work;
+}
+
+static void exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+ unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+
+ lockdep_assert_irqs_disabled();
+
+ if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
+ ti_work = exit_to_user_mode_loop(regs, ti_work);
+
+ arch_exit_to_user_mode_prepare(regs, ti_work);
+
+ /* Ensure that the address limit is intact and no locks are held */
+ addr_limit_user_check();
+ lockdep_assert_irqs_disabled();
+ lockdep_sys_exit();
+}
+
+#ifndef _TIF_SINGLESTEP
+static inline bool report_single_step(unsigned long ti_work)
+{
+ return false;
+}
+#else
+/*
+ * If TIF_SYSCALL_EMU is set, then the only reason to report is when
+ * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
+ * instruction has been already reported in syscall_enter_from_usermode().
+ */
+#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)
+
+static inline bool report_single_step(unsigned long ti_work)
+{
+ return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP;
+}
+#endif
+
+static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work)
+{
+ bool step;
+
+ audit_syscall_exit(regs);
+
+ if (ti_work & _TIF_SYSCALL_TRACEPOINT)
+ trace_sys_exit(regs, syscall_get_return_value(current, regs));
+
+ step = report_single_step(ti_work);
+ if (step || ti_work & _TIF_SYSCALL_TRACE)
+ arch_syscall_exit_tracehook(regs, step);
+}
+
+/*
+ * Syscall specific exit to user mode preparation. Runs with interrupts
+ * enabled.
+ */
+static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+ u32 cached_flags = READ_ONCE(current_thread_info()->flags);
+ unsigned long nr = syscall_get_nr(current, regs);
+
+ CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
+ local_irq_enable();
+ }
+
+ rseq_syscall(regs);
+
+ /*
+ * Do one-time syscall specific work. If these work items are
+ * enabled, we want to run them exactly once per syscall exit with
+ * interrupts enabled.
+ */
+ if (unlikely(cached_flags & SYSCALL_EXIT_WORK))
+ syscall_exit_work(regs, cached_flags);
+}
+
+__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ syscall_exit_to_user_mode_prepare(regs);
+ local_irq_disable_exit_to_user();
+ exit_to_user_mode_prepare(regs);
+ instrumentation_end();
+ exit_to_user_mode();
+}
+
+noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
+{
+ enter_from_user_mode(regs);
+}
+
+noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ exit_to_user_mode_prepare(regs);
+ instrumentation_end();
+ exit_to_user_mode();
+}
+
+noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
+{
+ irqentry_state_t ret = {
+ .exit_rcu = false,
+ };
+
+ if (user_mode(regs)) {
+ irqentry_enter_from_user_mode(regs);
+ return ret;
+ }
+
+ /*
+ * If this entry hit the idle task invoke rcu_irq_enter() whether
+ * RCU is watching or not.
+ *
+ * Interupts can nest when the first interrupt invokes softirq
+ * processing on return which enables interrupts.
+ *
+ * Scheduler ticks in the idle task can mark quiescent state and
+ * terminate a grace period, if and only if the timer interrupt is
+ * not nested into another interrupt.
+ *
+ * Checking for __rcu_is_watching() here would prevent the nesting
+ * interrupt to invoke rcu_irq_enter(). If that nested interrupt is
+ * the tick then rcu_flavor_sched_clock_irq() would wrongfully
+ * assume that it is the first interupt and eventually claim
+ * quiescient state and end grace periods prematurely.
+ *
+ * Unconditionally invoke rcu_irq_enter() so RCU state stays
+ * consistent.
+ *
+ * TINY_RCU does not support EQS, so let the compiler eliminate
+ * this part when enabled.
+ */
+ if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
+ /*
+ * If RCU is not watching then the same careful
+ * sequence vs. lockdep and tracing is required
+ * as in irq_enter_from_user_mode().
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ rcu_irq_enter();
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+
+ ret.exit_rcu = true;
+ return ret;
+ }
+
+ /*
+ * If RCU is watching then RCU only wants to check whether it needs
+ * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
+ * already contains a warning when RCU is not watching, so no point
+ * in having another one here.
+ */
+ instrumentation_begin();
+ rcu_irq_enter_check_tick();
+ /* Use the combo lockdep/tracing function */
+ trace_hardirqs_off();
+ instrumentation_end();
+
+ return ret;
+}
+
+void irqentry_exit_cond_resched(void)
+{
+ if (!preempt_count()) {
+ /* Sanity check RCU and thread stack */
+ rcu_irq_exit_check_preempt();
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ WARN_ON_ONCE(!on_thread_stack());
+ if (need_resched())
+ preempt_schedule_irq();
+ }
+}
+
+noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
+{
+ lockdep_assert_irqs_disabled();
+
+ /* Check whether this returns to user mode */
+ if (user_mode(regs)) {
+ irqentry_exit_to_user_mode(regs);
+ } else if (!regs_irqs_disabled(regs)) {
+ /*
+ * If RCU was not watching on entry this needs to be done
+ * carefully and needs the same ordering of lockdep/tracing
+ * and RCU as the return to user mode path.
+ */
+ if (state.exit_rcu) {
+ instrumentation_begin();
+ /* Tell the tracer that IRET will enable interrupts */
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+ rcu_irq_exit();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+ return;
+ }
+
+ instrumentation_begin();
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ irqentry_exit_cond_resched();
+ /* Covers both tracing and lockdep */
+ trace_hardirqs_on();
+ instrumentation_end();
+ } else {
+ /*
+ * IRQ flags state is correct already. Just tell RCU if it
+ * was not watching on entry.
+ */
+ if (state.exit_rcu)
+ rcu_irq_exit();
+ }
+}
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
new file mode 100644
index 000000000000..eb1a8a4c867c
--- /dev/null
+++ b/kernel/entry/kvm.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/entry-kvm.h>
+#include <linux/kvm_host.h>
+
+static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
+{
+ do {
+ int ret;
+
+ if (ti_work & _TIF_SIGPENDING) {
+ kvm_handle_signal_exit(vcpu);
+ return -EINTR;
+ }
+
+ if (ti_work & _TIF_NEED_RESCHED)
+ schedule();
+
+ if (ti_work & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(NULL);
+ }
+
+ ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
+ if (ret)
+ return ret;
+
+ ti_work = READ_ONCE(current_thread_info()->flags);
+ } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
+ return 0;
+}
+
+int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu)
+{
+ unsigned long ti_work;
+
+ /*
+ * This is invoked from the outer guest loop with interrupts and
+ * preemption enabled.
+ *
+ * KVM invokes xfer_to_guest_mode_work_pending() with interrupts
+ * disabled in the inner loop before going into guest mode. No need
+ * to disable interrupts here.
+ */
+ ti_work = READ_ONCE(current_thread_info()->flags);
+ if (!(ti_work & XFER_TO_GUEST_MODE_WORK))
+ return 0;
+
+ return xfer_to_guest_mode_work(vcpu, ti_work);
+}
+EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 856d98c36f56..78e69e10482a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -394,6 +394,7 @@ static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
static atomic_t nr_cgroup_events __read_mostly;
+static atomic_t nr_text_poke_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -1237,12 +1238,26 @@ static void get_ctx(struct perf_event_context *ctx)
refcount_inc(&ctx->refcount);
}
+static void *alloc_task_ctx_data(struct pmu *pmu)
+{
+ if (pmu->task_ctx_cache)
+ return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
+
+ return NULL;
+}
+
+static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
+{
+ if (pmu->task_ctx_cache && task_ctx_data)
+ kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
+}
+
static void free_ctx(struct rcu_head *head)
{
struct perf_event_context *ctx;
ctx = container_of(head, struct perf_event_context, rcu_head);
- kfree(ctx->task_ctx_data);
+ free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
kfree(ctx);
}
@@ -4470,7 +4485,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
goto errout;
if (event->attach_state & PERF_ATTACH_TASK_DATA) {
- task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+ task_ctx_data = alloc_task_ctx_data(pmu);
if (!task_ctx_data) {
err = -ENOMEM;
goto errout;
@@ -4528,11 +4543,11 @@ retry:
}
}
- kfree(task_ctx_data);
+ free_task_ctx_data(pmu, task_ctx_data);
return ctx;
errout:
- kfree(task_ctx_data);
+ free_task_ctx_data(pmu, task_ctx_data);
return ERR_PTR(err);
}
@@ -4575,7 +4590,7 @@ static bool is_sb_event(struct perf_event *event)
if (attr->mmap || attr->mmap_data || attr->mmap2 ||
attr->comm || attr->comm_exec ||
attr->task || attr->ksymbol ||
- attr->context_switch ||
+ attr->context_switch || attr->text_poke ||
attr->bpf_event)
return true;
return false;
@@ -4651,6 +4666,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_ksymbol_events);
if (event->attr.bpf_event)
atomic_dec(&nr_bpf_events);
+ if (event->attr.text_poke)
+ atomic_dec(&nr_text_poke_events);
if (dec) {
if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -8628,6 +8645,89 @@ void perf_event_bpf_event(struct bpf_prog *prog,
perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
}
+struct perf_text_poke_event {
+ const void *old_bytes;
+ const void *new_bytes;
+ size_t pad;
+ u16 old_len;
+ u16 new_len;
+
+ struct {
+ struct perf_event_header header;
+
+ u64 addr;
+ } event_id;
+};
+
+static int perf_event_text_poke_match(struct perf_event *event)
+{
+ return event->attr.text_poke;
+}
+
+static void perf_event_text_poke_output(struct perf_event *event, void *data)
+{
+ struct perf_text_poke_event *text_poke_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ u64 padding = 0;
+ int ret;
+
+ if (!perf_event_text_poke_match(event))
+ return;
+
+ perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, text_poke_event->event_id);
+ perf_output_put(&handle, text_poke_event->old_len);
+ perf_output_put(&handle, text_poke_event->new_len);
+
+ __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
+ __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
+
+ if (text_poke_event->pad)
+ __output_copy(&handle, &padding, text_poke_event->pad);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+void perf_event_text_poke(const void *addr, const void *old_bytes,
+ size_t old_len, const void *new_bytes, size_t new_len)
+{
+ struct perf_text_poke_event text_poke_event;
+ size_t tot, pad;
+
+ if (!atomic_read(&nr_text_poke_events))
+ return;
+
+ tot = sizeof(text_poke_event.old_len) + old_len;
+ tot += sizeof(text_poke_event.new_len) + new_len;
+ pad = ALIGN(tot, sizeof(u64)) - tot;
+
+ text_poke_event = (struct perf_text_poke_event){
+ .old_bytes = old_bytes,
+ .new_bytes = new_bytes,
+ .pad = pad,
+ .old_len = old_len,
+ .new_len = new_len,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_TEXT_POKE,
+ .misc = PERF_RECORD_MISC_KERNEL,
+ .size = sizeof(text_poke_event.event_id) + tot + pad,
+ },
+ .addr = (unsigned long)addr,
+ },
+ };
+
+ perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
+}
+
void perf_event_itrace_started(struct perf_event *event)
{
event->attach_state |= PERF_ATTACH_ITRACE;
@@ -10945,6 +11045,8 @@ static void account_event(struct perf_event *event)
atomic_inc(&nr_ksymbol_events);
if (event->attr.bpf_event)
atomic_inc(&nr_bpf_events);
+ if (event->attr.text_poke)
+ atomic_inc(&nr_text_poke_events);
if (inc) {
/*
@@ -11483,7 +11585,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event *group_leader = NULL, *output_event = NULL;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx, *uninitialized_var(gctx);
+ struct perf_event_context *ctx, *gctx;
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
@@ -12409,8 +12511,7 @@ inherit_event(struct perf_event *parent_event,
!child_ctx->task_ctx_data) {
struct pmu *pmu = child_event->pmu;
- child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
- GFP_KERNEL);
+ child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
if (!child_ctx->task_ctx_data) {
free_event(child_event);
return ERR_PTR(-ENOMEM);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 5f8b0c52fd2e..25de10c904e6 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2189,7 +2189,7 @@ static void handle_swbp(struct pt_regs *regs)
{
struct uprobe *uprobe;
unsigned long bp_vaddr;
- int uninitialized_var(is_swbp);
+ int is_swbp;
bp_vaddr = uprobe_get_swbp_addr(regs);
if (bp_vaddr == get_trampoline_vaddr())
diff --git a/kernel/exit.c b/kernel/exit.c
index 727150f28103..e731c414e024 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -93,7 +93,7 @@ static void __exit_signal(struct task_struct *tsk)
struct signal_struct *sig = tsk->signal;
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
- struct tty_struct *uninitialized_var(tty);
+ struct tty_struct *tty;
u64 utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
@@ -217,6 +217,7 @@ repeat:
}
write_unlock_irq(&tasklist_lock);
+ seccomp_filter_release(p);
proc_flush_pid(thread_pid);
put_pid(thread_pid);
release_thread(p);
@@ -804,7 +805,6 @@ void __noreturn do_exit(long code)
exit_task_namespaces(tsk);
exit_task_work(tsk);
exit_thread(tsk);
- exit_umh(tsk);
/*
* Flush inherited counters to the parent - before the parent
@@ -1711,6 +1711,30 @@ Efault:
}
#endif
+/**
+ * thread_group_exited - check that a thread group has exited
+ * @pid: tgid of thread group to be checked.
+ *
+ * Test if the thread group represented by tgid has exited (all
+ * threads are zombies, dead or completely gone).
+ *
+ * Return: true if the thread group has exited. false otherwise.
+ */
+bool thread_group_exited(struct pid *pid)
+{
+ struct task_struct *task;
+ bool exited;
+
+ rcu_read_lock();
+ task = pid_task(pid, PIDTYPE_PID);
+ exited = !task ||
+ (READ_ONCE(task->exit_state) && thread_group_empty(task));
+ rcu_read_unlock();
+
+ return exited;
+}
+EXPORT_SYMBOL(thread_group_exited);
+
__weak void abort(void)
{
BUG();
diff --git a/kernel/fork.c b/kernel/fork.c
index efc5493203ae..76d3f3387554 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -359,7 +359,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (new) {
- *new = *orig;
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ /*
+ * orig->shared.rb may be modified concurrently, but the clone
+ * will be reinitialized.
+ */
+ *new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
new->vm_next = new->vm_prev = NULL;
}
@@ -473,7 +479,6 @@ void free_task(struct task_struct *tsk)
#endif
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
- put_seccomp_filter(tsk);
arch_release_task_struct(tsk);
if (tsk->flags & PF_KTHREAD)
free_kthread_struct(tsk);
@@ -1474,7 +1479,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
goto out;
}
- newf = dup_fd(oldf, &error);
+ newf = dup_fd(oldf, NR_OPEN_MAX, &error);
if (!newf)
goto out;
@@ -1787,22 +1792,18 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
*/
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
- struct task_struct *task;
struct pid *pid = file->private_data;
__poll_t poll_flags = 0;
poll_wait(file, &pid->wait_pidfd, pts);
- rcu_read_lock();
- task = pid_task(pid, PIDTYPE_PID);
/*
* Inform pollers only when the whole thread group exits.
* If the thread group leader exits before all other threads in the
* group, then poll(2) should block, similar to the wait(2) family.
*/
- if (!task || (task->exit_state && thread_group_empty(task)))
+ if (thread_group_exited(pid))
poll_flags = EPOLLIN | EPOLLRDNORM;
- rcu_read_unlock();
return poll_flags;
}
@@ -1954,8 +1955,8 @@ static __latent_entropy struct task_struct *copy_process(
rt_mutex_init_task(p);
+ lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
- DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = -EAGAIN;
@@ -2035,19 +2036,11 @@ static __latent_entropy struct task_struct *copy_process(
seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
- p->irq_events = 0;
- p->hardirqs_enabled = 0;
- p->hardirq_enable_ip = 0;
- p->hardirq_enable_event = 0;
- p->hardirq_disable_ip = _THIS_IP_;
- p->hardirq_disable_event = 0;
- p->softirqs_enabled = 1;
- p->softirq_enable_ip = _THIS_IP_;
- p->softirq_enable_event = 0;
- p->softirq_disable_ip = 0;
- p->softirq_disable_event = 0;
- p->hardirq_context = 0;
- p->softirq_context = 0;
+ memset(&p->irqtrace, 0, sizeof(p->irqtrace));
+ p->irqtrace.hardirq_disable_ip = _THIS_IP_;
+ p->irqtrace.softirq_enable_ip = _THIS_IP_;
+ p->softirqs_enabled = 1;
+ p->softirq_context = 0;
#endif
p->pagefault_disabled = 0;
@@ -2104,8 +2097,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
- args->tls);
+ retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
if (retval)
goto bad_fork_cleanup_io;
@@ -2304,6 +2296,7 @@ static __latent_entropy struct task_struct *copy_process(
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
+ sched_post_fork(p);
cgroup_post_fork(p, args);
perf_event_fork(p);
@@ -2423,6 +2416,20 @@ long _do_fork(struct kernel_clone_args *args)
long nr;
/*
+ * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
+ * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
+ * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
+ * field in struct clone_args and it still doesn't make sense to have
+ * them both point at the same memory location. Performing this check
+ * here has the advantage that we don't need to have a separate helper
+ * to check for legacy clone().
+ */
+ if ((args->flags & CLONE_PIDFD) &&
+ (args->flags & CLONE_PARENT_SETTID) &&
+ (args->pidfd == args->parent_tid))
+ return -EINVAL;
+
+ /*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
* requested, no event is reported; otherwise, report if the event
@@ -2479,42 +2486,6 @@ long _do_fork(struct kernel_clone_args *args)
return nr;
}
-bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
-{
- /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
- if ((kargs->flags & CLONE_PIDFD) &&
- (kargs->flags & CLONE_PARENT_SETTID))
- return false;
-
- return true;
-}
-
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
-/* For compatibility with architectures that call do_fork directly rather than
- * using the syscall entry points below. */
-long do_fork(unsigned long clone_flags,
- unsigned long stack_start,
- unsigned long stack_size,
- int __user *parent_tidptr,
- int __user *child_tidptr)
-{
- struct kernel_clone_args args = {
- .flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
- .pidfd = parent_tidptr,
- .child_tid = child_tidptr,
- .parent_tid = parent_tidptr,
- .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
- .stack = stack_start,
- .stack_size = stack_size,
- };
-
- if (!legacy_clone_args_valid(&args))
- return -EINVAL;
-
- return _do_fork(&args);
-}
-#endif
-
/*
* Create a kernel thread.
*/
@@ -2593,24 +2564,12 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
.tls = tls,
};
- if (!legacy_clone_args_valid(&args))
- return -EINVAL;
-
return _do_fork(&args);
}
#endif
#ifdef __ARCH_WANT_SYS_CLONE3
-/*
- * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from
- * the registers containing the syscall arguments for clone. This doesn't work
- * with clone3 since the TLS value is passed in clone_args instead.
- */
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
-#error clone3 requires copy_thread_tls support in arch
-#endif
-
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t usize)
@@ -2907,14 +2866,15 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
/*
* Unshare file descriptor table if it is being shared
*/
-static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
+ struct files_struct **new_fdp)
{
struct files_struct *fd = current->files;
int error = 0;
if ((unshare_flags & CLONE_FILES) &&
(fd && atomic_read(&fd->count) > 1)) {
- *new_fdp = dup_fd(fd, &error);
+ *new_fdp = dup_fd(fd, max_fds, &error);
if (!*new_fdp)
return error;
}
@@ -2925,7 +2885,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
/*
* unshare allows a process to 'unshare' part of the process
* context which was originally shared using clone. copy_*
- * functions used by do_fork() cannot be used here directly
+ * functions used by _do_fork() cannot be used here directly
* because they modify an inactive task_struct that is being
* constructed. Here we are modifying the current, active,
* task_struct.
@@ -2974,7 +2934,7 @@ int ksys_unshare(unsigned long unshare_flags)
err = unshare_fs(unshare_flags, &new_fs);
if (err)
goto bad_unshare_out;
- err = unshare_fd(unshare_flags, &new_fd);
+ err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
if (err)
goto bad_unshare_cleanup_fs;
err = unshare_userns(unshare_flags, &new_cred);
@@ -3063,7 +3023,7 @@ int unshare_files(struct files_struct **displaced)
struct files_struct *copy = NULL;
int error;
- error = unshare_fd(CLONE_FILES, &copy);
+ error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
if (error || !copy) {
*displaced = NULL;
return error;
diff --git a/kernel/futex.c b/kernel/futex.c
index e646661f6282..83404124b77b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -32,30 +32,13 @@
* "But they come in a choice of three flavours!"
*/
#include <linux/compat.h>
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fs.h>
-#include <linux/file.h>
#include <linux/jhash.h>
-#include <linux/init.h>
-#include <linux/futex.h>
-#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
-#include <linux/signal.h>
-#include <linux/export.h>
-#include <linux/magic.h>
-#include <linux/pid.h>
-#include <linux/nsproxy.h>
-#include <linux/ptrace.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/wake_q.h>
-#include <linux/sched/mm.h>
#include <linux/hugetlb.h>
#include <linux/freezer.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
-#include <linux/refcount.h>
#include <asm/futex.h>
@@ -476,7 +459,7 @@ static u64 get_inode_sequence_number(struct inode *inode)
/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
- * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
* @key: address where result is stored.
* @rw: mapping needs to be read/write (values: FUTEX_READ,
* FUTEX_WRITE)
@@ -500,8 +483,8 @@ static u64 get_inode_sequence_number(struct inode *inode)
*
* lock_page() might sleep, the caller should not hold a spinlock.
*/
-static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
+static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+ enum futex_access rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -538,7 +521,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
again:
/* Ignore any VERIFY_READ mapping (futex common case) */
- if (unlikely(should_fail_futex(fshared)))
+ if (unlikely(should_fail_futex(true)))
return -EFAULT;
err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
@@ -626,7 +609,7 @@ again:
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
*/
- if (unlikely(should_fail_futex(fshared)) || ro) {
+ if (unlikely(should_fail_futex(true)) || ro) {
err = -EFAULT;
goto out;
}
@@ -677,10 +660,6 @@ out:
return err;
}
-static inline void put_futex_key(union futex_key *key)
-{
-}
-
/**
* fault_in_user_writeable() - Fault in user address and verify RW access
* @uaddr: pointer to faulting user space address
@@ -1326,7 +1305,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
int err;
- u32 uninitialized_var(curval);
+ u32 curval;
if (unlikely(should_fail_futex(true)))
return -EFAULT;
@@ -1496,7 +1475,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
*/
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
- u32 uninitialized_var(curval), newval;
+ u32 curval, newval;
struct task_struct *new_owner;
bool postunlock = false;
DEFINE_WAKE_Q(wake_q);
@@ -1611,13 +1590,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
if (unlikely(ret != 0))
- goto out;
+ return ret;
hb = hash_futex(&key);
/* Make sure we really have tasks to wakeup */
if (!hb_waiters_pending(hb))
- goto out_put_key;
+ return ret;
spin_lock(&hb->lock);
@@ -1640,9 +1619,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
spin_unlock(&hb->lock);
wake_up_q(&wake_q);
-out_put_key:
- put_futex_key(&key);
-out:
return ret;
}
@@ -1709,10 +1685,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
retry:
ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
if (unlikely(ret != 0))
- goto out;
+ return ret;
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
if (unlikely(ret != 0))
- goto out_put_key1;
+ return ret;
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
@@ -1730,13 +1706,13 @@ retry_private:
* an MMU, but we might get them from range checking
*/
ret = op_ret;
- goto out_put_keys;
+ return ret;
}
if (op_ret == -EFAULT) {
ret = fault_in_user_writeable(uaddr2);
if (ret)
- goto out_put_keys;
+ return ret;
}
if (!(flags & FLAGS_SHARED)) {
@@ -1744,8 +1720,6 @@ retry_private:
goto retry_private;
}
- put_futex_key(&key2);
- put_futex_key(&key1);
cond_resched();
goto retry;
}
@@ -1781,11 +1755,6 @@ retry_private:
out_unlock:
double_unlock_hb(hb1, hb2);
wake_up_q(&wake_q);
-out_put_keys:
- put_futex_key(&key2);
-out_put_key1:
- put_futex_key(&key1);
-out:
return ret;
}
@@ -1992,20 +1961,18 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
retry:
ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
if (unlikely(ret != 0))
- goto out;
+ return ret;
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
requeue_pi ? FUTEX_WRITE : FUTEX_READ);
if (unlikely(ret != 0))
- goto out_put_key1;
+ return ret;
/*
* The check above which compares uaddrs is not sufficient for
* shared futexes. We need to compare the keys:
*/
- if (requeue_pi && match_futex(&key1, &key2)) {
- ret = -EINVAL;
- goto out_put_keys;
- }
+ if (requeue_pi && match_futex(&key1, &key2))
+ return -EINVAL;
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
@@ -2025,13 +1992,11 @@ retry_private:
ret = get_user(curval, uaddr1);
if (ret)
- goto out_put_keys;
+ return ret;
if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(&key2);
- put_futex_key(&key1);
goto retry;
}
if (curval != *cmpval) {
@@ -2090,12 +2055,10 @@ retry_private:
case -EFAULT:
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
- put_futex_key(&key2);
- put_futex_key(&key1);
ret = fault_in_user_writeable(uaddr2);
if (!ret)
goto retry;
- goto out;
+ return ret;
case -EBUSY:
case -EAGAIN:
/*
@@ -2106,8 +2069,6 @@ retry_private:
*/
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
- put_futex_key(&key2);
- put_futex_key(&key1);
/*
* Handle the case where the owner is in the middle of
* exiting. Wait for the exit to complete otherwise
@@ -2216,12 +2177,6 @@ out_unlock:
double_unlock_hb(hb1, hb2);
wake_up_q(&wake_q);
hb_waiters_dec(hb2);
-
-out_put_keys:
- put_futex_key(&key2);
-out_put_key1:
- put_futex_key(&key1);
-out:
return ret ? ret : task_count;
}
@@ -2370,7 +2325,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
struct task_struct *argowner)
{
struct futex_pi_state *pi_state = q->pi_state;
- u32 uval, uninitialized_var(curval), newval;
+ u32 uval, curval, newval;
struct task_struct *oldowner, *newowner;
u32 newtid;
int ret, err = 0;
@@ -2567,7 +2522,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
*/
if (q->pi_state->owner != current)
ret = fixup_pi_state_owner(uaddr, q, current);
- goto out;
+ return ret ? ret : locked;
}
/*
@@ -2580,7 +2535,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
*/
if (q->pi_state->owner == current) {
ret = fixup_pi_state_owner(uaddr, q, NULL);
- goto out;
+ return ret;
}
/*
@@ -2594,8 +2549,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
q->pi_state->owner);
}
-out:
- return ret ? ret : locked;
+ return ret;
}
/**
@@ -2692,12 +2646,11 @@ retry_private:
ret = get_user(uval, uaddr);
if (ret)
- goto out;
+ return ret;
if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(&q->key);
goto retry;
}
@@ -2706,9 +2659,6 @@ retry_private:
ret = -EWOULDBLOCK;
}
-out:
- if (ret)
- put_futex_key(&q->key);
return ret;
}
@@ -2853,7 +2803,6 @@ retry_private:
* - EAGAIN: The user space value changed.
*/
queue_unlock(hb);
- put_futex_key(&q.key);
/*
* Handle the case where the owner is in the middle of
* exiting. Wait for the exit to complete otherwise
@@ -2961,13 +2910,11 @@ no_block:
put_pi_state(pi_state);
}
- goto out_put_key;
+ goto out;
out_unlock_put_key:
queue_unlock(hb);
-out_put_key:
- put_futex_key(&q.key);
out:
if (to) {
hrtimer_cancel(&to->timer);
@@ -2980,12 +2927,11 @@ uaddr_faulted:
ret = fault_in_user_writeable(uaddr);
if (ret)
- goto out_put_key;
+ goto out;
if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(&q.key);
goto retry;
}
@@ -2996,7 +2942,7 @@ uaddr_faulted:
*/
static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
- u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
+ u32 curval, uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb;
struct futex_q *top_waiter;
@@ -3114,16 +3060,13 @@ retry:
out_unlock:
spin_unlock(&hb->lock);
out_putkey:
- put_futex_key(&key);
return ret;
pi_retry:
- put_futex_key(&key);
cond_resched();
goto retry;
pi_faulted:
- put_futex_key(&key);
ret = fault_in_user_writeable(uaddr);
if (!ret)
@@ -3265,7 +3208,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret)
- goto out_key2;
+ goto out;
/*
* The check above which compares uaddrs is not sufficient for
@@ -3274,7 +3217,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (match_futex(&q.key, &key2)) {
queue_unlock(hb);
ret = -EINVAL;
- goto out_put_keys;
+ goto out;
}
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
@@ -3284,7 +3227,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
spin_unlock(&hb->lock);
if (ret)
- goto out_put_keys;
+ goto out;
/*
* In order for us to be here, we know our q.key == key2, and since
@@ -3374,11 +3317,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
ret = -EWOULDBLOCK;
}
-out_put_keys:
- put_futex_key(&q.key);
-out_key2:
- put_futex_key(&key2);
-
out:
if (to) {
hrtimer_cancel(&to->timer);
@@ -3479,7 +3417,7 @@ err_unlock:
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
bool pi, bool pending_op)
{
- u32 uval, uninitialized_var(nval), mval;
+ u32 uval, nval, mval;
int err;
/* Futex address must be 32bit aligned */
@@ -3609,7 +3547,7 @@ static void exit_robust_list(struct task_struct *curr)
struct robust_list_head __user *head = curr->robust_list;
struct robust_list __user *entry, *next_entry, *pending;
unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int uninitialized_var(next_pi);
+ unsigned int next_pi;
unsigned long futex_offset;
int rc;
@@ -3909,7 +3847,7 @@ static void compat_exit_robust_list(struct task_struct *curr)
struct compat_robust_list_head __user *head = curr->compat_robust_list;
struct robust_list __user *entry, *next_entry, *pending;
unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int uninitialized_var(next_pi);
+ unsigned int next_pi;
compat_uptr_t uentry, next_uentry, upending;
compat_long_t futex_offset;
int rc;
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 20512252ecc9..10a5aff4eecc 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -51,10 +51,6 @@ config GENERIC_IRQ_INJECTION
config HARDIRQS_SW_RESEND
bool
-# Preflow handler support for fasteoi (sparc64)
-config IRQ_PREFLOW_FASTEOI
- bool
-
# Edge style eoi based handler (cell)
config IRQ_EDGE_EOI_HANDLER
bool
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 41e7e37a0928..857f5f4c8098 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -656,16 +656,6 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(handle_level_irq);
-#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
-static inline void preflow_handler(struct irq_desc *desc)
-{
- if (desc->preflow_handler)
- desc->preflow_handler(&desc->irq_data);
-}
-#else
-static inline void preflow_handler(struct irq_desc *desc) { }
-#endif
-
static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
{
if (!(desc->istate & IRQS_ONESHOT)) {
@@ -721,7 +711,6 @@ void handle_fasteoi_irq(struct irq_desc *desc)
if (desc->istate & IRQS_ONESHOT)
mask_irq(desc);
- preflow_handler(desc);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
@@ -1231,7 +1220,6 @@ void handle_fasteoi_ack_irq(struct irq_desc *desc)
/* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);
- preflow_handler(desc);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
@@ -1281,7 +1269,6 @@ void handle_fasteoi_mask_irq(struct irq_desc *desc)
if (desc->istate & IRQS_ONESHOT)
mask_irq(desc);
- preflow_handler(desc);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
@@ -1478,6 +1465,7 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
return 0;
}
+EXPORT_SYMBOL_GPL(irq_chip_retrigger_hierarchy);
/**
* irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
@@ -1492,7 +1480,7 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
return -ENOSYS;
}
-
+EXPORT_SYMBOL_GPL(irq_chip_set_vcpu_affinity_parent);
/**
* irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
* @data: Pointer to interrupt specific data
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 4f9f844074db..b95ff5d5f4bd 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -112,6 +112,7 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_AFFINITY_SET),
BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
+ BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE),
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
BIT_MASK_DESCR(IRQD_CAN_RESERVE),
BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
@@ -120,6 +121,10 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_WAKEUP_STATE),
BIT_MASK_DESCR(IRQD_WAKEUP_ARMED),
+
+ BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET),
+
+ BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX),
};
static const struct irq_bit_descr irqdesc_states[] = {
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index a4c2c915511d..76cd7ebd1178 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -142,7 +142,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
if (!domain)
return NULL;
- if (fwnode && is_fwnode_irqchip(fwnode)) {
+ if (is_fwnode_irqchip(fwnode)) {
fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
switch (fwid->type) {
@@ -281,6 +281,7 @@ void irq_domain_update_bus_token(struct irq_domain *domain,
mutex_unlock(&irq_domain_mutex);
}
+EXPORT_SYMBOL_GPL(irq_domain_update_bus_token);
/**
* irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2a9fec53e159..48c38e09c673 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -320,12 +320,16 @@ static bool irq_set_affinity_deactivated(struct irq_data *data,
struct irq_desc *desc = irq_data_to_desc(data);
/*
+ * Handle irq chips which can handle affinity only in activated
+ * state correctly
+ *
* If the interrupt is not yet activated, just store the affinity
* mask and do not call the chip driver at all. On activation the
* driver has to make sure anyway that the interrupt is in a
* useable state so startup works.
*/
- if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data))
+ if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) ||
+ irqd_is_activated(data) || !irqd_affinity_on_activate(data))
return false;
cpumask_copy(desc->irq_common_data.affinity, mask);
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 27634f4022d0..c48ce19a257f 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -45,7 +45,7 @@ static void resend_irqs(unsigned long arg)
}
/* Tasklet to handle resend: */
-static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
+static DECLARE_TASKLET_OLD(resend_tasklet, resend_irqs);
static int irq_sw_resend(struct irq_desc *desc)
{
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index bb14e64f62a4..95cb74f73292 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,6 +24,7 @@
#include <linux/slab.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
#include <linux/compiler.h>
/*
@@ -437,6 +438,7 @@ struct kallsym_iter {
loff_t pos_arch_end;
loff_t pos_mod_end;
loff_t pos_ftrace_mod_end;
+ loff_t pos_bpf_end;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
@@ -480,6 +482,11 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
return 1;
}
+/*
+ * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace
+ * purposes. In that case "__builtin__ftrace" is used as a module name, even
+ * though "__builtin__ftrace" is not a module.
+ */
static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
{
int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
@@ -496,11 +503,33 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
static int get_ksymbol_bpf(struct kallsym_iter *iter)
{
+ int ret;
+
strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN);
iter->exported = 0;
- return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
- &iter->value, &iter->type,
- iter->name) < 0 ? 0 : 1;
+ ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
+ &iter->value, &iter->type,
+ iter->name);
+ if (ret < 0) {
+ iter->pos_bpf_end = iter->pos;
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * This uses "__builtin__kprobes" as a module name for symbols for pages
+ * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a
+ * module.
+ */
+static int get_ksymbol_kprobe(struct kallsym_iter *iter)
+{
+ strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN);
+ iter->exported = 0;
+ return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end,
+ &iter->value, &iter->type,
+ iter->name) < 0 ? 0 : 1;
}
/* Returns space to next name. */
@@ -527,6 +556,7 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
iter->pos_arch_end = 0;
iter->pos_mod_end = 0;
iter->pos_ftrace_mod_end = 0;
+ iter->pos_bpf_end = 0;
}
}
@@ -551,7 +581,11 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
get_ksymbol_ftrace_mod(iter))
return 1;
- return get_ksymbol_bpf(iter);
+ if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) &&
+ get_ksymbol_bpf(iter))
+ return 1;
+
+ return get_ksymbol_kprobe(iter);
}
/* Returns false if pos at or past end of file. */
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index d4999b38d1be..65ca5539c470 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -7,8 +7,11 @@ CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
-CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \
- $(call cc-option,-fno-stack-protector,)
+CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
+ -fno-stack-protector -DDISABLE_BRANCH_PROFILING
obj-y := core.o debugfs.o report.o
-obj-$(CONFIG_KCSAN_SELFTEST) += test.o
+obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
+
+CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
+obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o
diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h
index be9e625227f3..75fe701f4127 100644
--- a/kernel/kcsan/atomic.h
+++ b/kernel/kcsan/atomic.h
@@ -3,8 +3,7 @@
#ifndef _KERNEL_KCSAN_ATOMIC_H
#define _KERNEL_KCSAN_ATOMIC_H
-#include <linux/jiffies.h>
-#include <linux/sched.h>
+#include <linux/types.h>
/*
* Special rules for certain memory where concurrent conflicting accesses are
@@ -13,8 +12,7 @@
*/
static bool kcsan_is_atomic_special(const volatile void *ptr)
{
- /* volatile globals that have been observed in data races. */
- return ptr == &jiffies || ptr == &current->state;
+ return false;
}
#endif /* _KERNEL_KCSAN_ATOMIC_H */
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 15f67949d11e..9147ff6a12e5 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -291,6 +291,20 @@ static inline unsigned int get_delay(void)
0);
}
+void kcsan_save_irqtrace(struct task_struct *task)
+{
+#ifdef CONFIG_TRACE_IRQFLAGS
+ task->kcsan_save_irqtrace = task->irqtrace;
+#endif
+}
+
+void kcsan_restore_irqtrace(struct task_struct *task)
+{
+#ifdef CONFIG_TRACE_IRQFLAGS
+ task->irqtrace = task->kcsan_save_irqtrace;
+#endif
+}
+
/*
* Pull everything together: check_access() below contains the performance
* critical operations; the fast-path (including check_access) functions should
@@ -336,9 +350,11 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
flags = user_access_save();
if (consumed) {
+ kcsan_save_irqtrace(current);
kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE,
KCSAN_REPORT_CONSUMED_WATCHPOINT,
watchpoint - watchpoints);
+ kcsan_restore_irqtrace(current);
} else {
/*
* The other thread may not print any diagnostics, as it has
@@ -396,9 +412,14 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
goto out;
}
+ /*
+ * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's
+ * runtime is entered for every memory access, and potentially useful
+ * information is lost if dirtied by KCSAN.
+ */
+ kcsan_save_irqtrace(current);
if (!kcsan_interrupt_watcher)
- /* Use raw to avoid lockdep recursion via IRQ flags tracing. */
- raw_local_irq_save(irq_flags);
+ local_irq_save(irq_flags);
watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write);
if (watchpoint == NULL) {
@@ -539,7 +560,8 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS);
out_unlock:
if (!kcsan_interrupt_watcher)
- raw_local_irq_restore(irq_flags);
+ local_irq_restore(irq_flags);
+ kcsan_restore_irqtrace(current);
out:
user_access_restore(ua_flags);
}
@@ -754,6 +776,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
*/
#define DEFINE_TSAN_READ_WRITE(size) \
+ void __tsan_read##size(void *ptr); \
void __tsan_read##size(void *ptr) \
{ \
check_access(ptr, size, 0); \
@@ -762,6 +785,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
void __tsan_unaligned_read##size(void *ptr) \
__alias(__tsan_read##size); \
EXPORT_SYMBOL(__tsan_unaligned_read##size); \
+ void __tsan_write##size(void *ptr); \
void __tsan_write##size(void *ptr) \
{ \
check_access(ptr, size, KCSAN_ACCESS_WRITE); \
@@ -777,12 +801,14 @@ DEFINE_TSAN_READ_WRITE(4);
DEFINE_TSAN_READ_WRITE(8);
DEFINE_TSAN_READ_WRITE(16);
+void __tsan_read_range(void *ptr, size_t size);
void __tsan_read_range(void *ptr, size_t size)
{
check_access(ptr, size, 0);
}
EXPORT_SYMBOL(__tsan_read_range);
+void __tsan_write_range(void *ptr, size_t size);
void __tsan_write_range(void *ptr, size_t size)
{
check_access(ptr, size, KCSAN_ACCESS_WRITE);
@@ -799,6 +825,7 @@ EXPORT_SYMBOL(__tsan_write_range);
* the size-check of compiletime_assert_rwonce_type().
*/
#define DEFINE_TSAN_VOLATILE_READ_WRITE(size) \
+ void __tsan_volatile_read##size(void *ptr); \
void __tsan_volatile_read##size(void *ptr) \
{ \
const bool is_atomic = size <= sizeof(long long) && \
@@ -811,6 +838,7 @@ EXPORT_SYMBOL(__tsan_write_range);
void __tsan_unaligned_volatile_read##size(void *ptr) \
__alias(__tsan_volatile_read##size); \
EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size); \
+ void __tsan_volatile_write##size(void *ptr); \
void __tsan_volatile_write##size(void *ptr) \
{ \
const bool is_atomic = size <= sizeof(long long) && \
@@ -836,14 +864,17 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(16);
* The below are not required by KCSAN, but can still be emitted by the
* compiler.
*/
+void __tsan_func_entry(void *call_pc);
void __tsan_func_entry(void *call_pc)
{
}
EXPORT_SYMBOL(__tsan_func_entry);
+void __tsan_func_exit(void);
void __tsan_func_exit(void)
{
}
EXPORT_SYMBOL(__tsan_func_exit);
+void __tsan_init(void);
void __tsan_init(void)
{
}
diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c
new file mode 100644
index 000000000000..fed6fcb5768c
--- /dev/null
+++ b/kernel/kcsan/kcsan-test.c
@@ -0,0 +1,1107 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN test with various race scenarious to test runtime behaviour. Since the
+ * interface with which KCSAN's reports are obtained is via the console, this is
+ * the output we should verify. For each test case checks the presence (or
+ * absence) of generated reports. Relies on 'console' tracepoint to capture
+ * reports as they appear in the kernel log.
+ *
+ * Makes use of KUnit for test organization, and the Torture framework for test
+ * thread control.
+ *
+ * Copyright (C) 2020, Google LLC.
+ * Author: Marco Elver <elver@google.com>
+ */
+
+#include <kunit/test.h>
+#include <linux/jiffies.h>
+#include <linux/kcsan-checks.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/timer.h>
+#include <linux/torture.h>
+#include <linux/tracepoint.h>
+#include <linux/types.h>
+#include <trace/events/printk.h>
+
+/* Points to current test-case memory access "kernels". */
+static void (*access_kernels[2])(void);
+
+static struct task_struct **threads; /* Lists of threads. */
+static unsigned long end_time; /* End time of test. */
+
+/* Report as observed from console. */
+static struct {
+ spinlock_t lock;
+ int nlines;
+ char lines[3][512];
+} observed = {
+ .lock = __SPIN_LOCK_UNLOCKED(observed.lock),
+};
+
+/* Setup test checking loop. */
+static __no_kcsan inline void
+begin_test_checks(void (*func1)(void), void (*func2)(void))
+{
+ kcsan_disable_current();
+
+ /*
+ * Require at least as long as KCSAN_REPORT_ONCE_IN_MS, to ensure at
+ * least one race is reported.
+ */
+ end_time = jiffies + msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS + 500);
+
+ /* Signal start; release potential initialization of shared data. */
+ smp_store_release(&access_kernels[0], func1);
+ smp_store_release(&access_kernels[1], func2);
+}
+
+/* End test checking loop. */
+static __no_kcsan inline bool
+end_test_checks(bool stop)
+{
+ if (!stop && time_before(jiffies, end_time)) {
+ /* Continue checking */
+ might_sleep();
+ return false;
+ }
+
+ kcsan_enable_current();
+ return true;
+}
+
+/*
+ * Probe for console output: checks if a race was reported, and obtains observed
+ * lines of interest.
+ */
+__no_kcsan
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+ unsigned long flags;
+ int nlines;
+
+ /*
+ * Note that KCSAN reports under a global lock, so we do not risk the
+ * possibility of having multiple reports interleaved. If that were the
+ * case, we'd expect tests to fail.
+ */
+
+ spin_lock_irqsave(&observed.lock, flags);
+ nlines = observed.nlines;
+
+ if (strnstr(buf, "BUG: KCSAN: ", len) && strnstr(buf, "test_", len)) {
+ /*
+ * KCSAN report and related to the test.
+ *
+ * The provided @buf is not NUL-terminated; copy no more than
+ * @len bytes and let strscpy() add the missing NUL-terminator.
+ */
+ strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0])));
+ nlines = 1;
+ } else if ((nlines == 1 || nlines == 2) && strnstr(buf, "bytes by", len)) {
+ strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0])));
+
+ if (strnstr(buf, "race at unknown origin", len)) {
+ if (WARN_ON(nlines != 2))
+ goto out;
+
+ /* No second line of interest. */
+ strcpy(observed.lines[nlines++], "<none>");
+ }
+ }
+
+out:
+ WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */
+ spin_unlock_irqrestore(&observed.lock, flags);
+}
+
+/* Check if a report related to the test exists. */
+__no_kcsan
+static bool report_available(void)
+{
+ return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines);
+}
+
+/* Report information we expect in a report. */
+struct expect_report {
+ /* Access information of both accesses. */
+ struct {
+ void *fn; /* Function pointer to expected function of top frame. */
+ void *addr; /* Address of access; unchecked if NULL. */
+ size_t size; /* Size of access; unchecked if @addr is NULL. */
+ int type; /* Access type, see KCSAN_ACCESS definitions. */
+ } access[2];
+};
+
+/* Check observed report matches information in @r. */
+__no_kcsan
+static bool report_matches(const struct expect_report *r)
+{
+ const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;
+ bool ret = false;
+ unsigned long flags;
+ typeof(observed.lines) expect;
+ const char *end;
+ char *cur;
+ int i;
+
+ /* Doubled-checked locking. */
+ if (!report_available())
+ return false;
+
+ /* Generate expected report contents. */
+
+ /* Title */
+ cur = expect[0];
+ end = &expect[0][sizeof(expect[0]) - 1];
+ cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ",
+ is_assert ? "assert: race" : "data-race");
+ if (r->access[1].fn) {
+ char tmp[2][64];
+ int cmp;
+
+ /* Expect lexographically sorted function names in title. */
+ scnprintf(tmp[0], sizeof(tmp[0]), "%pS", r->access[0].fn);
+ scnprintf(tmp[1], sizeof(tmp[1]), "%pS", r->access[1].fn);
+ cmp = strcmp(tmp[0], tmp[1]);
+ cur += scnprintf(cur, end - cur, "%ps / %ps",
+ cmp < 0 ? r->access[0].fn : r->access[1].fn,
+ cmp < 0 ? r->access[1].fn : r->access[0].fn);
+ } else {
+ scnprintf(cur, end - cur, "%pS", r->access[0].fn);
+ /* The exact offset won't match, remove it. */
+ cur = strchr(expect[0], '+');
+ if (cur)
+ *cur = '\0';
+ }
+
+ /* Access 1 */
+ cur = expect[1];
+ end = &expect[1][sizeof(expect[1]) - 1];
+ if (!r->access[1].fn)
+ cur += scnprintf(cur, end - cur, "race at unknown origin, with ");
+
+ /* Access 1 & 2 */
+ for (i = 0; i < 2; ++i) {
+ const char *const access_type =
+ (r->access[i].type & KCSAN_ACCESS_ASSERT) ?
+ ((r->access[i].type & KCSAN_ACCESS_WRITE) ?
+ "assert no accesses" :
+ "assert no writes") :
+ ((r->access[i].type & KCSAN_ACCESS_WRITE) ?
+ "write" :
+ "read");
+ const char *const access_type_aux =
+ (r->access[i].type & KCSAN_ACCESS_ATOMIC) ?
+ " (marked)" :
+ ((r->access[i].type & KCSAN_ACCESS_SCOPED) ?
+ " (scoped)" :
+ "");
+
+ if (i == 1) {
+ /* Access 2 */
+ cur = expect[2];
+ end = &expect[2][sizeof(expect[2]) - 1];
+
+ if (!r->access[1].fn) {
+ /* Dummy string if no second access is available. */
+ strcpy(cur, "<none>");
+ break;
+ }
+ }
+
+ cur += scnprintf(cur, end - cur, "%s%s to ", access_type,
+ access_type_aux);
+
+ if (r->access[i].addr) /* Address is optional. */
+ cur += scnprintf(cur, end - cur, "0x%px of %zu bytes",
+ r->access[i].addr, r->access[i].size);
+ }
+
+ spin_lock_irqsave(&observed.lock, flags);
+ if (!report_available())
+ goto out; /* A new report is being captured. */
+
+ /* Finally match expected output to what we actually observed. */
+ ret = strstr(observed.lines[0], expect[0]) &&
+ /* Access info may appear in any order. */
+ ((strstr(observed.lines[1], expect[1]) &&
+ strstr(observed.lines[2], expect[2])) ||
+ (strstr(observed.lines[1], expect[2]) &&
+ strstr(observed.lines[2], expect[1])));
+out:
+ spin_unlock_irqrestore(&observed.lock, flags);
+ return ret;
+}
+
+/* ===== Test kernels ===== */
+
+static long test_sink;
+static long test_var;
+/* @test_array should be large enough to fall into multiple watchpoint slots. */
+static long test_array[3 * PAGE_SIZE / sizeof(long)];
+static struct {
+ long val[8];
+} test_struct;
+static DEFINE_SEQLOCK(test_seqlock);
+
+/*
+ * Helper to avoid compiler optimizing out reads, and to generate source values
+ * for writes.
+ */
+__no_kcsan
+static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); }
+
+static noinline void test_kernel_read(void) { sink_value(test_var); }
+
+static noinline void test_kernel_write(void)
+{
+ test_var = READ_ONCE_NOCHECK(test_sink) + 1;
+}
+
+static noinline void test_kernel_write_nochange(void) { test_var = 42; }
+
+/* Suffixed by value-change exception filter. */
+static noinline void test_kernel_write_nochange_rcu(void) { test_var = 42; }
+
+static noinline void test_kernel_read_atomic(void)
+{
+ sink_value(READ_ONCE(test_var));
+}
+
+static noinline void test_kernel_write_atomic(void)
+{
+ WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1);
+}
+
+__no_kcsan
+static noinline void test_kernel_write_uninstrumented(void) { test_var++; }
+
+static noinline void test_kernel_data_race(void) { data_race(test_var++); }
+
+static noinline void test_kernel_assert_writer(void)
+{
+ ASSERT_EXCLUSIVE_WRITER(test_var);
+}
+
+static noinline void test_kernel_assert_access(void)
+{
+ ASSERT_EXCLUSIVE_ACCESS(test_var);
+}
+
+#define TEST_CHANGE_BITS 0xff00ff00
+
+static noinline void test_kernel_change_bits(void)
+{
+ if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) {
+ /*
+ * Avoid race of unknown origin for this test, just pretend they
+ * are atomic.
+ */
+ kcsan_nestable_atomic_begin();
+ test_var ^= TEST_CHANGE_BITS;
+ kcsan_nestable_atomic_end();
+ } else
+ WRITE_ONCE(test_var, READ_ONCE(test_var) ^ TEST_CHANGE_BITS);
+}
+
+static noinline void test_kernel_assert_bits_change(void)
+{
+ ASSERT_EXCLUSIVE_BITS(test_var, TEST_CHANGE_BITS);
+}
+
+static noinline void test_kernel_assert_bits_nochange(void)
+{
+ ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS);
+}
+
+/* To check that scoped assertions do trigger anywhere in scope. */
+static noinline void test_enter_scope(void)
+{
+ int x = 0;
+
+ /* Unrelated accesses to scoped assert. */
+ READ_ONCE(test_sink);
+ kcsan_check_read(&x, sizeof(x));
+}
+
+static noinline void test_kernel_assert_writer_scoped(void)
+{
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(test_var);
+ test_enter_scope();
+}
+
+static noinline void test_kernel_assert_access_scoped(void)
+{
+ ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_var);
+ test_enter_scope();
+}
+
+static noinline void test_kernel_rmw_array(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_array); ++i)
+ test_array[i]++;
+}
+
+static noinline void test_kernel_write_struct(void)
+{
+ kcsan_check_write(&test_struct, sizeof(test_struct));
+ kcsan_disable_current();
+ test_struct.val[3]++; /* induce value change */
+ kcsan_enable_current();
+}
+
+static noinline void test_kernel_write_struct_part(void)
+{
+ test_struct.val[3] = 42;
+}
+
+static noinline void test_kernel_read_struct_zero_size(void)
+{
+ kcsan_check_read(&test_struct.val[3], 0);
+}
+
+static noinline void test_kernel_jiffies_reader(void)
+{
+ sink_value((long)jiffies);
+}
+
+static noinline void test_kernel_seqlock_reader(void)
+{
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&test_seqlock);
+ sink_value(test_var);
+ } while (read_seqretry(&test_seqlock, seq));
+}
+
+static noinline void test_kernel_seqlock_writer(void)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&test_seqlock, flags);
+ test_var++;
+ write_sequnlock_irqrestore(&test_seqlock, flags);
+}
+
+/* ===== Test cases ===== */
+
+/* Simple test with normal data race. */
+__no_kcsan
+static void test_basic(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ static const struct expect_report never = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ bool match_expect = false;
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_write, test_kernel_read);
+ do {
+ match_expect |= report_matches(&expect);
+ match_never = report_matches(&never);
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/*
+ * Stress KCSAN with lots of concurrent races on different addresses until
+ * timeout.
+ */
+__no_kcsan
+static void test_concurrent_races(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ /* NULL will match any address. */
+ { test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE },
+ { test_kernel_rmw_array, NULL, 0, 0 },
+ },
+ };
+ static const struct expect_report never = {
+ .access = {
+ { test_kernel_rmw_array, NULL, 0, 0 },
+ { test_kernel_rmw_array, NULL, 0, 0 },
+ },
+ };
+ bool match_expect = false;
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_rmw_array, test_kernel_rmw_array);
+ do {
+ match_expect |= report_matches(&expect);
+ match_never |= report_matches(&never);
+ } while (!end_test_checks(false));
+ KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check matches exist. */
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test the KCSAN_REPORT_VALUE_CHANGE_ONLY option. */
+__no_kcsan
+static void test_novalue_change(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_write_nochange, test_kernel_read);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY))
+ KUNIT_EXPECT_FALSE(test, match_expect);
+ else
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that the rules where the KCSAN_REPORT_VALUE_CHANGE_ONLY option should
+ * never apply work.
+ */
+__no_kcsan
+static void test_novalue_change_exception(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Test that data races of unknown origin are reported. */
+__no_kcsan
+static void test_unknown_origin(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { NULL },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_write_uninstrumented, test_kernel_read);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN))
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ else
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+/* Test KCSAN_ASSUME_PLAIN_WRITES_ATOMIC if it is selected. */
+__no_kcsan
+static void test_write_write_assume_atomic(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_write, test_kernel_write);
+ do {
+ sink_value(READ_ONCE(test_var)); /* induce value-change */
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC))
+ KUNIT_EXPECT_FALSE(test, match_expect);
+ else
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that data races with writes larger than word-size are always reported,
+ * even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected.
+ */
+__no_kcsan
+static void test_write_write_struct(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_write_struct, test_kernel_write_struct);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that data races where only one write is larger than word-size are always
+ * reported, even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected.
+ */
+__no_kcsan
+static void test_write_write_struct_part(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_write_struct, test_kernel_write_struct_part);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Test that races with atomic accesses never result in reports. */
+__no_kcsan
+static void test_read_atomic_write_atomic(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_read_atomic, test_kernel_write_atomic);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test that a race with an atomic and plain access result in reports. */
+__no_kcsan
+static void test_read_plain_atomic_write(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC },
+ },
+ };
+ bool match_expect = false;
+
+ if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
+ return;
+
+ begin_test_checks(test_kernel_read, test_kernel_write_atomic);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Zero-sized accesses should never cause data race reports. */
+__no_kcsan
+static void test_zero_size_access(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ },
+ };
+ const struct expect_report never = {
+ .access = {
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ { test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 },
+ },
+ };
+ bool match_expect = false;
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_write_struct, test_kernel_read_struct_zero_size);
+ do {
+ match_expect |= report_matches(&expect);
+ match_never = report_matches(&never);
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check. */
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test the data_race() macro. */
+__no_kcsan
+static void test_data_race(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_data_race, test_kernel_data_race);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_writer(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_assert_writer, test_kernel_write_nochange);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_assert_access, test_kernel_read);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access_writer(struct kunit *test)
+{
+ const struct expect_report expect_access_writer = {
+ .access = {
+ { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+ { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ },
+ };
+ const struct expect_report expect_access_access = {
+ .access = {
+ { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+ { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+ },
+ };
+ const struct expect_report never = {
+ .access = {
+ { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ },
+ };
+ bool match_expect_access_writer = false;
+ bool match_expect_access_access = false;
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_assert_access, test_kernel_assert_writer);
+ do {
+ match_expect_access_writer |= report_matches(&expect_access_writer);
+ match_expect_access_access |= report_matches(&expect_access_access);
+ match_never |= report_matches(&never);
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_TRUE(test, match_expect_access_writer);
+ KUNIT_EXPECT_TRUE(test, match_expect_access_access);
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_bits_change(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ { test_kernel_change_bits, &test_var, sizeof(test_var),
+ KCSAN_ACCESS_WRITE | (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) ? 0 : KCSAN_ACCESS_ATOMIC) },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_assert_bits_change, test_kernel_change_bits);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_bits_nochange(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_assert_bits_nochange, test_kernel_change_bits);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_writer_scoped(struct kunit *test)
+{
+ const struct expect_report expect_start = {
+ .access = {
+ { test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ const struct expect_report expect_anywhere = {
+ .access = {
+ { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect_start = false;
+ bool match_expect_anywhere = false;
+
+ begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange);
+ do {
+ match_expect_start |= report_matches(&expect_start);
+ match_expect_anywhere |= report_matches(&expect_anywhere);
+ } while (!end_test_checks(match_expect_start && match_expect_anywhere));
+ KUNIT_EXPECT_TRUE(test, match_expect_start);
+ KUNIT_EXPECT_TRUE(test, match_expect_anywhere);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access_scoped(struct kunit *test)
+{
+ const struct expect_report expect_start1 = {
+ .access = {
+ { test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ const struct expect_report expect_start2 = {
+ .access = { expect_start1.access[0], expect_start1.access[0] },
+ };
+ const struct expect_report expect_inscope = {
+ .access = {
+ { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ bool match_expect_start = false;
+ bool match_expect_inscope = false;
+
+ begin_test_checks(test_kernel_assert_access_scoped, test_kernel_read);
+ end_time += msecs_to_jiffies(1000); /* This test requires a bit more time. */
+ do {
+ match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2);
+ match_expect_inscope |= report_matches(&expect_inscope);
+ } while (!end_test_checks(match_expect_start && match_expect_inscope));
+ KUNIT_EXPECT_TRUE(test, match_expect_start);
+ KUNIT_EXPECT_TRUE(test, match_expect_inscope);
+}
+
+/*
+ * jiffies is special (declared to be volatile) and its accesses are typically
+ * not marked; this test ensures that the compiler nor KCSAN gets confused about
+ * jiffies's declaration on different architectures.
+ */
+__no_kcsan
+static void test_jiffies_noreport(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_jiffies_reader, test_kernel_jiffies_reader);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test that racing accesses in seqlock critical sections are not reported. */
+__no_kcsan
+static void test_seqlock_noreport(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_seqlock_reader, test_kernel_seqlock_writer);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/*
+ * Each test case is run with different numbers of threads. Until KUnit supports
+ * passing arguments for each test case, we encode #threads in the test case
+ * name (read by get_num_threads()). [The '-' was chosen as a stylistic
+ * preference to separate test name and #threads.]
+ *
+ * The thread counts are chosen to cover potentially interesting boundaries and
+ * corner cases (range 2-5), and then stress the system with larger counts.
+ */
+#define KCSAN_KUNIT_CASE(test_name) \
+ { .run_case = test_name, .name = #test_name "-02" }, \
+ { .run_case = test_name, .name = #test_name "-03" }, \
+ { .run_case = test_name, .name = #test_name "-04" }, \
+ { .run_case = test_name, .name = #test_name "-05" }, \
+ { .run_case = test_name, .name = #test_name "-08" }, \
+ { .run_case = test_name, .name = #test_name "-16" }
+
+static struct kunit_case kcsan_test_cases[] = {
+ KCSAN_KUNIT_CASE(test_basic),
+ KCSAN_KUNIT_CASE(test_concurrent_races),
+ KCSAN_KUNIT_CASE(test_novalue_change),
+ KCSAN_KUNIT_CASE(test_novalue_change_exception),
+ KCSAN_KUNIT_CASE(test_unknown_origin),
+ KCSAN_KUNIT_CASE(test_write_write_assume_atomic),
+ KCSAN_KUNIT_CASE(test_write_write_struct),
+ KCSAN_KUNIT_CASE(test_write_write_struct_part),
+ KCSAN_KUNIT_CASE(test_read_atomic_write_atomic),
+ KCSAN_KUNIT_CASE(test_read_plain_atomic_write),
+ KCSAN_KUNIT_CASE(test_zero_size_access),
+ KCSAN_KUNIT_CASE(test_data_race),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_writer),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_access),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_bits_change),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped),
+ KCSAN_KUNIT_CASE(test_jiffies_noreport),
+ KCSAN_KUNIT_CASE(test_seqlock_noreport),
+ {},
+};
+
+/* ===== End test cases ===== */
+
+/* Get number of threads encoded in test name. */
+static bool __no_kcsan
+get_num_threads(const char *test, int *nthreads)
+{
+ int len = strlen(test);
+
+ if (WARN_ON(len < 3))
+ return false;
+
+ *nthreads = test[len - 1] - '0';
+ *nthreads += (test[len - 2] - '0') * 10;
+
+ if (WARN_ON(*nthreads < 0))
+ return false;
+
+ return true;
+}
+
+/* Concurrent accesses from interrupts. */
+__no_kcsan
+static void access_thread_timer(struct timer_list *timer)
+{
+ static atomic_t cnt = ATOMIC_INIT(0);
+ unsigned int idx;
+ void (*func)(void);
+
+ idx = (unsigned int)atomic_inc_return(&cnt) % ARRAY_SIZE(access_kernels);
+ /* Acquire potential initialization. */
+ func = smp_load_acquire(&access_kernels[idx]);
+ if (func)
+ func();
+}
+
+/* The main loop for each thread. */
+__no_kcsan
+static int access_thread(void *arg)
+{
+ struct timer_list timer;
+ unsigned int cnt = 0;
+ unsigned int idx;
+ void (*func)(void);
+
+ timer_setup_on_stack(&timer, access_thread_timer, 0);
+ do {
+ might_sleep();
+
+ if (!timer_pending(&timer))
+ mod_timer(&timer, jiffies + 1);
+ else {
+ /* Iterate through all kernels. */
+ idx = cnt++ % ARRAY_SIZE(access_kernels);
+ /* Acquire potential initialization. */
+ func = smp_load_acquire(&access_kernels[idx]);
+ if (func)
+ func();
+ }
+ } while (!torture_must_stop());
+ del_timer_sync(&timer);
+ destroy_timer_on_stack(&timer);
+
+ torture_kthread_stopping("access_thread");
+ return 0;
+}
+
+__no_kcsan
+static int test_init(struct kunit *test)
+{
+ unsigned long flags;
+ int nthreads;
+ int i;
+
+ spin_lock_irqsave(&observed.lock, flags);
+ for (i = 0; i < ARRAY_SIZE(observed.lines); ++i)
+ observed.lines[i][0] = '\0';
+ observed.nlines = 0;
+ spin_unlock_irqrestore(&observed.lock, flags);
+
+ if (!torture_init_begin((char *)test->name, 1))
+ return -EBUSY;
+
+ if (!get_num_threads(test->name, &nthreads))
+ goto err;
+
+ if (WARN_ON(threads))
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) {
+ if (WARN_ON(access_kernels[i]))
+ goto err;
+ }
+
+ if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
+ /*
+ * Without any preemption, keep 2 CPUs free for other tasks, one
+ * of which is the main test case function checking for
+ * completion or failure.
+ */
+ const int min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0;
+ const int min_required_cpus = 2 + min_unused_cpus;
+
+ if (num_online_cpus() < min_required_cpus) {
+ pr_err("%s: too few online CPUs (%u < %d) for test",
+ test->name, num_online_cpus(), min_required_cpus);
+ goto err;
+ } else if (nthreads > num_online_cpus() - min_unused_cpus) {
+ nthreads = num_online_cpus() - min_unused_cpus;
+ pr_warn("%s: limiting number of threads to %d\n",
+ test->name, nthreads);
+ }
+ }
+
+ if (nthreads) {
+ threads = kcalloc(nthreads + 1, sizeof(struct task_struct *),
+ GFP_KERNEL);
+ if (WARN_ON(!threads))
+ goto err;
+
+ threads[nthreads] = NULL;
+ for (i = 0; i < nthreads; ++i) {
+ if (torture_create_kthread(access_thread, NULL,
+ threads[i]))
+ goto err;
+ }
+ }
+
+ torture_init_end();
+
+ return 0;
+
+err:
+ kfree(threads);
+ threads = NULL;
+ torture_init_end();
+ return -EINVAL;
+}
+
+__no_kcsan
+static void test_exit(struct kunit *test)
+{
+ struct task_struct **stop_thread;
+ int i;
+
+ if (torture_cleanup_begin())
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(access_kernels); ++i)
+ WRITE_ONCE(access_kernels[i], NULL);
+
+ if (threads) {
+ for (stop_thread = threads; *stop_thread; stop_thread++)
+ torture_stop_kthread(reader_thread, *stop_thread);
+
+ kfree(threads);
+ threads = NULL;
+ }
+
+ torture_cleanup_end();
+}
+
+static struct kunit_suite kcsan_test_suite = {
+ .name = "kcsan-test",
+ .test_cases = kcsan_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+};
+static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL };
+
+__no_kcsan
+static void register_tracepoints(struct tracepoint *tp, void *ignore)
+{
+ check_trace_callback_type_console(probe_console);
+ if (!strcmp(tp->name, "console"))
+ WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+}
+
+__no_kcsan
+static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+{
+ if (!strcmp(tp->name, "console"))
+ tracepoint_probe_unregister(tp, probe_console, NULL);
+}
+
+/*
+ * We only want to do tracepoints setup and teardown once, therefore we have to
+ * customize the init and exit functions and cannot rely on kunit_test_suite().
+ */
+static int __init kcsan_test_init(void)
+{
+ /*
+ * Because we want to be able to build the test as a module, we need to
+ * iterate through all known tracepoints, since the static registration
+ * won't work here.
+ */
+ for_each_kernel_tracepoint(register_tracepoints, NULL);
+ return __kunit_test_suites_init(kcsan_test_suites);
+}
+
+static void kcsan_test_exit(void)
+{
+ __kunit_test_suites_exit(kcsan_test_suites);
+ for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+ tracepoint_synchronize_unregister();
+}
+
+late_initcall(kcsan_test_init);
+module_exit(kcsan_test_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index 763d6d08d94b..29480010dc30 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -9,6 +9,7 @@
#define _KERNEL_KCSAN_KCSAN_H
#include <linux/kcsan.h>
+#include <linux/sched.h>
/* The number of adjacent watchpoints to check. */
#define KCSAN_CHECK_ADJACENT 1
@@ -23,6 +24,12 @@ extern unsigned int kcsan_udelay_interrupt;
extern bool kcsan_enabled;
/*
+ * Save/restore IRQ flags state trace dirtied by KCSAN.
+ */
+void kcsan_save_irqtrace(struct task_struct *task);
+void kcsan_restore_irqtrace(struct task_struct *task);
+
+/*
* Initialize debugfs file.
*/
void kcsan_debugfs_init(void);
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index ac5f8345bae9..9d07e175de0f 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -308,6 +308,9 @@ static void print_verbose_info(struct task_struct *task)
if (!task)
return;
+ /* Restore IRQ state trace for printing. */
+ kcsan_restore_irqtrace(task);
+
pr_err("\n");
debug_show_held_locks(task);
print_irqtrace_events(task);
@@ -606,10 +609,11 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type,
goto out;
/*
- * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if
- * we do not turn off lockdep here; this could happen due to recursion
- * into lockdep via KCSAN if we detect a race in utilities used by
- * lockdep.
+ * Because we may generate reports when we're in scheduler code, the use
+ * of printk() could deadlock. Until such time that all printing code
+ * called in print_report() is scheduler-safe, accept the risk, and just
+ * get our message out. As such, also disable lockdep to hide the
+ * warning, and avoid disabling lockdep for the rest of the kernel.
*/
lockdep_off();
diff --git a/kernel/kcsan/test.c b/kernel/kcsan/selftest.c
index d26a052d3383..d26a052d3383 100644
--- a/kernel/kcsan/test.c
+++ b/kernel/kcsan/selftest.c
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2e97febeef77..e87679a48ba2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,6 +35,7 @@
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/jump_label.h>
+#include <linux/perf_event.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
@@ -123,6 +124,7 @@ struct kprobe_insn_cache kprobe_insn_slots = {
.mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
.alloc = alloc_insn_page,
.free = free_insn_page,
+ .sym = KPROBE_INSN_PAGE_SYM,
.pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
.insn_size = MAX_INSN_SIZE,
.nr_garbage = 0,
@@ -188,6 +190,10 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
kip->cache = c;
list_add_rcu(&kip->list, &c->pages);
slot = kip->insns;
+
+ /* Record the perf ksymbol register event after adding the page */
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
+ PAGE_SIZE, false, c->sym);
out:
mutex_unlock(&c->mutex);
return slot;
@@ -206,6 +212,13 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
* next time somebody inserts a probe.
*/
if (!list_is_singular(&kip->list)) {
+ /*
+ * Record perf ksymbol unregister event before removing
+ * the page.
+ */
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ (unsigned long)kip->insns, PAGE_SIZE, true,
+ kip->cache->sym);
list_del_rcu(&kip->list);
synchronize_rcu();
kip->cache->free(kip->insns);
@@ -295,12 +308,34 @@ bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
return ret;
}
+int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
+ unsigned long *value, char *type, char *sym)
+{
+ struct kprobe_insn_page *kip;
+ int ret = -ERANGE;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
+ if ((*symnum)--)
+ continue;
+ strlcpy(sym, c->sym, KSYM_NAME_LEN);
+ *type = 't';
+ *value = (unsigned long)kip->insns;
+ ret = 0;
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
#ifdef CONFIG_OPTPROBES
/* For optimized_kprobe buffer */
struct kprobe_insn_cache kprobe_optinsn_slots = {
.mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
.alloc = alloc_insn_page,
.free = free_insn_page,
+ .sym = KPROBE_OPTINSN_PAGE_SYM,
.pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
/* .insn_size is initialized later */
.nr_garbage = 0,
@@ -563,8 +598,6 @@ static void kprobe_optimizer(struct work_struct *work)
mutex_lock(&kprobe_mutex);
cpus_read_lock();
mutex_lock(&text_mutex);
- /* Lock modules while optimizing kprobes */
- mutex_lock(&module_mutex);
/*
* Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
@@ -589,7 +622,6 @@ static void kprobe_optimizer(struct work_struct *work)
/* Step 4: Free cleaned kprobes after quiesence period */
do_free_cleaned_kprobes();
- mutex_unlock(&module_mutex);
mutex_unlock(&text_mutex);
cpus_read_unlock();
@@ -2232,6 +2264,28 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
kprobe_remove_area_blacklist(entry, entry + 1);
}
+int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
+ char *type, char *sym)
+{
+ return -ERANGE;
+}
+
+int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *sym)
+{
+#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
+ if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym))
+ return 0;
+#ifdef CONFIG_OPTPROBES
+ if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym))
+ return 0;
+#endif
+#endif
+ if (!arch_kprobe_get_kallsym(&symnum, value, type, sym))
+ return 0;
+ return -ERANGE;
+}
+
int __init __weak arch_populate_kprobe_blacklist(void)
{
return 0;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 132f84a5fde3..1d9e2fdfd67a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <linux/numa.h>
+#include <linux/sched/isolation.h>
#include <trace/events/sched.h>
@@ -383,7 +384,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
* The kernel thread should not inherit these properties.
*/
sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(task, cpu_all_mask);
+ set_cpus_allowed_ptr(task,
+ housekeeping_cpumask(HK_FLAG_KTHREAD));
}
kfree(create);
return task;
@@ -608,7 +610,7 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
- set_cpus_allowed_ptr(tsk, cpu_all_mask);
+ set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 29a8de4c50b9..2fad21d345b0 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -395,7 +395,7 @@ void lockdep_init_task(struct task_struct *task)
static __always_inline void lockdep_recursion_finish(void)
{
- if (WARN_ON_ONCE(--current->lockdep_recursion))
+ if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK))
current->lockdep_recursion = 0;
}
@@ -1723,7 +1723,7 @@ static int noop_count(struct lock_list *entry, void *data)
static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
{
unsigned long count = 0;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
__bfs_forwards(this, (void *)&count, noop_count, &target_entry);
@@ -1749,7 +1749,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
{
unsigned long count = 0;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
__bfs_backwards(this, (void *)&count, noop_count, &target_entry);
@@ -1804,7 +1804,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
struct lock_trace **const trace)
{
int ret;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
struct lock_list src_entry = {
.class = hlock_class(src),
.parent = NULL,
@@ -1842,7 +1842,7 @@ static noinline int
check_redundant(struct held_lock *src, struct held_lock *target)
{
int ret;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
struct lock_list src_entry = {
.class = hlock_class(src),
.parent = NULL,
@@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr,
pr_warn("-----------------------------------------------------\n");
pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
curr->comm, task_pid_nr(curr),
- curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
+ lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT,
curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
- curr->hardirqs_enabled,
+ lockdep_hardirqs_enabled(),
curr->softirqs_enabled);
print_lock(next);
@@ -2244,8 +2244,8 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
{
unsigned long usage_mask = 0, forward_mask, backward_mask;
enum lock_usage_bit forward_bit = 0, backward_bit = 0;
- struct lock_list *uninitialized_var(target_entry1);
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry1;
+ struct lock_list *target_entry;
struct lock_list this, that;
int ret;
@@ -3331,9 +3331,9 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
curr->comm, task_pid_nr(curr),
- lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+ lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT,
lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
- lockdep_hardirqs_enabled(curr),
+ lockdep_hardirqs_enabled(),
lockdep_softirqs_enabled(curr));
print_lock(this);
@@ -3438,7 +3438,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
{
int ret;
struct lock_list root;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
root.parent = NULL;
root.class = hlock_class(this);
@@ -3465,7 +3465,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
{
int ret;
struct lock_list root;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
root.parent = NULL;
root.class = hlock_class(this);
@@ -3484,19 +3484,21 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
void print_irqtrace_events(struct task_struct *curr)
{
- printk("irq event stamp: %u\n", curr->irq_events);
+ const struct irqtrace_events *trace = &curr->irqtrace;
+
+ printk("irq event stamp: %u\n", trace->irq_events);
printk("hardirqs last enabled at (%u): [<%px>] %pS\n",
- curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
- (void *)curr->hardirq_enable_ip);
+ trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip,
+ (void *)trace->hardirq_enable_ip);
printk("hardirqs last disabled at (%u): [<%px>] %pS\n",
- curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip,
- (void *)curr->hardirq_disable_ip);
+ trace->hardirq_disable_event, (void *)trace->hardirq_disable_ip,
+ (void *)trace->hardirq_disable_ip);
printk("softirqs last enabled at (%u): [<%px>] %pS\n",
- curr->softirq_enable_event, (void *)curr->softirq_enable_ip,
- (void *)curr->softirq_enable_ip);
+ trace->softirq_enable_event, (void *)trace->softirq_enable_ip,
+ (void *)trace->softirq_enable_ip);
printk("softirqs last disabled at (%u): [<%px>] %pS\n",
- curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
- (void *)curr->softirq_disable_ip);
+ trace->softirq_disable_event, (void *)trace->softirq_disable_ip,
+ (void *)trace->softirq_disable_ip);
}
static int HARDIRQ_verbose(struct lock_class *class)
@@ -3646,10 +3648,19 @@ static void __trace_hardirqs_on_caller(void)
*/
void lockdep_hardirqs_on_prepare(unsigned long ip)
{
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(!debug_locks))
+ return;
+
+ /*
+ * NMIs do not (and cannot) track lock dependencies, nothing to do.
+ */
+ if (unlikely(in_nmi()))
+ return;
+
+ if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
return;
- if (unlikely(current->hardirqs_enabled)) {
+ if (unlikely(lockdep_hardirqs_enabled())) {
/*
* Neither irq nor preemption are disabled here
* so this is racy by nature but losing one hit
@@ -3677,7 +3688,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
* Can't allow enabling interrupts while in an interrupt handler,
* that's general bad form and such. Recursion, limited stack etc..
*/
- if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
+ if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context()))
return;
current->hardirq_chain_key = current->curr_chain_key;
@@ -3690,12 +3701,35 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare);
void noinstr lockdep_hardirqs_on(unsigned long ip)
{
- struct task_struct *curr = current;
+ struct irqtrace_events *trace = &current->irqtrace;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ /*
+ * NMIs can happen in the middle of local_irq_{en,dis}able() where the
+ * tracking state and hardware state are out of sync.
+ *
+ * NMIs must save lockdep_hardirqs_enabled() to restore IRQ state from,
+ * and not rely on hardware state like normal interrupts.
+ */
+ if (unlikely(in_nmi())) {
+ if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
+ return;
+
+ /*
+ * Skip:
+ * - recursion check, because NMI can hit lockdep;
+ * - hardware state check, because above;
+ * - chain_key check, see lockdep_hardirqs_on_prepare().
+ */
+ goto skip_checks;
+ }
- if (unlikely(!debug_locks || curr->lockdep_recursion))
+ if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
return;
- if (curr->hardirqs_enabled) {
+ if (lockdep_hardirqs_enabled()) {
/*
* Neither irq nor preemption are disabled here
* so this is racy by nature but losing one hit
@@ -3720,10 +3754,11 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key !=
current->curr_chain_key);
+skip_checks:
/* we'll do an OFF -> ON transition: */
- curr->hardirqs_enabled = 1;
- curr->hardirq_enable_ip = ip;
- curr->hardirq_enable_event = ++curr->irq_events;
+ this_cpu_write(hardirqs_enabled, 1);
+ trace->hardirq_enable_ip = ip;
+ trace->hardirq_enable_event = ++trace->irq_events;
debug_atomic_inc(hardirqs_on_events);
}
EXPORT_SYMBOL_GPL(lockdep_hardirqs_on);
@@ -3733,9 +3768,18 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on);
*/
void noinstr lockdep_hardirqs_off(unsigned long ip)
{
- struct task_struct *curr = current;
+ if (unlikely(!debug_locks))
+ return;
- if (unlikely(!debug_locks || curr->lockdep_recursion))
+ /*
+ * Matching lockdep_hardirqs_on(), allow NMIs in the middle of lockdep;
+ * they will restore the software state. This ensures the software
+ * state is consistent inside NMIs as well.
+ */
+ if (in_nmi()) {
+ if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
+ return;
+ } else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)
return;
/*
@@ -3745,13 +3789,15 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
- if (curr->hardirqs_enabled) {
+ if (lockdep_hardirqs_enabled()) {
+ struct irqtrace_events *trace = &current->irqtrace;
+
/*
* We have done an ON -> OFF transition:
*/
- curr->hardirqs_enabled = 0;
- curr->hardirq_disable_ip = ip;
- curr->hardirq_disable_event = ++curr->irq_events;
+ this_cpu_write(hardirqs_enabled, 0);
+ trace->hardirq_disable_ip = ip;
+ trace->hardirq_disable_event = ++trace->irq_events;
debug_atomic_inc(hardirqs_off_events);
} else {
debug_atomic_inc(redundant_hardirqs_off);
@@ -3764,7 +3810,7 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_off);
*/
void lockdep_softirqs_on(unsigned long ip)
{
- struct task_struct *curr = current;
+ struct irqtrace_events *trace = &current->irqtrace;
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
@@ -3776,7 +3822,7 @@ void lockdep_softirqs_on(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
- if (curr->softirqs_enabled) {
+ if (current->softirqs_enabled) {
debug_atomic_inc(redundant_softirqs_on);
return;
}
@@ -3785,17 +3831,17 @@ void lockdep_softirqs_on(unsigned long ip)
/*
* We'll do an OFF -> ON transition:
*/
- curr->softirqs_enabled = 1;
- curr->softirq_enable_ip = ip;
- curr->softirq_enable_event = ++curr->irq_events;
+ current->softirqs_enabled = 1;
+ trace->softirq_enable_ip = ip;
+ trace->softirq_enable_event = ++trace->irq_events;
debug_atomic_inc(softirqs_on_events);
/*
* We are going to turn softirqs on, so set the
* usage bit for all held locks, if hardirqs are
* enabled too:
*/
- if (curr->hardirqs_enabled)
- mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
+ if (lockdep_hardirqs_enabled())
+ mark_held_locks(current, LOCK_ENABLED_SOFTIRQ);
lockdep_recursion_finish();
}
@@ -3804,8 +3850,6 @@ void lockdep_softirqs_on(unsigned long ip)
*/
void lockdep_softirqs_off(unsigned long ip)
{
- struct task_struct *curr = current;
-
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
@@ -3815,13 +3859,15 @@ void lockdep_softirqs_off(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
- if (curr->softirqs_enabled) {
+ if (current->softirqs_enabled) {
+ struct irqtrace_events *trace = &current->irqtrace;
+
/*
* We have done an ON -> OFF transition:
*/
- curr->softirqs_enabled = 0;
- curr->softirq_disable_ip = ip;
- curr->softirq_disable_event = ++curr->irq_events;
+ current->softirqs_enabled = 0;
+ trace->softirq_disable_ip = ip;
+ trace->softirq_disable_event = ++trace->irq_events;
debug_atomic_inc(softirqs_off_events);
/*
* Whoops, we wanted softirqs off, so why aren't they?
@@ -3843,7 +3889,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
*/
if (!hlock->trylock) {
if (hlock->read) {
- if (curr->hardirq_context)
+ if (lockdep_hardirq_context())
if (!mark_lock(curr, hlock,
LOCK_USED_IN_HARDIRQ_READ))
return 0;
@@ -3852,7 +3898,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
LOCK_USED_IN_SOFTIRQ_READ))
return 0;
} else {
- if (curr->hardirq_context)
+ if (lockdep_hardirq_context())
if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
return 0;
if (curr->softirq_context)
@@ -3890,7 +3936,7 @@ lock_used:
static inline unsigned int task_irq_context(struct task_struct *task)
{
- return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context +
+ return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() +
LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context;
}
@@ -3983,7 +4029,7 @@ static inline short task_wait_context(struct task_struct *curr)
* Set appropriate wait type for the context; for IRQs we have to take
* into account force_irqthread as that is implied by PREEMPT_RT.
*/
- if (curr->hardirq_context) {
+ if (lockdep_hardirq_context()) {
/*
* Check if force_irqthreads will run us threaded.
*/
@@ -4826,11 +4872,11 @@ static void check_flags(unsigned long flags)
return;
if (irqs_disabled_flags(flags)) {
- if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
+ if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) {
printk("possible reason: unannotated irqs-off.\n");
}
} else {
- if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
+ if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) {
printk("possible reason: unannotated irqs-on.\n");
}
}
@@ -5851,9 +5897,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
- : !rcu_is_watching()
- ? "RCU used illegally from idle CPU!\n"
- : "",
+ : "",
rcu_scheduler_active, debug_locks);
/*
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 5efbfc68ce99..8ff6f50e06a0 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -631,13 +631,13 @@ static int lock_torture_writer(void *arg)
cxt.cur_ops->writelock();
if (WARN_ON_ONCE(lock_is_write_held))
lwsp->n_lock_fail++;
- lock_is_write_held = 1;
+ lock_is_write_held = true;
if (WARN_ON_ONCE(lock_is_read_held))
lwsp->n_lock_fail++; /* rare, but... */
lwsp->n_lock_acquired++;
cxt.cur_ops->write_delay(&rand);
- lock_is_write_held = 0;
+ lock_is_write_held = false;
cxt.cur_ops->writeunlock();
stutter_wait("lock_torture_writer");
@@ -665,13 +665,13 @@ static int lock_torture_reader(void *arg)
schedule_timeout_uninterruptible(1);
cxt.cur_ops->readlock();
- lock_is_read_held = 1;
+ lock_is_read_held = true;
if (WARN_ON_ONCE(lock_is_write_held))
lrsp->n_lock_fail++; /* rare, but... */
lrsp->n_lock_acquired++;
cxt.cur_ops->read_delay(&rand);
- lock_is_read_held = 0;
+ lock_is_read_held = false;
cxt.cur_ops->readunlock();
stutter_wait("lock_torture_reader");
@@ -686,7 +686,7 @@ static int lock_torture_reader(void *arg)
static void __torture_print_stats(char *page,
struct lock_stress_stats *statp, bool write)
{
- bool fail = 0;
+ bool fail = false;
int i, n_stress;
long max = 0, min = statp ? statp[0].n_lock_acquired : 0;
long long sum = 0;
@@ -904,7 +904,7 @@ static int __init lock_torture_init(void)
/* Initialize the statistics so that each run gets its own numbers. */
if (nwriters_stress) {
- lock_is_write_held = 0;
+ lock_is_write_held = false;
cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress,
sizeof(*cxt.lwsa),
GFP_KERNEL);
@@ -935,7 +935,7 @@ static int __init lock_torture_init(void)
}
if (nreaders_stress) {
- lock_is_read_held = 0;
+ lock_is_read_held = false;
cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress,
sizeof(*cxt.lrsa),
GFP_KERNEL);
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 1f7734949ac8..1de006ed3aa8 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -154,7 +154,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
*/
for (;;) {
- if (prev->next == node &&
+ /*
+ * cpu_relax() below implies a compiler barrier which would
+ * prevent this comparison being optimized away.
+ */
+ if (data_race(prev->next) == node &&
cmpxchg(&prev->next, node, NULL) == node)
break;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index cd356630a311..12dd41b39a7f 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -262,8 +262,8 @@ void exit_task_namespaces(struct task_struct *p)
static int check_setns_flags(unsigned long flags)
{
if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID |
- CLONE_NEWCGROUP)))
+ CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER |
+ CLONE_NEWPID | CLONE_NEWCGROUP)))
return -EINVAL;
#ifndef CONFIG_USER_NS
@@ -290,6 +290,10 @@ static int check_setns_flags(unsigned long flags)
if (flags & CLONE_NEWNET)
return -EINVAL;
#endif
+#ifndef CONFIG_TIME_NS
+ if (flags & CLONE_NEWTIME)
+ return -EINVAL;
+#endif
return 0;
}
@@ -464,6 +468,14 @@ static int validate_nsset(struct nsset *nsset, struct pid *pid)
}
#endif
+#ifdef CONFIG_TIME_NS
+ if (flags & CLONE_NEWTIME) {
+ ret = validate_ns(nsset, &nsp->time_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
out:
if (pid_ns)
put_pid_ns(pid_ns);
@@ -507,6 +519,11 @@ static void commit_nsset(struct nsset *nsset)
exit_sem(me);
#endif
+#ifdef CONFIG_TIME_NS
+ if (flags & CLONE_NEWTIME)
+ timens_commit(me, nsset->nsproxy->time_ns);
+#endif
+
/* transfer ownership */
switch_task_namespaces(me, nsset->nsproxy);
nsset->nsproxy = NULL;
diff --git a/kernel/padata.c b/kernel/padata.c
index 4373f7adaa40..16cb894dc272 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -250,13 +250,11 @@ EXPORT_SYMBOL(padata_do_parallel);
static struct padata_priv *padata_find_next(struct parallel_data *pd,
bool remove_object)
{
- struct padata_parallel_queue *next_queue;
struct padata_priv *padata;
struct padata_list *reorder;
int cpu = pd->cpu;
- next_queue = per_cpu_ptr(pd->pqueue, cpu);
- reorder = &next_queue->reorder;
+ reorder = per_cpu_ptr(pd->reorder_list, cpu);
spin_lock(&reorder->lock);
if (list_empty(&reorder->list)) {
@@ -291,7 +289,7 @@ static void padata_reorder(struct parallel_data *pd)
int cb_cpu;
struct padata_priv *padata;
struct padata_serial_queue *squeue;
- struct padata_parallel_queue *next_queue;
+ struct padata_list *reorder;
/*
* We need to ensure that only one cpu can work on dequeueing of
@@ -339,9 +337,8 @@ static void padata_reorder(struct parallel_data *pd)
*/
smp_mb();
- next_queue = per_cpu_ptr(pd->pqueue, pd->cpu);
- if (!list_empty(&next_queue->reorder.list) &&
- padata_find_next(pd, false))
+ reorder = per_cpu_ptr(pd->reorder_list, pd->cpu);
+ if (!list_empty(&reorder->list) && padata_find_next(pd, false))
queue_work(pinst->serial_wq, &pd->reorder_work);
}
@@ -401,17 +398,16 @@ void padata_do_serial(struct padata_priv *padata)
{
struct parallel_data *pd = padata->pd;
int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr);
- struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue,
- hashed_cpu);
+ struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu);
struct padata_priv *cur;
- spin_lock(&pqueue->reorder.lock);
+ spin_lock(&reorder->lock);
/* Sort in ascending order of sequence number. */
- list_for_each_entry_reverse(cur, &pqueue->reorder.list, list)
+ list_for_each_entry_reverse(cur, &reorder->list, list)
if (cur->seq_nr < padata->seq_nr)
break;
list_add(&padata->list, &cur->list);
- spin_unlock(&pqueue->reorder.lock);
+ spin_unlock(&reorder->lock);
/*
* Ensure the addition to the reorder list is ordered correctly
@@ -441,28 +437,6 @@ static int padata_setup_cpumasks(struct padata_instance *pinst)
return err;
}
-static int pd_setup_cpumasks(struct parallel_data *pd,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask)
-{
- int err = -ENOMEM;
-
- if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
- goto out;
- if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL))
- goto free_pcpu_mask;
-
- cpumask_copy(pd->cpumask.pcpu, pcpumask);
- cpumask_copy(pd->cpumask.cbcpu, cbcpumask);
-
- return 0;
-
-free_pcpu_mask:
- free_cpumask_var(pd->cpumask.pcpu);
-out:
- return err;
-}
-
static void __init padata_mt_helper(struct work_struct *w)
{
struct padata_work *pw = container_of(w, struct padata_work, pw_work);
@@ -575,17 +549,15 @@ static void padata_init_squeues(struct parallel_data *pd)
}
}
-/* Initialize all percpu queues used by parallel workers */
-static void padata_init_pqueues(struct parallel_data *pd)
+/* Initialize per-CPU reorder lists */
+static void padata_init_reorder_list(struct parallel_data *pd)
{
int cpu;
- struct padata_parallel_queue *pqueue;
+ struct padata_list *list;
for_each_cpu(cpu, pd->cpumask.pcpu) {
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
-
- __padata_list_init(&pqueue->reorder);
- atomic_set(&pqueue->num_obj, 0);
+ list = per_cpu_ptr(pd->reorder_list, cpu);
+ __padata_list_init(list);
}
}
@@ -593,30 +565,31 @@ static void padata_init_pqueues(struct parallel_data *pd)
static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
{
struct padata_instance *pinst = ps->pinst;
- const struct cpumask *cbcpumask;
- const struct cpumask *pcpumask;
struct parallel_data *pd;
- cbcpumask = pinst->rcpumask.cbcpu;
- pcpumask = pinst->rcpumask.pcpu;
-
pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
if (!pd)
goto err;
- pd->pqueue = alloc_percpu(struct padata_parallel_queue);
- if (!pd->pqueue)
+ pd->reorder_list = alloc_percpu(struct padata_list);
+ if (!pd->reorder_list)
goto err_free_pd;
pd->squeue = alloc_percpu(struct padata_serial_queue);
if (!pd->squeue)
- goto err_free_pqueue;
+ goto err_free_reorder_list;
pd->ps = ps;
- if (pd_setup_cpumasks(pd, pcpumask, cbcpumask))
+
+ if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
goto err_free_squeue;
+ if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL))
+ goto err_free_pcpu;
+
+ cpumask_and(pd->cpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask);
+ cpumask_and(pd->cpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask);
- padata_init_pqueues(pd);
+ padata_init_reorder_list(pd);
padata_init_squeues(pd);
pd->seq_nr = -1;
atomic_set(&pd->refcnt, 1);
@@ -626,10 +599,12 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
return pd;
+err_free_pcpu:
+ free_cpumask_var(pd->cpumask.pcpu);
err_free_squeue:
free_percpu(pd->squeue);
-err_free_pqueue:
- free_percpu(pd->pqueue);
+err_free_reorder_list:
+ free_percpu(pd->reorder_list);
err_free_pd:
kfree(pd);
err:
@@ -640,7 +615,7 @@ static void padata_free_pd(struct parallel_data *pd)
{
free_cpumask_var(pd->cpumask.pcpu);
free_cpumask_var(pd->cpumask.cbcpu);
- free_percpu(pd->pqueue);
+ free_percpu(pd->reorder_list);
free_percpu(pd->squeue);
kfree(pd);
}
@@ -682,12 +657,6 @@ static int padata_replace(struct padata_instance *pinst)
pinst->flags |= PADATA_RESET;
- cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu,
- cpu_online_mask);
-
- cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu,
- cpu_online_mask);
-
list_for_each_entry(ps, &pinst->pslist, list) {
err = padata_replace_one(ps);
if (err)
@@ -789,43 +758,6 @@ out:
}
EXPORT_SYMBOL(padata_set_cpumask);
-/**
- * padata_start - start the parallel processing
- *
- * @pinst: padata instance to start
- *
- * Return: 0 on success or negative error code
- */
-int padata_start(struct padata_instance *pinst)
-{
- int err = 0;
-
- mutex_lock(&pinst->lock);
-
- if (pinst->flags & PADATA_INVALID)
- err = -EINVAL;
-
- __padata_start(pinst);
-
- mutex_unlock(&pinst->lock);
-
- return err;
-}
-EXPORT_SYMBOL(padata_start);
-
-/**
- * padata_stop - stop the parallel processing
- *
- * @pinst: padata instance to stop
- */
-void padata_stop(struct padata_instance *pinst)
-{
- mutex_lock(&pinst->lock);
- __padata_stop(pinst);
- mutex_unlock(&pinst->lock);
-}
-EXPORT_SYMBOL(padata_stop);
-
#ifdef CONFIG_HOTPLUG_CPU
static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
@@ -907,9 +839,6 @@ static void __padata_free(struct padata_instance *pinst)
WARN_ON(!list_empty(&pinst->pslist));
- padata_stop(pinst);
- free_cpumask_var(pinst->rcpumask.cbcpu);
- free_cpumask_var(pinst->rcpumask.pcpu);
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
destroy_workqueue(pinst->serial_wq);
@@ -1044,18 +973,12 @@ static struct kobj_type padata_attr_type = {
};
/**
- * padata_alloc - allocate and initialize a padata instance and specify
- * cpumasks for serial and parallel workers.
- *
+ * padata_alloc - allocate and initialize a padata instance
* @name: used to identify the instance
- * @pcpumask: cpumask that will be used for padata parallelization
- * @cbcpumask: cpumask that will be used for padata serialization
*
* Return: new instance on success, NULL on error
*/
-static struct padata_instance *padata_alloc(const char *name,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask)
+struct padata_instance *padata_alloc(const char *name)
{
struct padata_instance *pinst;
@@ -1081,26 +1004,16 @@ static struct padata_instance *padata_alloc(const char *name,
free_cpumask_var(pinst->cpumask.pcpu);
goto err_free_serial_wq;
}
- if (!padata_validate_cpumask(pinst, pcpumask) ||
- !padata_validate_cpumask(pinst, cbcpumask))
- goto err_free_masks;
-
- if (!alloc_cpumask_var(&pinst->rcpumask.pcpu, GFP_KERNEL))
- goto err_free_masks;
- if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL))
- goto err_free_rcpumask_pcpu;
INIT_LIST_HEAD(&pinst->pslist);
- cpumask_copy(pinst->cpumask.pcpu, pcpumask);
- cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
- cpumask_and(pinst->rcpumask.pcpu, pcpumask, cpu_online_mask);
- cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask);
+ cpumask_copy(pinst->cpumask.pcpu, cpu_possible_mask);
+ cpumask_copy(pinst->cpumask.cbcpu, cpu_possible_mask);
if (padata_setup_cpumasks(pinst))
- goto err_free_rcpumask_cbcpu;
+ goto err_free_masks;
- pinst->flags = 0;
+ __padata_start(pinst);
kobject_init(&pinst->kobj, &padata_attr_type);
mutex_init(&pinst->lock);
@@ -1116,10 +1029,6 @@ static struct padata_instance *padata_alloc(const char *name,
return pinst;
-err_free_rcpumask_cbcpu:
- free_cpumask_var(pinst->rcpumask.cbcpu);
-err_free_rcpumask_pcpu:
- free_cpumask_var(pinst->rcpumask.pcpu);
err_free_masks:
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
@@ -1133,21 +1042,7 @@ err_free_inst:
err:
return NULL;
}
-
-/**
- * padata_alloc_possible - Allocate and initialize padata instance.
- * Use the cpu_possible_mask for serial and
- * parallel workers.
- *
- * @name: used to identify the instance
- *
- * Return: new instance on success, NULL on error
- */
-struct padata_instance *padata_alloc_possible(const char *name)
-{
- return padata_alloc(name, cpu_possible_mask, cpu_possible_mask);
-}
-EXPORT_SYMBOL(padata_alloc_possible);
+EXPORT_SYMBOL(padata_alloc);
/**
* padata_free - free a padata instance
diff --git a/kernel/pid.c b/kernel/pid.c
index f1496b757162..b2562a7ce525 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -42,6 +42,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
+#include <net/sock.h>
struct pid init_struct_pid = {
.count = REFCOUNT_INIT(1),
@@ -198,7 +199,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
if (tid != 1 && !tmp->child_reaper)
goto out_free;
retval = -EPERM;
- if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
+ if (!checkpoint_restore_ns_capable(tmp->user_ns))
goto out_free;
set_tid_size--;
}
@@ -635,17 +636,8 @@ static int pidfd_getfd(struct pid *pid, int fd)
if (IS_ERR(file))
return PTR_ERR(file);
- ret = security_file_receive(file);
- if (ret) {
- fput(file);
- return ret;
- }
-
- ret = get_unused_fd_flags(O_CLOEXEC);
- if (ret < 0)
- fput(file);
- else
- fd_install(ret, file);
+ ret = receive_fd(file, O_CLOEXEC);
+ fput(file);
return ret;
}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 0e5ac162c3a8..ac135bd600eb 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -269,7 +269,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
struct ctl_table tmp = *table;
int ret, next;
- if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+ if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
return -EPERM;
/*
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 0a9326f5f421..c1ff7fa030ab 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -1,9 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Energy Model of CPUs
+ * Energy Model of devices
*
- * Copyright (c) 2018, Arm ltd.
+ * Copyright (c) 2018-2020, Arm ltd.
* Written by: Quentin Perret, Arm ltd.
+ * Improvements provided by: Lukasz Luba, Arm ltd.
*/
#define pr_fmt(fmt) "energy_model: " fmt
@@ -15,30 +16,32 @@
#include <linux/sched/topology.h>
#include <linux/slab.h>
-/* Mapping of each CPU to the performance domain to which it belongs. */
-static DEFINE_PER_CPU(struct em_perf_domain *, em_data);
-
/*
* Mutex serializing the registrations of performance domains and letting
* callbacks defined by drivers sleep.
*/
static DEFINE_MUTEX(em_pd_mutex);
+static bool _is_cpu_device(struct device *dev)
+{
+ return (dev->bus == &cpu_subsys);
+}
+
#ifdef CONFIG_DEBUG_FS
static struct dentry *rootdir;
-static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd)
+static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
{
struct dentry *d;
char name[24];
- snprintf(name, sizeof(name), "cs:%lu", cs->frequency);
+ snprintf(name, sizeof(name), "ps:%lu", ps->frequency);
- /* Create per-cs directory */
+ /* Create per-ps directory */
d = debugfs_create_dir(name, pd);
- debugfs_create_ulong("frequency", 0444, d, &cs->frequency);
- debugfs_create_ulong("power", 0444, d, &cs->power);
- debugfs_create_ulong("cost", 0444, d, &cs->cost);
+ debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
+ debugfs_create_ulong("power", 0444, d, &ps->power);
+ debugfs_create_ulong("cost", 0444, d, &ps->cost);
}
static int em_debug_cpus_show(struct seq_file *s, void *unused)
@@ -49,22 +52,30 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
-static void em_debug_create_pd(struct em_perf_domain *pd, int cpu)
+static void em_debug_create_pd(struct device *dev)
{
struct dentry *d;
- char name[8];
int i;
- snprintf(name, sizeof(name), "pd%d", cpu);
-
/* Create the directory of the performance domain */
- d = debugfs_create_dir(name, rootdir);
+ d = debugfs_create_dir(dev_name(dev), rootdir);
- debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops);
+ if (_is_cpu_device(dev))
+ debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
+ &em_debug_cpus_fops);
+
+ /* Create a sub-directory for each performance state */
+ for (i = 0; i < dev->em_pd->nr_perf_states; i++)
+ em_debug_create_ps(&dev->em_pd->table[i], d);
- /* Create a sub-directory for each capacity state */
- for (i = 0; i < pd->nr_cap_states; i++)
- em_debug_create_cs(&pd->table[i], d);
+}
+
+static void em_debug_remove_pd(struct device *dev)
+{
+ struct dentry *debug_dir;
+
+ debug_dir = debugfs_lookup(dev_name(dev), rootdir);
+ debugfs_remove_recursive(debug_dir);
}
static int __init em_debug_init(void)
@@ -76,58 +87,55 @@ static int __init em_debug_init(void)
}
core_initcall(em_debug_init);
#else /* CONFIG_DEBUG_FS */
-static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {}
+static void em_debug_create_pd(struct device *dev) {}
+static void em_debug_remove_pd(struct device *dev) {}
#endif
-static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
- struct em_data_callback *cb)
+
+static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
+ int nr_states, struct em_data_callback *cb)
{
unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
unsigned long power, freq, prev_freq = 0;
- int i, ret, cpu = cpumask_first(span);
- struct em_cap_state *table;
- struct em_perf_domain *pd;
+ struct em_perf_state *table;
+ int i, ret;
u64 fmax;
- if (!cb->active_power)
- return NULL;
-
- pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
- if (!pd)
- return NULL;
-
table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
if (!table)
- goto free_pd;
+ return -ENOMEM;
- /* Build the list of capacity states for this performance domain */
+ /* Build the list of performance states for this performance domain */
for (i = 0, freq = 0; i < nr_states; i++, freq++) {
/*
* active_power() is a driver callback which ceils 'freq' to
- * lowest capacity state of 'cpu' above 'freq' and updates
+ * lowest performance state of 'dev' above 'freq' and updates
* 'power' and 'freq' accordingly.
*/
- ret = cb->active_power(&power, &freq, cpu);
+ ret = cb->active_power(&power, &freq, dev);
if (ret) {
- pr_err("pd%d: invalid cap. state: %d\n", cpu, ret);
- goto free_cs_table;
+ dev_err(dev, "EM: invalid perf. state: %d\n",
+ ret);
+ goto free_ps_table;
}
/*
* We expect the driver callback to increase the frequency for
- * higher capacity states.
+ * higher performance states.
*/
if (freq <= prev_freq) {
- pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq);
- goto free_cs_table;
+ dev_err(dev, "EM: non-increasing freq: %lu\n",
+ freq);
+ goto free_ps_table;
}
/*
* The power returned by active_state() is expected to be
* positive, in milli-watts and to fit into 16 bits.
*/
- if (!power || power > EM_CPU_MAX_POWER) {
- pr_err("pd%d: invalid power: %lu\n", cpu, power);
- goto free_cs_table;
+ if (!power || power > EM_MAX_POWER) {
+ dev_err(dev, "EM: invalid power: %lu\n",
+ power);
+ goto free_ps_table;
}
table[i].power = power;
@@ -141,12 +149,12 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
*/
opp_eff = freq / power;
if (opp_eff >= prev_opp_eff)
- pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n",
- cpu, i, i - 1);
+ dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n",
+ i, i - 1);
prev_opp_eff = opp_eff;
}
- /* Compute the cost of each capacity_state. */
+ /* Compute the cost of each performance state. */
fmax = (u64) table[nr_states - 1].frequency;
for (i = 0; i < nr_states; i++) {
table[i].cost = div64_u64(fmax * table[i].power,
@@ -154,39 +162,94 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
}
pd->table = table;
- pd->nr_cap_states = nr_states;
- cpumask_copy(to_cpumask(pd->cpus), span);
-
- em_debug_create_pd(pd, cpu);
+ pd->nr_perf_states = nr_states;
- return pd;
+ return 0;
-free_cs_table:
+free_ps_table:
kfree(table);
-free_pd:
- kfree(pd);
+ return -EINVAL;
+}
+
+static int em_create_pd(struct device *dev, int nr_states,
+ struct em_data_callback *cb, cpumask_t *cpus)
+{
+ struct em_perf_domain *pd;
+ struct device *cpu_dev;
+ int cpu, ret;
+
+ if (_is_cpu_device(dev)) {
+ pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
+ if (!pd)
+ return -ENOMEM;
+
+ cpumask_copy(em_span_cpus(pd), cpus);
+ } else {
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd)
+ return -ENOMEM;
+ }
+
+ ret = em_create_perf_table(dev, pd, nr_states, cb);
+ if (ret) {
+ kfree(pd);
+ return ret;
+ }
+
+ if (_is_cpu_device(dev))
+ for_each_cpu(cpu, cpus) {
+ cpu_dev = get_cpu_device(cpu);
+ cpu_dev->em_pd = pd;
+ }
+
+ dev->em_pd = pd;
+
+ return 0;
+}
+
+/**
+ * em_pd_get() - Return the performance domain for a device
+ * @dev : Device to find the performance domain for
+ *
+ * Returns the performance domain to which @dev belongs, or NULL if it doesn't
+ * exist.
+ */
+struct em_perf_domain *em_pd_get(struct device *dev)
+{
+ if (IS_ERR_OR_NULL(dev))
+ return NULL;
- return NULL;
+ return dev->em_pd;
}
+EXPORT_SYMBOL_GPL(em_pd_get);
/**
* em_cpu_get() - Return the performance domain for a CPU
* @cpu : CPU to find the performance domain for
*
- * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't
+ * Returns the performance domain to which @cpu belongs, or NULL if it doesn't
* exist.
*/
struct em_perf_domain *em_cpu_get(int cpu)
{
- return READ_ONCE(per_cpu(em_data, cpu));
+ struct device *cpu_dev;
+
+ cpu_dev = get_cpu_device(cpu);
+ if (!cpu_dev)
+ return NULL;
+
+ return em_pd_get(cpu_dev);
}
EXPORT_SYMBOL_GPL(em_cpu_get);
/**
- * em_register_perf_domain() - Register the Energy Model of a performance domain
- * @span : Mask of CPUs in the performance domain
- * @nr_states : Number of capacity states to register
+ * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device
+ * @dev : Device for which the EM is to register
+ * @nr_states : Number of performance states to register
* @cb : Callback functions providing the data of the Energy Model
+ * @cpus : Pointer to cpumask_t, which in case of a CPU device is
+ * obligatory. It can be taken from i.e. 'policy->cpus'. For other
+ * type of devices this should be set to NULL.
*
* Create Energy Model tables for a performance domain using the callbacks
* defined in cb.
@@ -196,14 +259,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
*
* Return 0 on success
*/
-int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
- struct em_data_callback *cb)
+int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
+ struct em_data_callback *cb, cpumask_t *cpus)
{
unsigned long cap, prev_cap = 0;
- struct em_perf_domain *pd;
- int cpu, ret = 0;
+ int cpu, ret;
- if (!span || !nr_states || !cb)
+ if (!dev || !nr_states || !cb)
return -EINVAL;
/*
@@ -212,47 +274,79 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
*/
mutex_lock(&em_pd_mutex);
- for_each_cpu(cpu, span) {
- /* Make sure we don't register again an existing domain. */
- if (READ_ONCE(per_cpu(em_data, cpu))) {
- ret = -EEXIST;
- goto unlock;
- }
+ if (dev->em_pd) {
+ ret = -EEXIST;
+ goto unlock;
+ }
- /*
- * All CPUs of a domain must have the same micro-architecture
- * since they all share the same table.
- */
- cap = arch_scale_cpu_capacity(cpu);
- if (prev_cap && prev_cap != cap) {
- pr_err("CPUs of %*pbl must have the same capacity\n",
- cpumask_pr_args(span));
+ if (_is_cpu_device(dev)) {
+ if (!cpus) {
+ dev_err(dev, "EM: invalid CPU mask\n");
ret = -EINVAL;
goto unlock;
}
- prev_cap = cap;
+
+ for_each_cpu(cpu, cpus) {
+ if (em_cpu_get(cpu)) {
+ dev_err(dev, "EM: exists for CPU%d\n", cpu);
+ ret = -EEXIST;
+ goto unlock;
+ }
+ /*
+ * All CPUs of a domain must have the same
+ * micro-architecture since they all share the same
+ * table.
+ */
+ cap = arch_scale_cpu_capacity(cpu);
+ if (prev_cap && prev_cap != cap) {
+ dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n",
+ cpumask_pr_args(cpus));
+
+ ret = -EINVAL;
+ goto unlock;
+ }
+ prev_cap = cap;
+ }
}
- /* Create the performance domain and add it to the Energy Model. */
- pd = em_create_pd(span, nr_states, cb);
- if (!pd) {
- ret = -EINVAL;
+ ret = em_create_pd(dev, nr_states, cb, cpus);
+ if (ret)
goto unlock;
- }
- for_each_cpu(cpu, span) {
- /*
- * The per-cpu array can be read concurrently from em_cpu_get().
- * The barrier enforces the ordering needed to make sure readers
- * can only access well formed em_perf_domain structs.
- */
- smp_store_release(per_cpu_ptr(&em_data, cpu), pd);
- }
+ em_debug_create_pd(dev);
+ dev_info(dev, "EM: created perf domain\n");
- pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span));
unlock:
mutex_unlock(&em_pd_mutex);
-
return ret;
}
-EXPORT_SYMBOL_GPL(em_register_perf_domain);
+EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
+
+/**
+ * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
+ * @dev : Device for which the EM is registered
+ *
+ * Unregister the EM for the specified @dev (but not a CPU device).
+ */
+void em_dev_unregister_perf_domain(struct device *dev)
+{
+ if (IS_ERR_OR_NULL(dev) || !dev->em_pd)
+ return;
+
+ if (_is_cpu_device(dev))
+ return;
+
+ /*
+ * The mutex separates all register/unregister requests and protects
+ * from potential clean-up/setup issues in the debugfs directories.
+ * The debugfs directory name is the same as device's name.
+ */
+ mutex_lock(&em_pd_mutex);
+ em_debug_remove_pd(dev);
+
+ kfree(dev->em_pd->table);
+ kfree(dev->em_pd);
+ dev->em_pd = NULL;
+ mutex_unlock(&em_pd_mutex);
+}
+EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 02ec716a4927..5714f51ba9f8 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1062,7 +1062,7 @@ power_attr(disk);
static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
+ return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device),
MINOR(swsusp_resume_device));
}
@@ -1162,7 +1162,7 @@ static ssize_t reserved_size_store(struct kobject *kobj,
power_attr(reserved_size);
-static struct attribute * g[] = {
+static struct attribute *g[] = {
&disk_attr.attr,
&resume_offset_attr.attr,
&resume_attr.attr,
@@ -1190,7 +1190,7 @@ static int __init resume_setup(char *str)
if (noresume)
return 1;
- strncpy( resume_file, str, 255 );
+ strncpy(resume_file, str, 255);
return 1;
}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index ba2094db6294..32fc89ac96c3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -32,7 +32,7 @@ static inline int init_header_complete(struct swsusp_info *info)
return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
}
-static inline char *check_image_kernel(struct swsusp_info *info)
+static inline const char *check_image_kernel(struct swsusp_info *info)
{
return arch_hibernation_header_restore(info) ?
"architecture specific data" : NULL;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 881128b9351e..cef154261fe2 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2023,7 +2023,7 @@ static int init_header_complete(struct swsusp_info *info)
return 0;
}
-static char *check_image_kernel(struct swsusp_info *info)
+static const char *check_image_kernel(struct swsusp_info *info)
{
if (info->version_code != LINUX_VERSION_CODE)
return "kernel version";
@@ -2176,7 +2176,7 @@ static void mark_unsafe_pages(struct memory_bitmap *bm)
static int check_header(struct swsusp_info *info)
{
- char *reason;
+ const char *reason;
reason = check_image_kernel(info);
if (!reason && info->num_physpages != get_num_physpages())
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b71eaf5f5a86..9b75f6bfc333 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -943,6 +943,14 @@ out:
return ret;
}
+/*
+ * Be careful when modifying this function!!!
+ *
+ * Only few operations are supported because the device works only with the
+ * entire variable length messages (records). Non-standard values are
+ * returned in the other cases and has been this way for quite some time.
+ * User space applications might depend on this behavior.
+ */
static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
{
struct devkmsg_user *user = file->private_data;
@@ -2658,7 +2666,7 @@ early_param("keep_bootcon", keep_bootcon_setup);
static int try_enable_new_console(struct console *newcon, bool user_specified)
{
struct console_cmdline *c;
- int i;
+ int i, err;
for (i = 0, c = console_cmdline;
i < MAX_CMDLINECONSOLES && c->name[0];
@@ -2681,8 +2689,8 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
return 0;
if (newcon->setup &&
- newcon->setup(newcon, c->options) != 0)
- return -EIO;
+ (err = newcon->setup(newcon, c->options)) != 0)
+ return err;
}
newcon->flags |= CON_ENABLED;
if (i == preferred_console) {
@@ -2695,7 +2703,7 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
/*
* Some consoles, such as pstore and netconsole, can be enabled even
* without matching. Accept the pre-enabled consoles only when match()
- * and setup() had a change to be called.
+ * and setup() had a chance to be called.
*/
if (newcon->flags & CON_ENABLED && c->user_specified == user_specified)
return 0;
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 452feae8de20..3cf6132a4bb9 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -61,6 +61,25 @@ config RCU_TORTURE_TEST
Say M if you want the RCU torture tests to build as a module.
Say N if you are unsure.
+config RCU_REF_SCALE_TEST
+ tristate "Scalability tests for read-side synchronization (RCU and others)"
+ depends on DEBUG_KERNEL
+ select TORTURE_TEST
+ select SRCU
+ select TASKS_RCU
+ select TASKS_RUDE_RCU
+ select TASKS_TRACE_RCU
+ default n
+ help
+ This option provides a kernel module that runs performance tests
+ useful comparing RCU with various read-side synchronization mechanisms.
+ The kernel module may be built after the fact on the running kernel to be
+ tested, if desired.
+
+ Say Y here if you want these performance tests built into the kernel.
+ Say M if you want to build it as a module instead.
+ Say N if you are unsure.
+
config RCU_CPU_STALL_TIMEOUT
int "RCU CPU stall timeout in seconds"
depends on RCU_STALL_COMMON
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index f91f2c2cf138..95f5117ef8da 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o
obj-$(CONFIG_TINY_SRCU) += srcutiny.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
+obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_TINY_RCU) += tiny.o
obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 9eb39c20082c..ec903d781778 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -69,6 +69,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
* value specified by nr_cpus for a read-only test.
*
* Various other use cases may of course be specified.
+ *
+ * Note that this test's readers are intended only as a test load for
+ * the writers. The reader performance statistics will be overly
+ * pessimistic due to the per-critical-section interrupt disabling,
+ * test-end checks, and the pair of calls through pointers.
*/
#ifdef MODULE
@@ -309,8 +314,10 @@ static void rcu_perf_wait_shutdown(void)
}
/*
- * RCU perf reader kthread. Repeatedly does empty RCU read-side
- * critical section, minimizing update-side interference.
+ * RCU perf reader kthread. Repeatedly does empty RCU read-side critical
+ * section, minimizing update-side interference. However, the point of
+ * this test is not to evaluate reader performance, but instead to serve
+ * as a test load for update-side performance testing.
*/
static int
rcu_perf_reader(void *arg)
@@ -576,11 +583,8 @@ static int compute_real(int n)
static int
rcu_perf_shutdown(void *arg)
{
- do {
- wait_event(shutdown_wq,
- atomic_read(&n_rcu_perf_writer_finished) >=
- nrealwriters);
- } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters);
+ wait_event(shutdown_wq,
+ atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters);
smp_mb(); /* Wake before output. */
rcu_perf_cleanup();
kernel_power_off();
@@ -693,11 +697,8 @@ kfree_perf_cleanup(void)
static int
kfree_perf_shutdown(void *arg)
{
- do {
- wait_event(shutdown_wq,
- atomic_read(&n_kfree_perf_thread_ended) >=
- kfree_nrealthreads);
- } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);
+ wait_event(shutdown_wq,
+ atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads);
smp_mb(); /* Wake before output. */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index efb792e13fca..d0d265304d14 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -7,7 +7,7 @@
* Authors: Paul E. McKenney <paulmck@linux.ibm.com>
* Josh Triplett <josh@joshtriplett.org>
*
- * See also: Documentation/RCU/torture.txt
+ * See also: Documentation/RCU/torture.rst
*/
#define pr_fmt(fmt) fmt
@@ -109,6 +109,10 @@ torture_param(int, object_debug, 0,
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
torture_param(int, onoff_interval, 0,
"Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(int, read_exit_delay, 13,
+ "Delay between read-then-exit episodes (s)");
+torture_param(int, read_exit_burst, 16,
+ "# of read-then-exit bursts per episode, zero to disable");
torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
@@ -146,6 +150,7 @@ static struct task_struct *stall_task;
static struct task_struct *fwd_prog_task;
static struct task_struct **barrier_cbs_tasks;
static struct task_struct *barrier_task;
+static struct task_struct *read_exit_task;
#define RCU_TORTURE_PIPE_LEN 10
@@ -177,6 +182,7 @@ static long n_rcu_torture_boosts;
static atomic_long_t n_rcu_torture_timers;
static long n_barrier_attempts;
static long n_barrier_successes; /* did rcu_barrier test succeed? */
+static unsigned long n_read_exits;
static struct list_head rcu_torture_removed;
static unsigned long shutdown_jiffies;
@@ -1166,6 +1172,7 @@ rcu_torture_writer(void *arg)
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
}
} while (!torture_must_stop());
+ rcu_torture_current = NULL; // Let stats task know that we are done.
/* Reset expediting back to unexpedited. */
if (expediting > 0)
expediting = -expediting;
@@ -1370,6 +1377,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
struct rt_read_seg *rtrsp1;
unsigned long long ts;
+ WARN_ON_ONCE(!rcu_is_watching());
newstate = rcutorture_extend_mask(readstate, trsp);
rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
started = cur_ops->get_gp_seq();
@@ -1539,10 +1547,11 @@ rcu_torture_stats_print(void)
n_rcu_torture_boosts,
atomic_long_read(&n_rcu_torture_timers));
torture_onoff_stats();
- pr_cont("barrier: %ld/%ld:%ld\n",
+ pr_cont("barrier: %ld/%ld:%ld ",
data_race(n_barrier_successes),
data_race(n_barrier_attempts),
data_race(n_rcu_torture_barrier_error));
+ pr_cont("read-exits: %ld\n", data_race(n_read_exits));
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) ||
@@ -1634,7 +1643,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
"stall_cpu_block=%d "
"n_barrier_cbs=%d "
- "onoff_interval=%d onoff_holdoff=%d\n",
+ "onoff_interval=%d onoff_holdoff=%d "
+ "read_exit_delay=%d read_exit_burst=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -1643,7 +1653,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
stall_cpu_block,
n_barrier_cbs,
- onoff_interval, onoff_holdoff);
+ onoff_interval, onoff_holdoff,
+ read_exit_delay, read_exit_burst);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -2175,7 +2186,7 @@ static void rcu_torture_barrier1cb(void *rcu_void)
static int rcu_torture_barrier_cbs(void *arg)
{
long myid = (long)arg;
- bool lastphase = 0;
+ bool lastphase = false;
bool newphase;
struct rcu_head rcu;
@@ -2338,6 +2349,99 @@ static bool rcu_torture_can_boost(void)
return true;
}
+static bool read_exit_child_stop;
+static bool read_exit_child_stopped;
+static wait_queue_head_t read_exit_wq;
+
+// Child kthread which just does an rcutorture reader and exits.
+static int rcu_torture_read_exit_child(void *trsp_in)
+{
+ struct torture_random_state *trsp = trsp_in;
+
+ set_user_nice(current, MAX_NICE);
+ // Minimize time between reading and exiting.
+ while (!kthread_should_stop())
+ schedule_timeout_uninterruptible(1);
+ (void)rcu_torture_one_read(trsp);
+ return 0;
+}
+
+// Parent kthread which creates and destroys read-exit child kthreads.
+static int rcu_torture_read_exit(void *unused)
+{
+ int count = 0;
+ bool errexit = false;
+ int i;
+ struct task_struct *tsp;
+ DEFINE_TORTURE_RANDOM(trs);
+
+ // Allocate and initialize.
+ set_user_nice(current, MAX_NICE);
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of test");
+
+ // Each pass through this loop does one read-exit episode.
+ do {
+ if (++count > read_exit_burst) {
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
+ rcu_barrier(); // Wait for task_struct free, avoid OOM.
+ for (i = 0; i < read_exit_delay; i++) {
+ schedule_timeout_uninterruptible(HZ);
+ if (READ_ONCE(read_exit_child_stop))
+ break;
+ }
+ if (!READ_ONCE(read_exit_child_stop))
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
+ count = 0;
+ }
+ if (READ_ONCE(read_exit_child_stop))
+ break;
+ // Spawn child.
+ tsp = kthread_run(rcu_torture_read_exit_child,
+ &trs, "%s",
+ "rcu_torture_read_exit_child");
+ if (IS_ERR(tsp)) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ errexit = true;
+ tsp = NULL;
+ break;
+ }
+ cond_resched();
+ kthread_stop(tsp);
+ n_read_exits ++;
+ stutter_wait("rcu_torture_read_exit");
+ } while (!errexit && !READ_ONCE(read_exit_child_stop));
+
+ // Clean up and exit.
+ smp_store_release(&read_exit_child_stopped, true); // After reaping.
+ smp_mb(); // Store before wakeup.
+ wake_up(&read_exit_wq);
+ while (!torture_must_stop())
+ schedule_timeout_uninterruptible(1);
+ torture_kthread_stopping("rcu_torture_read_exit");
+ return 0;
+}
+
+static int rcu_torture_read_exit_init(void)
+{
+ if (read_exit_burst <= 0)
+ return -EINVAL;
+ init_waitqueue_head(&read_exit_wq);
+ read_exit_child_stop = false;
+ read_exit_child_stopped = false;
+ return torture_create_kthread(rcu_torture_read_exit, NULL,
+ read_exit_task);
+}
+
+static void rcu_torture_read_exit_cleanup(void)
+{
+ if (!read_exit_task)
+ return;
+ WRITE_ONCE(read_exit_child_stop, true);
+ smp_mb(); // Above write before wait.
+ wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped));
+ torture_stop_kthread(rcutorture_read_exit, read_exit_task);
+}
+
static enum cpuhp_state rcutor_hp;
static void
@@ -2359,6 +2463,7 @@ rcu_torture_cleanup(void)
}
show_rcu_gp_kthreads();
+ rcu_torture_read_exit_cleanup();
rcu_torture_barrier_cleanup();
torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
torture_stop_kthread(rcu_torture_stall, stall_task);
@@ -2370,7 +2475,6 @@ rcu_torture_cleanup(void)
reader_tasks[i]);
kfree(reader_tasks);
}
- rcu_torture_current = NULL;
if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++) {
@@ -2682,6 +2786,9 @@ rcu_torture_init(void)
firsterr = rcu_torture_barrier_init();
if (firsterr)
goto unwind;
+ firsterr = rcu_torture_read_exit_init();
+ if (firsterr)
+ goto unwind;
if (object_debug)
rcu_test_debug_objects();
torture_init_end();
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
new file mode 100644
index 000000000000..d9291f883b54
--- /dev/null
+++ b/kernel/rcu/refscale.c
@@ -0,0 +1,717 @@
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Scalability test comparing RCU vs other mechanisms
+// for acquiring references on objects.
+//
+// Copyright (C) Google, 2020.
+//
+// Author: Joel Fernandes <joel@joelfernandes.org>
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/torture.h>
+#include <linux/types.h>
+
+#include "rcu.h"
+
+#define SCALE_FLAG "-ref-scale: "
+
+#define SCALEOUT(s, x...) \
+ pr_alert("%s" SCALE_FLAG s, scale_type, ## x)
+
+#define VERBOSE_SCALEOUT(s, x...) \
+ do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0)
+
+#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \
+ do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
+
+static char *scale_type = "rcu";
+module_param(scale_type, charp, 0444);
+MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
+
+torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
+
+// Wait until there are multiple CPUs before starting test.
+torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
+ "Holdoff time before test start (s)");
+// Number of loops per experiment, all readers execute operations concurrently.
+torture_param(long, loops, 10000, "Number of loops per experiment.");
+// Number of readers, with -1 defaulting to about 75% of the CPUs.
+torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
+// Number of runs.
+torture_param(int, nruns, 30, "Number of experiments to run.");
+// Reader delay in nanoseconds, 0 for no delay.
+torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
+
+#ifdef MODULE
+# define REFSCALE_SHUTDOWN 0
+#else
+# define REFSCALE_SHUTDOWN 1
+#endif
+
+torture_param(bool, shutdown, REFSCALE_SHUTDOWN,
+ "Shutdown at end of scalability tests.");
+
+struct reader_task {
+ struct task_struct *task;
+ int start_reader;
+ wait_queue_head_t wq;
+ u64 last_duration_ns;
+};
+
+static struct task_struct *shutdown_task;
+static wait_queue_head_t shutdown_wq;
+
+static struct task_struct *main_task;
+static wait_queue_head_t main_wq;
+static int shutdown_start;
+
+static struct reader_task *reader_tasks;
+
+// Number of readers that are part of the current experiment.
+static atomic_t nreaders_exp;
+
+// Use to wait for all threads to start.
+static atomic_t n_init;
+static atomic_t n_started;
+static atomic_t n_warmedup;
+static atomic_t n_cooleddown;
+
+// Track which experiment is currently running.
+static int exp_idx;
+
+// Operations vector for selecting different types of tests.
+struct ref_scale_ops {
+ void (*init)(void);
+ void (*cleanup)(void);
+ void (*readsection)(const int nloops);
+ void (*delaysection)(const int nloops, const int udl, const int ndl);
+ const char *name;
+};
+
+static struct ref_scale_ops *cur_ops;
+
+static void un_delay(const int udl, const int ndl)
+{
+ if (udl)
+ udelay(udl);
+ if (ndl)
+ ndelay(ndl);
+}
+
+static void ref_rcu_read_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock();
+ rcu_read_unlock();
+ }
+}
+
+static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock();
+ un_delay(udl, ndl);
+ rcu_read_unlock();
+ }
+}
+
+static void rcu_sync_scale_init(void)
+{
+}
+
+static struct ref_scale_ops rcu_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_rcu_read_section,
+ .delaysection = ref_rcu_delay_section,
+ .name = "rcu"
+};
+
+// Definitions for SRCU ref scale testing.
+DEFINE_STATIC_SRCU(srcu_refctl_scale);
+static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale;
+
+static void srcu_ref_scale_read_section(const int nloops)
+{
+ int i;
+ int idx;
+
+ for (i = nloops; i >= 0; i--) {
+ idx = srcu_read_lock(srcu_ctlp);
+ srcu_read_unlock(srcu_ctlp, idx);
+ }
+}
+
+static void srcu_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ int idx;
+
+ for (i = nloops; i >= 0; i--) {
+ idx = srcu_read_lock(srcu_ctlp);
+ un_delay(udl, ndl);
+ srcu_read_unlock(srcu_ctlp, idx);
+ }
+}
+
+static struct ref_scale_ops srcu_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = srcu_ref_scale_read_section,
+ .delaysection = srcu_ref_scale_delay_section,
+ .name = "srcu"
+};
+
+// Definitions for RCU Tasks ref scale testing: Empty read markers.
+// These definitions also work for RCU Rude readers.
+static void rcu_tasks_ref_scale_read_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--)
+ continue;
+}
+
+static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--)
+ un_delay(udl, ndl);
+}
+
+static struct ref_scale_ops rcu_tasks_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = rcu_tasks_ref_scale_read_section,
+ .delaysection = rcu_tasks_ref_scale_delay_section,
+ .name = "rcu-tasks"
+};
+
+// Definitions for RCU Tasks Trace ref scale testing.
+static void rcu_trace_ref_scale_read_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock_trace();
+ rcu_read_unlock_trace();
+ }
+}
+
+static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock_trace();
+ un_delay(udl, ndl);
+ rcu_read_unlock_trace();
+ }
+}
+
+static struct ref_scale_ops rcu_trace_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = rcu_trace_ref_scale_read_section,
+ .delaysection = rcu_trace_ref_scale_delay_section,
+ .name = "rcu-trace"
+};
+
+// Definitions for reference count
+static atomic_t refcnt;
+
+static void ref_refcnt_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ atomic_inc(&refcnt);
+ atomic_dec(&refcnt);
+ }
+}
+
+static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ atomic_inc(&refcnt);
+ un_delay(udl, ndl);
+ atomic_dec(&refcnt);
+ }
+}
+
+static struct ref_scale_ops refcnt_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_refcnt_section,
+ .delaysection = ref_refcnt_delay_section,
+ .name = "refcnt"
+};
+
+// Definitions for rwlock
+static rwlock_t test_rwlock;
+
+static void ref_rwlock_init(void)
+{
+ rwlock_init(&test_rwlock);
+}
+
+static void ref_rwlock_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ read_lock(&test_rwlock);
+ read_unlock(&test_rwlock);
+ }
+}
+
+static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ read_lock(&test_rwlock);
+ un_delay(udl, ndl);
+ read_unlock(&test_rwlock);
+ }
+}
+
+static struct ref_scale_ops rwlock_ops = {
+ .init = ref_rwlock_init,
+ .readsection = ref_rwlock_section,
+ .delaysection = ref_rwlock_delay_section,
+ .name = "rwlock"
+};
+
+// Definitions for rwsem
+static struct rw_semaphore test_rwsem;
+
+static void ref_rwsem_init(void)
+{
+ init_rwsem(&test_rwsem);
+}
+
+static void ref_rwsem_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ down_read(&test_rwsem);
+ up_read(&test_rwsem);
+ }
+}
+
+static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ down_read(&test_rwsem);
+ un_delay(udl, ndl);
+ up_read(&test_rwsem);
+ }
+}
+
+static struct ref_scale_ops rwsem_ops = {
+ .init = ref_rwsem_init,
+ .readsection = ref_rwsem_section,
+ .delaysection = ref_rwsem_delay_section,
+ .name = "rwsem"
+};
+
+static void rcu_scale_one_reader(void)
+{
+ if (readdelay <= 0)
+ cur_ops->readsection(loops);
+ else
+ cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000);
+}
+
+// Reader kthread. Repeatedly does empty RCU read-side
+// critical section, minimizing update-side interference.
+static int
+ref_scale_reader(void *arg)
+{
+ unsigned long flags;
+ long me = (long)arg;
+ struct reader_task *rt = &(reader_tasks[me]);
+ u64 start;
+ s64 duration;
+
+ VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me);
+ set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ set_user_nice(current, MAX_NICE);
+ atomic_inc(&n_init);
+ if (holdoff)
+ schedule_timeout_interruptible(holdoff * HZ);
+repeat:
+ VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
+
+ // Wait for signal that this reader can start.
+ wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
+ torture_must_stop());
+
+ if (torture_must_stop())
+ goto end;
+
+ // Make sure that the CPU is affinitized appropriately during testing.
+ WARN_ON_ONCE(smp_processor_id() != me);
+
+ WRITE_ONCE(rt->start_reader, 0);
+ if (!atomic_dec_return(&n_started))
+ while (atomic_read_acquire(&n_started))
+ cpu_relax();
+
+ VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx);
+
+
+ // To reduce noise, do an initial cache-warming invocation, check
+ // in, and then keep warming until everyone has checked in.
+ rcu_scale_one_reader();
+ if (!atomic_dec_return(&n_warmedup))
+ while (atomic_read_acquire(&n_warmedup))
+ rcu_scale_one_reader();
+ // Also keep interrupts disabled. This also has the effect
+ // of preventing entries into slow path for rcu_read_unlock().
+ local_irq_save(flags);
+ start = ktime_get_mono_fast_ns();
+
+ rcu_scale_one_reader();
+
+ duration = ktime_get_mono_fast_ns() - start;
+ local_irq_restore(flags);
+
+ rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
+ // To reduce runtime-skew noise, do maintain-load invocations until
+ // everyone is done.
+ if (!atomic_dec_return(&n_cooleddown))
+ while (atomic_read_acquire(&n_cooleddown))
+ rcu_scale_one_reader();
+
+ if (atomic_dec_and_test(&nreaders_exp))
+ wake_up(&main_wq);
+
+ VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)",
+ me, exp_idx, atomic_read(&nreaders_exp));
+
+ if (!torture_must_stop())
+ goto repeat;
+end:
+ torture_kthread_stopping("ref_scale_reader");
+ return 0;
+}
+
+static void reset_readers(void)
+{
+ int i;
+ struct reader_task *rt;
+
+ for (i = 0; i < nreaders; i++) {
+ rt = &(reader_tasks[i]);
+
+ rt->last_duration_ns = 0;
+ }
+}
+
+// Print the results of each reader and return the sum of all their durations.
+static u64 process_durations(int n)
+{
+ int i;
+ struct reader_task *rt;
+ char buf1[64];
+ char *buf;
+ u64 sum = 0;
+
+ buf = kmalloc(128 + nreaders * 32, GFP_KERNEL);
+ if (!buf)
+ return 0;
+ buf[0] = 0;
+ sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
+ exp_idx);
+
+ for (i = 0; i < n && !torture_must_stop(); i++) {
+ rt = &(reader_tasks[i]);
+ sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns);
+
+ if (i % 5 == 0)
+ strcat(buf, "\n");
+ strcat(buf, buf1);
+
+ sum += rt->last_duration_ns;
+ }
+ strcat(buf, "\n");
+
+ SCALEOUT("%s\n", buf);
+
+ kfree(buf);
+ return sum;
+}
+
+// The main_func is the main orchestrator, it performs a bunch of
+// experiments. For every experiment, it orders all the readers
+// involved to start and waits for them to finish the experiment. It
+// then reads their timestamps and starts the next experiment. Each
+// experiment progresses from 1 concurrent reader to N of them at which
+// point all the timestamps are printed.
+static int main_func(void *arg)
+{
+ bool errexit = false;
+ int exp, r;
+ char buf1[64];
+ char *buf;
+ u64 *result_avg;
+
+ set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids));
+ set_user_nice(current, MAX_NICE);
+
+ VERBOSE_SCALEOUT("main_func task started");
+ result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
+ buf = kzalloc(64 + nruns * 32, GFP_KERNEL);
+ if (!result_avg || !buf) {
+ VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ errexit = true;
+ }
+ if (holdoff)
+ schedule_timeout_interruptible(holdoff * HZ);
+
+ // Wait for all threads to start.
+ atomic_inc(&n_init);
+ while (atomic_read(&n_init) < nreaders + 1)
+ schedule_timeout_uninterruptible(1);
+
+ // Start exp readers up per experiment
+ for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
+ if (errexit)
+ break;
+ if (torture_must_stop())
+ goto end;
+
+ reset_readers();
+ atomic_set(&nreaders_exp, nreaders);
+ atomic_set(&n_started, nreaders);
+ atomic_set(&n_warmedup, nreaders);
+ atomic_set(&n_cooleddown, nreaders);
+
+ exp_idx = exp;
+
+ for (r = 0; r < nreaders; r++) {
+ smp_store_release(&reader_tasks[r].start_reader, 1);
+ wake_up(&reader_tasks[r].wq);
+ }
+
+ VERBOSE_SCALEOUT("main_func: experiment started, waiting for %d readers",
+ nreaders);
+
+ wait_event(main_wq,
+ !atomic_read(&nreaders_exp) || torture_must_stop());
+
+ VERBOSE_SCALEOUT("main_func: experiment ended");
+
+ if (torture_must_stop())
+ goto end;
+
+ result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops);
+ }
+
+ // Print the average of all experiments
+ SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
+
+ buf[0] = 0;
+ strcat(buf, "\n");
+ strcat(buf, "Runs\tTime(ns)\n");
+
+ for (exp = 0; exp < nruns; exp++) {
+ u64 avg;
+ u32 rem;
+
+ if (errexit)
+ break;
+ avg = div_u64_rem(result_avg[exp], 1000, &rem);
+ sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem);
+ strcat(buf, buf1);
+ }
+
+ if (!errexit)
+ SCALEOUT("%s", buf);
+
+ // This will shutdown everything including us.
+ if (shutdown) {
+ shutdown_start = 1;
+ wake_up(&shutdown_wq);
+ }
+
+ // Wait for torture to stop us
+ while (!torture_must_stop())
+ schedule_timeout_uninterruptible(1);
+
+end:
+ torture_kthread_stopping("main_func");
+ kfree(result_avg);
+ kfree(buf);
+ return 0;
+}
+
+static void
+ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag)
+{
+ pr_alert("%s" SCALE_FLAG
+ "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
+ verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay);
+}
+
+static void
+ref_scale_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup_begin())
+ return;
+
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
+
+ if (reader_tasks) {
+ for (i = 0; i < nreaders; i++)
+ torture_stop_kthread("ref_scale_reader",
+ reader_tasks[i].task);
+ }
+ kfree(reader_tasks);
+
+ torture_stop_kthread("main_task", main_task);
+ kfree(main_task);
+
+ // Do scale-type-specific cleanup operations.
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
+
+ torture_cleanup_end();
+}
+
+// Shutdown kthread. Just waits to be awakened, then shuts down system.
+static int
+ref_scale_shutdown(void *arg)
+{
+ wait_event(shutdown_wq, shutdown_start);
+
+ smp_mb(); // Wake before output.
+ ref_scale_cleanup();
+ kernel_power_off();
+
+ return -EINVAL;
+}
+
+static int __init
+ref_scale_init(void)
+{
+ long i;
+ int firsterr = 0;
+ static struct ref_scale_ops *scale_ops[] = {
+ &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops,
+ &refcnt_ops, &rwlock_ops, &rwsem_ops,
+ };
+
+ if (!torture_init_begin(scale_type, verbose))
+ return -EBUSY;
+
+ for (i = 0; i < ARRAY_SIZE(scale_ops); i++) {
+ cur_ops = scale_ops[i];
+ if (strcmp(scale_type, cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(scale_ops)) {
+ pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type);
+ pr_alert("rcu-scale types:");
+ for (i = 0; i < ARRAY_SIZE(scale_ops); i++)
+ pr_cont(" %s", scale_ops[i]->name);
+ pr_cont("\n");
+ WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST));
+ firsterr = -EINVAL;
+ cur_ops = NULL;
+ goto unwind;
+ }
+ if (cur_ops->init)
+ cur_ops->init();
+
+ ref_scale_print_module_parms(cur_ops, "Start of test");
+
+ // Shutdown task
+ if (shutdown) {
+ init_waitqueue_head(&shutdown_wq);
+ firsterr = torture_create_kthread(ref_scale_shutdown, NULL,
+ shutdown_task);
+ if (firsterr)
+ goto unwind;
+ schedule_timeout_uninterruptible(1);
+ }
+
+ // Reader tasks (default to ~75% of online CPUs).
+ if (nreaders < 0)
+ nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
+ reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ if (!reader_tasks) {
+ VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+
+ VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders);
+
+ for (i = 0; i < nreaders; i++) {
+ firsterr = torture_create_kthread(ref_scale_reader, (void *)i,
+ reader_tasks[i].task);
+ if (firsterr)
+ goto unwind;
+
+ init_waitqueue_head(&(reader_tasks[i].wq));
+ }
+
+ // Main Task
+ init_waitqueue_head(&main_wq);
+ firsterr = torture_create_kthread(main_func, NULL, main_task);
+ if (firsterr)
+ goto unwind;
+
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ ref_scale_cleanup();
+ return firsterr;
+}
+
+module_init(ref_scale_init);
+module_exit(ref_scale_cleanup);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6d3ef700fb0e..c100acf332ed 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -766,7 +766,7 @@ static void srcu_flip(struct srcu_struct *ssp)
* it, if this function was preempted for enough time for the counters
* to wrap, it really doesn't matter whether or not we expedite the grace
* period. The extra overhead of a needlessly expedited grace period is
- * negligible when amoritized over that time period, and the extra latency
+ * negligible when amortized over that time period, and the extra latency
* of a needlessly non-expedited grace period is similarly negligible.
*/
static bool srcu_might_be_idle(struct srcu_struct *ssp)
@@ -777,14 +777,15 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
unsigned long t;
unsigned long tlast;
+ check_init_srcu_struct(ssp);
/* If the local srcu_data structure has callbacks, not idle. */
- local_irq_save(flags);
- sdp = this_cpu_ptr(ssp->sda);
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_rcu_node(sdp, flags);
if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
- local_irq_restore(flags);
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
return false; /* Callbacks already present, so not idle. */
}
- local_irq_restore(flags);
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
/*
* No local callbacks, so probabalistically probe global state.
@@ -864,9 +865,8 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
}
rhp->func = func;
idx = srcu_read_lock(ssp);
- local_irq_save(flags);
- sdp = this_cpu_ptr(ssp->sda);
- spin_lock_rcu_node(sdp);
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_rcu_node(sdp, flags);
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&ssp->srcu_gp_seq));
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index ce23f6cc5043..835e2df8590a 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -103,6 +103,7 @@ module_param(rcu_task_stall_timeout, int, 0644);
#define RTGS_WAIT_READERS 9
#define RTGS_INVOKE_CBS 10
#define RTGS_WAIT_CBS 11
+#ifndef CONFIG_TINY_RCU
static const char * const rcu_tasks_gp_state_names[] = {
"RTGS_INIT",
"RTGS_WAIT_WAIT_CBS",
@@ -117,6 +118,7 @@ static const char * const rcu_tasks_gp_state_names[] = {
"RTGS_INVOKE_CBS",
"RTGS_WAIT_CBS",
};
+#endif /* #ifndef CONFIG_TINY_RCU */
////////////////////////////////////////////////////////////////////////
//
@@ -129,6 +131,7 @@ static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate)
rtp->gp_jiffies = jiffies;
}
+#ifndef CONFIG_TINY_RCU
/* Return state name. */
static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
{
@@ -139,6 +142,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
return "???";
return rcu_tasks_gp_state_names[j];
}
+#endif /* #ifndef CONFIG_TINY_RCU */
// Enqueue a callback for the specified flavor of Tasks RCU.
static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
@@ -205,7 +209,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
if (!rtp->cbs_head) {
WARN_ON(signal_pending(current));
set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS);
- schedule_timeout_interruptible(HZ/10);
+ schedule_timeout_idle(HZ/10);
}
continue;
}
@@ -227,7 +231,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
cond_resched();
}
/* Paranoid sleep to keep this from entering a tight loop */
- schedule_timeout_uninterruptible(HZ/10);
+ schedule_timeout_idle(HZ/10);
set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
}
@@ -268,6 +272,7 @@ static void __init rcu_tasks_bootup_oddness(void)
#endif /* #ifndef CONFIG_TINY_RCU */
+#ifndef CONFIG_TINY_RCU
/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
@@ -281,6 +286,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
".C"[!!data_race(rtp->cbs_head)],
s);
}
+#endif /* #ifndef CONFIG_TINY_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t);
@@ -336,7 +342,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
/* Slowly back off waiting for holdouts */
set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS);
- schedule_timeout_interruptible(HZ/fract);
+ schedule_timeout_idle(HZ/fract);
if (fract > 1)
fract--;
@@ -402,7 +408,7 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
}
/* Processing between scanning taskslist and draining the holdout list. */
-void rcu_tasks_postscan(struct list_head *hop)
+static void rcu_tasks_postscan(struct list_head *hop)
{
/*
* Wait for tasks that are in the process of exiting. This
@@ -557,10 +563,12 @@ static int __init rcu_spawn_tasks_kthread(void)
}
core_initcall(rcu_spawn_tasks_kthread);
+#ifndef CONFIG_TINY_RCU
static void show_rcu_tasks_classic_gp_kthread(void)
{
show_rcu_tasks_generic_gp_kthread(&rcu_tasks, "");
}
+#endif /* #ifndef CONFIG_TINY_RCU */
/* Do the srcu_read_lock() for the above synchronize_srcu(). */
void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
@@ -682,10 +690,12 @@ static int __init rcu_spawn_tasks_rude_kthread(void)
}
core_initcall(rcu_spawn_tasks_rude_kthread);
+#ifndef CONFIG_TINY_RCU
static void show_rcu_tasks_rude_gp_kthread(void)
{
show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, "");
}
+#endif /* #ifndef CONFIG_TINY_RCU */
#else /* #ifdef CONFIG_TASKS_RUDE_RCU */
static void show_rcu_tasks_rude_gp_kthread(void) {}
@@ -727,8 +737,8 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
#ifdef CONFIG_TASKS_TRACE_RCU
-atomic_t trc_n_readers_need_end; // Number of waited-for readers.
-DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks.
+static atomic_t trc_n_readers_need_end; // Number of waited-for readers.
+static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks.
// Record outstanding IPIs to each CPU. No point in sending two...
static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
@@ -835,7 +845,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
bool ofl = cpu_is_offline(cpu);
if (task_curr(t)) {
- WARN_ON_ONCE(ofl & !is_idle_task(t));
+ WARN_ON_ONCE(ofl && !is_idle_task(t));
// If no chance of heavyweight readers, do it the hard way.
if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
@@ -1118,11 +1128,10 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
* synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period
*
* Control will return to the caller some time after a trace rcu-tasks
- * grace period has elapsed, in other words after all currently
- * executing rcu-tasks read-side critical sections have elapsed. These
- * read-side critical sections are delimited by calls to schedule(),
- * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory,
- * anyway) cond_resched().
+ * grace period has elapsed, in other words after all currently executing
+ * rcu-tasks read-side critical sections have elapsed. These read-side
+ * critical sections are delimited by calls to rcu_read_lock_trace()
+ * and rcu_read_unlock_trace().
*
* This is a very specialized primitive, intended only for a few uses in
* tracing and other situations requiring manipulation of function preambles
@@ -1164,6 +1173,7 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
}
core_initcall(rcu_spawn_tasks_trace_kthread);
+#ifndef CONFIG_TINY_RCU
static void show_rcu_tasks_trace_gp_kthread(void)
{
char buf[64];
@@ -1174,18 +1184,21 @@ static void show_rcu_tasks_trace_gp_kthread(void)
data_race(n_heavy_reader_attempts));
show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf);
}
+#endif /* #ifndef CONFIG_TINY_RCU */
#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
static inline void show_rcu_tasks_trace_gp_kthread(void) {}
#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
+#ifndef CONFIG_TINY_RCU
void show_rcu_tasks_gp_kthreads(void)
{
show_rcu_tasks_classic_gp_kthread();
show_rcu_tasks_rude_gp_kthread();
show_rcu_tasks_trace_gp_kthread();
}
+#endif /* #ifndef CONFIG_TINY_RCU */
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void rcu_tasks_bootup_oddness(void) {}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index dd572ce7c747..aa897c3f2e92 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -23,6 +23,7 @@
#include <linux/cpu.h>
#include <linux/prefetch.h>
#include <linux/slab.h>
+#include <linux/mm.h>
#include "rcu.h"
@@ -84,9 +85,9 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
unsigned long offset = (unsigned long)head->func;
rcu_lock_acquire(&rcu_callback_map);
- if (__is_kfree_rcu_offset(offset)) {
- trace_rcu_invoke_kfree_callback("", head, offset);
- kfree((void *)head - offset);
+ if (__is_kvfree_rcu_offset(offset)) {
+ trace_rcu_invoke_kvfree_callback("", head, offset);
+ kvfree((void *)head - offset);
rcu_lock_release(&rcu_callback_map);
return true;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6c6569e0586c..ac7198ed3197 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -57,6 +57,8 @@
#include <linux/slab.h>
#include <linux/sched/isolation.h>
#include <linux/sched/clock.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include "../time/tick-internal.h"
#include "tree.h"
@@ -175,6 +177,15 @@ module_param(gp_init_delay, int, 0444);
static int gp_cleanup_delay;
module_param(gp_cleanup_delay, int, 0444);
+/*
+ * This rcu parameter is runtime-read-only. It reflects
+ * a minimum allowed number of objects which can be cached
+ * per-CPU. Object size is equal to one page. This value
+ * can be changed at boot time.
+ */
+static int rcu_min_cached_objs = 2;
+module_param(rcu_min_cached_objs, int, 0444);
+
/* Retrieve RCU kthreads priority for rcutorture */
int rcu_get_gp_kthreads_prio(void)
{
@@ -954,7 +965,6 @@ void __rcu_irq_enter_check_tick(void)
/**
* rcu_nmi_enter - inform RCU of entry to NMI context
- * @irq: Is this call from rcu_irq_enter?
*
* If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
* rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
@@ -990,8 +1000,11 @@ noinstr void rcu_nmi_enter(void)
rcu_dynticks_eqs_exit();
// ... but is watching here.
- if (!in_nmi())
+ if (!in_nmi()) {
+ instrumentation_begin();
rcu_cleanup_after_idle();
+ instrumentation_end();
+ }
instrumentation_begin();
// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
@@ -1638,7 +1651,7 @@ static void rcu_gp_slow(int delay)
if (delay > 0 &&
!(rcu_seq_ctr(rcu_state.gp_seq) %
(rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
- schedule_timeout_uninterruptible(delay);
+ schedule_timeout_idle(delay);
}
static unsigned long sleep_duration;
@@ -1661,7 +1674,7 @@ static void rcu_gp_torture_wait(void)
duration = xchg(&sleep_duration, 0UL);
if (duration > 0) {
pr_alert("%s: Waiting %lu jiffies\n", __func__, duration);
- schedule_timeout_uninterruptible(duration);
+ schedule_timeout_idle(duration);
pr_alert("%s: Wait complete\n", __func__);
}
}
@@ -2443,6 +2456,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
local_irq_save(flags);
rcu_nocb_lock(rdp);
count = -rcl.len;
+ rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
is_idle_task(current), rcu_is_callbacks_kthread());
@@ -2726,7 +2740,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
}
*statusp = RCU_KTHREAD_YIELDING;
trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
- schedule_timeout_interruptible(2);
+ schedule_timeout_idle(2);
trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
*statusp = RCU_KTHREAD_WAITING;
}
@@ -2894,8 +2908,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
return; // Enqueued onto ->nocb_bypass, so just leave.
// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
rcu_segcblist_enqueue(&rdp->cblist, head);
- if (__is_kfree_rcu_offset((unsigned long)func))
- trace_rcu_kfree_callback(rcu_state.name, head,
+ if (__is_kvfree_rcu_offset((unsigned long)func))
+ trace_rcu_kvfree_callback(rcu_state.name, head,
(unsigned long)func,
rcu_segcblist_n_cbs(&rdp->cblist));
else
@@ -2957,53 +2971,53 @@ EXPORT_SYMBOL_GPL(call_rcu);
/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (HZ / 50)
#define KFREE_N_BATCHES 2
-
-/*
- * This macro defines how many entries the "records" array
- * will contain. It is based on the fact that the size of
- * kfree_rcu_bulk_data structure becomes exactly one page.
- */
-#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3)
+#define FREE_N_CHANNELS 2
/**
- * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
+ * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
* @nr_records: Number of active pointers in the array
- * @records: Array of the kfree_rcu() pointers
* @next: Next bulk object in the block chain
- * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set
+ * @records: Array of the kvfree_rcu() pointers
*/
-struct kfree_rcu_bulk_data {
+struct kvfree_rcu_bulk_data {
unsigned long nr_records;
- void *records[KFREE_BULK_MAX_ENTR];
- struct kfree_rcu_bulk_data *next;
- struct rcu_head *head_free_debug;
+ struct kvfree_rcu_bulk_data *next;
+ void *records[];
};
+/*
+ * This macro defines how many entries the "records" array
+ * will contain. It is based on the fact that the size of
+ * kvfree_rcu_bulk_data structure becomes exactly one page.
+ */
+#define KVFREE_BULK_MAX_ENTR \
+ ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
+
/**
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
- * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
+ * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
- struct kfree_rcu_bulk_data *bhead_free;
+ struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
struct kfree_rcu_cpu *krcp;
};
/**
* struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
* @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
- * @bcached: Keeps at most one object for later reuse when build chain blocks
+ * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
* @monitor_todo: Tracks whether a @monitor_work delayed work is pending
- * @initialized: The @lock and @rcu_work fields have been initialized
+ * @initialized: The @rcu_work fields have been initialized
+ * @count: Number of objects for which GP not started
*
* This is a per-CPU structure. The reason that it is not included in
* the rcu_data structure is to permit this code to be extracted from
@@ -3012,28 +3026,84 @@ struct kfree_rcu_cpu_work {
*/
struct kfree_rcu_cpu {
struct rcu_head *head;
- struct kfree_rcu_bulk_data *bhead;
- struct kfree_rcu_bulk_data *bcached;
+ struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
- spinlock_t lock;
+ raw_spinlock_t lock;
struct delayed_work monitor_work;
bool monitor_todo;
bool initialized;
- // Number of objects for which GP not started
int count;
+
+ /*
+ * A simple cache list that contains objects for
+ * reuse purpose. In order to save some per-cpu
+ * space the list is singular. Even though it is
+ * lockless an access has to be protected by the
+ * per-cpu lock.
+ */
+ struct llist_head bkvcache;
+ int nr_bkv_objs;
};
-static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
+};
static __always_inline void
-debug_rcu_head_unqueue_bulk(struct rcu_head *head)
+debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
{
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
- for (; head; head = head->next)
- debug_rcu_head_unqueue(head);
+ int i;
+
+ for (i = 0; i < bhead->nr_records; i++)
+ debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
#endif
}
+static inline struct kfree_rcu_cpu *
+krc_this_cpu_lock(unsigned long *flags)
+{
+ struct kfree_rcu_cpu *krcp;
+
+ local_irq_save(*flags); // For safely calling this_cpu_ptr().
+ krcp = this_cpu_ptr(&krc);
+ raw_spin_lock(&krcp->lock);
+
+ return krcp;
+}
+
+static inline void
+krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
+{
+ raw_spin_unlock(&krcp->lock);
+ local_irq_restore(flags);
+}
+
+static inline struct kvfree_rcu_bulk_data *
+get_cached_bnode(struct kfree_rcu_cpu *krcp)
+{
+ if (!krcp->nr_bkv_objs)
+ return NULL;
+
+ krcp->nr_bkv_objs--;
+ return (struct kvfree_rcu_bulk_data *)
+ llist_del_first(&krcp->bkvcache);
+}
+
+static inline bool
+put_cached_bnode(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode)
+{
+ // Check the limit.
+ if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
+ return false;
+
+ llist_add((struct llist_node *) bnode, &krcp->bkvcache);
+ krcp->nr_bkv_objs++;
+ return true;
+
+}
+
/*
* This function is invoked in workqueue context after a grace period.
* It frees all the objects queued on ->bhead_free or ->head_free.
@@ -3041,38 +3111,63 @@ debug_rcu_head_unqueue_bulk(struct rcu_head *head)
static void kfree_rcu_work(struct work_struct *work)
{
unsigned long flags;
+ struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
struct rcu_head *head, *next;
- struct kfree_rcu_bulk_data *bhead, *bnext;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
+ int i, j;
krwp = container_of(to_rcu_work(work),
struct kfree_rcu_cpu_work, rcu_work);
krcp = krwp->krcp;
- spin_lock_irqsave(&krcp->lock, flags);
- head = krwp->head_free;
- krwp->head_free = NULL;
- bhead = krwp->bhead_free;
- krwp->bhead_free = NULL;
- spin_unlock_irqrestore(&krcp->lock, flags);
-
- /* "bhead" is now private, so traverse locklessly. */
- for (; bhead; bhead = bnext) {
- bnext = bhead->next;
- debug_rcu_head_unqueue_bulk(bhead->head_free_debug);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ // Channels 1 and 2.
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ bkvhead[i] = krwp->bkvhead_free[i];
+ krwp->bkvhead_free[i] = NULL;
+ }
- rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_kfree_bulk_callback(rcu_state.name,
- bhead->nr_records, bhead->records);
+ // Channel 3.
+ head = krwp->head_free;
+ krwp->head_free = NULL;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ // Handle two first channels.
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ for (; bkvhead[i]; bkvhead[i] = bnext) {
+ bnext = bkvhead[i]->next;
+ debug_rcu_bhead_unqueue(bkvhead[i]);
+
+ rcu_lock_acquire(&rcu_callback_map);
+ if (i == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ rcu_state.name, bkvhead[i]->nr_records,
+ bkvhead[i]->records);
+
+ kfree_bulk(bkvhead[i]->nr_records,
+ bkvhead[i]->records);
+ } else { // vmalloc() / vfree().
+ for (j = 0; j < bkvhead[i]->nr_records; j++) {
+ trace_rcu_invoke_kvfree_callback(
+ rcu_state.name,
+ bkvhead[i]->records[j], 0);
+
+ vfree(bkvhead[i]->records[j]);
+ }
+ }
+ rcu_lock_release(&rcu_callback_map);
- kfree_bulk(bhead->nr_records, bhead->records);
- rcu_lock_release(&rcu_callback_map);
+ krcp = krc_this_cpu_lock(&flags);
+ if (put_cached_bnode(krcp, bkvhead[i]))
+ bkvhead[i] = NULL;
+ krc_this_cpu_unlock(krcp, flags);
- if (cmpxchg(&krcp->bcached, NULL, bhead))
- free_page((unsigned long) bhead);
+ if (bkvhead[i])
+ free_page((unsigned long) bkvhead[i]);
- cond_resched_tasks_rcu_qs();
+ cond_resched_tasks_rcu_qs();
+ }
}
/*
@@ -3082,14 +3177,15 @@ static void kfree_rcu_work(struct work_struct *work)
*/
for (; head; head = next) {
unsigned long offset = (unsigned long)head->func;
+ void *ptr = (void *)head - offset;
next = head->next;
- debug_rcu_head_unqueue(head);
+ debug_rcu_head_unqueue((struct rcu_head *)ptr);
rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
+ trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
- if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
- kfree((void *)head - offset);
+ if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
+ kvfree(ptr);
rcu_lock_release(&rcu_callback_map);
cond_resched_tasks_rcu_qs();
@@ -3105,8 +3201,8 @@ static void kfree_rcu_work(struct work_struct *work)
static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
{
struct kfree_rcu_cpu_work *krwp;
- bool queued = false;
- int i;
+ bool repeat = false;
+ int i, j;
lockdep_assert_held(&krcp->lock);
@@ -3114,21 +3210,25 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
krwp = &(krcp->krw_arr[i]);
/*
- * Try to detach bhead or head and attach it over any
+ * Try to detach bkvhead or head and attach it over any
* available corresponding free channel. It can be that
* a previous RCU batch is in progress, it means that
* immediately to queue another one is not possible so
* return false to tell caller to retry.
*/
- if ((krcp->bhead && !krwp->bhead_free) ||
+ if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
+ (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
(krcp->head && !krwp->head_free)) {
- /* Channel 1. */
- if (!krwp->bhead_free) {
- krwp->bhead_free = krcp->bhead;
- krcp->bhead = NULL;
+ // Channel 1 corresponds to SLAB ptrs.
+ // Channel 2 corresponds to vmalloc ptrs.
+ for (j = 0; j < FREE_N_CHANNELS; j++) {
+ if (!krwp->bkvhead_free[j]) {
+ krwp->bkvhead_free[j] = krcp->bkvhead[j];
+ krcp->bkvhead[j] = NULL;
+ }
}
- /* Channel 2. */
+ // Channel 3 corresponds to emergency path.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
krcp->head = NULL;
@@ -3137,17 +3237,21 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
WRITE_ONCE(krcp->count, 0);
/*
- * One work is per one batch, so there are two "free channels",
- * "bhead_free" and "head_free" the batch can handle. It can be
- * that the work is in the pending state when two channels have
- * been detached following each other, one by one.
+ * One work is per one batch, so there are three
+ * "free channels", the batch can handle. It can
+ * be that the work is in the pending state when
+ * channels have been detached following by each
+ * other.
*/
queue_rcu_work(system_wq, &krwp->rcu_work);
- queued = true;
}
+
+ // Repeat if any "free" corresponding channel is still busy.
+ if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
+ repeat = true;
}
- return queued;
+ return !repeat;
}
static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
@@ -3157,14 +3261,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
krcp->monitor_todo = false;
if (queue_kfree_rcu_work(krcp)) {
// Success! Our job is done here.
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
return;
}
// Previous RCU batch still in progress, try again later.
krcp->monitor_todo = true;
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
/*
@@ -3177,32 +3281,50 @@ static void kfree_rcu_monitor(struct work_struct *work)
struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
monitor_work.work);
- spin_lock_irqsave(&krcp->lock, flags);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
if (krcp->monitor_todo)
kfree_rcu_drain_unlock(krcp, flags);
else
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
static inline bool
-kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
- struct rcu_head *head, rcu_callback_t func)
+kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
{
- struct kfree_rcu_bulk_data *bnode;
+ struct kvfree_rcu_bulk_data *bnode;
+ int idx;
if (unlikely(!krcp->initialized))
return false;
lockdep_assert_held(&krcp->lock);
+ idx = !!is_vmalloc_addr(ptr);
/* Check if a new block is required. */
- if (!krcp->bhead ||
- krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
- bnode = xchg(&krcp->bcached, NULL);
+ if (!krcp->bkvhead[idx] ||
+ krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
+ bnode = get_cached_bnode(krcp);
if (!bnode) {
- WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
+ /*
+ * To keep this path working on raw non-preemptible
+ * sections, prevent the optional entry into the
+ * allocator as it uses sleeping locks. In fact, even
+ * if the caller of kfree_rcu() is preemptible, this
+ * path still is not, as krcp->lock is a raw spinlock.
+ * With additional page pre-allocation in the works,
+ * hitting this return is going to be much less likely.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return false;
- bnode = (struct kfree_rcu_bulk_data *)
+ /*
+ * NOTE: For one argument of kvfree_rcu() we can
+ * drop the lock and get the page in sleepable
+ * context. That would allow to maintain an array
+ * for the CONFIG_PREEMPT_RT as well if no cached
+ * pages are available.
+ */
+ bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
}
@@ -3212,53 +3334,62 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
/* Initialize the new block. */
bnode->nr_records = 0;
- bnode->next = krcp->bhead;
- bnode->head_free_debug = NULL;
+ bnode->next = krcp->bkvhead[idx];
/* Attach it to the head. */
- krcp->bhead = bnode;
+ krcp->bkvhead[idx] = bnode;
}
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
- head->func = func;
- head->next = krcp->bhead->head_free_debug;
- krcp->bhead->head_free_debug = head;
-#endif
-
/* Finally insert. */
- krcp->bhead->records[krcp->bhead->nr_records++] =
- (void *) head - (unsigned long) func;
+ krcp->bkvhead[idx]->records
+ [krcp->bkvhead[idx]->nr_records++] = ptr;
return true;
}
/*
- * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace
- * period. Please note there are two paths are maintained, one is the main one
- * that uses kfree_bulk() interface and second one is emergency one, that is
- * used only when the main path can not be maintained temporary, due to memory
- * pressure.
+ * Queue a request for lazy invocation of appropriate free routine after a
+ * grace period. Please note there are three paths are maintained, two are the
+ * main ones that use array of pointers interface and third one is emergency
+ * one, that is used only when the main path can not be maintained temporary,
+ * due to memory pressure.
*
- * Each kfree_call_rcu() request is added to a batch. The batch will be drained
+ * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
* every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
* be free'd in workqueue context. This allows us to: batch requests together to
- * reduce the number of grace periods during heavy kfree_rcu() load.
+ * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
*/
-void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
unsigned long flags;
struct kfree_rcu_cpu *krcp;
+ bool success;
+ void *ptr;
- local_irq_save(flags); // For safely calling this_cpu_ptr().
- krcp = this_cpu_ptr(&krc);
- if (krcp->initialized)
- spin_lock(&krcp->lock);
+ if (head) {
+ ptr = (void *) head - (unsigned long) func;
+ } else {
+ /*
+ * Please note there is a limitation for the head-less
+ * variant, that is why there is a clear rule for such
+ * objects: it can be used from might_sleep() context
+ * only. For other places please embed an rcu_head to
+ * your data.
+ */
+ might_sleep();
+ ptr = (unsigned long *) func;
+ }
+
+ krcp = krc_this_cpu_lock(&flags);
// Queue the object but don't yet schedule the batch.
- if (debug_rcu_head_queue(head)) {
+ if (debug_rcu_head_queue(ptr)) {
// Probable double kfree_rcu(), just leak.
WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
__func__, head);
+
+ // Mark as success and leave.
+ success = true;
goto unlock_return;
}
@@ -3266,10 +3397,16 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
* Under high memory pressure GFP_NOWAIT can fail,
* in that case the emergency path is maintained.
*/
- if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) {
+ success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
+ if (!success) {
+ if (head == NULL)
+ // Inline if kvfree_rcu(one_arg) call.
+ goto unlock_return;
+
head->func = func;
head->next = krcp->head;
krcp->head = head;
+ success = true;
}
WRITE_ONCE(krcp->count, krcp->count + 1);
@@ -3282,11 +3419,20 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
}
unlock_return:
- if (krcp->initialized)
- spin_unlock(&krcp->lock);
- local_irq_restore(flags);
+ krc_this_cpu_unlock(krcp, flags);
+
+ /*
+ * Inline kvfree() after synchronize_rcu(). We can do
+ * it from might_sleep() context only, so the current
+ * CPU can pass the QS state.
+ */
+ if (!success) {
+ debug_rcu_head_unqueue((struct rcu_head *) ptr);
+ synchronize_rcu();
+ kvfree(ptr);
+ }
}
-EXPORT_SYMBOL_GPL(kfree_call_rcu);
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
static unsigned long
kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
@@ -3315,11 +3461,11 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
count = krcp->count;
- spin_lock_irqsave(&krcp->lock, flags);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
if (krcp->monitor_todo)
kfree_rcu_drain_unlock(krcp, flags);
else
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
sc->nr_to_scan -= count;
freed += count;
@@ -3328,7 +3474,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
break;
}
- return freed;
+ return freed == 0 ? SHRINK_STOP : freed;
}
static struct shrinker kfree_rcu_shrinker = {
@@ -3346,15 +3492,15 @@ void __init kfree_rcu_scheduler_running(void)
for_each_online_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- spin_lock_irqsave(&krcp->lock, flags);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
if (!krcp->head || krcp->monitor_todo) {
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
continue;
}
krcp->monitor_todo = true;
schedule_delayed_work_on(cpu, &krcp->monitor_work,
KFREE_DRAIN_JIFFIES);
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
}
@@ -3842,10 +3988,9 @@ void rcu_cpu_starting(unsigned int cpu)
{
unsigned long flags;
unsigned long mask;
- int nbits;
- unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp;
+ bool newcpu;
if (per_cpu(rcu_cpu_started, cpu))
return;
@@ -3857,12 +4002,10 @@ void rcu_cpu_starting(unsigned int cpu)
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
- oldmask = rnp->expmaskinitnext;
+ newcpu = !(rnp->expmaskinitnext & mask);
rnp->expmaskinitnext |= mask;
- oldmask ^= rnp->expmaskinitnext;
- nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
/* Allow lockless access for expedited grace periods. */
- smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */
+ smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */
ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4249,13 +4392,23 @@ static void __init kfree_rcu_batch_init(void)
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+ struct kvfree_rcu_bulk_data *bnode;
- spin_lock_init(&krcp->lock);
for (i = 0; i < KFREE_N_BATCHES; i++) {
INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
}
+ for (i = 0; i < rcu_min_cached_objs; i++) {
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+
+ if (bnode)
+ put_cached_bnode(krcp, bnode);
+ else
+ pr_err("Failed to preallocate for %d CPU!\n", cpu);
+ }
+
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
krcp->initialized = true;
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 43991a40b084..c96ae351688b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -41,7 +41,7 @@ struct rcu_node {
raw_spinlock_t __private lock; /* Root rcu_node's lock protects */
/* some rcu_state fields as well as */
/* following. */
- unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */
+ unsigned long gp_seq; /* Track rsp->gp_seq. */
unsigned long gp_seq_needed; /* Track furthest future GP request. */
unsigned long completedqs; /* All QSes done for this node. */
unsigned long qsmask; /* CPUs or groups that need to switch in */
@@ -73,9 +73,9 @@ struct rcu_node {
unsigned long ffmask; /* Fully functional CPUs. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
- int grplo; /* lowest-numbered CPU or group here. */
- int grphi; /* highest-numbered CPU or group here. */
- u8 grpnum; /* CPU/group number for next level up. */
+ int grplo; /* lowest-numbered CPU here. */
+ int grphi; /* highest-numbered CPU here. */
+ u8 grpnum; /* group number for next level up. */
u8 level; /* root is at level 0. */
bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
/* exit RCU read-side critical sections */
@@ -149,7 +149,7 @@ union rcu_noqs {
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
- unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */
+ unsigned long gp_seq; /* Track rsp->gp_seq counter. */
unsigned long gp_seq_needed; /* Track furthest future GP request. */
union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
bool core_needs_qs; /* Core waits for quiesc state. */
@@ -171,6 +171,7 @@ struct rcu_data {
/* different grace periods. */
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
+ unsigned long n_cbs_invoked; /* # callbacks invoked since boot. */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -301,6 +302,8 @@ struct rcu_state {
u8 boost ____cacheline_internodealigned_in_smp;
/* Subject to priority boost. */
unsigned long gp_seq; /* Grace-period sequence #. */
+ unsigned long gp_max; /* Maximum GP duration in */
+ /* jiffies. */
struct task_struct *gp_kthread; /* Task for grace periods. */
struct swait_queue_head gp_wq; /* Where GP task waits. */
short gp_flags; /* Commands for GP task. */
@@ -346,8 +349,6 @@ struct rcu_state {
/* a reluctant CPU. */
unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
/* GP start. */
- unsigned long gp_max; /* Maximum GP duration in */
- /* jiffies. */
const char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 72952edad1e4..1888c0eb1216 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -403,7 +403,7 @@ retry_ipi:
/* Online, so delay for a bit and try again. */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl"));
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_idle(1);
goto retry_ipi;
}
/* CPU really is offline, so we must report its QS. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 352223664ebd..982fc5be5269 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1033,7 +1033,7 @@ static int rcu_boost_kthread(void *arg)
if (spincnt > 10) {
WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING);
trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
- schedule_timeout_interruptible(2);
+ schedule_timeout_idle(2);
trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
spincnt = 0;
}
@@ -2005,7 +2005,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
/* Polling, so trace if first poll in the series. */
if (gotcbs)
trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
- schedule_timeout_interruptible(1);
+ schedule_timeout_idle(1);
} else if (!needwait_gp) {
/* Wait for callbacks to appear. */
trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 54a6dba0280d..b5d3b4794db4 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -237,14 +237,12 @@ struct rcu_stall_chk_rdr {
*/
static bool check_slow_task(struct task_struct *t, void *arg)
{
- struct rcu_node *rnp;
struct rcu_stall_chk_rdr *rscrp = arg;
if (task_curr(t))
return false; // It is running, so decline to inspect it.
rscrp->nesting = t->rcu_read_lock_nesting;
rscrp->rs = t->rcu_read_unlock_special;
- rnp = t->rcu_blocked_node;
rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry);
return true;
}
@@ -468,7 +466,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
/*
* OK, time to rat on our buddy...
- * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * See Documentation/RCU/stallwarn.rst for info on how to debug
* RCU CPU stall warnings.
*/
pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
@@ -535,7 +533,7 @@ static void print_cpu_stall(unsigned long gps)
/*
* OK, time to rat on ourselves...
- * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * See Documentation/RCU/stallwarn.rst for info on how to debug
* RCU CPU stall warnings.
*/
pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
@@ -649,6 +647,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
*/
void show_rcu_gp_kthreads(void)
{
+ unsigned long cbs = 0;
int cpu;
unsigned long j;
unsigned long ja;
@@ -690,9 +689,11 @@ void show_rcu_gp_kthreads(void)
}
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
+ cbs += data_race(rdp->n_cbs_invoked);
if (rcu_segcblist_is_offloaded(&rdp->cblist))
show_rcu_nocb_state(rdp);
}
+ pr_info("RCU callbacks invoked since boot: %lu\n", cbs);
show_rcu_tasks_gp_kthreads();
}
EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 84843adfd939..2de49b5d8dd2 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -42,6 +42,7 @@
#include <linux/kprobes.h>
#include <linux/slab.h>
#include <linux/irq_work.h>
+#include <linux/rcupdate_trace.h>
#define CREATE_TRACE_POINTS
@@ -207,7 +208,7 @@ void rcu_end_inkernel_boot(void)
rcu_unexpedite_gp();
if (rcu_normal_after_boot)
WRITE_ONCE(rcu_normal, 1);
- rcu_boot_ended = 1;
+ rcu_boot_ended = true;
}
/*
@@ -279,6 +280,7 @@ struct lockdep_map rcu_sched_lock_map = {
};
EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
+// Tell lockdep when RCU callbacks are being invoked.
static struct lock_class_key rcu_callback_key;
struct lockdep_map rcu_callback_map =
STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
@@ -390,13 +392,14 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
might_sleep();
continue;
}
- init_rcu_head_on_stack(&rs_array[i].head);
- init_completion(&rs_array[i].completion);
for (j = 0; j < i; j++)
if (crcu_array[j] == crcu_array[i])
break;
- if (j == i)
+ if (j == i) {
+ init_rcu_head_on_stack(&rs_array[i].head);
+ init_completion(&rs_array[i].completion);
(crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+ }
}
/* Wait for all callbacks to be invoked. */
@@ -407,9 +410,10 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
for (j = 0; j < i; j++)
if (crcu_array[j] == crcu_array[i])
break;
- if (j == i)
+ if (j == i) {
wait_for_completion(&rs_array[i].completion);
- destroy_rcu_head_on_stack(&rs_array[i].head);
+ destroy_rcu_head_on_stack(&rs_array[i].head);
+ }
}
}
EXPORT_SYMBOL_GPL(__wait_rcu_gp);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 491f1347bf43..e7b78d5ae1ab 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -26,7 +26,7 @@ int C_A_D = 1;
struct pid *cad_pid;
EXPORT_SYMBOL(cad_pid);
-#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32)
+#if defined(CONFIG_ARM)
#define DEFAULT_REBOOT_MODE = REBOOT_HARD
#else
#define DEFAULT_REBOOT_MODE
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2142c6767682..4a0e7b449b88 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,6 +6,10 @@
*
* Copyright (C) 1991-2002 Linus Torvalds
*/
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+#undef CREATE_TRACE_POINTS
+
#include "sched.h"
#include <linux/nospec.h>
@@ -23,9 +27,6 @@
#include "pelt.h"
#include "smp.h"
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
-
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
* associated with them) to allow external modules to probe them.
@@ -36,6 +37,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -75,6 +79,100 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
+
+/*
+ * Serialization rules:
+ *
+ * Lock order:
+ *
+ * p->pi_lock
+ * rq->lock
+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
+ *
+ * rq1->lock
+ * rq2->lock where: rq1 < rq2
+ *
+ * Regular state:
+ *
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
+ * always looks at the local rq data structures to find the most elegible task
+ * to run next.
+ *
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
+ * the local CPU to avoid bouncing the runqueue state around [ see
+ * ttwu_queue_wakelist() ]
+ *
+ * Task wakeup, specifically wakeups that involve migration, are horribly
+ * complicated to avoid having to take two rq->locks.
+ *
+ * Special state:
+ *
+ * System-calls and anything external will use task_rq_lock() which acquires
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
+ * stable while holding either lock:
+ *
+ * - sched_setaffinity()/
+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
+ * - set_user_nice(): p->se.load, p->*prio
+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
+ * p->se.load, p->rt_priority,
+ * p->dl.dl_{runtime, deadline, period, flags, bw, density}
+ * - sched_setnuma(): p->numa_preferred_nid
+ * - sched_move_task()/
+ * cpu_cgroup_fork(): p->sched_task_group
+ * - uclamp_update_active() p->uclamp*
+ *
+ * p->state <- TASK_*:
+ *
+ * is changed locklessly using set_current_state(), __set_current_state() or
+ * set_special_state(), see their respective comments, or by
+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against
+ * concurrent self.
+ *
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
+ *
+ * is set by activate_task() and cleared by deactivate_task(), under
+ * rq->lock. Non-zero indicates the task is runnable, the special
+ * ON_RQ_MIGRATING state is used for migration without holding both
+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
+ *
+ * p->on_cpu <- { 0, 1 }:
+ *
+ * is set by prepare_task() and cleared by finish_task() such that it will be
+ * set before p is scheduled-in and cleared after p is scheduled-out, both
+ * under rq->lock. Non-zero indicates the task is running on its CPU.
+ *
+ * [ The astute reader will observe that it is possible for two tasks on one
+ * CPU to have ->on_cpu = 1 at the same time. ]
+ *
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
+ *
+ * - Don't call set_task_cpu() on a blocked task:
+ *
+ * We don't care what CPU we're not running on, this simplifies hotplug,
+ * the CPU assignment of blocked tasks isn't required to be valid.
+ *
+ * - for try_to_wake_up(), called under p->pi_lock:
+ *
+ * This allows try_to_wake_up() to only take one rq->lock, see its comment.
+ *
+ * - for migration called under rq->lock:
+ * [ see task_on_rq_migrating() in task_rq_lock() ]
+ *
+ * o move_queued_task()
+ * o detach_task()
+ *
+ * - for migration called under double_rq_lock():
+ *
+ * o __migrate_swap_task()
+ * o push_rt_task() / pull_rt_task()
+ * o push_dl_task() / pull_dl_task()
+ * o dl_task_offline_migration()
+ *
+ */
+
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -791,9 +889,46 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
/* Max allowed maximum utilization */
unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+/*
+ * By default RT tasks run at the maximum performance point/capacity of the
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
+ * SCHED_CAPACITY_SCALE.
+ *
+ * This knob allows admins to change the default behavior when uclamp is being
+ * used. In battery powered devices, particularly, running at the maximum
+ * capacity and frequency will increase energy consumption and shorten the
+ * battery life.
+ *
+ * This knob only affects RT tasks that their uclamp_se->user_defined == false.
+ *
+ * This knob will not override the system default sched_util_clamp_min defined
+ * above.
+ */
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+
/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
+/*
+ * This static key is used to reduce the uclamp overhead in the fast path. It
+ * primarily disables the call to uclamp_rq_{inc, dec}() in
+ * enqueue/dequeue_task().
+ *
+ * This allows users to continue to enable uclamp in their kernel config with
+ * minimum uclamp overhead in the fast path.
+ *
+ * As soon as userspace modifies any of the uclamp knobs, the static key is
+ * enabled, since we have an actual users that make use of uclamp
+ * functionality.
+ *
+ * The knobs that would enable this static key are:
+ *
+ * * A task modifying its uclamp value with sched_setattr().
+ * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
+ * * An admin modifying the cgroup cpu.uclamp.{min, max}
+ */
+DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
+
/* Integer rounded range for each bucket */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
@@ -873,6 +1008,64 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
return uclamp_idle_value(rq, clamp_id, clamp_value);
}
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+ unsigned int default_util_min;
+ struct uclamp_se *uc_se;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ uc_se = &p->uclamp_req[UCLAMP_MIN];
+
+ /* Only sync if user didn't override the default */
+ if (uc_se->user_defined)
+ return;
+
+ default_util_min = sysctl_sched_uclamp_util_min_rt_default;
+ uclamp_se_set(uc_se, default_util_min, false);
+}
+
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (!rt_task(p))
+ return;
+
+ /* Protect updates to p->uclamp_* */
+ rq = task_rq_lock(p, &rf);
+ __uclamp_update_util_min_rt_default(p);
+ task_rq_unlock(rq, p, &rf);
+}
+
+static void uclamp_sync_util_min_rt_default(void)
+{
+ struct task_struct *g, *p;
+
+ /*
+ * copy_process() sysctl_uclamp
+ * uclamp_min_rt = X;
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
+ * // link thread smp_mb__after_spinlock()
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
+ * sched_post_fork() for_each_process_thread()
+ * __uclamp_sync_rt() __uclamp_sync_rt()
+ *
+ * Ensures that either sched_post_fork() will observe the new
+ * uclamp_min_rt or for_each_process_thread() will observe the new
+ * task.
+ */
+ read_lock(&tasklist_lock);
+ smp_mb__after_spinlock();
+ read_unlock(&tasklist_lock);
+
+ rcu_read_lock();
+ for_each_process_thread(g, p)
+ uclamp_update_util_min_rt_default(p);
+ rcu_read_unlock();
+}
+
static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
@@ -990,10 +1183,38 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
lockdep_assert_held(&rq->lock);
+ /*
+ * If sched_uclamp_used was enabled after task @p was enqueued,
+ * we could end up with unbalanced call to uclamp_rq_dec_id().
+ *
+ * In this case the uc_se->active flag should be false since no uclamp
+ * accounting was performed at enqueue time and we can just return
+ * here.
+ *
+ * Need to be careful of the following enqeueue/dequeue ordering
+ * problem too
+ *
+ * enqueue(taskA)
+ * // sched_uclamp_used gets enabled
+ * enqueue(taskB)
+ * dequeue(taskA)
+ * // Must not decrement bukcet->tasks here
+ * dequeue(taskB)
+ *
+ * where we could end up with stale data in uc_se and
+ * bucket[uc_se->bucket_id].
+ *
+ * The following check here eliminates the possibility of such race.
+ */
+ if (unlikely(!uc_se->active))
+ return;
+
bucket = &uc_rq->bucket[uc_se->bucket_id];
+
SCHED_WARN_ON(!bucket->tasks);
if (likely(bucket->tasks))
bucket->tasks--;
+
uc_se->active = false;
/*
@@ -1021,6 +1242,15 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
enum uclamp_id clamp_id;
+ /*
+ * Avoid any overhead until uclamp is actually used by the userspace.
+ *
+ * The condition is constructed such that a NOP is generated when
+ * sched_uclamp_used is disabled.
+ */
+ if (!static_branch_unlikely(&sched_uclamp_used))
+ return;
+
if (unlikely(!p->sched_class->uclamp_enabled))
return;
@@ -1036,6 +1266,15 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
enum uclamp_id clamp_id;
+ /*
+ * Avoid any overhead until uclamp is actually used by the userspace.
+ *
+ * The condition is constructed such that a NOP is generated when
+ * sched_uclamp_used is disabled.
+ */
+ if (!static_branch_unlikely(&sched_uclamp_used))
+ return;
+
if (unlikely(!p->sched_class->uclamp_enabled))
return;
@@ -1114,12 +1353,13 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
- int old_min, old_max;
+ int old_min, old_max, old_min_rt;
int result;
mutex_lock(&uclamp_mutex);
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
+ old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
result = proc_dointvec(table, write, buffer, lenp, ppos);
if (result)
@@ -1128,7 +1368,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
goto done;
if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
- sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
+ sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
+
result = -EINVAL;
goto undo;
}
@@ -1144,8 +1386,15 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
update_root_tg = true;
}
- if (update_root_tg)
+ if (update_root_tg) {
+ static_branch_enable(&sched_uclamp_used);
uclamp_update_root_tg();
+ }
+
+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
+ static_branch_enable(&sched_uclamp_used);
+ uclamp_sync_util_min_rt_default();
+ }
/*
* We update all RUNNABLE tasks only when task groups are in use.
@@ -1158,6 +1407,7 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
+ sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
done:
mutex_unlock(&uclamp_mutex);
@@ -1180,6 +1430,15 @@ static int uclamp_validate(struct task_struct *p,
if (upper_bound > SCHED_CAPACITY_SCALE)
return -EINVAL;
+ /*
+ * We have valid uclamp attributes; make sure uclamp is enabled.
+ *
+ * We need to do that here, because enabling static branches is a
+ * blocking operation which obviously cannot be done while holding
+ * scheduler locks.
+ */
+ static_branch_enable(&sched_uclamp_used);
+
return 0;
}
@@ -1194,17 +1453,20 @@ static void __setscheduler_uclamp(struct task_struct *p,
*/
for_each_clamp_id(clamp_id) {
struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
- unsigned int clamp_value = uclamp_none(clamp_id);
/* Keep using defined clamps across class changes */
if (uc_se->user_defined)
continue;
- /* By default, RT tasks always get 100% boost */
+ /*
+ * RT by default have a 100% boost value that could be modified
+ * at runtime.
+ */
if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
- clamp_value = uclamp_none(UCLAMP_MAX);
+ __uclamp_update_util_min_rt_default(p);
+ else
+ uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
- uclamp_se_set(uc_se, clamp_value, false);
}
if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
@@ -1225,6 +1487,10 @@ static void uclamp_fork(struct task_struct *p)
{
enum uclamp_id clamp_id;
+ /*
+ * We don't need to hold task_rq_lock() when updating p->uclamp_* here
+ * as the task is still at its early fork stages.
+ */
for_each_clamp_id(clamp_id)
p->uclamp[clamp_id].active = false;
@@ -1237,19 +1503,33 @@ static void uclamp_fork(struct task_struct *p)
}
}
+static void uclamp_post_fork(struct task_struct *p)
+{
+ uclamp_update_util_min_rt_default(p);
+}
+
+static void __init init_uclamp_rq(struct rq *rq)
+{
+ enum uclamp_id clamp_id;
+ struct uclamp_rq *uc_rq = rq->uclamp;
+
+ for_each_clamp_id(clamp_id) {
+ uc_rq[clamp_id] = (struct uclamp_rq) {
+ .value = uclamp_none(clamp_id)
+ };
+ }
+
+ rq->uclamp_flags = 0;
+}
+
static void __init init_uclamp(void)
{
struct uclamp_se uc_max = {};
enum uclamp_id clamp_id;
int cpu;
- mutex_init(&uclamp_mutex);
-
- for_each_possible_cpu(cpu) {
- memset(&cpu_rq(cpu)->uclamp, 0,
- sizeof(struct uclamp_rq)*UCLAMP_CNT);
- cpu_rq(cpu)->uclamp_flags = 0;
- }
+ for_each_possible_cpu(cpu)
+ init_uclamp_rq(cpu_rq(cpu));
for_each_clamp_id(clamp_id) {
uclamp_se_set(&init_task.uclamp_req[clamp_id],
@@ -1278,6 +1558,7 @@ static inline int uclamp_validate(struct task_struct *p,
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
+static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */
@@ -1404,20 +1685,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
- const struct sched_class *class;
-
- if (p->sched_class == rq->curr->sched_class) {
+ if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
- } else {
- for_each_class(class) {
- if (class == rq->curr->sched_class)
- break;
- if (class == p->sched_class) {
- resched_curr(rq);
- break;
- }
- }
- }
+ else if (p->sched_class > rq->curr->sched_class)
+ resched_curr(rq);
/*
* A queue event has occurred, and we're going to schedule. In
@@ -1468,8 +1739,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
{
lockdep_assert_held(&rq->lock);
- WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
- dequeue_task(rq, p, DEQUEUE_NOCLOCK);
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
@@ -1477,8 +1747,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
rq_lock(rq, rf);
BUG_ON(task_cpu(p) != new_cpu);
- enqueue_task(rq, p, 0);
- p->on_rq = TASK_ON_RQ_QUEUED;
+ activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
return rq;
@@ -2243,12 +2512,31 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
}
/*
- * Called in case the task @p isn't fully descheduled from its runqueue,
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
- * since all we need to do is flip p->state to TASK_RUNNING, since
- * the task is still ->on_rq.
+ * Consider @p being inside a wait loop:
+ *
+ * for (;;) {
+ * set_current_state(TASK_UNINTERRUPTIBLE);
+ *
+ * if (CONDITION)
+ * break;
+ *
+ * schedule();
+ * }
+ * __set_current_state(TASK_RUNNING);
+ *
+ * between set_current_state() and schedule(). In this case @p is still
+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
+ * an atomic manner.
+ *
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
+ * then schedule() must still happen and p->state can be changed to
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
+ * need to do a full wakeup with enqueue.
+ *
+ * Returns: %true when the wakeup is done,
+ * %false otherwise.
*/
-static int ttwu_remote(struct task_struct *p, int wake_flags)
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
{
struct rq_flags rf;
struct rq *rq;
@@ -2389,6 +2677,14 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
return false;
}
+
+#else /* !CONFIG_SMP */
+
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+ return false;
+}
+
#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2396,10 +2692,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
-#if defined(CONFIG_SMP)
if (ttwu_queue_wakelist(p, cpu, wake_flags))
return;
-#endif
rq_lock(rq, &rf);
update_rq_clock(rq);
@@ -2455,8 +2749,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* migration. However the means are completely different as there is no lock
* chain to provide order. Instead we do:
*
- * 1) smp_store_release(X->on_cpu, 0)
- * 2) smp_cond_load_acquire(!X->on_cpu)
+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
*
* Example:
*
@@ -2496,15 +2790,33 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
*
- * If (@state & @p->state) @p->state = TASK_RUNNING.
+ * Conceptually does:
+ *
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
*
* If the task was not queued/runnable, also place it back on a runqueue.
*
- * Atomic against schedule() which would dequeue a task, also see
- * set_current_state().
+ * This function is atomic against schedule() which would dequeue the task.
+ *
+ * It issues a full memory barrier before accessing @p->state, see the comment
+ * with set_current_state().
*
- * This function executes a full memory barrier before accessing the task
- * state; see set_current_state().
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
+ *
+ * Relies on p->pi_lock stabilizing:
+ * - p->sched_class
+ * - p->cpus_ptr
+ * - p->sched_task_group
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
+ *
+ * Tries really hard to only take one task_rq(p)->lock for performance.
+ * Takes rq->lock in:
+ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
+ * - ttwu_queue() -- new rq, for enqueue of the task;
+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
+ *
+ * As a consequence we race really badly with just about everything. See the
+ * many memory barriers and their comments for details.
*
* Return: %true if @p->state changes (an actual wakeup was done),
* %false otherwise.
@@ -2520,7 +2832,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
* == smp_processor_id()'. Together this means we can special
- * case the whole 'p->on_rq && ttwu_remote()' case below
+ * case the whole 'p->on_rq && ttwu_runnable()' case below
* without taking any locks.
*
* In particular:
@@ -2541,8 +2853,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
- * reordered with p->state check below. This pairs with mb() in
- * set_current_state() the waiting thread does.
+ * reordered with p->state check below. This pairs with smp_store_mb()
+ * in set_current_state() that the waiting thread does.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
@@ -2577,7 +2889,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* A similar smb_rmb() lives in try_invoke_on_locked_down_task().
*/
smp_rmb();
- if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags))
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
goto unlock;
if (p->in_iowait) {
@@ -2990,6 +3302,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}
+void sched_post_fork(struct task_struct *p)
+{
+ uclamp_post_fork(p);
+}
+
unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
@@ -3147,8 +3464,10 @@ static inline void prepare_task(struct task_struct *next)
/*
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
+ *
+ * See the ttwu() WF_ON_CPU case and its ordering comment.
*/
- next->on_cpu = 1;
+ WRITE_ONCE(next->on_cpu, 1);
#endif
}
@@ -3156,8 +3475,9 @@ static inline void finish_task(struct task_struct *prev)
{
#ifdef CONFIG_SMP
/*
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
- * We must ensure this doesn't happen until the switch is completely
+ * This must be the very last reference to @prev from this CPU. After
+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
+ * must ensure this doesn't happen until the switch is completely
* finished.
*
* In particular, the load of prev->state in finish_task_switch() must
@@ -3656,17 +3976,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
-DEFINE_PER_CPU(unsigned long, thermal_pressure);
-
-void arch_set_thermal_pressure(struct cpumask *cpus,
- unsigned long th_pressure)
-{
- int cpu;
-
- for_each_cpu(cpu, cpus)
- WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
-}
-
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -4029,8 +4338,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* higher scheduling class, because otherwise those loose the
* opportunity to pull in more work from other CPUs.
*/
- if (likely((prev->sched_class == &idle_sched_class ||
- prev->sched_class == &fair_sched_class) &&
+ if (likely(prev->sched_class <= &fair_sched_class &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = pick_next_task_fair(rq, prev, rf);
@@ -5519,6 +5827,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
kattr.sched_nice = task_nice(p);
#ifdef CONFIG_UCLAMP_TASK
+ /*
+ * This could race with another potential updater, but this is fine
+ * because it'll correctly read the old or the new value. We don't need
+ * to guarantee who wins the race as long as it doesn't return garbage.
+ */
kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
@@ -5876,7 +6189,7 @@ again:
if (task_running(p_rq, p) || p->state)
goto out_unlock;
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+ yielded = curr->sched_class->yield_to_task(rq, p);
if (yielded) {
schedstat_inc(rq->yld_count);
/*
@@ -6710,6 +7023,14 @@ void __init sched_init(void)
unsigned long ptr = 0;
int i;
+ /* Make sure the linker didn't screw up */
+ BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
+ &fair_sched_class + 1 != &rt_sched_class ||
+ &rt_sched_class + 1 != &dl_sched_class);
+#ifdef CONFIG_SMP
+ BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
+#endif
+
wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7431,6 +7752,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
if (req.ret)
return req.ret;
+ static_branch_enable(&sched_uclamp_used);
+
mutex_lock(&uclamp_mutex);
rcu_read_lock();
@@ -8118,4 +8441,7 @@ const u32 sched_prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-#undef CREATE_TRACE_POINTS
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
+{
+ trace_sched_update_nr_running_tp(rq, count);
+}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5cc4012572ec..8cb06c8c7eb1 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -121,6 +121,30 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
+ unsigned long cap, max_cap = 0;
+ int cpu, max_cpu = -1;
+
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return 1;
+
+ /* Ensure the capacity of the CPUs fits the task. */
+ for_each_cpu(cpu, later_mask) {
+ if (!dl_task_fits_capacity(p, cpu)) {
+ cpumask_clear_cpu(cpu, later_mask);
+
+ cap = capacity_orig_of(cpu);
+
+ if (cap > max_cap ||
+ (cpu == task_cpu(p) && cap == max_cap)) {
+ max_cap = cap;
+ max_cpu = cpu;
+ }
+ }
+ }
+
+ if (cpumask_empty(later_mask))
+ cpumask_set_cpu(max_cpu, later_mask);
+
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 7fbaee24c824..e39008242cf4 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -210,7 +210,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
unsigned long dl_util, util, irq;
struct rq *rq = cpu_rq(cpu);
- if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
+ if (!uclamp_is_used() &&
type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
}
@@ -909,11 +909,7 @@ struct cpufreq_governor *cpufreq_default_governor(void)
}
#endif
-static int __init sugov_register(void)
-{
- return cpufreq_register_governor(&schedutil_gov);
-}
-core_initcall(sugov_register);
+cpufreq_governor_init(schedutil_gov);
#ifdef CONFIG_ENERGY_MODEL
extern bool sched_energy_update;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ff9435dee1df..5a55d2300452 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -520,50 +520,6 @@ void account_idle_ticks(unsigned long ticks)
}
/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static u64 scale_stime(u64 stime, u64 rtime, u64 total)
-{
- u64 scaled;
-
- for (;;) {
- /* Make sure "rtime" is the bigger of stime/rtime */
- if (stime > rtime)
- swap(rtime, stime);
-
- /* Make sure 'total' fits in 32 bits */
- if (total >> 32)
- goto drop_precision;
-
- /* Does rtime (and thus stime) fit in 32 bits? */
- if (!(rtime >> 32))
- break;
-
- /* Can we just balance rtime/stime rather than dropping bits? */
- if (stime >> 31)
- goto drop_precision;
-
- /* We can grow stime and shrink rtime and try to make them both fit */
- stime <<= 1;
- rtime >>= 1;
- continue;
-
-drop_precision:
- /* We drop from rtime, it has more bits than stime */
- rtime >>= 1;
- total >>= 1;
- }
-
- /*
- * Make sure gcc understands that this is a 32x32->64 multiply,
- * followed by a 64/32->64 divide.
- */
- scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
- return scaled;
-}
-
-/*
* Adjust tick based cputime random precision against scheduler runtime
* accounting.
*
@@ -622,7 +578,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
goto update;
}
- stime = scale_stime(stime, rtime, stime + utime);
+ stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
update:
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index f63f337c7147..3862a28cd05d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -54,15 +54,49 @@ static inline struct dl_bw *dl_bw_of(int i)
static inline int dl_bw_cpus(int i)
{
struct root_domain *rd = cpu_rq(i)->rd;
- int cpus = 0;
+ int cpus;
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
"sched RCU must be held");
+
+ if (cpumask_subset(rd->span, cpu_active_mask))
+ return cpumask_weight(rd->span);
+
+ cpus = 0;
+
for_each_cpu_and(i, rd->span, cpu_active_mask)
cpus++;
return cpus;
}
+
+static inline unsigned long __dl_bw_capacity(int i)
+{
+ struct root_domain *rd = cpu_rq(i)->rd;
+ unsigned long cap = 0;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+
+ for_each_cpu_and(i, rd->span, cpu_active_mask)
+ cap += capacity_orig_of(i);
+
+ return cap;
+}
+
+/*
+ * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity
+ * of the CPU the task is running on rather rd's \Sum CPU capacity.
+ */
+static inline unsigned long dl_bw_capacity(int i)
+{
+ if (!static_branch_unlikely(&sched_asym_cpucapacity) &&
+ capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
+ return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
+ } else {
+ return __dl_bw_capacity(i);
+ }
+}
#else
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -73,6 +107,11 @@ static inline int dl_bw_cpus(int i)
{
return 1;
}
+
+static inline unsigned long dl_bw_capacity(int i)
+{
+ return SCHED_CAPACITY_SCALE;
+}
#endif
static inline
@@ -1098,7 +1137,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
* cannot use the runtime, and so it replenishes the task. This rule
* works fine for implicit deadline tasks (deadline == period), and the
* CBS was designed for implicit deadline tasks. However, a task with
- * constrained deadline (deadine < period) might be awakened after the
+ * constrained deadline (deadline < period) might be awakened after the
* deadline, but before the next period. In this case, replenishing the
* task would allow it to run for runtime / deadline. As in this case
* deadline < period, CBS enables a task to run for more than the
@@ -1604,6 +1643,7 @@ static int
select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
+ bool select_rq;
struct rq *rq;
if (sd_flag != SD_BALANCE_WAKE)
@@ -1623,10 +1663,19 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
* other hand, if it has a shorter deadline, we
* try to make it stay here, it might be important.
*/
- if (unlikely(dl_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- !dl_entity_preempt(&p->dl, &curr->dl)) &&
- (p->nr_cpus_allowed > 1)) {
+ select_rq = unlikely(dl_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 ||
+ !dl_entity_preempt(&p->dl, &curr->dl)) &&
+ p->nr_cpus_allowed > 1;
+
+ /*
+ * Take the capacity of the CPU into account to
+ * ensure it fits the requirement of the task.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity))
+ select_rq |= !dl_task_fits_capacity(p, cpu);
+
+ if (select_rq) {
int target = find_later_rq(p);
if (target != -1 &&
@@ -2430,8 +2479,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
}
}
-const struct sched_class dl_sched_class = {
- .next = &rt_sched_class,
+const struct sched_class dl_sched_class
+ __attribute__((section("__dl_sched_class"))) = {
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
@@ -2551,11 +2600,12 @@ void sched_dl_do_global(void)
int sched_dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
{
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
u64 period = attr->sched_period ?: attr->sched_deadline;
u64 runtime = attr->sched_runtime;
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
- int cpus, err = -1;
+ int cpus, err = -1, cpu = task_cpu(p);
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+ unsigned long cap;
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return 0;
@@ -2570,15 +2620,17 @@ int sched_dl_overflow(struct task_struct *p, int policy,
* allocated bandwidth of the container.
*/
raw_spin_lock(&dl_b->lock);
- cpus = dl_bw_cpus(task_cpu(p));
+ cpus = dl_bw_cpus(cpu);
+ cap = dl_bw_capacity(cpu);
+
if (dl_policy(policy) && !task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+ !__dl_overflow(dl_b, cap, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+ !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) {
/*
* XXX this is slightly incorrect: when the task
* utilization decreases, we should delay the total
@@ -2635,6 +2687,14 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
}
/*
+ * Default limits for DL period; on the top end we guard against small util
+ * tasks still getting rediculous long effective runtimes, on the bottom end we
+ * guard against timer DoS.
+ */
+unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
+unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */
+
+/*
* This function validates the new parameters of a -deadline task.
* We ask for the deadline not being zero, and greater or equal
* than the runtime, as well as the period of being zero or
@@ -2646,6 +2706,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
*/
bool __checkparam_dl(const struct sched_attr *attr)
{
+ u64 period, max, min;
+
/* special dl tasks don't actually use any parameter */
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return true;
@@ -2669,12 +2731,21 @@ bool __checkparam_dl(const struct sched_attr *attr)
attr->sched_period & (1ULL << 63))
return false;
+ period = attr->sched_period;
+ if (!period)
+ period = attr->sched_deadline;
+
/* runtime <= deadline <= period (if period != 0) */
- if ((attr->sched_period != 0 &&
- attr->sched_period < attr->sched_deadline) ||
+ if (period < attr->sched_deadline ||
attr->sched_deadline < attr->sched_runtime)
return false;
+ max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC;
+ min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC;
+
+ if (period < min || period > max)
+ return false;
+
return true;
}
@@ -2715,19 +2786,19 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
#ifdef CONFIG_SMP
int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
{
+ unsigned long flags, cap;
unsigned int dest_cpu;
struct dl_bw *dl_b;
bool overflow;
- int cpus, ret;
- unsigned long flags;
+ int ret;
dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
rcu_read_lock_sched();
dl_b = dl_bw_of(dest_cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(dest_cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+ cap = dl_bw_capacity(dest_cpu);
+ overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw);
if (overflow) {
ret = -EBUSY;
} else {
@@ -2737,6 +2808,8 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
* We will free resources in the source root_domain
* later on (see set_cpus_allowed_dl()).
*/
+ int cpus = dl_bw_cpus(dest_cpu);
+
__dl_add(dl_b, p->dl.dl_bw, cpus);
ret = 0;
}
@@ -2769,16 +2842,15 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
bool dl_cpu_busy(unsigned int cpu)
{
- unsigned long flags;
+ unsigned long flags, cap;
struct dl_bw *dl_b;
bool overflow;
- int cpus;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ cap = dl_bw_capacity(cpu);
+ overflow = __dl_overflow(dl_b, cap, 0, 0);
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fa8dbcfa4d..1a68a0536add 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -22,8 +22,6 @@
*/
#include "sched.h"
-#include <trace/events/sched.h>
-
/*
* Targeted preemption latency for CPU-bound tasks:
*
@@ -3094,7 +3092,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
#ifdef CONFIG_SMP
do {
- u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+ u32 divider = get_pelt_divider(&se->avg);
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
@@ -3440,16 +3438,18 @@ static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
- /*
- * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
- * See ___update_load_avg() for details.
- */
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ u32 divider;
/* Nothing to update */
if (!delta)
return;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ divider = get_pelt_divider(&cfs_rq->avg);
+
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
se->avg.util_sum = se->avg.util_avg * divider;
@@ -3463,16 +3463,18 @@ static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
- /*
- * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
- * See ___update_load_avg() for details.
- */
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ u32 divider;
/* Nothing to update */
if (!delta)
return;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ divider = get_pelt_divider(&cfs_rq->avg);
+
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
se->avg.runnable_sum = se->avg.runnable_avg * divider;
@@ -3500,7 +3502,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
- divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ divider = get_pelt_divider(&cfs_rq->avg);
if (runnable_sum >= 0) {
/*
@@ -3646,7 +3648,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
if (cfs_rq->removed.nr) {
unsigned long r;
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
raw_spin_lock(&cfs_rq->removed.lock);
swap(cfs_rq->removed.util_avg, removed_util);
@@ -3701,7 +3703,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
/*
* When we attach the @se to the @cfs_rq, we must align the decay
@@ -3922,6 +3924,8 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
enqueued = cfs_rq->avg.util_est.enqueued;
enqueued += _task_util_est(p);
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+ trace_sched_util_est_cfs_tp(cfs_rq);
}
/*
@@ -3952,6 +3956,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+ trace_sched_util_est_cfs_tp(cfs_rq);
+
/*
* Skip update of task's estimated utilization when the task has not
* yet completed an activation, e.g. being migrated.
@@ -4017,6 +4023,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
WRITE_ONCE(p->se.avg.util_est, ue);
+
+ trace_sched_util_est_se_tp(&p->se);
}
static inline int task_fits_capacity(struct task_struct *p, long capacity)
@@ -5618,14 +5626,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
-dequeue_throttle:
- if (!se)
- sub_nr_running(rq, 1);
+ /* At this point se is NULL and we are at root level*/
+ sub_nr_running(rq, 1);
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
+dequeue_throttle:
util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -6501,7 +6509,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
max_util = max(max_util, cpu_util);
}
- return em_pd_energy(pd->em_pd, max_util, sum_util);
+ return em_cpu_energy(pd->em_pd, max_util, sum_util);
}
/*
@@ -7161,7 +7169,7 @@ static void yield_task_fair(struct rq *rq)
set_skip_buddy(se);
}
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
@@ -8049,7 +8057,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
};
}
-static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
+static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long max = arch_scale_cpu_capacity(cpu);
@@ -8081,7 +8089,7 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = scale_rt_capacity(sd, cpu);
+ unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
@@ -8703,8 +8711,14 @@ static bool update_pick_idlest(struct sched_group *idlest,
case group_has_spare:
/* Select group with most idle CPUs */
- if (idlest_sgs->idle_cpus >= sgs->idle_cpus)
+ if (idlest_sgs->idle_cpus > sgs->idle_cpus)
+ return false;
+
+ /* Select group with lowest group_util */
+ if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
+ idlest_sgs->group_util <= sgs->group_util)
return false;
+
break;
}
@@ -10027,7 +10041,12 @@ static void kick_ilb(unsigned int flags)
{
int ilb_cpu;
- nohz.next_balance++;
+ /*
+ * Increase nohz.next_balance only when if full ilb is triggered but
+ * not if we only update stats.
+ */
+ if (flags & NOHZ_BALANCE_KICK)
+ nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb();
@@ -10348,6 +10367,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
}
}
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the CPU is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ nohz.next_balance = next_balance;
+
/* Newly idle CPU doesn't need an update */
if (idle != CPU_NEWLY_IDLE) {
update_blocked_averages(this_cpu);
@@ -10368,14 +10395,6 @@ abort:
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
- /*
- * next_balance will be updated only when there is a need.
- * When the CPU is attached to null domain for ex, it will not be
- * updated.
- */
- if (likely(update_next_balance))
- nohz.next_balance = next_balance;
-
return ret;
}
@@ -11118,8 +11137,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
/*
* All the scheduling class methods:
*/
-const struct sched_class fair_sched_class = {
- .next = &idle_sched_class,
+const struct sched_class fair_sched_class
+ __attribute__((section("__fair_sched_class"))) = {
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
@@ -11292,3 +11311,9 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rd_span);
+
+int sched_trace_rq_nr_running(struct rq *rq)
+{
+ return rq ? rq->nr_running : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1ae95b9150d3..6bf34986f45c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -453,11 +453,6 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
BUG();
}
-static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
-{
- return 0;
-}
-
static void update_curr_idle(struct rq *rq)
{
}
@@ -465,8 +460,8 @@ static void update_curr_idle(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
-const struct sched_class idle_sched_class = {
- /* .next is NULL */
+const struct sched_class idle_sched_class
+ __attribute__((section("__idle_sched_class"))) = {
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
@@ -486,8 +481,6 @@ const struct sched_class idle_sched_class = {
.task_tick = task_tick_idle,
- .get_rr_interval = get_rr_interval_idle,
-
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 808244f3ddd9..5a6ea03f9882 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -140,7 +140,8 @@ static int __init housekeeping_nohz_full_setup(char *str)
{
unsigned int flags;
- flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+ flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
+ HK_FLAG_MISC | HK_FLAG_KTHREAD;
return housekeeping_setup(str, flags);
}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index de22da666ac7..d2a655643a02 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -347,7 +347,7 @@ static inline void calc_global_nohz(void) { }
*
* Called from the global timer code.
*/
-void calc_global_load(unsigned long ticks)
+void calc_global_load(void)
{
unsigned long sample_window;
long active, delta;
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index b4b1ff96642f..2c613e1cff3a 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,8 +28,6 @@
#include "sched.h"
#include "pelt.h"
-#include <trace/events/sched.h>
-
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
@@ -83,8 +81,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
return c1 + c2 + c3;
}
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
-
/*
* Accumulate the three separate parts of the sum; d1 the remainder
* of the last (incomplete) period, d2 the span of full periods and d3
@@ -264,7 +260,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
static __always_inline void
___update_load_avg(struct sched_avg *sa, unsigned long load)
{
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+ u32 divider = get_pelt_divider(sa);
/*
* Step 2: update *_avg.
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index eb034d9f024d..795e43e02afc 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -37,6 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running)
}
#endif
+static inline u32 get_pelt_divider(struct sched_avg *avg)
+{
+ return LOAD_AVG_MAX - 1024 + avg->period_contrib;
+}
+
/*
* When a task is dequeued, its estimated utilization should not be update if
* its util_avg has not been updated at least once.
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 8f45cdb6463b..e53b711bd643 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -190,7 +190,6 @@ static void group_init(struct psi_group *group)
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
- atomic_set(&group->poll_scheduled, 0);
mutex_init(&group->trigger_lock);
INIT_LIST_HEAD(&group->triggers);
memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
@@ -199,7 +198,7 @@ static void group_init(struct psi_group *group)
memset(group->polling_total, 0, sizeof(group->polling_total));
group->polling_next_update = ULLONG_MAX;
group->polling_until = 0;
- rcu_assign_pointer(group->poll_kworker, NULL);
+ rcu_assign_pointer(group->poll_task, NULL);
}
void __init psi_init(void)
@@ -547,47 +546,38 @@ static u64 update_triggers(struct psi_group *group, u64 now)
return now + group->poll_min_period;
}
-/*
- * Schedule polling if it's not already scheduled. It's safe to call even from
- * hotpath because even though kthread_queue_delayed_work takes worker->lock
- * spinlock that spinlock is never contended due to poll_scheduled atomic
- * preventing such competition.
- */
+/* Schedule polling if it's not already scheduled. */
static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
{
- struct kthread_worker *kworker;
+ struct task_struct *task;
- /* Do not reschedule if already scheduled */
- if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
+ /*
+ * Do not reschedule if already scheduled.
+ * Possible race with a timer scheduled after this check but before
+ * mod_timer below can be tolerated because group->polling_next_update
+ * will keep updates on schedule.
+ */
+ if (timer_pending(&group->poll_timer))
return;
rcu_read_lock();
- kworker = rcu_dereference(group->poll_kworker);
+ task = rcu_dereference(group->poll_task);
/*
* kworker might be NULL in case psi_trigger_destroy races with
* psi_task_change (hotpath) which can't use locks
*/
- if (likely(kworker))
- kthread_queue_delayed_work(kworker, &group->poll_work, delay);
- else
- atomic_set(&group->poll_scheduled, 0);
+ if (likely(task))
+ mod_timer(&group->poll_timer, jiffies + delay);
rcu_read_unlock();
}
-static void psi_poll_work(struct kthread_work *work)
+static void psi_poll_work(struct psi_group *group)
{
- struct kthread_delayed_work *dwork;
- struct psi_group *group;
u32 changed_states;
u64 now;
- dwork = container_of(work, struct kthread_delayed_work, work);
- group = container_of(dwork, struct psi_group, poll_work);
-
- atomic_set(&group->poll_scheduled, 0);
-
mutex_lock(&group->trigger_lock);
now = sched_clock();
@@ -623,6 +613,35 @@ out:
mutex_unlock(&group->trigger_lock);
}
+static int psi_poll_worker(void *data)
+{
+ struct psi_group *group = (struct psi_group *)data;
+ struct sched_param param = {
+ .sched_priority = 1,
+ };
+
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+
+ while (true) {
+ wait_event_interruptible(group->poll_wait,
+ atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
+ kthread_should_stop());
+ if (kthread_should_stop())
+ break;
+
+ psi_poll_work(group);
+ }
+ return 0;
+}
+
+static void poll_timer_fn(struct timer_list *t)
+{
+ struct psi_group *group = from_timer(group, t, poll_timer);
+
+ atomic_set(&group->poll_wakeup, 1);
+ wake_up_interruptible(&group->poll_wait);
+}
+
static void record_times(struct psi_group_cpu *groupc, int cpu,
bool memstall_tick)
{
@@ -1099,22 +1118,20 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
mutex_lock(&group->trigger_lock);
- if (!rcu_access_pointer(group->poll_kworker)) {
- struct sched_param param = {
- .sched_priority = 1,
- };
- struct kthread_worker *kworker;
+ if (!rcu_access_pointer(group->poll_task)) {
+ struct task_struct *task;
- kworker = kthread_create_worker(0, "psimon");
- if (IS_ERR(kworker)) {
+ task = kthread_create(psi_poll_worker, group, "psimon");
+ if (IS_ERR(task)) {
kfree(t);
mutex_unlock(&group->trigger_lock);
- return ERR_CAST(kworker);
+ return ERR_CAST(task);
}
- sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
- kthread_init_delayed_work(&group->poll_work,
- psi_poll_work);
- rcu_assign_pointer(group->poll_kworker, kworker);
+ atomic_set(&group->poll_wakeup, 0);
+ init_waitqueue_head(&group->poll_wait);
+ wake_up_process(task);
+ timer_setup(&group->poll_timer, poll_timer_fn, 0);
+ rcu_assign_pointer(group->poll_task, task);
}
list_add(&t->node, &group->triggers);
@@ -1132,7 +1149,7 @@ static void psi_trigger_destroy(struct kref *ref)
{
struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
struct psi_group *group = t->group;
- struct kthread_worker *kworker_to_destroy = NULL;
+ struct task_struct *task_to_destroy = NULL;
if (static_branch_likely(&psi_disabled))
return;
@@ -1158,13 +1175,13 @@ static void psi_trigger_destroy(struct kref *ref)
period = min(period, div_u64(tmp->win.size,
UPDATES_PER_WINDOW));
group->poll_min_period = period;
- /* Destroy poll_kworker when the last trigger is destroyed */
+ /* Destroy poll_task when the last trigger is destroyed */
if (group->poll_states == 0) {
group->polling_until = 0;
- kworker_to_destroy = rcu_dereference_protected(
- group->poll_kworker,
+ task_to_destroy = rcu_dereference_protected(
+ group->poll_task,
lockdep_is_held(&group->trigger_lock));
- rcu_assign_pointer(group->poll_kworker, NULL);
+ rcu_assign_pointer(group->poll_task, NULL);
}
}
@@ -1172,25 +1189,23 @@ static void psi_trigger_destroy(struct kref *ref)
/*
* Wait for both *trigger_ptr from psi_trigger_replace and
- * poll_kworker RCUs to complete their read-side critical sections
- * before destroying the trigger and optionally the poll_kworker
+ * poll_task RCUs to complete their read-side critical sections
+ * before destroying the trigger and optionally the poll_task
*/
synchronize_rcu();
/*
* Destroy the kworker after releasing trigger_lock to prevent a
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
- if (kworker_to_destroy) {
+ if (task_to_destroy) {
/*
* After the RCU grace period has expired, the worker
- * can no longer be found through group->poll_kworker.
+ * can no longer be found through group->poll_task.
* But it might have been already scheduled before
* that - deschedule it cleanly before destroying it.
*/
- kthread_cancel_delayed_work_sync(&group->poll_work);
- atomic_set(&group->poll_scheduled, 0);
-
- kthread_destroy_worker(kworker_to_destroy);
+ del_timer_sync(&group->poll_timer);
+ kthread_stop(task_to_destroy);
}
kfree(t);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f395ddb75f38..f215eea6a966 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2429,8 +2429,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
return 0;
}
-const struct sched_class rt_sched_class = {
- .next = &fair_sched_class,
+const struct sched_class rt_sched_class
+ __attribute__((section("__rt_sched_class"))) = {
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 877fb08eb1b0..3fd283892761 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -67,6 +67,7 @@
#include <linux/tsacct_kern.h>
#include <asm/tlb.h>
+#include <asm-generic/vmlinux.lds.h>
#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
@@ -75,6 +76,8 @@
#include "cpupri.h"
#include "cpudeadline.h"
+#include <trace/events/sched.h>
+
#ifdef CONFIG_SCHED_DEBUG
# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
#else
@@ -96,6 +99,7 @@ extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
+extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
@@ -310,11 +314,26 @@ void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
__dl_update(dl_b, -((s32)tsk_bw / cpus));
}
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap,
+ u64 old_bw, u64 new_bw)
{
return dl_b->bw != -1 &&
- dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+ cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
+}
+
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the
+ * CPU original capacity and the runtime/deadline ratio of the task.
+ *
+ * The function will return true if the CPU original capacity of the
+ * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the
+ * task and false otherwise.
+ */
+static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ unsigned long cap = arch_scale_cpu_capacity(cpu);
+
+ return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime;
}
extern void init_dl_bw(struct dl_bw *dl_b);
@@ -862,6 +881,8 @@ struct uclamp_rq {
unsigned int value;
struct uclamp_bucket bucket[UCLAMP_BUCKETS];
};
+
+DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
#endif /* CONFIG_UCLAMP_TASK */
/*
@@ -1182,6 +1203,16 @@ struct rq_flags {
#endif
};
+/*
+ * Lockdep annotation that avoids accidental unlocks; it's like a
+ * sticky/continuous lockdep_assert_held().
+ *
+ * This avoids code that has access to 'struct rq *rq' (basically everything in
+ * the scheduler) from accidentally unlocking the rq if they do not also have a
+ * copy of the (on-stack) 'struct rq_flags rf'.
+ *
+ * Also see Documentation/locking/lockdep-design.rst.
+ */
static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
rf->cookie = lockdep_pin_lock(&rq->lock);
@@ -1739,7 +1770,6 @@ extern const u32 sched_prio_to_wmult[40];
#define RETRY_TASK ((void *)-1UL)
struct sched_class {
- const struct sched_class *next;
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
@@ -1748,7 +1778,7 @@ struct sched_class {
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
- bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
+ bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
@@ -1796,7 +1826,7 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p, int type);
#endif
-};
+} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
@@ -1810,17 +1840,18 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
next->sched_class->set_next_task(rq, next, false);
}
-#ifdef CONFIG_SMP
-#define sched_class_highest (&stop_sched_class)
-#else
-#define sched_class_highest (&dl_sched_class)
-#endif
+/* Defined in include/asm-generic/vmlinux.lds.h */
+extern struct sched_class __begin_sched_classes[];
+extern struct sched_class __end_sched_classes[];
+
+#define sched_class_highest (__end_sched_classes - 1)
+#define sched_class_lowest (__begin_sched_classes - 1)
#define for_class_range(class, _from, _to) \
- for (class = (_from); class != (_to); class = class->next)
+ for (class = (_from); class != (_to); class--)
#define for_each_class(class) \
- for_class_range(class, sched_class_highest, NULL)
+ for_class_range(class, sched_class_highest, sched_class_lowest)
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
@@ -1930,12 +1961,7 @@ extern int __init sched_tick_offload_init(void);
*/
static inline void sched_update_tick_dependency(struct rq *rq)
{
- int cpu;
-
- if (!tick_nohz_full_enabled())
- return;
-
- cpu = cpu_of(rq);
+ int cpu = cpu_of(rq);
if (!tick_nohz_full_cpu(cpu))
return;
@@ -1955,6 +1981,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
unsigned prev_nr = rq->nr_running;
rq->nr_running = prev_nr + count;
+ if (trace_sched_update_nr_running_tp_enabled()) {
+ call_trace_sched_update_nr_running(rq, count);
+ }
#ifdef CONFIG_SMP
if (prev_nr < 2 && rq->nr_running >= 2) {
@@ -1969,6 +1998,10 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
static inline void sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
+ if (trace_sched_update_nr_running_tp_enabled()) {
+ call_trace_sched_update_nr_running(rq, count);
+ }
+
/* Check if we still need preemption */
sched_update_tick_dependency(rq);
}
@@ -2016,6 +2049,16 @@ void arch_scale_freq_tick(void)
#endif
#ifndef arch_scale_freq_capacity
+/**
+ * arch_scale_freq_capacity - get the frequency scale factor of a given CPU.
+ * @cpu: the CPU in question.
+ *
+ * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e.
+ *
+ * f_curr
+ * ------ * SCHED_CAPACITY_SCALE
+ * f_max
+ */
static __always_inline
unsigned long arch_scale_freq_capacity(int cpu)
{
@@ -2349,12 +2392,35 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#ifdef CONFIG_UCLAMP_TASK
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+/**
+ * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
+ * @rq: The rq to clamp against. Must not be NULL.
+ * @util: The util value to clamp.
+ * @p: The task to clamp against. Can be NULL if you want to clamp
+ * against @rq only.
+ *
+ * Clamps the passed @util to the max(@rq, @p) effective uclamp values.
+ *
+ * If sched_uclamp_used static key is disabled, then just return the util
+ * without any clamping since uclamp aggregation at the rq level in the fast
+ * path is disabled, rendering this operation a NOP.
+ *
+ * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It
+ * will return the correct effective uclamp value of the task even if the
+ * static key is disabled.
+ */
static __always_inline
unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
struct task_struct *p)
{
- unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
- unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+ unsigned long min_util;
+ unsigned long max_util;
+
+ if (!static_branch_likely(&sched_uclamp_used))
+ return util;
+
+ min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
+ max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
if (p) {
min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
@@ -2371,6 +2437,19 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
return clamp(util, min_util, max_util);
}
+
+/*
+ * When uclamp is compiled in, the aggregation at rq level is 'turned off'
+ * by default in the fast path and only gets turned on once userspace performs
+ * an operation that requires it.
+ *
+ * Returns true if userspace opted-in to use uclamp and aggregation at rq level
+ * hence is active.
+ */
+static inline bool uclamp_is_used(void)
+{
+ return static_branch_likely(&sched_uclamp_used);
+}
#else /* CONFIG_UCLAMP_TASK */
static inline
unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
@@ -2378,6 +2457,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
{
return util;
}
+
+static inline bool uclamp_is_used(void)
+{
+ return false;
+}
#endif /* CONFIG_UCLAMP_TASK */
#ifdef arch_scale_freq_capacity
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 4c9e9975684f..394bc8126a1e 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -102,12 +102,6 @@ prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
BUG(); /* how!?, what priority? */
}
-static unsigned int
-get_rr_interval_stop(struct rq *rq, struct task_struct *task)
-{
- return 0;
-}
-
static void update_curr_stop(struct rq *rq)
{
}
@@ -115,8 +109,8 @@ static void update_curr_stop(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
-const struct sched_class stop_sched_class = {
- .next = &dl_sched_class,
+const struct sched_class stop_sched_class
+ __attribute__((section("__stop_sched_class"))) = {
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
@@ -136,8 +130,6 @@ const struct sched_class stop_sched_class = {
.task_tick = task_tick_stop,
- .get_rr_interval = get_rr_interval_stop,
-
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ba81187bb7af..007b0a6b0152 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -272,10 +272,10 @@ static void perf_domain_debug(const struct cpumask *cpu_map,
printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
while (pd) {
- printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+ printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
cpumask_first(perf_domain_span(pd)),
cpumask_pr_args(perf_domain_span(pd)),
- em_pd_nr_cap_states(pd->em_pd));
+ em_pd_nr_perf_states(pd->em_pd));
pd = pd->next;
}
@@ -313,26 +313,26 @@ static void sched_energy_set(bool has_eas)
*
* The complexity of the Energy Model is defined as:
*
- * C = nr_pd * (nr_cpus + nr_cs)
+ * C = nr_pd * (nr_cpus + nr_ps)
*
* with parameters defined as:
* - nr_pd: the number of performance domains
* - nr_cpus: the number of CPUs
- * - nr_cs: the sum of the number of capacity states of all performance
+ * - nr_ps: the sum of the number of performance states of all performance
* domains (for example, on a system with 2 performance domains,
- * with 10 capacity states each, nr_cs = 2 * 10 = 20).
+ * with 10 performance states each, nr_ps = 2 * 10 = 20).
*
* It is generally not a good idea to use such a model in the wake-up path on
* very complex platforms because of the associated scheduling overheads. The
* arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
- * with per-CPU DVFS and less than 8 capacity states each, for example.
+ * with per-CPU DVFS and less than 8 performance states each, for example.
*/
#define EM_MAX_COMPLEXITY 2048
extern struct cpufreq_governor schedutil_gov;
static bool build_perf_domains(const struct cpumask *cpu_map)
{
- int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
+ int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
struct perf_domain *pd = NULL, *tmp;
int cpu = cpumask_first(cpu_map);
struct root_domain *rd = cpu_rq(cpu)->rd;
@@ -384,15 +384,15 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
pd = tmp;
/*
- * Count performance domains and capacity states for the
+ * Count performance domains and performance states for the
* complexity check.
*/
nr_pd++;
- nr_cs += em_pd_nr_cap_states(pd->em_pd);
+ nr_ps += em_pd_nr_perf_states(pd->em_pd);
}
/* Bail out if the Energy Model complexity is too high. */
- if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
+ if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
cpumask_pr_args(cpu_map));
goto free;
@@ -1328,7 +1328,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd_flags = (*tl->sd_flags)();
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
"wrong sd_flags in topology description\n"))
- sd_flags &= ~TOPOLOGY_SD_FLAGS;
+ sd_flags &= TOPOLOGY_SD_FLAGS;
/* Apply detected topology flags */
sd_flags |= dflags;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index ba059fbfc53a..01f5d3020589 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -389,7 +389,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
int ret = default_wake_function(wq_entry, mode, sync, key);
if (ret)
- list_del_init(&wq_entry->entry);
+ list_del_init_careful(&wq_entry->entry);
return ret;
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index d653d8426de9..3ee59ce0a323 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -13,6 +13,7 @@
* Mode 2 allows user-defined system call filters in the form
* of Berkeley Packet Filters/Linux Socket Filters.
*/
+#define pr_fmt(fmt) "seccomp: " fmt
#include <linux/refcount.h>
#include <linux/audit.h>
@@ -41,6 +42,15 @@
#include <linux/tracehook.h>
#include <linux/uaccess.h>
#include <linux/anon_inodes.h>
+#include <linux/lockdep.h>
+
+/*
+ * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
+ * wrong direction flag in the ioctl number. This is the broken one,
+ * which the kernel needs to keep supporting until all userspaces stop
+ * using the wrong command number.
+ */
+#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64)
enum notify_state {
SECCOMP_NOTIFY_INIT,
@@ -77,10 +87,42 @@ struct seccomp_knotif {
long val;
u32 flags;
- /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
+ /*
+ * Signals when this has changed states, such as the listener
+ * dying, a new seccomp addfd message, or changing to REPLIED
+ */
struct completion ready;
struct list_head list;
+
+ /* outstanding addfd requests */
+ struct list_head addfd;
+};
+
+/**
+ * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
+ *
+ * @file: A reference to the file to install in the other task
+ * @fd: The fd number to install it at. If the fd number is -1, it means the
+ * installing process should allocate the fd as normal.
+ * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
+ * is allowed.
+ * @ret: The return value of the installing process. It is set to the fd num
+ * upon success (>= 0).
+ * @completion: Indicates that the installing process has completed fd
+ * installation, or gone away (either due to successful
+ * reply, or signal)
+ *
+ */
+struct seccomp_kaddfd {
+ struct file *file;
+ int fd;
+ unsigned int flags;
+
+ /* To only be set on reply */
+ int ret;
+ struct completion completion;
+ struct list_head list;
};
/**
@@ -94,27 +136,35 @@ struct seccomp_knotif {
* filter->notify_lock.
* @next_id: The id of the next request.
* @notifications: A list of struct seccomp_knotif elements.
- * @wqh: A wait queue for poll.
*/
struct notification {
struct semaphore request;
u64 next_id;
struct list_head notifications;
- wait_queue_head_t wqh;
};
/**
* struct seccomp_filter - container for seccomp BPF programs
*
- * @usage: reference count to manage the object lifetime.
- * get/put helpers should be used when accessing an instance
- * outside of a lifetime-guarded section. In general, this
- * is only needed for handling filters shared across tasks.
+ * @refs: Reference count to manage the object lifetime.
+ * A filter's reference count is incremented for each directly
+ * attached task, once for the dependent filter, and if
+ * requested for the user notifier. When @refs reaches zero,
+ * the filter can be freed.
+ * @users: A filter's @users count is incremented for each directly
+ * attached task (filter installation, fork(), thread_sync),
+ * and once for the dependent filter (tracked in filter->prev).
+ * When it reaches zero it indicates that no direct or indirect
+ * users of that filter exist. No new tasks can get associated with
+ * this filter after reaching 0. The @users count is always smaller
+ * or equal to @refs. Hence, reaching 0 for @users does not mean
+ * the filter can be freed.
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
* @notif: the struct that holds all notification related information
* @notify_lock: A lock for all notification-related accesses.
+ * @wqh: A wait queue for poll if a notifier is in use.
*
* seccomp_filter objects are organized in a tree linked via the @prev
* pointer. For any task, it appears to be a singly-linked list starting
@@ -124,15 +174,17 @@ struct notification {
* how namespaces work.
*
* seccomp_filter objects should never be modified after being attached
- * to a task_struct (other than @usage).
+ * to a task_struct (other than @refs).
*/
struct seccomp_filter {
- refcount_t usage;
+ refcount_t refs;
+ refcount_t users;
bool log;
struct seccomp_filter *prev;
struct bpf_prog *prog;
struct notification *notif;
struct mutex notify_lock;
+ wait_queue_head_t wqh;
};
/* Limit any path through the tree to 256KB worth of instructions. */
@@ -366,6 +418,59 @@ static inline pid_t seccomp_can_sync_threads(void)
return 0;
}
+static inline void seccomp_filter_free(struct seccomp_filter *filter)
+{
+ if (filter) {
+ bpf_prog_destroy(filter->prog);
+ kfree(filter);
+ }
+}
+
+static void __seccomp_filter_orphan(struct seccomp_filter *orig)
+{
+ while (orig && refcount_dec_and_test(&orig->users)) {
+ if (waitqueue_active(&orig->wqh))
+ wake_up_poll(&orig->wqh, EPOLLHUP);
+ orig = orig->prev;
+ }
+}
+
+static void __put_seccomp_filter(struct seccomp_filter *orig)
+{
+ /* Clean up single-reference branches iteratively. */
+ while (orig && refcount_dec_and_test(&orig->refs)) {
+ struct seccomp_filter *freeme = orig;
+ orig = orig->prev;
+ seccomp_filter_free(freeme);
+ }
+}
+
+static void __seccomp_filter_release(struct seccomp_filter *orig)
+{
+ /* Notify about any unused filters in the task's former filter tree. */
+ __seccomp_filter_orphan(orig);
+ /* Finally drop all references to the task's former tree. */
+ __put_seccomp_filter(orig);
+}
+
+/**
+ * seccomp_filter_release - Detach the task from its filter tree,
+ * drop its reference count, and notify
+ * about unused filters
+ *
+ * This function should only be called when the task is exiting as
+ * it detaches it from its filter tree. As such, READ_ONCE() and
+ * barriers are not needed here, as would normally be needed.
+ */
+void seccomp_filter_release(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+
+ /* Detach task from its filter tree. */
+ tsk->seccomp.filter = NULL;
+ __seccomp_filter_release(orig);
+}
+
/**
* seccomp_sync_threads: sets all threads to use current's filter
*
@@ -390,14 +495,19 @@ static inline void seccomp_sync_threads(unsigned long flags)
/* Get a task reference for the new leaf node. */
get_seccomp_filter(caller);
+
/*
* Drop the task reference to the shared ancestor since
* current's path will hold a reference. (This also
* allows a put before the assignment.)
*/
- put_seccomp_filter(thread);
+ __seccomp_filter_release(thread->seccomp.filter);
+
+ /* Make our new filter tree visible. */
smp_store_release(&thread->seccomp.filter,
caller->seccomp.filter);
+ atomic_set(&thread->seccomp.filter_count,
+ atomic_read(&thread->seccomp.filter_count));
/*
* Don't let an unprivileged task work around
@@ -461,7 +571,9 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
return ERR_PTR(ret);
}
- refcount_set(&sfilter->usage, 1);
+ refcount_set(&sfilter->refs, 1);
+ refcount_set(&sfilter->users, 1);
+ init_waitqueue_head(&sfilter->wqh);
return sfilter;
}
@@ -544,6 +656,7 @@ static long seccomp_attach_filter(unsigned int flags,
*/
filter->prev = current->seccomp.filter;
current->seccomp.filter = filter;
+ atomic_inc(&current->seccomp.filter_count);
/* Now that the new filter is in place, synchronize to all threads. */
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
@@ -554,7 +667,7 @@ static long seccomp_attach_filter(unsigned int flags,
static void __get_seccomp_filter(struct seccomp_filter *filter)
{
- refcount_inc(&filter->usage);
+ refcount_inc(&filter->refs);
}
/* get_seccomp_filter - increments the reference count of the filter on @tsk */
@@ -564,30 +677,7 @@ void get_seccomp_filter(struct task_struct *tsk)
if (!orig)
return;
__get_seccomp_filter(orig);
-}
-
-static inline void seccomp_filter_free(struct seccomp_filter *filter)
-{
- if (filter) {
- bpf_prog_destroy(filter->prog);
- kfree(filter);
- }
-}
-
-static void __put_seccomp_filter(struct seccomp_filter *orig)
-{
- /* Clean up single-reference branches iteratively. */
- while (orig && refcount_dec_and_test(&orig->usage)) {
- struct seccomp_filter *freeme = orig;
- orig = orig->prev;
- seccomp_filter_free(freeme);
- }
-}
-
-/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
-void put_seccomp_filter(struct task_struct *tsk)
-{
- __put_seccomp_filter(tsk->seccomp.filter);
+ refcount_inc(&orig->users);
}
static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason)
@@ -684,20 +774,20 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
*/
static const int mode1_syscalls[] = {
__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
- 0, /* null terminated */
+ -1, /* negative terminated */
};
static void __secure_computing_strict(int this_syscall)
{
- const int *syscall_whitelist = mode1_syscalls;
+ const int *allowed_syscalls = mode1_syscalls;
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
- syscall_whitelist = get_compat_mode1_syscalls();
+ allowed_syscalls = get_compat_mode1_syscalls();
#endif
do {
- if (*syscall_whitelist == this_syscall)
+ if (*allowed_syscalls == this_syscall)
return;
- } while (*++syscall_whitelist);
+ } while (*++allowed_syscalls != -1);
#ifdef SECCOMP_DEBUG
dump_stack();
@@ -735,6 +825,17 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
return filter->notif->next_id++;
}
+static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
+{
+ /*
+ * Remove the notification, and reset the list pointers, indicating
+ * that it has been handled.
+ */
+ list_del_init(&addfd->list);
+ addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
+ complete(&addfd->completion);
+}
+
static int seccomp_do_user_notification(int this_syscall,
struct seccomp_filter *match,
const struct seccomp_data *sd)
@@ -743,6 +844,7 @@ static int seccomp_do_user_notification(int this_syscall,
u32 flags = 0;
long ret = 0;
struct seccomp_knotif n = {};
+ struct seccomp_kaddfd *addfd, *tmp;
mutex_lock(&match->notify_lock);
err = -ENOSYS;
@@ -755,25 +857,43 @@ static int seccomp_do_user_notification(int this_syscall,
n.id = seccomp_next_notify_id(match);
init_completion(&n.ready);
list_add(&n.list, &match->notif->notifications);
+ INIT_LIST_HEAD(&n.addfd);
up(&match->notif->request);
- wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM);
+ wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
mutex_unlock(&match->notify_lock);
/*
* This is where we wait for a reply from userspace.
*/
+wait:
err = wait_for_completion_interruptible(&n.ready);
mutex_lock(&match->notify_lock);
if (err == 0) {
+ /* Check if we were woken up by a addfd message */
+ addfd = list_first_entry_or_null(&n.addfd,
+ struct seccomp_kaddfd, list);
+ if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
+ seccomp_handle_addfd(addfd);
+ mutex_unlock(&match->notify_lock);
+ goto wait;
+ }
ret = n.val;
err = n.error;
flags = n.flags;
}
+ /* If there were any pending addfd calls, clear them out */
+ list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
+ /* The process went away before we got a chance to handle it */
+ addfd->ret = -ESRCH;
+ list_del_init(&addfd->list);
+ complete(&addfd->completion);
+ }
+
/*
* Note that it's possible the listener died in between the time when
- * we were notified of a respons (or a signal) and when we were able to
+ * we were notified of a response (or a signal) and when we were able to
* re-acquire the lock, so only delete from the list if the
* notification actually exists.
*
@@ -1011,6 +1131,11 @@ static int seccomp_notify_release(struct inode *inode, struct file *file)
knotif->error = -ENOSYS;
knotif->val = 0;
+ /*
+ * We do not need to wake up any pending addfd messages, as
+ * the notifier will do that for us, as this just looks
+ * like a standard reply.
+ */
complete(&knotif->ready);
}
@@ -1021,6 +1146,23 @@ static int seccomp_notify_release(struct inode *inode, struct file *file)
return 0;
}
+/* must be called with notif_lock held */
+static inline struct seccomp_knotif *
+find_notification(struct seccomp_filter *filter, u64 id)
+{
+ struct seccomp_knotif *cur;
+
+ lockdep_assert_held(&filter->notify_lock);
+
+ list_for_each_entry(cur, &filter->notif->notifications, list) {
+ if (cur->id == id)
+ return cur;
+ }
+
+ return NULL;
+}
+
+
static long seccomp_notify_recv(struct seccomp_filter *filter,
void __user *buf)
{
@@ -1064,7 +1206,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
unotif.data = *(knotif->data);
knotif->state = SECCOMP_NOTIFY_SENT;
- wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM);
+ wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM);
ret = 0;
out:
mutex_unlock(&filter->notify_lock);
@@ -1078,15 +1220,8 @@ out:
* may have died when we released the lock, so we need to make
* sure it's still around.
*/
- knotif = NULL;
mutex_lock(&filter->notify_lock);
- list_for_each_entry(cur, &filter->notif->notifications, list) {
- if (cur->id == unotif.id) {
- knotif = cur;
- break;
- }
- }
-
+ knotif = find_notification(filter, unotif.id);
if (knotif) {
knotif->state = SECCOMP_NOTIFY_INIT;
up(&filter->notif->request);
@@ -1101,7 +1236,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
void __user *buf)
{
struct seccomp_notif_resp resp = {};
- struct seccomp_knotif *knotif = NULL, *cur;
+ struct seccomp_knotif *knotif;
long ret;
if (copy_from_user(&resp, buf, sizeof(resp)))
@@ -1118,13 +1253,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
if (ret < 0)
return ret;
- list_for_each_entry(cur, &filter->notif->notifications, list) {
- if (cur->id == resp.id) {
- knotif = cur;
- break;
- }
- }
-
+ knotif = find_notification(filter, resp.id);
if (!knotif) {
ret = -ENOENT;
goto out;
@@ -1150,7 +1279,7 @@ out:
static long seccomp_notify_id_valid(struct seccomp_filter *filter,
void __user *buf)
{
- struct seccomp_knotif *knotif = NULL;
+ struct seccomp_knotif *knotif;
u64 id;
long ret;
@@ -1161,17 +1290,109 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
if (ret < 0)
return ret;
- ret = -ENOENT;
- list_for_each_entry(knotif, &filter->notif->notifications, list) {
- if (knotif->id == id) {
- if (knotif->state == SECCOMP_NOTIFY_SENT)
- ret = 0;
- goto out;
- }
+ knotif = find_notification(filter, id);
+ if (knotif && knotif->state == SECCOMP_NOTIFY_SENT)
+ ret = 0;
+ else
+ ret = -ENOENT;
+
+ mutex_unlock(&filter->notify_lock);
+ return ret;
+}
+
+static long seccomp_notify_addfd(struct seccomp_filter *filter,
+ struct seccomp_notif_addfd __user *uaddfd,
+ unsigned int size)
+{
+ struct seccomp_notif_addfd addfd;
+ struct seccomp_knotif *knotif;
+ struct seccomp_kaddfd kaddfd;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);
+
+ if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
+ return -EINVAL;
+
+ ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
+ if (ret)
+ return ret;
+
+ if (addfd.newfd_flags & ~O_CLOEXEC)
+ return -EINVAL;
+
+ if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD)
+ return -EINVAL;
+
+ if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
+ return -EINVAL;
+
+ kaddfd.file = fget(addfd.srcfd);
+ if (!kaddfd.file)
+ return -EBADF;
+
+ kaddfd.flags = addfd.newfd_flags;
+ kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
+ addfd.newfd : -1;
+ init_completion(&kaddfd.completion);
+
+ ret = mutex_lock_interruptible(&filter->notify_lock);
+ if (ret < 0)
+ goto out;
+
+ knotif = find_notification(filter, addfd.id);
+ if (!knotif) {
+ ret = -ENOENT;
+ goto out_unlock;
}
-out:
+ /*
+ * We do not want to allow for FD injection to occur before the
+ * notification has been picked up by a userspace handler, or after
+ * the notification has been replied to.
+ */
+ if (knotif->state != SECCOMP_NOTIFY_SENT) {
+ ret = -EINPROGRESS;
+ goto out_unlock;
+ }
+
+ list_add(&kaddfd.list, &knotif->addfd);
+ complete(&knotif->ready);
+ mutex_unlock(&filter->notify_lock);
+
+ /* Now we wait for it to be processed or be interrupted */
+ ret = wait_for_completion_interruptible(&kaddfd.completion);
+ if (ret == 0) {
+ /*
+ * We had a successful completion. The other side has already
+ * removed us from the addfd queue, and
+ * wait_for_completion_interruptible has a memory barrier upon
+ * success that lets us read this value directly without
+ * locking.
+ */
+ ret = kaddfd.ret;
+ goto out;
+ }
+
+ mutex_lock(&filter->notify_lock);
+ /*
+ * Even though we were woken up by a signal and not a successful
+ * completion, a completion may have happened in the mean time.
+ *
+ * We need to check again if the addfd request has been handled,
+ * and if not, we will remove it from the queue.
+ */
+ if (list_empty(&kaddfd.list))
+ ret = kaddfd.ret;
+ else
+ list_del(&kaddfd.list);
+
+out_unlock:
mutex_unlock(&filter->notify_lock);
+out:
+ fput(kaddfd.file);
+
return ret;
}
@@ -1181,13 +1402,22 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
struct seccomp_filter *filter = file->private_data;
void __user *buf = (void __user *)arg;
+ /* Fixed-size ioctls */
switch (cmd) {
case SECCOMP_IOCTL_NOTIF_RECV:
return seccomp_notify_recv(filter, buf);
case SECCOMP_IOCTL_NOTIF_SEND:
return seccomp_notify_send(filter, buf);
+ case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
case SECCOMP_IOCTL_NOTIF_ID_VALID:
return seccomp_notify_id_valid(filter, buf);
+ }
+
+ /* Extensible Argument ioctls */
+#define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
+ switch (EA_IOCTL(cmd)) {
+ case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
+ return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
default:
return -EINVAL;
}
@@ -1200,7 +1430,7 @@ static __poll_t seccomp_notify_poll(struct file *file,
__poll_t ret = 0;
struct seccomp_knotif *cur;
- poll_wait(file, &filter->notif->wqh, poll_tab);
+ poll_wait(file, &filter->wqh, poll_tab);
if (mutex_lock_interruptible(&filter->notify_lock) < 0)
return EPOLLERR;
@@ -1216,6 +1446,9 @@ static __poll_t seccomp_notify_poll(struct file *file,
mutex_unlock(&filter->notify_lock);
+ if (refcount_read(&filter->users) == 0)
+ ret |= EPOLLHUP;
+
return ret;
}
@@ -1244,7 +1477,6 @@ static struct file *init_listener(struct seccomp_filter *filter)
sema_init(&filter->notif->request, 0);
filter->notif->next_id = get_random_u64();
INIT_LIST_HEAD(&filter->notif->notifications);
- init_waitqueue_head(&filter->notif->wqh);
ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
filter, O_RDWR);
@@ -1822,7 +2054,7 @@ static int __init seccomp_sysctl_init(void)
hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
if (!hdr)
- pr_warn("seccomp: sysctl registration failed\n");
+ pr_warn("sysctl registration failed\n");
else
kmemleak_not_leak(hdr);
diff --git a/kernel/smp.c b/kernel/smp.c
index aa17eedff5be..d0ae8eb6bf8b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -634,8 +634,7 @@ static int __init nrcpus(char *str)
{
int nr_cpus;
- get_option(&str, &nr_cpus);
- if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
+ if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids)
nr_cpu_ids = nr_cpus;
return 0;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c4201b7f42b1..bf88d7f62433 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -107,6 +107,12 @@ static bool ksoftirqd_running(unsigned long pending)
* where hardirqs are disabled legitimately:
*/
#ifdef CONFIG_TRACE_IRQFLAGS
+
+DEFINE_PER_CPU(int, hardirqs_enabled);
+DEFINE_PER_CPU(int, hardirq_context);
+EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
+EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
+
void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
@@ -224,7 +230,7 @@ static inline bool lockdep_softirq_start(void)
{
bool in_hardirq = false;
- if (lockdep_hardirq_context(current)) {
+ if (lockdep_hardirq_context()) {
in_hardirq = true;
lockdep_hardirq_exit();
}
@@ -547,7 +553,10 @@ static void tasklet_action_common(struct softirq_action *a,
if (!test_and_clear_bit(TASKLET_STATE_SCHED,
&t->state))
BUG();
- t->func(t->data);
+ if (t->use_callback)
+ t->callback(t);
+ else
+ t->func(t->data);
tasklet_unlock(t);
continue;
}
@@ -573,6 +582,18 @@ static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
}
+void tasklet_setup(struct tasklet_struct *t,
+ void (*callback)(struct tasklet_struct *))
+{
+ t->next = NULL;
+ t->state = 0;
+ atomic_set(&t->count, 0);
+ t->callback = callback;
+ t->use_callback = true;
+ t->data = 0;
+}
+EXPORT_SYMBOL(tasklet_setup);
+
void tasklet_init(struct tasklet_struct *t,
void (*func)(unsigned long), unsigned long data)
{
@@ -580,6 +601,7 @@ void tasklet_init(struct tasklet_struct *t,
t->state = 0;
atomic_set(&t->count, 0);
t->func = func;
+ t->use_callback = false;
t->data = data;
}
EXPORT_SYMBOL(tasklet_init);
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index b193a59fc05b..a8fc9ae1d03d 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -104,19 +104,9 @@ asmlinkage void notrace stackleak_erase(void)
}
NOKPROBE_SYMBOL(stackleak_erase);
-void __used notrace stackleak_track_stack(void)
+void __used __no_caller_saved_registers notrace stackleak_track_stack(void)
{
- /*
- * N.B. stackleak_erase() fills the kernel stack with the poison value,
- * which has the register width. That code assumes that the value
- * of 'lowest_stack' is aligned on the register width boundary.
- *
- * That is true for x86 and x86_64 because of the kernel stack
- * alignment on these platforms (for details, see 'cc_stack_align' in
- * arch/x86/Makefile). Take care of that when you port STACKLEAK to
- * new platforms.
- */
- unsigned long sp = (unsigned long)&sp;
+ unsigned long sp = current_stack_pointer;
/*
* Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than
@@ -125,6 +115,8 @@ void __used notrace stackleak_track_stack(void)
*/
BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH);
+ /* 'lowest_stack' should be aligned on the register width boundary */
+ sp = ALIGN(sp, sizeof(unsigned long));
if (sp < current->lowest_stack &&
sp >= (unsigned long)task_stack_page(current) +
sizeof(unsigned long)) {
diff --git a/kernel/sys.c b/kernel/sys.c
index 00a96746e28a..ca11af9d815d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2007,12 +2007,15 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
if (prctl_map.exe_fd != (u32)-1) {
/*
- * Make sure the caller has the rights to
- * change /proc/pid/exe link: only local sys admin should
- * be allowed to.
+ * Check if the current user is checkpoint/restore capable.
+ * At the time of this writing, it checks for CAP_SYS_ADMIN
+ * or CAP_CHECKPOINT_RESTORE.
+ * Note that a user with access to ptrace can masquerade an
+ * arbitrary program as any executable, even setuid ones.
+ * This may have implications in the tomoyo subsystem.
*/
- if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
- return -EINVAL;
+ if (!checkpoint_restore_ns_capable(current_user_ns()))
+ return -EPERM;
error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
if (error)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index db1ce7af2563..1b4d2dc270a5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1780,6 +1780,20 @@ static struct ctl_table kern_table[] = {
.proc_handler = sched_rt_handler,
},
{
+ .procname = "sched_deadline_period_max_us",
+ .data = &sysctl_sched_dl_period_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_deadline_period_min_us",
+ .data = &sysctl_sched_dl_period_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "sched_rr_timeslice_ms",
.data = &sysctl_sched_rr_timeslice,
.maxlen = sizeof(int),
@@ -1801,6 +1815,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sysctl_sched_uclamp_handler,
},
+ {
+ .procname = "sched_util_clamp_min_rt_default",
+ .data = &sysctl_sched_uclamp_util_min_rt_default,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
{
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 5d9fc22d836a..afc65e6be33e 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -280,11 +280,16 @@ static void timens_put(struct ns_common *ns)
put_time_ns(to_time_ns(ns));
}
+void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
+{
+ timens_set_vvar_page(tsk, ns);
+ vdso_join_timens(tsk, ns);
+}
+
static int timens_install(struct nsset *nsset, struct ns_common *new)
{
struct nsproxy *nsproxy = nsset->nsproxy;
struct time_namespace *ns = to_time_ns(new);
- int err;
if (!current_is_single_threaded())
return -EUSERS;
@@ -293,12 +298,6 @@ static int timens_install(struct nsset *nsset, struct ns_common *new)
!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
- timens_set_vvar_page(current, ns);
-
- err = vdso_join_timens(current, ns);
- if (err)
- return err;
-
get_time_ns(ns);
put_time_ns(nsproxy->time_ns);
nsproxy->time_ns = ns;
@@ -313,22 +312,17 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
{
struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
struct time_namespace *ns = to_time_ns(nsc);
- int err;
/* create_new_namespaces() already incremented the ref counter */
if (nsproxy->time_ns == nsproxy->time_ns_for_children)
return 0;
- timens_set_vvar_page(tsk, ns);
-
- err = vdso_join_timens(tsk, ns);
- if (err)
- return err;
-
get_time_ns(ns);
put_time_ns(nsproxy->time_ns);
nsproxy->time_ns = ns;
+ timens_commit(tsk, ns);
+
return 0;
}
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index fa3f800d7d76..0deaf4b79fb4 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -20,31 +20,6 @@
#include "timekeeping.h"
/**
- * struct clock_read_data - data required to read from sched_clock()
- *
- * @epoch_ns: sched_clock() value at last update
- * @epoch_cyc: Clock cycle value at last update.
- * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit
- * clocks.
- * @read_sched_clock: Current clock source (or dummy source when suspended).
- * @mult: Multipler for scaled math conversion.
- * @shift: Shift value for scaled math conversion.
- *
- * Care must be taken when updating this structure; it is read by
- * some very hot code paths. It occupies <=40 bytes and, when combined
- * with the seqcount used to synchronize access, comfortably fits into
- * a 64 byte cache line.
- */
-struct clock_read_data {
- u64 epoch_ns;
- u64 epoch_cyc;
- u64 sched_clock_mask;
- u64 (*read_sched_clock)(void);
- u32 mult;
- u32 shift;
-};
-
-/**
* struct clock_data - all data needed for sched_clock() (including
* registration of a new clock source)
*
@@ -93,6 +68,17 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
return (cyc * mult) >> shift;
}
+struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
+{
+ *seq = raw_read_seqcount_latch(&cd.seq);
+ return cd.read_data + (*seq & 1);
+}
+
+int sched_clock_read_retry(unsigned int seq)
+{
+ return read_seqcount_retry(&cd.seq, seq);
+}
+
unsigned long long notrace sched_clock(void)
{
u64 cyc, res;
@@ -100,13 +86,12 @@ unsigned long long notrace sched_clock(void)
struct clock_read_data *rd;
do {
- seq = raw_read_seqcount(&cd.seq);
- rd = cd.read_data + (seq & 1);
+ rd = sched_clock_read_begin(&seq);
cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
rd->sched_clock_mask;
res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
- } while (read_seqcount_retry(&cd.seq, seq));
+ } while (sched_clock_read_retry(seq));
return res;
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e2dc9b8858c..f0199a4ba1ad 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -351,16 +351,24 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
/*
- * Set a per-task tick dependency. Posix CPU timers need this in order to elapse
- * per task timers.
+ * Set a per-task tick dependency. RCU need this. Also posix CPU timers
+ * in order to elapse per task timers.
*/
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
- /*
- * We could optimize this with just kicking the target running the task
- * if that noise matters for nohz full users.
- */
- tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
+ if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) {
+ if (tsk == current) {
+ preempt_disable();
+ tick_nohz_full_kick();
+ preempt_enable();
+ } else {
+ /*
+ * Some future tick_nohz_full_kick_task()
+ * should optimize this.
+ */
+ tick_nohz_full_kick_all();
+ }
+ }
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d20d489841c8..63a632f9896c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2193,7 +2193,7 @@ EXPORT_SYMBOL(ktime_get_coarse_ts64);
void do_timer(unsigned long ticks)
{
jiffies_64 += ticks;
- calc_global_load(ticks);
+ calc_global_load();
}
/**
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 026ac01af9da..ae5029f984a8 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -157,7 +157,8 @@ EXPORT_SYMBOL(jiffies_64);
/*
* The time start value for each level to select the bucket at enqueue
- * time.
+ * time. We start from the last possible delta of the previous level
+ * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
*/
#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
@@ -204,8 +205,8 @@ struct timer_base {
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
+ bool next_expiry_recalc;
bool is_idle;
- bool must_forward_clk;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -488,35 +489,48 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
* Helper function to calculate the array index for a given expiry
* time.
*/
-static inline unsigned calc_index(unsigned expires, unsigned lvl)
+static inline unsigned calc_index(unsigned long expires, unsigned lvl,
+ unsigned long *bucket_expiry)
{
+
+ /*
+ * The timer wheel has to guarantee that a timer does not fire
+ * early. Early expiry can happen due to:
+ * - Timer is armed at the edge of a tick
+ * - Truncation of the expiry time in the outer wheel levels
+ *
+ * Round up with level granularity to prevent this.
+ */
expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
+ *bucket_expiry = expires << LVL_SHIFT(lvl);
return LVL_OFFS(lvl) + (expires & LVL_MASK);
}
-static int calc_wheel_index(unsigned long expires, unsigned long clk)
+static int calc_wheel_index(unsigned long expires, unsigned long clk,
+ unsigned long *bucket_expiry)
{
unsigned long delta = expires - clk;
unsigned int idx;
if (delta < LVL_START(1)) {
- idx = calc_index(expires, 0);
+ idx = calc_index(expires, 0, bucket_expiry);
} else if (delta < LVL_START(2)) {
- idx = calc_index(expires, 1);
+ idx = calc_index(expires, 1, bucket_expiry);
} else if (delta < LVL_START(3)) {
- idx = calc_index(expires, 2);
+ idx = calc_index(expires, 2, bucket_expiry);
} else if (delta < LVL_START(4)) {
- idx = calc_index(expires, 3);
+ idx = calc_index(expires, 3, bucket_expiry);
} else if (delta < LVL_START(5)) {
- idx = calc_index(expires, 4);
+ idx = calc_index(expires, 4, bucket_expiry);
} else if (delta < LVL_START(6)) {
- idx = calc_index(expires, 5);
+ idx = calc_index(expires, 5, bucket_expiry);
} else if (delta < LVL_START(7)) {
- idx = calc_index(expires, 6);
+ idx = calc_index(expires, 6, bucket_expiry);
} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
- idx = calc_index(expires, 7);
+ idx = calc_index(expires, 7, bucket_expiry);
} else if ((long) delta < 0) {
idx = clk & LVL_MASK;
+ *bucket_expiry = clk;
} else {
/*
* Force expire obscene large timeouts to expire at the
@@ -525,34 +539,11 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk)
if (delta >= WHEEL_TIMEOUT_CUTOFF)
expires = clk + WHEEL_TIMEOUT_MAX;
- idx = calc_index(expires, LVL_DEPTH - 1);
+ idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
}
return idx;
}
-/*
- * Enqueue the timer into the hash bucket, mark it pending in
- * the bitmap and store the index in the timer flags.
- */
-static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
- unsigned int idx)
-{
- hlist_add_head(&timer->entry, base->vectors + idx);
- __set_bit(idx, base->pending_map);
- timer_set_idx(timer, idx);
-
- trace_timer_start(timer, timer->expires, timer->flags);
-}
-
-static void
-__internal_add_timer(struct timer_base *base, struct timer_list *timer)
-{
- unsigned int idx;
-
- idx = calc_wheel_index(timer->expires, base->clk);
- enqueue_timer(base, timer, idx);
-}
-
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
@@ -574,34 +565,48 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
* timer is not deferrable. If the other CPU is on the way to idle
* then it can't set base->is_idle as we hold the base lock:
*/
- if (!base->is_idle)
- return;
+ if (base->is_idle)
+ wake_up_nohz_cpu(base->cpu);
+}
- /* Check whether this is the new first expiring timer: */
- if (time_after_eq(timer->expires, base->next_expiry))
- return;
+/*
+ * Enqueue the timer into the hash bucket, mark it pending in
+ * the bitmap, store the index in the timer flags then wake up
+ * the target CPU if needed.
+ */
+static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
+ unsigned int idx, unsigned long bucket_expiry)
+{
+
+ hlist_add_head(&timer->entry, base->vectors + idx);
+ __set_bit(idx, base->pending_map);
+ timer_set_idx(timer, idx);
+
+ trace_timer_start(timer, timer->expires, timer->flags);
/*
- * Set the next expiry time and kick the CPU so it can reevaluate the
- * wheel:
+ * Check whether this is the new first expiring timer. The
+ * effective expiry time of the timer is required here
+ * (bucket_expiry) instead of timer->expires.
*/
- if (time_before(timer->expires, base->clk)) {
+ if (time_before(bucket_expiry, base->next_expiry)) {
/*
- * Prevent from forward_timer_base() moving the base->clk
- * backward
+ * Set the next expiry time and kick the CPU so it
+ * can reevaluate the wheel:
*/
- base->next_expiry = base->clk;
- } else {
- base->next_expiry = timer->expires;
+ base->next_expiry = bucket_expiry;
+ base->next_expiry_recalc = false;
+ trigger_dyntick_cpu(base, timer);
}
- wake_up_nohz_cpu(base->cpu);
}
-static void
-internal_add_timer(struct timer_base *base, struct timer_list *timer)
+static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
{
- __internal_add_timer(base, timer);
- trigger_dyntick_cpu(base, timer);
+ unsigned long bucket_expiry;
+ unsigned int idx;
+
+ idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
+ enqueue_timer(base, timer, idx, bucket_expiry);
}
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
@@ -834,8 +839,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
if (!timer_pending(timer))
return 0;
- if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
+ if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
__clear_bit(idx, base->pending_map);
+ base->next_expiry_recalc = true;
+ }
detach_timer(timer, clear_pending);
return 1;
@@ -885,20 +892,14 @@ get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base)
{
-#ifdef CONFIG_NO_HZ_COMMON
- unsigned long jnow;
+ unsigned long jnow = READ_ONCE(jiffies);
/*
- * We only forward the base when we are idle or have just come out of
- * idle (must_forward_clk logic), and have a delta between base clock
- * and jiffies. In the common case, run_timers will take care of it.
+ * No need to forward if we are close enough below jiffies.
+ * Also while executing timers, base->clk is 1 offset ahead
+ * of jiffies to avoid endless requeuing to current jffies.
*/
- if (likely(!base->must_forward_clk))
- return;
-
- jnow = READ_ONCE(jiffies);
- base->must_forward_clk = base->is_idle;
- if ((long)(jnow - base->clk) < 2)
+ if ((long)(jnow - base->clk) < 1)
return;
/*
@@ -912,7 +913,6 @@ static inline void forward_timer_base(struct timer_base *base)
return;
base->clk = base->next_expiry;
}
-#endif
}
@@ -960,9 +960,9 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
+ unsigned long clk = 0, flags, bucket_expiry;
struct timer_base *base, *new_base;
unsigned int idx = UINT_MAX;
- unsigned long clk = 0, flags;
int ret = 0;
BUG_ON(!timer->function);
@@ -1001,7 +1001,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
}
clk = base->clk;
- idx = calc_wheel_index(expires, clk);
+ idx = calc_wheel_index(expires, clk, &bucket_expiry);
/*
* Retrieve and compare the array index of the pending
@@ -1054,16 +1054,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
/*
* If 'idx' was calculated above and the base time did not advance
* between calculating 'idx' and possibly switching the base, only
- * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
- * we need to (re)calculate the wheel index via
- * internal_add_timer().
+ * enqueue_timer() is required. Otherwise we need to (re)calculate
+ * the wheel index via internal_add_timer().
*/
- if (idx != UINT_MAX && clk == base->clk) {
- enqueue_timer(base, timer, idx);
- trigger_dyntick_cpu(base, timer);
- } else {
+ if (idx != UINT_MAX && clk == base->clk)
+ enqueue_timer(base, timer, idx, bucket_expiry);
+ else
internal_add_timer(base, timer);
- }
out_unlock:
raw_spin_unlock_irqrestore(&base->lock, flags);
@@ -1466,10 +1463,10 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
}
}
-static int __collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
+static int collect_expired_timers(struct timer_base *base,
+ struct hlist_head *heads)
{
- unsigned long clk = base->clk;
+ unsigned long clk = base->clk = base->next_expiry;
struct hlist_head *vec;
int i, levels = 0;
unsigned int idx;
@@ -1491,7 +1488,6 @@ static int __collect_expired_timers(struct timer_base *base,
return levels;
}
-#ifdef CONFIG_NO_HZ_COMMON
/*
* Find the next pending bucket of a level. Search from level start (@offset)
* + @clk upwards and if nothing there, search from start of the level
@@ -1524,6 +1520,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
clk = base->clk;
for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
+ unsigned long lvl_clk = clk & LVL_CLK_MASK;
if (pos >= 0) {
unsigned long tmp = clk + (unsigned long) pos;
@@ -1531,6 +1528,13 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
tmp <<= LVL_SHIFT(lvl);
if (time_before(tmp, next))
next = tmp;
+
+ /*
+ * If the next expiration happens before we reach
+ * the next level, no need to check further.
+ */
+ if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
+ break;
}
/*
* Clock for the next level. If the current level clock lower
@@ -1568,13 +1572,17 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
* So the simple check whether the lower bits of the current
* level are 0 or not is sufficient for all cases.
*/
- adj = clk & LVL_CLK_MASK ? 1 : 0;
+ adj = lvl_clk ? 1 : 0;
clk >>= LVL_CLK_SHIFT;
clk += adj;
}
+
+ base->next_expiry_recalc = false;
+
return next;
}
+#ifdef CONFIG_NO_HZ_COMMON
/*
* Check, if the next hrtimer event is before the next timer wheel
* event:
@@ -1631,9 +1639,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
return expires;
raw_spin_lock(&base->lock);
- nextevt = __next_timer_interrupt(base);
+ if (base->next_expiry_recalc)
+ base->next_expiry = __next_timer_interrupt(base);
+ nextevt = base->next_expiry;
is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
- base->next_expiry = nextevt;
+
/*
* We have a fresh next event. Check whether we can forward the
* base. We can only do that when @basej is past base->clk
@@ -1659,10 +1669,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
* logic is only maintained for the BASE_STD base, deferrable
* timers may still see large granularity skew (by design).
*/
- if ((expires - basem) > TICK_NSEC) {
- base->must_forward_clk = true;
+ if ((expires - basem) > TICK_NSEC)
base->is_idle = true;
- }
}
raw_spin_unlock(&base->lock);
@@ -1686,42 +1694,6 @@ void timer_clear_idle(void)
*/
base->is_idle = false;
}
-
-static int collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
-{
- unsigned long now = READ_ONCE(jiffies);
-
- /*
- * NOHZ optimization. After a long idle sleep we need to forward the
- * base to current jiffies. Avoid a loop by searching the bitfield for
- * the next expiring timer.
- */
- if ((long)(now - base->clk) > 2) {
- unsigned long next = __next_timer_interrupt(base);
-
- /*
- * If the next timer is ahead of time forward to current
- * jiffies, otherwise forward to the next expiry time:
- */
- if (time_after(next, now)) {
- /*
- * The call site will increment base->clk and then
- * terminate the expiry loop immediately.
- */
- base->clk = now;
- return 0;
- }
- base->clk = next;
- }
- return __collect_expired_timers(base, heads);
-}
-#else
-static inline int collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
-{
- return __collect_expired_timers(base, heads);
-}
#endif
/*
@@ -1761,32 +1733,23 @@ static inline void __run_timers(struct timer_base *base)
struct hlist_head heads[LVL_DEPTH];
int levels;
- if (!time_after_eq(jiffies, base->clk))
+ if (time_before(jiffies, base->next_expiry))
return;
timer_base_lock_expiry(base);
raw_spin_lock_irq(&base->lock);
- /*
- * timer_base::must_forward_clk must be cleared before running
- * timers so that any timer functions that call mod_timer() will
- * not try to forward the base. Idle tracking / clock forwarding
- * logic is only used with BASE_STD timers.
- *
- * The must_forward_clk flag is cleared unconditionally also for
- * the deferrable base. The deferrable base is not affected by idle
- * tracking and never forwarded, so clearing the flag is a NOOP.
- *
- * The fact that the deferrable base is never forwarded can cause
- * large variations in granularity for deferrable timers, but they
- * can be deferred for long periods due to idle anyway.
- */
- base->must_forward_clk = false;
-
- while (time_after_eq(jiffies, base->clk)) {
-
+ while (time_after_eq(jiffies, base->clk) &&
+ time_after_eq(jiffies, base->next_expiry)) {
levels = collect_expired_timers(base, heads);
+ /*
+ * The only possible reason for not finding any expired
+ * timer at this clk is that all matching timers have been
+ * dequeued.
+ */
+ WARN_ON_ONCE(!levels && !base->next_expiry_recalc);
base->clk++;
+ base->next_expiry = __next_timer_interrupt(base);
while (levels--)
expire_timers(base, heads + levels);
@@ -1816,12 +1779,12 @@ void run_local_timers(void)
hrtimer_run_queues();
/* Raise the softirq only if required. */
- if (time_before(jiffies, base->clk)) {
+ if (time_before(jiffies, base->next_expiry)) {
if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
return;
/* CPU is awake, so check the deferrable base. */
base++;
- if (time_before(jiffies, base->clk))
+ if (time_before(jiffies, base->next_expiry))
return;
}
raise_softirq(TIMER_SOFTIRQ);
@@ -1986,7 +1949,6 @@ int timers_prepare_cpu(unsigned int cpu)
base->clk = jiffies;
base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
base->is_idle = false;
- base->must_forward_clk = true;
}
return 0;
}
@@ -2039,6 +2001,7 @@ static void __init init_timer_cpu(int cpu)
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
+ base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
timer_base_init_expiry_lock(base);
}
}
diff --git a/kernel/torture.c b/kernel/torture.c
index a1a41484ff6d..1061492f14bd 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -45,6 +45,9 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
static bool disable_onoff_at_boot;
module_param(disable_onoff_at_boot, bool, 0444);
+static bool ftrace_dump_at_shutdown;
+module_param(ftrace_dump_at_shutdown, bool, 0444);
+
static char *torture_type;
static int verbose;
@@ -527,7 +530,8 @@ static int torture_shutdown(void *arg)
torture_shutdown_hook();
else
VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
- rcu_ftrace_dump(DUMP_ALL);
+ if (ftrace_dump_at_shutdown)
+ rcu_ftrace_dump(DUMP_ALL);
kernel_power_off(); /* Shut down the system. */
return 0;
}
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 5ef0484513ec..7ba62d68885a 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -348,7 +348,7 @@ static int __blk_trace_remove(struct request_queue *q)
struct blk_trace *bt;
bt = rcu_replace_pointer(q->blk_trace, NULL,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
if (!bt)
return -EINVAL;
@@ -362,9 +362,9 @@ int blk_trace_remove(struct request_queue *q)
{
int ret;
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
ret = __blk_trace_remove(q);
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -483,12 +483,11 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
struct dentry *dir = NULL;
int ret;
+ lockdep_assert_held(&q->debugfs_mutex);
+
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
- if (!blk_debugfs_root)
- return -ENOENT;
-
strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
@@ -503,7 +502,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
* we can be.
*/
if (rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex))) {
+ lockdep_is_held(&q->debugfs_mutex))) {
pr_warn("Concurrent blktraces are not allowed on %s\n",
buts->name);
return -EBUSY;
@@ -522,12 +521,29 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!bt->msg_data)
goto err;
- ret = -ENOENT;
-
- dir = debugfs_lookup(buts->name, blk_debugfs_root);
- if (!dir)
+ /*
+ * When tracing the whole disk reuse the existing debugfs directory
+ * created by the block layer on init. For partitions block devices,
+ * and scsi-generic block devices we create a temporary new debugfs
+ * directory that will be removed once the trace ends.
+ */
+ if (bdev && bdev == bdev->bd_contains)
+ dir = q->debugfs_dir;
+ else
bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
+ /*
+ * As blktrace relies on debugfs for its interface the debugfs directory
+ * is required, contrary to the usual mantra of not checking for debugfs
+ * files or directories.
+ */
+ if (IS_ERR_OR_NULL(dir)) {
+ pr_warn("debugfs_dir not present for %s so skipping\n",
+ buts->name);
+ ret = -ENOENT;
+ goto err;
+ }
+
bt->dev = dev;
atomic_set(&bt->dropped, 0);
INIT_LIST_HEAD(&bt->running_list);
@@ -563,8 +579,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
ret = 0;
err:
- if (dir && !bt->dir)
- dput(dir);
if (ret)
blk_trace_free(bt);
return ret;
@@ -597,9 +611,9 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
{
int ret;
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
ret = __blk_trace_setup(q, name, dev, bdev, arg);
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -645,7 +659,7 @@ static int __blk_trace_startstop(struct request_queue *q, int start)
struct blk_trace *bt;
bt = rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
if (bt == NULL)
return -EINVAL;
@@ -685,9 +699,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
ret = __blk_trace_startstop(q, start);
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -716,7 +730,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
if (!q)
return -ENXIO;
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
switch (cmd) {
case BLKTRACESETUP:
@@ -743,7 +757,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
break;
}
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -754,14 +768,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
**/
void blk_trace_shutdown(struct request_queue *q)
{
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
if (rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex))) {
+ lockdep_is_held(&q->debugfs_mutex))) {
__blk_trace_startstop(q, 0);
__blk_trace_remove(q);
}
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
}
#ifdef CONFIG_BLK_CGROUP
@@ -846,6 +860,13 @@ static void blk_add_trace_rq_issue(void *ignore,
blk_trace_request_get_cgid(q, rq));
}
+static void blk_add_trace_rq_merge(void *ignore,
+ struct request_queue *q, struct request *rq)
+{
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE,
+ blk_trace_request_get_cgid(q, rq));
+}
+
static void blk_add_trace_rq_requeue(void *ignore,
struct request_queue *q,
struct request *rq)
@@ -1130,6 +1151,8 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
WARN_ON(ret);
+ ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
+ WARN_ON(ret);
ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
@@ -1176,6 +1199,7 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
+ unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
@@ -1642,7 +1666,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
struct blk_trace *bt;
bt = rcu_replace_pointer(q->blk_trace, NULL,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
if (bt == NULL)
return -EINVAL;
@@ -1817,10 +1841,10 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
if (q == NULL)
goto out_bdput;
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
bt = rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
if (attr == &dev_attr_enable) {
ret = sprintf(buf, "%u\n", !!bt);
goto out_unlock_bdev;
@@ -1838,7 +1862,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
ret = sprintf(buf, "%llu\n", bt->end_lba);
out_unlock_bdev:
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
out_bdput:
bdput(bdev);
out:
@@ -1881,10 +1905,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
if (q == NULL)
goto out_bdput;
- mutex_lock(&q->blk_trace_mutex);
+ mutex_lock(&q->debugfs_mutex);
bt = rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
if (attr == &dev_attr_enable) {
if (!!value == !!bt) {
ret = 0;
@@ -1901,7 +1925,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
if (bt == NULL) {
ret = blk_trace_setup_queue(q, bdev);
bt = rcu_dereference_protected(q->blk_trace,
- lockdep_is_held(&q->blk_trace_mutex));
+ lockdep_is_held(&q->debugfs_mutex));
}
if (ret == 0) {
@@ -1916,7 +1940,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
}
out_unlock_bdev:
- mutex_unlock(&q->blk_trace_mutex);
+ mutex_unlock(&q->debugfs_mutex);
out_bdput:
bdput(bdev);
out:
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1903b80db6eb..72064541bef2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2764,6 +2764,50 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
}
+/* List of trace_ops that have allocated trampolines */
+static LIST_HEAD(ftrace_ops_trampoline_list);
+
+static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops)
+{
+ lockdep_assert_held(&ftrace_lock);
+ list_add_rcu(&ops->list, &ftrace_ops_trampoline_list);
+}
+
+static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops)
+{
+ lockdep_assert_held(&ftrace_lock);
+ list_del_rcu(&ops->list);
+}
+
+/*
+ * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols
+ * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is
+ * not a module.
+ */
+#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace"
+#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline"
+
+static void ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+ if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) &&
+ ops->trampoline) {
+ /*
+ * Record the text poke event before the ksymbol unregister
+ * event.
+ */
+ perf_event_text_poke((void *)ops->trampoline,
+ (void *)ops->trampoline,
+ ops->trampoline_size, NULL, 0);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ ops->trampoline, ops->trampoline_size,
+ true, FTRACE_TRAMPOLINE_SYM);
+ /* Remove from kallsyms after the perf events */
+ ftrace_remove_trampoline_from_kallsyms(ops);
+ }
+
+ arch_ftrace_trampoline_free(ops);
+}
+
static void ftrace_startup_enable(int command)
{
if (saved_ftrace_func != ftrace_trace_function) {
@@ -2934,7 +2978,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
synchronize_rcu_tasks();
free_ops:
- arch_ftrace_trampoline_free(ops);
+ ftrace_trampoline_free(ops);
}
return 0;
@@ -6178,6 +6222,27 @@ struct ftrace_mod_map {
unsigned int num_funcs;
};
+static int ftrace_get_trampoline_kallsym(unsigned int symnum,
+ unsigned long *value, char *type,
+ char *name, char *module_name,
+ int *exported)
+{
+ struct ftrace_ops *op;
+
+ list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) {
+ if (!op->trampoline || symnum--)
+ continue;
+ *value = op->trampoline;
+ *type = 't';
+ strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN);
+ strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN);
+ *exported = 0;
+ return 0;
+ }
+
+ return -ERANGE;
+}
+
#ifdef CONFIG_MODULES
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
@@ -6514,6 +6579,7 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
{
struct ftrace_mod_map *mod_map;
struct ftrace_mod_func *mod_func;
+ int ret;
preempt_disable();
list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
@@ -6540,8 +6606,10 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
WARN_ON(1);
break;
}
+ ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
+ module_name, exported);
preempt_enable();
- return -ERANGE;
+ return ret;
}
#else
@@ -6553,6 +6621,18 @@ allocate_ftrace_mod_map(struct module *mod,
{
return NULL;
}
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+ char *type, char *name, char *module_name,
+ int *exported)
+{
+ int ret;
+
+ preempt_disable();
+ ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
+ module_name, exported);
+ preempt_enable();
+ return ret;
+}
#endif /* CONFIG_MODULES */
struct ftrace_init_func {
@@ -6733,7 +6813,24 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
static void ftrace_update_trampoline(struct ftrace_ops *ops)
{
+ unsigned long trampoline = ops->trampoline;
+
arch_ftrace_update_trampoline(ops);
+ if (ops->trampoline && ops->trampoline != trampoline &&
+ (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) {
+ /* Add to kallsyms before the perf events */
+ ftrace_add_trampoline_to_kallsyms(ops);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ ops->trampoline, ops->trampoline_size, false,
+ FTRACE_TRAMPOLINE_SYM);
+ /*
+ * Record the perf text poke event after the ksymbol register
+ * event.
+ */
+ perf_event_text_poke((void *)ops->trampoline, NULL, 0,
+ (void *)ops->trampoline,
+ ops->trampoline_size);
+ }
}
void ftrace_init_trace_array(struct trace_array *tr)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 00867ff82412..f15471ce969e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -577,7 +577,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
*/
int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
{
- struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
+ struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
int ret = 0;
diff --git a/kernel/umh.c b/kernel/umh.c
index 79f139a7ca03..a25433f9cd9a 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -26,8 +26,6 @@
#include <linux/ptrace.h>
#include <linux/async.h>
#include <linux/uaccess.h>
-#include <linux/shmem_fs.h>
-#include <linux/pipe_fs_i.h>
#include <trace/events/module.h>
@@ -38,8 +36,6 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
static DEFINE_SPINLOCK(umh_sysctl_lock);
static DECLARE_RWSEM(umhelper_sem);
-static LIST_HEAD(umh_list);
-static DEFINE_MUTEX(umh_list_lock);
static void call_usermodehelper_freeinfo(struct subprocess_info *info)
{
@@ -102,16 +98,9 @@ static int call_usermodehelper_exec_async(void *data)
commit_creds(new);
- sub_info->pid = task_pid_nr(current);
- if (sub_info->file) {
- retval = do_execve_file(sub_info->file,
- sub_info->argv, sub_info->envp);
- if (!retval)
- current->flags |= PF_UMH;
- } else
- retval = do_execve(getname_kernel(sub_info->path),
- (const char __user *const __user *)sub_info->argv,
- (const char __user *const __user *)sub_info->envp);
+ retval = kernel_execve(sub_info->path,
+ (const char *const *)sub_info->argv,
+ (const char *const *)sub_info->envp);
out:
sub_info->retval = retval;
/*
@@ -405,140 +394,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
}
EXPORT_SYMBOL(call_usermodehelper_setup);
-struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
- int (*init)(struct subprocess_info *info, struct cred *new),
- void (*cleanup)(struct subprocess_info *info), void *data)
-{
- struct subprocess_info *sub_info;
- struct umh_info *info = data;
- const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
-
- sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);
- if (!sub_info)
- return NULL;
-
- sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL);
- if (!sub_info->argv) {
- kfree(sub_info);
- return NULL;
- }
-
- INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
- sub_info->path = "none";
- sub_info->file = file;
- sub_info->init = init;
- sub_info->cleanup = cleanup;
- sub_info->data = data;
- return sub_info;
-}
-
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
-{
- struct umh_info *umh_info = info->data;
- struct file *from_umh[2];
- struct file *to_umh[2];
- int err;
-
- /* create pipe to send data to umh */
- err = create_pipe_files(to_umh, 0);
- if (err)
- return err;
- err = replace_fd(0, to_umh[0], 0);
- fput(to_umh[0]);
- if (err < 0) {
- fput(to_umh[1]);
- return err;
- }
-
- /* create pipe to receive data from umh */
- err = create_pipe_files(from_umh, 0);
- if (err) {
- fput(to_umh[1]);
- replace_fd(0, NULL, 0);
- return err;
- }
- err = replace_fd(1, from_umh[1], 0);
- fput(from_umh[1]);
- if (err < 0) {
- fput(to_umh[1]);
- replace_fd(0, NULL, 0);
- fput(from_umh[0]);
- return err;
- }
-
- umh_info->pipe_to_umh = to_umh[1];
- umh_info->pipe_from_umh = from_umh[0];
- return 0;
-}
-
-static void umh_clean_and_save_pid(struct subprocess_info *info)
-{
- struct umh_info *umh_info = info->data;
-
- /* cleanup if umh_pipe_setup() was successful but exec failed */
- if (info->pid && info->retval) {
- fput(umh_info->pipe_to_umh);
- fput(umh_info->pipe_from_umh);
- }
-
- argv_free(info->argv);
- umh_info->pid = info->pid;
-}
-
-/**
- * fork_usermode_blob - fork a blob of bytes as a usermode process
- * @data: a blob of bytes that can be do_execv-ed as a file
- * @len: length of the blob
- * @info: information about usermode process (shouldn't be NULL)
- *
- * If info->cmdline is set it will be used as command line for the
- * user process, else "usermodehelper" is used.
- *
- * Returns either negative error or zero which indicates success
- * in executing a blob of bytes as a usermode process. In such
- * case 'struct umh_info *info' is populated with two pipes
- * and a pid of the process. The caller is responsible for health
- * check of the user process, killing it via pid, and closing the
- * pipes when user process is no longer needed.
- */
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
-{
- struct subprocess_info *sub_info;
- struct file *file;
- ssize_t written;
- loff_t pos = 0;
- int err;
-
- file = shmem_kernel_file_setup("", len, 0);
- if (IS_ERR(file))
- return PTR_ERR(file);
-
- written = kernel_write(file, data, len, &pos);
- if (written != len) {
- err = written;
- if (err >= 0)
- err = -ENOMEM;
- goto out;
- }
-
- err = -ENOMEM;
- sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup,
- umh_clean_and_save_pid, info);
- if (!sub_info)
- goto out;
-
- err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
- if (!err) {
- mutex_lock(&umh_list_lock);
- list_add(&info->list, &umh_list);
- mutex_unlock(&umh_list_lock);
- }
-out:
- fput(file);
- return err;
-}
-EXPORT_SYMBOL_GPL(fork_usermode_blob);
-
/**
* call_usermodehelper_exec - start a usermode application
* @sub_info: information about the subprocessa
@@ -700,26 +555,6 @@ static int proc_cap_handler(struct ctl_table *table, int write,
return 0;
}
-void __exit_umh(struct task_struct *tsk)
-{
- struct umh_info *info;
- pid_t pid = tsk->pid;
-
- mutex_lock(&umh_list_lock);
- list_for_each_entry(info, &umh_list, list) {
- if (info->pid == pid) {
- list_del(&info->list);
- mutex_unlock(&umh_list_lock);
- goto out;
- }
- }
- mutex_unlock(&umh_list_lock);
- return;
-out:
- if (info->cleanup)
- info->cleanup(info);
-}
-
struct ctl_table usermodehelper_table[] = {
{
.procname = "bset",
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
new file mode 100644
index 000000000000..0b35212ffc3d
--- /dev/null
+++ b/kernel/usermode_driver.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * umd - User mode driver support
+ */
+#include <linux/shmem_fs.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mount.h>
+#include <linux/fs_struct.h>
+#include <linux/task_work.h>
+#include <linux/usermode_driver.h>
+
+static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
+{
+ struct file_system_type *type;
+ struct vfsmount *mnt;
+ struct file *file;
+ ssize_t written;
+ loff_t pos = 0;
+
+ type = get_fs_type("tmpfs");
+ if (!type)
+ return ERR_PTR(-ENODEV);
+
+ mnt = kern_mount(type);
+ put_filesystem(type);
+ if (IS_ERR(mnt))
+ return mnt;
+
+ file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700);
+ if (IS_ERR(file)) {
+ mntput(mnt);
+ return ERR_CAST(file);
+ }
+
+ written = kernel_write(file, data, len, &pos);
+ if (written != len) {
+ int err = written;
+ if (err >= 0)
+ err = -ENOMEM;
+ filp_close(file, NULL);
+ mntput(mnt);
+ return ERR_PTR(err);
+ }
+
+ fput(file);
+
+ /* Flush delayed fput so exec can open the file read-only */
+ flush_delayed_fput();
+ task_work_run();
+ return mnt;
+}
+
+/**
+ * umd_load_blob - Remember a blob of bytes for fork_usermode_driver
+ * @info: information about usermode driver
+ * @data: a blob of bytes that can be executed as a file
+ * @len: The lentgh of the blob
+ *
+ */
+int umd_load_blob(struct umd_info *info, const void *data, size_t len)
+{
+ struct vfsmount *mnt;
+
+ if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt))
+ return -EBUSY;
+
+ mnt = blob_to_mnt(data, len, info->driver_name);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ info->wd.mnt = mnt;
+ info->wd.dentry = mnt->mnt_root;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(umd_load_blob);
+
+/**
+ * umd_unload_blob - Disassociate @info from a previously loaded blob
+ * @info: information about usermode driver
+ *
+ */
+int umd_unload_blob(struct umd_info *info)
+{
+ if (WARN_ON_ONCE(!info->wd.mnt ||
+ !info->wd.dentry ||
+ info->wd.mnt->mnt_root != info->wd.dentry))
+ return -EINVAL;
+
+ kern_unmount(info->wd.mnt);
+ info->wd.mnt = NULL;
+ info->wd.dentry = NULL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(umd_unload_blob);
+
+static int umd_setup(struct subprocess_info *info, struct cred *new)
+{
+ struct umd_info *umd_info = info->data;
+ struct file *from_umh[2];
+ struct file *to_umh[2];
+ int err;
+
+ /* create pipe to send data to umh */
+ err = create_pipe_files(to_umh, 0);
+ if (err)
+ return err;
+ err = replace_fd(0, to_umh[0], 0);
+ fput(to_umh[0]);
+ if (err < 0) {
+ fput(to_umh[1]);
+ return err;
+ }
+
+ /* create pipe to receive data from umh */
+ err = create_pipe_files(from_umh, 0);
+ if (err) {
+ fput(to_umh[1]);
+ replace_fd(0, NULL, 0);
+ return err;
+ }
+ err = replace_fd(1, from_umh[1], 0);
+ fput(from_umh[1]);
+ if (err < 0) {
+ fput(to_umh[1]);
+ replace_fd(0, NULL, 0);
+ fput(from_umh[0]);
+ return err;
+ }
+
+ set_fs_pwd(current->fs, &umd_info->wd);
+ umd_info->pipe_to_umh = to_umh[1];
+ umd_info->pipe_from_umh = from_umh[0];
+ umd_info->tgid = get_pid(task_tgid(current));
+ return 0;
+}
+
+static void umd_cleanup(struct subprocess_info *info)
+{
+ struct umd_info *umd_info = info->data;
+
+ /* cleanup if umh_setup() was successful but exec failed */
+ if (info->retval) {
+ fput(umd_info->pipe_to_umh);
+ fput(umd_info->pipe_from_umh);
+ put_pid(umd_info->tgid);
+ umd_info->tgid = NULL;
+ }
+}
+
+/**
+ * fork_usermode_driver - fork a usermode driver
+ * @info: information about usermode driver (shouldn't be NULL)
+ *
+ * Returns either negative error or zero which indicates success in
+ * executing a usermode driver. In such case 'struct umd_info *info'
+ * is populated with two pipes and a tgid of the process. The caller is
+ * responsible for health check of the user process, killing it via
+ * tgid, and closing the pipes when user process is no longer needed.
+ */
+int fork_usermode_driver(struct umd_info *info)
+{
+ struct subprocess_info *sub_info;
+ const char *argv[] = { info->driver_name, NULL };
+ int err;
+
+ if (WARN_ON_ONCE(info->tgid))
+ return -EBUSY;
+
+ err = -ENOMEM;
+ sub_info = call_usermodehelper_setup(info->driver_name,
+ (char **)argv, NULL, GFP_KERNEL,
+ umd_setup, umd_cleanup, info);
+ if (!sub_info)
+ goto out;
+
+ err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(fork_usermode_driver);
+
+