Diffstat (limited to 'kernel/trace')
-rw-r--r--  kernel/trace/bpf_trace.c             |  193
-rw-r--r--  kernel/trace/fgraph.c                | 1092
-rw-r--r--  kernel/trace/ftrace.c                |  691
-rw-r--r--  kernel/trace/ftrace_internal.h       |   18
-rw-r--r--  kernel/trace/pid_list.c              |    5
-rw-r--r--  kernel/trace/preemptirq_delay_test.c |    2
-rw-r--r--  kernel/trace/ring_buffer.c           | 1005
-rw-r--r--  kernel/trace/rv/rv.c                 |    3
-rw-r--r--  kernel/trace/rv/rv_reactors.c        |    1
-rw-r--r--  kernel/trace/trace.c                 |  424
-rw-r--r--  kernel/trace/trace.h                 |  130
-rw-r--r--  kernel/trace/trace_eprobe.c          |    7
-rw-r--r--  kernel/trace/trace_events.c          |   41
-rw-r--r--  kernel/trace/trace_events_hist.c     |    4
-rw-r--r--  kernel/trace/trace_events_inject.c   |    2
-rw-r--r--  kernel/trace/trace_events_trigger.c  |    6
-rw-r--r--  kernel/trace/trace_events_user.c     |    2
-rw-r--r--  kernel/trace/trace_fprobe.c          |  185
-rw-r--r--  kernel/trace/trace_functions.c       |   15
-rw-r--r--  kernel/trace/trace_functions_graph.c |  119
-rw-r--r--  kernel/trace/trace_hwlat.c           |    2
-rw-r--r--  kernel/trace/trace_irqsoff.c         |   10
-rw-r--r--  kernel/trace/trace_kprobe.c          |  198
-rw-r--r--  kernel/trace/trace_osnoise.c         |   56
-rw-r--r--  kernel/trace/trace_output.c          |   17
-rw-r--r--  kernel/trace/trace_probe.c           |    2
-rw-r--r--  kernel/trace/trace_sched_wakeup.c    |   12
-rw-r--r--  kernel/trace/trace_selftest.c        |  274
-rw-r--r--  kernel/trace/trace_stack.c           |    2
-rw-r--r--  kernel/trace/trace_syscalls.c        |   12
-rw-r--r--  kernel/trace/trace_uprobe.c          |   81
-rw-r--r--  kernel/trace/tracing_map.c           |    6
32 files changed, 3702 insertions(+), 915 deletions(-)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d1daeab1bbc1..630b763e5240 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -24,7 +24,6 @@
#include <linux/key.h>
#include <linux/verification.h>
#include <linux/namei.h>
-#include <linux/fileattr.h>
#include <net/bpf_sk_storage.h>
@@ -798,29 +797,6 @@ const struct bpf_func_proto bpf_task_pt_regs_proto = {
.ret_btf_id = &bpf_task_pt_regs_ids[0],
};
-BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
-{
- struct bpf_array *array = container_of(map, struct bpf_array, map);
- struct cgroup *cgrp;
-
- if (unlikely(idx >= array->map.max_entries))
- return -E2BIG;
-
- cgrp = READ_ONCE(array->ptrs[idx]);
- if (unlikely(!cgrp))
- return -EAGAIN;
-
- return task_under_cgroup_hierarchy(current, cgrp);
-}
-
-static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
- .func = bpf_current_task_under_cgroup,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_ANYTHING,
-};
-
struct send_signal_irq_work {
struct irq_work irq_work;
struct task_struct *task;
@@ -1226,7 +1202,8 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_LONG,
+ .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg3_size = sizeof(u64),
};
BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
@@ -1242,7 +1219,8 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = {
.func = get_func_ret,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_LONG,
+ .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg2_size = sizeof(u64),
};
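These verifier-facing changes mean a BPF program must now pass a pointer to a full, aligned u64 for the output argument of these helpers. As a rough illustration only (the program name, section, and attach point are my own assumptions, not part of this patch), a libbpf-style fentry program calling bpf_get_func_arg() might look like:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("fentry/kernel_clone")
int BPF_PROG(probe_clone)
{
        __u64 arg0 = 0;

        /* The output pointer must reference a full, aligned u64 */
        if (bpf_get_func_arg(ctx, 0, &arg0))
                return 0;

        bpf_printk("first arg: %llu", arg0);
        return 0;
}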
BPF_CALL_1(get_func_arg_cnt, void *, ctx)
@@ -1369,8 +1347,8 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
/**
* bpf_verify_pkcs7_signature - verify a PKCS#7 signature
- * @data_ptr: data to verify
- * @sig_ptr: signature of the data
+ * @data_p: data to verify
+ * @sig_p: signature of the data
* @trusted_keyring: keyring with keys trusted for signature verification
*
 * Verify the PKCS#7 signature *sig_p* against the supplied *data_p*
@@ -1378,10 +1356,12 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
*
* Return: 0 on success, a negative value on error.
*/
-__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
- struct bpf_dynptr_kern *sig_ptr,
+__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
+ struct bpf_dynptr *sig_p,
struct bpf_key *trusted_keyring)
{
+ struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
+ struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
const void *data, *sig;
u32 data_len, sig_len;
int ret;
@@ -1437,72 +1417,6 @@ static int __init bpf_key_sig_kfuncs_init(void)
late_initcall(bpf_key_sig_kfuncs_init);
#endif /* CONFIG_KEYS */
-/* filesystem kfuncs */
-__bpf_kfunc_start_defs();
-
-/**
- * bpf_get_file_xattr - get xattr of a file
- * @file: file to get xattr from
- * @name__str: name of the xattr
- * @value_ptr: output buffer of the xattr value
- *
- * Get xattr *name__str* of *file* and store the output in *value_ptr*.
- *
- * For security reasons, only *name__str* with prefix "user." is allowed.
- *
- * Return: 0 on success, a negative value on error.
- */
-__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
- struct bpf_dynptr_kern *value_ptr)
-{
- struct dentry *dentry;
- u32 value_len;
- void *value;
- int ret;
-
- if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- return -EPERM;
-
- value_len = __bpf_dynptr_size(value_ptr);
- value = __bpf_dynptr_data_rw(value_ptr, value_len);
- if (!value)
- return -EINVAL;
-
- dentry = file_dentry(file);
- ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ);
- if (ret)
- return ret;
- return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len);
-}
-
-__bpf_kfunc_end_defs();
-
-BTF_KFUNCS_START(fs_kfunc_set_ids)
-BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
-BTF_KFUNCS_END(fs_kfunc_set_ids)
-
-static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id)
-{
- if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id))
- return 0;
-
- /* Only allow to attach from LSM hooks, to avoid recursion */
- return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0;
-}
-
-static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
- .owner = THIS_MODULE,
- .set = &fs_kfunc_set_ids,
- .filter = bpf_get_file_xattr_filter,
-};
-
-static int __init bpf_fs_kfuncs_init(void)
-{
- return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set);
-}
-
-late_initcall(bpf_fs_kfuncs_init);
-
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1545,8 +1459,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_numa_node_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
- case BPF_FUNC_current_task_under_cgroup:
- return &bpf_current_task_under_cgroup_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_probe_write_user:
@@ -1575,6 +1487,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_cgrp_storage_get_proto;
case BPF_FUNC_cgrp_storage_delete:
return &bpf_cgrp_storage_delete_proto;
+ case BPF_FUNC_current_task_under_cgroup:
+ return &bpf_current_task_under_cgroup_proto;
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
@@ -1595,7 +1509,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_jiffies64:
return &bpf_jiffies64_proto;
case BPF_FUNC_get_task_stack:
- return &bpf_get_task_stack_proto;
+ return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
+ : &bpf_get_task_stack_proto;
case BPF_FUNC_copy_from_user:
return &bpf_copy_from_user_proto;
case BPF_FUNC_copy_from_user_task:
@@ -1651,7 +1566,7 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
case BPF_FUNC_get_stack:
- return &bpf_get_stack_proto;
+ return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
case BPF_FUNC_override_return:
return &bpf_override_return_proto;
@@ -2301,8 +2216,6 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array);
- if (ret == -ENOENT)
- goto unlock;
if (ret < 0) {
bpf_prog_array_delete_safe(old_array, event->prog);
} else {
@@ -3157,6 +3070,7 @@ struct bpf_uprobe {
loff_t offset;
unsigned long ref_ctr_offset;
u64 cookie;
+ struct uprobe *uprobe;
struct uprobe_consumer consumer;
};
@@ -3175,15 +3089,15 @@ struct bpf_uprobe_multi_run_ctx {
struct bpf_uprobe *uprobe;
};
-static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes,
- u32 cnt)
+static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt)
{
u32 i;
- for (i = 0; i < cnt; i++) {
- uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
- &uprobes[i].consumer);
- }
+ for (i = 0; i < cnt; i++)
+ uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer);
+
+ if (cnt)
+ uprobe_unregister_sync();
}
static void bpf_uprobe_multi_link_release(struct bpf_link *link)
@@ -3191,7 +3105,7 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link)
struct bpf_uprobe_multi_link *umulti_link;
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
- bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt);
+ bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt);
if (umulti_link->task)
put_task_struct(umulti_link->task);
path_put(&umulti_link->path);
@@ -3217,7 +3131,8 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
struct bpf_uprobe_multi_link *umulti_link;
u32 ucount = info->uprobe_multi.count;
int err = 0, i;
- long left;
+ char *p, *buf;
+ long left = 0;
if (!upath ^ !upath_size)
return -EINVAL;
@@ -3231,26 +3146,23 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
info->uprobe_multi.pid = umulti_link->task ?
task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
- if (upath) {
- char *p, *buf;
-
- upath_size = min_t(u32, upath_size, PATH_MAX);
-
- buf = kmalloc(upath_size, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- p = d_path(&umulti_link->path, buf, upath_size);
- if (IS_ERR(p)) {
- kfree(buf);
- return PTR_ERR(p);
- }
- upath_size = buf + upath_size - p;
- left = copy_to_user(upath, p, upath_size);
+ upath_size = upath_size ? min_t(u32, upath_size, PATH_MAX) : PATH_MAX;
+ buf = kmalloc(upath_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ p = d_path(&umulti_link->path, buf, upath_size);
+ if (IS_ERR(p)) {
kfree(buf);
- if (left)
- return -EFAULT;
- info->uprobe_multi.path_size = upath_size;
+ return PTR_ERR(p);
}
+ upath_size = buf + upath_size - p;
+
+ if (upath)
+ left = copy_to_user(upath, p, upath_size);
+ kfree(buf);
+ if (left)
+ return -EFAULT;
+ info->uprobe_multi.path_size = upath_size;
if (!uoffsets && !ucookies && !uref_ctr_offsets)
return 0;
@@ -3295,7 +3207,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
struct bpf_run_ctx *old_run_ctx;
int err = 0;
- if (link->task && current->mm != link->task->mm)
+ if (link->task && !same_thread_group(current, link->task))
return 0;
if (sleepable)
@@ -3319,8 +3231,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
}
static bool
-uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx,
- struct mm_struct *mm)
+uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm)
{
struct bpf_uprobe *uprobe;
@@ -3477,22 +3388,26 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
&bpf_uprobe_multi_link_lops, prog);
for (i = 0; i < cnt; i++) {
- err = uprobe_register_refctr(d_real_inode(link->path.dentry),
- uprobes[i].offset,
- uprobes[i].ref_ctr_offset,
- &uprobes[i].consumer);
- if (err) {
- bpf_uprobe_unregister(&path, uprobes, i);
- goto error_free;
+ uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry),
+ uprobes[i].offset,
+ uprobes[i].ref_ctr_offset,
+ &uprobes[i].consumer);
+ if (IS_ERR(uprobes[i].uprobe)) {
+ err = PTR_ERR(uprobes[i].uprobe);
+ link->cnt = i;
+ goto error_unregister;
}
}
err = bpf_link_prime(&link->link, &link_primer);
if (err)
- goto error_free;
+ goto error_unregister;
return bpf_link_settle(&link_primer);
+error_unregister:
+ bpf_uprobe_unregister(uprobes, link->cnt);
+
error_free:
kvfree(uprobes);
kfree(link);
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index a130b2d898f7..69e226a48daa 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -7,9 +7,11 @@
*
* Highly modified by Steven Rostedt (VMware).
*/
+#include <linux/bits.h>
#include <linux/jump_label.h>
#include <linux/suspend.h>
#include <linux/ftrace.h>
+#include <linux/static_call.h>
#include <linux/slab.h>
#include <trace/events/sched.h>
@@ -17,17 +19,447 @@
#include "ftrace_internal.h"
#include "trace.h"
-#ifdef CONFIG_DYNAMIC_FTRACE
-#define ASSIGN_OPS_HASH(opsname, val) \
- .func_hash = val, \
- .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
-#else
-#define ASSIGN_OPS_HASH(opsname, val)
-#endif
+/*
+ * FGRAPH_FRAME_SIZE: Size in bytes of the meta data on the shadow stack
+ * FGRAPH_FRAME_OFFSET: Size in long words of the meta data frame
+ */
+#define FGRAPH_FRAME_SIZE sizeof(struct ftrace_ret_stack)
+#define FGRAPH_FRAME_OFFSET DIV_ROUND_UP(FGRAPH_FRAME_SIZE, sizeof(long))
+
+/*
+ * On entry to a function (via function_graph_enter()), a new fgraph frame
+ * (ftrace_ret_stack) is pushed onto the stack as well as a word that
+ * holds a bitmask and a type (called "bitmap"). The bitmap is defined as:
+ *
+ * bits: 0 - 9 offset in words from the previous ftrace_ret_stack
+ *
+ * bits: 10 - 11 Type of storage
+ * 0 - reserved
+ * 1 - bitmap of fgraph_array index
+ * 2 - reserved data
+ *
+ * For type with "bitmap of fgraph_array index" (FGRAPH_TYPE_BITMAP):
+ * bits: 12 - 27 The bitmap of fgraph_ops fgraph_array index
+ * That is, it's a bitmask of 0-15 (16 bits)
+ * where if a corresponding ops in the fgraph_array[]
+ * expects a callback from the return of the function
+ * its corresponding bit will be set.
+ *
+ *
+ * The top of the ret_stack (when not empty) will always have a reference
+ * word that points to the last fgraph frame that was saved.
+ *
+ * For reserved data:
+ * bits: 12 - 17 The size in words that is stored
+ * bits: 18 - 23 The index of fgraph_array, which shows who is stored
+ *
+ * That is, at the end of function_graph_enter, if the first and fourth
+ * fgraph_ops on the fgraph_array[] (index 0 and 3) need their retfunc called
+ * on the return of the function being traced, and the fourth fgraph_ops
+ * stored two words of data, this is what will be on the task's shadow
+ * ret_stack: (the stack grows upward)
+ *
+ * ret_stack[SHADOW_STACK_OFFSET]
+ * | SHADOW_STACK_TASK_VARS(ret_stack)[15] |
+ * ...
+ * | SHADOW_STACK_TASK_VARS(ret_stack)[0] |
+ * ret_stack[SHADOW_STACK_MAX_OFFSET]
+ * ...
+ * | | <- task->curr_ret_stack
+ * +--------------------------------------------+
+ * | (3 << 12) | (3 << 10) | FGRAPH_FRAME_OFFSET|
+ * | *or put another way* |
+ * | (3 << FGRAPH_DATA_INDEX_SHIFT)| \ | This is for fgraph_ops[3].
+ * | ((2 - 1) << FGRAPH_DATA_SHIFT)| \ | The data size is 2 words.
+ * | (FGRAPH_TYPE_DATA << FGRAPH_TYPE_SHIFT)| \ |
+ * | (offset2:FGRAPH_FRAME_OFFSET+3) | <- the offset2 is from here
+ * +--------------------------------------------+ ( It is 4 words from the ret_stack)
+ * | STORED DATA WORD 2 |
+ * | STORED DATA WORD 1 |
+ * +--------------------------------------------+
+ * | (9 << 12) | (1 << 10) | FGRAPH_FRAME_OFFSET|
+ * | *or put another way* |
+ * | (BIT(3)|BIT(0)) << FGRAPH_INDEX_SHIFT | \ |
+ * | FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT| \ |
+ * | (offset1:FGRAPH_FRAME_OFFSET) | <- the offset1 is from here
+ * +--------------------------------------------+
+ * | struct ftrace_ret_stack |
+ * | (stores the saved ret pointer) | <- the offset points here
+ * +--------------------------------------------+
+ * | (X) | (N) | ( N words away from
+ * | | previous ret_stack)
+ * ...
+ * ret_stack[0]
+ *
+ * If a backtrace is required, and the real return pointer needs to be
+ * fetched, then it looks at the task's curr_ret_stack offset, if it
+ * is greater than zero (reserved, or right before popped), it would mask
+ * the value by FGRAPH_FRAME_OFFSET_MASK to get the offset of the
+ * ftrace_ret_stack structure stored on the shadow stack.
+ */
+
+/*
+ * The following is for the top word on the stack:
+ *
+ * FGRAPH_FRAME_OFFSET (0-9) holds the offset delta to the fgraph frame
+ * FGRAPH_TYPE (10-11) holds the type of word this is.
+ * (RESERVED or BITMAP)
+ */
+#define FGRAPH_FRAME_OFFSET_BITS 10
+#define FGRAPH_FRAME_OFFSET_MASK GENMASK(FGRAPH_FRAME_OFFSET_BITS - 1, 0)
+
+#define FGRAPH_TYPE_BITS 2
+#define FGRAPH_TYPE_MASK GENMASK(FGRAPH_TYPE_BITS - 1, 0)
+#define FGRAPH_TYPE_SHIFT FGRAPH_FRAME_OFFSET_BITS
+
+enum {
+ FGRAPH_TYPE_RESERVED = 0,
+ FGRAPH_TYPE_BITMAP = 1,
+ FGRAPH_TYPE_DATA = 2,
+};
+
+/*
+ * For BITMAP type:
+ * FGRAPH_INDEX (12-27) bits holding the gops index wanting return callback called
+ */
+#define FGRAPH_INDEX_BITS 16
+#define FGRAPH_INDEX_MASK GENMASK(FGRAPH_INDEX_BITS - 1, 0)
+#define FGRAPH_INDEX_SHIFT (FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_BITS)
+
+/*
+ * For DATA type:
+ * FGRAPH_DATA (12-17) bits hold the size of data (in words)
+ * FGRAPH_INDEX (18-23) bits hold the index for which gops->idx the data is for
+ *
+ * Note:
+ * data_size == 0 means 1 word, and 31 (=2^5 - 1) means 32 words.
+ */
+#define FGRAPH_DATA_BITS 5
+#define FGRAPH_DATA_MASK GENMASK(FGRAPH_DATA_BITS - 1, 0)
+#define FGRAPH_DATA_SHIFT (FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_BITS)
+#define FGRAPH_MAX_DATA_SIZE (sizeof(long) * (1 << FGRAPH_DATA_BITS))
+
+#define FGRAPH_DATA_INDEX_BITS 4
+#define FGRAPH_DATA_INDEX_MASK GENMASK(FGRAPH_DATA_INDEX_BITS - 1, 0)
+#define FGRAPH_DATA_INDEX_SHIFT (FGRAPH_DATA_SHIFT + FGRAPH_DATA_BITS)
+
+#define FGRAPH_MAX_INDEX \
+ ((FGRAPH_INDEX_SIZE << FGRAPH_DATA_BITS) + FGRAPH_RET_INDEX)
+
+#define FGRAPH_ARRAY_SIZE FGRAPH_INDEX_BITS
+
+/*
+ * SHADOW_STACK_SIZE: The size in bytes of the entire shadow stack
+ * SHADOW_STACK_OFFSET: The size in long words of the shadow stack
+ * SHADOW_STACK_MAX_OFFSET: The max offset of the stack for a new frame to be added
+ */
+#define SHADOW_STACK_SIZE (PAGE_SIZE)
+#define SHADOW_STACK_OFFSET (SHADOW_STACK_SIZE / sizeof(long))
+/* Leave a buffer at the end */
+#define SHADOW_STACK_MAX_OFFSET \
+ (SHADOW_STACK_OFFSET - (FGRAPH_FRAME_OFFSET + 1 + FGRAPH_ARRAY_SIZE))
+
+/* RET_STACK(): Return the frame from a given @offset from task @t */
+#define RET_STACK(t, offset) ((struct ftrace_ret_stack *)(&(t)->ret_stack[offset]))
+
+/*
+ * Each fgraph_ops has a reserved unsigned long at the end (top) of the
+ * ret_stack to store task specific state.
+ */
+#define SHADOW_STACK_TASK_VARS(ret_stack) \
+ ((unsigned long *)(&(ret_stack)[SHADOW_STACK_OFFSET - FGRAPH_ARRAY_SIZE]))
DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
int ftrace_graph_active;
+static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
+static unsigned long fgraph_array_bitmask;
+
+/* LRU index table for fgraph_array */
+static int fgraph_lru_table[FGRAPH_ARRAY_SIZE];
+static int fgraph_lru_next;
+static int fgraph_lru_last;
+
+/* Initialize fgraph_lru_table with unused index */
+static void fgraph_lru_init(void)
+{
+ int i;
+
+ for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
+ fgraph_lru_table[i] = i;
+}
+
+/* Release the used index to the LRU table */
+static int fgraph_lru_release_index(int idx)
+{
+ if (idx < 0 || idx >= FGRAPH_ARRAY_SIZE ||
+ WARN_ON_ONCE(fgraph_lru_table[fgraph_lru_last] != -1))
+ return -1;
+
+ fgraph_lru_table[fgraph_lru_last] = idx;
+ fgraph_lru_last = (fgraph_lru_last + 1) % FGRAPH_ARRAY_SIZE;
+
+ clear_bit(idx, &fgraph_array_bitmask);
+ return 0;
+}
+
+/* Allocate a new index from LRU table */
+static int fgraph_lru_alloc_index(void)
+{
+ int idx = fgraph_lru_table[fgraph_lru_next];
+
+ /* No id is available */
+ if (idx == -1)
+ return -1;
+
+ fgraph_lru_table[fgraph_lru_next] = -1;
+ fgraph_lru_next = (fgraph_lru_next + 1) % FGRAPH_ARRAY_SIZE;
+
+ set_bit(idx, &fgraph_array_bitmask);
+ return idx;
+}
+
+/* Get the offset to the fgraph frame from a ret_stack value */
+static inline int __get_offset(unsigned long val)
+{
+ return val & FGRAPH_FRAME_OFFSET_MASK;
+}
+
+/* Get the type of word from a ret_stack value */
+static inline int __get_type(unsigned long val)
+{
+ return (val >> FGRAPH_TYPE_SHIFT) & FGRAPH_TYPE_MASK;
+}
+
+/* Get the data_index for a DATA type ret_stack word */
+static inline int __get_data_index(unsigned long val)
+{
+ return (val >> FGRAPH_DATA_INDEX_SHIFT) & FGRAPH_DATA_INDEX_MASK;
+}
+
+/* Get the data_size for a DATA type ret_stack word */
+static inline int __get_data_size(unsigned long val)
+{
+ return ((val >> FGRAPH_DATA_SHIFT) & FGRAPH_DATA_MASK) + 1;
+}
+
+/* Get the word from the ret_stack at @offset */
+static inline unsigned long get_fgraph_entry(struct task_struct *t, int offset)
+{
+ return t->ret_stack[offset];
+}
+
+/* Get the FRAME_OFFSET from the word from the @offset on ret_stack */
+static inline int get_frame_offset(struct task_struct *t, int offset)
+{
+ return __get_offset(t->ret_stack[offset]);
+}
+
+/* For BITMAP type: get the bitmask from the @offset at ret_stack */
+static inline unsigned long
+get_bitmap_bits(struct task_struct *t, int offset)
+{
+ return (t->ret_stack[offset] >> FGRAPH_INDEX_SHIFT) & FGRAPH_INDEX_MASK;
+}
+
+/* Write the bitmap to the ret_stack at @offset (does index, offset and bitmask) */
+static inline void
+set_bitmap(struct task_struct *t, int offset, unsigned long bitmap)
+{
+ t->ret_stack[offset] = (bitmap << FGRAPH_INDEX_SHIFT) |
+ (FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET;
+}
+
+/* For DATA type: get the data saved under the ret_stack word at @offset */
+static inline void *get_data_type_data(struct task_struct *t, int offset)
+{
+ unsigned long val = t->ret_stack[offset];
+
+ if (__get_type(val) != FGRAPH_TYPE_DATA)
+ return NULL;
+ offset -= __get_data_size(val);
+ return (void *)&t->ret_stack[offset];
+}
+
+/* Create the ret_stack word for a DATA type */
+static inline unsigned long make_data_type_val(int idx, int size, int offset)
+{
+ return (idx << FGRAPH_DATA_INDEX_SHIFT) |
+ ((size - 1) << FGRAPH_DATA_SHIFT) |
+ (FGRAPH_TYPE_DATA << FGRAPH_TYPE_SHIFT) | offset;
+}
+
+/* ftrace_graph_entry set to this to tell some archs to run function graph */
+static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops)
+{
+ return 0;
+}
+
+/* ftrace_graph_return set to this to tell some archs to run function graph */
+static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops)
+{
+}
+
+static void ret_stack_set_task_var(struct task_struct *t, int idx, long val)
+{
+ unsigned long *gvals = SHADOW_STACK_TASK_VARS(t->ret_stack);
+
+ gvals[idx] = val;
+}
+
+static unsigned long *
+ret_stack_get_task_var(struct task_struct *t, int idx)
+{
+ unsigned long *gvals = SHADOW_STACK_TASK_VARS(t->ret_stack);
+
+ return &gvals[idx];
+}
+
+static void ret_stack_init_task_vars(unsigned long *ret_stack)
+{
+ unsigned long *gvals = SHADOW_STACK_TASK_VARS(ret_stack);
+
+ memset(gvals, 0, sizeof(*gvals) * FGRAPH_ARRAY_SIZE);
+}
+
+/**
+ * fgraph_reserve_data - Reserve storage on the task's ret_stack
+ * @idx: The index of fgraph_array
+ * @size_bytes: The size in bytes to reserve
+ *
+ * Reserves space of up to FGRAPH_MAX_DATA_SIZE bytes on the
+ * task's ret_stack shadow stack, for a given fgraph_ops during
+ * the entryfunc() call. If entryfunc() returns zero, the storage
+ * is discarded. An entryfunc() can only call this once per iteration.
+ * The fgraph_ops retfunc() can retrieve this stored data with
+ * fgraph_retrieve_data().
+ *
+ * Returns: On success, a pointer to the data on the stack.
+ * Otherwise, NULL if there's not enough space left on the
+ * ret_stack for the data, or if fgraph_reserve_data() was called
+ * more than once for a single entryfunc() call.
+ */
+void *fgraph_reserve_data(int idx, int size_bytes)
+{
+ unsigned long val;
+ void *data;
+ int curr_ret_stack = current->curr_ret_stack;
+ int data_size;
+
+ if (size_bytes > FGRAPH_MAX_DATA_SIZE)
+ return NULL;
+
+ /* Convert the data size to number of longs. */
+ data_size = (size_bytes + sizeof(long) - 1) >> (sizeof(long) == 4 ? 2 : 3);
+
+ val = get_fgraph_entry(current, curr_ret_stack - 1);
+ data = &current->ret_stack[curr_ret_stack];
+
+ curr_ret_stack += data_size + 1;
+ if (unlikely(curr_ret_stack >= SHADOW_STACK_MAX_OFFSET))
+ return NULL;
+
+ val = make_data_type_val(idx, data_size, __get_offset(val) + data_size + 1);
+
+ /* Set the last word to be reserved */
+ current->ret_stack[curr_ret_stack - 1] = val;
+
+ /* Make sure interrupts see this */
+ barrier();
+ current->curr_ret_stack = curr_ret_stack;
+ /* Again sync with interrupts, and reset reserve */
+ current->ret_stack[curr_ret_stack - 1] = val;
+
+ return data;
+}
+
+/**
+ * fgraph_retrieve_data - Retrieve stored data from fgraph_reserve_data()
+ * @idx: the index of fgraph_array (fgraph_ops::idx)
+ * @size_bytes: pointer to retrieved data size.
+ *
+ * This is to be called by a fgraph_ops retfunc(), to retrieve data that
+ * was stored by the fgraph_ops entryfunc() on the function entry.
+ * That is, this will retrieve the data that was reserved on the
+ * entry of the function that corresponds to the exit of the function
+ * that the fgraph_ops retfunc() is called on.
+ *
+ * Returns: The stored data from fgraph_reserve_data() called by the
+ * matching entryfunc() for the retfunc() this is called from.
+ * Or NULL if there was nothing stored.
+ */
+void *fgraph_retrieve_data(int idx, int *size_bytes)
+{
+ int offset = current->curr_ret_stack - 1;
+ unsigned long val;
+
+ val = get_fgraph_entry(current, offset);
+ while (__get_type(val) == FGRAPH_TYPE_DATA) {
+ if (__get_data_index(val) == idx)
+ goto found;
+ offset -= __get_data_size(val) + 1;
+ val = get_fgraph_entry(current, offset);
+ }
+ return NULL;
+found:
+ if (size_bytes)
+ *size_bytes = __get_data_size(val) * sizeof(long);
+ return get_data_type_data(current, offset);
+}
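As a hedged illustration (not part of this patch), here is a minimal sketch of how an fgraph_ops callback pair might use the two helpers above, following the entryfunc/retfunc signatures defined in this file; the names my_entry and my_return are hypothetical:

static int my_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops)
{
        u64 *calltime;

        /* Reserve one u64 on the shadow stack for this function entry */
        calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
        if (!calltime)
                return 0;       /* returning 0 discards the reservation */

        *calltime = trace_clock_local();
        return 1;               /* request the retfunc callback on return */
}

static void my_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops)
{
        u64 *calltime;
        int size;

        /* Retrieve what the matching entry handler stored */
        calltime = fgraph_retrieve_data(gops->idx, &size);
        if (calltime)
                pr_info("%ps took %llu ns\n", (void *)trace->func,
                        trace->rettime - *calltime);
}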
+
+/**
+ * fgraph_get_task_var - retrieve a task specific state variable
+ * @gops: The ftrace_ops that owns the task specific variable
+ *
+ * Every registered fgraph_ops has a task state variable
+ * reserved on the task's ret_stack. This function returns the
+ * address to that variable.
+ *
+ * Returns the address of the fgraph_ops @gops task-specific
+ * unsigned long variable.
+ */
+unsigned long *fgraph_get_task_var(struct fgraph_ops *gops)
+{
+ return ret_stack_get_task_var(current, gops->idx);
+}
+
+/*
+ * @offset: The offset into @t->ret_stack to find the ret_stack entry
+ * @frame_offset: Where to place the offset into @t->ret_stack of that entry
+ *
+ * Returns a pointer to the previous ret_stack below @offset or NULL
+ * when it reaches the bottom of the stack.
+ *
+ * Calling this with:
+ *
+ * offset = task->curr_ret_stack;
+ * do {
+ * ret_stack = get_ret_stack(task, offset, &offset);
+ * } while (ret_stack);
+ *
+ * Will iterate through all the ret_stack entries from curr_ret_stack
+ * down to the first one.
+ */
+static inline struct ftrace_ret_stack *
+get_ret_stack(struct task_struct *t, int offset, int *frame_offset)
+{
+ int offs;
+
+ BUILD_BUG_ON(FGRAPH_FRAME_SIZE % sizeof(long));
+
+ if (unlikely(offset <= 0))
+ return NULL;
+
+ offs = get_frame_offset(t, --offset);
+ if (WARN_ON_ONCE(offs <= 0 || offs > offset))
+ return NULL;
+
+ offset -= offs;
+
+ *frame_offset = offset;
+ return RET_STACK(t, offset);
+}
+
/* Both enabled by default (can be cleared by function_graph tracer flags) */
static bool fgraph_sleep_time = true;
@@ -51,6 +483,27 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
}
#endif
+int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
+{
+ return 0;
+}
+
+static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
+{
+}
+
+static struct fgraph_ops fgraph_stub = {
+ .entryfunc = ftrace_graph_entry_stub,
+ .retfunc = ftrace_graph_ret_stub,
+};
+
+static struct fgraph_ops *fgraph_direct_gops = &fgraph_stub;
+DEFINE_STATIC_CALL(fgraph_func, ftrace_graph_entry_stub);
+DEFINE_STATIC_CALL(fgraph_retfunc, ftrace_graph_ret_stub);
+static DEFINE_STATIC_KEY_TRUE(fgraph_do_direct);
+
/**
* ftrace_graph_stop - set to permanently disable function graph tracing
*
@@ -67,10 +520,13 @@ void ftrace_graph_stop(void)
/* Add a function return address to the trace stack on thread info.*/
static int
ftrace_push_return_trace(unsigned long ret, unsigned long func,
- unsigned long frame_pointer, unsigned long *retp)
+ unsigned long frame_pointer, unsigned long *retp,
+ int fgraph_idx)
{
+ struct ftrace_ret_stack *ret_stack;
unsigned long long calltime;
- int index;
+ unsigned long val;
+ int offset;
if (unlikely(ftrace_graph_is_dead()))
return -EBUSY;
@@ -78,32 +534,67 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
if (!current->ret_stack)
return -EBUSY;
+ BUILD_BUG_ON(SHADOW_STACK_SIZE % sizeof(long));
+
+ /* Set val to "reserved" with the delta to the new fgraph frame */
+ val = (FGRAPH_TYPE_RESERVED << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET;
+
/*
* We must make sure the ret_stack is tested before we read
* anything else.
*/
smp_rmb();
- /* The return trace stack is full */
- if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+ /*
+ * Check if there's room on the shadow stack to fit a fgraph frame
+ * and a bitmap word.
+ */
+ if (current->curr_ret_stack + FGRAPH_FRAME_OFFSET + 1 >= SHADOW_STACK_MAX_OFFSET) {
atomic_inc(&current->trace_overrun);
return -EBUSY;
}
calltime = trace_clock_local();
- index = ++current->curr_ret_stack;
+ offset = READ_ONCE(current->curr_ret_stack);
+ ret_stack = RET_STACK(current, offset);
+ offset += FGRAPH_FRAME_OFFSET;
+
+ /* ret offset = FGRAPH_FRAME_OFFSET ; type = reserved */
+ current->ret_stack[offset] = val;
+ ret_stack->ret = ret;
+ /*
+ * The unwinders expect curr_ret_stack to point to either zero
+ * or an offset where to find the next ret_stack. Even though the
+ * ret stack might be bogus, we want to write the ret and the
+ * offset to find the ret_stack before we increment the stack pointer.
+ * If an interrupt comes in now before we increment the curr_ret_stack
+ * it may blow away what we wrote. But that's fine, because the
+ * offset will still be correct (even though the 'ret' won't be).
+ * What we worry about is the offset being correct after we increment
+ * the curr_ret_stack and before we update that offset, as if an
+ * interrupt comes in and does an unwind stack dump, it will need
+ * at least a correct offset!
+ */
barrier();
- current->ret_stack[index].ret = ret;
- current->ret_stack[index].func = func;
- current->ret_stack[index].calltime = calltime;
+ WRITE_ONCE(current->curr_ret_stack, offset + 1);
+ /*
+ * This next barrier is to ensure that an interrupt coming in
+ * will not corrupt what we are about to write.
+ */
+ barrier();
+
+ /* Still keep it reserved even if an interrupt came in */
+ current->ret_stack[offset] = val;
+
+ ret_stack->ret = ret;
+ ret_stack->func = func;
+ ret_stack->calltime = calltime;
#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
- current->ret_stack[index].fp = frame_pointer;
+ ret_stack->fp = frame_pointer;
#endif
-#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
- current->ret_stack[index].retp = retp;
-#endif
- return 0;
+ ret_stack->retp = retp;
+ return offset;
}
/*
@@ -120,44 +611,85 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
# define MCOUNT_INSN_SIZE 0
#endif
+/* If the caller does not use ftrace, call this function. */
int function_graph_enter(unsigned long ret, unsigned long func,
unsigned long frame_pointer, unsigned long *retp)
{
struct ftrace_graph_ent trace;
+ unsigned long bitmap = 0;
+ int offset;
+ int i;
trace.func = func;
trace.depth = ++current->curr_ret_depth;
- if (ftrace_push_return_trace(ret, func, frame_pointer, retp))
+ offset = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0);
+ if (offset < 0)
goto out;
- /* Only trace if the calling function expects to */
- if (!ftrace_graph_entry(&trace))
+#ifdef CONFIG_HAVE_STATIC_CALL
+ if (static_branch_likely(&fgraph_do_direct)) {
+ int save_curr_ret_stack = current->curr_ret_stack;
+
+ if (static_call(fgraph_func)(&trace, fgraph_direct_gops))
+ bitmap |= BIT(fgraph_direct_gops->idx);
+ else
+ /* Clear out any saved storage */
+ current->curr_ret_stack = save_curr_ret_stack;
+ } else
+#endif
+ {
+ for_each_set_bit(i, &fgraph_array_bitmask,
+ sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
+ struct fgraph_ops *gops = READ_ONCE(fgraph_array[i]);
+ int save_curr_ret_stack;
+
+ if (gops == &fgraph_stub)
+ continue;
+
+ save_curr_ret_stack = current->curr_ret_stack;
+ if (ftrace_ops_test(&gops->ops, func, NULL) &&
+ gops->entryfunc(&trace, gops))
+ bitmap |= BIT(i);
+ else
+ /* Clear out any saved storage */
+ current->curr_ret_stack = save_curr_ret_stack;
+ }
+ }
+
+ if (!bitmap)
goto out_ret;
+ /*
+ * Since this function uses fgraph_idx = 0 as a tail-call checking
+ * flag, set that bit always.
+ */
+ set_bitmap(current, offset, bitmap | BIT(0));
+
return 0;
out_ret:
- current->curr_ret_stack--;
+ current->curr_ret_stack -= FGRAPH_FRAME_OFFSET + 1;
out:
current->curr_ret_depth--;
return -EBUSY;
}
/* Retrieve a function return address to the trace stack on thread info.*/
-static void
+static struct ftrace_ret_stack *
ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
- unsigned long frame_pointer)
+ unsigned long frame_pointer, int *offset)
{
- int index;
+ struct ftrace_ret_stack *ret_stack;
- index = current->curr_ret_stack;
+ ret_stack = get_ret_stack(current, current->curr_ret_stack, offset);
- if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
+ if (unlikely(!ret_stack)) {
ftrace_graph_stop();
- WARN_ON(1);
+ WARN(1, "Bad function graph ret_stack pointer: %d",
+ current->curr_ret_stack);
/* Might as well panic, otherwise we have no where to go */
*ret = (unsigned long)panic;
- return;
+ return NULL;
}
#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
@@ -175,30 +707,33 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
* Note, -mfentry does not use frame pointers, and this test
* is not needed if CC_USING_FENTRY is set.
*/
- if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
+ if (unlikely(ret_stack->fp != frame_pointer)) {
ftrace_graph_stop();
WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
" from func %ps return to %lx\n",
- current->ret_stack[index].fp,
+ ret_stack->fp,
frame_pointer,
- (void *)current->ret_stack[index].func,
- current->ret_stack[index].ret);
+ (void *)ret_stack->func,
+ ret_stack->ret);
*ret = (unsigned long)panic;
- return;
+ return NULL;
}
#endif
- *ret = current->ret_stack[index].ret;
- trace->func = current->ret_stack[index].func;
- trace->calltime = current->ret_stack[index].calltime;
+ *offset += FGRAPH_FRAME_OFFSET;
+ *ret = ret_stack->ret;
+ trace->func = ret_stack->func;
+ trace->calltime = ret_stack->calltime;
trace->overrun = atomic_read(&current->trace_overrun);
- trace->depth = current->curr_ret_depth--;
+ trace->depth = current->curr_ret_depth;
/*
* We still want to trace interrupts coming in if
* max_depth is set to 1. Make sure the decrement is
* seen before ftrace_graph_return.
*/
barrier();
+
+ return ret_stack;
}
/*
@@ -236,30 +771,55 @@ struct fgraph_ret_regs;
static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs,
unsigned long frame_pointer)
{
+ struct ftrace_ret_stack *ret_stack;
struct ftrace_graph_ret trace;
+ unsigned long bitmap;
unsigned long ret;
+ int offset;
+ int i;
+
+ ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &offset);
- ftrace_pop_return_trace(&trace, &ret, frame_pointer);
+ if (unlikely(!ret_stack)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic. What else to do? */
+ return (unsigned long)panic;
+ }
+
+ trace.rettime = trace_clock_local();
#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
trace.retval = fgraph_ret_regs_return_value(ret_regs);
#endif
- trace.rettime = trace_clock_local();
- ftrace_graph_return(&trace);
+
+ bitmap = get_bitmap_bits(current, offset);
+
+#ifdef CONFIG_HAVE_STATIC_CALL
+ if (static_branch_likely(&fgraph_do_direct)) {
+ if (test_bit(fgraph_direct_gops->idx, &bitmap))
+ static_call(fgraph_retfunc)(&trace, fgraph_direct_gops);
+ } else
+#endif
+ {
+ for_each_set_bit(i, &bitmap, sizeof(bitmap) * BITS_PER_BYTE) {
+ struct fgraph_ops *gops = fgraph_array[i];
+
+ if (gops == &fgraph_stub)
+ continue;
+
+ gops->retfunc(&trace, gops);
+ }
+ }
+
/*
* The ftrace_graph_return() may still access the current
* ret_stack structure, we need to make sure the update of
* curr_ret_stack is after that.
*/
barrier();
- current->curr_ret_stack--;
-
- if (unlikely(!ret)) {
- ftrace_graph_stop();
- WARN_ON(1);
- /* Might as well panic. What else to do? */
- ret = (unsigned long)panic;
- }
+ current->curr_ret_stack = offset - FGRAPH_FRAME_OFFSET;
+ current->curr_ret_depth--;
return ret;
}
@@ -282,7 +842,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
/**
* ftrace_graph_get_ret_stack - return the entry of the shadow stack
- * @task: The task to read the shadow stack from
+ * @task: The task to read the shadow stack from.
* @idx: Index down the shadow stack
*
* Return the ret_struct on the shadow stack of the @task at the
@@ -294,104 +854,116 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
struct ftrace_ret_stack *
ftrace_graph_get_ret_stack(struct task_struct *task, int idx)
{
- idx = task->curr_ret_stack - idx;
+ struct ftrace_ret_stack *ret_stack = NULL;
+ int offset = task->curr_ret_stack;
- if (idx >= 0 && idx <= task->curr_ret_stack)
- return &task->ret_stack[idx];
+ if (offset < 0)
+ return NULL;
- return NULL;
+ do {
+ ret_stack = get_ret_stack(task, offset, &offset);
+ } while (ret_stack && --idx >= 0);
+
+ return ret_stack;
}
/**
- * ftrace_graph_ret_addr - convert a potentially modified stack return address
- * to its original value
+ * ftrace_graph_ret_addr - return the original value of the return address
+ * @task: The task the unwinder is being executed on
+ * @idx: An initialized pointer to the next stack index to use
+ * @ret: The current return address (likely pointing to return_handler)
+ * @retp: The address on the stack of the current return location
*
* This function can be called by stack unwinding code to convert a found stack
- * return address ('ret') to its original value, in case the function graph
+ * return address (@ret) to its original value, in case the function graph
* tracer has modified it to be 'return_to_handler'. If the address hasn't
- * been modified, the unchanged value of 'ret' is returned.
+ * been modified, the unchanged value of @ret is returned.
*
- * 'idx' is a state variable which should be initialized by the caller to zero
- * before the first call.
+ * @idx holds the last index used to know where to start from. It should be
+ * initialized to zero for the first iteration as that will mean to start
+ * at the top of the shadow stack. If the location is found, this pointer
+ * will be assigned that location so that if called again, it will continue
+ * where it left off.
*
- * 'retp' is a pointer to the return address on the stack. It's ignored if
- * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined.
+ * @retp is a pointer to the return address on the stack.
*/
-#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
unsigned long ret, unsigned long *retp)
{
- int index = task->curr_ret_stack;
- int i;
+ struct ftrace_ret_stack *ret_stack;
+ unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler);
+ int i = task->curr_ret_stack;
- if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
+ if (ret != return_handler)
return ret;
- if (index < 0)
+ if (!idx)
return ret;
- for (i = 0; i <= index; i++)
- if (task->ret_stack[i].retp == retp)
- return task->ret_stack[i].ret;
+ i = *idx ? : task->curr_ret_stack;
+ while (i > 0) {
+ ret_stack = get_ret_stack(task, i, &i);
+ if (!ret_stack)
+ break;
+ /*
+ * For the tail-call, there would be 2 or more ftrace_ret_stacks on
+ * the ret_stack, which records "return_to_handler" as the return
+ * address except for the last one.
+ * But on the real stack, there should be 1 entry because tail-call
+ * reuses the return address on the stack and jump to the next function.
+ * Thus we will continue to find real return address.
+ */
+ if (ret_stack->retp == retp &&
+ ret_stack->ret != return_handler) {
+ *idx = i;
+ return ret_stack->ret;
+ }
+ }
return ret;
}
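For context, and only as a sketch under my own assumptions (dump_frames() and the raw word-by-word stack walk are hypothetical, not from this patch), an unwinder is expected to carry one graph_idx state variable across the walk and feed each candidate return address through ftrace_graph_ret_addr():

static void dump_frames(struct task_struct *task, unsigned long *stack, int nr)
{
        int graph_idx = 0;      /* state for ftrace_graph_ret_addr(), starts at 0 */
        int i;

        for (i = 0; i < nr; i++) {
                unsigned long addr = stack[i];

                if (!__kernel_text_address(addr))
                        continue;

                /* Map return_to_handler back to the real return address */
                addr = ftrace_graph_ret_addr(task, &graph_idx, addr, &stack[i]);
                pr_info("  %pS\n", (void *)addr);
        }
}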
-#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
-unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
- unsigned long ret, unsigned long *retp)
-{
- int task_idx;
-
- if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
- return ret;
-
- task_idx = task->curr_ret_stack;
-
- if (!task->ret_stack || task_idx < *idx)
- return ret;
-
- task_idx -= *idx;
- (*idx)++;
-
- return task->ret_stack[task_idx].ret;
-}
-#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
static struct ftrace_ops graph_ops = {
.func = ftrace_graph_func,
- .flags = FTRACE_OPS_FL_INITIALIZED |
- FTRACE_OPS_FL_PID |
- FTRACE_OPS_GRAPH_STUB,
+ .flags = FTRACE_OPS_GRAPH_STUB,
#ifdef FTRACE_GRAPH_TRAMP_ADDR
.trampoline = FTRACE_GRAPH_TRAMP_ADDR,
/* trampoline_size is only needed for dynamically allocated tramps */
#endif
- ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
};
-void ftrace_graph_sleep_time_control(bool enable)
+void fgraph_init_ops(struct ftrace_ops *dst_ops,
+ struct ftrace_ops *src_ops)
{
- fgraph_sleep_time = enable;
+ dst_ops->flags = FTRACE_OPS_FL_PID | FTRACE_OPS_GRAPH_STUB;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (src_ops) {
+ dst_ops->func_hash = &src_ops->local_hash;
+ mutex_init(&dst_ops->local_hash.regex_lock);
+ INIT_LIST_HEAD(&dst_ops->subop_list);
+ dst_ops->flags |= FTRACE_OPS_FL_INITIALIZED;
+ }
+#endif
}
-int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
+void ftrace_graph_sleep_time_control(bool enable)
{
- return 0;
+ fgraph_sleep_time = enable;
}
/*
* Simply points to ftrace_stub, but with the proper protocol.
* Defined by the linker script in linux/vmlinux.lds.h
*/
-extern void ftrace_stub_graph(struct ftrace_graph_ret *);
+void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops);
/* The callbacks that hook a function */
trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph;
trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
-static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
-static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
+static int alloc_retstack_tasklist(unsigned long **ret_stack_list)
{
int i;
int ret = 0;
@@ -399,10 +971,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
struct task_struct *g, *t;
for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
- ret_stack_list[i] =
- kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ ret_stack_list[i] = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
if (!ret_stack_list[i]) {
start = 0;
end = i;
@@ -420,9 +989,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
if (t->ret_stack == NULL) {
atomic_set(&t->trace_overrun, 0);
- t->curr_ret_stack = -1;
+ ret_stack_init_task_vars(ret_stack_list[start]);
+ t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
- /* Make sure the tasks see the -1 first: */
+ /* Make sure the tasks see the 0 first: */
smp_wmb();
t->ret_stack = ret_stack_list[start++];
}
@@ -442,8 +1012,9 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
struct task_struct *next,
unsigned int prev_state)
{
+ struct ftrace_ret_stack *ret_stack;
unsigned long long timestamp;
- int index;
+ int offset;
/*
* Does the user want to count the time a function was asleep.
@@ -466,57 +1037,23 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
*/
timestamp -= next->ftrace_timestamp;
- for (index = next->curr_ret_stack; index >= 0; index--)
- next->ret_stack[index].calltime += timestamp;
-}
-
-static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
-{
- if (!ftrace_ops_test(&global_ops, trace->func, NULL))
- return 0;
- return __ftrace_graph_entry(trace);
-}
-
-/*
- * The function graph tracer should only trace the functions defined
- * by set_ftrace_filter and set_ftrace_notrace. If another function
- * tracer ops is registered, the graph tracer requires testing the
- * function against the global ops, and not just trace any function
- * that any ftrace_ops registered.
- */
-void update_function_graph_func(void)
-{
- struct ftrace_ops *op;
- bool do_test = false;
-
- /*
- * The graph and global ops share the same set of functions
- * to test. If any other ops is on the list, then
- * the graph tracing needs to test if its the function
- * it should call.
- */
- do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (op != &global_ops && op != &graph_ops &&
- op != &ftrace_list_end) {
- do_test = true;
- /* in double loop, break out with goto */
- goto out;
- }
- } while_for_each_ftrace_op(op);
- out:
- if (do_test)
- ftrace_graph_entry = ftrace_graph_entry_test;
- else
- ftrace_graph_entry = __ftrace_graph_entry;
+ for (offset = next->curr_ret_stack; offset > 0; ) {
+ ret_stack = get_ret_stack(next, offset, &offset);
+ if (ret_stack)
+ ret_stack->calltime += timestamp;
+ }
}
-static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
+static DEFINE_PER_CPU(unsigned long *, idle_ret_stack);
static void
-graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
+graph_init_task(struct task_struct *t, unsigned long *ret_stack)
{
atomic_set(&t->trace_overrun, 0);
+ ret_stack_init_task_vars(ret_stack);
t->ftrace_timestamp = 0;
+ t->curr_ret_stack = 0;
+ t->curr_ret_depth = -1;
/* make curr_ret_stack visible before we add the ret_stack */
smp_wmb();
t->ret_stack = ret_stack;
@@ -528,7 +1065,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
*/
void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
{
- t->curr_ret_stack = -1;
+ t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
/*
* The idle task has no parent, it either has its own
@@ -538,14 +1075,11 @@ void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
if (ftrace_graph_active) {
- struct ftrace_ret_stack *ret_stack;
+ unsigned long *ret_stack;
ret_stack = per_cpu(idle_ret_stack, cpu);
if (!ret_stack) {
- ret_stack =
- kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ ret_stack = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
if (!ret_stack)
return;
per_cpu(idle_ret_stack, cpu) = ret_stack;
@@ -559,15 +1093,13 @@ void ftrace_graph_init_task(struct task_struct *t)
{
/* Make sure we do not use the parent ret_stack */
t->ret_stack = NULL;
- t->curr_ret_stack = -1;
+ t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
if (ftrace_graph_active) {
- struct ftrace_ret_stack *ret_stack;
+ unsigned long *ret_stack;
- ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ ret_stack = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
if (!ret_stack)
return;
graph_init_task(t, ret_stack);
@@ -576,7 +1108,7 @@ void ftrace_graph_init_task(struct task_struct *t)
void ftrace_graph_exit_task(struct task_struct *t)
{
- struct ftrace_ret_stack *ret_stack = t->ret_stack;
+ unsigned long *ret_stack = t->ret_stack;
t->ret_stack = NULL;
/* NULL must become visible to IRQs before we free it: */
@@ -585,25 +1117,57 @@ void ftrace_graph_exit_task(struct task_struct *t)
kfree(ret_stack);
}
+#ifdef CONFIG_DYNAMIC_FTRACE
+static int fgraph_pid_func(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
+{
+ struct trace_array *tr = gops->ops.private;
+ int pid;
+
+ if (tr) {
+ pid = this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid);
+ if (pid == FTRACE_PID_IGNORE)
+ return 0;
+ if (pid != FTRACE_PID_TRACE &&
+ pid != current->pid)
+ return 0;
+ }
+
+ return gops->saved_func(trace, gops);
+}
+
+void fgraph_update_pid_func(void)
+{
+ struct fgraph_ops *gops;
+ struct ftrace_ops *op;
+
+ if (!(graph_ops.flags & FTRACE_OPS_FL_INITIALIZED))
+ return;
+
+ list_for_each_entry(op, &graph_ops.subop_list, list) {
+ if (op->flags & FTRACE_OPS_FL_PID) {
+ gops = container_of(op, struct fgraph_ops, ops);
+ gops->entryfunc = ftrace_pids_enabled(op) ?
+ fgraph_pid_func : gops->saved_func;
+ if (ftrace_graph_active == 1)
+ static_call_update(fgraph_func, gops->entryfunc);
+ }
+ }
+}
+#endif
+
/* Allocate a return stack for each task */
static int start_graph_tracing(void)
{
- struct ftrace_ret_stack **ret_stack_list;
- int ret, cpu;
+ unsigned long **ret_stack_list;
+ int ret;
- ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE,
- sizeof(struct ftrace_ret_stack *),
- GFP_KERNEL);
+ ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE,
+ sizeof(*ret_stack_list), GFP_KERNEL);
if (!ret_stack_list)
return -ENOMEM;
- /* The cpu_boot init_task->ret_stack will never be freed */
- for_each_online_cpu(cpu) {
- if (!idle_task(cpu)->ret_stack)
- ftrace_graph_init_idle_task(idle_task(cpu), cpu);
- }
-
do {
ret = alloc_retstack_tasklist(ret_stack_list);
} while (ret == -EAGAIN);
@@ -619,60 +1183,174 @@ static int start_graph_tracing(void)
return ret;
}
+static void init_task_vars(int idx)
+{
+ struct task_struct *g, *t;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (idle_task(cpu)->ret_stack)
+ ret_stack_set_task_var(idle_task(cpu), idx, 0);
+ }
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, t) {
+ if (t->ret_stack)
+ ret_stack_set_task_var(t, idx, 0);
+ }
+ read_unlock(&tasklist_lock);
+}
+
+static void ftrace_graph_enable_direct(bool enable_branch, struct fgraph_ops *gops)
+{
+ trace_func_graph_ent_t func = NULL;
+ trace_func_graph_ret_t retfunc = NULL;
+ int i;
+
+ if (gops) {
+ func = gops->entryfunc;
+ retfunc = gops->retfunc;
+ fgraph_direct_gops = gops;
+ } else {
+ for_each_set_bit(i, &fgraph_array_bitmask,
+ sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
+ func = fgraph_array[i]->entryfunc;
+ retfunc = fgraph_array[i]->retfunc;
+ fgraph_direct_gops = fgraph_array[i];
+ }
+ }
+ if (WARN_ON_ONCE(!func))
+ return;
+
+ static_call_update(fgraph_func, func);
+ static_call_update(fgraph_retfunc, retfunc);
+ if (enable_branch)
+ static_branch_disable(&fgraph_do_direct);
+}
+
+static void ftrace_graph_disable_direct(bool disable_branch)
+{
+ if (disable_branch)
+ static_branch_disable(&fgraph_do_direct);
+ static_call_update(fgraph_func, ftrace_graph_entry_stub);
+ static_call_update(fgraph_retfunc, ftrace_graph_ret_stub);
+ fgraph_direct_gops = &fgraph_stub;
+}
+
+/* The cpu_boot init_task->ret_stack will never be freed */
+static int fgraph_cpu_init(unsigned int cpu)
+{
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+ return 0;
+}
+
int register_ftrace_graph(struct fgraph_ops *gops)
{
+ static bool fgraph_initialized;
+ int command = 0;
int ret = 0;
+ int i = -1;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
- /* we currently allow only one tracer registered at a time */
- if (ftrace_graph_active) {
- ret = -EBUSY;
- goto out;
+ if (!fgraph_initialized) {
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph:online",
+ fgraph_cpu_init, NULL);
+ if (ret < 0) {
+ pr_warn("fgraph: Error to init cpu hotplug support\n");
+ return ret;
+ }
+ fgraph_initialized = true;
+ ret = 0;
}
- register_pm_notifier(&ftrace_suspend_notifier);
+ if (!fgraph_array[0]) {
+ /* The array must always have real data on it */
+ for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
+ fgraph_array[i] = &fgraph_stub;
+ fgraph_lru_init();
+ }
+
+ i = fgraph_lru_alloc_index();
+ if (i < 0 || WARN_ON_ONCE(fgraph_array[i] != &fgraph_stub))
+ return -ENOSPC;
+ gops->idx = i;
ftrace_graph_active++;
- ret = start_graph_tracing();
- if (ret) {
- ftrace_graph_active--;
- goto out;
- }
- ftrace_graph_return = gops->retfunc;
+ if (ftrace_graph_active == 2)
+ ftrace_graph_disable_direct(true);
- /*
- * Update the indirect function to the entryfunc, and the
- * function that gets called to the entry_test first. Then
- * call the update fgraph entry function to determine if
- * the entryfunc should be called directly or not.
- */
- __ftrace_graph_entry = gops->entryfunc;
- ftrace_graph_entry = ftrace_graph_entry_test;
- update_function_graph_func();
+ if (ftrace_graph_active == 1) {
+ ftrace_graph_enable_direct(false, gops);
+ register_pm_notifier(&ftrace_suspend_notifier);
+ ret = start_graph_tracing();
+ if (ret)
+ goto error;
+ /*
+ * Some archs just test to see if these are not
+ * the default function
+ */
+ ftrace_graph_return = return_run;
+ ftrace_graph_entry = entry_run;
+ command = FTRACE_START_FUNC_RET;
+ } else {
+ init_task_vars(gops->idx);
+ }
+ /* Always save the function, and reset at unregistering */
+ gops->saved_func = gops->entryfunc;
- ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
-out:
- mutex_unlock(&ftrace_lock);
+ ret = ftrace_startup_subops(&graph_ops, &gops->ops, command);
+ if (!ret)
+ fgraph_array[i] = gops;
+
+error:
+ if (ret) {
+ ftrace_graph_active--;
+ gops->saved_func = NULL;
+ fgraph_lru_release_index(i);
+ }
return ret;
}
void unregister_ftrace_graph(struct fgraph_ops *gops)
{
+ int command = 0;
+
mutex_lock(&ftrace_lock);
if (unlikely(!ftrace_graph_active))
goto out;
+ if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE ||
+ fgraph_array[gops->idx] != gops))
+ goto out;
+
+ if (fgraph_lru_release_index(gops->idx) < 0)
+ goto out;
+
+ fgraph_array[gops->idx] = &fgraph_stub;
+
ftrace_graph_active--;
- ftrace_graph_return = ftrace_stub_graph;
- ftrace_graph_entry = ftrace_graph_entry_stub;
- __ftrace_graph_entry = ftrace_graph_entry_stub;
- ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
- unregister_pm_notifier(&ftrace_suspend_notifier);
- unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+ if (!ftrace_graph_active)
+ command = FTRACE_STOP_FUNC_RET;
+
+ ftrace_shutdown_subops(&graph_ops, &gops->ops, command);
+
+ if (ftrace_graph_active == 1)
+ ftrace_graph_enable_direct(true, NULL);
+ else if (!ftrace_graph_active)
+ ftrace_graph_disable_direct(false);
+
+ if (!ftrace_graph_active) {
+ ftrace_graph_return = ftrace_stub_graph;
+ ftrace_graph_entry = ftrace_graph_entry_stub;
+ unregister_pm_notifier(&ftrace_suspend_notifier);
+ unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+ }
out:
+ gops->saved_func = NULL;
mutex_unlock(&ftrace_lock);
}
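Tying the registration API together, here is a minimal sketch (my assumption of typical usage, not code from this patch) of a module registering one fgraph_ops user, reusing the hypothetical my_entry/my_return callbacks sketched earlier:

#include <linux/module.h>
#include <linux/ftrace.h>

static struct fgraph_ops my_gops = {
        .entryfunc      = my_entry,
        .retfunc        = my_return,
};

static int __init my_fgraph_init(void)
{
        /* On success, my_gops.idx holds the slot taken in fgraph_array[] */
        return register_ftrace_graph(&my_gops);
}

static void __exit my_fgraph_exit(void)
{
        unregister_ftrace_graph(&my_gops);
}

module_init(my_fgraph_init);
module_exit(my_fgraph_exit);
MODULE_LICENSE("GPL");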
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eacab4020508..4c28dd177ca6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -74,7 +74,8 @@
#ifdef CONFIG_DYNAMIC_FTRACE
#define INIT_OPS_HASH(opsname) \
.func_hash = &opsname.local_hash, \
- .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
+ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), \
+ .subop_list = LIST_HEAD_INIT(opsname.subop_list),
#else
#define INIT_OPS_HASH(opsname)
#endif
@@ -99,7 +100,7 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
/* What to set function_trace_op to */
static struct ftrace_ops *set_function_trace_op;
-static bool ftrace_pids_enabled(struct ftrace_ops *ops)
+bool ftrace_pids_enabled(struct ftrace_ops *ops)
{
struct trace_array *tr;
@@ -121,7 +122,7 @@ static int ftrace_disabled __read_mostly;
DEFINE_MUTEX(ftrace_lock);
-struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
+struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = (struct ftrace_ops __rcu *)&ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
struct ftrace_ops global_ops;
@@ -161,12 +162,14 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
#ifdef CONFIG_DYNAMIC_FTRACE
if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
mutex_init(&ops->local_hash.regex_lock);
+ INIT_LIST_HEAD(&ops->subop_list);
ops->func_hash = &ops->local_hash;
ops->flags |= FTRACE_OPS_FL_INITIALIZED;
}
#endif
}
+/* Call this function for when a callback filters on set_ftrace_pid */
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
@@ -235,8 +238,6 @@ static void update_ftrace_function(void)
func = ftrace_ops_list_func;
}
- update_function_graph_func();
-
/* If there's no change, then do nothing more here */
if (ftrace_trace_function == func)
return;
@@ -310,7 +311,7 @@ static int remove_ftrace_ops(struct ftrace_ops __rcu **list,
lockdep_is_held(&ftrace_lock)) == ops &&
rcu_dereference_protected(ops->next,
lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
- *list = &ftrace_list_end;
+ rcu_assign_pointer(*list, &ftrace_list_end);
return 0;
}
@@ -406,6 +407,8 @@ static void ftrace_update_pid_func(void)
}
} while_for_each_ftrace_op(op);
+ fgraph_update_pid_func();
+
update_ftrace_function();
}
@@ -817,7 +820,8 @@ void ftrace_graph_graph_time_control(bool enable)
fgraph_graph_time = enable;
}
-static int profile_graph_entry(struct ftrace_graph_ent *trace)
+static int profile_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
{
struct ftrace_ret_stack *ret_stack;
@@ -834,7 +838,8 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
return 1;
}
-static void profile_graph_return(struct ftrace_graph_ret *trace)
+static void profile_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
{
struct ftrace_ret_stack *ret_stack;
struct ftrace_profile_stat *stat;
@@ -1314,7 +1319,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
return hash;
}
-
+/* Used to save filters on functions for modules not loaded yet */
static int ftrace_add_mod(struct trace_array *tr,
const char *func, const char *module,
int enable)
@@ -1380,15 +1385,17 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
return NULL;
}
-static void
-ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
-static void
-ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
+static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops);
+static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops);
static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
struct ftrace_hash *new_hash);
-static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size)
+/*
+ * Allocate a new hash and move the entries from @src over to it.
+ * On success, the @src hash will be empty and should be freed.
+ */
+static struct ftrace_hash *__move_hash(struct ftrace_hash *src, int size)
{
struct ftrace_func_entry *entry;
struct ftrace_hash *new_hash;
@@ -1424,6 +1431,7 @@ static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size)
return new_hash;
}
+/* Move the @src entries to a newly allocated hash */
static struct ftrace_hash *
__ftrace_hash_move(struct ftrace_hash *src)
{
@@ -1435,9 +1443,29 @@ __ftrace_hash_move(struct ftrace_hash *src)
if (ftrace_hash_empty(src))
return EMPTY_HASH;
- return dup_hash(src, size);
+ return __move_hash(src, size);
}
+/**
+ * ftrace_hash_move - move a new hash to a filter and do updates
+ * @ops: The ops with the hash that @dst points to
+ * @enable: True if for the filter hash, false for the notrace hash
+ * @dst: Points to the @ops hash that should be updated
+ * @src: The hash to update @dst with
+ *
+ * This is called when an ftrace_ops hash is being updated and the
+ * kernel needs to reflect this. Note, this only updates the kernel
+ * function callbacks if the @ops is enabled (not to be confused with
+ * @enable above). If the @ops is enabled, its hash determines what
+ * callbacks get called. This function gets called when the @ops hash
+ * is updated and it requires new callbacks.
+ *
+ * On success the elements of @src are moved to @dst, and @dst is updated
+ * properly, as well as the functions determined by the @ops hashes
+ * are now calling the @ops callback function.
+ *
+ * Regardless of the return value, @src should be freed with free_ftrace_hash().
+ */
static int
ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash **dst, struct ftrace_hash *src)
@@ -1467,11 +1495,11 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
* Remove the current set, update the hash and add
* them back.
*/
- ftrace_hash_rec_disable_modify(ops, enable);
+ ftrace_hash_rec_disable_modify(ops);
rcu_assign_pointer(*dst, new_hash);
- ftrace_hash_rec_enable_modify(ops, enable);
+ ftrace_hash_rec_enable_modify(ops);
return 0;
}
@@ -1694,12 +1722,21 @@ static bool skip_record(struct dyn_ftrace *rec)
!(rec->flags & FTRACE_FL_ENABLED);
}
+/*
+ * This is the main engine of the ftrace updates to the dyn_ftrace records.
+ *
+ * It will iterate through all the available ftrace functions
+ * (the ones that ftrace can have callbacks to) and set the flags
+ * in the associated dyn_ftrace records.
+ *
+ * @inc: If true, the functions associated with @ops are added to
+ * the dyn_ftrace records, otherwise they are removed.
+ */
static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
- int filter_hash,
bool inc)
{
struct ftrace_hash *hash;
- struct ftrace_hash *other_hash;
+ struct ftrace_hash *notrace_hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
bool update = false;
@@ -1711,35 +1748,16 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
return false;
/*
- * In the filter_hash case:
* If the count is zero, we update all records.
* Otherwise we just update the items in the hash.
- *
- * In the notrace_hash case:
- * We enable the update in the hash.
- * As disabling notrace means enabling the tracing,
- * and enabling notrace means disabling, the inc variable
- * gets inversed.
*/
- if (filter_hash) {
- hash = ops->func_hash->filter_hash;
- other_hash = ops->func_hash->notrace_hash;
- if (ftrace_hash_empty(hash))
- all = true;
- } else {
- inc = !inc;
- hash = ops->func_hash->notrace_hash;
- other_hash = ops->func_hash->filter_hash;
- /*
- * If the notrace hash has no items,
- * then there's nothing to do.
- */
- if (ftrace_hash_empty(hash))
- return false;
- }
+ hash = ops->func_hash->filter_hash;
+ notrace_hash = ops->func_hash->notrace_hash;
+ if (ftrace_hash_empty(hash))
+ all = true;
do_for_each_ftrace_rec(pg, rec) {
- int in_other_hash = 0;
+ int in_notrace_hash = 0;
int in_hash = 0;
int match = 0;
@@ -1751,26 +1769,17 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
* Only the filter_hash affects all records.
* Update if the record is not in the notrace hash.
*/
- if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
+ if (!notrace_hash || !ftrace_lookup_ip(notrace_hash, rec->ip))
match = 1;
} else {
in_hash = !!ftrace_lookup_ip(hash, rec->ip);
- in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
+ in_notrace_hash = !!ftrace_lookup_ip(notrace_hash, rec->ip);
/*
- * If filter_hash is set, we want to match all functions
- * that are in the hash but not in the other hash.
- *
- * If filter_hash is not set, then we are decrementing.
- * That means we match anything that is in the hash
- * and also in the other_hash. That is, we need to turn
- * off functions in the other hash because they are disabled
- * by this hash.
+ * We want to match all functions that are in the hash but
+ * not in the notrace hash.
*/
- if (filter_hash && in_hash && !in_other_hash)
- match = 1;
- else if (!filter_hash && in_hash &&
- (in_other_hash || ftrace_hash_empty(other_hash)))
+ if (in_hash && !in_notrace_hash)
match = 1;
}
if (!match)
@@ -1876,24 +1885,48 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
return update;
}
-static bool ftrace_hash_rec_disable(struct ftrace_ops *ops,
- int filter_hash)
+/*
+ * This is called when an ops is removed from tracing. It will decrement
+ * the counters of the dyn_ftrace records for all the functions that
+ * @ops is attached to.
+ */
+static bool ftrace_hash_rec_disable(struct ftrace_ops *ops)
{
- return __ftrace_hash_rec_update(ops, filter_hash, 0);
+ return __ftrace_hash_rec_update(ops, false);
}
-static bool ftrace_hash_rec_enable(struct ftrace_ops *ops,
- int filter_hash)
+/*
+ * This is called when an ops is added to tracing. It will increment
+ * the counters of the dyn_ftrace records for all the functions that
+ * @ops is attached to.
+ */
+static bool ftrace_hash_rec_enable(struct ftrace_ops *ops)
{
- return __ftrace_hash_rec_update(ops, filter_hash, 1);
+ return __ftrace_hash_rec_update(ops, true);
}
-static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
- int filter_hash, int inc)
+/*
+ * This function will update what functions @ops traces when its filter
+ * changes.
+ *
+ * The @inc states if the @ops callbacks are going to be added or removed.
+ * When one of the @ops hashes is updated to a "new_hash", the dyn_ftrace
+ * records are updated via:
+ *
+ * ftrace_hash_rec_disable_modify(ops);
+ * ops->hash = new_hash
+ * ftrace_hash_rec_enable_modify(ops);
+ *
+ * Where the @ops is removed from all the records it is tracing using
+ * its old hash. The @ops hash is updated to the new hash, and then
+ * the @ops is added back to the records so that it is tracing all
+ * the new functions.
+ */
+static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, bool inc)
{
struct ftrace_ops *op;
- __ftrace_hash_rec_update(ops, filter_hash, inc);
+ __ftrace_hash_rec_update(ops, inc);
if (ops->func_hash != &global_ops.local_hash)
return;
@@ -1907,20 +1940,18 @@ static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
if (op == ops)
continue;
if (op->func_hash == &global_ops.local_hash)
- __ftrace_hash_rec_update(op, filter_hash, inc);
+ __ftrace_hash_rec_update(op, inc);
} while_for_each_ftrace_op(op);
}
-static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops,
- int filter_hash)
+static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops)
{
- ftrace_hash_rec_update_modify(ops, filter_hash, 0);
+ ftrace_hash_rec_update_modify(ops, false);
}
-static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
- int filter_hash)
+static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops)
{
- ftrace_hash_rec_update_modify(ops, filter_hash, 1);
+ ftrace_hash_rec_update_modify(ops, true);
}
/*
@@ -3043,7 +3074,7 @@ int ftrace_startup(struct ftrace_ops *ops, int command)
return ret;
}
- if (ftrace_hash_rec_enable(ops, 1))
+ if (ftrace_hash_rec_enable(ops))
command |= FTRACE_UPDATE_CALLS;
ftrace_startup_enable(command);
@@ -3085,7 +3116,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
/* Disabling ipmodify never fails */
ftrace_hash_ipmodify_disable(ops);
- if (ftrace_hash_rec_disable(ops, 1))
+ if (ftrace_hash_rec_disable(ops))
command |= FTRACE_UPDATE_CALLS;
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
@@ -3164,6 +3195,474 @@ out:
return 0;
}
+/* Simply make a copy of @src and return it */
+static struct ftrace_hash *copy_hash(struct ftrace_hash *src)
+{
+ if (ftrace_hash_empty(src))
+ return EMPTY_HASH;
+
+ return alloc_and_copy_ftrace_hash(src->size_bits, src);
+}
+
+/*
+ * Append @new_hash entries to @hash:
+ *
+ * If @hash is the EMPTY_HASH then it traces all functions and nothing
+ * needs to be done.
+ *
+ * If @new_hash is the EMPTY_HASH, then make *hash the EMPTY_HASH so
+ * that it traces everything.
+ *
+ * Otherwise, go through all of @new_hash and add anything that @hash
+ * doesn't already have, to @hash.
+ *
+ * The filter_hash updates use just the append_hash() function
+ * and the notrace_hash updates do not.
+ */
+static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash)
+{
+ struct ftrace_func_entry *entry;
+ int size;
+ int i;
+
+ /* An empty hash does everything */
+ if (ftrace_hash_empty(*hash))
+ return 0;
+
+ /* If new_hash has everything make hash have everything */
+ if (ftrace_hash_empty(new_hash)) {
+ free_ftrace_hash(*hash);
+ *hash = EMPTY_HASH;
+ return 0;
+ }
+
+ size = 1 << new_hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &new_hash->buckets[i], hlist) {
+ /* Only add if not already in hash */
+ if (!__ftrace_lookup_ip(*hash, entry->ip) &&
+ add_hash_entry(*hash, entry->ip) == NULL)
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
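As a rough sketch of the union semantics described above (hypothetical function and made-up ips, not part of the patch; error handling kept minimal):

/* Hypothetical sketch: union of two filter hashes via append_hash() */
static int example_append(unsigned long ipA, unsigned long ipB, unsigned long ipC)
{
	struct ftrace_hash *hash = alloc_ftrace_hash(4);	/* will hold { ipA, ipB } */
	struct ftrace_hash *new_hash = alloc_ftrace_hash(4);	/* holds { ipB, ipC } */
	int ret = -ENOMEM;

	if (!hash || !new_hash)
		goto out;

	if (!add_hash_entry(hash, ipA) || !add_hash_entry(hash, ipB) ||
	    !add_hash_entry(new_hash, ipB) || !add_hash_entry(new_hash, ipC))
		goto out;

	/* On success, hash contains { ipA, ipB, ipC }; ipB is not added twice */
	ret = append_hash(&hash, new_hash);
out:
	free_ftrace_hash(new_hash);
	free_ftrace_hash(hash);
	return ret;
}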
+
+/*
+ * Add to @hash only those that are in both @new_hash1 and @new_hash2
+ *
+ * The notrace_hash updates use just the intersect_hash() function
+ * and the filter_hash updates do not.
+ */
+static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash1,
+ struct ftrace_hash *new_hash2)
+{
+ struct ftrace_func_entry *entry;
+ int size;
+ int i;
+
+ /*
+ * If new_hash1 or new_hash2 is the EMPTY_HASH then make the hash
+ * empty as well, as an empty notrace hash means nothing is notraced.
+ */
+ if (ftrace_hash_empty(new_hash1) || ftrace_hash_empty(new_hash2)) {
+ free_ftrace_hash(*hash);
+ *hash = EMPTY_HASH;
+ return 0;
+ }
+
+ size = 1 << new_hash1->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &new_hash1->buckets[i], hlist) {
+ /* Only add if in both @new_hash1 and @new_hash2 */
+ if (__ftrace_lookup_ip(new_hash2, entry->ip) &&
+ add_hash_entry(*hash, entry->ip) == NULL)
+ return -ENOMEM;
+ }
+ }
+ /* If nothing intersects, make it the empty set */
+ if (ftrace_hash_empty(*hash)) {
+ free_ftrace_hash(*hash);
+ *hash = EMPTY_HASH;
+ }
+ return 0;
+}
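And a matching sketch for the intersection used on the notrace hashes (again hypothetical names and ips):

/* Hypothetical sketch: { ipA, ipB } intersected with { ipB, ipC } leaves { ipB } */
static int example_intersect(unsigned long ipA, unsigned long ipB, unsigned long ipC)
{
	struct ftrace_hash *h1 = alloc_ftrace_hash(4);
	struct ftrace_hash *h2 = alloc_ftrace_hash(4);
	struct ftrace_hash *result = alloc_ftrace_hash(4);
	int ret = -ENOMEM;

	if (!h1 || !h2 || !result)
		goto out;

	if (!add_hash_entry(h1, ipA) || !add_hash_entry(h1, ipB) ||
	    !add_hash_entry(h2, ipB) || !add_hash_entry(h2, ipC))
		goto out;

	/* Had nothing matched, result would have been replaced by EMPTY_HASH */
	ret = intersect_hash(&result, h1, h2);
out:
	free_ftrace_hash(h1);
	free_ftrace_hash(h2);
	free_ftrace_hash(result);
	return ret;
}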
+
+/* Return a new hash that is the union of the filter_hash entries of all the @ops subops */
+static struct ftrace_hash *append_hashes(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *new_hash;
+ struct ftrace_ops *subops;
+ int ret;
+
+ new_hash = alloc_ftrace_hash(ops->func_hash->filter_hash->size_bits);
+ if (!new_hash)
+ return NULL;
+
+ list_for_each_entry(subops, &ops->subop_list, list) {
+ ret = append_hash(&new_hash, subops->func_hash->filter_hash);
+ if (ret < 0) {
+ free_ftrace_hash(new_hash);
+ return NULL;
+ }
+ /* Nothing more to do if new_hash is empty */
+ if (ftrace_hash_empty(new_hash))
+ break;
+ }
+ return new_hash;
+}
+
+/* Make @ops trace everything except what all its subops do not trace */
+static struct ftrace_hash *intersect_hashes(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *new_hash = NULL;
+ struct ftrace_ops *subops;
+ int size_bits;
+ int ret;
+
+ list_for_each_entry(subops, &ops->subop_list, list) {
+ struct ftrace_hash *next_hash;
+
+ if (!new_hash) {
+ size_bits = subops->func_hash->notrace_hash->size_bits;
+ new_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->notrace_hash);
+ if (!new_hash)
+ return NULL;
+ continue;
+ }
+ size_bits = new_hash->size_bits;
+ next_hash = new_hash;
+ new_hash = alloc_ftrace_hash(size_bits);
+ ret = intersect_hash(&new_hash, next_hash, subops->func_hash->notrace_hash);
+ free_ftrace_hash(next_hash);
+ if (ret < 0) {
+ free_ftrace_hash(new_hash);
+ return NULL;
+ }
+ /* Nothing more to do if new_hash is empty */
+ if (ftrace_hash_empty(new_hash))
+ break;
+ }
+ return new_hash;
+}
+
+static bool ops_equal(struct ftrace_hash *A, struct ftrace_hash *B)
+{
+ struct ftrace_func_entry *entry;
+ int size;
+ int i;
+
+ if (ftrace_hash_empty(A))
+ return ftrace_hash_empty(B);
+
+ if (ftrace_hash_empty(B))
+ return ftrace_hash_empty(A);
+
+ if (A->count != B->count)
+ return false;
+
+ size = 1 << A->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &A->buckets[i], hlist) {
+ if (!__ftrace_lookup_ip(B, entry->ip))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void ftrace_ops_update_code(struct ftrace_ops *ops,
+ struct ftrace_ops_hash *old_hash);
+
+static int __ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
+ struct ftrace_hash **orig_hash,
+ struct ftrace_hash *hash,
+ int enable)
+{
+ struct ftrace_ops_hash old_hash_ops;
+ struct ftrace_hash *old_hash;
+ int ret;
+
+ old_hash = *orig_hash;
+ old_hash_ops.filter_hash = ops->func_hash->filter_hash;
+ old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
+ ret = ftrace_hash_move(ops, enable, orig_hash, hash);
+ if (!ret) {
+ ftrace_ops_update_code(ops, &old_hash_ops);
+ free_ftrace_hash_rcu(old_hash);
+ }
+ return ret;
+}
+
+static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_hash,
+ struct ftrace_hash *notrace_hash)
+{
+ int ret;
+
+ if (!ops_equal(filter_hash, ops->func_hash->filter_hash)) {
+ ret = __ftrace_hash_move_and_update_ops(ops, &ops->func_hash->filter_hash,
+ filter_hash, 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (!ops_equal(notrace_hash, ops->func_hash->notrace_hash)) {
+ ret = __ftrace_hash_move_and_update_ops(ops, &ops->func_hash->notrace_hash,
+ notrace_hash, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * ftrace_startup_subops - enable tracing for subops of an ops
+ * @ops: Manager ops (used to pick all the functions of its subops)
+ * @subops: A new ops to add to @ops
+ * @command: Extra commands to use to enable tracing
+ *
+ * The @ops is a manager @ops that has the filter that includes all the functions
+ * that its list of subops are tracing. Adding a new @subops will add the
+ * functions of @subops to @ops.
+ */
+int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ struct ftrace_hash *filter_hash;
+ struct ftrace_hash *notrace_hash;
+ struct ftrace_hash *save_filter_hash;
+ struct ftrace_hash *save_notrace_hash;
+ int size_bits;
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ ftrace_ops_init(ops);
+ ftrace_ops_init(subops);
+
+ if (WARN_ON_ONCE(subops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EBUSY;
+
+ /* Make everything canonical (Just in case!) */
+ if (!ops->func_hash->filter_hash)
+ ops->func_hash->filter_hash = EMPTY_HASH;
+ if (!ops->func_hash->notrace_hash)
+ ops->func_hash->notrace_hash = EMPTY_HASH;
+ if (!subops->func_hash->filter_hash)
+ subops->func_hash->filter_hash = EMPTY_HASH;
+ if (!subops->func_hash->notrace_hash)
+ subops->func_hash->notrace_hash = EMPTY_HASH;
+
+ /* For the first subops to ops just enable it normally */
+ if (list_empty(&ops->subop_list)) {
+ /* Just use the subops hashes */
+ filter_hash = copy_hash(subops->func_hash->filter_hash);
+ notrace_hash = copy_hash(subops->func_hash->notrace_hash);
+ if (!filter_hash || !notrace_hash) {
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ return -ENOMEM;
+ }
+
+ save_filter_hash = ops->func_hash->filter_hash;
+ save_notrace_hash = ops->func_hash->notrace_hash;
+
+ ops->func_hash->filter_hash = filter_hash;
+ ops->func_hash->notrace_hash = notrace_hash;
+ list_add(&subops->list, &ops->subop_list);
+ ret = ftrace_startup(ops, command);
+ if (ret < 0) {
+ list_del(&subops->list);
+ ops->func_hash->filter_hash = save_filter_hash;
+ ops->func_hash->notrace_hash = save_notrace_hash;
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ } else {
+ free_ftrace_hash(save_filter_hash);
+ free_ftrace_hash(save_notrace_hash);
+ subops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP;
+ subops->managed = ops;
+ }
+ return ret;
+ }
+
+ /*
+ * Here there's already something attached. Here are the rules:
+ * o If either filter_hash is empty then the final stays empty
+ * o Otherwise, the final is a superset of both hashes
+ * o If either notrace_hash is empty then the final stays empty
+ * o Otherwise, the final is an intersection between the hashes
+ */
+ if (ftrace_hash_empty(ops->func_hash->filter_hash) ||
+ ftrace_hash_empty(subops->func_hash->filter_hash)) {
+ filter_hash = EMPTY_HASH;
+ } else {
+ size_bits = max(ops->func_hash->filter_hash->size_bits,
+ subops->func_hash->filter_hash->size_bits);
+ filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash);
+ if (!filter_hash)
+ return -ENOMEM;
+ ret = append_hash(&filter_hash, subops->func_hash->filter_hash);
+ if (ret < 0) {
+ free_ftrace_hash(filter_hash);
+ return ret;
+ }
+ }
+
+ if (ftrace_hash_empty(ops->func_hash->notrace_hash) ||
+ ftrace_hash_empty(subops->func_hash->notrace_hash)) {
+ notrace_hash = EMPTY_HASH;
+ } else {
+ size_bits = max(ops->func_hash->notrace_hash->size_bits,
+ subops->func_hash->notrace_hash->size_bits);
+ notrace_hash = alloc_ftrace_hash(size_bits);
+ if (!notrace_hash) {
+ free_ftrace_hash(filter_hash);
+ return -ENOMEM;
+ }
+
+ ret = intersect_hash(&notrace_hash, ops->func_hash->notrace_hash,
+ subops->func_hash->notrace_hash);
+ if (ret < 0) {
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ return ret;
+ }
+ }
+
+ list_add(&subops->list, &ops->subop_list);
+
+ ret = ftrace_update_ops(ops, filter_hash, notrace_hash);
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ if (ret < 0) {
+ list_del(&subops->list);
+ } else {
+ subops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP;
+ subops->managed = ops;
+ }
+ return ret;
+}
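A hedged worked example of the combination rules above, using made-up function names:

/*
 * Illustration only (not from this patch):
 *
 *   ops filter     = { schedule, wake_up_process }
 *   subops filter  = { wake_up_process, try_to_wake_up }
 *     -> manager filter  = { schedule, wake_up_process, try_to_wake_up }  (union)
 *
 *   ops notrace    = { preempt_count_add, preempt_count_sub }
 *   subops notrace = { preempt_count_sub }
 *     -> manager notrace = { preempt_count_sub }                          (intersection)
 *
 * If either filter hash were empty, the manager filter would stay empty
 * (trace everything).  An empty notrace hash on either side means that
 * side excludes nothing, so the resulting notrace hash must be empty too.
 */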
+
+/**
+ * ftrace_shutdown_subops - Remove a subops from a manager ops
+ * @ops: A manager ops to remove @subops from
+ * @subops: The subops to remove from @ops
+ * @command: Any extra command flags to add to modifying the text
+ *
+ * Removes the functions being traced by the @subops from @ops. Note, it
+ * will not affect functions that are being traced by other subops that
+ * still exist in @ops.
+ *
+ * If the last subops is removed from @ops, then @ops is shutdown normally.
+ */
+int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ struct ftrace_hash *filter_hash;
+ struct ftrace_hash *notrace_hash;
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ if (WARN_ON_ONCE(!(subops->flags & FTRACE_OPS_FL_ENABLED)))
+ return -EINVAL;
+
+ list_del(&subops->list);
+
+ if (list_empty(&ops->subop_list)) {
+ /* Last one, just disable the current ops */
+
+ ret = ftrace_shutdown(ops, command);
+ if (ret < 0) {
+ list_add(&subops->list, &ops->subop_list);
+ return ret;
+ }
+
+ subops->flags &= ~FTRACE_OPS_FL_ENABLED;
+
+ free_ftrace_hash(ops->func_hash->filter_hash);
+ free_ftrace_hash(ops->func_hash->notrace_hash);
+ ops->func_hash->filter_hash = EMPTY_HASH;
+ ops->func_hash->notrace_hash = EMPTY_HASH;
+ subops->flags &= ~(FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP);
+ subops->managed = NULL;
+
+ return 0;
+ }
+
+ /* Rebuild the hashes without subops */
+ filter_hash = append_hashes(ops);
+ notrace_hash = intersect_hashes(ops);
+ if (!filter_hash || !notrace_hash) {
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ list_add(&subops->list, &ops->subop_list);
+ return -ENOMEM;
+ }
+
+ ret = ftrace_update_ops(ops, filter_hash, notrace_hash);
+ if (ret < 0) {
+ list_add(&subops->list, &ops->subop_list);
+ } else {
+ subops->flags &= ~(FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP);
+ subops->managed = NULL;
+ }
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ return ret;
+}
+
+static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops,
+ struct ftrace_hash **orig_subhash,
+ struct ftrace_hash *hash,
+ int enable)
+{
+ struct ftrace_ops *ops = subops->managed;
+ struct ftrace_hash **orig_hash;
+ struct ftrace_hash *save_hash;
+ struct ftrace_hash *new_hash;
+ int ret;
+
+ /* Manager ops can not be subops (yet) */
+ if (WARN_ON_ONCE(!ops || ops->flags & FTRACE_OPS_FL_SUBOP))
+ return -EINVAL;
+
+ /* Move the new hash over to the subops hash */
+ save_hash = *orig_subhash;
+ *orig_subhash = __ftrace_hash_move(hash);
+ if (!*orig_subhash) {
+ *orig_subhash = save_hash;
+ return -ENOMEM;
+ }
+
+ /* Create a new_hash to hold the ops new functions */
+ if (enable) {
+ orig_hash = &ops->func_hash->filter_hash;
+ new_hash = append_hashes(ops);
+ } else {
+ orig_hash = &ops->func_hash->notrace_hash;
+ new_hash = intersect_hashes(ops);
+ }
+
+ /* Move the hash over to the new hash */
+ ret = __ftrace_hash_move_and_update_ops(ops, orig_hash, new_hash, enable);
+
+ free_ftrace_hash(new_hash);
+
+ if (ret) {
+ /* Put back the original hash */
+ free_ftrace_hash_rcu(*orig_subhash);
+ *orig_subhash = save_hash;
+ } else {
+ free_ftrace_hash_rcu(save_hash);
+ }
+ return ret;
+}
+
+
static u64 ftrace_update_time;
unsigned long ftrace_update_tot_cnt;
unsigned long ftrace_number_of_pages;
@@ -4380,19 +4879,33 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
struct ftrace_hash *hash,
int enable)
{
- struct ftrace_ops_hash old_hash_ops;
- struct ftrace_hash *old_hash;
- int ret;
+ if (ops->flags & FTRACE_OPS_FL_SUBOP)
+ return ftrace_hash_move_and_update_subops(ops, orig_hash, hash, enable);
- old_hash = *orig_hash;
- old_hash_ops.filter_hash = ops->func_hash->filter_hash;
- old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
- ret = ftrace_hash_move(ops, enable, orig_hash, hash);
- if (!ret) {
- ftrace_ops_update_code(ops, &old_hash_ops);
- free_ftrace_hash_rcu(old_hash);
+ /*
+ * If this ops is not enabled, it could be sharing its filters
+ * with a subop. If that's the case, update the subop instead of
+ * this ops. Shared filters are only allowed to have one ops set
+ * at a time, and if we update the ops that is not enabled,
+ * it will not affect subops that share it.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) {
+ struct ftrace_ops *op;
+
+ /* Check if any other manager subops maps to this hash */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ struct ftrace_ops *subops;
+
+ list_for_each_entry(subops, &op->subop_list, list) {
+ if ((subops->flags & FTRACE_OPS_FL_ENABLED) &&
+ subops->func_hash == ops->func_hash) {
+ return ftrace_hash_move_and_update_subops(subops, orig_hash, hash, enable);
+ }
+ }
+ } while_for_each_ftrace_op(op);
}
- return ret;
+
+ return __ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable);
}
static bool module_exists(const char *module)
@@ -5475,6 +5988,8 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct);
* unregister_ftrace_direct - Remove calls to custom trampoline
* previously registered by register_ftrace_direct for @ops object.
* @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the direct function that is called by the @ops functions
+ * @free_filters: Set to true to remove all filters for the ftrace_ops, false otherwise
*
* This is used to remove direct calls to @addr from the nop locations
* of the functions registered in @ops (set by ftrace_set_filter_ip
@@ -7324,6 +7839,7 @@ __init void ftrace_init_global_array_ops(struct trace_array *tr)
tr->ops = &global_ops;
tr->ops->private = tr;
ftrace_init_trace_array(tr);
+ init_array_fgraph_ops(tr, tr->ops);
}
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
@@ -7404,6 +7920,7 @@ out:
void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
+ kmsan_unpoison_memory(fregs, sizeof(*fregs));
__ftrace_ops_list_func(ip, parent_ip, NULL, fregs);
}
#else
@@ -8218,7 +8735,7 @@ static bool is_permanent_ops_registered(void)
}
static int
-ftrace_enable_sysctl(struct ctl_table *table, int write,
+ftrace_enable_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = -ENODEV;
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 5012c04f92c0..3235470e61b3 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -15,6 +15,8 @@ extern struct ftrace_ops global_ops;
int ftrace_startup(struct ftrace_ops *ops, int command);
int ftrace_shutdown(struct ftrace_ops *ops, int command);
int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs);
+int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command);
+int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command);
#else /* !CONFIG_DYNAMIC_FTRACE */
@@ -38,14 +40,26 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
{
return 1;
}
+static inline int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ return -EINVAL;
+}
+static inline int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ return -EINVAL;
+}
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
extern int ftrace_graph_active;
-void update_function_graph_func(void);
+# ifdef CONFIG_DYNAMIC_FTRACE
+extern void fgraph_update_pid_func(void);
+# else
+static inline void fgraph_update_pid_func(void) {}
+# endif
#else /* !CONFIG_FUNCTION_GRAPH_TRACER */
# define ftrace_graph_active 0
-static inline void update_function_graph_func(void) { }
+static inline void fgraph_update_pid_func(void) {}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
#else /* !CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 95106d02b32d..4966e6bbdf6f 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -354,7 +354,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
while (upper_count-- > 0) {
union upper_chunk *chunk;
- chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
if (!chunk)
break;
*upper_next = chunk;
@@ -365,7 +365,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
while (lower_count-- > 0) {
union lower_chunk *chunk;
- chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
if (!chunk)
break;
*lower_next = chunk;
@@ -451,6 +451,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
/**
* trace_pid_list_free - Frees an allocated pid_list.
+ * @pid_list: The pid list to free.
*
* Frees the memory for a pid_list that was allocated.
*/
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index cb0871fbdb07..314ffc143039 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -34,8 +34,6 @@ MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on");
static struct completion done;
-#define MIN(x, y) ((x) < (y) ? (x) : (y))
-
static void busy_wait(ulong time)
{
u64 start, end;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 28853966aa9a..5807116bcd0b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -32,6 +32,8 @@
#include <asm/local64.h>
#include <asm/local.h>
+#include "trace.h"
+
/*
* The "absolute" timestamp in the buffer is only 59 bits.
* If a clock has the 5 MSBs set, it needs to be saved and
@@ -42,6 +44,21 @@
static void update_pages_handler(struct work_struct *work);
+#define RING_BUFFER_META_MAGIC 0xBADFEED
+
+struct ring_buffer_meta {
+ int magic;
+ int struct_size;
+ unsigned long text_addr;
+ unsigned long data_addr;
+ unsigned long first_buffer;
+ unsigned long head_buffer;
+ unsigned long commit_buffer;
+ __u32 subbuf_size;
+ __u32 nr_subbufs;
+ int buffers[];
+};
+
/*
* The ring buffer header is special. We must manually up keep it.
*/
@@ -342,7 +359,8 @@ struct buffer_page {
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
unsigned order; /* order of the page */
- u32 id; /* ID for external mapping */
+ u32 id:30; /* ID for external mapping */
+ u32 range:1; /* Mapped via a range */
struct buffer_data_page *page; /* Actual data page */
};
@@ -373,7 +391,9 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
static void free_buffer_page(struct buffer_page *bpage)
{
- free_pages((unsigned long)bpage->page, bpage->order);
+ /* Range pages are not to be freed */
+ if (!bpage->range)
+ free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
}
@@ -491,9 +511,11 @@ struct ring_buffer_per_cpu {
unsigned long pages_removed;
unsigned int mapped;
+ unsigned int user_mapped; /* user space mapping */
struct mutex mapping_lock;
unsigned long *subbuf_ids; /* ID to subbuf VA */
struct trace_buffer_meta *meta_page;
+ struct ring_buffer_meta *ring_meta;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
@@ -523,6 +545,12 @@ struct trace_buffer {
struct rb_irq_work irq_work;
bool time_stamp_abs;
+ unsigned long range_addr_start;
+ unsigned long range_addr_end;
+
+ long last_text_delta;
+ long last_data_delta;
+
unsigned int subbuf_size;
unsigned int subbuf_order;
unsigned int max_data_size;
@@ -693,18 +721,6 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
}
/**
- * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
- * @buffer: The ring_buffer to get the number of pages from
- * @cpu: The cpu of the ring_buffer to get the number of pages from
- *
- * Returns the number of pages used by a per_cpu buffer of the ring buffer.
- */
-size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
-{
- return buffer->buffers[cpu]->nr_pages;
-}
-
-/**
* ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
* @cpu: The cpu of the ring_buffer to get the number of pages from
@@ -1251,6 +1267,11 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
* Set the previous list pointer to have the HEAD flag.
*/
rb_set_list_to_head(head->list.prev);
+
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ meta->head_buffer = (unsigned long)head->page;
+ }
}
static void rb_list_head_clear(struct list_head *list)
@@ -1490,9 +1511,484 @@ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
}
}
+/*
+ * Take an address, add the meta data size as well as the array of
+ * subbuffer indexes, then align it to a subbuffer size.
+ *
+ * This is used to help find the next per cpu subbuffer within a mapped range.
+ */
+static unsigned long
+rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
+{
+ addr += sizeof(struct ring_buffer_meta) +
+ sizeof(int) * nr_subbufs;
+ return ALIGN(addr, subbuf_size);
+}
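A rough numeric illustration of the layout this helper computes; the concrete sizes below (4 KiB sub-buffers, 9 sub-buffers, a 56-byte meta struct on a 64-bit build) are assumptions for the example only.

/*
 * Illustration only: with subbuf_size = 4096, nr_subbufs = 9 and
 * sizeof(struct ring_buffer_meta) == 56:
 *
 *   addr + 56 + 9 * sizeof(int)  ==  addr + 92
 *   ALIGN(addr + 92, 4096)       ==  start of the first sub-buffer
 *
 * so each per-CPU chunk is the meta struct plus its index array, padded
 * up to a sub-buffer boundary, followed by nr_subbufs sub-buffers.
 */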
+
+/*
+ * Return the ring_buffer_meta for a given @cpu.
+ */
+static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu)
+{
+ int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+ unsigned long ptr = buffer->range_addr_start;
+ struct ring_buffer_meta *meta;
+ int nr_subbufs;
+
+ if (!ptr)
+ return NULL;
+
+ /* When nr_pages passed in is zero, the first meta has already been initialized */
+ if (!nr_pages) {
+ meta = (struct ring_buffer_meta *)ptr;
+ nr_subbufs = meta->nr_subbufs;
+ } else {
+ meta = NULL;
+ /* Include the reader page */
+ nr_subbufs = nr_pages + 1;
+ }
+
+ /*
+ * The first chunk may not be subbuffer aligned, whereas
+ * the rest of the chunks are.
+ */
+ if (cpu) {
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+ ptr += subbuf_size * nr_subbufs;
+
+ /* We can use multiplication to find chunks greater than 1 */
+ if (cpu > 1) {
+ unsigned long size;
+ unsigned long p;
+
+ /* Save the beginning of this CPU chunk */
+ p = ptr;
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+ ptr += subbuf_size * nr_subbufs;
+
+ /* Now all chunks after this are the same size */
+ size = ptr - p;
+ ptr += size * (cpu - 2);
+ }
+ }
+ return (void *)ptr;
+}
+
+/* Return the start of subbufs given the meta pointer */
+static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta)
+{
+ int subbuf_size = meta->subbuf_size;
+ unsigned long ptr;
+
+ ptr = (unsigned long)meta;
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, meta->nr_subbufs);
+
+ return (void *)ptr;
+}
+
+/*
+ * Return a specific sub-buffer for a given @cpu defined by @idx.
+ */
+static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
+{
+ struct ring_buffer_meta *meta;
+ unsigned long ptr;
+ int subbuf_size;
+
+ meta = rb_range_meta(cpu_buffer->buffer, 0, cpu_buffer->cpu);
+ if (!meta)
+ return NULL;
+
+ if (WARN_ON_ONCE(idx >= meta->nr_subbufs))
+ return NULL;
+
+ subbuf_size = meta->subbuf_size;
+
+ /* Map this buffer to the order that's in meta->buffers[] */
+ idx = meta->buffers[idx];
+
+ ptr = (unsigned long)rb_subbufs_from_meta(meta);
+
+ ptr += subbuf_size * idx;
+ if (ptr + subbuf_size > cpu_buffer->buffer->range_addr_end)
+ return NULL;
+
+ return (void *)ptr;
+}
+
+/*
+ * See if the existing memory contains valid ring buffer data.
+ * As the previous kernel must be the same as this kernel, all
+ * the calculations (size of buffers and number of buffers)
+ * must be the same.
+ */
+static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
+ struct trace_buffer *buffer, int nr_pages)
+{
+ int subbuf_size = PAGE_SIZE;
+ struct buffer_data_page *subbuf;
+ unsigned long buffers_start;
+ unsigned long buffers_end;
+ int i;
+
+ /* Check the meta magic and meta struct size */
+ if (meta->magic != RING_BUFFER_META_MAGIC ||
+ meta->struct_size != sizeof(*meta)) {
+ pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu);
+ return false;
+ }
+
+ /* The subbuffer's size and number of subbuffers must match */
+ if (meta->subbuf_size != subbuf_size ||
+ meta->nr_subbufs != nr_pages + 1) {
+ pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu);
+ return false;
+ }
+
+ buffers_start = meta->first_buffer;
+ buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
+
+ /* Are the head and commit buffers within the range of buffers? */
+ if (meta->head_buffer < buffers_start ||
+ meta->head_buffer >= buffers_end) {
+ pr_info("Ring buffer boot meta [%d] head buffer out of range\n", cpu);
+ return false;
+ }
+
+ if (meta->commit_buffer < buffers_start ||
+ meta->commit_buffer >= buffers_end) {
+ pr_info("Ring buffer boot meta [%d] commit buffer out of range\n", cpu);
+ return false;
+ }
+
+ subbuf = rb_subbufs_from_meta(meta);
+
+ /* Do the meta buffers and the subbufs themselves have correct data? */
+ for (i = 0; i < meta->nr_subbufs; i++) {
+ if (meta->buffers[i] < 0 ||
+ meta->buffers[i] >= meta->nr_subbufs) {
+ pr_info("Ring buffer boot meta [%d] array out of range\n", cpu);
+ return false;
+ }
+
+ if ((unsigned)local_read(&subbuf->commit) > subbuf_size) {
+ pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu);
+ return false;
+ }
+
+ subbuf = (void *)subbuf + subbuf_size;
+ }
+
+ return true;
+}
+
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf);
+
+static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu,
+ unsigned long long *timestamp, u64 *delta_ptr)
+{
+ struct ring_buffer_event *event;
+ u64 ts, delta;
+ int events = 0;
+ int e;
+
+ *delta_ptr = 0;
+ *timestamp = 0;
+
+ ts = dpage->time_stamp;
+
+ for (e = 0; e < tail; e += rb_event_length(event)) {
+
+ event = (struct ring_buffer_event *)(dpage->data + e);
+
+ switch (event->type_len) {
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = rb_event_time_stamp(event);
+ ts += delta;
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, ts);
+ if (delta < ts) {
+ *delta_ptr = delta;
+ *timestamp = ts;
+ return -1;
+ }
+ ts = delta;
+ break;
+
+ case RINGBUF_TYPE_PADDING:
+ if (event->time_delta == 1)
+ break;
+ fallthrough;
+ case RINGBUF_TYPE_DATA:
+ events++;
+ ts += event->time_delta;
+ break;
+
+ default:
+ return -1;
+ }
+ }
+ *timestamp = ts;
+ return events;
+}
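A brief worked example of the timestamp walk above, with illustrative values:

/*
 * Illustration only: for a page whose dpage->time_stamp is T and which
 * contains DATA(time_delta = 5), TIME_EXTEND(+1000), DATA(time_delta = 3),
 * the loop above returns events = 2 with *timestamp = T + 5 + 1000 + 3.
 * An absolute TIME_STAMP that goes backwards, or an unknown type_len,
 * makes it return -1 so the caller can treat the buffer as invalid.
 */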
+
+static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
+{
+ unsigned long long ts;
+ u64 delta;
+ int tail;
+
+ tail = local_read(&dpage->commit);
+ return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+}
+
+/* If the meta data has been validated, now validate the events */
+static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct buffer_page *head_page;
+ unsigned long entry_bytes = 0;
+ unsigned long entries = 0;
+ int ret;
+ int i;
+
+ if (!meta || !meta->head_buffer)
+ return;
+
+ /* Do the reader page first */
+ ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu);
+ if (ret < 0) {
+ pr_info("Ring buffer reader page is invalid\n");
+ goto invalid;
+ }
+ entries += ret;
+ entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
+ local_set(&cpu_buffer->reader_page->entries, ret);
+
+ head_page = cpu_buffer->head_page;
+
+ /* If both the head and commit are on the reader_page then we are done. */
+ if (head_page == cpu_buffer->reader_page &&
+ head_page == cpu_buffer->commit_page)
+ goto done;
+
+ /* Iterate until finding the commit page */
+ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
+
+ /* Reader page has already been done */
+ if (head_page == cpu_buffer->reader_page)
+ continue;
+
+ ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+ if (ret < 0) {
+ pr_info("Ring buffer meta [%d] invalid buffer page\n",
+ cpu_buffer->cpu);
+ goto invalid;
+ }
+ entries += ret;
+ entry_bytes += local_read(&head_page->page->commit);
+ local_set(&cpu_buffer->head_page->entries, ret);
+
+ if (head_page == cpu_buffer->commit_page)
+ break;
+ }
+
+ if (head_page != cpu_buffer->commit_page) {
+ pr_info("Ring buffer meta [%d] commit page not found\n",
+ cpu_buffer->cpu);
+ goto invalid;
+ }
+ done:
+ local_set(&cpu_buffer->entries, entries);
+ local_set(&cpu_buffer->entries_bytes, entry_bytes);
+
+ pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu);
+ return;
+
+ invalid:
+ /* The content of the buffers are invalid, reset the meta data */
+ meta->head_buffer = 0;
+ meta->commit_buffer = 0;
+
+ /* Reset the reader page */
+ local_set(&cpu_buffer->reader_page->entries, 0);
+ local_set(&cpu_buffer->reader_page->page->commit, 0);
+
+ /* Reset all the subbuffers */
+ for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) {
+ local_set(&head_page->entries, 0);
+ local_set(&head_page->page->commit, 0);
+ }
+}
+
+/* Used to calculate data delta */
+static char rb_data_ptr[] = "";
+
+#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr)
+#define THIS_DATA_PTR ((unsigned long)rb_data_ptr)
+
+static void rb_meta_init_text_addr(struct ring_buffer_meta *meta)
+{
+ meta->text_addr = THIS_TEXT_PTR;
+ meta->data_addr = THIS_DATA_PTR;
+}
+
+static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
+{
+ struct ring_buffer_meta *meta;
+ unsigned long delta;
+ void *subbuf;
+ int cpu;
+ int i;
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ void *next_meta;
+
+ meta = rb_range_meta(buffer, nr_pages, cpu);
+
+ if (rb_meta_valid(meta, cpu, buffer, nr_pages)) {
+ /* Make the mappings match the current address */
+ subbuf = rb_subbufs_from_meta(meta);
+ delta = (unsigned long)subbuf - meta->first_buffer;
+ meta->first_buffer += delta;
+ meta->head_buffer += delta;
+ meta->commit_buffer += delta;
+ buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr;
+ buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr;
+ continue;
+ }
+
+ if (cpu < nr_cpu_ids - 1)
+ next_meta = rb_range_meta(buffer, nr_pages, cpu + 1);
+ else
+ next_meta = (void *)buffer->range_addr_end;
+
+ memset(meta, 0, next_meta - (void *)meta);
+
+ meta->magic = RING_BUFFER_META_MAGIC;
+ meta->struct_size = sizeof(*meta);
+
+ meta->nr_subbufs = nr_pages + 1;
+ meta->subbuf_size = PAGE_SIZE;
+
+ subbuf = rb_subbufs_from_meta(meta);
+
+ meta->first_buffer = (unsigned long)subbuf;
+ rb_meta_init_text_addr(meta);
+
+ /*
+ * The buffers[] array holds the order of the sub-buffers
+ * that are after the meta data. The sub-buffers may
+ * be swapped out when read and inserted into a different
+ * location of the ring buffer. Although their addresses
+ * remain the same, the buffers[] array contains the
+ * index into the sub-buffers holding their actual order.
+ */
+ for (i = 0; i < meta->nr_subbufs; i++) {
+ meta->buffers[i] = i;
+ rb_init_page(subbuf);
+ subbuf += meta->subbuf_size;
+ }
+ }
+}
+
+static void *rbm_start(struct seq_file *m, loff_t *pos)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = m->private;
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ unsigned long val;
+
+ if (!meta)
+ return NULL;
+
+ if (*pos > meta->nr_subbufs)
+ return NULL;
+
+ val = *pos;
+ val++;
+
+ return (void *)val;
+}
+
+static void *rbm_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+
+ return rbm_start(m, pos);
+}
+
+static int rbm_show(struct seq_file *m, void *v)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = m->private;
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ unsigned long val = (unsigned long)v;
+
+ if (val == 1) {
+ seq_printf(m, "head_buffer: %d\n",
+ rb_meta_subbuf_idx(meta, (void *)meta->head_buffer));
+ seq_printf(m, "commit_buffer: %d\n",
+ rb_meta_subbuf_idx(meta, (void *)meta->commit_buffer));
+ seq_printf(m, "subbuf_size: %d\n", meta->subbuf_size);
+ seq_printf(m, "nr_subbufs: %d\n", meta->nr_subbufs);
+ return 0;
+ }
+
+ val -= 2;
+ seq_printf(m, "buffer[%ld]: %d\n", val, meta->buffers[val]);
+
+ return 0;
+}
+
+static void rbm_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations rb_meta_seq_ops = {
+ .start = rbm_start,
+ .next = rbm_next,
+ .show = rbm_show,
+ .stop = rbm_stop,
+};
+
+int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu)
+{
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, &rb_meta_seq_ops);
+ if (ret)
+ return ret;
+
+ m = file->private_data;
+ m->private = buffer->buffers[cpu];
+
+ return 0;
+}
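For reference, the seq_file above yields output along these lines (values are illustrative only):

/*
 * Example output (illustrative values):
 *
 *   head_buffer: 4
 *   commit_buffer: 6
 *   subbuf_size: 4096
 *   nr_subbufs: 9
 *   buffer[0]: 0
 *   buffer[1]: 1
 *   ...
 *   buffer[8]: 8
 */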
+
+/* Map the buffer_pages to the previous head and commit pages */
+static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *bpage)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+
+ if (meta->head_buffer == (unsigned long)bpage->page)
+ cpu_buffer->head_page = bpage;
+
+ if (meta->commit_buffer == (unsigned long)bpage->page) {
+ cpu_buffer->commit_page = bpage;
+ cpu_buffer->tail_page = bpage;
+ }
+}
+
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
{
+ struct trace_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_meta *meta = NULL;
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
gfp_t mflags;
@@ -1527,6 +2023,10 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
*/
if (user_thread)
set_current_oom_origin();
+
+ if (buffer->range_addr_start)
+ meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu);
+
for (i = 0; i < nr_pages; i++) {
struct page *page;
@@ -1537,16 +2037,32 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_check_bpage(cpu_buffer, bpage);
- list_add(&bpage->list, pages);
-
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
- mflags | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
- goto free_pages;
- bpage->page = page_address(page);
+ /*
+ * Append the pages, since for mapped buffers we want to keep
+ * the order
+ */
+ list_add_tail(&bpage->list, pages);
+
+ if (meta) {
+ /* A range was given. Use that for the buffer page */
+ bpage->page = rb_range_buffer(cpu_buffer, i + 1);
+ if (!bpage->page)
+ goto free_pages;
+ /* If this is valid from a previous boot */
+ if (meta->head_buffer)
+ rb_meta_buffer_update(cpu_buffer, bpage);
+ bpage->range = 1;
+ bpage->id = i + 1;
+ } else {
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+ mflags | __GFP_COMP | __GFP_ZERO,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page)
+ goto free_pages;
+ bpage->page = page_address(page);
+ rb_init_page(bpage->page);
+ }
bpage->order = cpu_buffer->buffer->subbuf_order;
- rb_init_page(bpage->page);
if (user_thread && fatal_signal_pending(current))
goto free_pages;
@@ -1596,6 +2112,7 @@ static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_meta *meta;
struct buffer_page *bpage;
struct page *page;
int ret;
@@ -1626,12 +2143,28 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
- goto fail_free_reader;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
+ if (buffer->range_addr_start) {
+ /*
+ * Range mapped buffers have the same restrictions as memory
+ * mapped ones do.
+ */
+ cpu_buffer->mapped = 1;
+ cpu_buffer->ring_meta = rb_range_meta(buffer, nr_pages, cpu);
+ bpage->page = rb_range_buffer(cpu_buffer, 0);
+ if (!bpage->page)
+ goto fail_free_reader;
+ if (cpu_buffer->ring_meta->head_buffer)
+ rb_meta_buffer_update(cpu_buffer, bpage);
+ bpage->range = 1;
+ } else {
+ page = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page)
+ goto fail_free_reader;
+ bpage->page = page_address(page);
+ rb_init_page(bpage->page);
+ }
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
INIT_LIST_HEAD(&cpu_buffer->new_pages);
@@ -1640,11 +2173,35 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
if (ret < 0)
goto fail_free_reader;
- cpu_buffer->head_page
- = list_entry(cpu_buffer->pages, struct buffer_page, list);
- cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+ rb_meta_validate_events(cpu_buffer);
+
+ /* If the boot meta was valid then this has already been updated */
+ meta = cpu_buffer->ring_meta;
+ if (!meta || !meta->head_buffer ||
+ !cpu_buffer->head_page || !cpu_buffer->commit_page || !cpu_buffer->tail_page) {
+ if (meta && meta->head_buffer &&
+ (cpu_buffer->head_page || cpu_buffer->commit_page || cpu_buffer->tail_page)) {
+ pr_warn("Ring buffer meta buffers not all mapped\n");
+ if (!cpu_buffer->head_page)
+ pr_warn(" Missing head_page\n");
+ if (!cpu_buffer->commit_page)
+ pr_warn(" Missing commit_page\n");
+ if (!cpu_buffer->tail_page)
+ pr_warn(" Missing tail_page\n");
+ }
- rb_head_page_activate(cpu_buffer);
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ rb_head_page_activate(cpu_buffer);
+
+ if (cpu_buffer->ring_meta)
+ meta->commit_buffer = meta->head_buffer;
+ } else {
+ /* The valid meta buffer still needs to activate the head page */
+ rb_head_page_activate(cpu_buffer);
+ }
return cpu_buffer;
@@ -1681,22 +2238,14 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}
-/**
- * __ring_buffer_alloc - allocate a new ring_buffer
- * @size: the size in bytes per cpu that is needed.
- * @flags: attributes to set for the ring buffer.
- * @key: ring buffer reader_lock_key.
- *
- * Currently the only flag that is available is the RB_FL_OVERWRITE
- * flag. This flag means that the buffer will overwrite old data
- * when the buffer wraps. If this flag is not set, the buffer will
- * drop data when the tail hits the head.
- */
-struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
- struct lock_class_key *key)
+static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
+ int order, unsigned long start,
+ unsigned long end,
+ struct lock_class_key *key)
{
struct trace_buffer *buffer;
long nr_pages;
+ int subbuf_size;
int bsize;
int cpu;
int ret;
@@ -1710,14 +2259,13 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
- /* Default buffer page size - one system page */
- buffer->subbuf_order = 0;
- buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+ buffer->subbuf_order = order;
+ subbuf_size = (PAGE_SIZE << order);
+ buffer->subbuf_size = subbuf_size - BUF_PAGE_HDR_SIZE;
/* Max payload is buffer page size - header (8bytes) */
buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
- nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
@@ -1725,10 +2273,6 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&buffer->irq_work.waiters);
- /* need at least two pages */
- if (nr_pages < 2)
- nr_pages = 2;
-
buffer->cpus = nr_cpu_ids;
bsize = sizeof(void *) * nr_cpu_ids;
@@ -1737,6 +2281,56 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!buffer->buffers)
goto fail_free_cpumask;
+ /* If start/end are specified, then that overrides size */
+ if (start && end) {
+ unsigned long ptr;
+ int n;
+
+ size = end - start;
+ size = size / nr_cpu_ids;
+
+ /*
+ * The number of sub-buffers (nr_pages) is determined by the
+ * total size allocated minus the meta data size.
+ * Then that is divided by the number of per CPU buffers
+ * needed, plus account for the integer array index that
+ * will be appended to the meta data.
+ */
+ nr_pages = (size - sizeof(struct ring_buffer_meta)) /
+ (subbuf_size + sizeof(int));
+ /* Need at least two pages plus the reader page */
+ if (nr_pages < 3)
+ goto fail_free_buffers;
+
+ again:
+ /* Make sure that the buffers fit in the range once aligned */
+ for (n = 0, ptr = start; n < nr_cpu_ids; n++) {
+ ptr += sizeof(struct ring_buffer_meta) +
+ sizeof(int) * nr_pages;
+ ptr = ALIGN(ptr, subbuf_size);
+ ptr += subbuf_size * nr_pages;
+ }
+ if (ptr > end) {
+ if (nr_pages <= 3)
+ goto fail_free_buffers;
+ nr_pages--;
+ goto again;
+ }
+
+ /* nr_pages should not count the reader page */
+ nr_pages--;
+ buffer->range_addr_start = start;
+ buffer->range_addr_end = end;
+
+ rb_range_meta_init(buffer, nr_pages);
+ } else {
+
+ /* need at least two pages */
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
+ if (nr_pages < 2)
+ nr_pages = 2;
+ }
+
cpu = raw_smp_processor_id();
cpumask_set_cpu(cpu, buffer->cpumask);
buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
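To make the sizing logic above concrete, a rough worked example with assumed numbers (a 1 MiB range, 4 possible CPUs, 4 KiB sub-buffers, a 56-byte meta struct):

/*
 * Illustration only:
 *
 *   size     = (end - start) / nr_cpu_ids = 1048576 / 4 = 262144 bytes per CPU
 *   nr_pages = (262144 - 56) / (4096 + sizeof(int)) ~= 63
 *
 * The "again:" loop then walks all per-CPU chunks (meta + index array,
 * aligned up to 4096, plus nr_pages * 4096 of sub-buffers) and shrinks
 * nr_pages until everything fits before 'end'.  Finally one page is
 * taken back for the reader page.
 */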
@@ -1765,9 +2359,73 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
kfree(buffer);
return NULL;
}
+
+/**
+ * __ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes per cpu that is needed.
+ * @flags: attributes to set for the ring buffer.
+ * @key: ring buffer reader_lock_key.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+ struct lock_class_key *key)
+{
+ /* Default buffer page size - one system page */
+ return alloc_buffer(size, flags, 0, 0, 0, key);
+}
EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
/**
+ * __ring_buffer_alloc_range - allocate a new ring_buffer from existing memory
+ * @size: the size in bytes per cpu that is needed.
+ * @flags: attributes to set for the ring buffer.
+ * @start: start of allocated range
+ * @range_size: size of allocated range
+ * @order: sub-buffer order
+ * @key: ring buffer reader_lock_key.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
+ int order, unsigned long start,
+ unsigned long range_size,
+ struct lock_class_key *key)
+{
+ return alloc_buffer(size, flags, order, start, start + range_size, key);
+}
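A minimal sketch of a caller, assuming a persistent, already virtually mapped region has been reserved elsewhere; the wrapper name and the way the region is obtained are hypothetical:

/* Hypothetical caller: wrap an already-reserved region in a ring buffer */
static struct trace_buffer *example_boot_buffer(unsigned long vaddr, unsigned long size)
{
	static struct lock_class_key key;

	/* order 0 => one-page sub-buffers; the range overrides the size argument */
	return __ring_buffer_alloc_range(0, RB_FL_OVERWRITE, 0, vaddr, size, &key);
}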
+
+/**
+ * ring_buffer_last_boot_delta - return the delta offset from last boot
+ * @buffer: The buffer to return the delta from
+ * @text: Return text delta
+ * @data: Return data delta
+ *
+ * Returns: true if the delta is non-zero
+ */
+bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text,
+ long *data)
+{
+ if (!buffer)
+ return false;
+
+ if (!buffer->last_text_delta)
+ return false;
+
+ *text = buffer->last_text_delta;
+ *data = buffer->last_data_delta;
+
+ return true;
+}
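A small sketch of how a consumer might apply the returned deltas when resolving pointers recorded by the previous boot; the helper name is made up:

/* Hypothetical helper: rebase a text pointer saved by the previous kernel */
static void *example_adjust_text(struct trace_buffer *buffer, void *saved_ptr)
{
	long text_delta = 0, data_delta = 0;

	if (ring_buffer_last_boot_delta(buffer, &text_delta, &data_delta))
		return saved_ptr + text_delta;

	return saved_ptr;
}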
+
+/**
* ring_buffer_free - free a ring buffer.
* @buffer: the buffer to free.
*/
@@ -2376,6 +3034,52 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
iter->next_event = 0;
}
+/* Return the index into the sub-buffers for a given sub-buffer */
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf)
+{
+ void *subbuf_array;
+
+ subbuf_array = (void *)meta + sizeof(int) * meta->nr_subbufs;
+ subbuf_array = (void *)ALIGN((unsigned long)subbuf_array, meta->subbuf_size);
+ return (subbuf - subbuf_array) / meta->subbuf_size;
+}
+
+static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *next_page)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ unsigned long old_head = (unsigned long)next_page->page;
+ unsigned long new_head;
+
+ rb_inc_page(&next_page);
+ new_head = (unsigned long)next_page->page;
+
+ /*
+ * Only move it forward once, if something else came in and
+ * moved it forward, then we don't want to touch it.
+ */
+ (void)cmpxchg(&meta->head_buffer, old_head, new_head);
+}
+
+static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *reader)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ void *old_reader = cpu_buffer->reader_page->page;
+ void *new_reader = reader->page;
+ int id;
+
+ id = reader->id;
+ cpu_buffer->reader_page->id = id;
+ reader->id = 0;
+
+ meta->buffers[0] = rb_meta_subbuf_idx(meta, new_reader);
+ meta->buffers[id] = rb_meta_subbuf_idx(meta, old_reader);
+
+ /* The head pointer is the one after the reader */
+ rb_update_meta_head(cpu_buffer, reader);
+}
+
/*
* rb_handle_head_page - writer hit the head page
*
@@ -2425,6 +3129,8 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);
+ if (cpu_buffer->ring_meta)
+ rb_update_meta_head(cpu_buffer, next_page);
/*
* The entries will be zeroed out when we move the
* tail page.
@@ -2986,6 +3692,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(&cpu_buffer->commit_page);
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page;
+ }
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -3432,11 +4142,10 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info,
unsigned long tail)
{
- struct ring_buffer_event *event;
struct buffer_data_page *bpage;
u64 ts, delta;
bool full = false;
- int e;
+ int ret;
bpage = info->tail_page->page;
@@ -3462,39 +4171,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
goto out;
- ts = bpage->time_stamp;
-
- for (e = 0; e < tail; e += rb_event_length(event)) {
-
- event = (struct ring_buffer_event *)(bpage->data + e);
-
- switch (event->type_len) {
-
- case RINGBUF_TYPE_TIME_EXTEND:
- delta = rb_event_time_stamp(event);
- ts += delta;
- break;
-
- case RINGBUF_TYPE_TIME_STAMP:
- delta = rb_event_time_stamp(event);
- delta = rb_fix_abs_ts(delta, ts);
- if (delta < ts) {
- buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
- cpu_buffer->cpu, ts, delta);
- }
- ts = delta;
- break;
-
- case RINGBUF_TYPE_PADDING:
- if (event->time_delta == 1)
- break;
- fallthrough;
- case RINGBUF_TYPE_DATA:
- ts += event->time_delta;
- break;
-
- default:
- RB_WARN_ON(cpu_buffer, 1);
+ ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta);
+ if (ret < 0) {
+ if (delta < ts) {
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
+ cpu_buffer->cpu, ts, delta);
+ goto out;
}
}
if ((full && ts > info->ts) ||
@@ -4603,6 +5285,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
if (!ret)
goto spin;
+ if (cpu_buffer->ring_meta)
+ rb_update_meta_reader(cpu_buffer, reader);
+
/*
* Yay! We succeeded in replacing the page.
*
@@ -5224,6 +5909,9 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+ if (!meta)
+ return;
+
meta->reader.read = cpu_buffer->reader_page->read;
meta->reader.id = cpu_buffer->reader_page->id;
meta->reader.lost_events = cpu_buffer->lost_events;
@@ -5280,11 +5968,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->lost_events = 0;
cpu_buffer->last_overrun = 0;
- if (cpu_buffer->mapped)
- rb_update_meta_page(cpu_buffer);
-
rb_head_page_activate(cpu_buffer);
cpu_buffer->pages_removed = 0;
+
+ if (cpu_buffer->mapped) {
+ rb_update_meta_page(cpu_buffer);
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ meta->commit_buffer = meta->head_buffer;
+ }
+ }
}
/* Must have disabled the cpu buffer then done a synchronize_rcu */
@@ -5315,6 +6008,7 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_meta *meta;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
@@ -5333,6 +6027,11 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
atomic_dec(&cpu_buffer->record_disabled);
atomic_dec(&cpu_buffer->resize_disabled);
+ /* Make sure persistent meta now uses this buffer's addresses */
+ meta = rb_range_meta(buffer, 0, cpu_buffer->cpu);
+ if (meta)
+ rb_meta_init_text_addr(meta);
+
mutex_unlock(&buffer->mutex);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
@@ -5347,6 +6046,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_meta *meta;
int cpu;
/* prevent another thread from changing buffer sizes */
@@ -5374,6 +6074,11 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
reset_disabled_cpu_buffer(cpu_buffer);
+ /* Make sure persistent meta now uses this buffer's addresses */
+ meta = rb_range_meta(buffer, 0, cpu_buffer->cpu);
+ if (meta)
+ rb_meta_init_text_addr(meta);
+
atomic_dec(&cpu_buffer->record_disabled);
atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
}
@@ -6020,39 +6725,38 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
}
for_each_buffer_cpu(buffer, cpu) {
+ struct buffer_data_page *old_free_data_page;
+ struct list_head old_pages;
+ unsigned long flags;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
continue;
cpu_buffer = buffer->buffers[cpu];
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
/* Clear the head bit to make the link list normal to read */
rb_head_page_deactivate(cpu_buffer);
- /* Now walk the list and free all the old sub buffers */
- list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) {
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
- }
- /* The above loop stopped an the last page needing to be freed */
- bpage = list_entry(cpu_buffer->pages, struct buffer_page, list);
- free_buffer_page(bpage);
-
- /* Free the current reader page */
- free_buffer_page(cpu_buffer->reader_page);
+ /*
+	 * Collect the buffers from the cpu_buffer pages list and the
+	 * reader_page onto old_pages, so they can be freed later when
+	 * not under a spinlock. The pages list is a linked list with no
+	 * head; adding old_pages turns it into a regular list with
+	 * old_pages as the head.
+ */
+ list_add(&old_pages, cpu_buffer->pages);
+ list_add(&cpu_buffer->reader_page->list, &old_pages);
/* One page was allocated for the reader page */
cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next,
struct buffer_page, list);
list_del_init(&cpu_buffer->reader_page->list);
- /* The cpu_buffer pages are a link list with no head */
+ /* Install the new pages, remove the head from the list */
cpu_buffer->pages = cpu_buffer->new_pages.next;
- cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev;
- cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next;
-
- /* Clear the new_pages list */
- INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ list_del_init(&cpu_buffer->new_pages);
cpu_buffer->head_page
= list_entry(cpu_buffer->pages, struct buffer_page, list);
@@ -6061,11 +6765,20 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update;
cpu_buffer->nr_pages_to_update = 0;
- free_pages((unsigned long)cpu_buffer->free_page, old_order);
+ old_free_data_page = cpu_buffer->free_page;
cpu_buffer->free_page = NULL;
rb_head_page_activate(cpu_buffer);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ /* Free old sub buffers */
+ list_for_each_entry_safe(bpage, tmp, &old_pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ free_pages((unsigned long)old_free_data_page, old_order);
+
rb_check_pages(cpu_buffer);
}
@@ -6147,10 +6860,10 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
/* install subbuf ID to kern VA translation */
cpu_buffer->subbuf_ids = subbuf_ids;
- meta->meta_page_size = PAGE_SIZE;
meta->meta_struct_len = sizeof(*meta);
meta->nr_subbufs = nr_subbufs;
meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+ meta->meta_page_size = meta->subbuf_size;
rb_update_meta_page(cpu_buffer);
}
@@ -6167,7 +6880,7 @@ rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
mutex_lock(&cpu_buffer->mapping_lock);
- if (!cpu_buffer->mapped) {
+ if (!cpu_buffer->user_mapped) {
mutex_unlock(&cpu_buffer->mapping_lock);
return ERR_PTR(-ENODEV);
}
@@ -6191,19 +6904,26 @@ static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer,
lockdep_assert_held(&cpu_buffer->mapping_lock);
+	/* mapped is always greater than or equal to user_mapped */
+ if (WARN_ON(cpu_buffer->mapped < cpu_buffer->user_mapped))
+ return -EINVAL;
+
if (inc && cpu_buffer->mapped == UINT_MAX)
return -EBUSY;
- if (WARN_ON(!inc && cpu_buffer->mapped == 0))
+ if (WARN_ON(!inc && cpu_buffer->user_mapped == 0))
return -EINVAL;
mutex_lock(&cpu_buffer->buffer->mutex);
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- if (inc)
+ if (inc) {
+ cpu_buffer->user_mapped++;
cpu_buffer->mapped++;
- else
+ } else {
+ cpu_buffer->user_mapped--;
cpu_buffer->mapped--;
+ }
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
mutex_unlock(&cpu_buffer->buffer->mutex);
@@ -6226,7 +6946,7 @@ static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer,
static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
struct vm_area_struct *vma)
{
- unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff;
+ unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff;
unsigned int subbuf_pages, subbuf_order;
struct page **pages;
int p = 0, s = 0;
@@ -6237,6 +6957,12 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
!(vma->vm_flags & VM_MAYSHARE))
return -EPERM;
+ subbuf_order = cpu_buffer->buffer->subbuf_order;
+ subbuf_pages = 1 << subbuf_order;
+
+ if (subbuf_order && pgoff % subbuf_pages)
+ return -EINVAL;
+
/*
* Make sure the mapping cannot become writable later. Also tell the VM
* to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND).
@@ -6246,37 +6972,38 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
lockdep_assert_held(&cpu_buffer->mapping_lock);
- subbuf_order = cpu_buffer->buffer->subbuf_order;
- subbuf_pages = 1 << subbuf_order;
-
nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */
- nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */
+ nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; /* + meta-page */
- vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
- if (!vma_pages || vma_pages > nr_pages)
+ nr_vma_pages = vma_pages(vma);
+ if (!nr_vma_pages || nr_vma_pages > nr_pages)
return -EINVAL;
- nr_pages = vma_pages;
+ nr_pages = nr_vma_pages;
pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
if (!pages)
return -ENOMEM;
if (!pgoff) {
+ unsigned long meta_page_padding;
+
pages[p++] = virt_to_page(cpu_buffer->meta_page);
/*
- * TODO: Align sub-buffers on their size, once
- * vm_insert_pages() supports the zero-page.
+ * Pad with the zero-page to align the meta-page with the
+ * sub-buffers.
*/
- } else {
- /* Skip the meta-page */
- pgoff--;
+ meta_page_padding = subbuf_pages - 1;
+ while (meta_page_padding-- && p < nr_pages) {
+ unsigned long __maybe_unused zero_addr =
+ vma->vm_start + (PAGE_SIZE * p);
- if (pgoff % subbuf_pages) {
- err = -EINVAL;
- goto out;
+ pages[p++] = ZERO_PAGE(zero_addr);
}
+ } else {
+ /* Skip the meta-page */
+ pgoff -= subbuf_pages;
s += pgoff / subbuf_pages;
}
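To make the new zero-page padding concrete (illustrative numbers): with subbuf_order == 1, i.e. two pages per sub-buffer, page 0 of the mapping holds the meta-page, page 1 is the shared zero-page, and sub-buffer s begins at VMA page (s + 1) * 2, so every sub-buffer stays aligned to its own size within the mapping.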
@@ -6328,7 +7055,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
mutex_lock(&cpu_buffer->mapping_lock);
- if (cpu_buffer->mapped) {
+ if (cpu_buffer->user_mapped) {
err = __rb_map_vma(cpu_buffer, vma);
if (!err)
err = __rb_inc_dec_mapped(cpu_buffer, true);
@@ -6359,12 +7086,15 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
*/
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
rb_setup_ids_meta_page(cpu_buffer, subbuf_ids);
+
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
err = __rb_map_vma(cpu_buffer, vma);
if (!err) {
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- cpu_buffer->mapped = 1;
+ /* This is the first time it is mapped by user */
+ cpu_buffer->mapped++;
+ cpu_buffer->user_mapped = 1;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
} else {
kfree(cpu_buffer->subbuf_ids);
@@ -6392,10 +7122,10 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
mutex_lock(&cpu_buffer->mapping_lock);
- if (!cpu_buffer->mapped) {
+ if (!cpu_buffer->user_mapped) {
err = -ENODEV;
goto out;
- } else if (cpu_buffer->mapped > 1) {
+ } else if (cpu_buffer->user_mapped > 1) {
__rb_inc_dec_mapped(cpu_buffer, false);
goto out;
}
@@ -6403,7 +7133,10 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
mutex_lock(&buffer->mutex);
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- cpu_buffer->mapped = 0;
+ /* This is the last user space mapping */
+ if (!WARN_ON_ONCE(cpu_buffer->mapped < cpu_buffer->user_mapped))
+ cpu_buffer->mapped--;
+ cpu_buffer->user_mapped = 0;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index df0745a42a3f..dc819aec43e8 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -306,7 +306,6 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u
static const struct file_operations interface_enable_fops = {
.open = simple_open,
- .llseek = no_llseek,
.write = monitor_enable_write_data,
.read = monitor_enable_read_data,
};
@@ -329,7 +328,6 @@ static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf,
static const struct file_operations interface_desc_fops = {
.open = simple_open,
- .llseek = no_llseek,
.read = monitor_desc_read_data,
};
@@ -674,7 +672,6 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us
static const struct file_operations monitoring_on_fops = {
.open = simple_open,
- .llseek = no_llseek,
.write = monitoring_on_write_data,
.read = monitoring_on_read_data,
};
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index 6aae106695b6..7b49cbe388d4 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -426,7 +426,6 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user
static const struct file_operations reacting_on_fops = {
.open = simple_open,
- .llseek = no_llseek,
.write = reacting_on_write_data,
.read = reacting_on_read_data,
};
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 578a49ff5c32..6a891e00aa7f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -482,7 +482,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \
TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \
TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \
- TRACE_ITER_HASH_PTR)
+ TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK)
/* trace_options that are only supported by global_trace */
#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
@@ -490,7 +490,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
/* trace_flags that are default zero for instances */
#define ZEROED_TRACE_FLAGS \
- (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK)
+ (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK)
/*
* The global_trace is the descriptor that holds the top-level tracing
@@ -500,6 +500,29 @@ static struct trace_array global_trace = {
.trace_flags = TRACE_DEFAULT_FLAGS,
};
+static struct trace_array *printk_trace = &global_trace;
+
+static __always_inline bool printk_binsafe(struct trace_array *tr)
+{
+ /*
+	 * The binary format of trace_printk can cause a crash if used
+	 * by a buffer from another boot. Force the use of the
+	 * non-binary version of trace_printk if the trace_printk
+	 * buffer is a boot-mapped ring buffer.
+ */
+ return !(tr->flags & TRACE_ARRAY_FL_BOOT);
+}
+
+static void update_printk_trace(struct trace_array *tr)
+{
+ if (printk_trace == tr)
+ return;
+
+ printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK;
+ printk_trace = tr;
+ tr->trace_flags |= TRACE_ITER_TRACE_PRINTK;
+}
+
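For context, a hedged sketch of how this destination switch is expected to be driven from user space through the trace_printk_dest option added to the flag list below; the instance name and the default tracefs mount point are assumptions:

	/*
	 * Illustrative only:
	 *   # echo 1 > /sys/kernel/tracing/instances/foo/options/trace_printk_dest
	 * routes trace_printk() output into the "foo" instance. Clearing the
	 * option on that instance (or removing the instance) falls back to
	 * the top-level buffer, mirroring update_printk_trace() above and
	 * the __remove_instance() change below.
	 */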
void trace_set_ring_buffer_expanded(struct trace_array *tr)
{
if (!tr)
@@ -1117,7 +1140,7 @@ EXPORT_SYMBOL_GPL(__trace_array_puts);
*/
int __trace_puts(unsigned long ip, const char *str, int size)
{
- return __trace_array_puts(&global_trace, ip, str, size);
+ return __trace_array_puts(printk_trace, ip, str, size);
}
EXPORT_SYMBOL_GPL(__trace_puts);
@@ -1128,6 +1151,7 @@ EXPORT_SYMBOL_GPL(__trace_puts);
*/
int __trace_bputs(unsigned long ip, const char *str)
{
+ struct trace_array *tr = READ_ONCE(printk_trace);
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct bputs_entry *entry;
@@ -1135,14 +1159,17 @@ int __trace_bputs(unsigned long ip, const char *str)
int size = sizeof(struct bputs_entry);
int ret = 0;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
+ if (!printk_binsafe(tr))
+ return __trace_puts(ip, str, strlen(str));
+
+ if (!(tr->trace_flags & TRACE_ITER_PRINTK))
return 0;
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
trace_ctx = tracing_gen_ctx();
- buffer = global_trace.array_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
@@ -1155,7 +1182,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry->str = str;
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
ret = 1;
out:
@@ -2226,10 +2253,6 @@ static __init int init_trace_selftests(void)
}
core_initcall(init_trace_selftests);
#else
-static inline int run_tracer_selftest(struct tracer *type)
-{
- return 0;
-}
static inline int do_run_tracer_selftest(struct tracer *type)
{
return 0;
@@ -2363,6 +2386,25 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
ring_buffer_record_enable(buffer);
}
+static void tracing_reset_all_cpus(struct array_buffer *buf)
+{
+ struct trace_buffer *buffer = buf->buffer;
+
+ if (!buffer)
+ return;
+
+ ring_buffer_record_disable(buffer);
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ buf->time_start = buffer_ftrace_now(buf, buf->cpu);
+
+ ring_buffer_reset(buffer);
+
+ ring_buffer_record_enable(buffer);
+}
+
/* Must have trace_types_lock held */
void tracing_reset_all_online_cpus_unlocked(void)
{
@@ -2767,7 +2809,7 @@ static void output_printk(struct trace_event_buffer *fbuffer)
raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
}
-int tracepoint_printk_sysctl(struct ctl_table *table, int write,
+int tracepoint_printk_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -3025,7 +3067,7 @@ void trace_dump_stack(int skip)
/* Skip 1 to skip this function. */
skip++;
#endif
- __ftrace_trace_stack(global_trace.array_buffer.buffer,
+ __ftrace_trace_stack(printk_trace->array_buffer.buffer,
tracing_gen_ctx(), skip, NULL);
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
@@ -3244,12 +3286,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
struct trace_event_call *call = &event_bprint;
struct ring_buffer_event *event;
struct trace_buffer *buffer;
- struct trace_array *tr = &global_trace;
+ struct trace_array *tr = READ_ONCE(printk_trace);
struct bprint_entry *entry;
unsigned int trace_ctx;
char *tbuffer;
int len = 0, size;
+ if (!printk_binsafe(tr))
+ return trace_vprintk(ip, fmt, args);
+
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -3342,7 +3387,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL);
+ ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
}
out:
@@ -3438,7 +3483,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer,
int ret;
va_list ap;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
+ if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK))
return 0;
va_start(ap, fmt);
@@ -3450,7 +3495,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer,
__printf(2, 0)
int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
{
- return trace_array_vprintk(&global_trace, ip, fmt, args);
+ return trace_array_vprintk(printk_trace, ip, fmt, args);
}
EXPORT_SYMBOL_GPL(trace_vprintk);
@@ -3671,8 +3716,11 @@ static void test_can_verify(void)
void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
va_list ap)
{
+ long text_delta = 0;
+ long data_delta = 0;
const char *p = fmt;
const char *str;
+ bool good;
int i, j;
if (WARN_ON_ONCE(!fmt))
@@ -3681,6 +3729,17 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
if (static_branch_unlikely(&trace_no_verify))
goto print;
+ /*
+ * When the kernel is booted with the tp_printk command line
+ * parameter, trace events go directly through to printk().
+	 * That path is also checked by this function, but it does not
+	 * have an associated trace_array (tr).
+ */
+ if (iter->tr) {
+ text_delta = iter->tr->text_delta;
+ data_delta = iter->tr->data_delta;
+ }
+
/* Don't bother checking when doing a ftrace_dump() */
if (iter->fmt == static_fmt_buf)
goto print;
@@ -3691,7 +3750,10 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
j = 0;
- /* We only care about %s and variants */
+ /*
+ * We only care about %s and variants
+ * as well as %p[sS] if delta is non-zero
+ */
for (i = 0; p[i]; i++) {
if (i + 1 >= iter->fmt_size) {
/*
@@ -3720,6 +3782,11 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
}
if (p[i+j] == 's')
break;
+
+ if (text_delta && p[i+1] == 'p' &&
+ ((p[i+2] == 's' || p[i+2] == 'S')))
+ break;
+
star = false;
}
j = 0;
@@ -3733,6 +3800,24 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
iter->fmt[i] = '\0';
trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+ /* Add delta to %pS pointers */
+ if (p[i+1] == 'p') {
+ unsigned long addr;
+ char fmt[4];
+
+ fmt[0] = '%';
+ fmt[1] = 'p';
+ fmt[2] = p[i+2]; /* Either %ps or %pS */
+ fmt[3] = '\0';
+
+ addr = va_arg(ap, unsigned long);
+ addr += text_delta;
+ trace_seq_printf(&iter->seq, fmt, (void *)addr);
+
+ p += i + 3;
+ continue;
+ }
+
/*
* If iter->seq is full, the above call no longer guarantees
* that ap is in sync with fmt processing, and further calls
@@ -3751,6 +3836,14 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
/* The ap now points to the string data of the %s */
str = va_arg(ap, const char *);
+ good = trace_safe_str(iter, str, star, len);
+
+ /* Could be from the last boot */
+ if (data_delta && !good) {
+ str += data_delta;
+ good = trace_safe_str(iter, str, star, len);
+ }
+
/*
* If you hit this warning, it is likely that the
* trace event in question used %s on a string that
@@ -3760,8 +3853,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
* instead. See samples/trace_events/trace-events-sample.h
* for reference.
*/
- if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
- "fmt: '%s' current_buffer: '%s'",
+ if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'",
fmt, seq_buf_str(&iter->seq.seq))) {
int ret;
@@ -3958,6 +4050,8 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
break;
entries++;
ring_buffer_iter_advance(buf_iter);
+ /* This could be a big loop */
+ cond_resched();
}
per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries;
@@ -4921,6 +5015,11 @@ static int tracing_open(struct inode *inode, struct file *file)
static bool
trace_ok_for_array(struct tracer *t, struct trace_array *tr)
{
+#ifdef CONFIG_TRACER_SNAPSHOT
+ /* arrays with mapped buffer range do not have snapshots */
+ if (tr->range_addr_start && t->use_max_tr)
+ return false;
+#endif
return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
}
@@ -5013,7 +5112,7 @@ static int show_traces_open(struct inode *inode, struct file *file)
return 0;
}
-static int show_traces_release(struct inode *inode, struct file *file)
+static int tracing_seq_release(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
@@ -5054,7 +5153,7 @@ static const struct file_operations show_traces_fops = {
.open = show_traces_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = show_traces_release,
+ .release = tracing_seq_release,
};
static ssize_t
@@ -5239,7 +5338,8 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
if ((mask == TRACE_ITER_RECORD_TGID) ||
- (mask == TRACE_ITER_RECORD_CMD))
+ (mask == TRACE_ITER_RECORD_CMD) ||
+ (mask == TRACE_ITER_TRACE_PRINTK))
lockdep_assert_held(&event_mutex);
/* do nothing if flag is already set */
@@ -5251,6 +5351,25 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (tr->current_trace->flag_changed(tr, mask, !!enabled))
return -EINVAL;
+ if (mask == TRACE_ITER_TRACE_PRINTK) {
+ if (enabled) {
+ update_printk_trace(tr);
+ } else {
+ /*
+ * The global_trace cannot clear this.
+			 * Its flag only gets cleared if another instance sets it.
+ */
+ if (printk_trace == &global_trace)
+ return -EINVAL;
+ /*
+			 * An instance must always have it set;
+			 * by default, that's the global_trace instance.
+ */
+ if (printk_trace == tr)
+ update_printk_trace(&global_trace);
+ }
+ }
+
if (enabled)
tr->trace_flags |= mask;
else
@@ -5401,6 +5520,10 @@ static const struct file_operations tracing_iter_fops = {
static const char readme_msg[] =
"tracing mini-HOWTO:\n\n"
+ "By default tracefs removes all OTH file permission bits.\n"
+ "When mounting tracefs an optional group id can be specified\n"
+ "which adds the group to every directory and file in tracefs:\n\n"
+ "\t e.g. mount -t tracefs [-o [gid=<gid>]] nodev /sys/kernel/tracing\n\n"
"# echo 0 > tracing_on : quick way to disable tracing\n"
"# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
" Important files:\n"
@@ -6036,6 +6159,23 @@ out:
return ret;
}
+static void update_last_data(struct trace_array *tr)
+{
+ if (!tr->text_delta && !tr->data_delta)
+ return;
+
+ /*
+	 * Need to clear all CPU buffers as events from the previous
+	 * boot cannot be mixed with events from this boot, as that
+	 * would make for a confusing trace. Clear all CPU buffers,
+	 * even those that may currently be offline.
+ */
+ tracing_reset_all_cpus(&tr->array_buffer);
+
+ /* Using current data now */
+ tr->text_delta = 0;
+ tr->data_delta = 0;
+}
/**
* tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -6053,6 +6193,9 @@ int tracing_update_buffers(struct trace_array *tr)
int ret = 0;
mutex_lock(&trace_types_lock);
+
+ update_last_data(tr);
+
if (!tr->ring_buffer_expanded)
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
@@ -6108,6 +6251,8 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
mutex_lock(&trace_types_lock);
+ update_last_data(tr);
+
if (!tr->ring_buffer_expanded) {
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
@@ -6856,6 +7001,37 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
}
static ssize_t
+tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ struct seq_buf seq;
+ char buf[64];
+
+ seq_buf_init(&seq, buf, 64);
+
+ seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta);
+ seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq));
+}
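A hedged example of what reading the new file looks like (the instance name and delta values are illustrative; deltas of 0 simply mean the buffer content belongs to the current boot):

	/*
	 *   # cat /sys/kernel/tracing/instances/boot_map/last_boot_info
	 *   text delta:     -268435456
	 *   data delta:     -268435456
	 */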
+
+static int tracing_buffer_meta_open(struct inode *inode, struct file *filp)
+{
+ struct trace_array *tr = inode->i_private;
+ int cpu = tracing_get_cpu(inode);
+ int ret;
+
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
+
+ ret = ring_buffer_meta_seq_init(filp, tr->array_buffer.buffer, cpu);
+ if (ret < 0)
+ __trace_array_put(tr);
+ return ret;
+}
+
+static ssize_t
tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -7420,7 +7596,6 @@ static const struct file_operations tracing_pipe_fops = {
.read = tracing_read_pipe,
.splice_read = tracing_splice_read_pipe,
.release = tracing_release_pipe,
- .llseek = no_llseek,
};
static const struct file_operations tracing_entries_fops = {
@@ -7431,6 +7606,13 @@ static const struct file_operations tracing_entries_fops = {
.release = tracing_release_generic_tr,
};
+static const struct file_operations tracing_buffer_meta_fops = {
+ .open = tracing_buffer_meta_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_seq_release,
+};
+
static const struct file_operations tracing_total_entries_fops = {
.open = tracing_open_generic_tr,
.read = tracing_total_entries_read,
@@ -7471,6 +7653,13 @@ static const struct file_operations trace_time_stamp_mode_fops = {
.release = tracing_single_release_tr,
};
+static const struct file_operations last_boot_fops = {
+ .open = tracing_open_generic_tr,
+ .read = tracing_last_boot_read,
+ .llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
+};
+
#ifdef CONFIG_TRACER_SNAPSHOT
static const struct file_operations snapshot_fops = {
.open = tracing_snapshot_open,
@@ -7485,7 +7674,6 @@ static const struct file_operations snapshot_raw_fops = {
.read = tracing_buffers_read,
.release = tracing_buffers_release,
.splice_read = tracing_buffers_splice_read,
- .llseek = no_llseek,
};
#endif /* CONFIG_TRACER_SNAPSHOT */
@@ -7956,7 +8144,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
trace_access_unlock(iter->cpu_file);
if (ret < 0) {
- if (trace_empty(iter)) {
+ if (trace_empty(iter) && !iter->closed) {
if ((filp->f_flags & O_NONBLOCK))
return -EAGAIN;
@@ -8315,7 +8503,6 @@ static const struct file_operations tracing_buffers_fops = {
.flush = tracing_buffers_flush,
.splice_read = tracing_buffers_splice_read,
.unlocked_ioctl = tracing_buffers_ioctl,
- .llseek = no_llseek,
.mmap = tracing_buffers_mmap,
};
@@ -8663,12 +8850,17 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_entries_fops);
+ if (tr->range_addr_start)
+ trace_create_cpu_file("buffer_meta", TRACE_MODE_READ, d_cpu,
+ tr, cpu, &tracing_buffer_meta_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
- tr, cpu, &snapshot_fops);
+ if (!tr->range_addr_start) {
+ trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
+ tr, cpu, &snapshot_fops);
- trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
- tr, cpu, &snapshot_raw_fops);
+ trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
+ tr, cpu, &snapshot_raw_fops);
+ }
#endif
}
@@ -9205,7 +9397,21 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size
buf->tr = tr;
- buf->buffer = ring_buffer_alloc(size, rb_flags);
+ if (tr->range_addr_start && tr->range_addr_size) {
+ buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
+ tr->range_addr_start,
+ tr->range_addr_size);
+
+ ring_buffer_last_boot_delta(buf->buffer,
+ &tr->text_delta, &tr->data_delta);
+ /*
+ * This is basically the same as a mapped buffer,
+ * with the same restrictions.
+ */
+ tr->mapped++;
+ } else {
+ buf->buffer = ring_buffer_alloc(size, rb_flags);
+ }
if (!buf->buffer)
return -ENOMEM;
@@ -9242,6 +9448,10 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
return ret;
#ifdef CONFIG_TRACER_MAX_TRACE
+	/* Trace arrays with a mapped buffer range do not have snapshot buffers */
+ if (tr->range_addr_start)
+ return 0;
+
ret = allocate_trace_buffer(tr, &tr->max_buffer,
allocate_snapshot ? size : 1);
if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
@@ -9342,7 +9552,9 @@ static int trace_array_create_dir(struct trace_array *tr)
}
static struct trace_array *
-trace_array_create_systems(const char *name, const char *systems)
+trace_array_create_systems(const char *name, const char *systems,
+ unsigned long range_addr_start,
+ unsigned long range_addr_size)
{
struct trace_array *tr;
int ret;
@@ -9368,6 +9580,10 @@ trace_array_create_systems(const char *name, const char *systems)
goto out_free_tr;
}
+ /* Only for boot up memory mapped ring buffers */
+ tr->range_addr_start = range_addr_start;
+ tr->range_addr_size = range_addr_size;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9425,7 +9641,7 @@ trace_array_create_systems(const char *name, const char *systems)
static struct trace_array *trace_array_create(const char *name)
{
- return trace_array_create_systems(name, NULL);
+ return trace_array_create_systems(name, NULL, 0, 0);
}
static int instance_mkdir(const char *name)
@@ -9450,6 +9666,31 @@ out_unlock:
return ret;
}
+static u64 map_pages(u64 start, u64 size)
+{
+ struct page **pages;
+ phys_addr_t page_start;
+ unsigned int page_count;
+ unsigned int i;
+ void *vaddr;
+
+ page_count = DIV_ROUND_UP(size, PAGE_SIZE);
+
+ page_start = start;
+ pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return 0;
+
+ for (i = 0; i < page_count; i++) {
+ phys_addr_t addr = page_start + i * PAGE_SIZE;
+ pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
+ }
+ vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL);
+ kfree(pages);
+
+ return (u64)(unsigned long)vaddr;
+}
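A minimal sketch of the intended pairing with a reserve_mem region, following the enable_instances() parsing below; the region name "trace" is an assumption:

	phys_addr_t start, size;
	unsigned long vaddr;

	if (reserve_mem_find_by_name("trace", &start, &size)) {
		vaddr = map_pages(start, size);
		if (!vaddr)
			pr_warn("Tracing: Failed to map reserved region\n");
	}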
+
/**
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
* @name: The name of the trace array to be looked up/created.
@@ -9479,7 +9720,7 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system
goto out_unlock;
}
- tr = trace_array_create_systems(name, systems);
+ tr = trace_array_create_systems(name, systems, 0, 0);
if (IS_ERR(tr))
tr = NULL;
@@ -9509,6 +9750,9 @@ static int __remove_instance(struct trace_array *tr)
set_tracer_flag(tr, 1 << i, 0);
}
+ if (printk_trace == tr)
+ update_printk_trace(&global_trace);
+
tracing_set_nop(tr);
clear_ftrace_function_probes(tr);
event_trace_del_tracer(tr);
@@ -9671,10 +9915,15 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
+ if (tr->range_addr_start) {
+ trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
+ tr, &last_boot_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
- tr, &snapshot_fops);
+ } else {
+ trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
+ tr, &snapshot_fops);
#endif
+ }
trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_err_log_fops);
@@ -10294,6 +10543,7 @@ __init static void enable_instances(void)
{
struct trace_array *tr;
char *curr_str;
+ char *name;
char *str;
char *tok;
@@ -10302,19 +10552,107 @@ __init static void enable_instances(void)
str = boot_instance_info;
while ((curr_str = strsep(&str, "\t"))) {
+ phys_addr_t start = 0;
+ phys_addr_t size = 0;
+ unsigned long addr = 0;
+ bool traceprintk = false;
+ bool traceoff = false;
+ char *flag_delim;
+ char *addr_delim;
tok = strsep(&curr_str, ",");
- if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
- do_allocate_snapshot(tok);
+ flag_delim = strchr(tok, '^');
+ addr_delim = strchr(tok, '@');
- tr = trace_array_get_by_name(tok, NULL);
- if (!tr) {
- pr_warn("Failed to create instance buffer %s\n", curr_str);
+ if (addr_delim)
+ *addr_delim++ = '\0';
+
+ if (flag_delim)
+ *flag_delim++ = '\0';
+
+ name = tok;
+
+ if (flag_delim) {
+ char *flag;
+
+ while ((flag = strsep(&flag_delim, "^"))) {
+ if (strcmp(flag, "traceoff") == 0) {
+ traceoff = true;
+ } else if ((strcmp(flag, "printk") == 0) ||
+ (strcmp(flag, "traceprintk") == 0) ||
+ (strcmp(flag, "trace_printk") == 0)) {
+ traceprintk = true;
+ } else {
+ pr_info("Tracing: Invalid instance flag '%s' for %s\n",
+ flag, name);
+ }
+ }
+ }
+
+ tok = addr_delim;
+ if (tok && isdigit(*tok)) {
+ start = memparse(tok, &tok);
+ if (!start) {
+ pr_warn("Tracing: Invalid boot instance address for %s\n",
+ name);
+ continue;
+ }
+ if (*tok != ':') {
+ pr_warn("Tracing: No size specified for instance %s\n", name);
+ continue;
+ }
+ tok++;
+ size = memparse(tok, &tok);
+ if (!size) {
+ pr_warn("Tracing: Invalid boot instance size for %s\n",
+ name);
+ continue;
+ }
+ } else if (tok) {
+ if (!reserve_mem_find_by_name(tok, &start, &size)) {
+ start = 0;
+ pr_warn("Failed to map boot instance %s to %s\n", name, tok);
+ continue;
+ }
+ }
+
+ if (start) {
+ addr = map_pages(start, size);
+ if (addr) {
+ pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n",
+ name, &start, (unsigned long)size);
+ } else {
+ pr_warn("Tracing: Failed to map boot instance %s\n", name);
+ continue;
+ }
+ } else {
+			/* Only non-mapped buffers have snapshot buffers */
+ if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
+ do_allocate_snapshot(name);
+ }
+
+ tr = trace_array_create_systems(name, NULL, addr, size);
+ if (IS_ERR(tr)) {
+ pr_warn("Tracing: Failed to create instance buffer %s\n", curr_str);
continue;
}
- /* Allow user space to delete it */
- trace_array_put(tr);
+
+ if (traceoff)
+ tracer_tracing_off(tr);
+
+ if (traceprintk)
+ update_printk_trace(tr);
+
+ /*
+ * If start is set, then this is a mapped buffer, and
+ * cannot be deleted by user space, so keep the reference
+ * to it.
+ */
+ if (start) {
+ tr->flags |= TRACE_ARRAY_FL_BOOT;
+ tr->ref++;
+ }
while ((tok = strsep(&curr_str, ","))) {
early_enable_events(tr, tok, true);
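Putting the enable_instances() parsing above together, a hedged example of the resulting kernel command line usage; the instance name, physical address and sizes are illustrative, and the reserve_mem= syntax follows that parameter's own documentation:

	/*
	 *   trace_instance=boot_map^traceprintk@0x285400000:12M
	 *   reserve_mem=12M:4096:trace trace_instance=boot_map@trace
	 *
	 * The first form maps the instance at an explicit physical
	 * address:size pair; the second reuses a region set up by the
	 * reserve_mem= parameter. The ^flag suffixes accept traceoff
	 * and printk/traceprintk/trace_printk, as parsed above.
	 */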
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 749a182dab48..c866991b9c78 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -336,7 +336,6 @@ struct trace_array {
bool allocated_snapshot;
spinlock_t snapshot_trigger_lock;
unsigned int snapshot;
- unsigned int mapped;
unsigned long max_latency;
#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
@@ -344,6 +343,13 @@ struct trace_array {
struct irq_work fsnotify_irqwork;
#endif
#endif
+	/* The below is for memory-mapped ring buffers */
+ unsigned int mapped;
+ unsigned long range_addr_start;
+ unsigned long range_addr_size;
+ long text_delta;
+ long data_delta;
+
struct trace_pid_list __rcu *filtered_pids;
struct trace_pid_list __rcu *filtered_no_pids;
/*
@@ -397,6 +403,9 @@ struct trace_array {
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
struct trace_pid_list __rcu *function_no_pids;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ struct fgraph_ops *gops;
+#endif
#ifdef CONFIG_DYNAMIC_FTRACE
/* All of these are protected by the ftrace_lock */
struct list_head func_probes;
@@ -420,7 +429,8 @@ struct trace_array {
};
enum {
- TRACE_ARRAY_FL_GLOBAL = (1 << 0)
+ TRACE_ARRAY_FL_GLOBAL = BIT(0),
+ TRACE_ARRAY_FL_BOOT = BIT(1),
};
extern struct list_head ftrace_trace_arrays;
@@ -641,6 +651,8 @@ trace_buffer_lock_reserve(struct trace_buffer *buffer,
unsigned long len,
unsigned int trace_ctx);
+int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu);
+
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_array_cpu *data);
@@ -679,9 +691,8 @@ void trace_latency_header(struct seq_file *m);
void trace_default_header(struct seq_file *m);
void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
-void trace_graph_return(struct ftrace_graph_ret *trace);
-int trace_graph_entry(struct ftrace_graph_ent *trace);
-void set_graph_array(struct trace_array *tr);
+void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops);
+int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
@@ -892,12 +903,59 @@ extern int __trace_graph_entry(struct trace_array *tr,
extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
unsigned int trace_ctx);
+extern void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
+extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
+extern void free_fgraph_ops(struct trace_array *tr);
+
+enum {
+ TRACE_GRAPH_FL = 1,
+
+ /*
+ * In the very unlikely case that an interrupt came in
+ * at a start of graph tracing, and we want to trace
+ * the function in that interrupt, the depth can be greater
+ * than zero, because of the preempted start of a previous
+ * trace. In an even more unlikely case, depth could be 2
+ * if a softirq interrupted the start of graph tracing,
+ * followed by an interrupt preempting a start of graph
+ * tracing in the softirq, and depth can even be 3
+ * if an NMI came in at the start of an interrupt function
+ * that preempted a softirq start of a function that
+ * preempted normal context!!!! Luckily, it can't be
+ * greater than 3, so the next two bits are a mask
+ * of what the depth is when we set TRACE_GRAPH_FL
+ */
+
+ TRACE_GRAPH_DEPTH_START_BIT,
+ TRACE_GRAPH_DEPTH_END_BIT,
+
+ /*
+ * To implement set_graph_notrace, if this bit is set, we ignore
+ * function graph tracing of called functions, until the return
+ * function is called to clear it.
+ */
+ TRACE_GRAPH_NOTRACE_BIT,
+};
+
+#define TRACE_GRAPH_NOTRACE (1 << TRACE_GRAPH_NOTRACE_BIT)
+
+static inline unsigned long ftrace_graph_depth(unsigned long *task_var)
+{
+ return (*task_var >> TRACE_GRAPH_DEPTH_START_BIT) & 3;
+}
+
+static inline void ftrace_graph_set_depth(unsigned long *task_var, int depth)
+{
+ *task_var &= ~(3 << TRACE_GRAPH_DEPTH_START_BIT);
+ *task_var |= (depth & 3) << TRACE_GRAPH_DEPTH_START_BIT;
+}
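A small worked example of the per-task flag word these helpers manage (TRACE_GRAPH_FL lives in bit 0 and the two depth bits start at TRACE_GRAPH_DEPTH_START_BIT, so bits 2-3):

	unsigned long task_var = 0;

	task_var |= TRACE_GRAPH_FL;		/* graph tracing started */
	ftrace_graph_set_depth(&task_var, 3);	/* the NMI-in-IRQ-in-softirq case */
	/* task_var is now 0xd and ftrace_graph_depth(&task_var) returns 3 */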
#ifdef CONFIG_DYNAMIC_FTRACE
extern struct ftrace_hash __rcu *ftrace_graph_hash;
extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash;
-static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
+static inline int
+ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace)
{
unsigned long addr = trace->func;
int ret = 0;
@@ -919,13 +977,12 @@ static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
}
if (ftrace_lookup_ip(hash, addr)) {
-
/*
* This needs to be cleared on the return functions
* when the depth is zero.
*/
- trace_recursion_set(TRACE_GRAPH_BIT);
- trace_recursion_set_depth(trace->depth);
+ *task_var |= TRACE_GRAPH_FL;
+ ftrace_graph_set_depth(task_var, trace->depth);
/*
* If no irqs are to be traced, but a set_graph_function
@@ -944,11 +1001,14 @@ out:
return ret;
}
-static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+static inline void
+ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret *trace)
{
- if (trace_recursion_test(TRACE_GRAPH_BIT) &&
- trace->depth == trace_recursion_depth())
- trace_recursion_clear(TRACE_GRAPH_BIT);
+ unsigned long *task_var = fgraph_get_task_var(gops);
+
+ if ((*task_var & TRACE_GRAPH_FL) &&
+ trace->depth == ftrace_graph_depth(task_var))
+ *task_var &= ~TRACE_GRAPH_FL;
}
static inline int ftrace_graph_notrace_addr(unsigned long addr)
@@ -974,7 +1034,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
return ret;
}
#else
-static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
+static inline int ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace)
{
return 1;
}
@@ -983,27 +1043,37 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
return 0;
}
-static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret *trace)
{ }
#endif /* CONFIG_DYNAMIC_FTRACE */
extern unsigned int fgraph_max_depth;
-static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)
+static inline bool
+ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace)
{
+ unsigned long *task_var = fgraph_get_task_var(gops);
+
/* trace it when it is-nested-in or is a function enabled. */
- return !(trace_recursion_test(TRACE_GRAPH_BIT) ||
- ftrace_graph_addr(trace)) ||
+ return !((*task_var & TRACE_GRAPH_FL) ||
+ ftrace_graph_addr(task_var, trace)) ||
(trace->depth < 0) ||
(fgraph_max_depth && trace->depth >= fgraph_max_depth);
}
+void fgraph_init_ops(struct ftrace_ops *dst_ops,
+ struct ftrace_ops *src_ops);
+
#else /* CONFIG_FUNCTION_GRAPH_TRACER */
static inline enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags)
{
return TRACE_TYPE_UNHANDLED;
}
+static inline void free_fgraph_ops(struct trace_array *tr) { }
+/* ftrace_ops may not be defined */
+#define init_array_fgraph_ops(tr, ops) do { } while (0)
+#define allocate_fgraph_ops(tr, ops) ({ 0; })
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
extern struct list_head ftrace_pids;
@@ -1251,6 +1321,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(IRQ_INFO, "irq-info"), \
C(MARKERS, "markers"), \
C(EVENT_FORK, "event-fork"), \
+ C(TRACE_PRINTK, "trace_printk_dest"), \
C(PAUSE_ON_TRACE, "pause-on-trace"), \
C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \
FUNCTION_FLAGS \
@@ -1573,6 +1644,29 @@ static inline void *event_file_data(struct file *filp)
extern struct mutex event_mutex;
extern struct list_head ftrace_events;
+/*
+ * When the trace_event_file is the filp->i_private pointer,
+ * it must be taken under the event_mutex lock, and then checked
+ * if the EVENT_FILE_FL_FREED flag is set. If it is, then the
+ * data pointed to by the trace_event_file can not be trusted.
+ *
+ * Use the event_file_file() to access the trace_event_file from
+ * the filp the first time under the event_mutex and check for
+ * NULL. If it is needed to be retrieved again and the event_mutex
+ * is still held, then the event_file_data() can be used and it
+ * is guaranteed to be valid.
+ */
+static inline struct trace_event_file *event_file_file(struct file *filp)
+{
+ struct trace_event_file *file;
+
+ lockdep_assert_held(&event_mutex);
+ file = READ_ONCE(file_inode(filp)->i_private);
+ if (!file || file->flags & EVENT_FILE_FL_FREED)
+ return NULL;
+ return file;
+}
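A minimal sketch of the access pattern this helper enforces, mirroring the event_enable_read()/event_filter_write() conversions elsewhere in this patch; use_file() is a hypothetical placeholder:

	struct trace_event_file *file;

	mutex_lock(&event_mutex);
	file = event_file_file(filp);	/* first lookup: checks i_private and FL_FREED */
	if (file)
		use_file(file);		/* hypothetical helper; event_mutex still held */
	mutex_unlock(&event_mutex);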
+
extern const struct file_operations event_trigger_fops;
extern const struct file_operations event_hist_fops;
extern const struct file_operations event_hist_debug_fops;
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index b0e0ec85912e..ebda68ee9abf 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -912,6 +912,11 @@ static int __trace_eprobe_create(int argc, const char *argv[])
}
}
+ if (argc - 2 > MAX_TRACE_ARGS) {
+ ret = -E2BIG;
+ goto error;
+ }
+
mutex_lock(&event_mutex);
event_call = find_and_get_event(sys_name, sys_event);
ep = alloc_event_probe(group, event, event_call, argc - 2);
@@ -937,7 +942,7 @@ static int __trace_eprobe_create(int argc, const char *argv[])
argc -= 2; argv += 2;
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
ret = trace_eprobe_tp_update_arg(ep, argv, i);
if (ret)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6ef29eba90ce..7266ec2a4eea 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -992,18 +992,18 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
void event_file_get(struct trace_event_file *file)
{
- atomic_inc(&file->ref);
+ refcount_inc(&file->ref);
}
void event_file_put(struct trace_event_file *file)
{
- if (WARN_ON_ONCE(!atomic_read(&file->ref))) {
+ if (WARN_ON_ONCE(!refcount_read(&file->ref))) {
if (file->flags & EVENT_FILE_FL_FREED)
kmem_cache_free(file_cachep, file);
return;
}
- if (atomic_dec_and_test(&file->ref)) {
+ if (refcount_dec_and_test(&file->ref)) {
/* Count should only go to zero when it is freed */
if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED)))
return;
@@ -1386,12 +1386,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
char buf[4] = "0";
mutex_lock(&event_mutex);
- file = event_file_data(filp);
+ file = event_file_file(filp);
if (likely(file))
flags = file->flags;
mutex_unlock(&event_mutex);
- if (!file || flags & EVENT_FILE_FL_FREED)
+ if (!file)
return -ENODEV;
if (flags & EVENT_FILE_FL_ENABLED &&
@@ -1424,8 +1424,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
case 1:
ret = -ENODEV;
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) {
+ file = event_file_file(filp);
+ if (likely(file)) {
ret = tracing_update_buffers(file->tr);
if (ret < 0) {
mutex_unlock(&event_mutex);
@@ -1540,7 +1540,8 @@ enum {
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct trace_event_call *call = event_file_data(m->private);
+ struct trace_event_file *file = event_file_data(m->private);
+ struct trace_event_call *call = file->event_call;
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call);
struct list_head *node = v;
@@ -1572,7 +1573,8 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
static int f_show(struct seq_file *m, void *v)
{
- struct trace_event_call *call = event_file_data(m->private);
+ struct trace_event_file *file = event_file_data(m->private);
+ struct trace_event_call *call = file->event_call;
struct ftrace_event_field *field;
const char *array_descriptor;
@@ -1627,12 +1629,14 @@ static int f_show(struct seq_file *m, void *v)
static void *f_start(struct seq_file *m, loff_t *pos)
{
+ struct trace_event_file *file;
void *p = (void *)FORMAT_HEADER;
loff_t l = 0;
/* ->stop() is called even if ->start() fails */
mutex_lock(&event_mutex);
- if (!event_file_data(m->private))
+ file = event_file_file(m->private);
+ if (!file)
return ERR_PTR(-ENODEV);
while (l < *pos && p)
@@ -1706,8 +1710,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_init(s);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (file && !(file->flags & EVENT_FILE_FL_FREED))
+ file = event_file_file(filp);
+ if (file)
print_event_filter(file, s);
mutex_unlock(&event_mutex);
@@ -1736,9 +1740,13 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
return PTR_ERR(buf);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (file)
- err = apply_event_filter(file, buf);
+ file = event_file_file(filp);
+ if (file) {
+ if (file->flags & EVENT_FILE_FL_FREED)
+ err = -ENODEV;
+ else
+ err = apply_event_filter(file, buf);
+ }
mutex_unlock(&event_mutex);
kfree(buf);
@@ -2485,7 +2493,6 @@ static int event_callback(const char *name, umode_t *mode, void **data,
if (strcmp(name, "format") == 0) {
*mode = TRACE_MODE_READ;
*fops = &ftrace_event_format_fops;
- *data = call;
return 1;
}
@@ -2996,7 +3003,7 @@ trace_create_new_event(struct trace_event_call *call,
atomic_set(&file->tm_ref, 0);
INIT_LIST_HEAD(&file->triggers);
list_add(&file->list, &tr->events);
- event_file_get(file);
+ refcount_set(&file->ref, 1);
return file;
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 6ece1308d36a..5f9119eb7c67 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5601,7 +5601,7 @@ static int hist_show(struct seq_file *m, void *v)
mutex_lock(&event_mutex);
- event_file = event_file_data(m->private);
+ event_file = event_file_file(m->private);
if (unlikely(!event_file)) {
ret = -ENODEV;
goto out_unlock;
@@ -5880,7 +5880,7 @@ static int hist_debug_show(struct seq_file *m, void *v)
mutex_lock(&event_mutex);
- event_file = event_file_data(m->private);
+ event_file = event_file_file(m->private);
if (unlikely(!event_file)) {
ret = -ENODEV;
goto out_unlock;
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index 8650562bdaa9..a8f076809db4 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -299,7 +299,7 @@ event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt,
strim(buf);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
+ file = event_file_file(filp);
if (file) {
call = file->event_call;
size = parse_entry(buf, call, &entry);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4bec043c8690..a5e3d6acf1e1 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -159,7 +159,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos)
/* ->stop() is called even if ->start() fails */
mutex_lock(&event_mutex);
- event_file = event_file_data(m->private);
+ event_file = event_file_file(m->private);
if (unlikely(!event_file))
return ERR_PTR(-ENODEV);
@@ -213,7 +213,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
mutex_lock(&event_mutex);
- if (unlikely(!event_file_data(file))) {
+ if (unlikely(!event_file_file(file))) {
mutex_unlock(&event_mutex);
return -ENODEV;
}
@@ -293,7 +293,7 @@ static ssize_t event_trigger_regex_write(struct file *file,
strim(buf);
mutex_lock(&event_mutex);
- event_file = event_file_data(file);
+ event_file = event_file_file(file);
if (unlikely(!event_file)) {
mutex_unlock(&event_mutex);
kfree(buf);
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 3a2b46847c8b..42b0d998d103 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -2885,7 +2885,7 @@ err:
return -ENODEV;
}
-static int set_max_user_events_sysctl(struct ctl_table *table, int write,
+static int set_max_user_events_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 62e6a8f4aae9..c62d1629cffe 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -21,6 +21,7 @@
#define FPROBE_EVENT_SYSTEM "fprobes"
#define TRACEPOINT_EVENT_SYSTEM "tracepoints"
#define RETHOOK_MAXACTIVE_MAX 4096
+#define TRACEPOINT_STUB ERR_PTR(-ENOENT)
static int trace_fprobe_create(const char *raw_command);
static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev);
@@ -385,6 +386,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
const char *event,
const char *symbol,
struct tracepoint *tpoint,
+ struct module *mod,
int maxactive,
int nargs, bool is_return)
{
@@ -405,6 +407,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
tf->fp.entry_handler = fentry_dispatcher;
tf->tpoint = tpoint;
+ tf->mod = mod;
tf->fp.nr_maxactive = maxactive;
ret = trace_probe_init(&tf->tp, event, group, false, nargs);
@@ -672,6 +675,24 @@ static int unregister_fprobe_event(struct trace_fprobe *tf)
return trace_probe_unregister_event_call(&tf->tp);
}
+static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf)
+{
+ struct tracepoint *tpoint = tf->tpoint;
+ unsigned long ip = (unsigned long)tpoint->probestub;
+ int ret;
+
+ /*
+	 * Enabling an fprobe on a tracepoint takes two steps:
+	 * first, register the __probestub_##TP function on the tracepoint,
+	 * then attach an fprobe to that stub function.
+ */
+ ret = tracepoint_probe_register_prio_may_exist(tpoint,
+ tpoint->probestub, NULL, 0);
+ if (ret < 0)
+ return ret;
+ return register_fprobe_ips(&tf->fp, &ip, 1);
+}
+
/* Internal register function - just handle fprobe and flags */
static int __register_trace_fprobe(struct trace_fprobe *tf)
{
@@ -698,18 +719,12 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
tf->fp.flags |= FPROBE_FL_DISABLED;
if (trace_fprobe_is_tracepoint(tf)) {
- struct tracepoint *tpoint = tf->tpoint;
- unsigned long ip = (unsigned long)tpoint->probestub;
- /*
- * Here, we do 2 steps to enable fprobe on a tracepoint.
- * At first, put __probestub_##TP function on the tracepoint
- * and put a fprobe on the stub function.
- */
- ret = tracepoint_probe_register_prio_may_exist(tpoint,
- tpoint->probestub, NULL, 0);
- if (ret < 0)
- return ret;
- return register_fprobe_ips(&tf->fp, &ip, 1);
+
+ /* This tracepoint is not loaded yet */
+ if (tf->tpoint == TRACEPOINT_STUB)
+ return 0;
+
+ return __regsiter_tracepoint_fprobe(tf);
}
/* TODO: handle filter, nofilter or symbol list */
@@ -862,20 +877,106 @@ end:
return ret;
}
+struct __find_tracepoint_cb_data {
+ const char *tp_name;
+ struct tracepoint *tpoint;
+ struct module *mod;
+};
+
+static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mod, void *priv)
+{
+ struct __find_tracepoint_cb_data *data = priv;
+
+ if (!data->tpoint && !strcmp(data->tp_name, tp->name)) {
+ data->tpoint = tp;
+ if (!data->mod) {
+ data->mod = mod;
+ if (!try_module_get(data->mod)) {
+ data->tpoint = NULL;
+ data->mod = NULL;
+ }
+ }
+ }
+}
+
+static void __find_tracepoint_cb(struct tracepoint *tp, void *priv)
+{
+ struct __find_tracepoint_cb_data *data = priv;
+
+ if (!data->tpoint && !strcmp(data->tp_name, tp->name))
+ data->tpoint = tp;
+}
+
+/*
+ * Find a tracepoint in the kernel or in a module. If the tracepoint is in a
+ * module, this increments the module refcount to prevent it from being
+ * unloaded until the trace_fprobe is registered on the trace_fprobe list.
+ * After that registration, the module refcount is dropped again because
+ * tracepoint_probe_module_cb will handle the module from then on.
+ */
+static struct tracepoint *find_tracepoint(const char *tp_name,
+ struct module **tp_mod)
+{
+ struct __find_tracepoint_cb_data data = {
+ .tp_name = tp_name,
+ .mod = NULL,
+ };
+
+ for_each_kernel_tracepoint(__find_tracepoint_cb, &data);
+
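+	/* Not found in the core kernel: fall back to searching loaded modules */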
+ if (!data.tpoint && IS_ENABLED(CONFIG_MODULES)) {
+ for_each_module_tracepoint(__find_tracepoint_module_cb, &data);
+ *tp_mod = data.mod;
+ }
+
+ return data.tpoint;
+}
+
#ifdef CONFIG_MODULES
+static void reenable_trace_fprobe(struct trace_fprobe *tf)
+{
+ struct trace_probe *tp = &tf->tp;
+
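+	/* Re-enable every trace_fprobe that shares this trace_probe event */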
+ list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+ __enable_trace_fprobe(tf);
+ }
+}
+
+static struct tracepoint *find_tracepoint_in_module(struct module *mod,
+ const char *tp_name)
+{
+ struct __find_tracepoint_cb_data data = {
+ .tp_name = tp_name,
+ .mod = mod,
+ };
+
+ for_each_tracepoint_in_module(mod, __find_tracepoint_module_cb, &data);
+ return data.tpoint;
+}
+
static int __tracepoint_probe_module_cb(struct notifier_block *self,
unsigned long val, void *data)
{
struct tp_module *tp_mod = data;
+ struct tracepoint *tpoint;
struct trace_fprobe *tf;
struct dyn_event *pos;
- if (val != MODULE_STATE_GOING)
+ if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING)
return NOTIFY_DONE;
mutex_lock(&event_mutex);
for_each_trace_fprobe(tf, pos) {
- if (tp_mod->mod == tf->mod) {
+ if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) {
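+			/* A stubbed probe may now resolve to a tracepoint in the incoming module */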
+ tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol);
+ if (tpoint) {
+ tf->tpoint = tpoint;
+ tf->mod = tp_mod->mod;
+				if (!WARN_ON_ONCE(__register_tracepoint_fprobe(tf)) &&
+ trace_probe_is_enabled(&tf->tp))
+ reenable_trace_fprobe(tf);
+ }
+ } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) {
tracepoint_probe_unregister(tf->tpoint,
tf->tpoint->probestub, NULL);
tf->tpoint = NULL;
@@ -892,30 +993,6 @@ static struct notifier_block tracepoint_module_nb = {
};
#endif /* CONFIG_MODULES */
-struct __find_tracepoint_cb_data {
- const char *tp_name;
- struct tracepoint *tpoint;
-};
-
-static void __find_tracepoint_cb(struct tracepoint *tp, void *priv)
-{
- struct __find_tracepoint_cb_data *data = priv;
-
- if (!data->tpoint && !strcmp(data->tp_name, tp->name))
- data->tpoint = tp;
-}
-
-static struct tracepoint *find_tracepoint(const char *tp_name)
-{
- struct __find_tracepoint_cb_data data = {
- .tp_name = tp_name,
- };
-
- for_each_kernel_tracepoint(__find_tracepoint_cb, &data);
-
- return data.tpoint;
-}
-
static int parse_symbol_and_return(int argc, const char *argv[],
char **symbol, bool *is_return,
bool is_tracepoint)
@@ -996,6 +1073,7 @@ static int __trace_fprobe_create(int argc, const char *argv[])
char abuf[MAX_BTF_ARGS_LEN];
char *dbuf = NULL;
bool is_tracepoint = false;
+ struct module *tp_mod = NULL;
struct tracepoint *tpoint = NULL;
struct traceprobe_parse_context ctx = {
.flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE,
@@ -1080,15 +1158,20 @@ static int __trace_fprobe_create(int argc, const char *argv[])
if (is_tracepoint) {
ctx.flags |= TPARG_FL_TPOINT;
- tpoint = find_tracepoint(symbol);
- if (!tpoint) {
+ tpoint = find_tracepoint(symbol, &tp_mod);
+ if (tpoint) {
+ ctx.funcname = kallsyms_lookup(
+ (unsigned long)tpoint->probestub,
+ NULL, NULL, NULL, sbuf);
+ } else if (IS_ENABLED(CONFIG_MODULES)) {
+ /* This *may* be loaded afterwards */
+ tpoint = TRACEPOINT_STUB;
+ ctx.funcname = symbol;
+ } else {
trace_probe_log_set_index(1);
trace_probe_log_err(0, NO_TRACEPOINT);
goto parse_error;
}
- ctx.funcname = kallsyms_lookup(
- (unsigned long)tpoint->probestub,
- NULL, NULL, NULL, sbuf);
} else
ctx.funcname = symbol;
@@ -1104,14 +1187,18 @@ static int __trace_fprobe_create(int argc, const char *argv[])
argc = new_argc;
argv = new_argv;
}
+ if (argc > MAX_TRACE_ARGS) {
+ ret = -E2BIG;
+ goto out;
+ }
ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
if (ret)
goto out;
/* setup a probe */
- tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive,
- argc, is_return);
+ tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod,
+ maxactive, argc, is_return);
if (IS_ERR(tf)) {
ret = PTR_ERR(tf);
/* This must return -ENOMEM, else there is a bug */
@@ -1119,12 +1206,8 @@ static int __trace_fprobe_create(int argc, const char *argv[])
goto out; /* We know tf is not allocated */
}
- if (is_tracepoint)
- tf->mod = __module_text_address(
- (unsigned long)tf->tpoint->probestub);
-
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
ctx.offset = 0;
ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx);
@@ -1155,6 +1238,8 @@ static int __trace_fprobe_create(int argc, const char *argv[])
}
out:
+ if (tp_mod)
+ module_put(tp_mod);
traceprobe_finish_parse(&ctx);
trace_probe_log_clear();
kfree(new_argv);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9f1bfbe105e8..3b0cea37e029 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -80,6 +80,7 @@ void ftrace_free_ftrace_ops(struct trace_array *tr)
int ftrace_create_function_files(struct trace_array *tr,
struct dentry *parent)
{
+ int ret;
/*
* The top level array uses the "global_ops", and the files are
* created on boot up.
@@ -90,6 +91,12 @@ int ftrace_create_function_files(struct trace_array *tr,
if (!tr->ops)
return -EINVAL;
+ ret = allocate_fgraph_ops(tr, tr->ops);
+ if (ret) {
+ kfree(tr->ops);
+ return ret;
+ }
+
ftrace_create_filter_files(tr->ops, parent);
return 0;
@@ -99,6 +106,7 @@ void ftrace_destroy_function_files(struct trace_array *tr)
{
ftrace_destroy_filter_files(tr->ops);
ftrace_free_ftrace_ops(tr);
+ free_fgraph_ops(tr);
}
static ftrace_func_t select_trace_function(u32 flags_val)
@@ -223,6 +231,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
long disabled;
int cpu;
unsigned int trace_ctx;
+ int skip = STACK_SKIP;
if (unlikely(!tr->function_enabled))
return;
@@ -239,7 +248,11 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (likely(disabled == 1)) {
trace_ctx = tracing_gen_ctx_flags(flags);
trace_function(tr, ip, parent_ip, trace_ctx);
- __trace_stack(tr, trace_ctx, STACK_SKIP);
+#ifdef CONFIG_UNWINDER_FRAME_POINTER
+ if (ftrace_pids_enabled(op))
+ skip++;
+#endif
+ __trace_stack(tr, trace_ctx, skip);
}
atomic_dec(&data->disabled);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index c35fbaab2a47..a569daaac4c4 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -83,8 +83,6 @@ static struct tracer_flags tracer_flags = {
.opts = trace_opts
};
-static struct trace_array *graph_array;
-
/*
* DURATION column is being also used to display IRQ signs,
* following values are used by print_graph_irq and others
@@ -129,9 +127,11 @@ static inline int ftrace_graph_ignore_irqs(void)
return in_hardirq();
}
-int trace_graph_entry(struct ftrace_graph_ent *trace)
+int trace_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
{
- struct trace_array *tr = graph_array;
+ unsigned long *task_var = fgraph_get_task_var(gops);
+ struct trace_array *tr = gops->private;
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
@@ -139,7 +139,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
int ret;
int cpu;
- if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
+ if (*task_var & TRACE_GRAPH_NOTRACE)
return 0;
/*
@@ -150,7 +150,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
* returning from the function.
*/
if (ftrace_graph_notrace_addr(trace->func)) {
- trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT);
+		*task_var |= TRACE_GRAPH_NOTRACE;
/*
* Need to return 1 to have the return called
* that will clear the NOTRACE bit.
@@ -161,7 +161,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
if (!ftrace_trace_task(tr))
return 0;
- if (ftrace_graph_ignore_func(trace))
+ if (ftrace_graph_ignore_func(gops, trace))
return 0;
if (ftrace_graph_ignore_irqs())
@@ -238,19 +238,21 @@ void __trace_graph_return(struct trace_array *tr,
trace_buffer_unlock_commit_nostack(buffer, event);
}
-void trace_graph_return(struct ftrace_graph_ret *trace)
+void trace_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
{
- struct trace_array *tr = graph_array;
+ unsigned long *task_var = fgraph_get_task_var(gops);
+ struct trace_array *tr = gops->private;
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
long disabled;
int cpu;
- ftrace_graph_addr_finish(trace);
+ ftrace_graph_addr_finish(gops, trace);
- if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
- trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
+ if (*task_var & TRACE_GRAPH_NOTRACE) {
+ *task_var &= ~TRACE_GRAPH_NOTRACE;
return;
}
@@ -266,18 +268,10 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
local_irq_restore(flags);
}
-void set_graph_array(struct trace_array *tr)
-{
- graph_array = tr;
-
- /* Make graph_array visible before we start tracing */
-
- smp_mb();
-}
-
-static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
+static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
{
- ftrace_graph_addr_finish(trace);
+ ftrace_graph_addr_finish(gops, trace);
if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
@@ -288,28 +282,60 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
(trace->rettime - trace->calltime < tracing_thresh))
return;
else
- trace_graph_return(trace);
+ trace_graph_return(trace, gops);
}
-static struct fgraph_ops funcgraph_thresh_ops = {
- .entryfunc = &trace_graph_entry,
- .retfunc = &trace_graph_thresh_return,
-};
-
static struct fgraph_ops funcgraph_ops = {
.entryfunc = &trace_graph_entry,
.retfunc = &trace_graph_return,
};
+int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
+{
+ struct fgraph_ops *gops;
+
+ gops = kzalloc(sizeof(*gops), GFP_KERNEL);
+ if (!gops)
+ return -ENOMEM;
+
+ gops->entryfunc = &trace_graph_entry;
+ gops->retfunc = &trace_graph_return;
+
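+	/* Link the instance and its fgraph_ops both ways so the callbacks can find the trace_array */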
+ tr->gops = gops;
+ gops->private = tr;
+
+ fgraph_init_ops(&gops->ops, ops);
+
+ return 0;
+}
+
+void free_fgraph_ops(struct trace_array *tr)
+{
+ kfree(tr->gops);
+}
+
+__init void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
+{
+ tr->gops = &funcgraph_ops;
+ funcgraph_ops.private = tr;
+ fgraph_init_ops(&tr->gops->ops, ops);
+}
+
static int graph_trace_init(struct trace_array *tr)
{
int ret;
- set_graph_array(tr);
+ tr->gops->entryfunc = trace_graph_entry;
+
if (tracing_thresh)
- ret = register_ftrace_graph(&funcgraph_thresh_ops);
+ tr->gops->retfunc = trace_graph_thresh_return;
else
- ret = register_ftrace_graph(&funcgraph_ops);
+ tr->gops->retfunc = trace_graph_return;
+
+	/* Make sure the gops function pointers are visible before we start tracing */
+ smp_mb();
+
+ ret = register_ftrace_graph(tr->gops);
if (ret)
return ret;
tracing_start_cmdline_record();
@@ -320,10 +346,7 @@ static int graph_trace_init(struct trace_array *tr)
static void graph_trace_reset(struct trace_array *tr)
{
tracing_stop_cmdline_record();
- if (tracing_thresh)
- unregister_ftrace_graph(&funcgraph_thresh_ops);
- else
- unregister_ftrace_graph(&funcgraph_ops);
+ unregister_ftrace_graph(tr->gops);
}
static int graph_trace_update_thresh(struct trace_array *tr)
@@ -521,6 +544,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
+ addr += iter->tr->text_delta;
+
if (addr < (unsigned long)__irqentry_text_start ||
addr >= (unsigned long)__irqentry_text_end)
return;
@@ -687,6 +712,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ret *graph_ret;
struct ftrace_graph_ent *call;
unsigned long long duration;
+ unsigned long func;
int cpu = iter->cpu;
int i;
@@ -694,6 +720,8 @@ print_graph_entry_leaf(struct trace_iterator *iter,
call = &entry->graph_ent;
duration = graph_ret->rettime - graph_ret->calltime;
+ func = call->func + iter->tr->text_delta;
+
if (data) {
struct fgraph_cpu_data *cpu_data;
@@ -724,10 +752,10 @@ print_graph_entry_leaf(struct trace_iterator *iter,
* enabled.
*/
if (flags & __TRACE_GRAPH_PRINT_RETVAL)
- print_graph_retval(s, graph_ret->retval, true, (void *)call->func,
+ print_graph_retval(s, graph_ret->retval, true, (void *)func,
!!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
else
- trace_seq_printf(s, "%ps();\n", (void *)call->func);
+ trace_seq_printf(s, "%ps();\n", (void *)func);
print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
cpu, iter->ent->pid, flags);
@@ -743,6 +771,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
struct trace_array *tr = iter->tr;
+ unsigned long func;
int i;
if (data) {
@@ -765,7 +794,9 @@ print_graph_entry_nested(struct trace_iterator *iter,
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
trace_seq_putc(s, ' ');
- trace_seq_printf(s, "%ps() {\n", (void *)call->func);
+ func = call->func + iter->tr->text_delta;
+
+ trace_seq_printf(s, "%ps() {\n", (void *)func);
if (trace_seq_has_overflowed(s))
return TRACE_TYPE_PARTIAL_LINE;
@@ -840,6 +871,8 @@ check_irq_entry(struct trace_iterator *iter, u32 flags,
int *depth_irq;
struct fgraph_data *data = iter->private;
+ addr += iter->tr->text_delta;
+
/*
* If we are either displaying irqs, or we got called as
* a graph event and private data does not exist,
@@ -967,11 +1000,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
unsigned long long duration = trace->rettime - trace->calltime;
struct fgraph_data *data = iter->private;
struct trace_array *tr = iter->tr;
+ unsigned long func;
pid_t pid = ent->pid;
int cpu = iter->cpu;
int func_match = 1;
int i;
+ func = trace->func + iter->tr->text_delta;
+
if (check_irq_return(iter, flags, trace->depth))
return TRACE_TYPE_HANDLED;
@@ -1010,7 +1046,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
* function-retval option is enabled.
*/
if (flags & __TRACE_GRAPH_PRINT_RETVAL) {
- print_graph_retval(s, trace->retval, false, (void *)trace->func,
+ print_graph_retval(s, trace->retval, false, (void *)func,
!!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
} else {
/*
@@ -1023,7 +1059,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
trace_seq_puts(s, "}\n");
else
- trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
+ trace_seq_printf(s, "} /* %ps */\n", (void *)func);
}
/* Overrun */
@@ -1362,6 +1398,7 @@ static struct tracer graph_trace __tracer_data = {
.print_header = print_graph_headers,
.flags = &tracer_flags,
.set_flag = func_graph_set_flag,
+ .allow_instances = true,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function_graph,
#endif
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index b791524a6536..3bd6071441ad 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -520,6 +520,8 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy)
if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU)
goto out_unlock;
+ if (!cpu_online(cpu))
+ goto out_unlock;
if (!cpumask_test_cpu(cpu, tr->tracing_cpumask))
goto out_unlock;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index ba37f768e2f2..fce064e20570 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -175,7 +175,8 @@ static int irqsoff_display_graph(struct trace_array *tr, int set)
return start_irqsoff_tracer(irqsoff_trace, set);
}
-static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
@@ -183,7 +184,7 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
unsigned int trace_ctx;
int ret;
- if (ftrace_graph_ignore_func(trace))
+ if (ftrace_graph_ignore_func(gops, trace))
return 0;
/*
* Do not trace a function if it's filtered by set_graph_notrace.
@@ -205,14 +206,15 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
-static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
- ftrace_graph_addr_finish(trace);
+ ftrace_graph_addr_finish(gops, trace);
if (!func_prolog_dec(tr, &data, &flags))
return;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 16383247bdbf..263fac44d3ca 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -678,6 +678,21 @@ end:
}
#ifdef CONFIG_MODULES
+static int validate_module_probe_symbol(const char *modname, const char *symbol);
+
+static int register_module_trace_kprobe(struct module *mod, struct trace_kprobe *tk)
+{
+ const char *p;
+ int ret = 0;
+
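+	/* For a MODULE:SYMBOL probe, re-check the symbol within the module before re-registering */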
+ p = strchr(trace_kprobe_symbol(tk), ':');
+ if (p)
+ ret = validate_module_probe_symbol(module_name(mod), p + 1);
+ if (!ret)
+ ret = __register_trace_kprobe(tk);
+ return ret;
+}
+
/* Module notifier call back, checking event on the module */
static int trace_kprobe_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
@@ -696,7 +711,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
if (trace_kprobe_within_module(tk, mod)) {
/* Don't need to check busy - this should have gone. */
__unregister_trace_kprobe(tk);
- ret = __register_trace_kprobe(tk);
+ ret = register_module_trace_kprobe(mod, tk);
if (ret)
pr_warn("Failed to re-register probe %s on %s: %d\n",
trace_probe_name(&tk->tp),
@@ -747,17 +762,81 @@ static int count_mod_symbols(void *data, const char *name, unsigned long unused)
return 0;
}
-static unsigned int number_of_same_symbols(char *func_name)
+static unsigned int number_of_same_symbols(const char *mod, const char *func_name)
{
struct sym_count_ctx ctx = { .count = 0, .name = func_name };
- kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
+ if (!mod)
+ kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
- module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx);
+ module_kallsyms_on_each_symbol(mod, count_mod_symbols, &ctx);
return ctx.count;
}
+static int validate_module_probe_symbol(const char *modname, const char *symbol)
+{
+ unsigned int count = number_of_same_symbols(modname, symbol);
+
+ if (count > 1) {
+ /*
+ * Users should use ADDR to remove the ambiguity of
+ * using KSYM only.
+ */
+ return -EADDRNOTAVAIL;
+ } else if (count == 0) {
+ /*
+		 * We can return -ENOENT earlier than when registering
+		 * the kprobe.
+ */
+ return -ENOENT;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_MODULES
+/* Return NULL if the module is not loaded or under unloading. */
+static struct module *try_module_get_by_name(const char *name)
+{
+ struct module *mod;
+
+ rcu_read_lock_sched();
+ mod = find_module(name);
+ if (mod && !try_module_get(mod))
+ mod = NULL;
+ rcu_read_unlock_sched();
+
+ return mod;
+}
+#else
+#define try_module_get_by_name(name) (NULL)
+#endif
+
+static int validate_probe_symbol(char *symbol)
+{
+ struct module *mod = NULL;
+ char *modname = NULL, *p;
+ int ret = 0;
+
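+	/* Split an optional "MODULE:" prefix off and pin that module while validating the symbol */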
+ p = strchr(symbol, ':');
+ if (p) {
+ modname = symbol;
+ symbol = p + 1;
+ *p = '\0';
+ mod = try_module_get_by_name(modname);
+ if (!mod)
+ goto out;
+ }
+
+ ret = validate_module_probe_symbol(modname, symbol);
+out:
+ if (p)
+ *p = ':';
+ if (mod)
+ module_put(mod);
+ return ret;
+}
+
static int trace_kprobe_entry_handler(struct kretprobe_instance *ri,
struct pt_regs *regs);
@@ -881,6 +960,14 @@ static int __trace_kprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_PROBE_ADDR);
goto parse_error;
}
+ ret = validate_probe_symbol(symbol);
+ if (ret) {
+ if (ret == -EADDRNOTAVAIL)
+ trace_probe_log_err(0, NON_UNIQ_SYMBOL);
+ else
+ trace_probe_log_err(0, BAD_PROBE_ADDR);
+ goto parse_error;
+ }
if (is_return)
ctx.flags |= TPARG_FL_RETURN;
ret = kprobe_on_func_entry(NULL, symbol, offset);
@@ -893,31 +980,6 @@ static int __trace_kprobe_create(int argc, const char *argv[])
}
}
- if (symbol && !strchr(symbol, ':')) {
- unsigned int count;
-
- count = number_of_same_symbols(symbol);
- if (count > 1) {
- /*
- * Users should use ADDR to remove the ambiguity of
- * using KSYM only.
- */
- trace_probe_log_err(0, NON_UNIQ_SYMBOL);
- ret = -EADDRNOTAVAIL;
-
- goto error;
- } else if (count == 0) {
- /*
- * We can return ENOENT earlier than when register the
- * kprobe.
- */
- trace_probe_log_err(0, BAD_PROBE_ADDR);
- ret = -ENOENT;
-
- goto error;
- }
- }
-
trace_probe_log_set_index(0);
if (event) {
ret = traceprobe_parse_event_name(&event, &group, gbuf,
@@ -951,6 +1013,10 @@ static int __trace_kprobe_create(int argc, const char *argv[])
argc = new_argc;
argv = new_argv;
}
+ if (argc > MAX_TRACE_ARGS) {
+ ret = -E2BIG;
+ goto out;
+ }
ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
if (ret)
@@ -967,7 +1033,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
}
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
ctx.offset = 0;
ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx);
@@ -1835,21 +1901,9 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
char *event;
if (func) {
- unsigned int count;
-
- count = number_of_same_symbols(func);
- if (count > 1)
- /*
- * Users should use addr to remove the ambiguity of
- * using func only.
- */
- return ERR_PTR(-EADDRNOTAVAIL);
- else if (count == 0)
- /*
- * We can return ENOENT earlier than when register the
- * kprobe.
- */
- return ERR_PTR(-ENOENT);
+ ret = validate_probe_symbol(func);
+ if (ret)
+ return ERR_PTR(ret);
}
/*
@@ -2023,19 +2077,16 @@ static __init int kprobe_trace_self_tests_init(void)
pr_info("Testing kprobe tracing: ");
ret = create_or_delete_trace_kprobe("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on probing function entry.\n");
+ if (WARN_ONCE(ret, "error on probing function entry.")) {
warn++;
} else {
/* Enable trace point */
tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting new probe.\n");
+		if (WARN_ONCE(tk == NULL, "error on getting new probe.")) {
warn++;
} else {
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
enable_trace_kprobe(
@@ -2044,19 +2095,16 @@ static __init int kprobe_trace_self_tests_init(void)
}
ret = create_or_delete_trace_kprobe("r:testprobe2 kprobe_trace_selftest_target $retval");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on probing function return.\n");
+ if (WARN_ONCE(ret, "error on probing function return.")) {
warn++;
} else {
/* Enable trace point */
tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting 2nd new probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on getting 2nd new probe.")) {
warn++;
} else {
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
enable_trace_kprobe(
@@ -2079,18 +2127,15 @@ static __init int kprobe_trace_self_tests_init(void)
/* Disable trace points before removing it */
tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting test probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on getting test probe.")) {
warn++;
} else {
- if (trace_kprobe_nhit(tk) != 1) {
- pr_warn("incorrect number of testprobe hits\n");
+ if (WARN_ONCE(trace_kprobe_nhit(tk) != 1,
+ "incorrect number of testprobe hits."))
warn++;
- }
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
disable_trace_kprobe(
@@ -2098,18 +2143,15 @@ static __init int kprobe_trace_self_tests_init(void)
}
tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting 2nd test probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on getting 2nd test probe.")) {
warn++;
} else {
- if (trace_kprobe_nhit(tk) != 1) {
- pr_warn("incorrect number of testprobe2 hits\n");
+ if (WARN_ONCE(trace_kprobe_nhit(tk) != 1,
+ "incorrect number of testprobe2 hits."))
warn++;
- }
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
disable_trace_kprobe(
@@ -2117,23 +2159,15 @@ static __init int kprobe_trace_self_tests_init(void)
}
ret = create_or_delete_trace_kprobe("-:testprobe");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on deleting a probe.\n");
+ if (WARN_ONCE(ret, "error on deleting a probe."))
warn++;
- }
ret = create_or_delete_trace_kprobe("-:testprobe2");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on deleting a probe.\n");
+ if (WARN_ONCE(ret, "error on deleting a probe."))
warn++;
- }
+
end:
- ret = dyn_events_release_all(&trace_kprobe_ops);
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on cleaning up probes.\n");
- warn++;
- }
/*
* Wait for the optimizer work to finish. Otherwise it might fiddle
* with probes in already freed __init text.
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index a8e28f9b9271..a50ed23bee77 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -228,6 +228,11 @@ static inline struct osnoise_variables *this_cpu_osn_var(void)
return this_cpu_ptr(&per_cpu_osnoise_var);
}
+/*
+ * Protect the interface.
+ */
+static struct mutex interface_lock;
+
#ifdef CONFIG_TIMERLAT_TRACER
/*
* Runtime information for the timer mode.
@@ -259,14 +264,20 @@ static inline void tlat_var_reset(void)
{
struct timerlat_variables *tlat_var;
int cpu;
+
+ /* Synchronize with the timerlat interfaces */
+ mutex_lock(&interface_lock);
/*
* So far, all the values are initialized as 0, so
* zeroing the structure is perfect.
*/
for_each_cpu(cpu, cpu_online_mask) {
tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
+ if (tlat_var->kthread)
+ hrtimer_cancel(&tlat_var->timer);
memset(tlat_var, 0, sizeof(*tlat_var));
}
+ mutex_unlock(&interface_lock);
}
#else /* CONFIG_TIMERLAT_TRACER */
#define tlat_var_reset() do {} while (0)
@@ -332,11 +343,6 @@ struct timerlat_sample {
#endif
/*
- * Protect the interface.
- */
-static struct mutex interface_lock;
-
-/*
* Tracer data.
*/
static struct osnoise_data {
@@ -1444,9 +1450,9 @@ static int run_osnoise(void)
save_osn_sample_stats(osn_var, &s);
/*
- * if threshold is 0, use the default value of 5 us.
+ * if threshold is 0, use the default value of 1 us.
*/
- threshold = tracing_thresh ? : 5000;
+ threshold = tracing_thresh ? : 1000;
/*
* Apply PREEMPT and IRQ disabled options.
@@ -1535,7 +1541,7 @@ static int run_osnoise(void)
* This will eventually cause unwarranted noise as PREEMPT_RCU
* will force preemption as the means of ending the current
* grace period. We avoid this problem by calling
- * rcu_momentary_dyntick_idle(), which performs a zero duration
+ * rcu_momentary_eqs(), which performs a zero duration
* EQS allowing PREEMPT_RCU to end the current grace period.
* This call shouldn't be wrapped inside an RCU critical
* section.
@@ -1547,7 +1553,7 @@ static int run_osnoise(void)
if (!disable_irq)
local_irq_disable();
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
if (!disable_irq)
local_irq_enable();
@@ -1612,6 +1618,7 @@ out:
static struct cpumask osnoise_cpumask;
static struct cpumask save_cpumask;
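+/* CPUs that currently have an osnoise/timerlat workload kthread running */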
+static struct cpumask kthread_cpumask;
/*
* osnoise_sleep - sleep until the next period
@@ -1675,6 +1682,7 @@ static inline int osnoise_migration_pending(void)
*/
mutex_lock(&interface_lock);
this_cpu_osn_var()->kthread = NULL;
+ cpumask_clear_cpu(smp_processor_id(), &kthread_cpumask);
mutex_unlock(&interface_lock);
return 1;
@@ -1945,11 +1953,12 @@ static void stop_kthread(unsigned int cpu)
{
struct task_struct *kthread;
- kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
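+	/* Take the kthread pointer atomically so concurrent stoppers cannot stop it twice */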
+ kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL);
if (kthread) {
- if (test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) &&
+ !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) {
kthread_stop(kthread);
- } else {
+ } else if (!WARN_ON(test_bit(OSN_WORKLOAD, &osnoise_options))) {
/*
* This is a user thread waiting on the timerlat_fd. We need
* to close all users, and the best way to guarantee this is
@@ -1958,7 +1967,6 @@ static void stop_kthread(unsigned int cpu)
kill_pid(kthread->thread_pid, SIGKILL, 1);
put_task_struct(kthread);
}
- per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
} else {
/* if no workload, just return */
if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
@@ -1967,7 +1975,6 @@ static void stop_kthread(unsigned int cpu)
*/
per_cpu(per_cpu_osnoise_var, cpu).sampling = false;
barrier();
- return;
}
}
}
@@ -1999,6 +2006,10 @@ static int start_kthread(unsigned int cpu)
void *main = osnoise_main;
char comm[24];
+ /* Do not start a new thread if it is already running */
+ if (per_cpu(per_cpu_osnoise_var, cpu).kthread)
+ return 0;
+
if (timerlat_enabled()) {
snprintf(comm, 24, "timerlat/%d", cpu);
main = timerlat_main;
@@ -2021,6 +2032,7 @@ static int start_kthread(unsigned int cpu)
}
per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread;
+ cpumask_set_cpu(cpu, &kthread_cpumask);
return 0;
}
@@ -2048,8 +2060,15 @@ static int start_per_cpu_kthreads(void)
*/
cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask);
- for_each_possible_cpu(cpu)
- per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+ for_each_possible_cpu(cpu) {
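+		/* Stop any kthread left over from a previous session before starting new ones */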
+ if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) {
+ struct task_struct *kthread;
+
+ kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL);
+ if (!WARN_ON(!kthread))
+ kthread_stop(kthread);
+ }
+ }
for_each_cpu(cpu, current_mask) {
retval = start_kthread(cpu);
@@ -2078,6 +2097,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
mutex_lock(&interface_lock);
cpus_read_lock();
+ if (!cpu_online(cpu))
+ goto out_unlock;
if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
goto out_unlock;
@@ -2579,7 +2600,8 @@ static int timerlat_fd_release(struct inode *inode, struct file *file)
osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
- hrtimer_cancel(&tlat_var->timer);
+ if (tlat_var->kthread)
+ hrtimer_cancel(&tlat_var->timer);
memset(tlat_var, 0, sizeof(*tlat_var));
osn_var->sampling = 0;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d8b302d01083..868f2f912f28 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -990,8 +990,11 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
}
static void print_fn_trace(struct trace_seq *s, unsigned long ip,
- unsigned long parent_ip, int flags)
+ unsigned long parent_ip, long delta, int flags)
{
+ ip += delta;
+ parent_ip += delta;
+
seq_print_ip_sym(s, ip, flags);
if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) {
@@ -1009,7 +1012,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, flags);
+ print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags);
trace_seq_putc(s, '\n');
return trace_handle_return(s);
@@ -1230,6 +1233,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
unsigned long *p;
unsigned long *end;
+ long delta = iter->tr->text_delta;
trace_assign_type(field, iter->ent);
end = (unsigned long *)((long)iter->ent + iter->ent_size);
@@ -1242,7 +1246,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
break;
trace_seq_puts(s, " => ");
- seq_print_ip_sym(s, *p, flags);
+ seq_print_ip_sym(s, (*p) + delta, flags);
trace_seq_putc(s, '\n');
}
@@ -1587,10 +1591,13 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
+ unsigned long ip;
trace_assign_type(field, iter->ent);
- seq_print_ip_sym(s, field->ip, flags);
+ ip = field->ip + iter->tr->text_delta;
+
+ seq_print_ip_sym(s, ip, flags);
trace_seq_printf(s, ": %s", field->buf);
return trace_handle_return(s);
@@ -1674,7 +1681,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, flags);
+ print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags);
trace_seq_printf(s, " (repeats: %u, last_ts:", field->count);
trace_print_time(s, iter,
iter->ts - FUNC_REPEATS_GET_DELTA_TS(field));
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 39877c80d6cb..16a5e368e7b7 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -276,7 +276,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
}
trace_probe_log_err(offset, NO_EVENT_NAME);
return -EINVAL;
- } else if (len > MAX_EVENT_NAME_LEN) {
+ } else if (len >= MAX_EVENT_NAME_LEN) {
trace_probe_log_err(offset, EVENT_TOO_LONG);
return -EINVAL;
}
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0469a04a355f..ae2ace5e515a 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -112,14 +112,15 @@ static int wakeup_display_graph(struct trace_array *tr, int set)
return start_func_tracer(tr, set);
}
-static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
int ret = 0;
- if (ftrace_graph_ignore_func(trace))
+ if (ftrace_graph_ignore_func(gops, trace))
return 0;
/*
* Do not trace a function if it's filtered by set_graph_notrace.
@@ -141,13 +142,14 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
-static void wakeup_graph_return(struct ftrace_graph_ret *trace)
+static void wakeup_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
- ftrace_graph_addr_finish(trace);
+ ftrace_graph_addr_finish(gops, trace);
if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return;
@@ -545,7 +547,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
* - wakeup_dl handles tasks belonging to sched_dl class only.
*/
if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
- (wakeup_rt && !dl_task(p) && !rt_task(p)) ||
+ (wakeup_rt && !rt_or_dl_task(p)) ||
(!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
return;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index e9c5058a8efd..1469dd8075fa 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -756,13 +756,275 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#define CHAR_NUMBER 123
+#define SHORT_NUMBER 12345
+#define WORD_NUMBER 1234567890
+#define LONG_NUMBER 1234567890123456789LL
+#define ERRSTR_BUFLEN 128
+
+struct fgraph_fixture {
+ struct fgraph_ops gops;
+ int store_size;
+ const char *store_type_name;
+ char error_str_buf[ERRSTR_BUFLEN];
+ char *error_str;
+};
+
+static __init int store_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
+{
+ struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
+ const char *type = fixture->store_type_name;
+ int size = fixture->store_size;
+ void *p;
+
+ p = fgraph_reserve_data(gops->idx, size);
+ if (!p) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Failed to reserve %s\n", type);
+ return 0;
+ }
+
+ switch (size) {
+ case 1:
+ *(char *)p = CHAR_NUMBER;
+ break;
+ case 2:
+ *(short *)p = SHORT_NUMBER;
+ break;
+ case 4:
+ *(int *)p = WORD_NUMBER;
+ break;
+ case 8:
+ *(long long *)p = LONG_NUMBER;
+ break;
+ }
+
+ return 1;
+}
+
+static __init void store_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
+{
+ struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
+ const char *type = fixture->store_type_name;
+ long long expect = 0;
+ long long found = -1;
+ int size;
+ char *p;
+
+ p = fgraph_retrieve_data(gops->idx, &size);
+ if (!p) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Failed to retrieve %s\n", type);
+ return;
+ }
+ if (fixture->store_size > size) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Retrieved size %d is smaller than expected %d\n",
+ size, (int)fixture->store_size);
+ return;
+ }
+
+ switch (fixture->store_size) {
+ case 1:
+ expect = CHAR_NUMBER;
+ found = *(char *)p;
+ break;
+ case 2:
+ expect = SHORT_NUMBER;
+ found = *(short *)p;
+ break;
+ case 4:
+ expect = WORD_NUMBER;
+ found = *(int *)p;
+ break;
+ case 8:
+ expect = LONG_NUMBER;
+ found = *(long long *)p;
+ break;
+ }
+
+ if (found != expect) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "%s returned not %lld but %lld\n", type, expect, found);
+ return;
+ }
+ fixture->error_str = NULL;
+}
+
+static int __init init_fgraph_fixture(struct fgraph_fixture *fixture)
+{
+ char *func_name;
+ int len;
+
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Failed to execute storage %s\n", fixture->store_type_name);
+ fixture->error_str = fixture->error_str_buf;
+
+ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+ len = strlen(func_name);
+
+ return ftrace_set_filter(&fixture->gops.ops, func_name, len, 1);
+}
+
+/* Test fgraph storage for each size */
+static int __init test_graph_storage_single(struct fgraph_fixture *fixture)
+{
+ int size = fixture->store_size;
+ int ret;
+
+ pr_cont("PASSED\n");
+ pr_info("Testing fgraph storage of %d byte%s: ", size, str_plural(size));
+
+ ret = init_fgraph_fixture(fixture);
+ if (ret && ret != -ENODEV) {
+ pr_cont("*Could not set filter* ");
+ return -1;
+ }
+
+ ret = register_ftrace_graph(&fixture->gops);
+ if (ret) {
+ pr_warn("Failed to init store_bytes fgraph tracing\n");
+ return -1;
+ }
+
+ DYN_FTRACE_TEST_NAME();
+
+ unregister_ftrace_graph(&fixture->gops);
+
+ if (fixture->error_str) {
+ pr_cont("*** %s ***", fixture->error_str);
+ return -1;
+ }
+
+ return 0;
+}
+
+static struct fgraph_fixture store_bytes[4] __initdata = {
+ [0] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 1,
+ .store_type_name = "byte",
+ },
+ [1] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 2,
+ .store_type_name = "short",
+ },
+ [2] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 4,
+ .store_type_name = "word",
+ },
+ [3] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 8,
+ .store_type_name = "long long",
+ },
+};
+
+static __init int test_graph_storage_multi(void)
+{
+ struct fgraph_fixture *fixture;
+ bool printed = false;
+ int i, j, ret;
+
+ pr_cont("PASSED\n");
+ pr_info("Testing multiple fgraph storage on a function: ");
+
+ for (i = 0; i < ARRAY_SIZE(store_bytes); i++) {
+ fixture = &store_bytes[i];
+ ret = init_fgraph_fixture(fixture);
+ if (ret && ret != -ENODEV) {
+ pr_cont("*Could not set filter* ");
+ printed = true;
+ goto out2;
+ }
+ }
+
+ for (j = 0; j < ARRAY_SIZE(store_bytes); j++) {
+ fixture = &store_bytes[j];
+ ret = register_ftrace_graph(&fixture->gops);
+ if (ret) {
+ pr_warn("Failed to init store_bytes fgraph tracing\n");
+ printed = true;
+ goto out1;
+ }
+ }
+
+ DYN_FTRACE_TEST_NAME();
+out1:
+ while (--j >= 0) {
+ fixture = &store_bytes[j];
+ unregister_ftrace_graph(&fixture->gops);
+
+ if (fixture->error_str && !printed) {
+ pr_cont("*** %s ***", fixture->error_str);
+ printed = true;
+ }
+ }
+out2:
+ while (--i >= 0) {
+ fixture = &store_bytes[i];
+ ftrace_free_filter(&fixture->gops.ops);
+
+ if (fixture->error_str && !printed) {
+ pr_cont("*** %s ***", fixture->error_str);
+ printed = true;
+ }
+ }
+ return printed ? -1 : 0;
+}
+
+/* Test the storage passed across function_graph entry and return */
+static __init int test_graph_storage(void)
+{
+ int ret;
+
+ ret = test_graph_storage_single(&store_bytes[0]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_single(&store_bytes[1]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_single(&store_bytes[2]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_single(&store_bytes[3]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_multi();
+ if (ret)
+ return ret;
+ return 0;
+}
+#else
+static inline int test_graph_storage(void) { return 0; }
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
/* Maximum number of functions to trace before diagnosing a hang */
#define GRAPH_MAX_FUNC_TEST 100000000
static unsigned int graph_hang_thresh;
/* Wrap the real function entry probe to avoid possible hanging */
-static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
+static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
{
/* This is harmlessly racy, we want to approximately detect a hang */
if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
@@ -776,7 +1038,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
return 0;
}
- return trace_graph_entry(trace);
+ return trace_graph_entry(trace, gops);
}
static struct fgraph_ops fgraph_ops __initdata = {
@@ -812,7 +1074,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
* to detect and recover from possible hangs
*/
tracing_reset_online_cpus(&tr->array_buffer);
- set_graph_array(tr);
+ fgraph_ops.private = tr;
ret = register_ftrace_graph(&fgraph_ops);
if (ret) {
warn_failed_init_tracer(trace, ret);
@@ -855,7 +1117,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
cond_resched();
tracing_reset_online_cpus(&tr->array_buffer);
- set_graph_array(tr);
+ fgraph_ops.private = tr;
/*
* Some archs *cough*PowerPC*cough* add characters to the
@@ -912,6 +1174,8 @@ trace_selftest_startup_function_graph(struct tracer *trace,
ftrace_set_global_filter(NULL, 0, 1);
#endif
+ ret = test_graph_storage();
+
/* Don't test dynamic tracing, the function tracer already did */
out:
/* Stop it if we failed */
@@ -1221,7 +1485,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
/* reset the max latency */
tr->max_latency = 0;
- while (p->on_rq) {
+ while (task_is_runnable(p)) {
/*
* Sleep to make sure the -deadline thread is asleep too.
* On virtual machines we can't rely on timings,
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 5a48dba912ea..7f9572a37333 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -514,7 +514,7 @@ static const struct file_operations stack_trace_filter_fops = {
#endif /* CONFIG_DYNAMIC_FTRACE */
int
-stack_trace_sysctl(struct ctl_table *table, int write, void *buffer,
+stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int was_enabled;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 9c581d6da843..785733245ead 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -564,6 +564,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
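+	/* 'regs' is the scratch pt_regs supplied by the caller; fill it with the current context for BPF */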
+ perf_fetch_caller_regs(regs);
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
for (i = 0; i < sys_data->nb_args; i++)
@@ -575,6 +576,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
+ struct pt_regs *fake_regs;
struct hlist_head *head;
unsigned long args[6];
bool valid_prog_array;
@@ -602,7 +604,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, NULL, &rctx);
+ rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
if (!rec)
return;
@@ -611,7 +613,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
if ((valid_prog_array &&
- !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
+ !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
@@ -666,6 +668,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
} __aligned(8) param;
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
+ perf_fetch_caller_regs(regs);
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
param.ret = rec->ret;
@@ -676,6 +679,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
+ struct pt_regs *fake_regs;
struct hlist_head *head;
bool valid_prog_array;
int syscall_nr;
@@ -701,7 +705,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, NULL, &rctx);
+ rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
if (!rec)
return;
@@ -709,7 +713,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->ret = syscall_get_return_value(current, regs);
if ((valid_prog_array &&
- !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
+ !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c98e3b3386ba..b30fc8fcd095 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -17,6 +17,7 @@
#include <linux/string.h>
#include <linux/rculist.h>
#include <linux/filter.h>
+#include <linux/percpu.h>
#include "trace_dynevent.h"
#include "trace_probe.h"
@@ -58,11 +59,11 @@ struct trace_uprobe {
struct dyn_event devent;
struct uprobe_consumer consumer;
struct path path;
- struct inode *inode;
char *filename;
+ struct uprobe *uprobe;
unsigned long offset;
unsigned long ref_ctr_offset;
- unsigned long nhit;
+ unsigned long __percpu *nhits;
struct trace_probe tp;
};
@@ -337,6 +338,12 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
if (!tu)
return ERR_PTR(-ENOMEM);
+ tu->nhits = alloc_percpu(unsigned long);
+ if (!tu->nhits) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
ret = trace_probe_init(&tu->tp, event, group, true, nargs);
if (ret < 0)
goto error;
@@ -349,6 +356,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
return tu;
error:
+ free_percpu(tu->nhits);
kfree(tu);
return ERR_PTR(ret);
@@ -362,6 +370,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
path_put(&tu->path);
trace_probe_cleanup(&tu->tp);
kfree(tu->filename);
+ free_percpu(tu->nhits);
kfree(tu);
}
@@ -556,6 +565,8 @@ static int __trace_uprobe_create(int argc, const char **argv)
if (argc < 2)
return -ECANCELED;
+ if (argc - 2 > MAX_TRACE_ARGS)
+ return -E2BIG;
if (argv[0][1] == ':')
event = &argv[0][2];
@@ -681,7 +692,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
tu->filename = filename;
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
struct traceprobe_parse_context ctx = {
.flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER,
};
@@ -815,13 +826,21 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
{
struct dyn_event *ev = v;
struct trace_uprobe *tu;
+ unsigned long nhits;
+ int cpu;
if (!is_trace_uprobe(ev))
return 0;
tu = to_trace_uprobe(ev);
+
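+	/* Sum the per-CPU hit counters into a single total for display */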
+ nhits = 0;
+ for_each_possible_cpu(cpu) {
+ nhits += per_cpu(*tu->nhits, cpu);
+ }
+
seq_printf(m, " %s %-44s %15lu\n", tu->filename,
- trace_probe_name(&tu->tp), tu->nhit);
+ trace_probe_name(&tu->tp), nhits);
return 0;
}
@@ -858,6 +877,7 @@ struct uprobe_cpu_buffer {
};
static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
static int uprobe_buffer_refcnt;
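+/* Cap the traced data so one uprobe event never exceeds a single page */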
+#define MAX_UCB_BUFFER_SIZE PAGE_SIZE
static int uprobe_buffer_init(void)
{
@@ -962,6 +982,11 @@ static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
ucb = uprobe_buffer_get();
ucb->dsize = tu->tp.size + dsize;
+ if (WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)) {
+ ucb->dsize = MAX_UCB_BUFFER_SIZE;
+ dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size;
+ }
+
store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
*ucbp = ucb;
@@ -981,9 +1006,6 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
WARN_ON(call != trace_file->event_call);
- if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE))
- return;
-
if (trace_trigger_soft_disabled(trace_file))
return;
@@ -1078,43 +1100,40 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
return trace_handle_return(s);
}
-typedef bool (*filter_func_t)(struct uprobe_consumer *self,
- enum uprobe_filter_ctx ctx,
- struct mm_struct *mm);
+typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm);
static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)
{
- int ret;
+ struct inode *inode = d_real_inode(tu->path.dentry);
+ struct uprobe *uprobe;
tu->consumer.filter = filter;
- tu->inode = d_real_inode(tu->path.dentry);
-
- if (tu->ref_ctr_offset)
- ret = uprobe_register_refctr(tu->inode, tu->offset,
- tu->ref_ctr_offset, &tu->consumer);
- else
- ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer);
+ if (IS_ERR(uprobe))
+ return PTR_ERR(uprobe);
- if (ret)
- tu->inode = NULL;
-
- return ret;
+ tu->uprobe = uprobe;
+ return 0;
}
static void __probe_event_disable(struct trace_probe *tp)
{
struct trace_uprobe *tu;
+ bool sync = false;
tu = container_of(tp, struct trace_uprobe, tp);
WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- if (!tu->inode)
+ if (!tu->uprobe)
continue;
- uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
- tu->inode = NULL;
+ uprobe_unregister_nosync(tu->uprobe, &tu->consumer);
+ sync = true;
+ tu->uprobe = NULL;
}
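+	/* One uprobe_unregister_sync() after the loop covers all the nosync unregistrations above */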
+ if (sync)
+ uprobe_unregister_sync();
}
static int probe_event_enable(struct trace_event_call *call,
@@ -1310,7 +1329,7 @@ static int uprobe_perf_close(struct trace_event_call *call,
return 0;
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+ ret = uprobe_apply(tu->uprobe, &tu->consumer, false);
if (ret)
break;
}
@@ -1334,7 +1353,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return 0;
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ err = uprobe_apply(tu->uprobe, &tu->consumer, true);
if (err) {
uprobe_perf_close(call, event);
break;
@@ -1344,8 +1363,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return err;
}
-static bool uprobe_perf_filter(struct uprobe_consumer *uc,
- enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+static bool uprobe_perf_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
{
struct trace_uprobe_filter *filter;
struct trace_uprobe *tu;
@@ -1431,7 +1449,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
struct uprobe_cpu_buffer **ucbp)
{
- if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
+ if (!uprobe_perf_filter(&tu->consumer, current->mm))
return UPROBE_HANDLER_REMOVE;
if (!is_ret_probe(tu))
@@ -1512,7 +1530,8 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
int ret = 0;
tu = container_of(con, struct trace_uprobe, consumer);
- tu->nhit++;
+
+ this_cpu_inc(*tu->nhits);
udd.tu = tu;
udd.bp_addr = instruction_pointer(regs);
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index a4dcf0f24352..3a56e7c8aa4f 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -454,7 +454,7 @@ static struct tracing_map_elt *get_free_elt(struct tracing_map *map)
struct tracing_map_elt *elt = NULL;
int idx;
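+	/* Saturate at max_elts so the element counter never wraps past the array */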
- idx = atomic_inc_return(&map->next_elt);
+ idx = atomic_fetch_add_unless(&map->next_elt, 1, map->max_elts);
if (idx < map->max_elts) {
elt = *(TRACING_MAP_ELT(map->elts, idx));
if (map->ops && map->ops->elt_init)
@@ -699,7 +699,7 @@ void tracing_map_clear(struct tracing_map *map)
{
unsigned int i;
- atomic_set(&map->next_elt, -1);
+ atomic_set(&map->next_elt, 0);
atomic64_set(&map->hits, 0);
atomic64_set(&map->drops, 0);
@@ -783,7 +783,7 @@ struct tracing_map *tracing_map_create(unsigned int map_bits,
map->map_bits = map_bits;
map->max_elts = (1 << map_bits);
- atomic_set(&map->next_elt, -1);
+ atomic_set(&map->next_elt, 0);
map->map_size = (1 << (map_bits + 1));
map->ops = ops;