aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/events/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/events/core.c')
-rw-r--r--kernel/events/core.c621
1 files changed, 548 insertions, 73 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 52f4a9e46704..4ce8a0885c2f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -49,6 +49,7 @@
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
+#include <linux/highmem.h>
#include "internal.h"
@@ -1134,6 +1135,11 @@ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
return 0;
}
+static int perf_mux_hrtimer_restart_ipi(void *arg)
+{
+ return perf_mux_hrtimer_restart(arg);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1706,28 +1712,34 @@ static inline void perf_event__state_init(struct perf_event *event)
PERF_EVENT_STATE_INACTIVE;
}
-static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
+static int __perf_event_read_size(u64 read_format, int nr_siblings)
{
int entry = sizeof(u64); /* value */
int size = 0;
int nr = 1;
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
size += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
size += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_ID)
+ if (read_format & PERF_FORMAT_ID)
+ entry += sizeof(u64);
+
+ if (read_format & PERF_FORMAT_LOST)
entry += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_GROUP) {
+ if (read_format & PERF_FORMAT_GROUP) {
nr += nr_siblings;
size += sizeof(u64);
}
- size += entry * nr;
- event->read_size = size;
+ /*
+ * Since perf_event_validate_size() limits this to 16k and inhibits
+ * adding more siblings, this will never overflow.
+ */
+ return size + nr * entry;
}
static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
@@ -1759,6 +1771,15 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
size += sizeof(data->phys_addr);
+ if (sample_type & PERF_SAMPLE_CGROUP)
+ size += sizeof(data->cgroup);
+
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ size += sizeof(data->data_page_size);
+
+ if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ size += sizeof(data->code_page_size);
+
event->header_size = size;
}
@@ -1768,8 +1789,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
*/
static void perf_event__header_size(struct perf_event *event)
{
- __perf_event_read_size(event,
- event->group_leader->nr_siblings);
+ event->read_size =
+ __perf_event_read_size(event->attr.read_format,
+ event->group_leader->nr_siblings);
__perf_event_header_size(event, event->attr.sample_type);
}
@@ -1800,23 +1822,44 @@ static void perf_event__id_header_size(struct perf_event *event)
event->id_header_size = size;
}
+/*
+ * Check that adding an event to the group does not result in anybody
+ * overflowing the 64k event limit imposed by the output buffer.
+ *
+ * Specifically, check that the read_size for the event does not exceed 16k,
+ * read_size being the one term that grows with groups size. Since read_size
+ * depends on per-event read_format, also (re)check the existing events.
+ *
+ * This leaves 48k for the constant size fields and things like callchains,
+ * branch stacks and register sets.
+ */
static bool perf_event_validate_size(struct perf_event *event)
{
- /*
- * The values computed here will be over-written when we actually
- * attach the event.
- */
- __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
- __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
- perf_event__id_header_size(event);
+ struct perf_event *sibling, *group_leader = event->group_leader;
+
+ if (__perf_event_read_size(event->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+
+ if (__perf_event_read_size(group_leader->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
/*
- * Sum the lot; should not exceed the 64k limit we have on records.
- * Conservative limit to allow for callchains and other variable fields.
+ * When creating a new group leader, group_leader->ctx is initialized
+ * after the size has been validated, but we cannot safely use
+ * for_each_sibling_event() until group_leader->ctx is set. A new group
+ * leader cannot have any siblings yet, so we can safely skip checking
+ * the non-existent siblings.
*/
- if (event->read_size + event->header_size +
- event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
- return false;
+ if (event == group_leader)
+ return true;
+
+ for_each_sibling_event(sibling, group_leader) {
+ if (__perf_event_read_size(sibling->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+ }
return true;
}
@@ -1844,6 +1887,7 @@ static void perf_group_attach(struct perf_event *event)
list_add_tail(&event->sibling_list, &group_leader->sibling_list);
group_leader->nr_siblings++;
+ group_leader->group_generation++;
perf_event__header_size(group_leader);
@@ -1947,6 +1991,11 @@ static void perf_put_aux_event(struct perf_event *event)
}
}
+static bool perf_need_aux_event(struct perf_event *event)
+{
+ return !!event->attr.aux_output || !!event->attr.aux_sample_size;
+}
+
static int perf_get_aux_event(struct perf_event *event,
struct perf_event *group_leader)
{
@@ -1959,7 +2008,17 @@ static int perf_get_aux_event(struct perf_event *event,
if (!group_leader)
return 0;
- if (!perf_aux_output_match(event, group_leader))
+ /*
+ * aux_output and aux_sample_size are mutually exclusive.
+ */
+ if (event->attr.aux_output && event->attr.aux_sample_size)
+ return 0;
+
+ if (event->attr.aux_output &&
+ !perf_aux_output_match(event, group_leader))
+ return 0;
+
+ if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
return 0;
if (!atomic_long_inc_not_zero(&group_leader->refcount))
@@ -1976,8 +2035,24 @@ static int perf_get_aux_event(struct perf_event *event,
return 1;
}
+/*
+ * Events that have PERF_EV_CAP_SIBLING require being part of a group and
+ * cannot exist on their own, schedule them out and move them into the ERROR
+ * state. Also see _perf_event_enable(), it will not be able to recover
+ * this ERROR state.
+ */
+static inline void perf_remove_sibling_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ event_sched_out(event, cpuctx, ctx);
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+}
+
static void perf_group_detach(struct perf_event *event)
{
+ struct perf_event *leader = event->group_leader;
struct perf_event *sibling, *tmp;
struct perf_event_context *ctx = event->ctx;
@@ -1996,9 +2071,10 @@ static void perf_group_detach(struct perf_event *event)
/*
* If this is a sibling, remove it from its group.
*/
- if (event->group_leader != event) {
+ if (leader != event) {
list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
+ event->group_leader->group_generation++;
goto out;
}
@@ -2009,6 +2085,9 @@ static void perf_group_detach(struct perf_event *event)
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+ if (sibling->event_caps & PERF_EV_CAP_SIBLING)
+ perf_remove_sibling_event(sibling);
+
sibling->group_leader = sibling;
list_del_init(&sibling->sibling_list);
@@ -2030,10 +2109,10 @@ static void perf_group_detach(struct perf_event *event)
}
out:
- perf_event__header_size(event->group_leader);
-
- for_each_sibling_event(tmp, event->group_leader)
+ for_each_sibling_event(tmp, leader)
perf_event__header_size(tmp);
+
+ perf_event__header_size(leader);
}
static bool is_orphaned_event(struct perf_event *event)
@@ -2802,6 +2881,7 @@ static void _perf_event_enable(struct perf_event *event)
raw_spin_lock_irq(&ctx->lock);
if (event->state >= PERF_EVENT_STATE_INACTIVE ||
event->state < PERF_EVENT_STATE_ERROR) {
+out:
raw_spin_unlock_irq(&ctx->lock);
return;
}
@@ -2813,8 +2893,16 @@ static void _perf_event_enable(struct perf_event *event)
* has gone back into error state, as distinct from the task having
* been scheduled away before the cross-call arrived.
*/
- if (event->state == PERF_EVENT_STATE_ERROR)
+ if (event->state == PERF_EVENT_STATE_ERROR) {
+ /*
+ * Detached SIBLING events cannot leave ERROR state.
+ */
+ if (event->event_caps & PERF_EV_CAP_SIBLING &&
+ event->group_leader == event)
+ goto out;
+
event->state = PERF_EVENT_STATE_OFF;
+ }
raw_spin_unlock_irq(&ctx->lock);
event_function_call(event, __perf_event_enable, NULL);
@@ -4849,7 +4937,7 @@ static int __perf_read_group_add(struct perf_event *leader,
u64 read_format, u64 *values)
{
struct perf_event_context *ctx = leader->ctx;
- struct perf_event *sub;
+ struct perf_event *sub, *parent;
unsigned long flags;
int n = 1; /* skip @nr */
int ret;
@@ -4859,6 +4947,33 @@ static int __perf_read_group_add(struct perf_event *leader,
return ret;
raw_spin_lock_irqsave(&ctx->lock, flags);
+ /*
+ * Verify the grouping between the parent and child (inherited)
+ * events is still in tact.
+ *
+ * Specifically:
+ * - leader->ctx->lock pins leader->sibling_list
+ * - parent->child_mutex pins parent->child_list
+ * - parent->ctx->mutex pins parent->sibling_list
+ *
+ * Because parent->ctx != leader->ctx (and child_list nests inside
+ * ctx->mutex), group destruction is not atomic between children, also
+ * see perf_event_release_kernel(). Additionally, parent can grow the
+ * group.
+ *
+ * Therefore it is possible to have parent and child groups in a
+ * different configuration and summing over such a beast makes no sense
+ * what so ever.
+ *
+ * Reject this.
+ */
+ parent = leader->parent;
+ if (parent &&
+ (parent->group_generation != leader->group_generation ||
+ parent->nr_siblings != leader->nr_siblings)) {
+ ret = -ECHILD;
+ goto unlock;
+ }
/*
* Since we co-schedule groups, {enabled,running} times of siblings
@@ -4881,15 +4996,20 @@ static int __perf_read_group_add(struct perf_event *leader,
values[n++] += perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
}
+unlock:
raw_spin_unlock_irqrestore(&ctx->lock, flags);
- return 0;
+ return ret;
}
static int perf_read_group(struct perf_event *event,
@@ -4908,10 +5028,6 @@ static int perf_read_group(struct perf_event *event,
values[0] = 1 + leader->nr_siblings;
- /*
- * By locking the child_mutex of the leader we effectively
- * lock the child list of all siblings.. XXX explain how.
- */
mutex_lock(&leader->child_mutex);
ret = __perf_read_group_add(leader, read_format, values);
@@ -4942,7 +5058,7 @@ static int perf_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
u64 enabled, running;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = __perf_event_read_value(event, &enabled, &running);
@@ -4952,6 +5068,8 @@ static int perf_read_one(struct perf_event *event,
values[n++] = running;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
if (copy_to_user(buf, values, n * sizeof(u64)))
return -EFAULT;
@@ -5819,10 +5937,10 @@ again:
if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
/*
- * Raced against perf_mmap_close() through
- * perf_event_set_output(). Try again, hope for better
- * luck.
+ * Raced against perf_mmap_close(); remove the
+ * event and try again.
*/
+ ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
goto again;
}
@@ -6206,6 +6324,122 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
}
}
+static unsigned long perf_prepare_sample_aux(struct perf_event *event,
+ struct perf_sample_data *data,
+ size_t size)
+{
+ struct perf_event *sampler = event->aux_event;
+ struct ring_buffer *rb;
+
+ data->aux_size = 0;
+
+ if (!sampler)
+ goto out;
+
+ if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
+ goto out;
+
+ if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
+ goto out;
+
+ rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ if (!rb)
+ goto out;
+
+ /*
+ * If this is an NMI hit inside sampling code, don't take
+ * the sample. See also perf_aux_sample_output().
+ */
+ if (READ_ONCE(rb->aux_in_sampling)) {
+ data->aux_size = 0;
+ } else {
+ size = min_t(size_t, size, perf_aux_size(rb));
+ data->aux_size = ALIGN(size, sizeof(u64));
+ }
+ ring_buffer_put(rb);
+
+out:
+ return data->aux_size;
+}
+
+long perf_pmu_snapshot_aux(struct ring_buffer *rb,
+ struct perf_event *event,
+ struct perf_output_handle *handle,
+ unsigned long size)
+{
+ unsigned long flags;
+ long ret;
+
+ /*
+ * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
+ * paths. If we start calling them in NMI context, they may race with
+ * the IRQ ones, that is, for example, re-starting an event that's just
+ * been stopped, which is why we're using a separate callback that
+ * doesn't change the event state.
+ *
+ * IRQs need to be disabled to prevent IPIs from racing with us.
+ */
+ local_irq_save(flags);
+ /*
+ * Guard against NMI hits inside the critical section;
+ * see also perf_prepare_sample_aux().
+ */
+ WRITE_ONCE(rb->aux_in_sampling, 1);
+ barrier();
+
+ ret = event->pmu->snapshot_aux(event, handle, size);
+
+ barrier();
+ WRITE_ONCE(rb->aux_in_sampling, 0);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void perf_aux_sample_output(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *data)
+{
+ struct perf_event *sampler = event->aux_event;
+ unsigned long pad;
+ struct ring_buffer *rb;
+ long size;
+
+ if (WARN_ON_ONCE(!sampler || !data->aux_size))
+ return;
+
+ rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ if (!rb)
+ return;
+
+ size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
+
+ /*
+ * An error here means that perf_output_copy() failed (returned a
+ * non-zero surplus that it didn't copy), which in its current
+ * enlightened implementation is not possible. If that changes, we'd
+ * like to know.
+ */
+ if (WARN_ON_ONCE(size < 0))
+ goto out_put;
+
+ /*
+ * The pad comes from ALIGN()ing data->aux_size up to u64 in
+ * perf_prepare_sample_aux(), so should not be more than that.
+ */
+ pad = data->aux_size - size;
+ if (WARN_ON_ONCE(pad >= sizeof(u64)))
+ pad = 8;
+
+ if (pad) {
+ u64 zero = 0;
+ perf_output_copy(handle, &zero, pad);
+ }
+
+out_put:
+ ring_buffer_put(rb);
+}
+
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -6281,7 +6515,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 enabled, u64 running)
{
u64 read_format = event->attr.read_format;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = perf_event_count(event);
@@ -6295,6 +6529,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
@@ -6305,7 +6541,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
- u64 values[5];
+ u64 values[6];
int n = 0;
values[n++] = 1 + leader->nr_siblings;
@@ -6323,6 +6559,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
@@ -6336,6 +6574,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
@@ -6525,6 +6765,22 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
perf_output_put(handle, data->phys_addr);
+ if (sample_type & PERF_SAMPLE_CGROUP)
+ perf_output_put(handle, data->cgroup);
+
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ perf_output_put(handle, data->data_page_size);
+
+ if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ perf_output_put(handle, data->code_page_size);
+
+ if (sample_type & PERF_SAMPLE_AUX) {
+ perf_output_put(handle, data->aux_size);
+
+ if (data->aux_size)
+ perf_aux_sample_output(event, handle, data);
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -6575,6 +6831,121 @@ static u64 perf_virt_to_phys(u64 virt)
return phys_addr;
}
+#ifdef CONFIG_MMU
+
+/*
+ * Return the MMU page size of a given virtual address.
+ *
+ * This generic implementation handles page-table aligned huge pages, as well
+ * as non-page-table aligned hugetlbfs compound pages.
+ *
+ * If an architecture supports and uses non-page-table aligned pages in their
+ * kernel mapping it will need to provide it's own implementation of this
+ * function.
+ */
+__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+{
+ struct page *page;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_none(*pgd))
+ return 0;
+
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
+ return 0;
+
+ if (p4d_leaf(*p4d))
+ return 1ULL << P4D_SHIFT;
+
+ pud = pud_offset(p4d, addr);
+ if (!pud_present(*pud))
+ return 0;
+
+ if (pud_leaf(*pud)) {
+#ifdef pud_page
+ page = pud_page(*pud);
+ if (PageHuge(page))
+ return page_size(compound_head(page));
+#endif
+ return 1ULL << PUD_SHIFT;
+ }
+
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ return 0;
+
+ if (pmd_leaf(*pmd)) {
+#ifdef pmd_page
+ page = pmd_page(*pmd);
+ if (PageHuge(page))
+ return page_size(compound_head(page));
+#endif
+ return 1ULL << PMD_SHIFT;
+ }
+
+ pte = pte_offset_map(pmd, addr);
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return 0;
+ }
+
+ page = pte_page(*pte);
+ if (PageHuge(page)) {
+ u64 size = page_size(compound_head(page));
+ pte_unmap(pte);
+ return size;
+ }
+
+ pte_unmap(pte);
+ return PAGE_SIZE;
+}
+
+#else
+
+static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+{
+ return 0;
+}
+
+#endif
+
+static u64 perf_get_page_size(unsigned long addr)
+{
+ struct mm_struct *mm;
+ unsigned long flags;
+ u64 size;
+
+ if (!addr)
+ return 0;
+
+ /*
+ * Software page-table walkers must disable IRQs,
+ * which prevents any tear down of the page tables.
+ */
+ local_irq_save(flags);
+
+ mm = current->mm;
+ if (!mm) {
+ /*
+ * For kernel threads and the like, use init_mm so that
+ * we can find kernel memory.
+ */
+ mm = &init_mm;
+ }
+
+ size = arch_perf_get_page_size(mm, addr);
+
+ local_irq_restore(flags);
+
+ return size;
+}
+
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
struct perf_callchain_entry *
@@ -6610,7 +6981,7 @@ void perf_prepare_sample(struct perf_event_header *header,
__perf_event_header__init_id(header, data, event);
- if (sample_type & PERF_SAMPLE_IP)
+ if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
data->ip = perf_instruction_pointer(regs);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
@@ -6716,6 +7087,56 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
+
+#ifdef CONFIG_CGROUP_PERF
+ if (sample_type & PERF_SAMPLE_CGROUP) {
+ struct cgroup *cgrp;
+
+ /* protected by RCU */
+ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
+ data->cgroup = cgroup_get_kernfs_id(cgrp);
+ }
+#endif
+
+ /*
+ * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't
+ * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
+ * but the value will not dump to the userspace.
+ */
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ data->data_page_size = perf_get_page_size(data->addr);
+
+ if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ data->code_page_size = perf_get_page_size(data->ip);
+
+ if (sample_type & PERF_SAMPLE_AUX) {
+ u64 size;
+
+ header->size += sizeof(u64); /* size */
+
+ /*
+ * Given the 16bit nature of header::size, an AUX sample can
+ * easily overflow it, what with all the preceding sample bits.
+ * Make sure this doesn't happen by using up to U16_MAX bytes
+ * per sample in total (rounded down to 8 byte boundary).
+ */
+ size = min_t(size_t, U16_MAX - header->size,
+ event->attr.aux_sample_size);
+ size = rounddown(size, 8);
+ size = perf_prepare_sample_aux(event, data, size);
+
+ WARN_ON_ONCE(size + header->size > U16_MAX);
+ header->size += size;
+ }
+ /*
+ * If you're adding more sample types here, you likely need to do
+ * something about the overflowing header::size, like repurpose the
+ * lowest 3 bits of size, which should be always zero at the moment.
+ * This raises a more important question, do we really need 512k sized
+ * samples and why, so good argumentation is in order for whatever you
+ * do here next.
+ */
+ WARN_ON_ONCE(header->size & 7);
}
static __always_inline int
@@ -8166,8 +8587,8 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
hwc->interrupts = 1;
} else {
hwc->interrupts++;
- if (unlikely(throttle
- && hwc->interrupts >= max_samples_per_tick)) {
+ if (unlikely(throttle &&
+ hwc->interrupts > max_samples_per_tick)) {
__this_cpu_inc(perf_throttled_count);
tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
hwc->interrupts = MAX_INTERRUPTS;
@@ -9997,8 +10418,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- cpu_function_call(cpu,
- (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
+ cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpuctx);
}
cpus_read_unlock();
mutex_unlock(&mux_interval_mutex);
@@ -10010,9 +10430,30 @@ static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
+ &dev_attr_nr_addr_filters.attr,
+ NULL,
+};
+
+static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ if (n == 2 && !pmu->nr_addr_filters)
+ return 0;
+
+ return a->mode;
+}
+
+static struct attribute_group pmu_dev_attr_group = {
+ .is_visible = pmu_dev_is_visible,
+ .attrs = pmu_dev_attrs,
+};
+
+static const struct attribute_group *pmu_dev_groups[] = {
+ &pmu_dev_attr_group,
NULL,
};
-ATTRIBUTE_GROUPS(pmu_dev);
static int pmu_bus_running;
static struct bus_type pmu_bus = {
@@ -10035,29 +10476,24 @@ static int pmu_dev_alloc(struct pmu *pmu)
pmu->dev->groups = pmu->attr_groups;
device_initialize(pmu->dev);
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
- if (ret)
- goto free_dev;
dev_set_drvdata(pmu->dev, pmu);
pmu->dev->bus = &pmu_bus;
pmu->dev->release = pmu_dev_release;
- ret = device_add(pmu->dev);
+
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
if (ret)
goto free_dev;
- /* For PMUs with address filters, throw in an extra attribute: */
- if (pmu->nr_addr_filters)
- ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
-
+ ret = device_add(pmu->dev);
if (ret)
- goto del_dev;
+ goto free_dev;
- if (pmu->attr_update)
+ if (pmu->attr_update) {
ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
-
- if (ret)
- goto del_dev;
+ if (ret)
+ goto del_dev;
+ }
out:
return ret;
@@ -10685,7 +11121,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
attr->size = size;
- if (attr->__reserved_1 || attr->__reserved_2)
+ if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
return -EINVAL;
if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -10754,6 +11190,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
ret = perf_reg_validate(attr->sample_regs_intr);
+
+#ifndef CONFIG_CGROUP_PERF
+ if (attr->sample_type & PERF_SAMPLE_CGROUP)
+ return -EINVAL;
+#endif
+
out:
return ret;
@@ -10763,14 +11205,25 @@ err_size:
goto out;
}
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ mutex_lock(a);
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
struct ring_buffer *rb = NULL;
int ret = -EINVAL;
- if (!output_event)
+ if (!output_event) {
+ mutex_lock(&event->mmap_mutex);
goto set;
+ }
/* don't allow circular references */
if (event == output_event)
@@ -10785,7 +11238,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
/*
* If its not a per-cpu rb, it must be the same task.
*/
- if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
goto out;
/*
@@ -10808,8 +11261,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
event->pmu != output_event->pmu)
goto out;
+ /*
+ * Hold both mmap_mutex to serialize against perf_mmap_close(). Since
+ * output_event is already on rb->event_list, and the list iteration
+ * restarts after every removal, it is guaranteed this new event is
+ * observed *OR* if output_event is already removed, it's guaranteed we
+ * observe !rb->mmap_count.
+ */
+ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
- mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
if (atomic_read(&event->mmap_count))
goto unlock;
@@ -10819,6 +11279,12 @@ set:
rb = ring_buffer_get(output_event);
if (!rb)
goto unlock;
+
+ /* did we race against perf_mmap_close() */
+ if (!atomic_read(&rb->mmap_count)) {
+ ring_buffer_put(rb);
+ goto unlock;
+ }
}
ring_buffer_attach(event, rb);
@@ -10826,20 +11292,13 @@ set:
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
+ if (output_event)
+ mutex_unlock(&output_event->mmap_mutex);
out:
return ret;
}
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
-{
- if (b < a)
- swap(a, b);
-
- mutex_lock(a);
- mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
-}
-
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
bool nmi_safe = false;
@@ -10923,7 +11382,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event *group_leader = NULL, *output_event = NULL;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx, *uninitialized_var(gctx);
+ struct perf_event_context *ctx, *gctx;
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
@@ -11114,6 +11573,9 @@ SYSCALL_DEFINE5(perf_event_open,
* Do not allow to attach to a group in a different task
* or CPU context. If we're moving SW events, we'll fix
* this up later, so allow that.
+ *
+ * Racy, not holding group_leader->ctx->mutex, see comment with
+ * perf_event_ctx_lock().
*/
if (!move_group && group_leader->ctx != ctx)
goto err_context;
@@ -11181,6 +11643,7 @@ SYSCALL_DEFINE5(perf_event_open,
} else {
perf_event_ctx_unlock(group_leader, gctx);
move_group = 0;
+ goto not_move_group;
}
}
@@ -11197,7 +11660,17 @@ SYSCALL_DEFINE5(perf_event_open,
}
} else {
mutex_lock(&ctx->mutex);
+
+ /*
+ * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
+ * see the group_leader && !move_group test earlier.
+ */
+ if (group_leader && group_leader->ctx != ctx) {
+ err = -EINVAL;
+ goto err_locked;
+ }
}
+not_move_group:
if (ctx->task == TASK_TOMBSTONE) {
err = -ESRCH;
@@ -11225,7 +11698,7 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
- if (event->attr.aux_output && !perf_get_aux_event(event, group_leader)) {
+ if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
err = -EINVAL;
goto err_locked;
}
@@ -11952,6 +12425,8 @@ static int inherit_group(struct perf_event *parent_event,
!perf_get_aux_event(child_ctr, leader))
return -EINVAL;
}
+ if (leader)
+ leader->group_generation = parent_event->group_generation;
return 0;
}