perf_counter: Add event overflow handling
Alternative method of mmap() data output handling that provides better overflow management and a more reliable data stream.

Unlike the previous method, which had no user->kernel feedback and relied on userspace keeping up, this method relies on userspace writing its last read position into the control page. This ensures new output does not overwrite not-yet-read events; new events for which there is no space left are lost, and the overflow counter is incremented, providing exact event-loss numbers.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Parent: d3a9262e59
Commit: 43a21ea81a
2 changed files with 162 additions and 71 deletions
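The control-page protocol described in the commit message can be exercised from userspace roughly as follows. This is a minimal sketch, not code from the patch: it assumes the counter fd has already been mmap()ed with PROT_READ|PROT_WRITE so that data_head/data_tail (the fields touched by the first hunk below) and the data pages are visible, process_record() is a hypothetical callback, and __sync_synchronize() stands in for the rmb()/mb() that the in-tree comment asks userspace to issue.

#include <stdint.h>
#include <stddef.h>

/* Mirrors struct perf_event_header from <linux/perf_counter.h>. */
struct perf_event_header {
        uint32_t type;
        uint16_t misc;
        uint16_t size;
};

/* Hypothetical user-supplied consumer; not part of the patch. */
extern void process_record(const struct perf_event_header *hdr);

/*
 * Drain every complete record between data_tail and data_head, then
 * publish the new tail so the kernel may reuse the space.  data_size
 * is nr_pages * PAGE_SIZE and must be a power of two.
 */
static void drain_ring(volatile uint64_t *data_head,
                       volatile uint64_t *data_tail,
                       const unsigned char *data, size_t data_size)
{
        uint64_t head = *data_head;
        uint64_t tail = *data_tail;
        union {
                struct perf_event_header hdr;
                unsigned char bytes[4096];
        } rec;

        __sync_synchronize();           /* the rmb() the control-page comment asks for */

        while (tail + sizeof(rec.hdr) <= head) {
                size_t off = tail & (data_size - 1);
                size_t i, len;

                /* copy byte-wise so records that wrap the buffer end stay intact */
                for (i = 0; i < sizeof(rec.hdr); i++)
                        rec.bytes[i] = data[(off + i) & (data_size - 1)];
                len = rec.hdr.size;
                if (len < sizeof(rec.hdr) || len > sizeof(rec.bytes) ||
                    tail + len > head)
                        break;          /* malformed or incomplete record */

                for (; i < len; i++)
                        rec.bytes[i] = data[(off + i) & (data_size - 1)];
                process_record(&rec.hdr);

                tail += len;
        }

        __sync_synchronize();           /* finish reads before exposing the new tail */
        *data_tail = tail;              /* kernel will not overwrite unread data */
}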
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -236,10 +236,16 @@ struct perf_counter_mmap_page {
 	/*
 	 * Control data for the mmap() data buffer.
 	 *
-	 * User-space reading this value should issue an rmb(), on SMP capable
-	 * platforms, after reading this value -- see perf_counter_wakeup().
+	 * User-space reading the @data_head value should issue an rmb(), on
+	 * SMP capable platforms, after reading this value -- see
+	 * perf_counter_wakeup().
+	 *
+	 * When the mapping is PROT_WRITE the @data_tail value should be
+	 * written by userspace to reflect the last read data. In this case
+	 * the kernel will not over-write unread data.
 	 */
 	__u64	data_head;		/* head in the data section */
+	__u64	data_tail;		/* user-space written tail */
 };
 
 #define PERF_EVENT_MISC_CPUMODE_MASK		(3 << 0)
@@ -273,6 +279,15 @@ enum perf_event_type {
 	 */
 	PERF_EVENT_MMAP			= 1,
 
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				id;
+	 *	u64				lost;
+	 * };
+	 */
+	PERF_EVENT_LOST			= 2,
+
 	/*
 	 * struct {
 	 *	struct perf_event_header	header;
@@ -313,26 +328,26 @@ enum perf_event_type {
 
 	/*
 	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
-	 * will be PERF_RECORD_*
+	 * will be PERF_SAMPLE_*
 	 *
 	 * struct {
 	 *	struct perf_event_header	header;
 	 *
-	 *	{ u64			ip;	  } && PERF_RECORD_IP
-	 *	{ u32			pid, tid; } && PERF_RECORD_TID
-	 *	{ u64			time;     } && PERF_RECORD_TIME
-	 *	{ u64			addr;     } && PERF_RECORD_ADDR
-	 *	{ u64			config;   } && PERF_RECORD_CONFIG
-	 *	{ u32			cpu, res; } && PERF_RECORD_CPU
+	 *	{ u64			ip;	  } && PERF_SAMPLE_IP
+	 *	{ u32			pid, tid; } && PERF_SAMPLE_TID
+	 *	{ u64			time;     } && PERF_SAMPLE_TIME
+	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
+	 *	{ u64			config;   } && PERF_SAMPLE_CONFIG
+	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
 	 *
 	 *	{ u64			nr;
-	 *	  { u64 id, val; }	cnt[nr];  } && PERF_RECORD_GROUP
+	 *	  { u64 id, val; }	cnt[nr];  } && PERF_SAMPLE_GROUP
 	 *
 	 *	{ u16			nr,
 	 *				hv,
 	 *				kernel,
 	 *				user;
-	 *	  u64			ips[nr];  } && PERF_RECORD_CALLCHAIN
+	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
 	 *	};
 	 */
 };
@@ -424,6 +439,7 @@ struct file;
 struct perf_mmap_data {
 	struct rcu_head			rcu_head;
 	int				nr_pages;	/* nr of data pages */
+	int				writable;	/* are we writable */
 	int				nr_locked;	/* nr pages mlocked */
 
 	atomic_t			poll;		/* POLL_ for wakeups */
@@ -433,8 +449,8 @@ struct perf_mmap_data {
 	atomic_long_t			done_head;	/* completed head */
 
 	atomic_t			lock;		/* concurrent writes */
-
 	atomic_t			wakeup;		/* needs a wakeup */
+	atomic_t			lost;		/* nr records lost */
 
 	struct perf_counter_mmap_page	*user_page;
 	void				*data_pages[0];
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1794,6 +1794,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct perf_mmap_data *data;
 	int ret = VM_FAULT_SIGBUS;
 
+	if (vmf->flags & FAULT_FLAG_MKWRITE) {
+		if (vmf->pgoff == 0)
+			ret = 0;
+		return ret;
+	}
+
 	rcu_read_lock();
 	data = rcu_dereference(counter->data);
 	if (!data)
@@ -1807,9 +1813,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		if ((unsigned)nr > data->nr_pages)
 			goto unlock;
 
+		if (vmf->flags & FAULT_FLAG_WRITE)
+			goto unlock;
+
 		vmf->page = virt_to_page(data->data_pages[nr]);
 	}
+
 	get_page(vmf->page);
+	vmf->page->mapping = vma->vm_file->f_mapping;
+	vmf->page->index   = vmf->pgoff;
+
 	ret = 0;
 unlock:
 	rcu_read_unlock();
@@ -1862,6 +1875,14 @@ fail:
 	return -ENOMEM;
 }
 
+static void perf_mmap_free_page(unsigned long addr)
+{
+	struct page *page = virt_to_page(addr);
+
+	page->mapping = NULL;
+	__free_page(page);
+}
+
 static void __perf_mmap_data_free(struct rcu_head *rcu_head)
 {
 	struct perf_mmap_data *data;
@@ -1869,9 +1890,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head)
 
 	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
 
-	free_page((unsigned long)data->user_page);
+	perf_mmap_free_page((unsigned long)data->user_page);
 	for (i = 0; i < data->nr_pages; i++)
-		free_page((unsigned long)data->data_pages[i]);
+		perf_mmap_free_page((unsigned long)data->data_pages[i]);
+
 	kfree(data);
 }
@@ -1908,9 +1930,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 }
 
 static struct vm_operations_struct perf_mmap_vmops = {
 	.open		= perf_mmap_open,
 	.close		= perf_mmap_close,
 	.fault		= perf_mmap_fault,
+	.page_mkwrite	= perf_mmap_fault,
 };
 
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1924,7 +1947,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	long user_extra, extra;
 	int ret = 0;
 
-	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
+	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
 	vma_size = vma->vm_end - vma->vm_start;
@@ -1983,10 +2006,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->locked_vm += extra;
 	counter->data->nr_locked = extra;
+	if (vma->vm_flags & VM_WRITE)
+		counter->data->writable = 1;
+
 unlock:
 	mutex_unlock(&counter->mmap_mutex);
 
-	vma->vm_flags &= ~VM_MAYWRITE;
 	vma->vm_flags |= VM_RESERVED;
 	vma->vm_ops = &perf_mmap_vmops;
@@ -2163,11 +2188,38 @@ struct perf_output_handle {
 	unsigned long		head;
 	unsigned long		offset;
 	int			nmi;
-	int			overflow;
+	int			sample;
 	int			locked;
 	unsigned long		flags;
 };
 
+static bool perf_output_space(struct perf_mmap_data *data,
+			      unsigned int offset, unsigned int head)
+{
+	unsigned long tail;
+	unsigned long mask;
+
+	if (!data->writable)
+		return true;
+
+	mask = (data->nr_pages << PAGE_SHIFT) - 1;
+
+	/*
+	 * Userspace could choose to issue a mb() before updating the tail
+	 * pointer. So that all reads will be completed before the write is
+	 * issued.
+	 */
+	tail = ACCESS_ONCE(data->user_page->data_tail);
+	smp_rmb();
+
+	offset = (offset - tail) & mask;
+	head   = (head   - tail) & mask;
+
+	if ((int)(head - offset) < 0)
+		return false;
+
+	return true;
+}
+
 static void perf_output_wakeup(struct perf_output_handle *handle)
 {
 	atomic_set(&handle->data->poll, POLL_IN);
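To see what the new space check buys, take some assumed numbers (not from the patch): nr_pages = 2 and PAGE_SIZE = 4096, so mask = 8191; suppose data_tail = 100, the current write offset is 8100, and a 200-byte record would move head to 8300. Normalised against the tail, offset becomes (8100 - 100) & 8191 = 8000 and head becomes (8300 - 100) & 8191 = 8. Since (int)(8 - 8000) is negative, the record does not fit in the 192 bytes still free ahead of the unread data, so perf_output_space() returns false and the caller drops the record and bumps data->lost instead of overwriting events the reader has not consumed yet.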
@@ -2258,55 +2310,6 @@ out:
 	local_irq_restore(handle->flags);
 }
 
-static int perf_output_begin(struct perf_output_handle *handle,
-			     struct perf_counter *counter, unsigned int size,
-			     int nmi, int overflow)
-{
-	struct perf_mmap_data *data;
-	unsigned int offset, head;
-
-	/*
-	 * For inherited counters we send all the output towards the parent.
-	 */
-	if (counter->parent)
-		counter = counter->parent;
-
-	rcu_read_lock();
-	data = rcu_dereference(counter->data);
-	if (!data)
-		goto out;
-
-	handle->data	 = data;
-	handle->counter	 = counter;
-	handle->nmi	 = nmi;
-	handle->overflow = overflow;
-
-	if (!data->nr_pages)
-		goto fail;
-
-	perf_output_lock(handle);
-
-	do {
-		offset = head = atomic_long_read(&data->head);
-		head += size;
-	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
-
-	handle->offset	= offset;
-	handle->head	= head;
-
-	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
-		atomic_set(&data->wakeup, 1);
-
-	return 0;
-
-fail:
-	perf_output_wakeup(handle);
-out:
-	rcu_read_unlock();
-
-	return -ENOSPC;
-}
-
 static void perf_output_copy(struct perf_output_handle *handle,
 			     const void *buf, unsigned int len)
 {
@@ -2346,6 +2349,78 @@ static void perf_output_copy(struct perf_output_handle *handle,
 #define perf_output_put(handle, x) \
 	perf_output_copy((handle), &(x), sizeof(x))
 
+static int perf_output_begin(struct perf_output_handle *handle,
+			     struct perf_counter *counter, unsigned int size,
+			     int nmi, int sample)
+{
+	struct perf_mmap_data *data;
+	unsigned int offset, head;
+	int have_lost;
+	struct {
+		struct perf_event_header header;
+		u64			 id;
+		u64			 lost;
+	} lost_event;
+
+	/*
+	 * For inherited counters we send all the output towards the parent.
+	 */
+	if (counter->parent)
+		counter = counter->parent;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto out;
+
+	handle->data	= data;
+	handle->counter	= counter;
+	handle->nmi	= nmi;
+	handle->sample	= sample;
+
+	if (!data->nr_pages)
+		goto fail;
+
+	have_lost = atomic_read(&data->lost);
+	if (have_lost)
+		size += sizeof(lost_event);
+
+	perf_output_lock(handle);
+
+	do {
+		offset = head = atomic_long_read(&data->head);
+		head += size;
+		if (unlikely(!perf_output_space(data, offset, head)))
+			goto fail;
+	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+
+	handle->offset	= offset;
+	handle->head	= head;
+
+	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+		atomic_set(&data->wakeup, 1);
+
+	if (have_lost) {
+		lost_event.header.type = PERF_EVENT_LOST;
+		lost_event.header.misc = 0;
+		lost_event.header.size = sizeof(lost_event);
+		lost_event.id          = counter->id;
+		lost_event.lost        = atomic_xchg(&data->lost, 0);
+
+		perf_output_put(handle, lost_event);
+	}
+
+	return 0;
+
+fail:
+	atomic_inc(&data->lost);
+	perf_output_unlock(handle);
+out:
+	rcu_read_unlock();
+
+	return -ENOSPC;
+}
+
 static void perf_output_end(struct perf_output_handle *handle)
 {
 	struct perf_counter *counter = handle->counter;
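On the consumer side, a reader that encounters the new PERF_EVENT_LOST record can account for the dropped events explicitly. A hedged sketch, reusing the hypothetical process_record() and the struct perf_event_header mirror from the earlier example, and laying out the id/lost fields exactly as the enum perf_event_type comment in this patch documents them:

#include <stdio.h>
#include <stdint.h>

/* Field layout documented for PERF_EVENT_LOST in the header hunk above. */
struct perf_event_lost {
        struct perf_event_header header;
        uint64_t id;
        uint64_t lost;
};

void process_record(const struct perf_event_header *hdr)
{
        if (hdr->type == 2 /* PERF_EVENT_LOST */) {
                const struct perf_event_lost *l = (const void *)hdr;

                /* exact loss count, as promised by the commit message */
                fprintf(stderr, "counter %llu dropped %llu records\n",
                        (unsigned long long)l->id,
                        (unsigned long long)l->lost);
                return;
        }
        /* ... handle PERF_EVENT_MMAP, samples, etc. ... */
}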
@@ -2353,7 +2428,7 @@ static void perf_output_end(struct perf_output_handle *handle)
 
 	int wakeup_events = counter->attr.wakeup_events;
 
-	if (handle->overflow && wakeup_events) {
+	if (handle->sample && wakeup_events) {
 		int events = atomic_inc_return(&data->events);
 		if (events >= wakeup_events) {
 			atomic_sub(wakeup_events, &data->events);
@@ -2958,7 +3033,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 }
 
 /*
- * Generic counter overflow handling.
+ * Generic counter overflow handling, sampling.
 */
 
 int perf_counter_overflow(struct perf_counter *counter, int nmi,