perf_counter: Add event overflow handling (43a21ea8) · Commits · e / devices / android_kernel_xiaomi_markw

include/linux/perf_counter.h

+28 −12

Original line number	Diff line number	Diff line
		@@ -236,10 +236,16 @@ struct perf_counter_mmap_page {
		/*
		* Control data for the mmap() data buffer.
		*
		* User-space reading this value should issue an rmb(), on SMP capable
		* platforms, after reading this value -- see perf_counter_wakeup().
		* User-space reading the @data_head value should issue an rmb(), on
		* SMP capable platforms, after reading this value -- see
		* perf_counter_wakeup().
		*
		* When the mapping is PROT_WRITE the @data_tail value should be
		* written by userspace to reflect the last read data. In this case
		* the kernel will not over-write unread data.
		*/
		__u64 data_head; /* head in the data section */
		__u64 data_tail; /* user-space written tail */
		};

		#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0)
		@@ -273,6 +279,15 @@ enum perf_event_type {
		*/
		PERF_EVENT_MMAP = 1,

		/*
		* struct {
		* struct perf_event_header header;
		* u64 id;
		* u64 lost;
		* };
		*/
		PERF_EVENT_LOST = 2,

		/*
		* struct {
		* struct perf_event_header header;
		@@ -313,26 +328,26 @@ enum perf_event_type {

		/*
		* When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
		* will be PERF_RECORD_*
		* will be PERF_SAMPLE_*
		*
		* struct {
		* struct perf_event_header header;
		*
		* { u64 ip; } && PERF_RECORD_IP
		* { u32 pid, tid; } && PERF_RECORD_TID
		* { u64 time; } && PERF_RECORD_TIME
		* { u64 addr; } && PERF_RECORD_ADDR
		* { u64 config; } && PERF_RECORD_CONFIG
		* { u32 cpu, res; } && PERF_RECORD_CPU
		* { u64 ip; } && PERF_SAMPLE_IP
		* { u32 pid, tid; } && PERF_SAMPLE_TID
		* { u64 time; } && PERF_SAMPLE_TIME
		* { u64 addr; } && PERF_SAMPLE_ADDR
		* { u64 config; } && PERF_SAMPLE_CONFIG
		* { u32 cpu, res; } && PERF_SAMPLE_CPU
		*
		* { u64 nr;
		* { u64 id, val; } cnt[nr]; } && PERF_RECORD_GROUP
		* { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP
		*
		* { u16 nr,
		* hv,
		* kernel,
		* user;
		* u64 ips[nr]; } && PERF_RECORD_CALLCHAIN
		* u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
		* };
		*/
		};
		@@ -424,6 +439,7 @@ struct file;
		struct perf_mmap_data {
		struct rcu_head rcu_head;
		int nr_pages; /* nr of data pages */
		int writable; /* are we writable */
		int nr_locked; /* nr pages mlocked */

		atomic_t poll; /* POLL_ for wakeups */
		@@ -433,8 +449,8 @@ struct perf_mmap_data {
		atomic_long_t done_head; /* completed head */

		atomic_t lock; /* concurrent writes */

		atomic_t wakeup; /* needs a wakeup */
		atomic_t lost; /* nr records lost */

		struct perf_counter_mmap_page *user_page;
		void *data_pages[0];

kernel/perf_counter.c

+130 −55

Original line number	Diff line number	Diff line
		@@ -1794,6 +1794,12 @@ static int perf_mmap_fault(struct vm_area_struct vma, struct vm_fault vmf)
		struct perf_mmap_data *data;
		int ret = VM_FAULT_SIGBUS;

		if (vmf->flags & FAULT_FLAG_MKWRITE) {
		if (vmf->pgoff == 0)
		ret = 0;
		return ret;
		}

		rcu_read_lock();
		data = rcu_dereference(counter->data);
		if (!data)
		@@ -1807,9 +1813,16 @@ static int perf_mmap_fault(struct vm_area_struct vma, struct vm_fault vmf)
		if ((unsigned)nr > data->nr_pages)
		goto unlock;

		if (vmf->flags & FAULT_FLAG_WRITE)
		goto unlock;

		vmf->page = virt_to_page(data->data_pages[nr]);
		}

		get_page(vmf->page);
		vmf->page->mapping = vma->vm_file->f_mapping;
		vmf->page->index = vmf->pgoff;

		ret = 0;
		unlock:
		rcu_read_unlock();
		@@ -1862,6 +1875,14 @@ fail:
		return -ENOMEM;
		}

		/*
		 * Release one buffer page given its kernel virtual address.
		 *
		 * The fault handler stores a mapping pointer in the struct page when it
		 * hands the page out, so that pointer is cleared here before the page is
		 * freed.
		 */
		static void perf_mmap_free_page(unsigned long addr)
		{
			struct page *pg;

			pg = virt_to_page(addr);

			/* Drop the mapping reference installed at fault time. */
			pg->mapping = NULL;
			__free_page(pg);
		}

		static void __perf_mmap_data_free(struct rcu_head *rcu_head)
		{
		struct perf_mmap_data *data;
		@@ -1869,9 +1890,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head)

		data = container_of(rcu_head, struct perf_mmap_data, rcu_head);

		free_page((unsigned long)data->user_page);
		perf_mmap_free_page((unsigned long)data->user_page);
		for (i = 0; i < data->nr_pages; i++)
		free_page((unsigned long)data->data_pages[i]);
		perf_mmap_free_page((unsigned long)data->data_pages[i]);

		kfree(data);
		}

		@@ -1911,6 +1933,7 @@ static struct vm_operations_struct perf_mmap_vmops = {
		.open = perf_mmap_open,
		.close = perf_mmap_close,
		.fault = perf_mmap_fault,
		.page_mkwrite = perf_mmap_fault,
		};

		static int perf_mmap(struct file file, struct vm_area_struct vma)
		@@ -1924,7 +1947,7 @@ static int perf_mmap(struct file file, struct vm_area_struct vma)
		long user_extra, extra;
		int ret = 0;

		if (!(vma->vm_flags & VM_SHARED) \|\| (vma->vm_flags & VM_WRITE))
		if (!(vma->vm_flags & VM_SHARED))
		return -EINVAL;

		vma_size = vma->vm_end - vma->vm_start;
		@@ -1983,10 +2006,12 @@ static int perf_mmap(struct file file, struct vm_area_struct vma)
		atomic_long_add(user_extra, &user->locked_vm);
		vma->vm_mm->locked_vm += extra;
		counter->data->nr_locked = extra;
		if (vma->vm_flags & VM_WRITE)
		counter->data->writable = 1;

		unlock:
		mutex_unlock(&counter->mmap_mutex);

		vma->vm_flags &= ~VM_MAYWRITE;
		vma->vm_flags \|= VM_RESERVED;
		vma->vm_ops = &perf_mmap_vmops;

		@@ -2163,11 +2188,38 @@ struct perf_output_handle {
		unsigned long head;
		unsigned long offset;
		int nmi;
		int overflow;
		int sample;
		int locked;
		unsigned long flags;
		};

		/*
		 * perf_output_space - check that a reservation will not run over data
		 * that user-space has not consumed yet.
		 * @data:   buffer whose user_page->data_tail is consulted
		 * @offset: start of the proposed reservation (the current head)
		 * @head:   end of the proposed reservation (the new head)
		 *
		 * Returns true when the buffer is not flagged writable, since the tail
		 * is only consulted for writable buffers. Otherwise returns false if
		 * moving the head to @head would wrap past the tail, true if it fits.
		 */
		static bool perf_output_space(struct perf_mmap_data *data,
		unsigned int offset, unsigned int head)
		{
		unsigned long tail;
		unsigned long mask;

		/* Not writable: nothing to protect, every reservation is accepted. */
		if (!data->writable)
		return true;

		/*
		 * Size of the data area in bytes, minus one.
		 * NOTE(review): only a valid wrap mask if nr_pages is a power of
		 * two -- confirm against the size validation in perf_mmap().
		 */
		mask = (data->nr_pages << PAGE_SHIFT) - 1;
		/*
		* Userspace could choose to issue a mb() before updating the tail
		* pointer, so that all of its reads are completed before the write
		* is issued. The read of the tail below is followed by a read
		* barrier.
		*/
		tail = ACCESS_ONCE(data->user_page->data_tail);
		smp_rmb();

		/* Rebase both positions so they are distances from the tail. */
		offset = (offset - tail) & mask;
		head = (head - tail) & mask;

		/*
		 * A new head that lies closer to the tail than the old one means
		 * the reservation wrapped around and would pass the tail.
		 */
		if ((int)(head - offset) < 0)
		return false;

		return true;
		}

		static void perf_output_wakeup(struct perf_output_handle *handle)
		{
		atomic_set(&handle->data->poll, POLL_IN);
		@@ -2258,12 +2310,57 @@ out:
		local_irq_restore(handle->flags);
		}

		/*
		 * perf_output_copy - copy @len bytes from @buf into the data pages.
		 * @handle: output handle; copying starts at handle->offset
		 * @buf:    source bytes
		 * @len:    number of bytes to copy
		 *
		 * The copy is split at page boundaries and the page index wraps using
		 * nr_pages - 1 as a mask. On return handle->offset has been advanced
		 * by the number of bytes written.
		 */
		static void perf_output_copy(struct perf_output_handle *handle,
					     const void *buf, unsigned int len)
		{
			void **pages = handle->data->data_pages;
			unsigned int pages_mask = handle->data->nr_pages - 1;
			unsigned int pos = handle->offset;

			for (;;) {
				/* Where we are inside the current page, and which page. */
				unsigned int in_page = pos & (PAGE_SIZE - 1);
				int idx = (pos >> PAGE_SHIFT) & pages_mask;
				/* Never cross a page boundary in a single memcpy(). */
				unsigned int chunk = min_t(unsigned int,
							   PAGE_SIZE - in_page, len);

				memcpy(pages[idx] + in_page, buf, chunk);

				buf += chunk;
				pos += chunk;
				len -= chunk;

				if (!len)
					break;
			}

			handle->offset = pos;

			/*
			 * The write cursor must not have moved beyond the end of the
			 * reserved window; the signed difference copes with the
			 * counters wrapping.
			 */
			WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
		}

		/*
		 * Copy the object @x itself into the output buffer via
		 * perf_output_copy(). @x must be an lvalue, since its address is
		 * taken; sizeof(x) bytes are written.
		 */
		#define perf_output_put(handle, x) \
		perf_output_copy((handle), &(x), sizeof(x))

		static int perf_output_begin(struct perf_output_handle *handle,
		struct perf_counter *counter, unsigned int size,
		int nmi, int overflow)
		int nmi, int sample)
		{
		struct perf_mmap_data *data;
		unsigned int offset, head;
		int have_lost;
		struct {
		struct perf_event_header header;
		u64 id;
		u64 lost;
		} lost_event;

		/*
		* For inherited counters we send all the output towards the parent.
		@@ -2279,16 +2376,22 @@ static int perf_output_begin(struct perf_output_handle *handle,
		handle->data = data;
		handle->counter = counter;
		handle->nmi = nmi;
		handle->overflow = overflow;
		handle->sample = sample;

		if (!data->nr_pages)
		goto fail;

		have_lost = atomic_read(&data->lost);
		if (have_lost)
		size += sizeof(lost_event);

		perf_output_lock(handle);

		do {
		offset = head = atomic_long_read(&data->head);
		head += size;
		if (unlikely(!perf_output_space(data, offset, head)))
		goto fail;
		} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);

		handle->offset = offset;
		@@ -2297,55 +2400,27 @@ static int perf_output_begin(struct perf_output_handle *handle,
		if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
		atomic_set(&data->wakeup, 1);

		if (have_lost) {
		lost_event.header.type = PERF_EVENT_LOST;
		lost_event.header.misc = 0;
		lost_event.header.size = sizeof(lost_event);
		lost_event.id = counter->id;
		lost_event.lost = atomic_xchg(&data->lost, 0);

		perf_output_put(handle, lost_event);
		}

		return 0;

		fail:
		perf_output_wakeup(handle);
		atomic_inc(&data->lost);
		perf_output_unlock(handle);
		out:
		rcu_read_unlock();

		return -ENOSPC;
		}

		/*
		 * perf_output_copy - copy @len bytes from @buf into the data pages.
		 * @handle: output handle; copying starts at handle->offset
		 * @buf:    source bytes
		 * @len:    number of bytes to copy
		 *
		 * The copy is split at page boundaries and the page index wraps using
		 * nr_pages - 1 as a mask. On return handle->offset has been advanced
		 * by the number of bytes written.
		 */
		static void perf_output_copy(struct perf_output_handle *handle,
					     const void *buf, unsigned int len)
		{
			void **pages = handle->data->data_pages;
			unsigned int pages_mask = handle->data->nr_pages - 1;
			unsigned int pos = handle->offset;

			for (;;) {
				/* Where we are inside the current page, and which page. */
				unsigned int in_page = pos & (PAGE_SIZE - 1);
				int idx = (pos >> PAGE_SHIFT) & pages_mask;
				/* Never cross a page boundary in a single memcpy(). */
				unsigned int chunk = min_t(unsigned int,
							   PAGE_SIZE - in_page, len);

				memcpy(pages[idx] + in_page, buf, chunk);

				buf += chunk;
				pos += chunk;
				len -= chunk;

				if (!len)
					break;
			}

			handle->offset = pos;

			/*
			 * The write cursor must not have moved beyond the end of the
			 * reserved window; the signed difference copes with the
			 * counters wrapping.
			 */
			WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
		}

		/*
		 * Copy the object @x itself into the output buffer via
		 * perf_output_copy(). @x must be an lvalue, since its address is
		 * taken; sizeof(x) bytes are written.
		 */
		#define perf_output_put(handle, x) \
		perf_output_copy((handle), &(x), sizeof(x))

		static void perf_output_end(struct perf_output_handle *handle)
		{
		struct perf_counter *counter = handle->counter;
		@@ -2353,7 +2428,7 @@ static void perf_output_end(struct perf_output_handle *handle)

		int wakeup_events = counter->attr.wakeup_events;

		if (handle->overflow && wakeup_events) {
		if (handle->sample && wakeup_events) {
		int events = atomic_inc_return(&data->events);
		if (events >= wakeup_events) {
		atomic_sub(wakeup_events, &data->events);
		@@ -2958,7 +3033,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
		}

		/*
		* Generic counter overflow handling.
		* Generic counter overflow handling, sampling.
		*/

		int perf_counter_overflow(struct perf_counter *counter, int nmi,