Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit aece948f authored by Arnaldo Carvalho de Melo
Browse files

perf evlist: Fix per thread mmap setup



The PERF_EVENT_IOC_SET_OUTPUT ioctl was returning -EINVAL when using
--pid when monitoring multithreaded apps, as we can only share a ring
buffer for events on the same thread if not doing per cpu.

Fix it by using per thread ring buffers.

Tested with:

[root@felicio ~]# tuna -t 26131 -CP | nl
  1                      thread       ctxt_switches
  2    pid SCHED_ rtpri affinity voluntary nonvoluntary             cmd
  3 26131   OTHER     0      0,1  10814276      2397830 chromium-browse
  4  642    OTHER     0      0,1     14688            0 chromium-browse
  5  26148  OTHER     0      0,1    713602       115479 chromium-browse
  6  26149  OTHER     0      0,1    801958         2262 chromium-browse
  7  26150  OTHER     0      0,1   1271128          248 chromium-browse
  8  26151  OTHER     0      0,1         3            0 chromium-browse
  9  27049  OTHER     0      0,1     36796            9 chromium-browse
 10  618    OTHER     0      0,1     14711            0 chromium-browse
 11  661    OTHER     0      0,1     14593            0 chromium-browse
 12  29048  OTHER     0      0,1     28125            0 chromium-browse
 13  26143  OTHER     0      0,1   2202789          781 chromium-browse
[root@felicio ~]#

So 11 threads under pid 26131, then:

[root@felicio ~]# perf record -F 50000 --pid 26131

[root@felicio ~]# grep perf_event /proc/`pidof perf`/maps | nl
  1 7fa4a2538000-7fa4a25b9000 rwxs 00000000 00:09 4064 anon_inode:[perf_event]
  2 7fa4a25b9000-7fa4a263a000 rwxs 00000000 00:09 4064 anon_inode:[perf_event]
  3 7fa4a263a000-7fa4a26bb000 rwxs 00000000 00:09 4064 anon_inode:[perf_event]
  4 7fa4a26bb000-7fa4a273c000 rwxs 00000000 00:09 4064 anon_inode:[perf_event]
  5 7fa4a273c000-7fa4a27bd000 rwxs 00000000 00:09 4064 anon_inode:[perf_event]
  6 7fa4a27bd000-7fa4a283e000 rwxs 00000000 00:09 4064 anon_inode:[perf_event]
  7 7fa4a283e000-7fa4a28bf000 rwxs 00000000 00:09 4064 anon_inode:[perf_event]
  8 7fa4a28bf000-7fa4a2940000 rwxs 00000000 00:09 4064 anon_inode:[perf_event]
  9 7fa4a2940000-7fa4a29c1000 rwxs 00000000 00:09 4064 anon_inode:[perf_event]
 10 7fa4a29c1000-7fa4a2a42000 rwxs 00000000 00:09 4064 anon_inode:[perf_event]
 11 7fa4a2a42000-7fa4a2ac3000 rwxs 00000000 00:09 4064 anon_inode:[perf_event]
[root@felicio ~]#

11 mmaps, one per thread since we didn't specify any CPU list, so we need one
mmap per thread and:

[root@felicio ~]# perf record -F 50000 --pid 26131
^M
^C[ perf record: Woken up 79 times to write data ]
[ perf record: Captured and wrote 20.614 MB perf.data (~900639 samples) ]

[root@felicio ~]# perf report -D | grep PERF_RECORD_SAMPLE | cut -d/ -f2 | cut -d: -f1 | sort -n | uniq -c | sort -nr | nl
     1	 371310 26131
     2	  96516 26148
     3	  95694 26149
     4	  95203 26150
     5	   7291 26143
     6	     87 27049
     7	     76 661
     8	     60 29048
     9	     47 618
    10	     43 642
[root@felicio ~]#

Ok, one of the threads, 26151 was quiescent, so no samples there, but all the
others are there.

Then, if I specify one CPU:

[root@felicio ~]# perf record -F 50000 --pid 26131 --cpu 1
^C[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.680 MB perf.data (~29730 samples) ]

[root@felicio ~]# perf report -D | grep PERF_RECORD_SAMPLE | cut -d/ -f2 | cut -d: -f1 | sort -n | uniq -c | sort -nr | nl
     1	   8444 26131
     2	   2584 26149
     3	   2518 26148
     4	   2324 26150
     5	    123 26143
     6	      9 661
     7	      9 29048
[root@felicio ~]#

This machine has two cores, so fewer threads appeared on the radar, and:

[root@felicio ~]# grep perf_event /proc/`pidof perf`/maps | nl
 1 7f484b922000-7f484b9a3000 rwxs 00000000 00:09 4064 anon_inode:[perf_event]
[root@felicio ~]#

Just one mmap, as now we can use just one per-cpu buffer instead of the
per-thread needed in the previous case.

For global profiling:

[root@felicio ~]# perf record -F 50000 -a
^C[ perf record: Woken up 26 times to write data ]
[ perf record: Captured and wrote 7.128 MB perf.data (~311412 samples) ]

[root@felicio ~]# grep perf_event /proc/`pidof perf`/maps | nl
     1	7fb49b435000-7fb49b4b6000 rwxs 00000000 00:09 4064                       anon_inode:[perf_event]
     2	7fb49b4b6000-7fb49b537000 rwxs 00000000 00:09 4064                       anon_inode:[perf_event]
[root@felicio ~]#

It uses per-cpu buffers.

For just one thread:

[root@felicio ~]# perf record -F 50000 --tid 26148
^C[ perf record: Woken up 2 times to write data ]
[ perf record: Captured and wrote 0.330 MB perf.data (~14426 samples) ]

[root@felicio ~]# perf report -D | grep PERF_RECORD_SAMPLE | cut -d/ -f2 | cut -d: -f1 | sort -n | uniq -c | sort -nr | nl
     1	   9969 26148
[root@felicio ~]#

[root@felicio ~]# grep perf_event /proc/`pidof perf`/maps | nl
     1	7f286a51b000-7f286a59c000 rwxs 00000000 00:09 4064                       anon_inode:[perf_event]
[root@felicio ~]#

Tested-by: David Ahern <dsahern@gmail.com>
Tested-by: Lin Ming <ming.m.lin@intel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Link: http://lkml.kernel.org/r/20110426204401.GB1746@ghostprotocols.net


Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
parent b9019418
Loading
Loading
Loading
Loading
+1 −1
Original line number Original line Diff line number Diff line
@@ -427,7 +427,7 @@ static void mmap_read_all(void)
{
{
	int i;
	int i;


	for (i = 0; i < evsel_list->cpus->nr; i++) {
	for (i = 0; i < evsel_list->nr_mmaps; i++) {
		if (evsel_list->mmap[i].base)
		if (evsel_list->mmap[i].base)
			mmap_read(&evsel_list->mmap[i]);
			mmap_read(&evsel_list->mmap[i]);
	}
	}
+1 −1
Original line number Original line Diff line number Diff line
@@ -549,7 +549,7 @@ static int test__basic_mmap(void)
			++foo;
			++foo;
		}
		}


	while ((event = perf_evlist__read_on_cpu(evlist, 0)) != NULL) {
	while ((event = perf_evlist__mmap_read(evlist, 0)) != NULL) {
		struct perf_sample sample;
		struct perf_sample sample;


		if (event->header.type != PERF_RECORD_SAMPLE) {
		if (event->header.type != PERF_RECORD_SAMPLE) {
+4 −4
Original line number Original line Diff line number Diff line
@@ -801,12 +801,12 @@ static void perf_event__process_sample(const union perf_event *event,
	}
	}
}
}


static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu)
static void perf_session__mmap_read_idx(struct perf_session *self, int idx)
{
{
	struct perf_sample sample;
	struct perf_sample sample;
	union perf_event *event;
	union perf_event *event;


	while ((event = perf_evlist__read_on_cpu(top.evlist, cpu)) != NULL) {
	while ((event = perf_evlist__mmap_read(top.evlist, idx)) != NULL) {
		perf_session__parse_sample(self, event, &sample);
		perf_session__parse_sample(self, event, &sample);


		if (event->header.type == PERF_RECORD_SAMPLE)
		if (event->header.type == PERF_RECORD_SAMPLE)
@@ -820,8 +820,8 @@ static void perf_session__mmap_read(struct perf_session *self)
{
{
	int i;
	int i;


	for (i = 0; i < top.evlist->cpus->nr; i++)
	for (i = 0; i < top.evlist->nr_mmaps; i++)
		perf_session__mmap_read_cpu(self, i);
		perf_session__mmap_read_idx(self, i);
}
}


static void start_counters(struct perf_evlist *evlist)
static void start_counters(struct perf_evlist *evlist)
+106 −45
Original line number Original line Diff line number Diff line
@@ -166,11 +166,11 @@ struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id)
	return NULL;
	return NULL;
}
}


union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu)
union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
{
{
	/* XXX Move this to perf.c, making it generally available */
	/* XXX Move this to perf.c, making it generally available */
	unsigned int page_size = sysconf(_SC_PAGE_SIZE);
	unsigned int page_size = sysconf(_SC_PAGE_SIZE);
	struct perf_mmap *md = &evlist->mmap[cpu];
	struct perf_mmap *md = &evlist->mmap[idx];
	unsigned int head = perf_mmap__read_head(md);
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;
	unsigned int old = md->prev;
	unsigned char *data = md->base + page_size;
	unsigned char *data = md->base + page_size;
@@ -235,31 +235,37 @@ union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu)


void perf_evlist__munmap(struct perf_evlist *evlist)
void perf_evlist__munmap(struct perf_evlist *evlist)
{
{
	int cpu;
	int i;


	for (cpu = 0; cpu < evlist->cpus->nr; cpu++) {
	for (i = 0; i < evlist->nr_mmaps; i++) {
		if (evlist->mmap[cpu].base != NULL) {
		if (evlist->mmap[i].base != NULL) {
			munmap(evlist->mmap[cpu].base, evlist->mmap_len);
			munmap(evlist->mmap[i].base, evlist->mmap_len);
			evlist->mmap[cpu].base = NULL;
			evlist->mmap[i].base = NULL;
		}
		}
	}
	}

	free(evlist->mmap);
	evlist->mmap = NULL;
}
}


int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
{
{
	evlist->mmap = zalloc(evlist->cpus->nr * sizeof(struct perf_mmap));
	evlist->nr_mmaps = evlist->cpus->nr;
	if (evlist->cpus->map[0] == -1)
		evlist->nr_mmaps = evlist->threads->nr;
	evlist->mmap = zalloc(evlist->nr_mmaps * sizeof(struct perf_mmap));
	return evlist->mmap != NULL ? 0 : -ENOMEM;
	return evlist->mmap != NULL ? 0 : -ENOMEM;
}
}


static int __perf_evlist__mmap(struct perf_evlist *evlist, struct perf_evsel *evsel,
static int __perf_evlist__mmap(struct perf_evlist *evlist, struct perf_evsel *evsel,
			       int cpu, int prot, int mask, int fd)
			       int idx, int prot, int mask, int fd)
{
{
	evlist->mmap[cpu].prev = 0;
	evlist->mmap[idx].prev = 0;
	evlist->mmap[cpu].mask = mask;
	evlist->mmap[idx].mask = mask;
	evlist->mmap[cpu].base = mmap(NULL, evlist->mmap_len, prot,
	evlist->mmap[idx].base = mmap(NULL, evlist->mmap_len, prot,
				      MAP_SHARED, fd, 0);
				      MAP_SHARED, fd, 0);
	if (evlist->mmap[cpu].base == MAP_FAILED) {
	if (evlist->mmap[idx].base == MAP_FAILED) {
		if (evlist->cpus->map[cpu] == -1 && evsel->attr.inherit)
		if (evlist->cpus->map[idx] == -1 && evsel->attr.inherit)
			ui__warning("Inherit is not allowed on per-task "
			ui__warning("Inherit is not allowed on per-task "
				    "events using mmap.\n");
				    "events using mmap.\n");
		return -1;
		return -1;
@@ -269,6 +275,86 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, struct perf_evsel *ev
	return 0;
	return 0;
}
}


/*
 * perf_evlist__mmap_per_cpu - set up one ring buffer per entry in evlist->cpus
 *
 * @evlist: event list whose fds are to be mapped
 * @prot:   protection flags passed through to __perf_evlist__mmap()
 * @mask:   ring buffer mask passed through to __perf_evlist__mmap()
 *
 * For each cpu index the first fd visited (first thread, first evsel) is
 * mmapped into evlist->mmap[cpu]; every other fd for that cpu index is
 * redirected into that buffer with PERF_EVENT_IOC_SET_OUTPUT.  Events that
 * carry PERF_FORMAT_ID also get their fd registered via
 * perf_evlist__id_add_fd().
 *
 * Returns 0 on success.  On any failure the mappings created so far are
 * released, every evlist->mmap[cpu].base is reset to NULL and -1 is returned.
 */
static int perf_evlist__mmap_per_cpu(struct perf_evlist *evlist, int prot, int mask)
{
	struct perf_evsel *evsel;
	int cpu, thread;

	for (cpu = 0; cpu < evlist->cpus->nr; cpu++) {
		/* fd owning this cpu's ring buffer, -1 until one is mapped */
		int output = -1;

		for (thread = 0; thread < evlist->threads->nr; thread++) {
			list_for_each_entry(evsel, &evlist->entries, node) {
				int fd = FD(evsel, cpu, thread);

				if (output == -1) {
					output = fd;
					if (__perf_evlist__mmap(evlist, evsel, cpu,
								prot, mask, output) < 0)
						goto out_unmap;
				} else {
					if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, output) != 0)
						goto out_unmap;
				}

				if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
				    perf_evlist__id_add_fd(evlist, evsel, cpu, thread, fd) < 0)
					goto out_unmap;
			}
		}
	}

	return 0;

out_unmap:
	for (cpu = 0; cpu < evlist->cpus->nr; cpu++) {
		void *base = evlist->mmap[cpu].base;

		/*
		 * A failed __perf_evlist__mmap() leaves MAP_FAILED in ->base.
		 * That is not a mapping: handing it to munmap() can only fail
		 * and would overwrite the errno of the real error.
		 */
		if (base != NULL && base != MAP_FAILED)
			munmap(base, evlist->mmap_len);
		evlist->mmap[cpu].base = NULL;
	}
	return -1;
}

/*
 * perf_evlist__mmap_per_thread - set up one ring buffer per entry in
 * evlist->threads
 *
 * @evlist: event list whose fds are to be mapped
 * @prot:   protection flags passed through to __perf_evlist__mmap()
 * @mask:   ring buffer mask passed through to __perf_evlist__mmap()
 *
 * Only cpu index 0 is used.  For each thread index the first evsel's fd is
 * mmapped into evlist->mmap[thread]; the fds of the remaining evsels for
 * that thread are redirected into that buffer with
 * PERF_EVENT_IOC_SET_OUTPUT.  Events that carry PERF_FORMAT_ID also get
 * their fd registered via perf_evlist__id_add_fd().
 *
 * Returns 0 on success.  On any failure the mappings created so far are
 * released, every evlist->mmap[thread].base is reset to NULL and -1 is
 * returned.
 */
static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist, int prot, int mask)
{
	struct perf_evsel *evsel;
	int thread;

	for (thread = 0; thread < evlist->threads->nr; thread++) {
		/* fd owning this thread's ring buffer, -1 until one is mapped */
		int output = -1;

		list_for_each_entry(evsel, &evlist->entries, node) {
			int fd = FD(evsel, 0, thread);

			if (output == -1) {
				output = fd;
				if (__perf_evlist__mmap(evlist, evsel, thread,
							prot, mask, output) < 0)
					goto out_unmap;
			} else {
				if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, output) != 0)
					goto out_unmap;
			}

			if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
			    perf_evlist__id_add_fd(evlist, evsel, 0, thread, fd) < 0)
				goto out_unmap;
		}
	}

	return 0;

out_unmap:
	for (thread = 0; thread < evlist->threads->nr; thread++) {
		void *base = evlist->mmap[thread].base;

		/*
		 * A failed __perf_evlist__mmap() leaves MAP_FAILED in ->base.
		 * That is not a mapping: handing it to munmap() can only fail
		 * and would overwrite the errno of the real error.
		 */
		if (base != NULL && base != MAP_FAILED)
			munmap(base, evlist->mmap_len);
		evlist->mmap[thread].base = NULL;
	}
	return -1;
}

/** perf_evlist__mmap - Create per cpu maps to receive events
/** perf_evlist__mmap - Create per cpu maps to receive events
 *
 *
 * @evlist - list of events
 * @evlist - list of events
@@ -287,11 +373,11 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, struct perf_evsel *ev
int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite)
int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite)
{
{
	unsigned int page_size = sysconf(_SC_PAGE_SIZE);
	unsigned int page_size = sysconf(_SC_PAGE_SIZE);
	int mask = pages * page_size - 1, cpu;
	int mask = pages * page_size - 1;
	struct perf_evsel *first_evsel, *evsel;
	struct perf_evsel *evsel;
	const struct cpu_map *cpus = evlist->cpus;
	const struct cpu_map *cpus = evlist->cpus;
	const struct thread_map *threads = evlist->threads;
	const struct thread_map *threads = evlist->threads;
	int thread, prot = PROT_READ | (overwrite ? 0 : PROT_WRITE);
	int prot = PROT_READ | (overwrite ? 0 : PROT_WRITE);


	if (evlist->mmap == NULL && perf_evlist__alloc_mmap(evlist) < 0)
	if (evlist->mmap == NULL && perf_evlist__alloc_mmap(evlist) < 0)
		return -ENOMEM;
		return -ENOMEM;
@@ -301,43 +387,18 @@ int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite)


	evlist->overwrite = overwrite;
	evlist->overwrite = overwrite;
	evlist->mmap_len = (pages + 1) * page_size;
	evlist->mmap_len = (pages + 1) * page_size;
	first_evsel = list_entry(evlist->entries.next, struct perf_evsel, node);


	list_for_each_entry(evsel, &evlist->entries, node) {
	list_for_each_entry(evsel, &evlist->entries, node) {
		if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
		if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
		    evsel->sample_id == NULL &&
		    evsel->sample_id == NULL &&
		    perf_evsel__alloc_id(evsel, cpus->nr, threads->nr) < 0)
		    perf_evsel__alloc_id(evsel, cpus->nr, threads->nr) < 0)
			return -ENOMEM;
			return -ENOMEM;

		for (cpu = 0; cpu < cpus->nr; cpu++) {
			for (thread = 0; thread < threads->nr; thread++) {
				int fd = FD(evsel, cpu, thread);

				if (evsel->idx || thread) {
					if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT,
						  FD(first_evsel, cpu, 0)) != 0)
						goto out_unmap;
				} else if (__perf_evlist__mmap(evlist, evsel, cpu,
							       prot, mask, fd) < 0)
					goto out_unmap;

				if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
				    perf_evlist__id_add_fd(evlist, evsel, cpu, thread, fd) < 0)
					goto out_unmap;
			}
		}
	}
	}


	return 0;
	if (evlist->cpus->map[0] == -1)
		return perf_evlist__mmap_per_thread(evlist, prot, mask);


out_unmap:
	return perf_evlist__mmap_per_cpu(evlist, prot, mask);
	for (cpu = 0; cpu < cpus->nr; cpu++) {
		if (evlist->mmap[cpu].base != NULL) {
			munmap(evlist->mmap[cpu].base, evlist->mmap_len);
			evlist->mmap[cpu].base = NULL;
		}
	}
	return -1;
}
}


int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid,
int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid,
+2 −1
Original line number Original line Diff line number Diff line
@@ -17,6 +17,7 @@ struct perf_evlist {
	struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];
	struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];
	int		 nr_entries;
	int		 nr_entries;
	int		 nr_fds;
	int		 nr_fds;
	int		 nr_mmaps;
	int		 mmap_len;
	int		 mmap_len;
	bool		 overwrite;
	bool		 overwrite;
	union perf_event event_copy;
	union perf_event event_copy;
@@ -46,7 +47,7 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd);


struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id);
struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id);


union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *self, int cpu);
union perf_event *perf_evlist__mmap_read(struct perf_evlist *self, int idx);


int perf_evlist__alloc_mmap(struct perf_evlist *evlist);
int perf_evlist__alloc_mmap(struct perf_evlist *evlist);
int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite);
int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite);
Loading