Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d00aa669 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'perfcounters-fixes-for-linus' of...

Merge branch 'perfcounters-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'perfcounters-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (27 commits)
  perf_counter: Zero dead bytes from ftrace raw samples size alignment
  perf_counter: Subtract the buffer size field from the event record size
  perf_counter: Require CAP_SYS_ADMIN for raw tracepoint data
  perf_counter: Correct PERF_SAMPLE_RAW output
  perf tools: callchain: Fix bad rounding of minimum rate
  perf_counter tools: Fix libbfd detection for systems with libz dependency
  perf: "Longum est iter per praecepta, breve et efficax per exempla"
  perf_counter: Fix a race on perf_counter_ctx
  perf_counter: Fix tracepoint sampling to be part of generic sampling
  perf_counter: Work around gcc warning by initializing tracepoint record unconditionally
  perf tools: callchain: Fix sum of percentages to be 100% by displaying amount of ignored chains in fractal mode
  perf tools: callchain: Fix 'perf report' display to be callchain by default
  perf tools: callchain: Fix spurious 'perf report' warnings: ignore empty callchains
  perf record: Fix the -A UI for empty or non-existent perf.data
  perf util: Fix do_read() to fail on EOF instead of busy-looping
  perf list: Fix the output to not include tracepoints without an id
  perf_counter/powerpc: Fix oops on cpus without perf_counter hardware support
  perf stat: Fix tool option consistency: rename -S/--scale to -c/--scale
  perf report: Add debug help for the finding of symbol bugs - show the symtab origin (DSO, build-id, kernel, etc)
  perf report: Fix per task mult-counter stat reporting
  ...
parents cec36911 1853db0e
Loading
Loading
Loading
Loading
+8 −0
Original line number Original line Diff line number Diff line
@@ -518,6 +518,8 @@ void hw_perf_disable(void)
	struct cpu_hw_counters *cpuhw;
	struct cpu_hw_counters *cpuhw;
	unsigned long flags;
	unsigned long flags;


	if (!ppmu)
		return;
	local_irq_save(flags);
	local_irq_save(flags);
	cpuhw = &__get_cpu_var(cpu_hw_counters);
	cpuhw = &__get_cpu_var(cpu_hw_counters);


@@ -572,6 +574,8 @@ void hw_perf_enable(void)
	int n_lim;
	int n_lim;
	int idx;
	int idx;


	if (!ppmu)
		return;
	local_irq_save(flags);
	local_irq_save(flags);
	cpuhw = &__get_cpu_var(cpu_hw_counters);
	cpuhw = &__get_cpu_var(cpu_hw_counters);
	if (!cpuhw->disabled) {
	if (!cpuhw->disabled) {
@@ -737,6 +741,8 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
	long i, n, n0;
	long i, n, n0;
	struct perf_counter *sub;
	struct perf_counter *sub;


	if (!ppmu)
		return 0;
	cpuhw = &__get_cpu_var(cpu_hw_counters);
	cpuhw = &__get_cpu_var(cpu_hw_counters);
	n0 = cpuhw->n_counters;
	n0 = cpuhw->n_counters;
	n = collect_events(group_leader, ppmu->n_counter - n0,
	n = collect_events(group_leader, ppmu->n_counter - n0,
@@ -1281,6 +1287,8 @@ void hw_perf_counter_setup(int cpu)
{
{
	struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
	struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);


	if (!ppmu)
		return;
	memset(cpuhw, 0, sizeof(*cpuhw));
	memset(cpuhw, 0, sizeof(*cpuhw));
	cpuhw->mmcr[0] = MMCR0_FC;
	cpuhw->mmcr[0] = MMCR0_FC;
}
}
+7 −5
Original line number Original line Diff line number Diff line
@@ -121,7 +121,7 @@ enum perf_counter_sample_format {
	PERF_SAMPLE_CPU				= 1U << 7,
	PERF_SAMPLE_CPU				= 1U << 7,
	PERF_SAMPLE_PERIOD			= 1U << 8,
	PERF_SAMPLE_PERIOD			= 1U << 8,
	PERF_SAMPLE_STREAM_ID			= 1U << 9,
	PERF_SAMPLE_STREAM_ID			= 1U << 9,
	PERF_SAMPLE_TP_RECORD			= 1U << 10,
	PERF_SAMPLE_RAW				= 1U << 10,


	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
};
};
@@ -369,6 +369,8 @@ enum perf_event_type {
	 *
	 *
	 *	{ u64			nr,
	 *	{ u64			nr,
	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
	 *	{ u32			size;
	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
	 * };
	 * };
	 */
	 */
	PERF_EVENT_SAMPLE		= 9,
	PERF_EVENT_SAMPLE		= 9,
@@ -414,9 +416,9 @@ struct perf_callchain_entry {
	__u64				ip[PERF_MAX_STACK_DEPTH];
	__u64				ip[PERF_MAX_STACK_DEPTH];
};
};


struct perf_tracepoint_record {
struct perf_raw_record {
	int				size;
	u32				size;
	char				*record;
	void				*data;
};
};


struct task_struct;
struct task_struct;
@@ -687,7 +689,7 @@ struct perf_sample_data {
	struct pt_regs			*regs;
	struct pt_regs			*regs;
	u64				addr;
	u64				addr;
	u64				period;
	u64				period;
	void				*private;
	struct perf_raw_record		*raw;
};
};


extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
+13 −2
Original line number Original line Diff line number Diff line
@@ -637,12 +637,20 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
 *	pc = preempt_count();
 *	pc = preempt_count();
 *
 *
 *	__data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
 *	__data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
 *	__entry_size = __data_size + sizeof(*entry);
 *
 *	// Below we want to get the aligned size by taking into account
 *	// the u32 field that will later store the buffer size
 *	__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),
 *			     sizeof(u64));
 *	__entry_size -= sizeof(u32);
 *
 *
 *	do {
 *	do {
 *		char raw_data[__entry_size]; <- allocate our sample in the stack
 *		char raw_data[__entry_size]; <- allocate our sample in the stack
 *		struct trace_entry *ent;
 *		struct trace_entry *ent;
 *
 *
 *		zero dead bytes from alignment to avoid stack leak to userspace:
 *
 *		*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
 *		entry = (struct ftrace_raw_<call> *)raw_data;
 *		entry = (struct ftrace_raw_<call> *)raw_data;
 *		ent = &entry->ent;
 *		ent = &entry->ent;
 *		tracing_generic_entry_update(ent, irq_flags, pc);
 *		tracing_generic_entry_update(ent, irq_flags, pc);
@@ -685,12 +693,15 @@ static void ftrace_profile_##call(proto) \
	pc = preempt_count();						\
	pc = preempt_count();						\
									\
									\
	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
	__entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\
	__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
			     sizeof(u64));				\
	__entry_size -= sizeof(u32);					\
									\
									\
	do {								\
	do {								\
		char raw_data[__entry_size];				\
		char raw_data[__entry_size];				\
		struct trace_entry *ent;				\
		struct trace_entry *ent;				\
									\
									\
		*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;	\
		entry = (struct ftrace_raw_##call *)raw_data;		\
		entry = (struct ftrace_raw_##call *)raw_data;		\
		ent = &entry->ent;					\
		ent = &entry->ent;					\
		tracing_generic_entry_update(ent, irq_flags, pc);	\
		tracing_generic_entry_update(ent, irq_flags, pc);	\
+145 −94
Original line number Original line Diff line number Diff line
@@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
		u64 counter;
		u64 counter;
	} group_entry;
	} group_entry;
	struct perf_callchain_entry *callchain = NULL;
	struct perf_callchain_entry *callchain = NULL;
	struct perf_tracepoint_record *tp;
	int callchain_size = 0;
	int callchain_size = 0;
	u64 time;
	u64 time;
	struct {
	struct {
@@ -2715,9 +2714,16 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
			header.size += sizeof(u64);
			header.size += sizeof(u64);
	}
	}


	if (sample_type & PERF_SAMPLE_TP_RECORD) {
	if (sample_type & PERF_SAMPLE_RAW) {
		tp = data->private;
		int size = sizeof(u32);
		header.size += tp->size;

		if (data->raw)
			size += data->raw->size;
		else
			size += sizeof(u32);

		WARN_ON_ONCE(size & (sizeof(u64)-1));
		header.size += size;
	}
	}


	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2783,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
		}
		}
	}
	}


	if (sample_type & PERF_SAMPLE_TP_RECORD)
	if (sample_type & PERF_SAMPLE_RAW) {
		perf_output_copy(&handle, tp->record, tp->size);
		if (data->raw) {
			perf_output_put(&handle, data->raw->size);
			perf_output_copy(&handle, data->raw->data, data->raw->size);
		} else {
			struct {
				u32	size;
				u32	data;
			} raw = {
				.size = sizeof(u32),
				.data = 0,
			};
			perf_output_put(&handle, raw);
		}
	}


	perf_output_end(&handle);
	perf_output_end(&handle);
}
}
@@ -2850,6 +2869,7 @@ perf_counter_read_event(struct perf_counter *counter,


struct perf_task_event {
struct perf_task_event {
	struct task_struct		*task;
	struct task_struct		*task;
	struct perf_counter_context	*task_ctx;


	struct {
	struct {
		struct perf_event_header	header;
		struct perf_event_header	header;
@@ -2909,24 +2929,23 @@ static void perf_counter_task_ctx(struct perf_counter_context *ctx,
static void perf_counter_task_event(struct perf_task_event *task_event)
static void perf_counter_task_event(struct perf_task_event *task_event)
{
{
	struct perf_cpu_context *cpuctx;
	struct perf_cpu_context *cpuctx;
	struct perf_counter_context *ctx;
	struct perf_counter_context *ctx = task_event->task_ctx;


	cpuctx = &get_cpu_var(perf_cpu_context);
	cpuctx = &get_cpu_var(perf_cpu_context);
	perf_counter_task_ctx(&cpuctx->ctx, task_event);
	perf_counter_task_ctx(&cpuctx->ctx, task_event);
	put_cpu_var(perf_cpu_context);
	put_cpu_var(perf_cpu_context);


	rcu_read_lock();
	rcu_read_lock();
	/*
	if (!ctx)
	 * doesn't really matter which of the child contexts the
		ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
	 * events ends up in.
	 */
	ctx = rcu_dereference(current->perf_counter_ctxp);
	if (ctx)
	if (ctx)
		perf_counter_task_ctx(ctx, task_event);
		perf_counter_task_ctx(ctx, task_event);
	rcu_read_unlock();
	rcu_read_unlock();
}
}


static void perf_counter_task(struct task_struct *task, int new)
static void perf_counter_task(struct task_struct *task,
			      struct perf_counter_context *task_ctx,
			      int new)
{
{
	struct perf_task_event task_event;
	struct perf_task_event task_event;


@@ -2937,6 +2956,7 @@ static void perf_counter_task(struct task_struct *task, int new)


	task_event = (struct perf_task_event){
	task_event = (struct perf_task_event){
		.task	  = task,
		.task	  = task,
		.task_ctx = task_ctx,
		.event    = {
		.event    = {
			.header = {
			.header = {
				.type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
				.type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
@@ -2955,7 +2975,7 @@ static void perf_counter_task(struct task_struct *task, int new)


void perf_counter_fork(struct task_struct *task)
void perf_counter_fork(struct task_struct *task)
{
{
	perf_counter_task(task, 1);
	perf_counter_task(task, NULL, 1);
}
}


/*
/*
@@ -3344,87 +3364,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
 * Generic software counter infrastructure
 * Generic software counter infrastructure
 */
 */


static void perf_swcounter_update(struct perf_counter *counter)
/*
 * We directly increment counter->count and keep a second value in
 * counter->hw.period_left to count intervals. This period counter
 * is kept in the range [-sample_period, 0] so that we can use the
 * sign as trigger.
 */

static u64 perf_swcounter_set_period(struct perf_counter *counter)
{
{
	struct hw_perf_counter *hwc = &counter->hw;
	struct hw_perf_counter *hwc = &counter->hw;
	u64 prev, now;
	u64 period = hwc->last_period;
	s64 delta;
	u64 nr, offset;
	s64 old, val;

	hwc->last_period = hwc->sample_period;


again:
again:
	prev = atomic64_read(&hwc->prev_count);
	old = val = atomic64_read(&hwc->period_left);
	now = atomic64_read(&hwc->count);
	if (val < 0)
	if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
		return 0;
		goto again;


	delta = now - prev;
	nr = div64_u64(period + val, period);
	offset = nr * period;
	val -= offset;
	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
		goto again;


	atomic64_add(delta, &counter->count);
	return nr;
	atomic64_sub(delta, &hwc->period_left);
}
}


static void perf_swcounter_set_period(struct perf_counter *counter)
static void perf_swcounter_overflow(struct perf_counter *counter,
				    int nmi, struct perf_sample_data *data)
{
{
	struct hw_perf_counter *hwc = &counter->hw;
	struct hw_perf_counter *hwc = &counter->hw;
	s64 left = atomic64_read(&hwc->period_left);
	u64 overflow;
	s64 period = hwc->sample_period;


	if (unlikely(left <= -period)) {
	data->period = counter->hw.last_period;
		left = period;
	overflow = perf_swcounter_set_period(counter);
		atomic64_set(&hwc->period_left, left);
		hwc->last_period = period;
	}


	if (unlikely(left <= 0)) {
	if (hwc->interrupts == MAX_INTERRUPTS)
		left += period;
		return;
		atomic64_add(period, &hwc->period_left);
		hwc->last_period = period;
	}


	atomic64_set(&hwc->prev_count, -left);
	for (; overflow; overflow--) {
	atomic64_set(&hwc->count, -left);
		if (perf_counter_overflow(counter, nmi, data)) {
			/*
			 * We inhibit the overflow from happening when
			 * hwc->interrupts == MAX_INTERRUPTS.
			 */
			break;
		}
	}
}
}


static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
static void perf_swcounter_unthrottle(struct perf_counter *counter)
{
{
	enum hrtimer_restart ret = HRTIMER_RESTART;
	struct perf_sample_data data;
	struct perf_counter *counter;
	u64 period;

	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
	counter->pmu->read(counter);

	data.addr = 0;
	data.regs = get_irq_regs();
	/*
	/*
	 * In case we exclude kernel IPs or are somehow not in interrupt
	 * Nothing to do, we already reset hwc->interrupts.
	 * context, provide the next best thing, the user IP.
	 */
	 */
	if ((counter->attr.exclude_kernel || !data.regs) &&
			!counter->attr.exclude_user)
		data.regs = task_pt_regs(current);

	if (data.regs) {
		if (perf_counter_overflow(counter, 0, &data))
			ret = HRTIMER_NORESTART;
	}

	period = max_t(u64, 10000, counter->hw.sample_period);
	hrtimer_forward_now(hrtimer, ns_to_ktime(period));

	return ret;
}
}


static void perf_swcounter_overflow(struct perf_counter *counter,
static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
			       int nmi, struct perf_sample_data *data)
			       int nmi, struct perf_sample_data *data)
{
{
	data->period = counter->hw.last_period;
	struct hw_perf_counter *hwc = &counter->hw;


	perf_swcounter_update(counter);
	atomic64_add(nr, &counter->count);
	perf_swcounter_set_period(counter);

	if (perf_counter_overflow(counter, nmi, data))
	if (!hwc->sample_period)
		/* soft-disable the counter */
		return;
		;

	if (!data->regs)
		return;

	if (!atomic64_add_negative(nr, &hwc->period_left))
		perf_swcounter_overflow(counter, nmi, data);
}
}


static int perf_swcounter_is_counting(struct perf_counter *counter)
static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3488,15 +3502,6 @@ static int perf_swcounter_match(struct perf_counter *counter,
	return 1;
	return 1;
}
}


static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
			       int nmi, struct perf_sample_data *data)
{
	int neg = atomic64_add_negative(nr, &counter->hw.count);

	if (counter->hw.sample_period && !neg && data->regs)
		perf_swcounter_overflow(counter, nmi, data);
}

static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
				     enum perf_type_id type,
				     enum perf_type_id type,
				     u32 event, u64 nr, int nmi,
				     u32 event, u64 nr, int nmi,
@@ -3575,26 +3580,65 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,


static void perf_swcounter_read(struct perf_counter *counter)
static void perf_swcounter_read(struct perf_counter *counter)
{
{
	perf_swcounter_update(counter);
}
}


static int perf_swcounter_enable(struct perf_counter *counter)
static int perf_swcounter_enable(struct perf_counter *counter)
{
{
	struct hw_perf_counter *hwc = &counter->hw;

	if (hwc->sample_period) {
		hwc->last_period = hwc->sample_period;
		perf_swcounter_set_period(counter);
		perf_swcounter_set_period(counter);
	}
	return 0;
	return 0;
}
}


static void perf_swcounter_disable(struct perf_counter *counter)
static void perf_swcounter_disable(struct perf_counter *counter)
{
{
	perf_swcounter_update(counter);
}
}


static const struct pmu perf_ops_generic = {
static const struct pmu perf_ops_generic = {
	.enable		= perf_swcounter_enable,
	.enable		= perf_swcounter_enable,
	.disable	= perf_swcounter_disable,
	.disable	= perf_swcounter_disable,
	.read		= perf_swcounter_read,
	.read		= perf_swcounter_read,
	.unthrottle	= perf_swcounter_unthrottle,
};
};


/*
 * hrtimer based swcounter callback
 */

static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
{
	enum hrtimer_restart ret = HRTIMER_RESTART;
	struct perf_sample_data data;
	struct perf_counter *counter;
	u64 period;

	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
	counter->pmu->read(counter);

	data.addr = 0;
	data.regs = get_irq_regs();
	/*
	 * In case we exclude kernel IPs or are somehow not in interrupt
	 * context, provide the next best thing, the user IP.
	 */
	if ((counter->attr.exclude_kernel || !data.regs) &&
			!counter->attr.exclude_user)
		data.regs = task_pt_regs(current);

	if (data.regs) {
		if (perf_counter_overflow(counter, 0, &data))
			ret = HRTIMER_NORESTART;
	}

	period = max_t(u64, 10000, counter->hw.sample_period);
	hrtimer_forward_now(hrtimer, ns_to_ktime(period));

	return ret;
}

/*
/*
 * Software counter: cpu wall time clock
 * Software counter: cpu wall time clock
 */
 */
@@ -3715,15 +3759,15 @@ static const struct pmu perf_ops_task_clock = {
void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
			  int entry_size)
			  int entry_size)
{
{
	struct perf_tracepoint_record tp = {
	struct perf_raw_record raw = {
		.size = entry_size,
		.size = entry_size,
		.record = record,
		.data = record,
	};
	};


	struct perf_sample_data data = {
	struct perf_sample_data data = {
		.regs = get_irq_regs(),
		.regs = get_irq_regs(),
		.addr = addr,
		.addr = addr,
		.private = &tp,
		.raw = &raw,
	};
	};


	if (!data.regs)
	if (!data.regs)
@@ -3743,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter)


static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
{
{
	/*
	 * Raw tracepoint data is a severe data leak, only allow root to
	 * have these.
	 */
	if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
			!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	if (ftrace_profile_enable(counter->attr.config))
	if (ftrace_profile_enable(counter->attr.config))
		return NULL;
		return NULL;


@@ -4285,7 +4337,7 @@ void perf_counter_exit_task(struct task_struct *child)
	unsigned long flags;
	unsigned long flags;


	if (likely(!child->perf_counter_ctxp)) {
	if (likely(!child->perf_counter_ctxp)) {
		perf_counter_task(child, 0);
		perf_counter_task(child, NULL, 0);
		return;
		return;
	}
	}


@@ -4305,6 +4357,7 @@ void perf_counter_exit_task(struct task_struct *child)
	 * incremented the context's refcount before we do put_ctx below.
	 * incremented the context's refcount before we do put_ctx below.
	 */
	 */
	spin_lock(&child_ctx->lock);
	spin_lock(&child_ctx->lock);
	child->perf_counter_ctxp = NULL;
	/*
	/*
	 * If this context is a clone; unclone it so it can't get
	 * If this context is a clone; unclone it so it can't get
	 * swapped to another process while we're removing all
	 * swapped to another process while we're removing all
@@ -4318,9 +4371,7 @@ void perf_counter_exit_task(struct task_struct *child)
	 * won't get any samples after PERF_EVENT_EXIT. We can however still
	 * won't get any samples after PERF_EVENT_EXIT. We can however still
	 * get a few PERF_EVENT_READ events.
	 * get a few PERF_EVENT_READ events.
	 */
	 */
	perf_counter_task(child, 0);
	perf_counter_task(child, child_ctx, 0);

	child->perf_counter_ctxp = NULL;


	/*
	/*
	 * We can recurse on the same lock type through:
	 * We can recurse on the same lock type through:
+225 −0
Original line number Original line Diff line number Diff line

		------------------------------
		****** perf by examples ******
		------------------------------

[ From an e-mail by Ingo Molnar, http://lkml.org/lkml/2009/8/4/346 ]


First, discovery/enumeration of available counters can be done via
'perf list':

titan:~> perf list
  [...]
  kmem:kmalloc                             [Tracepoint event]
  kmem:kmem_cache_alloc                    [Tracepoint event]
  kmem:kmalloc_node                        [Tracepoint event]
  kmem:kmem_cache_alloc_node               [Tracepoint event]
  kmem:kfree                               [Tracepoint event]
  kmem:kmem_cache_free                     [Tracepoint event]
  kmem:mm_page_free_direct                 [Tracepoint event]
  kmem:mm_pagevec_free                     [Tracepoint event]
  kmem:mm_page_alloc                       [Tracepoint event]
  kmem:mm_page_alloc_zone_locked           [Tracepoint event]
  kmem:mm_page_pcpu_drain                  [Tracepoint event]
  kmem:mm_page_alloc_extfrag               [Tracepoint event]

Then any (or all) of the above event sources can be activated and
measured. For example the page alloc/free properties of a 'hackbench
run' are:

 titan:~> perf stat -e kmem:mm_page_pcpu_drain -e kmem:mm_page_alloc
 -e kmem:mm_pagevec_free -e kmem:mm_page_free_direct ./hackbench 10
 Time: 0.575

 Performance counter stats for './hackbench 10':

          13857  kmem:mm_page_pcpu_drain
          27576  kmem:mm_page_alloc
           6025  kmem:mm_pagevec_free
          20934  kmem:mm_page_free_direct

    0.613972165  seconds time elapsed

You can observe the statistical properties as well, by using the
'repeat the workload N times' feature of perf stat:

 titan:~> perf stat --repeat 5 -e kmem:mm_page_pcpu_drain -e
   kmem:mm_page_alloc -e kmem:mm_pagevec_free -e
   kmem:mm_page_free_direct ./hackbench 10
 Time: 0.627
 Time: 0.644
 Time: 0.564
 Time: 0.559
 Time: 0.626

 Performance counter stats for './hackbench 10' (5 runs):

          12920  kmem:mm_page_pcpu_drain    ( +-   3.359% )
          25035  kmem:mm_page_alloc         ( +-   3.783% )
           6104  kmem:mm_pagevec_free       ( +-   0.934% )
          18376  kmem:mm_page_free_direct   ( +-   4.941% )

    0.643954516  seconds time elapsed   ( +-   2.363% )

Furthermore, these tracepoints can be used to sample the workload as
well. For example the page allocations done by a 'git gc' can be
captured the following way:

 titan:~/git> perf record -f -e kmem:mm_page_alloc -c 1 ./git gc
 Counting objects: 1148, done.
 Delta compression using up to 2 threads.
 Compressing objects: 100% (450/450), done.
 Writing objects: 100% (1148/1148), done.
 Total 1148 (delta 690), reused 1148 (delta 690)
 [ perf record: Captured and wrote 0.267 MB perf.data (~11679 samples) ]

To check which functions generated page allocations:

 titan:~/git> perf report
 # Samples: 10646
 #
 # Overhead          Command               Shared Object
 # ........  ...............  ..........................
 #
    23.57%       git-repack  /lib64/libc-2.5.so
    21.81%              git  /lib64/libc-2.5.so
    14.59%              git  ./git
    11.79%       git-repack  ./git
     7.12%              git  /lib64/ld-2.5.so
     3.16%       git-repack  /lib64/libpthread-2.5.so
     2.09%       git-repack  /bin/bash
     1.97%               rm  /lib64/libc-2.5.so
     1.39%               mv  /lib64/ld-2.5.so
     1.37%               mv  /lib64/libc-2.5.so
     1.12%       git-repack  /lib64/ld-2.5.so
     0.95%               rm  /lib64/ld-2.5.so
     0.90%  git-update-serv  /lib64/libc-2.5.so
     0.73%  git-update-serv  /lib64/ld-2.5.so
     0.68%             perf  /lib64/libpthread-2.5.so
     0.64%       git-repack  /usr/lib64/libz.so.1.2.3

Or to see it on a more finegrained level:

titan:~/git> perf report --sort comm,dso,symbol
# Samples: 10646
#
# Overhead          Command               Shared Object  Symbol
# ........  ...............  ..........................  ......
#
     9.35%       git-repack  ./git                       [.] insert_obj_hash
     9.12%              git  ./git                       [.] insert_obj_hash
     7.31%              git  /lib64/libc-2.5.so          [.] memcpy
     6.34%       git-repack  /lib64/libc-2.5.so          [.] _int_malloc
     6.24%       git-repack  /lib64/libc-2.5.so          [.] memcpy
     5.82%       git-repack  /lib64/libc-2.5.so          [.] __GI___fork
     5.47%              git  /lib64/libc-2.5.so          [.] _int_malloc
     2.99%              git  /lib64/libc-2.5.so          [.] memset

Furthermore, call-graph sampling can be done too, of page
allocations - to see precisely what kind of page allocations there
are:

 titan:~/git> perf record -f -g -e kmem:mm_page_alloc -c 1 ./git gc
 Counting objects: 1148, done.
 Delta compression using up to 2 threads.
 Compressing objects: 100% (450/450), done.
 Writing objects: 100% (1148/1148), done.
 Total 1148 (delta 690), reused 1148 (delta 690)
 [ perf record: Captured and wrote 0.963 MB perf.data (~42069 samples) ]

 titan:~/git> perf report -g
 # Samples: 10686
 #
 # Overhead          Command               Shared Object
 # ........  ...............  ..........................
 #
    23.25%       git-repack  /lib64/libc-2.5.so
                |
                |--50.00%-- _int_free
                |
                |--37.50%-- __GI___fork
                |          make_child
                |
                |--12.50%-- ptmalloc_unlock_all2
                |          make_child
                |
                 --6.25%-- __GI_strcpy
    21.61%              git  /lib64/libc-2.5.so
                |
                |--30.00%-- __GI_read
                |          |
                |           --83.33%-- git_config_from_file
                |                     git_config
                |                     |
   [...]

Or you can observe the whole system's page allocations for 10
seconds:

titan:~/git> perf stat -a -e kmem:mm_page_pcpu_drain -e
kmem:mm_page_alloc -e kmem:mm_pagevec_free -e
kmem:mm_page_free_direct sleep 10

 Performance counter stats for 'sleep 10':

         171585  kmem:mm_page_pcpu_drain
         322114  kmem:mm_page_alloc
          73623  kmem:mm_pagevec_free
         254115  kmem:mm_page_free_direct

   10.000591410  seconds time elapsed

Or observe how fluctuating the page allocations are, via statistical
analysis done over ten 1-second intervals:

 titan:~/git> perf stat --repeat 10 -a -e kmem:mm_page_pcpu_drain -e
   kmem:mm_page_alloc -e kmem:mm_pagevec_free -e
   kmem:mm_page_free_direct sleep 1

 Performance counter stats for 'sleep 1' (10 runs):

          17254  kmem:mm_page_pcpu_drain    ( +-   3.709% )
          34394  kmem:mm_page_alloc         ( +-   4.617% )
           7509  kmem:mm_pagevec_free       ( +-   4.820% )
          25653  kmem:mm_page_free_direct   ( +-   3.672% )

    1.058135029  seconds time elapsed   ( +-   3.089% )

Or you can annotate the recorded 'git gc' run on a per symbol basis
and check which instructions/source-code generated page allocations:

 titan:~/git> perf annotate __GI___fork
 ------------------------------------------------
  Percent |      Source code & Disassembly of libc-2.5.so
 ------------------------------------------------
          :
          :
          :      Disassembly of section .plt:
          :      Disassembly of section .text:
          :
          :      00000031a2e95560 <__fork>:
 [...]
     0.00 :        31a2e95602:   b8 38 00 00 00          mov    $0x38,%eax
     0.00 :        31a2e95607:   0f 05                   syscall
    83.42 :        31a2e95609:   48 3d 00 f0 ff ff       cmp    $0xfffffffffffff000,%rax
     0.00 :        31a2e9560f:   0f 87 4d 01 00 00       ja     31a2e95762 <__fork+0x202>
     0.00 :        31a2e95615:   85 c0                   test   %eax,%eax

( this shows that 83.42% of __GI___fork's page allocations come from
  the 0x38 system call it performs. )

etc. etc. - a lot more is possible. I could list a dozen
other different use cases straight away - none of which is
possible via /proc/vmstat.

/proc/vmstat is not in the same league really, in terms of
expressive power of system analysis and performance
analysis.

All that the above results needed were those new tracepoints
in include/tracing/events/kmem.h.

	Ingo

Loading