
Commit 913317fe authored by Sultan Alsawaf, committed by Pranav Vashi

memlat: Optimize perf event reads when possible



We can skip the locking and other overhead of perf_event_read_value()
when we know in advance that the perf event in question can be read from
the current CPU. This occurs when either the perf event permits reads
from CPUs other than the one it's on, or when the CPU doing the read is
the same CPU that owns the perf event.
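
Concretely, the new read path boils down to the decision below. This is
only a condensed sketch of the read_event() hunk in this commit: the
helper name is made up, and the IRQ-disabled window the real code uses
around the local-CPU check is omitted for brevity.

#include <linux/compiler.h>
#include <linux/perf_event.h>
#include <linux/smp.h>

static u64 read_counter(struct perf_event *pevent, bool any_cpu_readable)
{
	u64 total, enabled, running;

	if (any_cpu_readable ||
	    READ_ONCE(pevent->oncpu) == raw_smp_processor_id()) {
		/* Fast path: a local read needs no IPI and no ctx locking */
		if (perf_event_read_local(pevent, &total))
			return 0;
	} else {
		/* Slow path: pays the cross-CPU IPI and locking cost */
		total = perf_event_read_value(pevent, &enabled, &running);
	}

	return total;
}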

Our PMU drivers only set two possible values for `readable_on_cpus`:
CPU_MASK_ALL or nothing. As such, we can simply check for CPU_MASK_ALL
beforehand in order to determine if the perf event allows non-local
reads.
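
A minimal sketch of that setup-time check, assuming the downstream
readable_on_cpus cpumask our PMU drivers populate (the helper name is
illustrative; the real commit stores the result in
event_data.any_cpu_readable, see the set_events() hunk below):

#include <linux/cpumask.h>
#include <linux/perf_event.h>

static struct cpumask all_cpu_mask = CPU_MASK_ALL;

/* True when the PMU driver marked the event as readable from every CPU */
static bool event_any_cpu_readable(struct perf_event *pevent)
{
	return cpumask_equal(&pevent->readable_on_cpus, &all_cpu_mask);
}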

We can also reduce the scope of under_scm_call() now that we know which
CPU we're reading a perf event from: making the check per-CPU lowers its
false-positive rate.
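
The per-CPU tracking amounts to bumping a per-CPU atomic counter around
each SCM call and letting readers query a single CPU. A condensed sketch
of the scm.c side of this commit follows; the wrapper name and its
function-pointer parameter are illustrative, since the real code
open-codes the inc/dec in each __scm_call_armv8_* variant:

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(atomic_t, scm_call_count);

/* Mark the calling CPU as busy in SCM for the duration of the call */
static int scm_call_counted(int (*do_scm_call)(void))
{
	atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
	int ret;

	atomic_inc(cnt);
	ret = do_scm_call();
	atomic_dec(cnt);

	return ret;
}

/* Callers can now ask about one specific CPU instead of the whole system */
bool under_scm_call(int cpu)
{
	return atomic_read(per_cpu_ptr(&scm_call_count, cpu));
}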

Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
Signed-off-by: Pranav Vashi <neobuddy89@gmail.com>
parent 5ebb06d3
+35 −8
@@ -49,6 +49,7 @@ enum ev_index {
 struct event_data {
 	struct perf_event *pevent;
 	unsigned long prev_count;
+	bool any_cpu_readable;
 };
 
 struct cpu_pmu_stats {
@@ -101,7 +102,37 @@ static inline unsigned long read_event(struct event_data *event)
 	if (!event->pevent)
 		return 0;
 
-	total = perf_event_read_value(event->pevent, &enabled, &running);
+	if (event->any_cpu_readable) {
+		if (perf_event_read_local(event->pevent, &total))
+			return 0;
+	} else {
+		unsigned int ev_cpu = READ_ONCE(event->pevent->oncpu);
+		bool local_read;
+		int ret;
+
+		if (ev_cpu >= nr_cpu_ids)
+			return 0;
+
+		local_irq_disable();
+		if ((local_read = (ev_cpu == raw_smp_processor_id())))
+			ret = perf_event_read_local(event->pevent, &total);
+		local_irq_enable();
+
+		if (!local_read) {
+			/*
+			 * Some SCM calls take very long (20+ ms), so the perf
+			 * event IPI could lag on the CPU running the SCM call.
+			 */
+			if (under_scm_call(ev_cpu))
+				return 0;
+
+			total = perf_event_read_value(event->pevent, &enabled,
+						      &running);
+		} else if (ret) {
+			return ret;
+		}
+	}
+
 	ev_count = total - event->prev_count;
 	event->prev_count = total;
 	return ev_count;
@@ -141,13 +172,6 @@ static void delete_events(struct cpu_pmu_stats *cpustats)
 {
 	int i;
 
-	/*
-	 * Some of SCM call is very heavy(+20ms) so perf IPI could
-	 * be stuck on the CPU which contributes long latency.
-	 */
-	if (under_scm_call())
-		return;
-
 	for (i = 0; i < ARRAY_SIZE(cpustats->events); i++) {
 		cpustats->events[i].prev_count = 0;
 		if (cpustats->events[i].pevent) {
@@ -193,6 +217,7 @@ static struct perf_event_attr *alloc_attr(void)
 
 static int set_events(struct cpu_grp_info *cpu_grp, int cpu)
 {
+	static struct cpumask all_cpu_mask = CPU_MASK_ALL;
 	struct perf_event *pevent;
 	struct perf_event_attr *attr;
 	int err, i;
@@ -216,6 +241,8 @@ static int set_events(struct cpu_grp_info *cpu_grp, int cpu)
 			goto err_out;
 		cpustats->events[i].pevent = pevent;
 		perf_event_enable(pevent);
+		cpustats->events[i].any_cpu_readable =
+			cpumask_equal(&pevent->readable_on_cpus, &all_cpu_mask);
 	}
 
 	kfree(attr);
+12 −9
@@ -36,7 +36,7 @@
 #define SCM_EBUSY		-55
 #define SCM_V2_EBUSY		-12
 
-static atomic_t scm_call_count = ATOMIC_INIT(0);
+static DEFINE_PER_CPU(atomic_t, scm_call_count);
 static DEFINE_MUTEX(scm_lock);
 
 /*
@@ -433,11 +433,12 @@ static int ___scm_call_armv8_64(u64 x0, u64 x1, u64 x2, u64 x3, u64 x4, u64 x5,
 static int __scm_call_armv8_64(u64 x0, u64 x1, u64 x2, u64 x3, u64 x4, u64 x5,
 				u64 *ret1, u64 *ret2, u64 *ret3)
 {
+	atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
 	int ret;
 
-	atomic_inc(&scm_call_count);
+	atomic_inc(cnt);
 	ret = ___scm_call_armv8_64(x0, x1, x2, x3, x4, x5, ret1, ret2, ret3);
-	atomic_dec(&scm_call_count);
+	atomic_dec(cnt);
 
 	return ret;
 }
@@ -495,11 +496,12 @@ static int ___scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
 static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
 				u64 *ret1, u64 *ret2, u64 *ret3)
 {
+	atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
 	int ret;
 
-	atomic_inc(&scm_call_count);
+	atomic_inc(cnt);
 	ret = ___scm_call_armv8_32(w0, w1, w2, w3, w4, w5, ret1, ret2, ret3);
-	atomic_dec(&scm_call_count);
+	atomic_dec(cnt);
 
 	return ret;
 }
@@ -557,11 +559,12 @@ static int ___scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
 static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
 				u64 *ret1, u64 *ret2, u64 *ret3)
 {
+	atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
 	int ret;
 
-	atomic_inc(&scm_call_count);
+	atomic_inc(cnt);
 	ret = ___scm_call_armv8_32(w0, w1, w2, w3, w4, w5, ret1, ret2, ret3);
-	atomic_dec(&scm_call_count);
+	atomic_dec(cnt);
 
 	return ret;
 }
@@ -1352,7 +1355,7 @@ inline int scm_enable_mem_protection(void)
 #endif
 EXPORT_SYMBOL(scm_enable_mem_protection);
 
-bool under_scm_call(void)
+bool under_scm_call(int cpu)
 {
-	return atomic_read(&scm_call_count);
+	return atomic_read(per_cpu_ptr(&scm_call_count, cpu));
 }
+2 −2
@@ -124,7 +124,7 @@ struct scm_hdcp_req {
 };
 
 extern struct mutex scm_lmh_lock;
-extern bool under_scm_call(void);
+extern bool under_scm_call(int cpu);
 
 #else
 
@@ -188,7 +188,7 @@ static inline int scm_enable_mem_protection(void)
 	return 0;
 }
 
-extern bool under_scm_call(void)
+extern bool under_scm_call(int cpu)
 {
 	return false;
 }