Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d8dec295 authored by Harshdeep Dhatt
Browse files

msm: kgsl: Add support for kernel profiling in hw scheduler



Enable kernel command profiling by sending the HFI feature
flag to the GMU. Add support for the sync event, which maps GPU/GMU
ticks to ftrace time. Record the GPU ticks when a command is
submitted to the HFI dispatch queue. The ticks corresponding to
submission to the ringbuffer, start, and retire on the GPU/GMU will be
conveyed to kgsl by the GMU through the HFI retire packet.
Modify the trace events to log this information appropriately.

Change-Id: Ib6b5a7858848d8d409925015aad719322de8c12d
Signed-off-by: Sushmita Susheelendra <ssusheel@codeaurora.org>
Signed-off-by: Harshdeep Dhatt <hdhatt@codeaurora.org>
parent e6245a54
Loading
Loading
Loading
Loading
+9 −4
Original line number Diff line number Diff line
@@ -60,6 +60,7 @@
#define HFI_FEATURE_BCL		11
#define HFI_FEATURE_ACD		12
#define HFI_FEATURE_DIDT	13
#define HFI_FEATURE_KPROF	14

#define HFI_VALUE_FT_POLICY		100
#define HFI_VALUE_RB_MAX_CMDS		101
@@ -481,10 +482,14 @@ struct hfi_ts_notify_cmd {

/* F2H */
struct hfi_ts_retire_cmd {
	uint32_t hdr;
	uint32_t ctxt_id;
	uint32_t ts;
	uint32_t ret;
	u32 hdr;
	u32 ctxt_id;
	u32 ts;
	u32 type;
	u64 submitted_to_rb;
	u64 sop;
	u64 eop;
	u64 retired_on_gmu;
} __packed;

/* H2F */
+96 −4
Original line number Diff line number Diff line
@@ -4,11 +4,13 @@
 */

#include <linux/iommu.h>
#include <linux/sched/clock.h>

#include "adreno.h"
#include "adreno_a6xx.h"
#include "adreno_a6xx_hwsched.h"
#include "adreno_pm4types.h"
#include "adreno_trace.h"
#include "kgsl_device.h"
#include "kgsl_pwrctrl.h"
#include "kgsl_trace.h"
@@ -148,6 +150,35 @@ static void a6xx_receive_ack_async(struct adreno_device *adreno_dev, void *rcvd)
			MSG_HDR_GET_SEQNUM(waiters[i]));
}

/*
 * get_level() - Convert a context priority into a priority level
 * @priority: Context priority value
 *
 * Return: @priority divided by KGSL_PRIORITY_MAX_RB_LEVELS, capped at
 * KGSL_PRIORITY_MAX_RB_LEVELS - 1.
 */
static u32 get_level(u32 priority)
{
	const u32 highest = KGSL_PRIORITY_MAX_RB_LEVELS - 1;
	u32 level = priority / KGSL_PRIORITY_MAX_RB_LEVELS;

	/* Any priority that maps past the last level is clamped onto it */
	if (level > highest)
		return highest;

	return level;
}

/*
 * log_profiling_info() - Emit the retire trace event for a retire packet
 * @adreno_dev: Adreno device the packet was received on
 * @rcvd: Raw packet words, interpreted as a struct hfi_ts_retire_cmd
 *
 * Looks up the context named in the packet, copies the tick values carried
 * by the packet into a retire_info and logs them through the
 * adreno_cmdbatch_retired trace event. Nothing is logged if the context
 * lookup fails.
 */
static void log_profiling_info(struct adreno_device *adreno_dev, u32 *rcvd)
{
	struct hfi_ts_retire_cmd *retire = (struct hfi_ts_retire_cmd *)rcvd;
	struct kgsl_context *ctxt;
	/* Members not named here (e.g. inflight, rptr, wptr) stay zero */
	struct retire_info info = {
		.timestamp = retire->ts,
		.submitted_to_rb = retire->submitted_to_rb,
		.sop = retire->sop,
		.eop = retire->eop,
		.retired_on_gmu = retire->retired_on_gmu,
	};

	ctxt = kgsl_context_get(KGSL_DEVICE(adreno_dev), retire->ctxt_id);
	if (!ctxt)
		return;

	/* These two come from the context rather than from the packet */
	info.gmu_dispatch_queue = ctxt->gmu_dispatch_queue;
	info.rb_id = get_level(ctxt->priority);

	trace_adreno_cmdbatch_retired(ctxt, &info, 0, 0, 0);

	/* Drop the reference taken by kgsl_context_get() */
	kgsl_context_put(ctxt);
}

struct f2h_packet {
	/** @rcvd: the contents of the fw to host packet */
	u32 rcvd[MAX_RCVD_SIZE];
@@ -692,6 +723,10 @@ int a6xx_hwsched_hfi_start(struct adreno_device *adreno_dev)
	if (ret)
		goto err;

	ret = a6xx_hfi_send_feature_ctrl(adreno_dev, HFI_FEATURE_KPROF, 1, 0);
	if (ret)
		return ret;

	ret = a6xx_hfi_send_core_fw_start(adreno_dev);
	if (ret)
		goto err;
@@ -784,6 +819,12 @@ int a6xx_hwsched_cp_init(struct adreno_device *adreno_dev)
	return ret;
}

/*
 * process_ts_retire() - Handle a timestamp retire packet
 * @adreno_dev: Adreno device the packet was received on
 * @rcvd: Raw words of the received packet
 *
 * Logs the profiling information carried by the packet and then triggers
 * the hardware scheduler.
 */
static void process_ts_retire(struct adreno_device *adreno_dev, u32 *rcvd)
{
	log_profiling_info(adreno_dev, rcvd);
	adreno_hwsched_trigger(adreno_dev);
}

static int hfi_f2h_main(void *arg)
{
	struct adreno_device *adreno_dev = arg;
@@ -805,7 +846,7 @@ static int hfi_f2h_main(void *arg)

		llist_for_each_entry_safe(pkt, tmp, list, node) {
			if (MSG_HDR_GET_ID(pkt->rcvd[0]) == F2H_MSG_TS_RETIRE)
				adreno_hwsched_trigger(adreno_dev);
				process_ts_retire(adreno_dev, pkt->rcvd);

			kmem_cache_free(f2h_cache, pkt);
		}
@@ -844,6 +885,54 @@ int a6xx_hwsched_hfi_probe(struct adreno_device *adreno_dev)
	return 0;
}

/*
 * add_profile_events() - Sample GPU and CPU clocks and trace a submission
 * @adreno_dev: Adreno device whose always-on counter is read
 * @drawobj: Draw object being submitted; supplies the context and timestamp
 * @time: Filled with the always-on ticks, the local_clock() value and the
 *        wall-clock time
 *
 * Emits the adreno_cmdbatch_sync event (GPU ticks) and then the
 * adreno_cmdbatch_submitted event (ticks plus kernel time).
 */
static void add_profile_events(struct adreno_device *adreno_dev,
	struct kgsl_drawobj *drawobj, struct adreno_submit_time *time)
{
	struct kgsl_context *context = drawobj->context;
	struct submission_info info = {0};
	unsigned long irq_flags;
	unsigned long rem_ns;
	u64 secs;

	/*
	 * The always-on counter (GPU time domain) and local_clock() (CPU
	 * time domain) are sampled back to back so the two can be mapped
	 * onto each other for debugging and profiling. Interrupts are kept
	 * off for the duration so that no interrupt handler can run between
	 * the two samples and skew the mapping.
	 */
	local_irq_save(irq_flags);

	/* GPU side: always-on counter */
	time->ticks = a6xx_read_alwayson(adreno_dev);

	/* Log the GPU ticks so they can be correlated with ftrace time */
	trace_adreno_cmdbatch_sync(context->id, context->priority,
		drawobj->timestamp, time->ticks);

	/* CPU side: kernel clock */
	time->ktime = local_clock();

	/* Wall-clock time of day */
	getnstimeofday(&time->utime);

	local_irq_restore(irq_flags);

	/*
	 * Split the kernel clock sample into whole seconds and the
	 * nanosecond remainder: do_div() leaves the quotient in 'secs'
	 * and returns the remainder.
	 */
	secs = time->ktime;
	rem_ns = do_div(secs, 1000000000);

	info.gmu_dispatch_queue = context->gmu_dispatch_queue;
	info.rb_id = get_level(context->priority);
	info.inflight = -1;

	/* The remainder is reported in microseconds */
	trace_adreno_cmdbatch_submitted(drawobj, &info, time->ticks,
		(unsigned long) secs, rem_ns / 1000, 0);
}

#define CTXT_FLAG_PMODE                 0x00000001
#define CTXT_FLAG_SWITCH_INTERNAL       0x00000002
#define CTXT_FLAG_SWITCH                0x00000008
@@ -880,9 +969,7 @@ static u32 get_next_dq(u32 priority)

static u32 get_dq_id(u32 priority)
{
	u32 level = priority / KGSL_PRIORITY_MAX_RB_LEVELS;

	level = min_t(u32, level, KGSL_PRIORITY_MAX_RB_LEVELS - 1);
	u32 level = get_level(priority);

	return get_next_dq(level);
}
@@ -967,6 +1054,7 @@ int a6xx_hwsched_submit_cmdobj(struct adreno_device *adreno_dev,
	struct kgsl_drawobj *drawobj = DRAWOBJ(cmdobj);
	struct hfi_issue_ib *issue_ib;
	struct hfi_submit_cmd *cmd;
	struct adreno_submit_time time = {0};

	ret = hfi_context_register(adreno_dev, drawobj->context);
	if (ret)
@@ -1018,6 +1106,10 @@ int a6xx_hwsched_submit_cmdobj(struct adreno_device *adreno_dev,
	 */
	wmb();

	add_profile_events(adreno_dev, drawobj, &time);

	cmdobj->submit_ticks = time.ticks;

	/* Send interrupt to GMU to receive the message */
	gmu_core_regwrite(KGSL_DEVICE(adreno_dev), A6XX_GMU_HOST2GMU_INTR_SET,
		DISPQ_IRQ_BIT(drawobj->context->gmu_dispatch_queue));
+46 −19
Original line number Diff line number Diff line
@@ -271,6 +271,8 @@ static void _retire_timestamp(struct kgsl_drawobj *drawobj)
	struct kgsl_context *context = drawobj->context;
	struct adreno_context *drawctxt = ADRENO_CONTEXT(context);
	struct kgsl_device *device = context->device;
	struct adreno_ringbuffer *rb = drawctxt->rb;
	struct retire_info info = {0};

	/*
	 * Write the start and end timestamp to the memstore to keep the
@@ -284,21 +286,29 @@ static void _retire_timestamp(struct kgsl_drawobj *drawobj)
		KGSL_MEMSTORE_OFFSET(context->id, eoptimestamp),
		drawobj->timestamp);


	/* Retire pending GPU events for the object */
	kgsl_process_event_group(device, &context->events);

	info.inflight = -1;
	info.rb_id = rb->id;
	info.wptr = rb->wptr;
	info.timestamp = drawobj->timestamp;

	/*
	 * For A3xx we still get the rptr from the CP_RB_RPTR instead of
	 * rptr scratch out address. At this point GPU clocks turned off.
	 * So avoid reading GPU register directly for A3xx.
	 */
	if (adreno_is_a3xx(ADRENO_DEVICE(device)))
		trace_adreno_cmdbatch_retired(drawobj, -1, 0, 0, drawctxt->rb,
				0, 0);
	else
		trace_adreno_cmdbatch_retired(drawobj, -1, 0, 0, drawctxt->rb,
			adreno_get_rptr(drawctxt->rb), 0);
	if (adreno_is_a3xx(ADRENO_DEVICE(device))) {
		trace_adreno_cmdbatch_retired(context, &info,
			drawobj->flags, rb->dispatch_q.inflight, 0);
	} else {
		info.rptr = adreno_get_rptr(rb);

		trace_adreno_cmdbatch_retired(context, &info,
			drawobj->flags, rb->dispatch_q.inflight, 0);
	}

	kgsl_drawobj_destroy(drawobj);
}

@@ -542,6 +552,7 @@ static int sendcmd(struct adreno_device *adreno_dev,
	uint64_t secs = 0;
	unsigned long nsecs = 0;
	int ret;
	struct submission_info info = {0};

	mutex_lock(&device->mutex);
	if (adreno_gpu_halt(adreno_dev) != 0) {
@@ -650,9 +661,15 @@ static int sendcmd(struct adreno_device *adreno_dev,
		dispatch_q->expires = jiffies +
			msecs_to_jiffies(adreno_drawobj_timeout);

	trace_adreno_cmdbatch_submitted(drawobj, (int) dispatcher->inflight,
		time.ticks, (unsigned long) secs, nsecs / 1000, drawctxt->rb,
		adreno_get_rptr(drawctxt->rb));
	info.inflight = (int) dispatcher->inflight;
	info.rb_id = drawctxt->rb->id;
	info.rptr = adreno_get_rptr(drawctxt->rb);
	info.wptr = drawctxt->rb->wptr;
	info.gmu_dispatch_queue = -1;

	trace_adreno_cmdbatch_submitted(drawobj, &info,
			time.ticks, (unsigned long) secs, nsecs / 1000,
			dispatch_q->inflight);

	mutex_unlock(&device->mutex);

@@ -2335,7 +2352,9 @@ static void retire_cmdobj(struct adreno_device *adreno_dev,
	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
	struct kgsl_drawobj *drawobj = DRAWOBJ(cmdobj);
	struct adreno_context *drawctxt = ADRENO_CONTEXT(drawobj->context);
	struct adreno_ringbuffer *rb = drawctxt->rb;
	uint64_t start = 0, end = 0;
	struct retire_info info = {0};

	if (cmdobj->fault_recovery != 0) {
		set_bit(ADRENO_CONTEXT_FAULT, &drawobj->context->priv);
@@ -2345,20 +2364,28 @@ static void retire_cmdobj(struct adreno_device *adreno_dev,
	if (test_bit(CMDOBJ_PROFILE, &cmdobj->priv))
		cmdobj_profile_ticks(adreno_dev, cmdobj, &start, &end);

	info.inflight = (int)dispatcher->inflight;
	info.rb_id = rb->id;
	info.wptr = rb->wptr;
	info.timestamp = drawobj->timestamp;
	info.sop = start;
	info.eop = end;

	/*
	 * For A3xx we still get the rptr from the CP_RB_RPTR instead of
	 * rptr scratch out address. At this point GPU clocks turned off.
	 * So avoid reading GPU register directly for A3xx.
	 */
	if (adreno_is_a3xx(adreno_dev))
		trace_adreno_cmdbatch_retired(drawobj,
			(int) dispatcher->inflight, start, end,
			ADRENO_DRAWOBJ_RB(drawobj), 0, cmdobj->fault_recovery);
	else
		trace_adreno_cmdbatch_retired(drawobj,
			(int) dispatcher->inflight, start, end,
			ADRENO_DRAWOBJ_RB(drawobj),
			adreno_get_rptr(drawctxt->rb), cmdobj->fault_recovery);
	if (adreno_is_a3xx(adreno_dev)) {
		trace_adreno_cmdbatch_retired(drawobj->context, &info,
			drawobj->flags, rb->dispatch_q.inflight,
			cmdobj->fault_recovery);
	} else {
		info.rptr = adreno_get_rptr(rb);
		trace_adreno_cmdbatch_retired(drawobj->context, &info,
			drawobj->flags, rb->dispatch_q.inflight,
			cmdobj->fault_recovery);
	}

	drawctxt->submit_retire_ticks[drawctxt->ticks_index] =
		end - cmdobj->submit_ticks;
+5 −1
Original line number Diff line number Diff line
@@ -34,6 +34,9 @@ static void adreno_get_submit_time(struct adreno_device *adreno_dev,
{
	struct adreno_gpudev *gpudev = ADRENO_GPU_DEVICE(adreno_dev);
	unsigned long flags;
	struct adreno_context *drawctxt = rb->drawctxt_active;
	struct kgsl_context *context = &drawctxt->base;

	/*
	 * Here we are attempting to create a mapping between the
	 * GPU time domain (alwayson counter) and the CPU time domain
@@ -49,7 +52,8 @@ static void adreno_get_submit_time(struct adreno_device *adreno_dev,
	time->ticks = gpudev->read_alwayson(adreno_dev);

	/* Trace the GPU time to create a mapping to ftrace time */
	trace_adreno_cmdbatch_sync(rb->drawctxt_active, time->ticks);
	trace_adreno_cmdbatch_sync(context->id, context->priority,
		drawctxt->timestamp, time->ticks);

	/* Get the kernel clock for time since boot */
	time->ktime = local_clock();
+50 −40
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2019, The Linux Foundation. All rights reserved.
 * Copyright (c) 2013-2020, The Linux Foundation. All rights reserved.
 */

#if !defined(_ADRENO_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
@@ -54,10 +54,10 @@ TRACE_EVENT(adreno_cmdbatch_queued,
);

TRACE_EVENT(adreno_cmdbatch_submitted,
	TP_PROTO(struct kgsl_drawobj *drawobj, int inflight, uint64_t ticks,
		unsigned long secs, unsigned long usecs,
		struct adreno_ringbuffer *rb, unsigned int rptr),
	TP_ARGS(drawobj, inflight, ticks, secs, usecs, rb, rptr),
	TP_PROTO(struct kgsl_drawobj *drawobj, struct submission_info *info,
		uint64_t ticks, unsigned long secs, unsigned long usecs,
		int q_inflight),
	TP_ARGS(drawobj, info, ticks, secs, usecs, q_inflight),
	TP_STRUCT__entry(
		__field(unsigned int, id)
		__field(unsigned int, timestamp)
@@ -71,39 +71,40 @@ TRACE_EVENT(adreno_cmdbatch_submitted,
		__field(unsigned int, rptr)
		__field(unsigned int, wptr)
		__field(int, q_inflight)
		__field(int, dispatch_queue)
	),
	TP_fast_assign(
		__entry->id = drawobj->context->id;
		__entry->timestamp = drawobj->timestamp;
		__entry->inflight = inflight;
		__entry->inflight = info->inflight;
		__entry->flags = drawobj->flags;
		__entry->ticks = ticks;
		__entry->secs = secs;
		__entry->usecs = usecs;
		__entry->prio = drawobj->context->priority;
		__entry->rb_id = rb->id;
		__entry->rptr = rptr;
		__entry->wptr = rb->wptr;
		__entry->q_inflight = rb->dispatch_q.inflight;
		__entry->rb_id = info->rb_id;
		__entry->rptr = info->rptr;
		__entry->wptr = info->wptr;
		__entry->q_inflight = q_inflight;
		__entry->dispatch_queue = info->gmu_dispatch_queue;
	),
	TP_printk(
		"ctx=%u ctx_prio=%d ts=%u inflight=%d flags=%s ticks=%lld time=%lu.%0lu rb_id=%d r/w=%x/%x, q_inflight=%d",
		"ctx=%u ctx_prio=%d ts=%u inflight=%d flags=%s ticks=%lld time=%lu.%0lu rb_id=%d r/w=%x/%x, q_inflight=%d dq_id=%d",
			__entry->id, __entry->prio, __entry->timestamp,
			__entry->inflight,
			__entry->flags ? __print_flags(__entry->flags, "|",
				KGSL_DRAWOBJ_FLAGS) : "none",
			__entry->ticks, __entry->secs, __entry->usecs,
			__entry->rb_id, __entry->rptr, __entry->wptr,
			__entry->q_inflight
			__entry->q_inflight, __entry->dispatch_queue
	)
);

TRACE_EVENT(adreno_cmdbatch_retired,
	TP_PROTO(struct kgsl_drawobj *drawobj, int inflight,
		uint64_t start, uint64_t retire,
		struct adreno_ringbuffer *rb, unsigned int rptr,
		TP_PROTO(struct kgsl_context *context, struct retire_info *info,
			unsigned int flags, int q_inflight,
			unsigned long fault_recovery),
	TP_ARGS(drawobj, inflight, start, retire, rb, rptr, fault_recovery),
	TP_ARGS(context, info, flags, q_inflight, fault_recovery),
	TP_STRUCT__entry(
		__field(unsigned int, id)
		__field(unsigned int, timestamp)
@@ -118,41 +119,50 @@ TRACE_EVENT(adreno_cmdbatch_retired,
		__field(unsigned int, wptr)
		__field(int, q_inflight)
		__field(unsigned long, fault_recovery)
		__field(unsigned int, dispatch_queue)
		__field(uint64_t, submitted_to_rb)
		__field(uint64_t, retired_on_gmu)
		),
	TP_fast_assign(
		__entry->id = drawobj->context->id;
		__entry->timestamp = drawobj->timestamp;
		__entry->inflight = inflight;
		__entry->id = context->id;
		__entry->timestamp = info->timestamp;
		__entry->inflight = info->inflight;
		__entry->recovery = fault_recovery;
		__entry->flags = drawobj->flags;
		__entry->start = start;
		__entry->retire = retire;
		__entry->prio = drawobj->context->priority;
		__entry->rb_id = rb->id;
		__entry->rptr = rptr;
		__entry->wptr = rb->wptr;
		__entry->q_inflight = rb->dispatch_q.inflight;
		__entry->flags = flags;
		__entry->start = info->sop;
		__entry->retire = info->eop;
		__entry->prio = context->priority;
		__entry->rb_id = info->rb_id;
		__entry->rptr = info->rptr;
		__entry->wptr = info->wptr;
		__entry->q_inflight = q_inflight;
		__entry->dispatch_queue = info->gmu_dispatch_queue;
		__entry->submitted_to_rb = info->submitted_to_rb;
		__entry->retired_on_gmu = info->retired_on_gmu;
		),

	TP_printk(
		"ctx=%u ctx_prio=%d ts=%u inflight=%d recovery=%s flags=%s start=%lld retire=%lld rb_id=%d, r/w=%x/%x, q_inflight=%d",
		"ctx=%u prio=%d ts=%u inflight=%d recovery=%s flags=%s start=%llu retire=%llu rb_id=%d, r/w=%x/%x, q_inflight=%d, dq_id=%u, submitted_to_rb=%llu, retired_on_gmu=%llu",
			__entry->id, __entry->prio, __entry->timestamp,
			__entry->inflight,
			__entry->recovery ?
				__print_flags(__entry->recovery, "|",
				__print_flags(__entry->fault_recovery, "|",
				ADRENO_FT_TYPES) : "none",
			__entry->flags ? __print_flags(__entry->flags, "|",
				KGSL_DRAWOBJ_FLAGS) : "none",
			__entry->start,
			__entry->retire,
			__entry->rb_id, __entry->rptr, __entry->wptr,
			__entry->q_inflight
			__entry->q_inflight,
			__entry->dispatch_queue,
			__entry->submitted_to_rb, __entry->retired_on_gmu
	 )
);

TRACE_EVENT(adreno_cmdbatch_sync,
	TP_PROTO(struct adreno_context *drawctxt,
		uint64_t ticks),
	TP_ARGS(drawctxt, ticks),
	TP_PROTO(unsigned int ctx_id, unsigned int ctx_prio,
		unsigned int timestamp,	uint64_t ticks),
	TP_ARGS(ctx_id, ctx_prio, timestamp, ticks),
	TP_STRUCT__entry(
		__field(unsigned int, id)
		__field(unsigned int, timestamp)
@@ -160,10 +170,10 @@ TRACE_EVENT(adreno_cmdbatch_sync,
		__field(int, prio)
	),
	TP_fast_assign(
		__entry->id = drawctxt->base.id;
		__entry->timestamp = drawctxt->timestamp;
		__entry->id = ctx_id;
		__entry->timestamp = timestamp;
		__entry->ticks = ticks;
		__entry->prio = drawctxt->base.priority;
		__entry->prio = ctx_prio;
	),
	TP_printk(
		"ctx=%u ctx_prio=%d ts=%u ticks=%lld",
Loading