Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 65101d8c authored by Boris Brezillon's avatar Boris Brezillon Committed by Eric Anholt
Browse files

drm/vc4: Expose performance counters to userspace



The V3D engine has various hardware counters which might be interesting
to userspace performance analysis tools.

Expose new ioctls to create/destroy a performance monitor object and
query the counter values of this perfmance monitor.

Note that a perfomance monitor is given an ID that is only valid on the
file descriptor it has been allocated from. A performance monitor can be
attached to a CL submission and the driver will enable HW counters for
this request and update the performance monitor values at the end of the
job.

Signed-off-by: default avatarBoris Brezillon <boris.brezillon@free-electrons.com>
Reviewed-by: default avatarEric Anholt <eric@anholt.net>
Signed-off-by: default avatarEric Anholt <eric@anholt.net>
Link: https://patchwork.freedesktop.org/patch/msgid/20180112090926.12538-1-boris.brezillon@free-electrons.com
parent 9c950e46
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@ vc4-y := \
	vc4_vec.o \
	vc4_hvs.o \
	vc4_irq.o \
	vc4_perfmon.o \
	vc4_plane.o \
	vc4_render_cl.o \
	vc4_trace_points.o \
+26 −0
Original line number Diff line number Diff line
@@ -101,6 +101,7 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
	case DRM_VC4_PARAM_SUPPORTS_THREADED_FS:
	case DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER:
	case DRM_VC4_PARAM_SUPPORTS_MADVISE:
	case DRM_VC4_PARAM_SUPPORTS_PERFMON:
		args->value = true;
		break;
	default:
@@ -111,6 +112,26 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
	return 0;
}

static int vc4_open(struct drm_device *dev, struct drm_file *file)
{
	struct vc4_file *vc4file;

	vc4file = kzalloc(sizeof(*vc4file), GFP_KERNEL);
	if (!vc4file)
		return -ENOMEM;

	vc4_perfmon_open_file(vc4file);
	file->driver_priv = vc4file;
	return 0;
}

static void vc4_close(struct drm_device *dev, struct drm_file *file)
{
	struct vc4_file *vc4file = file->driver_priv;

	vc4_perfmon_close_file(vc4file);
}

static const struct vm_operations_struct vc4_vm_ops = {
	.fault = vc4_fault,
	.open = drm_gem_vm_open,
@@ -143,6 +164,9 @@ static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
	DRM_IOCTL_DEF_DRV(VC4_GET_TILING, vc4_get_tiling_ioctl, DRM_RENDER_ALLOW),
	DRM_IOCTL_DEF_DRV(VC4_LABEL_BO, vc4_label_bo_ioctl, DRM_RENDER_ALLOW),
	DRM_IOCTL_DEF_DRV(VC4_GEM_MADVISE, vc4_gem_madvise_ioctl, DRM_RENDER_ALLOW),
	DRM_IOCTL_DEF_DRV(VC4_PERFMON_CREATE, vc4_perfmon_create_ioctl, DRM_RENDER_ALLOW),
	DRM_IOCTL_DEF_DRV(VC4_PERFMON_DESTROY, vc4_perfmon_destroy_ioctl, DRM_RENDER_ALLOW),
	DRM_IOCTL_DEF_DRV(VC4_PERFMON_GET_VALUES, vc4_perfmon_get_values_ioctl, DRM_RENDER_ALLOW),
};

static struct drm_driver vc4_drm_driver = {
@@ -153,6 +177,8 @@ static struct drm_driver vc4_drm_driver = {
			    DRIVER_RENDER |
			    DRIVER_PRIME),
	.lastclose = drm_fb_helper_lastclose,
	.open = vc4_open,
	.postclose = vc4_close,
	.irq_handler = vc4_irq,
	.irq_preinstall = vc4_irq_preinstall,
	.irq_postinstall = vc4_irq_postinstall,
+68 −0
Original line number Diff line number Diff line
@@ -11,6 +11,8 @@
#include <drm/drm_encoder.h>
#include <drm/drm_gem_cma_helper.h>

#include "uapi/drm/vc4_drm.h"

/* Don't forget to update vc4_bo.c: bo_type_names[] when adding to
 * this.
 */
@@ -29,6 +31,36 @@ enum vc4_kernel_bo_type {
	VC4_BO_TYPE_COUNT
};

/* Performance monitor object. The perform lifetime is controlled by userspace
 * using perfmon related ioctls. A perfmon can be attached to a submit_cl
 * request, and when this is the case, HW perf counters will be activated just
 * before the submit_cl is submitted to the GPU and disabled when the job is
 * done. This way, only events related to a specific job will be counted.
 */
struct vc4_perfmon {
	/* Tracks the number of users of the perfmon, when this counter reaches
	 * zero the perfmon is destroyed.
	 */
	refcount_t refcnt;

	/* Number of counters activated in this perfmon instance
	 * (should be less than DRM_VC4_MAX_PERF_COUNTERS).
	 */
	u8 ncounters;

	/* Events counted by the HW perf counters. */
	u8 events[DRM_VC4_MAX_PERF_COUNTERS];

	/* Storage for counter values. Counters are incremented by the HW
	 * perf counter values every time the perfmon is attached to a GPU job.
	 * This way, perfmon users don't have to retrieve the results after
	 * each job if they want to track events covering several submissions.
	 * Note that counter values can't be reset, but you can fake a reset by
	 * destroying the perfmon and creating a new one.
	 */
	u64 counters[0];
};

struct vc4_dev {
	struct drm_device *dev;

@@ -121,6 +153,11 @@ struct vc4_dev {
	wait_queue_head_t job_wait_queue;
	struct work_struct job_done_work;

	/* Used to track the active perfmon if any. Access to this field is
	 * protected by job_lock.
	 */
	struct vc4_perfmon *active_perfmon;

	/* List of struct vc4_seqno_cb for callbacks to be made from a
	 * workqueue when the given seqno is passed.
	 */
@@ -406,6 +443,21 @@ struct vc4_exec_info {
	void *uniforms_v;
	uint32_t uniforms_p;
	uint32_t uniforms_size;

	/* Pointer to a performance monitor object if the user requested it,
	 * NULL otherwise.
	 */
	struct vc4_perfmon *perfmon;
};

/* Per-open file private data. Any driver-specific resource that has to be
 * released when the DRM file is closed should be placed here.
 */
struct vc4_file {
	struct {
		struct idr idr;
		struct mutex lock;
	} perfmon;
};

static inline struct vc4_exec_info *
@@ -646,3 +698,19 @@ bool vc4_check_tex_size(struct vc4_exec_info *exec,
/* vc4_validate_shader.c */
struct vc4_validated_shader_info *
vc4_validate_shader(struct drm_gem_cma_object *shader_obj);

/* vc4_perfmon.c */
void vc4_perfmon_get(struct vc4_perfmon *perfmon);
void vc4_perfmon_put(struct vc4_perfmon *perfmon);
void vc4_perfmon_start(struct vc4_dev *vc4, struct vc4_perfmon *perfmon);
void vc4_perfmon_stop(struct vc4_dev *vc4, struct vc4_perfmon *perfmon,
		      bool capture);
struct vc4_perfmon *vc4_perfmon_find(struct vc4_file *vc4file, int id);
void vc4_perfmon_open_file(struct vc4_file *vc4file);
void vc4_perfmon_close_file(struct vc4_file *vc4file);
int vc4_perfmon_create_ioctl(struct drm_device *dev, void *data,
			     struct drm_file *file_priv);
int vc4_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
			      struct drm_file *file_priv);
int vc4_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
				 struct drm_file *file_priv);
+43 −5
Original line number Diff line number Diff line
@@ -454,13 +454,29 @@ vc4_submit_next_bin_job(struct drm_device *dev)

	vc4_flush_caches(dev);

	/* Only start the perfmon if it was not already started by a previous
	 * job.
	 */
	if (exec->perfmon && vc4->active_perfmon != exec->perfmon)
		vc4_perfmon_start(vc4, exec->perfmon);

	/* Either put the job in the binner if it uses the binner, or
	 * immediately move it to the to-be-rendered queue.
	 */
	if (exec->ct0ca != exec->ct0ea) {
		submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
	} else {
		struct vc4_exec_info *next;

		vc4_move_job_to_render(dev, exec);
		next = vc4_first_bin_job(vc4);

		/* We can't start the next bin job if the previous job had a
		 * different perfmon instance attached to it. The same goes
		 * if one of them had a perfmon attached to it and the other
		 * one doesn't.
		 */
		if (next && next->perfmon == exec->perfmon)
			goto again;
	}
}
@@ -621,6 +637,7 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
		 struct ww_acquire_ctx *acquire_ctx)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	struct vc4_exec_info *renderjob;
	uint64_t seqno;
	unsigned long irqflags;
	struct vc4_fence *fence;
@@ -646,11 +663,14 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,

	list_add_tail(&exec->head, &vc4->bin_job_list);

	/* If no job was executing, kick ours off.  Otherwise, it'll
	 * get started when the previous job's flush done interrupt
	 * occurs.
	/* If no bin job was executing and if the render job (if any) has the
	 * same perfmon as our job attached to it (or if both jobs don't have
	 * perfmon activated), then kick ours off.  Otherwise, it'll get
	 * started when the previous job's flush/render done interrupt occurs.
	 */
	if (vc4_first_bin_job(vc4) == exec) {
	renderjob = vc4_first_render_job(vc4);
	if (vc4_first_bin_job(vc4) == exec &&
	    (!renderjob || renderjob->perfmon == exec->perfmon)) {
		vc4_submit_next_bin_job(dev);
		vc4_queue_hangcheck(dev);
	}
@@ -915,6 +935,9 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
	vc4->bin_alloc_used &= ~exec->bin_slots;
	spin_unlock_irqrestore(&vc4->job_lock, irqflags);

	/* Release the reference we had on the perf monitor. */
	vc4_perfmon_put(exec->perfmon);

	mutex_lock(&vc4->power_lock);
	if (--vc4->power_refcount == 0) {
		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
@@ -1067,6 +1090,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
		    struct drm_file *file_priv)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	struct vc4_file *vc4file = file_priv->driver_priv;
	struct drm_vc4_submit_cl *args = data;
	struct vc4_exec_info *exec;
	struct ww_acquire_ctx acquire_ctx;
@@ -1080,6 +1104,11 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
		return -EINVAL;
	}

	if (args->pad2 != 0) {
		DRM_DEBUG("->pad2 must be set to zero\n");
		return -EINVAL;
	}

	exec = kcalloc(1, sizeof(*exec), GFP_KERNEL);
	if (!exec) {
		DRM_ERROR("malloc failure on exec struct\n");
@@ -1105,6 +1134,15 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
	if (ret)
		goto fail;

	if (args->perfmonid) {
		exec->perfmon = vc4_perfmon_find(vc4file,
						 args->perfmonid);
		if (!exec->perfmon) {
			ret = -ENOENT;
			goto fail;
		}
	}

	if (exec->args->bin_cl_size != 0) {
		ret = vc4_get_bcl(dev, exec);
		if (ret)
+37 −3
Original line number Diff line number Diff line
@@ -104,12 +104,19 @@ static void
vc4_irq_finish_bin_job(struct drm_device *dev)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
	struct vc4_exec_info *next, *exec = vc4_first_bin_job(vc4);

	if (!exec)
		return;

	vc4_move_job_to_render(dev, exec);
	next = vc4_first_bin_job(vc4);

	/* Only submit the next job in the bin list if it matches the perfmon
	 * attached to the one that just finished (or if both jobs don't have
	 * perfmon attached to them).
	 */
	if (next && next->perfmon == exec->perfmon)
		vc4_submit_next_bin_job(dev);
}

@@ -122,6 +129,10 @@ vc4_cancel_bin_job(struct drm_device *dev)
	if (!exec)
		return;

	/* Stop the perfmon so that the next bin job can be started. */
	if (exec->perfmon)
		vc4_perfmon_stop(vc4, exec->perfmon, false);

	list_move_tail(&exec->head, &vc4->bin_job_list);
	vc4_submit_next_bin_job(dev);
}
@@ -131,18 +142,41 @@ vc4_irq_finish_render_job(struct drm_device *dev)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	struct vc4_exec_info *exec = vc4_first_render_job(vc4);
	struct vc4_exec_info *nextbin, *nextrender;

	if (!exec)
		return;

	vc4->finished_seqno++;
	list_move_tail(&exec->head, &vc4->job_done_list);

	nextbin = vc4_first_bin_job(vc4);
	nextrender = vc4_first_render_job(vc4);

	/* Only stop the perfmon if following jobs in the queue don't expect it
	 * to be enabled.
	 */
	if (exec->perfmon && !nextrender &&
	    (!nextbin || nextbin->perfmon != exec->perfmon))
		vc4_perfmon_stop(vc4, exec->perfmon, true);

	/* If there's a render job waiting, start it. If this is not the case
	 * we may have to unblock the binner if it's been stalled because of
	 * perfmon (this can be checked by comparing the perfmon attached to
	 * the finished renderjob to the one attached to the next bin job: if
	 * they don't match, this means the binner is stalled and should be
	 * restarted).
	 */
	if (nextrender)
		vc4_submit_next_render_job(dev);
	else if (nextbin && nextbin->perfmon != exec->perfmon)
		vc4_submit_next_bin_job(dev);

	if (exec->fence) {
		dma_fence_signal_locked(exec->fence);
		dma_fence_put(exec->fence);
		exec->fence = NULL;
	}
	vc4_submit_next_render_job(dev);

	wake_up_all(&vc4->job_wait_queue);
	schedule_work(&vc4->job_done_work);
Loading