drm/vc4: Expose performance counters to userspace (65101d8c) · Commits · e / devices / android_kernel_oneplus_sm7250

drivers/gpu/drm/vc4/Makefile

+1 −0

Original line number	Diff line number	Diff line
		@@ -15,6 +15,7 @@ vc4-y := \
		vc4_vec.o \
		vc4_hvs.o \
		vc4_irq.o \
		vc4_perfmon.o \
		vc4_plane.o \
		vc4_render_cl.o \
		vc4_trace_points.o \

drivers/gpu/drm/vc4/vc4_drv.c

+26 −0

Original line number	Diff line number	Diff line
		@@ -101,6 +101,7 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
		case DRM_VC4_PARAM_SUPPORTS_THREADED_FS:
		case DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER:
		case DRM_VC4_PARAM_SUPPORTS_MADVISE:
		case DRM_VC4_PARAM_SUPPORTS_PERFMON:
		args->value = true;
		break;
		default:
		@@ -111,6 +112,26 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
		return 0;
		}

		static int vc4_open(struct drm_device dev, struct drm_file file)
		{
		struct vc4_file *vc4file;

		vc4file = kzalloc(sizeof(*vc4file), GFP_KERNEL);
		if (!vc4file)
		return -ENOMEM;

		vc4_perfmon_open_file(vc4file);
		file->driver_priv = vc4file;
		return 0;
		}

		static void vc4_close(struct drm_device dev, struct drm_file file)
		{
		struct vc4_file *vc4file = file->driver_priv;

		vc4_perfmon_close_file(vc4file);
		}

		static const struct vm_operations_struct vc4_vm_ops = {
		.fault = vc4_fault,
		.open = drm_gem_vm_open,
		@@ -143,6 +164,9 @@ static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
		DRM_IOCTL_DEF_DRV(VC4_GET_TILING, vc4_get_tiling_ioctl, DRM_RENDER_ALLOW),
		DRM_IOCTL_DEF_DRV(VC4_LABEL_BO, vc4_label_bo_ioctl, DRM_RENDER_ALLOW),
		DRM_IOCTL_DEF_DRV(VC4_GEM_MADVISE, vc4_gem_madvise_ioctl, DRM_RENDER_ALLOW),
		DRM_IOCTL_DEF_DRV(VC4_PERFMON_CREATE, vc4_perfmon_create_ioctl, DRM_RENDER_ALLOW),
		DRM_IOCTL_DEF_DRV(VC4_PERFMON_DESTROY, vc4_perfmon_destroy_ioctl, DRM_RENDER_ALLOW),
		DRM_IOCTL_DEF_DRV(VC4_PERFMON_GET_VALUES, vc4_perfmon_get_values_ioctl, DRM_RENDER_ALLOW),
		};

		static struct drm_driver vc4_drm_driver = {
		@@ -153,6 +177,8 @@ static struct drm_driver vc4_drm_driver = {
		DRIVER_RENDER |
		DRIVER_PRIME),
		.lastclose = drm_fb_helper_lastclose,
		.open = vc4_open,
		.postclose = vc4_close,
		.irq_handler = vc4_irq,
		.irq_preinstall = vc4_irq_preinstall,
		.irq_postinstall = vc4_irq_postinstall,

drivers/gpu/drm/vc4/vc4_drv.h

+68 −0

Original line number	Diff line number	Diff line
		@@ -11,6 +11,8 @@
		#include <drm/drm_encoder.h>
		#include <drm/drm_gem_cma_helper.h>

		#include "uapi/drm/vc4_drm.h"

		/* Don't forget to update vc4_bo.c: bo_type_names[] when adding to
		* this.
		*/
		@@ -29,6 +31,36 @@ enum vc4_kernel_bo_type {
		VC4_BO_TYPE_COUNT
		};

		/* Performance monitor object. The perfmon lifetime is controlled by
		 * userspace using perfmon related ioctls. A perfmon can be attached to a
		 * submit_cl request, and when this is the case, HW perf counters will be
		 * activated just before the submit_cl is submitted to the GPU and disabled
		 * when the job is done. This way, only events related to a specific job
		 * will be counted.
		 */
		struct vc4_perfmon {
			/* Tracks the number of users of the perfmon, when this counter
			 * reaches zero the perfmon is destroyed.
			 */
			refcount_t refcnt;

			/* Number of counters activated in this perfmon instance. Must not
			 * exceed DRM_VC4_MAX_PERF_COUNTERS, which is the number of slots
			 * available in events[].
			 */
			u8 ncounters;

			/* Events counted by the HW perf counters. */
			u8 events[DRM_VC4_MAX_PERF_COUNTERS];

			/* Storage for counter values. Counters are incremented by the HW
			 * perf counter values every time the perfmon is attached to a GPU
			 * job. This way, perfmon users don't have to retrieve the results
			 * after each job if they want to track events covering several
			 * submissions.
			 * Note that counter values can't be reset, but you can fake a reset
			 * by destroying the perfmon and creating a new one.
			 *
			 * Declared as a C99 flexible array member (rather than a
			 * zero-length array) so the trailing storage is sized by the
			 * allocation.
			 */
			u64 counters[];
		};

		struct vc4_dev {
		struct drm_device *dev;

		@@ -121,6 +153,11 @@ struct vc4_dev {
		wait_queue_head_t job_wait_queue;
		struct work_struct job_done_work;

		/* Used to track the active perfmon if any. Access to this field is
		* protected by job_lock.
		*/
		struct vc4_perfmon *active_perfmon;

		/* List of struct vc4_seqno_cb for callbacks to be made from a
		* workqueue when the given seqno is passed.
		*/
		@@ -406,6 +443,21 @@ struct vc4_exec_info {
		void *uniforms_v;
		uint32_t uniforms_p;
		uint32_t uniforms_size;

		/* Pointer to a performance monitor object if the user requested it,
		* NULL otherwise.
		*/
		struct vc4_perfmon *perfmon;
		};

		/* Per-open file private data. Any driver-specific resource that has to be
		* released when the DRM file is closed should be placed here.
		*/
		struct vc4_file {
		/* Perfmon state owned by this open file. */
		struct {
		/* NOTE(review): presumably maps perfmon ids to struct vc4_perfmon
		* objects, since vc4_perfmon_find() looks a perfmon up by id from
		* a vc4_file - confirm in vc4_perfmon.c.
		*/
		struct idr idr;
		/* NOTE(review): presumably serialises access to idr - confirm in
		* vc4_perfmon.c.
		*/
		struct mutex lock;
		} perfmon;
		};

		static inline struct vc4_exec_info *
		@@ -646,3 +698,19 @@ bool vc4_check_tex_size(struct vc4_exec_info *exec,
		/* vc4_validate_shader.c */
		struct vc4_validated_shader_info *
		vc4_validate_shader(struct drm_gem_cma_object *shader_obj);

		/* vc4_perfmon.c */

		/* Reference counting on a perfmon object (see vc4_perfmon.refcnt). */
		void vc4_perfmon_get(struct vc4_perfmon *perfmon);
		void vc4_perfmon_put(struct vc4_perfmon *perfmon);

		/* Start/stop HW counting for @perfmon on device @vc4. Callers pass
		 * capture=true when a render job finished and capture=false when a bin
		 * job is cancelled.
		 */
		void vc4_perfmon_start(struct vc4_dev *vc4, struct vc4_perfmon *perfmon);
		void vc4_perfmon_stop(struct vc4_dev *vc4, struct vc4_perfmon *perfmon,
				      bool capture);

		/* Look up the perfmon identified by @id in @vc4file; callers treat a NULL
		 * return as "not found".
		 */
		struct vc4_perfmon *vc4_perfmon_find(struct vc4_file *vc4file, int id);

		/* Per-open-file perfmon setup/teardown, called from the DRM open and
		 * postclose hooks.
		 */
		void vc4_perfmon_open_file(struct vc4_file *vc4file);
		void vc4_perfmon_close_file(struct vc4_file *vc4file);

		/* Perfmon ioctl handlers registered in vc4_drm_ioctls[]. */
		int vc4_perfmon_create_ioctl(struct drm_device *dev, void *data,
					     struct drm_file *file_priv);
		int vc4_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
					      struct drm_file *file_priv);
		int vc4_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
						 struct drm_file *file_priv);

drivers/gpu/drm/vc4/vc4_gem.c

+43 −5

Original line number	Diff line number	Diff line
		@@ -454,13 +454,29 @@ vc4_submit_next_bin_job(struct drm_device *dev)

		vc4_flush_caches(dev);

		/* Only start the perfmon if it was not already started by a previous
		* job.
		*/
		if (exec->perfmon && vc4->active_perfmon != exec->perfmon)
		vc4_perfmon_start(vc4, exec->perfmon);

		/* Either put the job in the binner if it uses the binner, or
		* immediately move it to the to-be-rendered queue.
		*/
		if (exec->ct0ca != exec->ct0ea) {
		submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
		} else {
		struct vc4_exec_info *next;

		vc4_move_job_to_render(dev, exec);
		next = vc4_first_bin_job(vc4);

		/* We can't start the next bin job if the previous job had a
		* different perfmon instance attached to it. The same goes
		* if one of them had a perfmon attached to it and the other
		* one doesn't.
		*/
		if (next && next->perfmon == exec->perfmon)
		goto again;
		}
		}
		@@ -621,6 +637,7 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
		struct ww_acquire_ctx *acquire_ctx)
		{
		struct vc4_dev *vc4 = to_vc4_dev(dev);
		struct vc4_exec_info *renderjob;
		uint64_t seqno;
		unsigned long irqflags;
		struct vc4_fence *fence;
		@@ -646,11 +663,14 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,

		list_add_tail(&exec->head, &vc4->bin_job_list);

		/* If no job was executing, kick ours off. Otherwise, it'll
		* get started when the previous job's flush done interrupt
		* occurs.
		/* If no bin job was executing and if the render job (if any) has the
		* same perfmon as our job attached to it (or if both jobs don't have
		* perfmon activated), then kick ours off. Otherwise, it'll get
		* started when the previous job's flush/render done interrupt occurs.
		*/
		if (vc4_first_bin_job(vc4) == exec) {
		renderjob = vc4_first_render_job(vc4);
		if (vc4_first_bin_job(vc4) == exec &&
		(!renderjob || renderjob->perfmon == exec->perfmon)) {
		vc4_submit_next_bin_job(dev);
		vc4_queue_hangcheck(dev);
		}
		@@ -915,6 +935,9 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
		vc4->bin_alloc_used &= ~exec->bin_slots;
		spin_unlock_irqrestore(&vc4->job_lock, irqflags);

		/* Release the reference we had on the perf monitor. */
		vc4_perfmon_put(exec->perfmon);

		mutex_lock(&vc4->power_lock);
		if (--vc4->power_refcount == 0) {
		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
		@@ -1067,6 +1090,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
		struct drm_file *file_priv)
		{
		struct vc4_dev *vc4 = to_vc4_dev(dev);
		struct vc4_file *vc4file = file_priv->driver_priv;
		struct drm_vc4_submit_cl *args = data;
		struct vc4_exec_info *exec;
		struct ww_acquire_ctx acquire_ctx;
		@@ -1080,6 +1104,11 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
		return -EINVAL;
		}

		if (args->pad2 != 0) {
		DRM_DEBUG("->pad2 must be set to zero\n");
		return -EINVAL;
		}

		exec = kcalloc(1, sizeof(*exec), GFP_KERNEL);
		if (!exec) {
		DRM_ERROR("malloc failure on exec struct\n");
		@@ -1105,6 +1134,15 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
		if (ret)
		goto fail;

		if (args->perfmonid) {
		exec->perfmon = vc4_perfmon_find(vc4file,
		args->perfmonid);
		if (!exec->perfmon) {
		ret = -ENOENT;
		goto fail;
		}
		}

		if (exec->args->bin_cl_size != 0) {
		ret = vc4_get_bcl(dev, exec);
		if (ret)

drivers/gpu/drm/vc4/vc4_irq.c

+37 −3

Original line number	Diff line number	Diff line
		@@ -104,12 +104,19 @@ static void
		vc4_irq_finish_bin_job(struct drm_device *dev)
		{
		struct vc4_dev *vc4 = to_vc4_dev(dev);
		struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
		struct vc4_exec_info *next, *exec = vc4_first_bin_job(vc4);

		if (!exec)
		return;

		vc4_move_job_to_render(dev, exec);
		next = vc4_first_bin_job(vc4);

		/* Only submit the next job in the bin list if it matches the perfmon
		* attached to the one that just finished (or if both jobs don't have
		* perfmon attached to them).
		*/
		if (next && next->perfmon == exec->perfmon)
		vc4_submit_next_bin_job(dev);
		}

		@@ -122,6 +129,10 @@ vc4_cancel_bin_job(struct drm_device *dev)
		if (!exec)
		return;

		/* Stop the perfmon so that the next bin job can be started. */
		if (exec->perfmon)
		vc4_perfmon_stop(vc4, exec->perfmon, false);

		list_move_tail(&exec->head, &vc4->bin_job_list);
		vc4_submit_next_bin_job(dev);
		}
		@@ -131,18 +142,41 @@ vc4_irq_finish_render_job(struct drm_device *dev)
		{
		struct vc4_dev *vc4 = to_vc4_dev(dev);
		struct vc4_exec_info *exec = vc4_first_render_job(vc4);
		struct vc4_exec_info *nextbin, *nextrender;

		if (!exec)
		return;

		vc4->finished_seqno++;
		list_move_tail(&exec->head, &vc4->job_done_list);

		nextbin = vc4_first_bin_job(vc4);
		nextrender = vc4_first_render_job(vc4);

		/* Only stop the perfmon if following jobs in the queue don't expect it
		* to be enabled.
		*/
		if (exec->perfmon && !nextrender &&
		(!nextbin || nextbin->perfmon != exec->perfmon))
		vc4_perfmon_stop(vc4, exec->perfmon, true);

		/* If there's a render job waiting, start it. If this is not the case
		* we may have to unblock the binner if it's been stalled because of
		* perfmon (this can be checked by comparing the perfmon attached to
		* the finished renderjob to the one attached to the next bin job: if
		* they don't match, this means the binner is stalled and should be
		* restarted).
		*/
		if (nextrender)
		vc4_submit_next_render_job(dev);
		else if (nextbin && nextbin->perfmon != exec->perfmon)
		vc4_submit_next_bin_job(dev);

		if (exec->fence) {
		dma_fence_signal_locked(exec->fence);
		dma_fence_put(exec->fence);
		exec->fence = NULL;
		}
		vc4_submit_next_render_job(dev);

		wake_up_all(&vc4->job_wait_queue);
		schedule_work(&vc4->job_done_work);