Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 4fa6053e authored by Chris Wilson
Browse files

drm/i915: Record more information about the hanging contexts



Include extra information such as the user_handle and hw_id so that
userspace can identify which of their contexts hung, useful if they are
performing self-diagnostics.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20170129092433.10483-1-chris@chris-wilson.co.uk


Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
parent 0102ba1f
Loading
Loading
Loading
Loading
+10 −4
Original line number Diff line number Diff line
@@ -969,6 +969,16 @@ struct drm_i915_error_state {
		u32 semaphore_mboxes[I915_NUM_ENGINES - 1];
		struct intel_instdone instdone;

		/*
		 * Snapshot of the context behind this engine's active request,
		 * filled in by record_context(). The fields are only written
		 * when an active request was found for the engine.
		 */
		struct drm_i915_error_context {
			/* Copy of task->comm for the process owning the context. */
			char comm[TASK_COMM_LEN];
			/*
			 * task->pid of the owning process. Left untouched when
			 * no task could be resolved; the printers skip the
			 * process details when this is 0.
			 */
			pid_t pid;
			/* Copied from ctx->user_handle. */
			u32 handle;
			/* Copied from ctx->hw_id. */
			u32 hw_id;
			/* Copied from ctx->ban_score. */
			int ban_score;
			/* Copied from ctx->active_count. */
			int active;
			/* Copied from ctx->guilty_count. */
			int guilty;
		} context;

		struct drm_i915_error_object {
			u64 gtt_offset;
			u64 gtt_size;
@@ -1002,10 +1012,6 @@ struct drm_i915_error_state {
				u32 pp_dir_base;
			};
		} vm_info;

		pid_t pid;
		char comm[TASK_COMM_LEN];
		int context_bans;
	} engine[I915_NUM_ENGINES];

	struct drm_i915_error_buffer {
+49 −28
Original line number Diff line number Diff line
@@ -384,6 +384,15 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
		   erq->head, erq->tail);
}

static void error_print_context(struct drm_i915_error_state_buf *m,
				const char *header,
				struct drm_i915_error_context *ctx)
{
	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n",
		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
		   ctx->ban_score, ctx->guilty, ctx->active);
}

static void error_print_engine(struct drm_i915_error_state_buf *m,
			       struct drm_i915_error_engine *ee)
{
@@ -457,6 +466,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,

	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
	error_print_context(m, "  Active context: ", &ee->context);
}

void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
@@ -562,12 +572,12 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		if (error->engine[i].hangcheck_stalled &&
		    error->engine[i].pid != -1) {
			err_printf(m, "Active process (on ring %s): %s [%d], context bans %d\n",
		    error->engine[i].context.pid) {
			err_printf(m, "Active process (on ring %s): %s [%d], score %d\n",
				   engine_str(i),
				   error->engine[i].comm,
				   error->engine[i].pid,
				   error->engine[i].context_bans);
				   error->engine[i].context.comm,
				   error->engine[i].context.pid,
				   error->engine[i].context.ban_score);
		}
	}
	err_printf(m, "Reset count: %u\n", error->reset_count);
@@ -658,11 +668,13 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
		obj = ee->batchbuffer;
		if (obj) {
			err_puts(m, dev_priv->engine[i]->name);
			if (ee->pid != -1)
				err_printf(m, " (submitted by %s [%d], bans %d)",
					   ee->comm,
					   ee->pid,
					   ee->context_bans);
			if (ee->context.pid)
				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)",
					   ee->context.comm,
					   ee->context.pid,
					   ee->context.handle,
					   ee->context.hw_id,
					   ee->context.ban_score);
			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
				   upper_32_bits(obj->gtt_offset),
				   lower_32_bits(obj->gtt_offset));
@@ -1267,6 +1279,28 @@ static void error_record_engine_execlists(struct intel_engine_cs *engine,
				       &ee->execlist[n]);
}

/*
 * Copy the identifying details of a GEM context into an error-state record.
 *
 * @e:   destination record inside the captured error state
 * @ctx: context to snapshot
 *
 * If @ctx has an associated pid and that pid still resolves to a task, the
 * task's comm and pid are recorded. Otherwise e->comm and e->pid are left
 * exactly as the caller provided them. The remaining fields (handle, hw_id,
 * ban_score, guilty, active) are always copied from @ctx.
 */
static void record_context(struct drm_i915_error_context *e,
			   struct i915_gem_context *ctx)
{
	if (ctx->pid) {
		struct task_struct *task;

		/* The task pointer is only dereferenced under the RCU read lock. */
		rcu_read_lock();
		task = pid_task(ctx->pid, PIDTYPE_PID);
		if (task) {
			/*
			 * Bounded copy: never writes past e->comm and always
			 * NUL-terminates. Truncation is acceptable here, so
			 * the return value is deliberately ignored.
			 */
			strscpy(e->comm, task->comm, sizeof(e->comm));
			e->pid = task->pid;
		}
		rcu_read_unlock();
	}

	e->handle = ctx->user_handle;
	e->hw_id = ctx->hw_id;
	e->ban_score = ctx->ban_score;
	e->guilty = ctx->guilty_count;
	e->active = ctx->active_count;
}

static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
				  struct drm_i915_error_state *error)
{
@@ -1281,7 +1315,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
		struct drm_i915_error_engine *ee = &error->engine[i];
		struct drm_i915_gem_request *request;

		ee->pid = -1;
		ee->engine_id = -1;

		if (!engine)
@@ -1296,11 +1329,12 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
		request = i915_gem_find_active_request(engine);
		if (request) {
			struct intel_ring *ring;
			struct pid *pid;

			ee->vm = request->ctx->ppgtt ?
				&request->ctx->ppgtt->base : &ggtt->base;

			record_context(&ee->context, request->ctx);

			/* We need to copy these to an anonymous buffer
			 * as the simplest method to avoid being overwritten
			 * by userspace.
@@ -1318,19 +1352,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
				i915_error_object_create(dev_priv,
							 request->ctx->engine[i].state);

			pid = request->ctx->pid;
			if (pid) {
				struct task_struct *task;

				rcu_read_lock();
				task = pid_task(pid, PIDTYPE_PID);
				if (task) {
					strcpy(ee->comm, task->comm);
					ee->pid = task->pid;
				}
				rcu_read_unlock();
			}

			error->simulated |=
				i915_gem_context_no_error_capture(request->ctx);

@@ -1534,12 +1555,12 @@ static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
			"GPU HANG: ecode %d:%d:0x%08x",
			INTEL_GEN(dev_priv), engine_id, ecode);

	if (engine_id != -1 && error->engine[engine_id].pid != -1)
	if (engine_id != -1 && error->engine[engine_id].context.pid)
		len += scnprintf(error->error_msg + len,
				 sizeof(error->error_msg) - len,
				 ", in %s [%d]",
				 error->engine[engine_id].comm,
				 error->engine[engine_id].pid);
				 error->engine[engine_id].context.comm,
				 error->engine[engine_id].context.pid);

	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
		  ", reason: %s, action: %s",