Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 12471ba8 authored by Chris Wilson
Browse files

drm/i915: Harden detection of missed interrupts



Only declare a missed interrupt if we find that the GPU is idle with
waiters and a hangcheck interval has passed in which no new user
interrupts have been raised.

v2: Clear the stuck interrupt marker between successful batches

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1460195877-20520-3-git-send-email-chris@chris-wilson.co.uk
parent c04e0f3b
Loading
Loading
Loading
Loading
+7 −4
Original line number Diff line number Diff line
@@ -728,10 +728,10 @@ static int i915_gem_request_info(struct seq_file *m, void *data)
/*
 * Print one engine's sequence and interrupt counters into a seq_file.
 *
 * @m:      seq_file receiving the formatted output
 * @engine: engine whose counters are reported
 *
 * Emits the current sequence number (only when the engine provides a
 * get_seqno hook) followed by the current user-interrupt count, both in hex.
 */
static void i915_ring_seqno_info(struct seq_file *m,
				 struct intel_engine_cs *engine)
{
	unsigned int irq_count;

	/* Skip the seqno line when the engine has no get_seqno hook. */
	if (engine->get_seqno)
		seq_printf(m, "Current sequence (%s): %x\n",
			   engine->name, engine->get_seqno(engine));

	/* Take a single snapshot of the counter for reporting. */
	irq_count = READ_ONCE(engine->user_interrupts);
	seq_printf(m, "Current user interrupts (%s): %x\n",
		   engine->name, irq_count);
}

static int i915_gem_seqno_info(struct seq_file *m, void *data)
@@ -1367,6 +1367,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
			   engine->hangcheck.seqno,
			   seqno[id],
			   engine->last_submitted_seqno);
		seq_printf(m, "\tuser interrupts = %x [current %x]\n",
			   engine->hangcheck.user_interrupts,
			   READ_ONCE(engine->user_interrupts));
		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
			   (long long)engine->hangcheck.acthd,
			   (long long)acthd[id]);
+26 −12
Original line number Diff line number Diff line
@@ -1000,6 +1000,7 @@ static void notify_ring(struct intel_engine_cs *engine)
		return;

	trace_i915_gem_request_notify(engine);
	engine->user_interrupts++;

	wake_up_all(&engine->irq_queue);
}
@@ -3054,6 +3055,24 @@ ring_stuck(struct intel_engine_cs *engine, u64 acthd)
	return HANGCHECK_HUNG;
}

/*
 * Wake the engine's waiters if no user interrupt arrived since the last
 * hangcheck sample.
 *
 * @engine: engine to examine
 *
 * Samples engine->user_interrupts once. If that sample equals the value
 * stored in engine->hangcheck.user_interrupts and the engine's bit in
 * gpu_error.missed_irq_rings was not already set, the bit is set, a message
 * is logged and every waiter on engine->irq_queue is woken.
 *
 * Returns the sampled user-interrupt count.
 */
static unsigned kick_waiters(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = to_i915(engine->dev);
	unsigned irq_count = READ_ONCE(engine->user_interrupts);

	/* The counter moved since the previous sample: nothing to do. */
	if (engine->hangcheck.user_interrupts != irq_count)
		return irq_count;

	/* Bit already set for this engine: do not log or wake again. */
	if (test_and_set_bit(engine->id, &dev_priv->gpu_error.missed_irq_rings))
		return irq_count;

	/* Engines flagged in test_irq_rings get the "fake" info message. */
	if (dev_priv->gpu_error.test_irq_rings & intel_engine_flag(engine))
		DRM_INFO("Fake missed irq on %s\n",
			 engine->name);
	else
		DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
			  engine->name);

	wake_up_all(&engine->irq_queue);

	return irq_count;
}
/*
 * This is called when the chip hasn't reported back with completed
 * batchbuffers in a long time. We keep track per ring seqno progress and
@@ -3096,6 +3115,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
	for_each_engine_id(engine, dev_priv, id) {
		u64 acthd;
		u32 seqno;
		unsigned user_interrupts;
		bool busy = true;

		semaphore_clear_deadlocks(dev_priv);
@@ -3113,22 +3133,15 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
		acthd = intel_ring_get_active_head(engine);
		seqno = engine->get_seqno(engine);

		/* Reset stuck interrupts between batch advances */
		user_interrupts = 0;

		if (engine->hangcheck.seqno == seqno) {
			if (ring_idle(engine, seqno)) {
				engine->hangcheck.action = HANGCHECK_IDLE;

				if (waitqueue_active(&engine->irq_queue)) {
					/* Issue a wake-up to catch stuck h/w. */
					if (!test_and_set_bit(engine->id, &dev_priv->gpu_error.missed_irq_rings)) {
						if (!(dev_priv->gpu_error.test_irq_rings & intel_engine_flag(engine)))
							DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
								  engine->name);
						else
							DRM_INFO("Fake missed irq on %s\n",
								 engine->name);
						wake_up_all(&engine->irq_queue);
					}
					/* Safeguard against driver failure */
					user_interrupts = kick_waiters(engine);
					engine->hangcheck.score += BUSY;
				} else
					busy = false;
@@ -3179,7 +3192,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
				engine->hangcheck.score = 0;

			/* Clear head and subunit states on seqno movement */
			engine->hangcheck.acthd = 0;
			acthd = 0;

			memset(engine->hangcheck.instdone, 0,
			       sizeof(engine->hangcheck.instdone));
@@ -3187,6 +3200,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)

		engine->hangcheck.seqno = seqno;
		engine->hangcheck.acthd = acthd;
		engine->hangcheck.user_interrupts = user_interrupts;
		busy_count += busy;
	}

+2 −0
Original line number Diff line number Diff line
@@ -87,6 +87,7 @@ enum intel_ring_hangcheck_action {
struct intel_ring_hangcheck {
	u64 acthd;
	u32 seqno;
	unsigned user_interrupts;
	int score;
	enum intel_ring_hangcheck_action action;
	int deadlock;
@@ -305,6 +306,7 @@ struct intel_engine_cs {
	 * inspecting request list.
	 */
	u32 last_submitted_seqno;
	unsigned user_interrupts;

	bool gpu_caches_dirty;