Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6274f212 authored by Chris Wilson's avatar Chris Wilson Committed by Daniel Vetter
Browse files

drm/i915: Don't count semaphore waits towards a stuck ring

If we detect a ring is in a valid wait for another, just let it be.
Eventually it will either begin to progress again, or the entire system
will come grinding to a halt and then hangcheck will fire as soon as the
deadlock is detected.

This error was foretold by Ben in
commit 05407ff8
Author: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Date:   Thu May 30 09:04:29 2013 +0300

    drm/i915: detect hang using per ring hangcheck_score

"If ring B is waiting on ring A via semaphore, and ring A is making
progress, albeit slowly - the hangcheck will fire. The check will
determine that A is moving, however ring B will appear hung because
the ACTHD doesn't move. I honestly can't say if that's actually a
realistic problem to hit it probably implies the timeout value is too
low."

v2: Make sure we don't even incur the KICK cost whilst waiting.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=65394


Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: default avatarMika Kuoppala <mika.kuoppala@intel.com>
Signed-off-by: default avatarDaniel Vetter <daniel.vetter@ffwll.ch>
parent 9107e9d2
Loading
Loading
Loading
Loading
+89 −32
Original line number Diff line number Diff line
@@ -2321,21 +2321,21 @@ ring_idle(struct intel_ring_buffer *ring, u32 seqno)
		i915_seqno_passed(seqno, ring_last_seqno(ring)));
}

static bool semaphore_passed(struct intel_ring_buffer *ring)
static struct intel_ring_buffer *
semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
{
	struct drm_i915_private *dev_priv = ring->dev->dev_private;
	u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
	struct intel_ring_buffer *signaller;
	u32 cmd, ipehr, acthd_min;
	u32 cmd, ipehr, acthd, acthd_min;

	ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
	if ((ipehr & ~(0x3 << 16)) !=
	    (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
		return false;
		return NULL;

	/* ACTHD is likely pointing to the dword after the actual command,
	 * so scan backwards until we find the MBOX.
	 */
	acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
	acthd_min = max((int)acthd - 3 * 4, 0);
	do {
		cmd = ioread32(ring->virtual_start + acthd);
@@ -2344,22 +2344,53 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)

		acthd -= 4;
		if (acthd < acthd_min)
			return false;
			return NULL;
	} while (1);

	signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
	return i915_seqno_passed(signaller->get_seqno(signaller, false),
				 ioread32(ring->virtual_start+acthd+4)+1);
	*seqno = ioread32(ring->virtual_start+acthd+4)+1;
	return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
}

static int semaphore_passed(struct intel_ring_buffer *ring)
{
	struct drm_i915_private *dev_priv = ring->dev->dev_private;
	struct intel_ring_buffer *signaller;
	u32 seqno, ctl;

	ring->hangcheck.deadlock = true;

	signaller = semaphore_waits_for(ring, &seqno);
	if (signaller == NULL || signaller->hangcheck.deadlock)
		return -1;

	/* cursory check for an unkickable deadlock */
	ctl = I915_READ_CTL(signaller);
	if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
		return -1;

	return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
}

static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
{
	struct intel_ring_buffer *ring;
	int i;

	for_each_ring(ring, dev_priv, i)
		ring->hangcheck.deadlock = false;
}

static bool ring_hung(struct intel_ring_buffer *ring)
static enum { wait, active, kick, hung } ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
{
	struct drm_device *dev = ring->dev;
	struct drm_i915_private *dev_priv = dev->dev_private;
	u32 tmp;

	if (ring->hangcheck.acthd != acthd)
		return active;

	if (IS_GEN2(dev))
		return true;
		return hung;

	/* Is the chip hanging on a WAIT_FOR_EVENT?
	 * If so we can simply poke the RB_WAIT bit
@@ -2371,19 +2402,24 @@ static bool ring_hung(struct intel_ring_buffer *ring)
		DRM_ERROR("Kicking stuck wait on %s\n",
			  ring->name);
		I915_WRITE_CTL(ring, tmp);
		return false;
		return kick;
	}

	if (INTEL_INFO(dev)->gen >= 6 &&
	    tmp & RING_WAIT_SEMAPHORE &&
	    semaphore_passed(ring)) {
	if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
		switch (semaphore_passed(ring)) {
		default:
			return hung;
		case 1:
			DRM_ERROR("Kicking stuck semaphore on %s\n",
				  ring->name);
			I915_WRITE_CTL(ring, tmp);
		return false;
			return kick;
		case 0:
			return wait;
		}
	}

	return true;
	return hung;
}

/**
@@ -2414,6 +2450,8 @@ void i915_hangcheck_elapsed(unsigned long data)
		u32 seqno, acthd;
		bool busy = true;

		semaphore_clear_deadlocks(dev_priv);

		seqno = ring->get_seqno(ring, false);
		acthd = intel_ring_get_active_head(ring);

@@ -2430,17 +2468,36 @@ void i915_hangcheck_elapsed(unsigned long data)
			} else {
				int score;

				stuck[i] = ring->hangcheck.acthd == acthd;
				if (stuck[i]) {
					/* Every time we kick the ring, add a
					 * small increment to the hangcheck
					 * score so that we can catch a
					 * batch that is repeatedly kicked.
				/* We always increment the hangcheck score
				 * if the ring is busy and still processing
				 * the same request, so that no single request
				 * can run indefinitely (such as a chain of
				 * batches). The only time we do not increment
				 * the hangcheck score on this ring, if this
				 * ring is in a legitimate wait for another
				 * ring. In that case the waiting ring is a
				 * victim and we want to be sure we catch the
				 * right culprit. Then every time we do kick
				 * the ring, add a small increment to the
				 * score so that we can catch a batch that is
				 * being repeatedly kicked and so responsible
				 * for stalling the machine.
				 */
					score = ring_hung(ring) ? HUNG : KICK;
				} else
				switch (ring_stuck(ring, acthd)) {
				case wait:
					score = 0;
					break;
				case active:
					score = BUSY;

					break;
				case kick:
					score = KICK;
					break;
				case hung:
					score = HUNG;
					stuck[i] = true;
					break;
				}
				ring->hangcheck.score += score;
			}
		} else {
+1 −0
Original line number Diff line number Diff line
@@ -38,6 +38,7 @@ struct intel_hw_status_page {
#define I915_READ_SYNC_1(ring) I915_READ(RING_SYNC_1((ring)->mmio_base))

struct intel_ring_hangcheck {
	bool deadlock;
	u32 seqno;
	u32 acthd;
	int score;