Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 24a65e62 authored by Mika Kuoppala, committed by Mika Kuoppala
Browse files

drm/i915/hangcheck: Prevent long walks across full-ppgtt



With full-ppgtt, it takes the GPU an eon to traverse the entire 256PiB
address space, causing a loop to be detected. Under the current scheme,
if ACTHD walks off the end of a batch buffer and into an empty
address space, we "never" detect the hang. If we always increment the
score as the ACTHD is progressing then we will eventually time out (after
~46.5s (31 * 1.5s) without advancing onto a new batch). To counteract
this, increase the amount we reduce the score for good batches, so that
only a series of almost-bad batches triggers a full reset. DoS detection
suffers slightly but a series of long-running shader tests will benefit.

Based on a patch from Chris Wilson.

Testcase: igt/drv_hangman/hangcheck-unterminated
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: http://patchwork.freedesktop.org/patch/msgid/1456930109-21532-1-git-send-email-mika.kuoppala@intel.com
parent d431440c
Loading
Loading
Loading
Loading
+0 −2
Original line number Original line Diff line number Diff line
@@ -1367,8 +1367,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
			   (long long)ring->hangcheck.acthd,
			   (long long)ring->hangcheck.acthd,
			   (long long)acthd[i]);
			   (long long)acthd[i]);
		seq_printf(m, "\tmax ACTHD = 0x%08llx\n",
			   (long long)ring->hangcheck.max_acthd);
		seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
		seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
		seq_printf(m, "\taction = %d\n", ring->hangcheck.action);
		seq_printf(m, "\taction = %d\n", ring->hangcheck.action);


+0 −2
Original line number Original line Diff line number Diff line
@@ -230,8 +230,6 @@ static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
		return "wait";
		return "wait";
	case HANGCHECK_ACTIVE:
	case HANGCHECK_ACTIVE:
		return "active";
		return "active";
	case HANGCHECK_ACTIVE_LOOP:
		return "active (loop)";
	case HANGCHECK_KICK:
	case HANGCHECK_KICK:
		return "kick";
		return "kick";
	case HANGCHECK_HUNG:
	case HANGCHECK_HUNG:
+7 −10
Original line number Original line Diff line number Diff line
@@ -3001,14 +3001,9 @@ head_stuck(struct intel_engine_cs *ring, u64 acthd)
		memset(ring->hangcheck.instdone, 0,
		memset(ring->hangcheck.instdone, 0,
		       sizeof(ring->hangcheck.instdone));
		       sizeof(ring->hangcheck.instdone));


		if (acthd > ring->hangcheck.max_acthd) {
			ring->hangcheck.max_acthd = acthd;
		return HANGCHECK_ACTIVE;
		return HANGCHECK_ACTIVE;
	}
	}


		return HANGCHECK_ACTIVE_LOOP;
	}

	if (!subunits_stuck(ring))
	if (!subunits_stuck(ring))
		return HANGCHECK_ACTIVE;
		return HANGCHECK_ACTIVE;


@@ -3083,6 +3078,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
#define BUSY 1
#define BUSY 1
#define KICK 5
#define KICK 5
#define HUNG 20
#define HUNG 20
#define ACTIVE_DECAY 15


	if (!i915.enable_hangcheck)
	if (!i915.enable_hangcheck)
		return;
		return;
@@ -3151,9 +3147,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
				switch (ring->hangcheck.action) {
				switch (ring->hangcheck.action) {
				case HANGCHECK_IDLE:
				case HANGCHECK_IDLE:
				case HANGCHECK_WAIT:
				case HANGCHECK_WAIT:
				case HANGCHECK_ACTIVE:
					break;
					break;
				case HANGCHECK_ACTIVE_LOOP:
				case HANGCHECK_ACTIVE:
					ring->hangcheck.score += BUSY;
					ring->hangcheck.score += BUSY;
					break;
					break;
				case HANGCHECK_KICK:
				case HANGCHECK_KICK:
@@ -3172,10 +3167,12 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
			 * attempts across multiple batches.
			 * attempts across multiple batches.
			 */
			 */
			if (ring->hangcheck.score > 0)
			if (ring->hangcheck.score > 0)
				ring->hangcheck.score--;
				ring->hangcheck.score -= ACTIVE_DECAY;
			if (ring->hangcheck.score < 0)
				ring->hangcheck.score = 0;


			/* Clear head and subunit states on seqno movement */
			/* Clear head and subunit states on seqno movement */
			ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
			ring->hangcheck.acthd = 0;


			memset(ring->hangcheck.instdone, 0,
			memset(ring->hangcheck.instdone, 0,
			       sizeof(ring->hangcheck.instdone));
			       sizeof(ring->hangcheck.instdone));
+0 −2
Original line number Original line Diff line number Diff line
@@ -79,7 +79,6 @@ enum intel_ring_hangcheck_action {
	HANGCHECK_IDLE = 0,
	HANGCHECK_IDLE = 0,
	HANGCHECK_WAIT,
	HANGCHECK_WAIT,
	HANGCHECK_ACTIVE,
	HANGCHECK_ACTIVE,
	HANGCHECK_ACTIVE_LOOP,
	HANGCHECK_KICK,
	HANGCHECK_KICK,
	HANGCHECK_HUNG,
	HANGCHECK_HUNG,
};
};
@@ -88,7 +87,6 @@ enum intel_ring_hangcheck_action {


struct intel_ring_hangcheck {
struct intel_ring_hangcheck {
	u64 acthd;
	u64 acthd;
	u64 max_acthd;
	u32 seqno;
	u32 seqno;
	int score;
	int score;
	enum intel_ring_hangcheck_action action;
	enum intel_ring_hangcheck_action action;