Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6e16d028 authored by Mika Kuoppala's avatar Mika Kuoppala Committed by Mika Kuoppala
Browse files

drm/i915: Split up hangcheck phases



In order to simplify hangcheck state keeping, split the per-engine
hangcheck loop into three phases: state load, action, and state save.

Add a few more hangcheck actions to distinguish between seqno, head
and subunit movement. This helps to gather all the hangcheck
actions under a single switch statement.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
parent b2251c08
Loading
Loading
Loading
Loading
+6 −2
Original line number Diff line number Diff line
@@ -323,8 +323,12 @@ static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
		return "idle";
	case HANGCHECK_WAIT:
		return "wait";
	case HANGCHECK_ACTIVE:
		return "active";
	case HANGCHECK_ACTIVE_SEQNO:
		return "active seqno";
	case HANGCHECK_ACTIVE_HEAD:
		return "active head";
	case HANGCHECK_ACTIVE_SUBUNITS:
		return "active subunits";
	case HANGCHECK_KICK:
		return "kick";
	case HANGCHECK_HUNG:
+137 −104
Original line number Diff line number Diff line
@@ -236,11 +236,11 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd)
		memset(&engine->hangcheck.instdone, 0,
		       sizeof(engine->hangcheck.instdone));

		return HANGCHECK_ACTIVE;
		return HANGCHECK_ACTIVE_HEAD;
	}

	if (!subunits_stuck(engine))
		return HANGCHECK_ACTIVE;
		return HANGCHECK_ACTIVE_SUBUNITS;

	return HANGCHECK_HUNG;
}
@@ -291,48 +291,9 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
	return HANGCHECK_HUNG;
}

/*
 * This is called when the chip hasn't reported back with completed
 * batchbuffers in a long time. We keep track of per-ring seqno progress and,
 * if there is no progress, the hangcheck score for that ring is increased.
 * Further, acthd is inspected to see if the ring is stuck. If it is stuck,
 * we kick the ring. If we see no progress on three subsequent calls,
 * we assume the chip is wedged and try to fix it by resetting the chip.
 */
static void i915_hangcheck_elapsed(struct work_struct *work)
static void hangcheck_load_sample(struct intel_engine_cs *engine,
				  struct intel_engine_hangcheck *hc)
{
	struct drm_i915_private *dev_priv =
		container_of(work, typeof(*dev_priv),
			     gpu_error.hangcheck_work.work);
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned int hung = 0, stuck = 0;
	int busy_count = 0;
#define BUSY 1
#define KICK 5
#define HUNG 20
#define ACTIVE_DECAY 15

	if (!i915.enable_hangcheck)
		return;

	if (!READ_ONCE(dev_priv->gt.awake))
		return;

	/* As enabling the GPU requires fairly extensive mmio access,
	 * periodically arm the mmio checker to see if we are triggering
	 * any invalid access.
	 */
	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);

	for_each_engine(engine, dev_priv, id) {
		bool busy = intel_engine_has_waiter(engine);
		u64 acthd;
		u32 seqno;
		u32 submit;

		semaphore_clear_deadlocks(dev_priv);

	/* We don't strictly need an irq-barrier here, as we are not
	 * serving an interrupt request, be paranoid in case the
	 * barrier has side-effects (such as preventing a broken
@@ -343,14 +304,45 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
	if (engine->irq_seqno_barrier)
		engine->irq_seqno_barrier(engine);

		acthd = intel_engine_get_active_head(engine);
		seqno = intel_engine_get_seqno(engine);
		submit = intel_engine_last_submit(engine);
	hc->acthd = intel_engine_get_active_head(engine);
	hc->seqno = intel_engine_get_seqno(engine);
	hc->score = engine->hangcheck.score;
}

		if (engine->hangcheck.seqno == seqno) {
			if (i915_seqno_passed(seqno, submit)) {
				engine->hangcheck.action = HANGCHECK_IDLE;
			} else {
static void hangcheck_store_sample(struct intel_engine_cs *engine,
				   const struct intel_engine_hangcheck *hc)
{
	engine->hangcheck.acthd = hc->acthd;
	engine->hangcheck.seqno = hc->seqno;
	engine->hangcheck.score = hc->score;
	engine->hangcheck.action = hc->action;
}

static enum intel_engine_hangcheck_action
hangcheck_get_action(struct intel_engine_cs *engine,
		     const struct intel_engine_hangcheck *hc)
{
	if (engine->hangcheck.seqno != hc->seqno)
		return HANGCHECK_ACTIVE_SEQNO;

	if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
		return HANGCHECK_IDLE;

	return engine_stuck(engine, hc->acthd);
}

static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
					struct intel_engine_hangcheck *hc)
{
	hc->action = hangcheck_get_action(engine, hc);

	switch (hc->action) {
	case HANGCHECK_IDLE:
	case HANGCHECK_WAIT:
		break;

	case HANGCHECK_ACTIVE_HEAD:
	case HANGCHECK_ACTIVE_SUBUNITS:
		/* We always increment the hangcheck score
		 * if the engine is busy and still processing
		 * the same request, so that no single request
@@ -366,54 +358,43 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
		 * being repeatedly kicked and so responsible
		 * for stalling the machine.
		 */
				engine->hangcheck.action =
					engine_stuck(engine, acthd);

				switch (engine->hangcheck.action) {
				case HANGCHECK_IDLE:
				case HANGCHECK_WAIT:
					break;
				case HANGCHECK_ACTIVE:
					engine->hangcheck.score += BUSY;
		hc->score += 1;
		break;

	case HANGCHECK_KICK:
					engine->hangcheck.score += KICK;
		hc->score += 5;
		break;

	case HANGCHECK_HUNG:
					engine->hangcheck.score += HUNG;
		hc->score += 20;
		break;
				}
			}

			if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
				hung |= intel_engine_flag(engine);
				if (engine->hangcheck.action != HANGCHECK_HUNG)
					stuck |= intel_engine_flag(engine);
			}
		} else {
			engine->hangcheck.action = HANGCHECK_ACTIVE;

	case HANGCHECK_ACTIVE_SEQNO:
		/* Gradually reduce the count so that we catch DoS
		 * attempts across multiple batches.
		 */
			if (engine->hangcheck.score > 0)
				engine->hangcheck.score -= ACTIVE_DECAY;
			if (engine->hangcheck.score < 0)
				engine->hangcheck.score = 0;
		if (hc->score > 0)
			hc->score -= 15;
		if (hc->score < 0)
			hc->score = 0;

		/* Clear head and subunit states on seqno movement */
			acthd = 0;
		hc->acthd = 0;

		memset(&engine->hangcheck.instdone, 0,
		       sizeof(engine->hangcheck.instdone));
		}
		break;

		engine->hangcheck.seqno = seqno;
		engine->hangcheck.acthd = acthd;
		busy_count += busy;
	default:
		MISSING_CASE(hc->action);
	}
}

	if (hung) {
static void hangcheck_declare_hang(struct drm_i915_private *i915,
				   unsigned int hung,
				   unsigned int stuck)
{
	struct intel_engine_cs *engine;
	char msg[80];
	unsigned int tmp;
	int len;
@@ -425,14 +406,66 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
		hung &= ~stuck;
	len = scnprintf(msg, sizeof(msg),
			"%s on ", stuck == hung ? "No progress" : "Hang");
		for_each_engine_masked(engine, dev_priv, hung, tmp)
	for_each_engine_masked(engine, i915, hung, tmp)
		len += scnprintf(msg + len, sizeof(msg) - len,
				 "%s, ", engine->name);
	msg[len-2] = '\0';

		return i915_handle_error(dev_priv, hung, msg);
	return i915_handle_error(i915, hung, msg);
}

/*
 * This is called when the chip hasn't reported back with completed
 * batchbuffers in a long time. We keep track of per-ring seqno progress and,
 * if there is no progress, the hangcheck score for that ring is increased.
 * Further, acthd is inspected to see if the ring is stuck. If it is stuck,
 * we kick the ring. If we see no progress on three subsequent calls,
 * we assume the chip is wedged and try to fix it by resetting the chip.
 */
static void i915_hangcheck_elapsed(struct work_struct *work)
{
	struct drm_i915_private *dev_priv =
		container_of(work, typeof(*dev_priv),
			     gpu_error.hangcheck_work.work);
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned int hung = 0, stuck = 0;
	int busy_count = 0;

	if (!i915.enable_hangcheck)
		return;

	if (!READ_ONCE(dev_priv->gt.awake))
		return;

	/* As enabling the GPU requires fairly extensive mmio access,
	 * periodically arm the mmio checker to see if we are triggering
	 * any invalid access.
	 */
	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);

	for_each_engine(engine, dev_priv, id) {
		struct intel_engine_hangcheck cur_state, *hc = &cur_state;
		const bool busy = intel_engine_has_waiter(engine);

		semaphore_clear_deadlocks(dev_priv);

		hangcheck_load_sample(engine, hc);
		hangcheck_accumulate_sample(engine, hc);
		hangcheck_store_sample(engine, hc);

		if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
			hung |= intel_engine_flag(engine);
			if (hc->action != HANGCHECK_HUNG)
				stuck |= intel_engine_flag(engine);
		}

		busy_count += busy;
	}

	if (hung)
		hangcheck_declare_hang(dev_priv, hung, stuck);

	/* Reset timer in case GPU hangs without another request being added */
	if (busy_count)
		i915_queue_hangcheck(dev_priv);
+3 −1
Original line number Diff line number Diff line
@@ -67,7 +67,9 @@ struct intel_hw_status_page {
enum intel_engine_hangcheck_action {
	HANGCHECK_IDLE = 0,
	HANGCHECK_WAIT,
	HANGCHECK_ACTIVE,
	HANGCHECK_ACTIVE_SEQNO,
	HANGCHECK_ACTIVE_HEAD,
	HANGCHECK_ACTIVE_SUBUNITS,
	HANGCHECK_KICK,
	HANGCHECK_HUNG,
};