drm/i915: Split up hangcheck phases (6e16d028) · Commits · e / devices / android_kernel_oneplus_sm7250

drivers/gpu/drm/i915/i915_gpu_error.c

+6 −2

Original line number	Diff line number	Diff line
		@@ -323,8 +323,12 @@ static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
		return "idle";
		case HANGCHECK_WAIT:
		return "wait";
		case HANGCHECK_ACTIVE:
		return "active";
		case HANGCHECK_ACTIVE_SEQNO:
		return "active seqno";
		case HANGCHECK_ACTIVE_HEAD:
		return "active head";
		case HANGCHECK_ACTIVE_SUBUNITS:
		return "active subunits";
		case HANGCHECK_KICK:
		return "kick";
		case HANGCHECK_HUNG:

drivers/gpu/drm/i915/intel_hangcheck.c

+137 −104

Original line number	Diff line number	Diff line
		@@ -236,11 +236,11 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd)
		memset(&engine->hangcheck.instdone, 0,
		sizeof(engine->hangcheck.instdone));

		return HANGCHECK_ACTIVE;
		return HANGCHECK_ACTIVE_HEAD;
		}

		if (!subunits_stuck(engine))
		return HANGCHECK_ACTIVE;
		return HANGCHECK_ACTIVE_SUBUNITS;

		return HANGCHECK_HUNG;
		}
		@@ -291,48 +291,9 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
		return HANGCHECK_HUNG;
		}

		/*
		* This is called when the chip hasn't reported back with completed
		* batchbuffers in a long time. We keep track of per-ring seqno progress and
		* if there is no progress, the hangcheck score for that ring is increased.
		* Further, acthd is inspected to see if the ring is stuck. If it is stuck,
		* we kick the ring. If we see no progress on three subsequent calls
		* we assume the chip is wedged and try to fix it by resetting the chip.
		*/
		static void i915_hangcheck_elapsed(struct work_struct *work)
		static void hangcheck_load_sample(struct intel_engine_cs *engine,
		struct intel_engine_hangcheck *hc)
		{
		struct drm_i915_private *dev_priv =
		container_of(work, typeof(*dev_priv),
		gpu_error.hangcheck_work.work);
		struct intel_engine_cs *engine;
		enum intel_engine_id id;
		unsigned int hung = 0, stuck = 0;
		int busy_count = 0;
		#define BUSY 1
		#define KICK 5
		#define HUNG 20
		#define ACTIVE_DECAY 15

		if (!i915.enable_hangcheck)
		return;

		if (!READ_ONCE(dev_priv->gt.awake))
		return;

		/* As enabling the GPU requires fairly extensive mmio access,
		* periodically arm the mmio checker to see if we are triggering
		* any invalid access.
		*/
		intel_uncore_arm_unclaimed_mmio_detection(dev_priv);

		for_each_engine(engine, dev_priv, id) {
		bool busy = intel_engine_has_waiter(engine);
		u64 acthd;
		u32 seqno;
		u32 submit;

		semaphore_clear_deadlocks(dev_priv);

		/* We don't strictly need an irq-barrier here, as we are not
		* serving an interrupt request, be paranoid in case the
		* barrier has side-effects (such as preventing a broken
		@@ -343,14 +304,45 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
		if (engine->irq_seqno_barrier)
		engine->irq_seqno_barrier(engine);

		acthd = intel_engine_get_active_head(engine);
		seqno = intel_engine_get_seqno(engine);
		submit = intel_engine_last_submit(engine);
		hc->acthd = intel_engine_get_active_head(engine);
		hc->seqno = intel_engine_get_seqno(engine);
		hc->score = engine->hangcheck.score;
		}

		if (engine->hangcheck.seqno == seqno) {
		if (i915_seqno_passed(seqno, submit)) {
		engine->hangcheck.action = HANGCHECK_IDLE;
		} else {
		static void hangcheck_store_sample(struct intel_engine_cs *engine,
		const struct intel_engine_hangcheck *hc)
		{
		engine->hangcheck.acthd = hc->acthd;
		engine->hangcheck.seqno = hc->seqno;
		engine->hangcheck.score = hc->score;
		engine->hangcheck.action = hc->action;
		}

		static enum intel_engine_hangcheck_action
		hangcheck_get_action(struct intel_engine_cs *engine,
		const struct intel_engine_hangcheck *hc)
		{
		if (engine->hangcheck.seqno != hc->seqno)
		return HANGCHECK_ACTIVE_SEQNO;

		if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
		return HANGCHECK_IDLE;

		return engine_stuck(engine, hc->acthd);
		}

		static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
		struct intel_engine_hangcheck *hc)
		{
		hc->action = hangcheck_get_action(engine, hc);

		switch (hc->action) {
		case HANGCHECK_IDLE:
		case HANGCHECK_WAIT:
		break;

		case HANGCHECK_ACTIVE_HEAD:
		case HANGCHECK_ACTIVE_SUBUNITS:
		/* We always increment the hangcheck score
		* if the engine is busy and still processing
		* the same request, so that no single request
		@@ -366,54 +358,43 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
		* being repeatedly kicked and so responsible
		* for stalling the machine.
		*/
		engine->hangcheck.action =
		engine_stuck(engine, acthd);

		switch (engine->hangcheck.action) {
		case HANGCHECK_IDLE:
		case HANGCHECK_WAIT:
		break;
		case HANGCHECK_ACTIVE:
		engine->hangcheck.score += BUSY;
		hc->score += 1;
		break;

		case HANGCHECK_KICK:
		engine->hangcheck.score += KICK;
		hc->score += 5;
		break;

		case HANGCHECK_HUNG:
		engine->hangcheck.score += HUNG;
		hc->score += 20;
		break;
		}
		}

		if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
		hung |= intel_engine_flag(engine);
		if (engine->hangcheck.action != HANGCHECK_HUNG)
		stuck |= intel_engine_flag(engine);
		}
		} else {
		engine->hangcheck.action = HANGCHECK_ACTIVE;

		case HANGCHECK_ACTIVE_SEQNO:
		/* Gradually reduce the count so that we catch DoS
		* attempts across multiple batches.
		*/
		if (engine->hangcheck.score > 0)
		engine->hangcheck.score -= ACTIVE_DECAY;
		if (engine->hangcheck.score < 0)
		engine->hangcheck.score = 0;
		if (hc->score > 0)
		hc->score -= 15;
		if (hc->score < 0)
		hc->score = 0;

		/* Clear head and subunit states on seqno movement */
		acthd = 0;
		hc->acthd = 0;

		memset(&engine->hangcheck.instdone, 0,
		sizeof(engine->hangcheck.instdone));
		}
		break;

		engine->hangcheck.seqno = seqno;
		engine->hangcheck.acthd = acthd;
		busy_count += busy;
		default:
		MISSING_CASE(hc->action);
		}
		}

		if (hung) {
		static void hangcheck_declare_hang(struct drm_i915_private *i915,
		unsigned int hung,
		unsigned int stuck)
		{
		struct intel_engine_cs *engine;
		char msg[80];
		unsigned int tmp;
		int len;
		@@ -425,14 +406,66 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
		hung &= ~stuck;
		len = scnprintf(msg, sizeof(msg),
		"%s on ", stuck == hung ? "No progress" : "Hang");
		for_each_engine_masked(engine, dev_priv, hung, tmp)
		for_each_engine_masked(engine, i915, hung, tmp)
		len += scnprintf(msg + len, sizeof(msg) - len,
		"%s, ", engine->name);
		msg[len-2] = '\0';

		return i915_handle_error(dev_priv, hung, msg);
		return i915_handle_error(i915, hung, msg);
		}

		/*
		* This is called when the chip hasn't reported back with completed
		* batchbuffers in a long time. We keep track of per-ring seqno progress and
		* if there is no progress, the hangcheck score for that ring is increased.
		* Further, acthd is inspected to see if the ring is stuck. If it is stuck,
		* we kick the ring. If we see no progress on three subsequent calls
		* we assume the chip is wedged and try to fix it by resetting the chip.
		*/
		static void i915_hangcheck_elapsed(struct work_struct *work)
		{
		struct drm_i915_private *dev_priv =
		container_of(work, typeof(*dev_priv),
		gpu_error.hangcheck_work.work);
		struct intel_engine_cs *engine;
		enum intel_engine_id id;
		unsigned int hung = 0, stuck = 0;
		int busy_count = 0;

		if (!i915.enable_hangcheck)
		return;

		if (!READ_ONCE(dev_priv->gt.awake))
		return;

		/* As enabling the GPU requires fairly extensive mmio access,
		* periodically arm the mmio checker to see if we are triggering
		* any invalid access.
		*/
		intel_uncore_arm_unclaimed_mmio_detection(dev_priv);

		for_each_engine(engine, dev_priv, id) {
		struct intel_engine_hangcheck cur_state, *hc = &cur_state;
		const bool busy = intel_engine_has_waiter(engine);

		semaphore_clear_deadlocks(dev_priv);

		hangcheck_load_sample(engine, hc);
		hangcheck_accumulate_sample(engine, hc);
		hangcheck_store_sample(engine, hc);

		if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
		hung |= intel_engine_flag(engine);
		if (hc->action != HANGCHECK_HUNG)
		stuck |= intel_engine_flag(engine);
		}

		busy_count += busy;
		}

		if (hung)
		hangcheck_declare_hang(dev_priv, hung, stuck);

		/* Reset timer in case GPU hangs without another request being added */
		if (busy_count)
		i915_queue_hangcheck(dev_priv);

drivers/gpu/drm/i915/intel_ringbuffer.h

+3 −1

Original line number	Diff line number	Diff line
		@@ -67,7 +67,9 @@ struct intel_hw_status_page {
		enum intel_engine_hangcheck_action {
		HANGCHECK_IDLE = 0,
		HANGCHECK_WAIT,
		HANGCHECK_ACTIVE,
		HANGCHECK_ACTIVE_SEQNO,
		HANGCHECK_ACTIVE_HEAD,
		HANGCHECK_ACTIVE_SUBUNITS,
		HANGCHECK_KICK,
		HANGCHECK_HUNG,
		};