Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit ce800754 authored by Chris Wilson's avatar Chris Wilson
Browse files

drm/i915: Add control flags to i915_handle_error()



Not all callers want the GPU error to handled in the same way, so expose
a control parameter. In the first instance, some callers do not want the
heavyweight error capture so add a bit to request the state to be
captured and saved.

v2: Pass msg down to i915_reset/i915_reset_engine so that we include the
reason for the reset in the dev_notice(), superseding the earlier option
to not print that notice.
v3: Stash the reason inside the i915->gpu_error to handover to the direct
reset from the blocking waiter.

Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Cc: Jeff McGee <jeff.mcgee@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Reviewed-by: default avatarMichel Thierry <michel.thierry@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180320100449.1360-2-chris@chris-wilson.co.uk
parent ca98317b
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -4011,8 +4011,8 @@ i915_wedged_set(void *data, u64 val)
		engine->hangcheck.stalled = true;
	}

	i915_handle_error(i915, val, "Manually set wedged engine mask = %llx",
			  val);
	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
			  "Manually set wedged engine mask = %llx", val);

	wait_on_bit(&i915->gpu_error.flags,
		    I915_RESET_HANDOFF,
+8 −9
Original line number Diff line number Diff line
@@ -1873,7 +1873,6 @@ static int i915_resume_switcheroo(struct drm_device *dev)
/**
 * i915_reset - reset chip after a hang
 * @i915: #drm_i915_private to reset
 * @flags: Instructions
 *
 * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
 * on failure.
@@ -1888,7 +1887,7 @@ static int i915_resume_switcheroo(struct drm_device *dev)
 *   - re-init interrupt state
 *   - re-init display
 */
void i915_reset(struct drm_i915_private *i915, unsigned int flags)
void i915_reset(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	int ret;
@@ -1905,8 +1904,9 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags)
	if (!i915_gem_unset_wedged(i915))
		goto wakeup;

	if (!(flags & I915_RESET_QUIET))
		dev_notice(i915->drm.dev, "Resetting chip after gpu hang\n");
	if (error->reason)
		dev_notice(i915->drm.dev,
			   "Resetting chip for %s\n", error->reason);
	error->reset_count++;

	disable_irq(i915->drm.irq);
@@ -2007,7 +2007,7 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv,
/**
 * i915_reset_engine - reset GPU engine to recover from a hang
 * @engine: engine to reset
 * @flags: options
 * @msg: reason for GPU reset; or NULL for no dev_notice()
 *
 * Reset a specific GPU engine. Useful if a hang is detected.
 * Returns zero on successful reset or otherwise an error code.
@@ -2017,7 +2017,7 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv,
 *  - reset engine (which will force the engine to idle)
 *  - re-init/configure engine
 */
int i915_reset_engine(struct intel_engine_cs *engine, unsigned int flags)
int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
{
	struct i915_gpu_error *error = &engine->i915->gpu_error;
	struct i915_request *active_request;
@@ -2032,10 +2032,9 @@ int i915_reset_engine(struct intel_engine_cs *engine, unsigned int flags)
		goto out;
	}

	if (!(flags & I915_RESET_QUIET)) {
	if (msg)
		dev_notice(engine->i915->drm.dev,
			   "Resetting %s after gpu hang\n", engine->name);
	}
			   "Resetting %s for %s\n", engine->name, msg);
	error->reset_engine_count[engine->id]++;

	if (!engine->i915->guc.execbuf_client)
+5 −5
Original line number Diff line number Diff line
@@ -2700,10 +2700,8 @@ extern void i915_driver_unload(struct drm_device *dev);
extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);

#define I915_RESET_QUIET BIT(0)
extern void i915_reset(struct drm_i915_private *i915, unsigned int flags);
extern int i915_reset_engine(struct intel_engine_cs *engine,
			     unsigned int flags);
extern void i915_reset(struct drm_i915_private *i915);
extern int i915_reset_engine(struct intel_engine_cs *engine, const char *msg);

extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
extern int intel_reset_guc(struct drm_i915_private *dev_priv);
@@ -2751,10 +2749,12 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
			   &dev_priv->gpu_error.hangcheck_work, delay);
}

__printf(3, 4)
__printf(4, 5)
void i915_handle_error(struct drm_i915_private *dev_priv,
		       u32 engine_mask,
		       unsigned long flags,
		       const char *fmt, ...);
#define I915_ERROR_CAPTURE BIT(0)

extern void intel_irq_init(struct drm_i915_private *dev_priv);
extern void intel_irq_fini(struct drm_i915_private *dev_priv);
+3 −0
Original line number Diff line number Diff line
@@ -269,6 +269,9 @@ struct i915_gpu_error {
	/** Number of times an engine has been reset */
	u32 reset_engine_count[I915_NUM_ENGINES];

	/** Reason for the current *global* reset */
	const char *reason;

	/**
	 * Waitqueue to signal when a hang is detected. Used to for waiters
	 * to release the struct_mutex for the reset to procede.
+32 −23
Original line number Diff line number Diff line
@@ -2877,15 +2877,10 @@ static irqreturn_t gen11_irq_handler(int irq, void *arg)
	return IRQ_HANDLED;
}

/**
 * i915_reset_device - do process context error handling work
 * @dev_priv: i915 device private
 *
 * Fire an error uevent so userspace can see that a hang or error
 * was detected.
 */
static void i915_reset_device(struct drm_i915_private *dev_priv)
static void i915_reset_device(struct drm_i915_private *dev_priv,
			      const char *msg)
{
	struct i915_gpu_error *error = &dev_priv->gpu_error;
	struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj;
	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
@@ -2901,29 +2896,32 @@ static void i915_reset_device(struct drm_i915_private *dev_priv)
	i915_wedge_on_timeout(&w, dev_priv, 5*HZ) {
		intel_prepare_reset(dev_priv);

		error->reason = msg;

		/* Signal that locked waiters should reset the GPU */
		set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
		wake_up_all(&dev_priv->gpu_error.wait_queue);
		set_bit(I915_RESET_HANDOFF, &error->flags);
		wake_up_all(&error->wait_queue);

		/* Wait for anyone holding the lock to wakeup, without
		 * blocking indefinitely on struct_mutex.
		 */
		do {
			if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
				i915_reset(dev_priv, 0);
				i915_reset(dev_priv);
				mutex_unlock(&dev_priv->drm.struct_mutex);
			}
		} while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
		} while (wait_on_bit_timeout(&error->flags,
					     I915_RESET_HANDOFF,
					     TASK_UNINTERRUPTIBLE,
					     1));

		error->reason = NULL;

		intel_finish_reset(dev_priv);
	}

	if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
		kobject_uevent_env(kobj,
				   KOBJ_CHANGE, reset_done_event);
	if (!test_bit(I915_WEDGED, &error->flags))
		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
}

static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
@@ -2955,6 +2953,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
 * i915_handle_error - handle a gpu error
 * @dev_priv: i915 device private
 * @engine_mask: mask representing engines that are hung
 * @flags: control flags
 * @fmt: Error message format string
 *
 * Do some basic checking of register state at error time and
@@ -2965,17 +2964,24 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
 */
void i915_handle_error(struct drm_i915_private *dev_priv,
		       u32 engine_mask,
		       unsigned long flags,
		       const char *fmt, ...)
{
	struct intel_engine_cs *engine;
	unsigned int tmp;
	va_list args;
	char error_msg[80];
	char *msg = NULL;

	if (fmt) {
		va_list args;

		va_start(args, fmt);
		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
		va_end(args);

		msg = error_msg;
	}

	/*
	 * In most cases it's guaranteed that we get here with an RPM
	 * reference held, for example because there is a pending GPU
@@ -2986,8 +2992,11 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
	intel_runtime_pm_get(dev_priv);

	engine_mask &= INTEL_INFO(dev_priv)->ring_mask;
	i915_capture_error_state(dev_priv, engine_mask, error_msg);

	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(dev_priv, engine_mask, msg);
		i915_clear_error_registers(dev_priv);
	}

	/*
	 * Try engine reset when available. We fall back to full reset if
@@ -3000,7 +3009,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
					     &dev_priv->gpu_error.flags))
				continue;

			if (i915_reset_engine(engine, 0) == 0)
			if (i915_reset_engine(engine, msg) == 0)
				engine_mask &= ~intel_engine_flag(engine);

			clear_bit(I915_RESET_ENGINE + engine->id,
@@ -3030,7 +3039,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
				    TASK_UNINTERRUPTIBLE);
	}

	i915_reset_device(dev_priv);
	i915_reset_device(dev_priv, msg);

	for_each_engine(engine, dev_priv, tmp) {
		clear_bit(I915_RESET_ENGINE + engine->id,
Loading