Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 8af29b0c authored by Chris Wilson
Browse files

drm/i915: Separate out reset flags from the reset counter



In preparation for introducing a per-engine reset, we can first separate
the mixing of the reset state from the global reset counter.

The loss of atomicity in updating the reset state poses a small problem
for handling the waiters. For requests, this is solved by advancing the
seqno so that a waiter waking up after the reset knows the request is
complete. For pending flips, we still rely on the increment of the
global reset epoch (as well as the reset-in-progress flag) to signify
when the hardware was reset.

The advantage, now that we do not inspect the reset state during reset
itself i.e. we no longer emit requests during reset, is that we can use
the atomic updates of the state flags to ensure that only one reset
worker is active.

v2: Mika spotted that I transformed the i915_gem_wait_for_error() wakeup
into a waiter wakeup.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Arun Siluvery <arun.siluvery@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1470414607-32453-6-git-send-email-arun.siluvery@linux.intel.com


Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20160909131201.16673-7-chris@chris-wilson.co.uk
parent 70c2a24d
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -1287,6 +1287,15 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
	enum intel_engine_id id;
	int j;

	if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
		seq_printf(m, "Wedged\n");
	if (test_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags))
		seq_printf(m, "Reset in progress\n");
	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
		seq_printf(m, "Waiter holding struct mutex\n");
	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
		seq_printf(m, "struct_mutex blocked for reset\n");

	if (!i915.enable_hangcheck) {
		seq_printf(m, "Hangcheck disabled\n");
		return 0;
+5 −11
Original line number Diff line number Diff line
@@ -1579,7 +1579,7 @@ static int i915_drm_resume(struct drm_device *dev)
	mutex_lock(&dev->struct_mutex);
	if (i915_gem_init_hw(dev)) {
		DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
		atomic_or(I915_WEDGED, &dev_priv->gpu_error.reset_counter);
		set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
	}
	mutex_unlock(&dev->struct_mutex);

@@ -1741,20 +1741,13 @@ int i915_reset(struct drm_i915_private *dev_priv)
{
	struct drm_device *dev = &dev_priv->drm;
	struct i915_gpu_error *error = &dev_priv->gpu_error;
	unsigned reset_counter;
	int ret;

	mutex_lock(&dev->struct_mutex);

	/* Clear any previous failed attempts at recovery. Time to try again. */
	atomic_andnot(I915_WEDGED, &error->reset_counter);

	/* Clear the reset-in-progress flag and increment the reset epoch. */
	reset_counter = atomic_inc_return(&error->reset_counter);
	if (WARN_ON(__i915_reset_in_progress(reset_counter))) {
		ret = -EIO;
		goto error;
	}
	__clear_bit(I915_WEDGED, &error->flags);
	error->reset_count++;

	pr_notice("drm/i915: Resetting chip after gpu hang\n");

@@ -1791,6 +1784,7 @@ int i915_reset(struct drm_i915_private *dev_priv)
		goto error;
	}

	clear_bit(I915_RESET_IN_PROGRESS, &error->flags);
	mutex_unlock(&dev->struct_mutex);

	/*
@@ -1805,7 +1799,7 @@ int i915_reset(struct drm_i915_private *dev_priv)
	return 0;

error:
	atomic_or(I915_WEDGED, &error->reset_counter);
	set_bit(I915_WEDGED, &error->flags);
	mutex_unlock(&dev->struct_mutex);
	return ret;
}
+14 −32
Original line number Diff line number Diff line
@@ -1405,9 +1405,10 @@ struct i915_gpu_error {
	 * State variable controlling the reset flow and count
	 *
	 * This is a counter which gets incremented when reset is triggered,
	 * and again when reset has been handled. So odd values (lowest bit set)
	 * means that reset is in progress and even values that
	 * (reset_counter >> 1):th reset was successfully completed.
	 *
	 * Before the reset commences, the I915_RESET_IN_PROGRESS bit is set
	 * meaning that any waiters holding onto the struct_mutex should
	 * relinquish the lock immediately in order for the reset to start.
	 *
	 * If reset is not completed successfully, the I915_WEDGED bit is
	 * set meaning that hardware is terminally sour and there is no
@@ -1422,10 +1423,11 @@ struct i915_gpu_error {
	 * naturally enforces the correct ordering between the bail-out of the
	 * waiter and the gpu reset work code.
	 */
	atomic_t reset_counter;
	unsigned long reset_count;

#define I915_RESET_IN_PROGRESS_FLAG	1
#define I915_WEDGED			(1 << 31)
	unsigned long flags;
#define I915_RESET_IN_PROGRESS	0
#define I915_WEDGED		(BITS_PER_LONG - 1)

	/**
	 * Waitqueue to signal when a hang is detected. Used for waiters
@@ -3241,44 +3243,24 @@ i915_gem_find_active_request(struct intel_engine_cs *engine);

void i915_gem_retire_requests(struct drm_i915_private *dev_priv);

static inline u32 i915_reset_counter(struct i915_gpu_error *error)
{
	return atomic_read(&error->reset_counter);
}

static inline bool __i915_reset_in_progress(u32 reset)
{
	return unlikely(reset & I915_RESET_IN_PROGRESS_FLAG);
}

static inline bool __i915_reset_in_progress_or_wedged(u32 reset)
{
	return unlikely(reset & (I915_RESET_IN_PROGRESS_FLAG | I915_WEDGED));
}

static inline bool __i915_terminally_wedged(u32 reset)
{
	return unlikely(reset & I915_WEDGED);
}

static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
{
	return __i915_reset_in_progress(i915_reset_counter(error));
	return unlikely(test_bit(I915_RESET_IN_PROGRESS, &error->flags));
}

static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error)
static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
{
	return __i915_reset_in_progress_or_wedged(i915_reset_counter(error));
	return unlikely(test_bit(I915_WEDGED, &error->flags));
}

static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error)
{
	return __i915_terminally_wedged(i915_reset_counter(error));
	return i915_reset_in_progress(error) | i915_terminally_wedged(error);
}

static inline u32 i915_reset_count(struct i915_gpu_error *error)
{
	return ((i915_reset_counter(error) & ~I915_WEDGED) + 1) / 2;
	return READ_ONCE(error->reset_count);
}

void i915_gem_reset(struct drm_device *dev);
+1 −1
Original line number Diff line number Diff line
@@ -4525,7 +4525,7 @@ int i915_gem_init(struct drm_device *dev)
		 * for all other failure, such as an allocation failure, bail.
		 */
		DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
		atomic_or(I915_WEDGED, &dev_priv->gpu_error.reset_counter);
		set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
		ret = 0;
	}

+7 −6
Original line number Diff line number Diff line
@@ -233,16 +233,18 @@ void i915_gem_request_retire_upto(struct drm_i915_gem_request *req)
	} while (tmp != req);
}

static int i915_gem_check_wedge(unsigned int reset_counter, bool interruptible)
static int i915_gem_check_wedge(struct drm_i915_private *dev_priv)
{
	if (__i915_terminally_wedged(reset_counter))
	struct i915_gpu_error *error = &dev_priv->gpu_error;

	if (i915_terminally_wedged(error))
		return -EIO;

	if (__i915_reset_in_progress(reset_counter)) {
	if (i915_reset_in_progress(error)) {
		/* Non-interruptible callers can't handle -EAGAIN, hence return
		 * -EIO unconditionally for these.
		 */
		if (!interruptible)
		if (!dev_priv->mm.interruptible)
			return -EIO;

		return -EAGAIN;
@@ -331,7 +333,6 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
		       struct i915_gem_context *ctx)
{
	struct drm_i915_private *dev_priv = engine->i915;
	unsigned int reset_counter = i915_reset_counter(&dev_priv->gpu_error);
	struct drm_i915_gem_request *req;
	u32 seqno;
	int ret;
@@ -340,7 +341,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
	 * EIO if the GPU is already wedged, or EAGAIN to drop the struct_mutex
	 * and restart.
	 */
	ret = i915_gem_check_wedge(reset_counter, dev_priv->mm.interruptible);
	ret = i915_gem_check_wedge(dev_priv);
	if (ret)
		return ERR_PTR(ret);

Loading