Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 3fb04cb0 authored by Chris Wilson's avatar Chris Wilson
Browse files

drm/i915/selftests: Fix up igt_reset_engine

Now that we skip a per-engine reset on an idle engine, we need to update
the selftest to take that into account. In the process, we find that we
were not stressing the per-engine reset very hard, so add those missing
active resets.

v2: Actually test i915_reset_engine() by loading it with requests.

Fixes: f6ba181a ("drm/i915: Skip an engine reset if it recovered before our preparations")
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104313


Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171217132852.30642-3-chris@chris-wilson.co.uk


Reviewed-by: default avatarMichel Thierry <michel.thierry@intel.com>
parent 151a99ec
Loading
Loading
Loading
Loading
+251 −64
Original line number Diff line number Diff line
@@ -132,6 +132,12 @@ static int emit_recurse_batch(struct hang *h,
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
@@ -140,6 +146,12 @@ static int emit_recurse_batch(struct hang *h,
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
@@ -147,12 +159,24 @@ static int emit_recurse_batch(struct hang *h,
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
		*batch++ = lower_32_bits(vma->node.start);
	}
@@ -234,6 +258,16 @@ static void hang_fini(struct hang *h)
	i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
}

static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
@@ -296,6 +330,9 @@ static void global_reset_lock(struct drm_i915_private *i915)
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	pr_debug("%s: current gpu_error=%08lx\n",
		 __func__, i915->gpu_error.flags);

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
@@ -353,25 +390,75 @@ static int igt_global_reset(void *arg)
	return err;
}

static int igt_reset_engine(void *arg)
static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned int reset_count, reset_engine_count;
	struct hang h;
	int err = 0;

	/* Check that we can issue a global GPU and engine reset */
	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct drm_i915_gem_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine,
							 i915->kernel_context);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_gem_request_get(rq);
				__i915_add_request(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_for_hang(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_gem_request_put(rq);
					err = -EIO;
					break;
				}

				i915_gem_request_put(rq);
			}

			engine->hangcheck.stalled = true;
			engine->hangcheck.seqno =
				intel_engine_get_seqno(engine);

			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine failed\n");
@@ -384,23 +471,47 @@ static int igt_reset_engine(void *arg)
				break;
			}

		if (i915_reset_engine_count(&i915->gpu_error, engine) ==
			reset_engine_count += active;
			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count) {
			pr_err("No %s engine reset recorded!\n", engine->name);
				pr_err("%s engine reset %srecorded!\n",
				       engine->name, active ? "not " : "");
				err = -EINVAL;
				break;
			}

		clear_bit(I915_RESET_ENGINE + engine->id,
			  &i915->gpu_error.flags);
			engine->hangcheck.stalled = false;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

		if (err)
			break;

		cond_resched();
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

static int active_engine(void *data)
{
	struct intel_engine_cs *engine = data;
@@ -462,11 +573,12 @@ static int active_engine(void *data)
	return err;
}

static int igt_reset_active_engines(void *arg)
static int __igt_reset_engine_others(struct drm_i915_private *i915,
				     bool active)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine, *active;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
@@ -476,24 +588,36 @@ static int igt_reset_active_engines(void *arg)
	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		struct task_struct *threads[I915_NUM_ENGINES];
		struct task_struct *threads[I915_NUM_ENGINES] = {};
		unsigned long resets[I915_NUM_ENGINES];
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		memset(threads, 0, sizeof(threads));
		for_each_engine(active, i915, tmp) {
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			if (active == engine)
				continue;

			resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
							      active);
							      other);

			if (other == engine)
				continue;

			tsk = kthread_run(active_engine, active,
					  "igt/%s", active->name);
			tsk = kthread_run(active_engine, other,
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
@@ -503,20 +627,70 @@ static int igt_reset_active_engines(void *arg)
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct drm_i915_gem_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine,
							 i915->kernel_context);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_gem_request_get(rq);
				__i915_add_request(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_for_hang(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_gem_request_put(rq);
					err = -EIO;
					break;
				}

				i915_gem_request_put(rq);
			}

			engine->hangcheck.stalled = true;
			engine->hangcheck.seqno =
				intel_engine_get_seqno(engine);

			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine(%s) failed, err=%d\n",
				       engine->name, err);
				pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
				       engine->name, active ? "active" : "idle", err);
				break;
			}

			engine->hangcheck.stalled = false;
			count++;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + engine->id,
			  &i915->gpu_error.flags);
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, active ? "active" : "idle", count);

		if (i915_reset_engine_count(&i915->gpu_error, engine) -
		    resets[engine->id] != (active ? count : 0)) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, active ? "active" : "idle", count,
			       i915_reset_engine_count(&i915->gpu_error,
						       engine) - resets[engine->id]);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(active, i915, tmp) {
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp])
@@ -524,27 +698,29 @@ static int igt_reset_active_engines(void *arg)

			ret = kthread_stop(threads[tmp]);
			if (ret) {
				pr_err("kthread for active engine %s failed, err=%d\n",
				       active->name, ret);
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp]);

			if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
								   active)) {
								   other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       active->name,
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       active) - resets[tmp]);
				err = -EIO;
							       other) - resets[tmp]);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			err = -EIO;
			if (!err)
				err = -EINVAL;
		}

		if (err)
@@ -556,9 +732,25 @@ static int igt_reset_active_engines(void *arg)
	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine_others(void *arg)
{
	return __igt_reset_engine_others(arg, false);
}

static int igt_reset_active_engine_others(void *arg)
{
	return __igt_reset_engine_others(arg, true);
}

static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
{
	u32 reset_count;
@@ -574,16 +766,6 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
	return reset_count;
}

static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_wait_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
@@ -617,8 +799,8 @@ static int igt_wait_reset(void *arg)
	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("Failed to start request %x, at %x\n",
		       rq->fence.seqno, hws_seqno(&h, rq));
		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_reset(i915, 0);
@@ -712,8 +894,8 @@ static int igt_reset_queue(void *arg)
			if (!wait_for_hang(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("Failed to start request %x, at %x\n",
				       prev->fence.seqno, hws_seqno(&h, prev));
				pr_err("%s: Failed to start request %x, at %x\n",
				       __func__, prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(prev->engine, &p,
						  "%s\n", prev->engine->name);

@@ -819,8 +1001,8 @@ static int igt_handle_error(void *arg)
	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("Failed to start request %x, at %x\n",
		       rq->fence.seqno, hws_seqno(&h, rq));
		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_reset(i915, 0);
@@ -864,21 +1046,26 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_engine),
		SUBTEST(igt_reset_active_engines),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_idle_engine_others),
		SUBTEST(igt_reset_active_engine_others),
		SUBTEST(igt_wait_reset),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_handle_error),
	};
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

	err = i915_subtests(tests, i915);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915);

	return err;