Commit 0bfa4b41 authored by Christian König, committed by Alex Deucher

drm/radeon: handle lockup in delayed work, v5



v5 (chk): complete rework, start when the first fence is emitted,
          stop when the last fence is signalled, make it work
          correctly with GPU resets, cleanup radeon_fence_wait_seq
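
The rework boils down to one self-rearming delayed work item per ring: it is
armed when the first fence is emitted, re-arms itself only while fences are
still outstanding, and is cancelled synchronously on teardown. A minimal
standalone sketch of that pattern (the my_* names are hypothetical, not the
radeon code itself):

	#include <linux/types.h>
	#include <linux/workqueue.h>
	#include <linux/atomic.h>

	#define MY_CHECK_JIFFIES (HZ / 2)	/* radeon uses RADEON_FENCE_JIFFIES_TIMEOUT */

	struct my_ring {
		u64			sync_seq;	/* last sequence number emitted */
		atomic64_t		last_seq;	/* last sequence number signalled */
		struct delayed_work	lockup_work;
	};

	static void my_schedule_check(struct my_ring *ring)
	{
		/* queue_delayed_work() is a no-op while the work is still
		 * pending, so emitting many fences arms the timer only once */
		queue_delayed_work(system_power_efficient_wq,
				   &ring->lockup_work, MY_CHECK_JIFFIES);
	}

	static void my_emit_fence(struct my_ring *ring)
	{
		ring->sync_seq++;
		/* ... write sync_seq to the ring here ... */
		my_schedule_check(ring);	/* start: the first fence arms the timer */
	}

	static void my_check_lockup(struct work_struct *work)
	{
		struct my_ring *ring = container_of(work, struct my_ring,
						    lockup_work.work);

		if (atomic64_read(&ring->last_seq) < ring->sync_seq) {
			/* fences still pending: probe the hardware here,
			 * then re-arm the timer */
			my_schedule_check(ring);
		}
		/* stop: once everything is signalled, simply don't re-queue */
	}

	static void my_ring_init(struct my_ring *ring)
	{
		INIT_DELAYED_WORK(&ring->lockup_work, my_check_lockup);
	}

	static void my_ring_fini(struct my_ring *ring)
	{
		cancel_delayed_work_sync(&ring->lockup_work);
	}

As the comment in the patch notes, re-arming with queue_delayed_work() rather
than mod_delayed_work() is deliberate: resetting the timer can livelock in an
interaction with TTM's delayed destroy.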

Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 9bb39ff4

drivers/gpu/drm/radeon/radeon.h +2 −0
@@ -350,6 +350,7 @@ extern void evergreen_tiling_fields(unsigned tiling_flags, unsigned *bankw,
  * Fences.
  */
 struct radeon_fence_driver {
+	struct radeon_device		*rdev;
 	uint32_t			scratch_reg;
 	uint64_t			gpu_addr;
 	volatile uint32_t		*cpu_addr;
@@ -357,6 +358,7 @@ struct radeon_fence_driver {
 	uint64_t			sync_seq[RADEON_NUM_RINGS];
 	atomic64_t			last_seq;
 	bool				initialized;
+	struct delayed_work		lockup_work;
 };
 
 struct radeon_fence {

drivers/gpu/drm/radeon/radeon_fence.c +122 −78
@@ -97,6 +97,25 @@ static u32 radeon_fence_read(struct radeon_device *rdev, int ring)
 	return seq;
 }
 
+/**
+ * radeon_fence_schedule_check - schedule lockup check
+ *
+ * @rdev: radeon_device pointer
+ * @ring: ring index we should work with
+ *
+ * Queues a delayed work item to check for lockups.
+ */
+static void radeon_fence_schedule_check(struct radeon_device *rdev, int ring)
+{
+	/*
+	 * Do not reset the timer here with mod_delayed_work,
+	 * this can livelock in an interaction with TTM delayed destroy.
+	 */
+	queue_delayed_work(system_power_efficient_wq,
+			   &rdev->fence_drv[ring].lockup_work,
+			   RADEON_FENCE_JIFFIES_TIMEOUT);
+}
+
 /**
  * radeon_fence_emit - emit a fence on the requested ring
  *
@@ -122,19 +141,21 @@ int radeon_fence_emit(struct radeon_device *rdev,
 	(*fence)->ring = ring;
 	radeon_fence_ring_emit(rdev, ring, *fence);
 	trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq);
+	radeon_fence_schedule_check(rdev, ring);
 	return 0;
 }
 
 /**
- * radeon_fence_process - process a fence
+ * radeon_fence_activity - check for fence activity
  *
  * @rdev: radeon_device pointer
  * @ring: ring index the fence is associated with
  *
- * Checks the current fence value and wakes the fence queue
- * if the sequence number has increased (all asics).
+ * Checks the current fence value and calculates the last
+ * signalled fence value. Returns true if activity occurred
+ * on the ring and the fence_queue should be woken up.
  */
-void radeon_fence_process(struct radeon_device *rdev, int ring)
+static bool radeon_fence_activity(struct radeon_device *rdev, int ring)
 {
 	uint64_t seq, last_seq, last_emitted;
 	unsigned count_loop = 0;
@@ -190,7 +211,67 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
 		}
 	} while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
 
-	if (wake)
-		wake_up_all(&rdev->fence_queue);
+	if (seq < last_emitted)
+		radeon_fence_schedule_check(rdev, ring);
+
+	return wake;
+}
+
+/**
+ * radeon_fence_check_lockup - check for hardware lockup
+ *
+ * @work: delayed work item
+ *
+ * Checks for fence activity and, if there is none, probes
+ * the hardware to see if a lockup occurred.
+ */
+static void radeon_fence_check_lockup(struct work_struct *work)
+{
+	struct radeon_fence_driver *fence_drv;
+	struct radeon_device *rdev;
+	int ring;
+
+	fence_drv = container_of(work, struct radeon_fence_driver,
+				 lockup_work.work);
+	rdev = fence_drv->rdev;
+	ring = fence_drv - &rdev->fence_drv[0];
+
+	if (!down_read_trylock(&rdev->exclusive_lock)) {
+		/* just reschedule the check if a reset is going on */
+		radeon_fence_schedule_check(rdev, ring);
+		return;
+	}
+
+	if (radeon_fence_activity(rdev, ring))
+		wake_up_all(&rdev->fence_queue);
+
+	else if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
+
+		/* good news we believe it's a lockup */
+		dev_warn(rdev->dev, "GPU lockup (current fence id "
+			 "0x%016llx last fence id 0x%016llx on ring %d)\n",
+			 (uint64_t)atomic64_read(&fence_drv->last_seq),
+			 fence_drv->sync_seq[ring], ring);
+
+		/* remember that we need a reset */
+		rdev->needs_reset = true;
+		wake_up_all(&rdev->fence_queue);
+	}
+	up_read(&rdev->exclusive_lock);
+}
+
+/**
+ * radeon_fence_process - process a fence
+ *
+ * @rdev: radeon_device pointer
+ * @ring: ring index the fence is associated with
+ *
+ * Checks the current fence value and wakes the fence queue
+ * if the sequence number has increased (all asics).
+ */
+void radeon_fence_process(struct radeon_device *rdev, int ring)
+{
+	if (radeon_fence_activity(rdev, ring))
+		wake_up_all(&rdev->fence_queue);
 }
 
@@ -300,86 +381,43 @@ static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev, u64 *seq)
 static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
 				 bool intr)
 {
-	uint64_t last_seq[RADEON_NUM_RINGS];
-	bool signaled;
-	int i, r;
-
-	while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
-
-		/* Save current sequence values, used to check for GPU lockups */
-		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
-			if (!target_seq[i])
-				continue;
-
-			last_seq[i] = atomic64_read(&rdev->fence_drv[i].last_seq);
-			trace_radeon_fence_wait_begin(rdev->ddev, i, target_seq[i]);
-			radeon_irq_kms_sw_irq_get(rdev, i);
-		}
-
-		if (intr) {
-			r = wait_event_interruptible_timeout(rdev->fence_queue, (
-				(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
-				 || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
-		} else {
-			r = wait_event_timeout(rdev->fence_queue, (
-				(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
-				 || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
-		}
+	long r;
+	int i;
 
-		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
-			if (!target_seq[i])
-				continue;
+	if (radeon_fence_any_seq_signaled(rdev, target_seq))
+		return 0;
 
-			radeon_irq_kms_sw_irq_put(rdev, i);
-			trace_radeon_fence_wait_end(rdev->ddev, i, target_seq[i]);
-		}
+	/* enable IRQs and tracing */
+	for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+		if (!target_seq[i])
+			continue;
 
-		if (unlikely(r < 0))
-			return r;
+		trace_radeon_fence_wait_begin(rdev->ddev, i, target_seq[i]);
+		radeon_irq_kms_sw_irq_get(rdev, i);
+	}
 
-		if (unlikely(!signaled)) {
-			if (rdev->needs_reset)
-				return -EDEADLK;
+	if (intr) {
+		r = wait_event_interruptible_timeout(rdev->fence_queue, (
+			radeon_fence_any_seq_signaled(rdev, target_seq)
+			 || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
+	} else {
+		r = wait_event_timeout(rdev->fence_queue, (
+			radeon_fence_any_seq_signaled(rdev, target_seq)
+			 || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
+	}
 
-			/* we were interrupted for some reason and fence
-			 * isn't signaled yet, resume waiting */
-			if (r)
-				continue;
+	if (rdev->needs_reset)
+		r = -EDEADLK;
 
-			for (i = 0; i < RADEON_NUM_RINGS; ++i) {
-				if (!target_seq[i])
-					continue;
+	for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+		if (!target_seq[i])
+			continue;
 
-				if (last_seq[i] != atomic64_read(&rdev->fence_drv[i].last_seq))
-					break;
-			}
+		radeon_irq_kms_sw_irq_put(rdev, i);
+		trace_radeon_fence_wait_end(rdev->ddev, i, target_seq[i]);
+	}
 
-			if (i != RADEON_NUM_RINGS)
-				continue;
-
-			for (i = 0; i < RADEON_NUM_RINGS; ++i) {
-				if (!target_seq[i])
-					continue;
-
-				if (radeon_ring_is_lockup(rdev, i, &rdev->ring[i]))
-					break;
-			}
-
-			if (i < RADEON_NUM_RINGS) {
-				/* good news we believe it's a lockup */
-				dev_warn(rdev->dev, "GPU lockup (waiting for "
-					 "0x%016llx last fence id 0x%016llx on"
-					 " ring %d)\n",
-					 target_seq[i], last_seq[i], i);
-
-				/* remember that we need an reset */
-				rdev->needs_reset = true;
-				wake_up_all(&rdev->fence_queue);
-				return -EDEADLK;
-			}
-		}
-	}
-	return 0;
+	return r < 0 ? r : 0;
 }
 
 /**
@@ -711,6 +749,9 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
 		rdev->fence_drv[ring].sync_seq[i] = 0;
 	atomic64_set(&rdev->fence_drv[ring].last_seq, 0);
 	rdev->fence_drv[ring].initialized = false;
+	INIT_DELAYED_WORK(&rdev->fence_drv[ring].lockup_work,
+			  radeon_fence_check_lockup);
+	rdev->fence_drv[ring].rdev = rdev;
 }
 
 /**
@@ -760,6 +801,7 @@ void radeon_fence_driver_fini(struct radeon_device *rdev)
 			/* no need to trigger GPU reset as we are unloading */
 			radeon_fence_driver_force_completion(rdev, ring);
 		}
+		cancel_delayed_work_sync(&rdev->fence_drv[ring].lockup_work);
 		wake_up_all(&rdev->fence_queue);
 		radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg);
 		rdev->fence_drv[ring].initialized = false;
@@ -778,8 +820,10 @@ void radeon_fence_driver_fini(struct radeon_device *rdev)
  */
 void radeon_fence_driver_force_completion(struct radeon_device *rdev, int ring)
 {
-	if (rdev->fence_drv[ring].initialized)
+	if (rdev->fence_drv[ring].initialized) {
 		radeon_fence_write(rdev, rdev->fence_drv[ring].sync_seq[ring], ring);
+		cancel_delayed_work_sync(&rdev->fence_drv[ring].lockup_work);
+	}
 }
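
Taken together, the runtime interaction in the lockup case looks roughly like
this (an illustrative trace assembled from the hunks above, not part of the
commit):

	radeon_fence_emit()
	    radeon_fence_schedule_check()        /* arm the per-ring watchdog */

	/* RADEON_FENCE_JIFFIES_TIMEOUT later, still nothing signalled */
	radeon_fence_check_lockup()
	    radeon_fence_activity()  -> false    /* no progress on the ring */
	    radeon_ring_is_lockup()  -> true
	    rdev->needs_reset = true
	    wake_up_all(&rdev->fence_queue)      /* unblock sleeping waiters */

	radeon_fence_wait_seq()                  /* woken from its MAX_SCHEDULE_TIMEOUT sleep */
	    sees rdev->needs_reset
	    return -EDEADLK                      /* caller goes on to reset the GPU */

This is why radeon_fence_wait_seq() can now sleep with MAX_SCHEDULE_TIMEOUT
instead of RADEON_FENCE_JIFFIES_TIMEOUT: lockup detection no longer depends on
the waiter waking up periodically to probe the hardware itself.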