Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 225758d8 authored by Jerome Glisse's avatar Jerome Glisse Committed by Dave Airlie
Browse files

drm/radeon/kms: fence cleanup + more reliable GPU lockup detection V4



This patch cleanup the fence code, it drops the timeout field of
fence as the time to complete each IB is unpredictable and shouldn't
be bound.

The fence cleanup lead to GPU lockup detection improvement, this
patch introduce a callback, allowing to do asic specific test for
lockup detection. In this patch the CP is use as a first indicator
of GPU lockup. If CP doesn't make progress during 1second we assume
we are facing a GPU lockup.

To avoid overhead of testing GPU lockup frequently due to fence
taking time to be signaled we query the lockup callback every
500msec. There is plenty code comment explaining the design & choise
inside the code.

This have been tested mostly on R3XX/R5XX hw, in normal running
destkop (compiz firefox, quake3 running) the lockup callback wasn't
call once (1 hour session). Also tested with forcing GPU lockup and
lockup was reported after the 1s CP activity timeout.

V2 switch to 500ms timeout so GPU lockup get call at least 2 times
   in less than 2sec.
V3 store last jiffies in fence struct so on ERESTART, EBUSY we keep
   track of how long we already wait for a given fence
V4 make sure we got up to date cp read pointer so we don't have
   false positive

Signed-off-by: default avatarJerome Glisse <jglisse@redhat.com>
Signed-off-by: default avatarDave Airlie <airlied@redhat.com>
parent 95beb690
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -486,6 +486,12 @@ int evergreen_mc_init(struct radeon_device *rdev)
	return 0;
}

bool evergreen_gpu_is_lockup(struct radeon_device *rdev)
{
	/* FIXME: implement for evergreen */
	return false;
}

int evergreen_gpu_reset(struct radeon_device *rdev)
{
	/* FIXME: implement for evergreen */
+86 −0
Original line number Diff line number Diff line
@@ -1777,6 +1777,92 @@ int r100_rb2d_reset(struct radeon_device *rdev)
	return -1;
}

void r100_gpu_lockup_update(struct r100_gpu_lockup *lockup, struct radeon_cp *cp)
{
	lockup->last_cp_rptr = cp->rptr;
	lockup->last_jiffies = jiffies;
}

/**
 * r100_gpu_cp_is_lockup() - check if CP is lockup by recording information
 * @rdev:	radeon device structure
 * @lockup:	r100_gpu_lockup structure holding CP lockup tracking informations
 * @cp:		radeon_cp structure holding CP information
 *
 * We don't need to initialize the lockup tracking information as we will either
 * have CP rptr to a different value of jiffies wrap around which will force
 * initialization of the lockup tracking informations.
 *
 * A possible false positivie is if we get call after while and last_cp_rptr ==
 * the current CP rptr, even if it's unlikely it might happen. To avoid this
 * if the elapsed time since last call is bigger than 2 second than we return
 * false and update the tracking information. Due to this the caller must call
 * r100_gpu_cp_is_lockup several time in less than 2sec for lockup to be reported
 * the fencing code should be cautious about that.
 *
 * Caller should write to the ring to force CP to do something so we don't get
 * false positive when CP is just gived nothing to do.
 *
 **/
bool r100_gpu_cp_is_lockup(struct radeon_device *rdev, struct r100_gpu_lockup *lockup, struct radeon_cp *cp)
{
	unsigned long cjiffies, elapsed;

	cjiffies = jiffies;
	if (!time_after(cjiffies, lockup->last_jiffies)) {
		/* likely a wrap around */
		lockup->last_cp_rptr = cp->rptr;
		lockup->last_jiffies = jiffies;
		return false;
	}
	if (cp->rptr != lockup->last_cp_rptr) {
		/* CP is still working no lockup */
		lockup->last_cp_rptr = cp->rptr;
		lockup->last_jiffies = jiffies;
		return false;
	}
	elapsed = jiffies_to_msecs(cjiffies - lockup->last_jiffies);
	if (elapsed >= 3000) {
		/* very likely the improbable case where current
		 * rptr is equal to last recorded, a while ago, rptr
		 * this is more likely a false positive update tracking
		 * information which should force us to be recall at
		 * latter point
		 */
		lockup->last_cp_rptr = cp->rptr;
		lockup->last_jiffies = jiffies;
		return false;
	}
	if (elapsed >= 1000) {
		dev_err(rdev->dev, "GPU lockup CP stall for more than %lumsec\n", elapsed);
		return true;
	}
	/* give a chance to the GPU ... */
	return false;
}

bool r100_gpu_is_lockup(struct radeon_device *rdev)
{
	u32 rbbm_status;
	int r;

	rbbm_status = RREG32(R_000E40_RBBM_STATUS);
	if (!G_000E40_GUI_ACTIVE(rbbm_status)) {
		r100_gpu_lockup_update(&rdev->config.r100.lockup, &rdev->cp);
		return false;
	}
	/* force CP activities */
	r = radeon_ring_lock(rdev, 2);
	if (!r) {
		/* PACKET2 NOP */
		radeon_ring_write(rdev, 0x80000000);
		radeon_ring_write(rdev, 0x80000000);
		radeon_ring_unlock_commit(rdev);
	}
	rdev->cp.rptr = RREG32(RADEON_CP_RB_RPTR);
	return r100_gpu_cp_is_lockup(rdev, &rdev->config.r100.lockup, &rdev->cp);
}

int r100_gpu_reset(struct radeon_device *rdev)
{
	uint32_t status;
+26 −2
Original line number Diff line number Diff line
@@ -26,8 +26,9 @@
 *          Jerome Glisse
 */
#include <linux/seq_file.h>
#include "drmP.h"
#include "drm.h"
#include <drm/drmP.h>
#include <drm/drm.h>
#include <drm/drm_crtc_helper.h>
#include "radeon_reg.h"
#include "radeon.h"
#include "radeon_asic.h"
@@ -426,12 +427,35 @@ int r300_ga_reset(struct radeon_device *rdev)
	return -1;
}

bool r300_gpu_is_lockup(struct radeon_device *rdev)
{
	u32 rbbm_status;
	int r;

	rbbm_status = RREG32(R_000E40_RBBM_STATUS);
	if (!G_000E40_GUI_ACTIVE(rbbm_status)) {
		r100_gpu_lockup_update(&rdev->config.r300.lockup, &rdev->cp);
		return false;
	}
	/* force CP activities */
	r = radeon_ring_lock(rdev, 2);
	if (!r) {
		/* PACKET2 NOP */
		radeon_ring_write(rdev, 0x80000000);
		radeon_ring_write(rdev, 0x80000000);
		radeon_ring_unlock_commit(rdev);
	}
	rdev->cp.rptr = RREG32(RADEON_CP_RB_RPTR);
	return r100_gpu_cp_is_lockup(rdev, &rdev->config.r300.lockup, &rdev->cp);
}

int r300_gpu_reset(struct radeon_device *rdev)
{
	uint32_t status;

	/* reset order likely matter */
	status = RREG32(RADEON_RBBM_STATUS);
	dev_info(rdev->dev, "(%s:%d) RBBM_STATUS=0x%08X\n", __func__, __LINE__, status);
	/* reset HDP */
	r100_hdp_reset(rdev);
	/* reset rb2d */
+30 −4
Original line number Diff line number Diff line
@@ -784,7 +784,7 @@ int r600_gpu_soft_reset(struct radeon_device *rdev)
		dev_info(rdev->dev, "  R_008020_GRBM_SOFT_RESET=0x%08X\n", tmp);
		WREG32(R_008020_GRBM_SOFT_RESET, tmp);
		(void)RREG32(R_008020_GRBM_SOFT_RESET);
		udelay(50);
		mdelay(1);
		WREG32(R_008020_GRBM_SOFT_RESET, 0);
		(void)RREG32(R_008020_GRBM_SOFT_RESET);
	}
@@ -824,16 +824,16 @@ int r600_gpu_soft_reset(struct radeon_device *rdev)
	dev_info(rdev->dev, "  R_000E60_SRBM_SOFT_RESET=0x%08X\n", srbm_reset);
	WREG32(R_000E60_SRBM_SOFT_RESET, srbm_reset);
	(void)RREG32(R_000E60_SRBM_SOFT_RESET);
	udelay(50);
	mdelay(1);
	WREG32(R_000E60_SRBM_SOFT_RESET, 0);
	(void)RREG32(R_000E60_SRBM_SOFT_RESET);
	WREG32(R_000E60_SRBM_SOFT_RESET, srbm_reset);
	(void)RREG32(R_000E60_SRBM_SOFT_RESET);
	udelay(50);
	mdelay(1);
	WREG32(R_000E60_SRBM_SOFT_RESET, 0);
	(void)RREG32(R_000E60_SRBM_SOFT_RESET);
	/* Wait a little for things to settle down */
	udelay(50);
	mdelay(1);
	dev_info(rdev->dev, "  R_008010_GRBM_STATUS=0x%08X\n",
		RREG32(R_008010_GRBM_STATUS));
	dev_info(rdev->dev, "  R_008014_GRBM_STATUS2=0x%08X\n",
@@ -848,6 +848,32 @@ int r600_gpu_soft_reset(struct radeon_device *rdev)
	return 0;
}

bool r600_gpu_is_lockup(struct radeon_device *rdev)
{
	u32 srbm_status;
	u32 grbm_status;
	u32 grbm_status2;
	int r;

	srbm_status = RREG32(R_000E50_SRBM_STATUS);
	grbm_status = RREG32(R_008010_GRBM_STATUS);
	grbm_status2 = RREG32(R_008014_GRBM_STATUS2);
	if (!G_008010_GUI_ACTIVE(grbm_status)) {
		r100_gpu_lockup_update(&rdev->config.r300.lockup, &rdev->cp);
		return false;
	}
	/* force CP activities */
	r = radeon_ring_lock(rdev, 2);
	if (!r) {
		/* PACKET2 NOP */
		radeon_ring_write(rdev, 0x80000000);
		radeon_ring_write(rdev, 0x80000000);
		radeon_ring_unlock_commit(rdev);
	}
	rdev->cp.rptr = RREG32(R600_CP_RB_RPTR);
	return r100_gpu_cp_is_lockup(rdev, &rdev->config.r300.lockup, &rdev->cp);
}

int r600_gpu_reset(struct radeon_device *rdev)
{
	return r600_gpu_soft_reset(rdev);
+59 −45
Original line number Diff line number Diff line
@@ -99,6 +99,7 @@ extern int radeon_hw_i2c;
 * symbol;
 */
#define RADEON_MAX_USEC_TIMEOUT		100000	/* 100 ms */
#define RADEON_FENCE_JIFFIES_TIMEOUT	(HZ / 2)
/* RADEON_IB_POOL_SIZE must be a power of 2 */
#define RADEON_IB_POOL_SIZE		16
#define RADEON_DEBUGFS_MAX_NUM_FILES	32
@@ -182,7 +183,8 @@ struct radeon_fence_driver {
	uint32_t			scratch_reg;
	atomic_t			seq;
	uint32_t			last_seq;
	unsigned long			count_timeout;
	unsigned long			last_jiffies;
	unsigned long			last_timeout;
	wait_queue_head_t		queue;
	rwlock_t			lock;
	struct list_head		created;
@@ -197,7 +199,6 @@ struct radeon_fence {
	struct list_head		list;
	/* protected by radeon_fence.lock */
	uint32_t			seq;
	unsigned long			timeout;
	bool				emited;
	bool				signaled;
};
@@ -746,6 +747,7 @@ struct radeon_asic {
	int (*resume)(struct radeon_device *rdev);
	int (*suspend)(struct radeon_device *rdev);
	void (*vga_set_state)(struct radeon_device *rdev, bool state);
	bool (*gpu_is_lockup)(struct radeon_device *rdev);
	int (*gpu_reset)(struct radeon_device *rdev);
	void (*gart_tlb_flush)(struct radeon_device *rdev);
	int (*gart_set_page)(struct radeon_device *rdev, int i, uint64_t addr);
@@ -804,10 +806,16 @@ struct radeon_asic {
/*
 * Asic structures
 */
struct r100_gpu_lockup {
	unsigned long	last_jiffies;
	u32		last_cp_rptr;
};

struct r100_asic {
	const unsigned		*reg_safe_bm;
	unsigned		reg_safe_bm_size;
	u32			hdp_cntl;
	struct r100_gpu_lockup	lockup;
};

struct r300_asic {
@@ -815,6 +823,7 @@ struct r300_asic {
	unsigned		reg_safe_bm_size;
	u32			resync_scratch;
	u32			hdp_cntl;
	struct r100_gpu_lockup	lockup;
};

struct r600_asic {
@@ -834,6 +843,7 @@ struct r600_asic {
	unsigned		tiling_nbanks;
	unsigned		tiling_npipes;
	unsigned		tiling_group_size;
	struct r100_gpu_lockup	lockup;
};

struct rv770_asic {
@@ -857,6 +867,7 @@ struct rv770_asic {
	unsigned		tiling_nbanks;
	unsigned		tiling_npipes;
	unsigned		tiling_group_size;
	struct r100_gpu_lockup	lockup;
};

union radeon_asic_config {
@@ -1145,6 +1156,7 @@ static inline void radeon_ring_write(struct radeon_device *rdev, uint32_t v)
#define radeon_suspend(rdev) (rdev)->asic->suspend((rdev))
#define radeon_cs_parse(p) rdev->asic->cs_parse((p))
#define radeon_vga_set_state(rdev, state) (rdev)->asic->vga_set_state((rdev), (state))
#define radeon_gpu_is_lockup(rdev) (rdev)->asic->gpu_is_lockup((rdev))
#define radeon_gpu_reset(rdev) (rdev)->asic->gpu_reset((rdev))
#define radeon_gart_tlb_flush(rdev) (rdev)->asic->gart_tlb_flush((rdev))
#define radeon_gart_set_page(rdev, i, p) (rdev)->asic->gart_set_page((rdev), (i), (p))
@@ -1200,6 +1212,8 @@ extern int radeon_resume_kms(struct drm_device *dev);
extern int radeon_suspend_kms(struct drm_device *dev, pm_message_t state);

/* r100,rv100,rs100,rv200,rs200,r200,rv250,rs300,rv280 */
extern void r100_gpu_lockup_update(struct r100_gpu_lockup *lockup, struct radeon_cp *cp);
extern bool r100_gpu_cp_is_lockup(struct radeon_device *rdev, struct r100_gpu_lockup *lockup, struct radeon_cp *cp);

/* rv200,rv250,rv280 */
extern void r200_set_safe_registers(struct radeon_device *rdev);
Loading