Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6e38fdd2 authored by Tarun Karra's avatar Tarun Karra Committed by Jordan Crouse
Browse files

msm: kgsl: Enhance GFT to avoid hang->recover->hang cycle



If GFT has recovered a context more than X times in Y ms, invalidate the
context and do not attempt recovery. This prevents the case where GFT keeps
recovering continuously.

Change-Id: I9326a4cf0c8d1b6b92497ae5cd086e4b8bb7e89f
Signed-off-by: default avatarTarun Karra <tkarra@codeaurora.org>
parent 360013b1
Loading
Loading
Loading
Loading
+5 −2
Original line number Diff line number Diff line
@@ -543,7 +543,9 @@ struct log_field {
#define  KGSL_FT_SKIPFRAME                3
#define  KGSL_FT_DISABLE                  4
#define  KGSL_FT_TEMP_DISABLE             5
#define  KGSL_FT_DEFAULT_POLICY (BIT(KGSL_FT_REPLAY) + BIT(KGSL_FT_SKIPIB))
#define  KGSL_FT_THROTTLE                 6
#define  KGSL_FT_DEFAULT_POLICY (BIT(KGSL_FT_REPLAY) + BIT(KGSL_FT_SKIPIB) \
				+ BIT(KGSL_FT_THROTTLE))

/* This internal bit is used to skip the PM dump on replayed command batches */
#define  KGSL_FT_SKIP_PMDUMP              31
@@ -561,7 +563,8 @@ struct log_field {
	{ BIT(KGSL_FT_SKIPIB), "skipib" }, \
	{ BIT(KGSL_FT_SKIPFRAME), "skipframe" }, \
	{ BIT(KGSL_FT_DISABLE), "disable" }, \
	{ BIT(KGSL_FT_TEMP_DISABLE), "temp" }
	{ BIT(KGSL_FT_TEMP_DISABLE), "temp" }, \
	{ BIT(KGSL_FT_THROTTLE), "throttle"}

extern struct adreno_gpudev adreno_a3xx_gpudev;
extern struct adreno_gpudev adreno_a4xx_gpudev;
+44 −0
Original line number Diff line number Diff line
@@ -33,6 +33,15 @@ static unsigned int _context_queue_wait = 10000;
/* Number of command batches sent at a time from a single context */
static unsigned int _context_cmdbatch_burst = 5;

/*
 * GFT throttle parameters. If GFT recovers the same context more than
 * X times within Y ms, invalidate the context and do not attempt recovery.
 * X -> _fault_throttle_burst
 * Y -> _fault_throttle_time (in milliseconds)
 */
static unsigned int _fault_throttle_time = 3000;
static unsigned int _fault_throttle_burst = 3;

/* Number of command batches inflight in the ringbuffer at any time */
static unsigned int _dispatcher_inflight = 15;

@@ -1054,6 +1063,35 @@ static int dispatcher_do_fault(struct kgsl_device *device)

	cmdbatch = replay[0];

	/*
	 * If GFT recovered more than X times in Y ms, invalidate the context
	 * and do not attempt recovery.
	 * Example: X==3 and Y==3000 ms, GPU hung at 500ms, 1700ms, 2500ms and
	 * 3000ms for the same context. We will not try FT and will invalidate
	 * the context at 3000ms because the context triggered GFT more than 3
	 * times in the last 3 seconds. If a context causes recoverable GPU
	 * hangs where the 1st and 4th GPU hang are more than 3 seconds apart,
	 * we won't disable GFT and invalidate the context.
	 */
	if (test_bit(KGSL_FT_THROTTLE, &cmdbatch->fault_policy)) {
		if (time_after(jiffies, (cmdbatch->context->fault_time
				+ msecs_to_jiffies(_fault_throttle_time)))) {
			cmdbatch->context->fault_time = jiffies;
			cmdbatch->context->fault_count = 1;
		} else {
			cmdbatch->context->fault_count++;
			if (cmdbatch->context->fault_count >
					_fault_throttle_burst) {
				set_bit(KGSL_FT_DISABLE,
						&cmdbatch->fault_policy);
				pr_fault(device, cmdbatch,
					 "gpu fault threshold exceeded %d faults in %d msecs\n",
					 _fault_throttle_burst,
					 _fault_throttle_time);
			}
		}
	}

	/*
	 * If FT is disabled for this cmdbatch invalidate immediately
	 */
@@ -1664,6 +1702,10 @@ static DISPATCHER_UINT_ATTR(cmdbatch_timeout, 0644, 0, _cmdbatch_timeout);
static DISPATCHER_UINT_ATTR(context_queue_wait, 0644, 0, _context_queue_wait);
static DISPATCHER_UINT_ATTR(fault_detect_interval, 0644, 0,
	_fault_timer_interval);
static DISPATCHER_UINT_ATTR(fault_throttle_time, 0644, 0,
	_fault_throttle_time);
static DISPATCHER_UINT_ATTR(fault_throttle_burst, 0644, 0,
	_fault_throttle_burst);

static struct attribute *dispatcher_attrs[] = {
	&dispatcher_attr_inflight.attr,
@@ -1672,6 +1714,8 @@ static struct attribute *dispatcher_attrs[] = {
	&dispatcher_attr_cmdbatch_timeout.attr,
	&dispatcher_attr_context_queue_wait.attr,
	&dispatcher_attr_fault_detect_interval.attr,
	&dispatcher_attr_fault_throttle_time.attr,
	&dispatcher_attr_fault_throttle_burst.attr,
	NULL,
};

+4 −0
Original line number Diff line number Diff line
@@ -406,6 +406,8 @@ struct kgsl_process_private;
 * is set.
 * @flags: flags from userspace controlling the behavior of this context
 * @pwr_constraint: power constraint from userspace for this context
 * @fault_count: number of times the GPU hung in the last _fault_throttle_time ms
 * @fault_time: time (in jiffies) of the first GPU hang in the last
 * _fault_throttle_time ms
 */
struct kgsl_context {
	struct kref refcount;
@@ -423,6 +425,8 @@ struct kgsl_context {
	unsigned int pagefault_ts;
	unsigned int flags;
	struct kgsl_pwr_constraint pwr_constraint;
	unsigned int fault_count;
	unsigned long fault_time;
};

/**