Merge "msm: kgsl: Abort poll for HFI response if GMU fails any assertion" (75e814e0) · Commits · e / devices / android_kernel_fairphone_FP5

drivers/gpu/msm/adreno_a6xx_gmu.c

+35 −52

Original line number	Diff line number	Diff line
		@@ -1871,27 +1871,45 @@ static unsigned int a6xx_gmu_ifpc_show(struct kgsl_device *device)
		}

		/* Send an NMI to the GMU */
		static void a6xx_gmu_send_nmi(struct adreno_device *adreno_dev)
		static void a6xx_gmu_send_nmi(struct adreno_device *adreno_dev, bool force)
		{
		struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
		struct a6xx_gmu_device *gmu = to_a6xx_gmu(adreno_dev);
		u32 val;

		if (!a6xx_gmu_gx_is_on(device))
		goto done;

		/*
		* Do not send NMI if the SMMU is stalled because GMU will not be able
		* to save cm3 state to DDR.
		*/
		if (a6xx_is_smmu_stalled(device)) {
		struct a6xx_gmu_device *gmu = to_a6xx_gmu(adreno_dev);

		if (a6xx_gmu_gx_is_on(device) && a6xx_is_smmu_stalled(device)) {
		dev_err(&gmu->pdev->dev,
		"Skipping NMI because SMMU is stalled\n");
		return;
		}

		done:
		if (force)
		goto nmi;

		/*
		* We should not send NMI if there was a CM3 fault reported because we
		* don't want to overwrite the critical CM3 state captured by gmu before
		* it sent the CM3 fault interrupt. Also don't send NMI if GMU reset is
		* already active. We could have hit a GMU assert and NMI might have
		* already been triggered.
		*/

		/* make sure we're reading the latest cm3_fault */
		smp_rmb();

		if (atomic_read(&gmu->cm3_fault))
		return;

		gmu_core_regread(device, A6XX_GMU_CM3_FW_INIT_RESULT, &val);

		if (val & 0xE00)
		return;

		nmi:
		/* Mask so there's no interrupt caused by NMI */
		gmu_core_regwrite(device, A6XX_GMU_GMU2HOST_INTR_MASK, 0xFFFFFFFF);

		@@ -1908,6 +1926,9 @@ static void a6xx_gmu_send_nmi(struct adreno_device *adreno_dev)

		/* Make sure the NMI is invoked before we proceed*/
		wmb();

		/* Wait for the NMI to be handled */
		udelay(200);
		}

		static void a6xx_gmu_cooperative_reset(struct kgsl_device *device)
		@@ -1934,8 +1955,8 @@ static void a6xx_gmu_cooperative_reset(struct kgsl_device *device)
		* If we don't get a snapshot ready from GMU, trigger NMI
		* and if we still timeout then we just continue with reset.
		*/
		a6xx_gmu_send_nmi(adreno_dev);
		udelay(200);
		a6xx_gmu_send_nmi(adreno_dev, true);

		gmu_core_regread(device, A6XX_GMU_CM3_FW_INIT_RESULT, &result);
		if ((result & 0x800) != 0x800)
		dev_err(&gmu->pdev->dev,
		@@ -1997,22 +2018,7 @@ static irqreturn_t a6xx_gmu_irq_handler(int irq, void *data)
		gmu_core_regwrite(device, A6XX_GMU_AO_HOST_INTERRUPT_MASK,
		(mask | GMU_INT_WDOG_BITE));

		/* make sure we're reading the latest cm3_fault */
		smp_rmb();

		/*
		* We should not send NMI if there was a CM3 fault reported
		* because we don't want to overwrite the critical CM3 state
		* captured by gmu before it sent the CM3 fault interrupt.
		*/
		if (!atomic_read(&gmu->cm3_fault))
		a6xx_gmu_send_nmi(adreno_dev);

		/*
		* There is sufficient delay for the GMU to have finished
		* handling the NMI before snapshot is taken, as the fault
		* worker is scheduled below.
		*/
		a6xx_gmu_send_nmi(adreno_dev, false);

		dev_err_ratelimited(&gmu->pdev->dev,
		"GMU watchdog expired interrupt received\n");
		@@ -2042,37 +2048,14 @@ static irqreturn_t a6xx_gmu_irq_handler(int irq, void *data)
		return IRQ_HANDLED;
		}

		static void a6xx_gmu_nmi(struct adreno_device *adreno_dev)
		{
		struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
		struct a6xx_gmu_device *gmu = to_a6xx_gmu(adreno_dev);

		/* No need to nmi if it was a gpu fault */
		if (!device->gmu_fault)
		return;

		/* make sure we're reading the latest cm3_fault */
		smp_rmb();

		/*
		* We should not send NMI if there was a CM3 fault reported because we
		* don't want to overwrite the critical CM3 state captured by gmu before
		* it sent the CM3 fault interrupt.
		*/
		if (!atomic_read(&gmu->cm3_fault)) {
		a6xx_gmu_send_nmi(adreno_dev);

		/* Wait for the NMI to be handled */
		udelay(100);
		}
		}

		void a6xx_gmu_snapshot(struct adreno_device *adreno_dev,
		struct kgsl_snapshot *snapshot)
		{
		struct kgsl_device *device = KGSL_DEVICE(adreno_dev);

		a6xx_gmu_nmi(adreno_dev);
		/* No need to nmi if it was a gpu fault */
		if (device->gmu_fault)
		a6xx_gmu_send_nmi(adreno_dev, false);

		a6xx_gmu_device_snapshot(device, snapshot);

drivers/gpu/msm/adreno_a6xx_hfi.c

+14 −1

Original line number	Diff line number	Diff line
		// SPDX-License-Identifier: GPL-2.0-only
		/*
		* Copyright (c) 2018-2020, The Linux Foundation. All rights reserved.
		* Copyright (c) 2018-2021, The Linux Foundation. All rights reserved.
		*/

		#include <linux/delay.h>
		@@ -268,6 +268,19 @@ static int poll_gmu_reg(struct adreno_device *adreno_dev,
		gmu_core_regread(device, offsetdwords, &val);
		if ((val & mask) == expected_val)
		return 0;

		/*
		* If GMU firmware fails any assertion, error message is sent
		* to KMD and NMI is triggered. So check if GMU is in NMI and
		* timeout early. Bits [11:9] of A6XX_GMU_CM3_FW_INIT_RESULT
		* contain GMU reset status. Non zero value here indicates that
		* GMU reset is active, NMI handler would eventually complete
		* and GMU would wait for recovery.
		*/
		gmu_core_regread(device, A6XX_GMU_CM3_FW_INIT_RESULT, &val);
		if (val & 0xE00)
		return -ETIMEDOUT;

		usleep_range(10, 100);
		}