drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues (41cca166) · Commits · e / devices / android_kernel_fairphone_FP5

drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

+2 −1

Original line number	Diff line number	Diff line
		@@ -72,9 +72,10 @@
		* - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE.
		* - 3.27.0 - Add new chunk to AMDGPU_CS to enable BO_LIST creation.
		* - 3.28.0 - Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES
		* - 3.29.0 - Add AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID
		*/
		#define KMS_DRIVER_MAJOR 3
		#define KMS_DRIVER_MINOR 28
		#define KMS_DRIVER_MINOR 29
		#define KMS_DRIVER_PATCHLEVEL 0

		int amdgpu_vram_limit = 0;

drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h

+2 −0

Original line number	Diff line number	Diff line
		@@ -37,6 +37,8 @@ struct amdgpu_gds {
		struct amdgpu_gds_asic_info mem;
		struct amdgpu_gds_asic_info gws;
		struct amdgpu_gds_asic_info oa;
		uint32_t gds_compute_max_wave_id;

		/* At present, GDS, GWS and OA resources for gfx (graphics)
		* are always pre-allocated and available for graphics operation.
		* Such resources are shared between all gfx clients.

drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c

+18 −1

Original line number	Diff line number	Diff line
		@@ -2264,6 +2264,22 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
		unsigned vmid = AMDGPU_JOB_GET_VMID(job);
		u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);

		/* Currently, there is a high possibility to get wave ID mismatch
		* between ME and GDS, leading to a hw deadlock, because ME generates
		* different wave IDs than the GDS expects. This situation happens
		* randomly when at least 5 compute pipes use GDS ordered append.
		* The wave IDs generated by ME are also wrong after suspend/resume.
		* Those are probably bugs somewhere else in the kernel driver.
		*
		* Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
		* GDS to 0 for this ring (me/pipe).
		*/
		if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
		}

		amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
		amdgpu_ring_write(ring,
		#ifdef __BIG_ENDIAN
		@@ -5000,7 +5016,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
		7 + /* gfx_v7_0_ring_emit_pipeline_sync */
		CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */
		7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */
		.emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_compute */
		.emit_ib_size = 7, /* gfx_v7_0_ring_emit_ib_compute */
		.emit_ib = gfx_v7_0_ring_emit_ib_compute,
		.emit_fence = gfx_v7_0_ring_emit_fence_compute,
		.emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
		@@ -5057,6 +5073,7 @@ static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
		adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
		adev->gds.gws.total_size = 64;
		adev->gds.oa.total_size = 16;
		adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);

		if (adev->gds.mem.total_size == 64 * 1024) {
		adev->gds.mem.gfx_partition_size = 4096;

drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c

+19 −2

Original line number	Diff line number	Diff line
		@@ -6084,6 +6084,22 @@ static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
		unsigned vmid = AMDGPU_JOB_GET_VMID(job);
		u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);

		/* Currently, there is a high possibility to get wave ID mismatch
		* between ME and GDS, leading to a hw deadlock, because ME generates
		* different wave IDs than the GDS expects. This situation happens
		* randomly when at least 5 compute pipes use GDS ordered append.
		* The wave IDs generated by ME are also wrong after suspend/resume.
		* Those are probably bugs somewhere else in the kernel driver.
		*
		* Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
		* GDS to 0 for this ring (me/pipe).
		*/
		if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
		}

		amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
		amdgpu_ring_write(ring,
		#ifdef __BIG_ENDIAN
		@@ -6890,7 +6906,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
		VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
		7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
		.emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */
		.emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
		.emit_ib = gfx_v8_0_ring_emit_ib_compute,
		.emit_fence = gfx_v8_0_ring_emit_fence_compute,
		.emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
		@@ -6920,7 +6936,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
		17 + /* gfx_v8_0_ring_emit_vm_flush */
		7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */
		.emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */
		.emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
		.emit_fence = gfx_v8_0_ring_emit_fence_kiq,
		.test_ring = gfx_v8_0_ring_test_ring,
		.insert_nop = amdgpu_ring_insert_nop,
		@@ -6996,6 +7012,7 @@ static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
		adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
		adev->gds.gws.total_size = 64;
		adev->gds.oa.total_size = 16;
		adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);

		if (adev->gds.mem.total_size == 64 * 1024) {
		adev->gds.mem.gfx_partition_size = 4096;

drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

+38 −2

Original line number	Diff line number	Diff line
		@@ -4010,6 +4010,22 @@ static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
		unsigned vmid = AMDGPU_JOB_GET_VMID(job);
		u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);

		/* Currently, there is a high possibility to get wave ID mismatch
		* between ME and GDS, leading to a hw deadlock, because ME generates
		* different wave IDs than the GDS expects. This situation happens
		* randomly when at least 5 compute pipes use GDS ordered append.
		* The wave IDs generated by ME are also wrong after suspend/resume.
		* Those are probably bugs somewhere else in the kernel driver.
		*
		* Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
		* GDS to 0 for this ring (me/pipe).
		*/
		if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID);
		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
		}

		amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
		BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
		amdgpu_ring_write(ring,
		@@ -4729,7 +4745,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
		2 + /* gfx_v9_0_ring_emit_vm_flush */
		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
		.emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */
		.emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
		.emit_ib = gfx_v9_0_ring_emit_ib_compute,
		.emit_fence = gfx_v9_0_ring_emit_fence,
		.emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
		@@ -4764,7 +4780,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
		2 + /* gfx_v9_0_ring_emit_vm_flush */
		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
		.emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */
		.emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
		.emit_fence = gfx_v9_0_ring_emit_fence_kiq,
		.test_ring = gfx_v9_0_ring_test_ring,
		.insert_nop = amdgpu_ring_insert_nop,
		@@ -4846,6 +4862,26 @@ static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev)
		break;
		}

		switch (adev->asic_type) {
		case CHIP_VEGA10:
		case CHIP_VEGA20:
		adev->gds.gds_compute_max_wave_id = 0x7ff;
		break;
		case CHIP_VEGA12:
		adev->gds.gds_compute_max_wave_id = 0x27f;
		break;
		case CHIP_RAVEN:
		if (adev->rev_id >= 0x8)
		adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */
		else
		adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */
		break;
		default:
		/* this really depends on the chip */
		adev->gds.gds_compute_max_wave_id = 0x7ff;
		break;
		}

		adev->gds.gws.total_size = 64;
		adev->gds.oa.total_size = 16;