drm/amdgpu: flip frag_ptes and update_pts (92696dd5) · Commits · e / devices / android_kernel_oneplus_sm7250

drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

+79 −87

Original line number	Diff line number	Diff line
		@@ -701,83 +701,6 @@ int amdgpu_vm_update_page_directory(struct amdgpu_device *adev,
		return r;
		}

		/**
		* amdgpu_vm_frag_ptes - add fragment information to PTEs
		*
		* @params: see amdgpu_pte_update_params definition
		* @pe_start: first PTE to handle
		* @pe_end: last PTE to handle
		* @addr: addr those PTEs should point to
		* @flags: hw mapping flags
		*/
		static void amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
		uint64_t pe_start, uint64_t pe_end,
		uint64_t addr, uint32_t flags)
		{
		/**
		* The MC L1 TLB supports variable sized pages, based on a fragment
		* field in the PTE. When this field is set to a non-zero value, page
		* granularity is increased from 4KB to (1 << (12 + frag)). The PTE
		* flags are considered valid for all PTEs within the fragment range
		* and corresponding mappings are assumed to be physically contiguous.
		*
		* The L1 TLB can store a single PTE for the whole fragment,
		* significantly increasing the space available for translation
		* caching. This leads to large improvements in throughput when the
		* TLB is under pressure.
		*
		* The L2 TLB distributes small and large fragments into two
		* asymmetric partitions. The large fragment cache is significantly
		* larger. Thus, we try to use large fragments wherever possible.
		* Userspace can support this by aligning virtual base address and
		* allocation size to the fragment size.
		*/

		/* SI and newer are optimized for 64KB */
		uint64_t frag_flags = AMDGPU_PTE_FRAG(AMDGPU_LOG2_PAGES_PER_FRAG);
		uint64_t frag_align = 0x80;

		uint64_t frag_start = ALIGN(pe_start, frag_align);
		uint64_t frag_end = pe_end & ~(frag_align - 1);

		unsigned count;

		/* Abort early if there isn't anything to do */
		if (pe_start == pe_end)
		return;

		/* system pages are non continuously */
		if (params->src \|\| params->pages_addr \|\|
		!(flags & AMDGPU_PTE_VALID) \|\| (frag_start >= frag_end)) {

		count = (pe_end - pe_start) / 8;
		amdgpu_vm_update_pages(params, pe_start, addr, count,
		AMDGPU_GPU_PAGE_SIZE, flags);
		return;
		}

		/* handle the 4K area at the beginning */
		if (pe_start != frag_start) {
		count = (frag_start - pe_start) / 8;
		amdgpu_vm_update_pages(params, pe_start, addr, count,
		AMDGPU_GPU_PAGE_SIZE, flags);
		addr += AMDGPU_GPU_PAGE_SIZE * count;
		}

		/* handle the area in the middle */
		count = (frag_end - frag_start) / 8;
		amdgpu_vm_update_pages(params, frag_start, addr, count,
		AMDGPU_GPU_PAGE_SIZE, flags \| frag_flags);

		/* handle the 4K area at the end */
		if (frag_end != pe_end) {
		addr += AMDGPU_GPU_PAGE_SIZE * count;
		count = (pe_end - frag_end) / 8;
		amdgpu_vm_update_pages(params, frag_end, addr, count,
		AMDGPU_GPU_PAGE_SIZE, flags);
		}
		}

		/**
		* amdgpu_vm_update_ptes - make sure that page tables are valid
		*
		@@ -797,7 +720,7 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
		{
		const uint64_t mask = AMDGPU_VM_PTE_COUNT - 1;

		uint64_t cur_pe_start, cur_pe_end, cur_dst;
		uint64_t cur_pe_start, cur_nptes, cur_dst;
		uint64_t addr; /* next GPU address to be updated */
		uint64_t pt_idx;
		struct amdgpu_bo *pt;
		@@ -816,7 +739,7 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,

		cur_pe_start = amdgpu_bo_gpu_offset(pt);
		cur_pe_start += (addr & mask) * 8;
		cur_pe_end = cur_pe_start + 8 * nptes;
		cur_nptes = nptes;
		cur_dst = dst;

		/* for next ptb*/
		@@ -836,18 +759,19 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
		next_pe_start = amdgpu_bo_gpu_offset(pt);
		next_pe_start += (addr & mask) * 8;

		if (cur_pe_end == next_pe_start) {
		if ((cur_pe_start + 8 * cur_nptes) == next_pe_start) {
		/* The next ptb is consecutive to current ptb.
		* Don't call amdgpu_vm_frag_ptes now.
		* Don't call amdgpu_vm_update_pages now.
		* Will update two ptbs together in future.
		*/
		cur_pe_end += 8 * nptes;
		cur_nptes += nptes;
		} else {
		amdgpu_vm_frag_ptes(params, cur_pe_start, cur_pe_end,
		cur_dst, flags);
		amdgpu_vm_update_pages(params, cur_pe_start, cur_dst,
		cur_nptes, AMDGPU_GPU_PAGE_SIZE,
		flags);

		cur_pe_start = next_pe_start;
		cur_pe_end = next_pe_start + 8 * nptes;
		cur_nptes = nptes;
		cur_dst = dst;
		}

		@@ -856,7 +780,75 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
		dst += nptes * AMDGPU_GPU_PAGE_SIZE;
		}

		amdgpu_vm_frag_ptes(params, cur_pe_start, cur_pe_end, cur_dst, flags);
		amdgpu_vm_update_pages(params, cur_pe_start, cur_dst, cur_nptes,
		AMDGPU_GPU_PAGE_SIZE, flags);
		}

		/*
		* amdgpu_vm_frag_ptes - add fragment information to PTEs
		*
		* @params: see amdgpu_pte_update_params definition
		* @vm: requested vm
		* @start: first PTE to handle
		* @end: last PTE to handle
		* @dst: addr those PTEs should point to
		* @flags: hw mapping flags
		*/
		static void amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
		struct amdgpu_vm *vm,
		uint64_t start, uint64_t end,
		uint64_t dst, uint32_t flags)
		{
		/**
		* The MC L1 TLB supports variable sized pages, based on a fragment
		* field in the PTE. When this field is set to a non-zero value, page
		* granularity is increased from 4KB to (1 << (12 + frag)). The PTE
		* flags are considered valid for all PTEs within the fragment range
		* and corresponding mappings are assumed to be physically contiguous.
		*
		* The L1 TLB can store a single PTE for the whole fragment,
		* significantly increasing the space available for translation
		* caching. This leads to large improvements in throughput when the
		* TLB is under pressure.
		*
		* The L2 TLB distributes small and large fragments into two
		* asymmetric partitions. The large fragment cache is significantly
		* larger. Thus, we try to use large fragments wherever possible.
		* Userspace can support this by aligning virtual base address and
		* allocation size to the fragment size.
		*/

		/* SI and newer are optimized for 64KB */
		uint64_t frag_flags = AMDGPU_PTE_FRAG(AMDGPU_LOG2_PAGES_PER_FRAG);
		uint64_t frag_align = 1 << AMDGPU_LOG2_PAGES_PER_FRAG;

		uint64_t frag_start = ALIGN(start, frag_align);
		uint64_t frag_end = end & ~(frag_align - 1);

		/* system pages are non continuously */
		if (params->src \|\| params->pages_addr \|\| !(flags & AMDGPU_PTE_VALID) \|\|
		(frag_start >= frag_end)) {

		amdgpu_vm_update_ptes(params, vm, start, end, dst, flags);
		return;
		}

		/* handle the 4K area at the beginning */
		if (start != frag_start) {
		amdgpu_vm_update_ptes(params, vm, start, frag_start,
		dst, flags);
		dst += (frag_start - start) * AMDGPU_GPU_PAGE_SIZE;
		}

		/* handle the area in the middle */
		amdgpu_vm_update_ptes(params, vm, frag_start, frag_end, dst,
		flags \| frag_flags);

		/* handle the 4K area at the end */
		if (frag_end != end) {
		dst += (frag_end - frag_start) * AMDGPU_GPU_PAGE_SIZE;
		amdgpu_vm_update_ptes(params, vm, frag_end, end, dst, flags);
		}
		}

		/**
		@@ -953,7 +945,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
		if (r)
		goto error_free;

		amdgpu_vm_update_ptes(&params, vm, start, last + 1, addr, flags);
		amdgpu_vm_frag_ptes(&params, vm, start, last + 1, addr, flags);

		amdgpu_ring_pad_ib(ring, params.ib);
		WARN_ON(params.ib->length_dw > ndw);