path: root/meta-amd-bsp/recipes-kernel/linux-4.19/linux-yocto-4.19.8/0508-Revert-drm-amdgpu-meld-together-VM-fragment-and-huge.patch
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux-4.19/linux-yocto-4.19.8/0508-Revert-drm-amdgpu-meld-together-VM-fragment-and-huge.patch')
-rw-r--r--  meta-amd-bsp/recipes-kernel/linux-4.19/linux-yocto-4.19.8/0508-Revert-drm-amdgpu-meld-together-VM-fragment-and-huge.patch | 341
1 file changed, 341 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux-4.19/linux-yocto-4.19.8/0508-Revert-drm-amdgpu-meld-together-VM-fragment-and-huge.patch b/meta-amd-bsp/recipes-kernel/linux-4.19/linux-yocto-4.19.8/0508-Revert-drm-amdgpu-meld-together-VM-fragment-and-huge.patch
new file mode 100644
index 00000000..7188b178
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux-4.19/linux-yocto-4.19.8/0508-Revert-drm-amdgpu-meld-together-VM-fragment-and-huge.patch
@@ -0,0 +1,341 @@
+From 9cd17f014fe62dec34b4f06ae6f51efda5177bac Mon Sep 17 00:00:00 2001
+From: Prike Liang <Prike.Liang@amd.com>
+Date: Thu, 11 Oct 2018 15:38:37 +0800
+Subject: [PATCH 0508/2940] Revert "drm/amdgpu: meld together VM fragment and
+ huge page handling"
+
+This reverts commit 6ef5680cb4eefd8d92d9ea0dbafc83953ca8c968.
+
+Change-Id: If43203fec093d05fb998cca557676a83553a9a52
+Signed-off-by: Prike Liang <Prike.Liang@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 267 +++++++++++--------------
+ 1 file changed, 120 insertions(+), 147 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+index 635bd17cd709..d945db767fa0 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+@@ -1483,76 +1483,46 @@ int amdgpu_vm_update_directories(struct amdgpu_device *adev,
+ }
+
+ /**
+- * amdgpu_vm_update_huge - figure out parameters for PTE updates
++ * amdgpu_vm_handle_huge_pages - handle updating the PD with huge pages
+ *
+- * Make sure to set the right flags for the PTEs at the desired level.
++ * @p: see amdgpu_pte_update_params definition
++ * @entry: vm_pt entry to check
++ * @parent: parent entry
++ * @nptes: number of PTEs updated with this operation
++ * @dst: destination address where the PTEs should point to
++ * @flags: access flags for the PTEs
++ *
++ * Check if we can update the PD with a huge page.
+ */
+-static void amdgpu_vm_update_huge(struct amdgpu_pte_update_params *params,
+- struct amdgpu_bo *bo, unsigned level,
+- uint64_t pe, uint64_t addr,
+- unsigned count, uint32_t incr,
+- uint64_t flags)
+-
++static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
++ struct amdgpu_vm_pt *entry,
++ struct amdgpu_vm_pt *parent,
++ unsigned nptes, uint64_t dst,
++ uint64_t flags)
+ {
+- if (level != AMDGPU_VM_PTB) {
++ uint64_t pde;
++
++ /* In the case of a mixed PT the PDE must point to it*/
++ if (p->adev->asic_type >= CHIP_VEGA10 && !p->src &&
++ nptes == AMDGPU_VM_PTE_COUNT(p->adev)) {
++ /* Set the huge page flag to stop scanning at this PDE */
+ flags |= AMDGPU_PDE_PTE;
+- amdgpu_gmc_get_vm_pde(params->adev, level, &addr, &flags);
+ }
+
+- amdgpu_vm_update_func(params, bo, pe, addr, count, incr, flags);
+-}
+-
+-/**
+- * amdgpu_vm_fragment - get fragment for PTEs
+- *
+- * @params: see amdgpu_pte_update_params definition
+- * @start: first PTE to handle
+- * @end: last PTE to handle
+- * @flags: hw mapping flags
+- * @frag: resulting fragment size
+- * @frag_end: end of this fragment
+- *
+- * Returns the first possible fragment for the start and end address.
+- */
+-static void amdgpu_vm_fragment(struct amdgpu_pte_update_params *params,
+- uint64_t start, uint64_t end, uint64_t flags,
+- unsigned int *frag, uint64_t *frag_end)
+-{
+- /**
+- * The MC L1 TLB supports variable sized pages, based on a fragment
+- * field in the PTE. When this field is set to a non-zero value, page
+- * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
+- * flags are considered valid for all PTEs within the fragment range
+- * and corresponding mappings are assumed to be physically contiguous.
+- *
+- * The L1 TLB can store a single PTE for the whole fragment,
+- * significantly increasing the space available for translation
+- * caching. This leads to large improvements in throughput when the
+- * TLB is under pressure.
+- *
+- * The L2 TLB distributes small and large fragments into two
+- * asymmetric partitions. The large fragment cache is significantly
+- * larger. Thus, we try to use large fragments wherever possible.
+- * Userspace can support this by aligning virtual base address and
+- * allocation size to the fragment size.
+- */
+- unsigned max_frag = params->adev->vm_manager.fragment_size;
+-
+- /* system pages are non continuously */
+- if (params->src || !(flags & AMDGPU_PTE_VALID)) {
+- *frag = 0;
+- *frag_end = end;
++ if (!(flags & AMDGPU_PDE_PTE)) {
++ if (entry->huge) {
++ /* Add the entry to the relocated list to update it. */
++ entry->huge = false;
++ amdgpu_vm_bo_relocated(&entry->base);
++ }
+ return;
+ }
+
+- /* This intentionally wraps around if no bit is set */
+- *frag = min((unsigned)ffs(start) - 1, (unsigned)fls64(end - start) - 1);
+- if (*frag >= max_frag) {
+- *frag = max_frag;
+- *frag_end = end & ~((1ULL << max_frag) - 1);
+- } else {
+- *frag_end = start + (1 << *frag);
+- }
++ entry->huge = true;
++ amdgpu_gmc_get_vm_pde(p->adev, AMDGPU_VM_PDB0, &dst, &flags);
++
++ pde = (entry - parent->entries) * 8;
++ amdgpu_vm_update_func(p, parent->base.bo, pde, dst, 1, 0, flags);
+ }
+
+ /**
+@@ -1570,105 +1540,108 @@ static void amdgpu_vm_fragment(struct amdgpu_pte_update_params *params,
+ * 0 for success, -EINVAL for failure.
+ */
+ static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
+- uint64_t start, uint64_t end,
+- uint64_t dst, uint64_t flags)
++ uint64_t start, uint64_t end,
++ uint64_t dst, uint64_t flags)
+ {
+ struct amdgpu_device *adev = params->adev;
++ const uint64_t mask = AMDGPU_VM_PTE_COUNT(adev) - 1;
+ struct amdgpu_vm_pt_cursor cursor;
+- uint64_t frag_start = start, frag_end;
+- unsigned int frag;
+
+- /* figure out the initial fragment */
+- amdgpu_vm_fragment(params, frag_start, end, flags, &frag, &frag_end);
+-
+- /* walk over the address space and update the PTs */
+- amdgpu_vm_pt_start(adev, params->vm, start, &cursor);
+- while (cursor.pfn < end) {
++ /* walk over the address space and update the page tables */
++ for_each_amdgpu_vm_pt_leaf(adev, params->vm, start, end - 1, cursor) {
+ struct amdgpu_bo *pt = cursor.entry->base.bo;
+- unsigned shift, parent_shift, num_entries;
+- uint64_t incr, entry_end, pe_start;
++ uint64_t pe_start;
++ unsigned nptes;
+
+- if (!pt)
++ if (!pt || cursor.level != AMDGPU_VM_PTB)
+ return -ENOENT;
+
+- /* The root level can't be a huge page */
+- if (cursor.level == adev->vm_manager.root_level) {
+- if (!amdgpu_vm_pt_descendant(adev, &cursor))
+- return -ENOENT;
+- continue;
+- }
++ if ((cursor.pfn & ~mask) == (end & ~mask))
++ nptes = end - cursor.pfn;
++ else
++ nptes = AMDGPU_VM_PTE_COUNT(adev) - (cursor.pfn & mask);
+
+- /* First check if the entry is already handled */
+- if (cursor.pfn < frag_start) {
+- cursor.entry->huge = true;
+- amdgpu_vm_pt_next(adev, &cursor);
++ amdgpu_vm_handle_huge_pages(params, cursor.entry, cursor.parent,
++ nptes, dst, flags);
++ /* We don't need to update PTEs for huge pages */
++ if (cursor.entry->huge) {
++ dst += nptes * AMDGPU_GPU_PAGE_SIZE;
+ continue;
+ }
+
+- /* If it isn't already handled it can't be a huge page */
+- if (cursor.entry->huge) {
+- /* Add the entry to the relocated list to update it. */
+- cursor.entry->huge = false;
+- amdgpu_vm_bo_relocated(&cursor.entry->base);
+- }
++ pe_start = (cursor.pfn & mask) * 8;
++ amdgpu_vm_update_func(params, pt, pe_start, dst, nptes,
++ AMDGPU_GPU_PAGE_SIZE, flags);
++ dst += nptes * AMDGPU_GPU_PAGE_SIZE;
++ }
+
+- shift = amdgpu_vm_level_shift(adev, cursor.level);
+- parent_shift = amdgpu_vm_level_shift(adev, cursor.level - 1);
+- if (adev->asic_type < CHIP_VEGA10) {
+- /* No huge page support before GMC v9 */
+- if (cursor.level != AMDGPU_VM_PTB) {
+- if (!amdgpu_vm_pt_descendant(adev, &cursor))
+- return -ENOENT;
+- continue;
+- }
+- } else if (frag < shift) {
+- /* We can't use this level when the fragment size is
+- * smaller than the address shift. Go to the next
+- * child entry and try again.
+- */
+- if (!amdgpu_vm_pt_descendant(adev, &cursor))
+- return -ENOENT;
+- continue;
+- } else if (frag >= parent_shift) {
+- /* If the fragment size is even larger than the parent
+- * shift we should go up one level and check it again.
+- */
+- if (!amdgpu_vm_pt_ancestor(&cursor))
+- return -ENOENT;
+- continue;
++ return 0;
++}
++
++/*
++ * amdgpu_vm_frag_ptes - add fragment information to PTEs
++ *
++ * @params: see amdgpu_pte_update_params definition
++ * @vm: requested vm
++ * @start: first PTE to handle
++ * @end: last PTE to handle
++ * @dst: addr those PTEs should point to
++ * @flags: hw mapping flags
++ *
++ * Returns:
++ * 0 for success, -EINVAL for failure.
++ */
++static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
++ uint64_t start, uint64_t end,
++ uint64_t dst, uint64_t flags)
++{
++ /**
++ * The MC L1 TLB supports variable sized pages, based on a fragment
++ * field in the PTE. When this field is set to a non-zero value, page
++ * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
++ * flags are considered valid for all PTEs within the fragment range
++ * and corresponding mappings are assumed to be physically contiguous.
++ *
++ * The L1 TLB can store a single PTE for the whole fragment,
++ * significantly increasing the space available for translation
++ * caching. This leads to large improvements in throughput when the
++ * TLB is under pressure.
++ *
++ * The L2 TLB distributes small and large fragments into two
++ * asymmetric partitions. The large fragment cache is significantly
++ * larger. Thus, we try to use large fragments wherever possible.
++ * Userspace can support this by aligning virtual base address and
++ * allocation size to the fragment size.
++ */
++ unsigned max_frag = params->adev->vm_manager.fragment_size;
++ int r;
++
++	/* system pages are non-contiguous */
++ if (params->src || !(flags & AMDGPU_PTE_VALID))
++ return amdgpu_vm_update_ptes(params, start, end, dst, flags);
++
++ while (start != end) {
++ uint64_t frag_flags, frag_end;
++ unsigned frag;
++
++ /* This intentionally wraps around if no bit is set */
++ frag = min((unsigned)ffs(start) - 1,
++ (unsigned)fls64(end - start) - 1);
++ if (frag >= max_frag) {
++ frag_flags = AMDGPU_PTE_FRAG(max_frag);
++ frag_end = end & ~((1ULL << max_frag) - 1);
++ } else {
++ frag_flags = AMDGPU_PTE_FRAG(frag);
++ frag_end = start + (1 << frag);
+ }
+
+- /* Looks good so far, calculate parameters for the update */
+- incr = AMDGPU_GPU_PAGE_SIZE << shift;
+- num_entries = amdgpu_vm_num_entries(adev, cursor.level);
+- pe_start = ((cursor.pfn >> shift) & (num_entries - 1)) * 8;
+- entry_end = num_entries << shift;
+- entry_end += cursor.pfn & ~(entry_end - 1);
+- entry_end = min(entry_end, end);
+-
+- do {
+- uint64_t upd_end = min(entry_end, frag_end);
+- unsigned nptes = (upd_end - frag_start) >> shift;
+-
+- amdgpu_vm_update_huge(params, pt, cursor.level,
+- pe_start, dst, nptes, incr,
+- flags | AMDGPU_PTE_FRAG(frag));
+-
+- pe_start += nptes * 8;
+- dst += nptes * AMDGPU_GPU_PAGE_SIZE << shift;
+-
+- frag_start = upd_end;
+- if (frag_start >= frag_end) {
+- /* figure out the next fragment */
+- amdgpu_vm_fragment(params, frag_start, end,
+- flags, &frag, &frag_end);
+- if (frag < shift)
+- break;
+- }
+- } while (frag_start < entry_end);
++ r = amdgpu_vm_update_ptes(params, start, frag_end, dst,
++ flags | frag_flags);
++ if (r)
++ return r;
+
+- if (frag >= shift)
+- amdgpu_vm_pt_next(adev, &cursor);
++ dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
++ start = frag_end;
+ }
+
+ return 0;
+@@ -1730,8 +1703,8 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
+
+ params.func = amdgpu_vm_cpu_set_ptes;
+ params.pages_addr = pages_addr;
+- return amdgpu_vm_update_ptes(&params, start, last + 1,
+- addr, flags);
++ return amdgpu_vm_frag_ptes(&params, start, last + 1,
++ addr, flags);
+ }
+
+ ring = container_of(vm->entity.rq->sched, struct amdgpu_ring, sched);
+@@ -1810,7 +1783,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
+ if (r)
+ goto error_free;
+
+- r = amdgpu_vm_update_ptes(&params, start, last + 1, addr, flags);
++ r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
+ if (r)
+ goto error_free;
+
+--
+2.17.1
+
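The restored amdgpu_vm_frag_ptes() loop above picks a fragment size from the alignment of the start address and the remaining range length, so that each fragment maps (1 << (12 + frag)) bytes with a single L1 TLB entry. The standalone C sketch below (not part of the patch; the example page-frame numbers, the max_frag value, and the helper name fls64_sketch are assumptions for illustration only) walks a PTE range the same way and prints the effective page granularity of each fragment:

/* Illustrative sketch of the fragment-size selection performed by the
 * reverted amdgpu_vm_frag_ptes(); all values are assumed, not taken
 * from the patch. */
#include <stdint.h>
#include <stdio.h>

static unsigned fls64_sketch(uint64_t v)
{
	return v ? 64 - __builtin_clzll(v) : 0;	/* highest set bit, 1-based */
}

int main(void)
{
	uint64_t start = 0x104, end = 0x180;	/* GPU page-frame numbers (assumed) */
	unsigned max_frag = 9;			/* assumed vm_manager.fragment_size */

	while (start != end) {
		/* Fragment is limited by start alignment and remaining length */
		unsigned frag = __builtin_ffsll((long long)start) - 1;
		unsigned len_frag = fls64_sketch(end - start) - 1;
		uint64_t frag_end;

		if (len_frag < frag)
			frag = len_frag;

		if (frag >= max_frag) {
			frag = max_frag;
			frag_end = end & ~((1ULL << max_frag) - 1);
		} else {
			frag_end = start + (1ULL << frag);
		}

		/* Inside the fragment the L1 TLB sees pages of 4KB << frag */
		printf("PTEs [0x%llx, 0x%llx): frag=%u -> %llu KB granularity\n",
		       (unsigned long long)start, (unsigned long long)frag_end,
		       frag, 4ULL << frag);

		start = frag_end;
	}
	return 0;
}

Starting from an unaligned offset, the fragment grows as the address becomes better aligned (4 KB, 8 KB, and so on up to the max_frag limit), which matches the comment in the patch about userspace aligning the virtual base address and allocation size to benefit from large fragments.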