diff --git a/common/recipes-kernel/linux/linux-yocto-4.19.8/0560-drm-amdgpu-meld-together-VM-fragment-and-huge-page-h.patch b/common/recipes-kernel/linux/linux-yocto-4.19.8/0560-drm-amdgpu-meld-together-VM-fragment-and-huge-page-h.patch
new file mode 100644
index 00000000..2ceb685d
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.19.8/0560-drm-amdgpu-meld-together-VM-fragment-and-huge-page-h.patch
@@ -0,0 +1,354 @@
+From f19f8ea413bcecfbb2966b32426ee66094b29fe6 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
+Date: Thu, 6 Sep 2018 17:13:06 +0200
+Subject: [PATCH 0560/2940] drm/amdgpu: meld together VM fragment and huge page
+ handling
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This optimizes PTE generation by walking the hierarchy only once per
+range and making changes as necessary.
+
+It allows both huge (2MB) and giant (1GB) pages to be used on Vega and
+Raven.
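+
+As a rough illustration (not part of this change), the fragment
+selection below boils down to the following minimal user-space sketch.
+__builtin_ffsll()/__builtin_clzll() stand in for the kernel's ffs() and
+fls64(), MAX_FRAG = 9 (2MB fragments) is an assumed stand-in for
+adev->vm_manager.fragment_size, and addresses are in units of 4KB GPU
+pages:
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    #define MAX_FRAG 9 /* assumed: 1 << (12 + 9) = 2MB fragments */
+
+    /* largest fragment that start alignment and range size both allow;
+     * requires end > start
+     */
+    static void pick_fragment(uint64_t start, uint64_t end,
+                              unsigned *frag, uint64_t *frag_end)
+    {
+        unsigned align = __builtin_ffsll(start) - 1; /* wraps if start == 0 */
+        unsigned size = 63 - __builtin_clzll(end - start);
+
+        *frag = align < size ? align : size;
+        if (*frag >= MAX_FRAG) {
+            *frag = MAX_FRAG;
+            *frag_end = end & ~((1ULL << MAX_FRAG) - 1);
+        } else {
+            *frag_end = start + (1ULL << *frag);
+        }
+    }
+
+    int main(void)
+    {
+        unsigned frag;
+        uint64_t frag_end;
+
+        /* 2MB-aligned start (page 0x200), 4MB worth of pages */
+        pick_fragment(0x200, 0x600, &frag, &frag_end);
+        printf("frag=%u frag_end=0x%llx\n", frag,
+               (unsigned long long)frag_end); /* frag=9 frag_end=0x600 */
+        return 0;
+    }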
+
+Change-Id: I1e12f0ad9cb00f488d6723e06e895df47d728af5
+Signed-off-by: Christian König <christian.koenig@amd.com>
+Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
+Reviewed-by: Huang Rui <ray.huang@amd.com>
+Acked-by: Junwei Zhang <Jerry.Zhang@amd.com>
+Acked-by: Alvin Huan <Alvin.Huan@amd.com>
+Acked-by: Aaron Liu <Aaron.Liu@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 267 ++++++++++++++-----------
+ 1 file changed, 147 insertions(+), 120 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+index d945db767fa0..635bd17cd709 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+@@ -1483,46 +1483,76 @@ int amdgpu_vm_update_directories(struct amdgpu_device *adev,
+ }
+
+ /**
+- * amdgpu_vm_handle_huge_pages - handle updating the PD with huge pages
++ * amdgpu_vm_update_huge - figure out parameters for PTE updates
+ *
+- * @p: see amdgpu_pte_update_params definition
+- * @entry: vm_pt entry to check
+- * @parent: parent entry
+- * @nptes: number of PTEs updated with this operation
+- * @dst: destination address where the PTEs should point to
+- * @flags: access flags fro the PTEs
+- *
+- * Check if we can update the PD with a huge page.
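++ * @params: see amdgpu_pte_update_params definition
++ * @bo: PD/PT to update
++ * @level: level in the page table hierarchy being updated
++ * @pe: byte offset of the PDE/PTE, relative to the start of the table
++ * @addr: dst addr to write into pe
++ * @count: number of page entries to update
++ * @incr: increase next addr by incr bytes
++ * @flags: hw access flags
++ *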
++ * Make sure to set the right flags for the PTEs at the desired level.
+ */
+-static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
+- struct amdgpu_vm_pt *entry,
+- struct amdgpu_vm_pt *parent,
+- unsigned nptes, uint64_t dst,
+- uint64_t flags)
+-{
+- uint64_t pde;
++static void amdgpu_vm_update_huge(struct amdgpu_pte_update_params *params,
++ struct amdgpu_bo *bo, unsigned level,
++ uint64_t pe, uint64_t addr,
++ unsigned count, uint32_t incr,
++ uint64_t flags)
+
+- /* In the case of a mixed PT the PDE must point to it*/
+- if (p->adev->asic_type >= CHIP_VEGA10 && !p->src &&
+- nptes == AMDGPU_VM_PTE_COUNT(p->adev)) {
+- /* Set the huge page flag to stop scanning at this PDE */
++{
++ if (level != AMDGPU_VM_PTB) {
+ flags |= AMDGPU_PDE_PTE;
++ amdgpu_gmc_get_vm_pde(params->adev, level, &addr, &flags);
+ }
+
+- if (!(flags & AMDGPU_PDE_PTE)) {
+- if (entry->huge) {
+- /* Add the entry to the relocated list to update it. */
+- entry->huge = false;
+- amdgpu_vm_bo_relocated(&entry->base);
+- }
++ amdgpu_vm_update_func(params, bo, pe, addr, count, incr, flags);
++}
++
++/**
++ * amdgpu_vm_fragment - get fragment for PTEs
++ *
++ * @params: see amdgpu_pte_update_params definition
++ * @start: first PTE to handle
++ * @end: last PTE to handle
++ * @flags: hw mapping flags
++ * @frag: resulting fragment size
++ * @frag_end: end of this fragment
++ *
++ * Returns the first possible fragment for the start and end address.
++ */
++static void amdgpu_vm_fragment(struct amdgpu_pte_update_params *params,
++ uint64_t start, uint64_t end, uint64_t flags,
++ unsigned int *frag, uint64_t *frag_end)
++{
++ /**
++ * The MC L1 TLB supports variable sized pages, based on a fragment
++ * field in the PTE. When this field is set to a non-zero value, page
++ * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
++ * flags are considered valid for all PTEs within the fragment range
++ * and corresponding mappings are assumed to be physically contiguous.
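++ *
++ * For example, frag = 9 raises the granularity to 1 << (12 + 9) = 2MB
++ * (a huge page), and frag = 18 to 1 << (12 + 18) = 1GB (a giant page).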
++ *
++ * The L1 TLB can store a single PTE for the whole fragment,
++ * significantly increasing the space available for translation
++ * caching. This leads to large improvements in throughput when the
++ * TLB is under pressure.
++ *
++ * The L2 TLB distributes small and large fragments into two
++ * asymmetric partitions. The large fragment cache is significantly
++ * larger. Thus, we try to use large fragments wherever possible.
++ * Userspace can support this by aligning virtual base address and
++ * allocation size to the fragment size.
++ */
++ unsigned max_frag = params->adev->vm_manager.fragment_size;
++
++ /* system pages are not physically contiguous */
++ if (params->src || !(flags & AMDGPU_PTE_VALID)) {
++ *frag = 0;
++ *frag_end = end;
+ return;
+ }
+
+- entry->huge = true;
+- amdgpu_gmc_get_vm_pde(p->adev, AMDGPU_VM_PDB0, &dst, &flags);
+-
+- pde = (entry - parent->entries) * 8;
+- amdgpu_vm_update_func(p, parent->base.bo, pde, dst, 1, 0, flags);
++ /* This intentionally wraps around if no bit is set: ffs(0) is 0, so
++  * the unsigned subtraction yields UINT_MAX and min() then picks the
++  * range-size limit instead.
++  */
++ *frag = min((unsigned)ffs(start) - 1, (unsigned)fls64(end - start) - 1);
++ if (*frag >= max_frag) {
++ *frag = max_frag;
++ *frag_end = end & ~((1ULL << max_frag) - 1);
++ } else {
++ *frag_end = start + (1 << *frag);
++ }
+ }
+
+ /**
+@@ -1540,108 +1570,105 @@ static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
+ * 0 for success, -EINVAL for failure.
+ */
+ static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
+- uint64_t start, uint64_t end,
+- uint64_t dst, uint64_t flags)
++ uint64_t start, uint64_t end,
++ uint64_t dst, uint64_t flags)
+ {
+ struct amdgpu_device *adev = params->adev;
+- const uint64_t mask = AMDGPU_VM_PTE_COUNT(adev) - 1;
+ struct amdgpu_vm_pt_cursor cursor;
++ uint64_t frag_start = start, frag_end;
++ unsigned int frag;
+
+- /* walk over the address space and update the page tables */
+- for_each_amdgpu_vm_pt_leaf(adev, params->vm, start, end - 1, cursor) {
++ /* figure out the initial fragment */
++ amdgpu_vm_fragment(params, frag_start, end, flags, &frag, &frag_end);
++
++ /* walk over the address space and update the PTs */
++ amdgpu_vm_pt_start(adev, params->vm, start, &cursor);
++ while (cursor.pfn < end) {
+ struct amdgpu_bo *pt = cursor.entry->base.bo;
+- uint64_t pe_start;
+- unsigned nptes;
++ unsigned shift, parent_shift, num_entries;
++ uint64_t incr, entry_end, pe_start;
+
+- if (!pt || cursor.level != AMDGPU_VM_PTB)
++ if (!pt)
+ return -ENOENT;
+
+- if ((cursor.pfn & ~mask) == (end & ~mask))
+- nptes = end - cursor.pfn;
+- else
+- nptes = AMDGPU_VM_PTE_COUNT(adev) - (cursor.pfn & mask);
+-
+- amdgpu_vm_handle_huge_pages(params, cursor.entry, cursor.parent,
+- nptes, dst, flags);
+- /* We don't need to update PTEs for huge pages */
+- if (cursor.entry->huge) {
+- dst += nptes * AMDGPU_GPU_PAGE_SIZE;
++ /* The root level can't be a huge page */
++ if (cursor.level == adev->vm_manager.root_level) {
++ if (!amdgpu_vm_pt_descendant(adev, &cursor))
++ return -ENOENT;
+ continue;
+ }
+
+- pe_start = (cursor.pfn & mask) * 8;
+- amdgpu_vm_update_func(params, pt, pe_start, dst, nptes,
+- AMDGPU_GPU_PAGE_SIZE, flags);
+- dst += nptes * AMDGPU_GPU_PAGE_SIZE;
+- }
+-
+- return 0;
+-}
++ /* First check if the entry is already handled */
++ if (cursor.pfn < frag_start) {
++ cursor.entry->huge = true;
++ amdgpu_vm_pt_next(adev, &cursor);
++ continue;
++ }
+
+-/*
+- * amdgpu_vm_frag_ptes - add fragment information to PTEs
+- *
+- * @params: see amdgpu_pte_update_params definition
+- * @vm: requested vm
+- * @start: first PTE to handle
+- * @end: last PTE to handle
+- * @dst: addr those PTEs should point to
+- * @flags: hw mapping flags
+- *
+- * Returns:
+- * 0 for success, -EINVAL for failure.
+- */
+-static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
+- uint64_t start, uint64_t end,
+- uint64_t dst, uint64_t flags)
+-{
+- /**
+- * The MC L1 TLB supports variable sized pages, based on a fragment
+- * field in the PTE. When this field is set to a non-zero value, page
+- * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
+- * flags are considered valid for all PTEs within the fragment range
+- * and corresponding mappings are assumed to be physically contiguous.
+- *
+- * The L1 TLB can store a single PTE for the whole fragment,
+- * significantly increasing the space available for translation
+- * caching. This leads to large improvements in throughput when the
+- * TLB is under pressure.
+- *
+- * The L2 TLB distributes small and large fragments into two
+- * asymmetric partitions. The large fragment cache is significantly
+- * larger. Thus, we try to use large fragments wherever possible.
+- * Userspace can support this by aligning virtual base address and
+- * allocation size to the fragment size.
+- */
+- unsigned max_frag = params->adev->vm_manager.fragment_size;
+- int r;
++ /* If it isn't already handled it can't be a huge page */
++ if (cursor.entry->huge) {
++ /* Add the entry to the relocated list to update it. */
++ cursor.entry->huge = false;
++ amdgpu_vm_bo_relocated(&cursor.entry->base);
++ }
+
+- /* system pages are non continuously */
+- if (params->src || !(flags & AMDGPU_PTE_VALID))
+- return amdgpu_vm_update_ptes(params, start, end, dst, flags);
+-
+- while (start != end) {
+- uint64_t frag_flags, frag_end;
+- unsigned frag;
+-
+- /* This intentionally wraps around if no bit is set */
+- frag = min((unsigned)ffs(start) - 1,
+- (unsigned)fls64(end - start) - 1);
+- if (frag >= max_frag) {
+- frag_flags = AMDGPU_PTE_FRAG(max_frag);
+- frag_end = end & ~((1ULL << max_frag) - 1);
+- } else {
+- frag_flags = AMDGPU_PTE_FRAG(frag);
+- frag_end = start + (1 << frag);
++ shift = amdgpu_vm_level_shift(adev, cursor.level);
++ parent_shift = amdgpu_vm_level_shift(adev, cursor.level - 1);
++ if (adev->asic_type < CHIP_VEGA10) {
++ /* No huge page support before GMC v9 */
++ if (cursor.level != AMDGPU_VM_PTB) {
++ if (!amdgpu_vm_pt_descendant(adev, &cursor))
++ return -ENOENT;
++ continue;
++ }
++ } else if (frag < shift) {
++ /* We can't use this level when the fragment size is
++ * smaller than the address shift. Go to the next
++ * child entry and try again.
++ */
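++ /* With the common 9-bit block size, a 2MB fragment (frag = 9)
++  * descends past PDB1 (shift 18) and is handled at PDB0
++  * (shift 9), where it becomes a huge page PDE.
++  */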
++ if (!amdgpu_vm_pt_descendant(adev, &cursor))
++ return -ENOENT;
++ continue;
++ } else if (frag >= parent_shift) {
++ /* If the fragment size is even larger than the parent
++ * shift we should go up one level and check it again.
++ */
++ if (!amdgpu_vm_pt_ancestor(&cursor))
++ return -ENOENT;
++ continue;
+ }
+
+- r = amdgpu_vm_update_ptes(params, start, frag_end, dst,
+- flags | frag_flags);
+- if (r)
+- return r;
++ /* Looks good so far, calculate parameters for the update */
++ incr = AMDGPU_GPU_PAGE_SIZE << shift;
++ num_entries = amdgpu_vm_num_entries(adev, cursor.level);
++ pe_start = ((cursor.pfn >> shift) & (num_entries - 1)) * 8;
++ entry_end = num_entries << shift;
++ entry_end += cursor.pfn & ~(entry_end - 1);
++ entry_end = min(entry_end, end);
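++ /* e.g. at the PTB level (shift 0) this updates one 8-byte PTE
++  * per 4KB page, starting at byte offset (pfn & (num_entries - 1)) * 8
++  * into the page table
++  */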
++
++ do {
++ uint64_t upd_end = min(entry_end, frag_end);
++ unsigned nptes = (upd_end - frag_start) >> shift;
++
++ amdgpu_vm_update_huge(params, pt, cursor.level,
++ pe_start, dst, nptes, incr,
++ flags | AMDGPU_PTE_FRAG(frag));
++
++ pe_start += nptes * 8;
++ dst += nptes * AMDGPU_GPU_PAGE_SIZE << shift;
++
++ frag_start = upd_end;
++ if (frag_start >= frag_end) {
++ /* figure out the next fragment */
++ amdgpu_vm_fragment(params, frag_start, end,
++ flags, &frag, &frag_end);
++ if (frag < shift)
++ break;
++ }
++ } while (frag_start < entry_end);
+
+- dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
+- start = frag_end;
++ if (frag >= shift)
++ amdgpu_vm_pt_next(adev, &cursor);
+ }
+
+ return 0;
+@@ -1703,8 +1730,8 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
+
+ params.func = amdgpu_vm_cpu_set_ptes;
+ params.pages_addr = pages_addr;
+- return amdgpu_vm_frag_ptes(&params, start, last + 1,
+- addr, flags);
++ return amdgpu_vm_update_ptes(&params, start, last + 1,
++ addr, flags);
+ }
+
+ ring = container_of(vm->entity.rq->sched, struct amdgpu_ring, sched);
+@@ -1783,7 +1810,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
+ if (r)
+ goto error_free;
+
+- r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
++ r = amdgpu_vm_update_ptes(&params, start, last + 1, addr, flags);
+ if (r)
+ goto error_free;
+
+--
+2.17.1
+