Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.19.8/0560-drm-amdgpu-meld-together-VM-fragment-and-huge-page-h.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.19.8/0560-drm-amdgpu-meld-together-VM-fragment-and-huge-page-h.patch | 354 |
1 file changed, 354 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.19.8/0560-drm-amdgpu-meld-together-VM-fragment-and-huge-page-h.patch b/common/recipes-kernel/linux/linux-yocto-4.19.8/0560-drm-amdgpu-meld-together-VM-fragment-and-huge-page-h.patch
new file mode 100644
index 00000000..2ceb685d
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.19.8/0560-drm-amdgpu-meld-together-VM-fragment-and-huge-page-h.patch
@@ -0,0 +1,354 @@
+From f19f8ea413bcecfbb2966b32426ee66094b29fe6 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
+Date: Thu, 6 Sep 2018 17:13:06 +0200
+Subject: [PATCH 0560/2940] drm/amdgpu: meld together VM fragment and huge page
+ handling
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This optimizes the generation of PTEs by walking the hierarchy only once
+for a range and making changes as necessary.
+
+It allows for both huge (2MB) as well as giant (1GB) pages to be used on
+Vega and Raven.
+
+Change-Id: I1e12f0ad9cb00f488d6723e06e895df47d728af5
+Signed-off-by: Christian König <christian.koenig@amd.com>
+Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
+Reviewed-by: Huang Rui <ray.huang@amd.com>
+Acked-by: Junwei Zhang <Jerry.Zhang@amd.com>
+Acked-by: Alvin Huan <Alvin.Huan@amd.com>
+Acked-by: Aaron Liu <Aaron.Liu@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 267 ++++++++++++++-----------
+ 1 file changed, 147 insertions(+), 120 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+index d945db767fa0..635bd17cd709 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+@@ -1483,46 +1483,76 @@ int amdgpu_vm_update_directories(struct amdgpu_device *adev,
+ }
+ 
+ /**
+- * amdgpu_vm_handle_huge_pages - handle updating the PD with huge pages
++ * amdgpu_vm_update_huge - figure out parameters for PTE updates
+  *
+- * @p: see amdgpu_pte_update_params definition
+- * @entry: vm_pt entry to check
+- * @parent: parent entry
+- * @nptes: number of PTEs updated with this operation
+- * @dst: destination address where the PTEs should point to
+- * @flags: access flags fro the PTEs
+- *
+- * Check if we can update the PD with a huge page.
++ * Make sure to set the right flags for the PTEs at the desired level.
+  */
+-static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
+-					struct amdgpu_vm_pt *entry,
+-					struct amdgpu_vm_pt *parent,
+-					unsigned nptes, uint64_t dst,
+-					uint64_t flags)
+-{
+-	uint64_t pde;
++static void amdgpu_vm_update_huge(struct amdgpu_pte_update_params *params,
++				  struct amdgpu_bo *bo, unsigned level,
++				  uint64_t pe, uint64_t addr,
++				  unsigned count, uint32_t incr,
++				  uint64_t flags)
+ 
+-	/* In the case of a mixed PT the PDE must point to it*/
+-	if (p->adev->asic_type >= CHIP_VEGA10 && !p->src &&
+-	    nptes == AMDGPU_VM_PTE_COUNT(p->adev)) {
+-		/* Set the huge page flag to stop scanning at this PDE */
++{
++	if (level != AMDGPU_VM_PTB) {
+ 		flags |= AMDGPU_PDE_PTE;
++		amdgpu_gmc_get_vm_pde(params->adev, level, &addr, &flags);
+ 	}
+ 
+-	if (!(flags & AMDGPU_PDE_PTE)) {
+-		if (entry->huge) {
+-			/* Add the entry to the relocated list to update it. */
+-			entry->huge = false;
+-			amdgpu_vm_bo_relocated(&entry->base);
+-		}
++	amdgpu_vm_update_func(params, bo, pe, addr, count, incr, flags);
++}
++
++/**
++ * amdgpu_vm_fragment - get fragment for PTEs
++ *
++ * @params: see amdgpu_pte_update_params definition
++ * @start: first PTE to handle
++ * @end: last PTE to handle
++ * @flags: hw mapping flags
++ * @frag: resulting fragment size
++ * @frag_end: end of this fragment
++ *
++ * Returns the first possible fragment for the start and end address.
++ */
++static void amdgpu_vm_fragment(struct amdgpu_pte_update_params *params,
++			       uint64_t start, uint64_t end, uint64_t flags,
++			       unsigned int *frag, uint64_t *frag_end)
++{
++	/**
++	 * The MC L1 TLB supports variable sized pages, based on a fragment
++	 * field in the PTE. When this field is set to a non-zero value, page
++	 * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
++	 * flags are considered valid for all PTEs within the fragment range
++	 * and corresponding mappings are assumed to be physically contiguous.
++	 *
++	 * The L1 TLB can store a single PTE for the whole fragment,
++	 * significantly increasing the space available for translation
++	 * caching. This leads to large improvements in throughput when the
++	 * TLB is under pressure.
++	 *
++	 * The L2 TLB distributes small and large fragments into two
++	 * asymmetric partitions. The large fragment cache is significantly
++	 * larger. Thus, we try to use large fragments wherever possible.
++	 * Userspace can support this by aligning virtual base address and
++	 * allocation size to the fragment size.
++	 */
++	unsigned max_frag = params->adev->vm_manager.fragment_size;
++
++	/* system pages are non continuously */
++	if (params->src || !(flags & AMDGPU_PTE_VALID)) {
++		*frag = 0;
++		*frag_end = end;
+ 		return;
+ 	}
+ 
+-	entry->huge = true;
+-	amdgpu_gmc_get_vm_pde(p->adev, AMDGPU_VM_PDB0, &dst, &flags);
+-
+-	pde = (entry - parent->entries) * 8;
+-	amdgpu_vm_update_func(p, parent->base.bo, pde, dst, 1, 0, flags);
++	/* This intentionally wraps around if no bit is set */
++	*frag = min((unsigned)ffs(start) - 1, (unsigned)fls64(end - start) - 1);
++	if (*frag >= max_frag) {
++		*frag = max_frag;
++		*frag_end = end & ~((1ULL << max_frag) - 1);
++	} else {
++		*frag_end = start + (1 << *frag);
++	}
+ }
+ 
+ /**
+@@ -1540,108 +1570,105 @@ static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p,
+  * 0 for success, -EINVAL for failure.
+  */
+ static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
+-				  uint64_t start, uint64_t end,
+-				  uint64_t dst, uint64_t flags)
++				 uint64_t start, uint64_t end,
++				 uint64_t dst, uint64_t flags)
+ {
+ 	struct amdgpu_device *adev = params->adev;
+-	const uint64_t mask = AMDGPU_VM_PTE_COUNT(adev) - 1;
+ 	struct amdgpu_vm_pt_cursor cursor;
++	uint64_t frag_start = start, frag_end;
++	unsigned int frag;
+ 
+-	/* walk over the address space and update the page tables */
+-	for_each_amdgpu_vm_pt_leaf(adev, params->vm, start, end - 1, cursor) {
++	/* figure out the initial fragment */
++	amdgpu_vm_fragment(params, frag_start, end, flags, &frag, &frag_end);
++
++	/* walk over the address space and update the PTs */
++	amdgpu_vm_pt_start(adev, params->vm, start, &cursor);
++	while (cursor.pfn < end) {
+ 		struct amdgpu_bo *pt = cursor.entry->base.bo;
+-		uint64_t pe_start;
+-		unsigned nptes;
++		unsigned shift, parent_shift, num_entries;
++		uint64_t incr, entry_end, pe_start;
+ 
+-		if (!pt || cursor.level != AMDGPU_VM_PTB)
++		if (!pt)
+ 			return -ENOENT;
+ 
+-		if ((cursor.pfn & ~mask) == (end & ~mask))
+-			nptes = end - cursor.pfn;
+-		else
+-			nptes = AMDGPU_VM_PTE_COUNT(adev) - (cursor.pfn & mask);
+-
+-		amdgpu_vm_handle_huge_pages(params, cursor.entry, cursor.parent,
+-					    nptes, dst, flags);
+-		/* We don't need to update PTEs for huge pages */
+-		if (cursor.entry->huge) {
+-			dst += nptes * AMDGPU_GPU_PAGE_SIZE;
++		/* The root level can't be a huge page */
++		if (cursor.level == adev->vm_manager.root_level) {
++			if (!amdgpu_vm_pt_descendant(adev, &cursor))
++				return -ENOENT;
+ 			continue;
+ 		}
+ 
+-		pe_start = (cursor.pfn & mask) * 8;
+-		amdgpu_vm_update_func(params, pt, pe_start, dst, nptes,
+-				      AMDGPU_GPU_PAGE_SIZE, flags);
+-		dst += nptes * AMDGPU_GPU_PAGE_SIZE;
+-	}
+-
+-	return 0;
+-}
++		/* First check if the entry is already handled */
++		if (cursor.pfn < frag_start) {
++			cursor.entry->huge = true;
++			amdgpu_vm_pt_next(adev, &cursor);
++			continue;
++		}
+ 
+-/*
+- * amdgpu_vm_frag_ptes - add fragment information to PTEs
+- *
+- * @params: see amdgpu_pte_update_params definition
+- * @vm: requested vm
+- * @start: first PTE to handle
+- * @end: last PTE to handle
+- * @dst: addr those PTEs should point to
+- * @flags: hw mapping flags
+- *
+- * Returns:
+- * 0 for success, -EINVAL for failure.
+- */
+-static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
+-			       uint64_t start, uint64_t end,
+-			       uint64_t dst, uint64_t flags)
+-{
+-	/**
+-	 * The MC L1 TLB supports variable sized pages, based on a fragment
+-	 * field in the PTE. When this field is set to a non-zero value, page
+-	 * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
+-	 * flags are considered valid for all PTEs within the fragment range
+-	 * and corresponding mappings are assumed to be physically contiguous.
+-	 *
+-	 * The L1 TLB can store a single PTE for the whole fragment,
+-	 * significantly increasing the space available for translation
+-	 * caching. This leads to large improvements in throughput when the
+-	 * TLB is under pressure.
+-	 *
+-	 * The L2 TLB distributes small and large fragments into two
+-	 * asymmetric partitions. The large fragment cache is significantly
+-	 * larger. Thus, we try to use large fragments wherever possible.
+-	 * Userspace can support this by aligning virtual base address and
+-	 * allocation size to the fragment size.
+-	 */
+-	unsigned max_frag = params->adev->vm_manager.fragment_size;
+-	int r;
++		/* If it isn't already handled it can't be a huge page */
++		if (cursor.entry->huge) {
++			/* Add the entry to the relocated list to update it. */
++			cursor.entry->huge = false;
++			amdgpu_vm_bo_relocated(&cursor.entry->base);
++		}
+ 
+-	/* system pages are non continuously */
+-	if (params->src || !(flags & AMDGPU_PTE_VALID))
+-		return amdgpu_vm_update_ptes(params, start, end, dst, flags);
+-
+-	while (start != end) {
+-		uint64_t frag_flags, frag_end;
+-		unsigned frag;
+-
+-		/* This intentionally wraps around if no bit is set */
+-		frag = min((unsigned)ffs(start) - 1,
+-			   (unsigned)fls64(end - start) - 1);
+-		if (frag >= max_frag) {
+-			frag_flags = AMDGPU_PTE_FRAG(max_frag);
+-			frag_end = end & ~((1ULL << max_frag) - 1);
+-		} else {
+-			frag_flags = AMDGPU_PTE_FRAG(frag);
+-			frag_end = start + (1 << frag);
++		shift = amdgpu_vm_level_shift(adev, cursor.level);
++		parent_shift = amdgpu_vm_level_shift(adev, cursor.level - 1);
++		if (adev->asic_type < CHIP_VEGA10) {
++			/* No huge page support before GMC v9 */
++			if (cursor.level != AMDGPU_VM_PTB) {
++				if (!amdgpu_vm_pt_descendant(adev, &cursor))
++					return -ENOENT;
++				continue;
++			}
++		} else if (frag < shift) {
++			/* We can't use this level when the fragment size is
++			 * smaller than the address shift. Go to the next
++			 * child entry and try again.
++			 */
++			if (!amdgpu_vm_pt_descendant(adev, &cursor))
++				return -ENOENT;
++			continue;
++		} else if (frag >= parent_shift) {
++			/* If the fragment size is even larger than the parent
++			 * shift we should go up one level and check it again.
++			 */
++			if (!amdgpu_vm_pt_ancestor(&cursor))
++				return -ENOENT;
++			continue;
+ 		}
+ 
+-		r = amdgpu_vm_update_ptes(params, start, frag_end, dst,
+-					  flags | frag_flags);
+-		if (r)
+-			return r;
++		/* Looks good so far, calculate parameters for the update */
++		incr = AMDGPU_GPU_PAGE_SIZE << shift;
++		num_entries = amdgpu_vm_num_entries(adev, cursor.level);
++		pe_start = ((cursor.pfn >> shift) & (num_entries - 1)) * 8;
++		entry_end = num_entries << shift;
++		entry_end += cursor.pfn & ~(entry_end - 1);
++		entry_end = min(entry_end, end);
++
++		do {
++			uint64_t upd_end = min(entry_end, frag_end);
++			unsigned nptes = (upd_end - frag_start) >> shift;
++
++			amdgpu_vm_update_huge(params, pt, cursor.level,
++					      pe_start, dst, nptes, incr,
++					      flags | AMDGPU_PTE_FRAG(frag));
++
++			pe_start += nptes * 8;
++			dst += nptes * AMDGPU_GPU_PAGE_SIZE << shift;
++
++			frag_start = upd_end;
++			if (frag_start >= frag_end) {
++				/* figure out the next fragment */
++				amdgpu_vm_fragment(params, frag_start, end,
++						   flags, &frag, &frag_end);
++				if (frag < shift)
++					break;
++			}
++		} while (frag_start < entry_end);
+ 
+-		dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE;
+-		start = frag_end;
++		if (frag >= shift)
++			amdgpu_vm_pt_next(adev, &cursor);
+ 	}
+ 
+ 	return 0;
+@@ -1703,8 +1730,8 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
+ 
+ 		params.func = amdgpu_vm_cpu_set_ptes;
+ 		params.pages_addr = pages_addr;
+-		return amdgpu_vm_frag_ptes(&params, start, last + 1,
+-					   addr, flags);
++		return amdgpu_vm_update_ptes(&params, start, last + 1,
++					     addr, flags);
+ 	}
+ 
+ 	ring = container_of(vm->entity.rq->sched, struct amdgpu_ring, sched);
+@@ -1783,7 +1810,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
+ 	if (r)
+ 		goto error_free;
+ 
+-	r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
++	r = amdgpu_vm_update_ptes(&params, start, last + 1, addr, flags);
+ 	if (r)
+ 		goto error_free;
+ 
+-- 
+2.17.1
+
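A note on the fragment math in amdgpu_vm_fragment() above: it picks the largest power-of-two block that is both aligned at the current address and no longer than the remaining range, then clamps it to vm_manager.fragment_size. The standalone C sketch below mirrors that computation for experimentation; pick_fragment(), fls64_() and the max_frag value of 9 are illustrative assumptions, not driver code (the driver's ffs()/fls64() live in the kernel).

#include <stdint.h>
#include <stdio.h>

static unsigned fls64_(uint64_t x)
{
	return x ? 64 - __builtin_clzll(x) : 0;
}

static void pick_fragment(uint64_t start, uint64_t end, unsigned max_frag,
			  unsigned *frag, uint64_t *frag_end)
{
	/* Largest power-of-two block that is both aligned at "start" and no
	 * longer than the remaining range. ffs() of 0 wraps to a huge value
	 * on purpose, so start == 0 is limited only by the range length. */
	unsigned align = (unsigned)__builtin_ffsll((long long)start) - 1;
	unsigned size = fls64_(end - start) - 1;

	*frag = align < size ? align : size;
	if (*frag >= max_frag) {
		*frag = max_frag;
		/* stop where max_frag alignment would be lost */
		*frag_end = end & ~((1ULL << max_frag) - 1);
	} else {
		*frag_end = start + (1ULL << *frag);
	}
}

int main(void)
{
	unsigned frag;
	uint64_t frag_end;

	/* 0x200 pages at pfn 0x200: frag 9, i.e. 4KB << 9 = 2MB granularity */
	pick_fragment(0x200, 0x400, 9, &frag, &frag_end);
	printf("frag=%u frag_end=%#llx\n", frag, (unsigned long long)frag_end);
	return 0;
}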
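The reworked walker in amdgpu_vm_update_ptes() then writes each fragment at the deepest page-table level whose address shift the fragment still covers: it descends while frag < shift and goes back up when frag >= parent_shift. Assuming 9-bit (512-entry) tables as on Vega/Raven, frag 9 becomes a 2MB huge PDE at PDB0 and frag 18 a 1GB giant PDE at PDB1 (page size is 4KB << frag). The sketch below is a simplified illustration of that rule under those assumptions; pick_level() and the level tables are hypothetical, not driver code.

#include <stdio.h>

/* pfn shift covered by one entry at each level, assuming 512-entry tables
 * (vm_manager.block_size == 9): PDB2, PDB1, PDB0, PTB */
static const unsigned level_shift[4] = { 27, 18, 9, 0 };
static const char * const level_name[4] = { "PDB2", "PDB1", "PDB0", "PTB" };

/* Descend while frag < shift, mirroring the walker: the update lands on
 * the deepest level whose shift the fragment still covers. */
static unsigned pick_level(unsigned frag)
{
	unsigned level = 1;	/* below the root; the root can't be huge */

	while (level < 3 && frag < level_shift[level])
		level++;
	return level;
}

int main(void)
{
	static const unsigned frags[3] = { 0, 9, 18 };
	unsigned i;

	for (i = 0; i < 3; i++) {
		unsigned level = pick_level(frags[i]);

		/* page size is 4KB << frag, printed in KB */
		printf("frag %2u -> %7uKB mapping written at %s\n",
		       frags[i], 4u << frags[i], level_name[level]);
	}
	return 0;
}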