diff options
Diffstat (limited to 'meta-steppeeagle/recipes-kernel/linux/linux-yocto/0018-yocto-amd-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch')
-rw-r--r-- | meta-steppeeagle/recipes-kernel/linux/linux-yocto/0018-yocto-amd-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch | 237 |
1 files changed, 237 insertions, 0 deletions
diff --git a/meta-steppeeagle/recipes-kernel/linux/linux-yocto/0018-yocto-amd-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch b/meta-steppeeagle/recipes-kernel/linux/linux-yocto/0018-yocto-amd-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch new file mode 100644 index 00000000..6498c071 --- /dev/null +++ b/meta-steppeeagle/recipes-kernel/linux/linux-yocto/0018-yocto-amd-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch @@ -0,0 +1,237 @@ +From a2bc39a6394bb8e11060df3da33d603a66ccf9f6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com> +Date: Fri, 25 Oct 2013 18:07:55 +0200 +Subject: [PATCH 18/44] drm/radeon: add large PTE support for NI, SI and CIK + v3 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This patch implements support for VRAM page table entry compression. +PTE construction is enhanced to identify physically contiguous page +ranges and mark them in the PTE fragment field. L1 TLB and L2 cache +support is enabled for 64KB (SI/CIK) and 256KB (NI) PTE fragments, +significantly improving TLB utilization for VRAM allocations. + +Linear store bandwidth is improved from 60GB/s to 125GB/s on Pitcairn. +Unigine Heaven 3.0 sees an average improvement from 24.7 to 27.7 FPS +on default settings at 1920x1200 resolution with vsync disabled. + +See the main comment in radeon_gart.c for a technical description. + +v2 (chk): rebased and simplified. 
+v3 (chk): add missing hw setup + +Signed-off-by: Jay Cornwall <jay@jcornwall.me> +Signed-off-by: Christian König <christian.koenig@amd.com> +--- + drivers/gpu/drm/radeon/cik.c | 4 +- + drivers/gpu/drm/radeon/ni.c | 2 + + drivers/gpu/drm/radeon/radeon.h | 5 ++ + drivers/gpu/drm/radeon/radeon_gart.c | 91 +++++++++++++++++++++++++++++++--- + drivers/gpu/drm/radeon/si.c | 5 +- + 5 files changed, 98 insertions(+), 9 deletions(-) + +diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c +index d7e86ef3..27fa479 100644 +--- a/drivers/gpu/drm/radeon/cik.c ++++ b/drivers/gpu/drm/radeon/cik.c +@@ -4522,6 +4522,7 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) + WREG32(MC_VM_MX_L1_TLB_CNTL, + (0xA << 7) | + ENABLE_L1_TLB | ++ ENABLE_L1_FRAGMENT_PROCESSING | + SYSTEM_ACCESS_MODE_NOT_IN_SYS | + ENABLE_ADVANCED_DRIVER_MODEL | + SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); +@@ -4534,7 +4535,8 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) + CONTEXT1_IDENTITY_ACCESS_MODE(1)); + WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); + WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | +- L2_CACHE_BIGK_FRAGMENT_SIZE(6)); ++ BANK_SELECT(4) | ++ L2_CACHE_BIGK_FRAGMENT_SIZE(4)); + /* setup context0 */ + WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); + WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12); +diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c +index f59a9e9..9291982 100644 +--- a/drivers/gpu/drm/radeon/ni.c ++++ b/drivers/gpu/drm/radeon/ni.c +@@ -1227,12 +1227,14 @@ static int cayman_pcie_gart_enable(struct radeon_device *rdev) + SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); + /* Setup L2 cache */ + WREG32(VM_L2_CNTL, ENABLE_L2_CACHE | ++ ENABLE_L2_FRAGMENT_PROCESSING | + ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE | + ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE | + EFFECTIVE_L2_QUEUE_SIZE(7) | + CONTEXT1_IDENTITY_ACCESS_MODE(1)); + WREG32(VM_L2_CNTL2, 
INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); + WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | ++ BANK_SELECT(6) | + L2_CACHE_BIGK_FRAGMENT_SIZE(6)); + /* setup context0 */ + WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); +diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h +index b987f01..06b3375 100644 +--- a/drivers/gpu/drm/radeon/radeon.h ++++ b/drivers/gpu/drm/radeon/radeon.h +@@ -846,6 +846,11 @@ struct radeon_mec { + #define R600_PTE_READABLE (1 << 5) + #define R600_PTE_WRITEABLE (1 << 6) + ++/* PTE (Page Table Entry) fragment field for different page sizes */ ++#define R600_PTE_FRAG_4KB (0 << 7) ++#define R600_PTE_FRAG_64KB (4 << 7) ++#define R600_PTE_FRAG_256KB (6 << 7) ++ + struct radeon_vm { + struct list_head list; + struct list_head va; +diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c +index 9ceabdf..f960ce6 100644 +--- a/drivers/gpu/drm/radeon/radeon_gart.c ++++ b/drivers/gpu/drm/radeon/radeon_gart.c +@@ -1021,6 +1021,84 @@ retry: + } + + /** ++ * radeon_vm_frag_ptes - add fragment information to PTEs ++ * ++ * @rdev: radeon_device pointer ++ * @ib: IB for the update ++ * @pe_start: first PTE to handle ++ * @pe_end: end of the PTE range (exclusive, just past the last PTE) ++ * @addr: addr those PTEs should point to ++ * @flags: hw mapping flags ++ * ++ * Global and local mutex must be locked! ++ */ ++static void radeon_vm_frag_ptes(struct radeon_device *rdev, ++ struct radeon_ib *ib, ++ uint64_t pe_start, uint64_t pe_end, ++ uint64_t addr, uint32_t flags) ++{ ++ /** ++ * The MC L1 TLB supports variable sized pages, based on a fragment ++ * field in the PTE. When this field is set to a non-zero value, page ++ * granularity is increased from 4KB to (1 << (12 + frag)). The PTE ++ * flags are considered valid for all PTEs within the fragment range ++ * and corresponding mappings are assumed to be physically contiguous. 
++ * ++ * The L1 TLB can store a single PTE for the whole fragment, ++ * significantly increasing the space available for translation ++ * caching. This leads to large improvements in throughput when the ++ * TLB is under pressure. ++ * ++ * The L2 cache distributes small and large fragments into two ++ * asymmetric partitions. The large fragment cache is significantly ++ * larger. Thus, we try to use large fragments wherever possible. ++ * Userspace can support this by aligning virtual base address and ++ * allocation size to the fragment size. ++ */ ++ ++ /* NI is optimized for 256KB fragments, SI and newer for 64KB */ ++ uint64_t frag_flags = rdev->family == CHIP_CAYMAN ? ++ R600_PTE_FRAG_256KB : R600_PTE_FRAG_64KB; ++ uint64_t frag_align = rdev->family == CHIP_CAYMAN ? 0x200 : 0x80; ++ ++ uint64_t frag_start = ALIGN(pe_start, frag_align); ++ uint64_t frag_end = pe_end & ~(frag_align - 1); ++ ++ unsigned count; ++ ++ /* system pages are not physically contiguous */ ++ if ((flags & R600_PTE_SYSTEM) || !(flags & R600_PTE_VALID) || ++ (frag_start >= frag_end)) { ++ ++ count = (pe_end - pe_start) / 8; ++ radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count, ++ RADEON_GPU_PAGE_SIZE, flags); ++ return; ++ } ++ ++ /* handle the 4K area at the beginning */ ++ if (pe_start != frag_start) { ++ count = (frag_start - pe_start) / 8; ++ radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count, ++ RADEON_GPU_PAGE_SIZE, flags); ++ addr += RADEON_GPU_PAGE_SIZE * count; ++ } ++ ++ /* handle the area in the middle */ ++ count = (frag_end - frag_start) / 8; ++ radeon_asic_vm_set_page(rdev, ib, frag_start, addr, count, ++ RADEON_GPU_PAGE_SIZE, flags | frag_flags); ++ ++ /* handle the 4K area at the end */ ++ if (frag_end != pe_end) { ++ addr += RADEON_GPU_PAGE_SIZE * count; ++ count = (pe_end - frag_end) / 8; ++ radeon_asic_vm_set_page(rdev, ib, frag_end, addr, count, ++ RADEON_GPU_PAGE_SIZE, flags); ++ } ++} ++ ++/** + * radeon_vm_update_ptes - make sure that page tables are valid + * + * 
@rdev: radeon_device pointer +@@ -1066,10 +1144,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev, + if ((last_pte + 8 * count) != pte) { + + if (count) { +- radeon_asic_vm_set_page(rdev, ib, last_pte, +- last_dst, count, +- RADEON_GPU_PAGE_SIZE, +- flags); ++ radeon_vm_frag_ptes(rdev, ib, last_pte, ++ last_pte + 8 * count, ++ last_dst, flags); + } + + count = nptes; +@@ -1084,9 +1161,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev, + } + + if (count) { +- radeon_asic_vm_set_page(rdev, ib, last_pte, +- last_dst, count, +- RADEON_GPU_PAGE_SIZE, flags); ++ radeon_vm_frag_ptes(rdev, ib, last_pte, ++ last_pte + 8 * count, ++ last_dst, flags); + } + } + +diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c +index 0334f3e..d9173ef 100644 +--- a/drivers/gpu/drm/radeon/si.c ++++ b/drivers/gpu/drm/radeon/si.c +@@ -3928,18 +3928,21 @@ static int si_pcie_gart_enable(struct radeon_device *rdev) + WREG32(MC_VM_MX_L1_TLB_CNTL, + (0xA << 7) | + ENABLE_L1_TLB | ++ ENABLE_L1_FRAGMENT_PROCESSING | + SYSTEM_ACCESS_MODE_NOT_IN_SYS | + ENABLE_ADVANCED_DRIVER_MODEL | + SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); + /* Setup L2 cache */ + WREG32(VM_L2_CNTL, ENABLE_L2_CACHE | ++ ENABLE_L2_FRAGMENT_PROCESSING | + ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE | + ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE | + EFFECTIVE_L2_QUEUE_SIZE(7) | + CONTEXT1_IDENTITY_ACCESS_MODE(1)); + WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); + WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | +- L2_CACHE_BIGK_FRAGMENT_SIZE(0)); ++ BANK_SELECT(4) | ++ L2_CACHE_BIGK_FRAGMENT_SIZE(4)); + /* setup context0 */ + WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); + WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12); +-- +1.7.9.5 + |