diff options
Diffstat (limited to 'common/recipes-kernel/linux/linux-amd/0018-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-amd/0018-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch | 236 |
1 files changed, 236 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-amd/0018-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch b/common/recipes-kernel/linux/linux-amd/0018-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch new file mode 100644 index 00000000..f93f84ad --- /dev/null +++ b/common/recipes-kernel/linux/linux-amd/0018-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch @@ -0,0 +1,236 @@ +From c3679d52cb42a2cc76c0c893ad364157dc3699dc Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com> +Date: Fri, 25 Oct 2013 18:07:55 +0200 +Subject: [PATCH 18/60] drm/radeon: add large PTE support for NI, SI and CIK v3 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This patch implements support for VRAM page table entry compression. +PTE construction is enhanced to identify physically contiguous page +ranges and mark them in the PTE fragment field. L1 TLB and L2 cache +support is enabled for 64KB (SI/CIK) and 256KB (NI) PTE fragments, +significantly improving TLB utilization for VRAM allocations. + +Linear store bandwidth is improved from 60GB/s to 125GB/s on Pitcairn. +Unigine Heaven 3.0 sees an average improvement from 24.7 to 27.7 FPS +on default settings at 1920x1200 resolution with vsync disabled. + +The main comment in radeon_gart.c gives a technical description. + +v2 (chk): rebased and simplified. 
+v3 (chk): add missing hw setup + +Signed-off-by: Jay Cornwall <jay@jcornwall.me> +Signed-off-by: Christian König <christian.koenig@amd.com> +--- + drivers/gpu/drm/radeon/cik.c | 4 +- + drivers/gpu/drm/radeon/ni.c | 2 + + drivers/gpu/drm/radeon/radeon.h | 5 ++ + drivers/gpu/drm/radeon/radeon_gart.c | 91 +++++++++++++++++++++++++++++++++--- + drivers/gpu/drm/radeon/si.c | 5 +- + 5 files changed, 98 insertions(+), 9 deletions(-) + +diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c +index e3bec288..e84005a 100644 +--- a/drivers/gpu/drm/radeon/cik.c ++++ b/drivers/gpu/drm/radeon/cik.c +@@ -4526,6 +4526,7 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) + WREG32(MC_VM_MX_L1_TLB_CNTL, + (0xA << 7) | + ENABLE_L1_TLB | ++ ENABLE_L1_FRAGMENT_PROCESSING | + SYSTEM_ACCESS_MODE_NOT_IN_SYS | + ENABLE_ADVANCED_DRIVER_MODEL | + SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); +@@ -4538,7 +4539,8 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) + CONTEXT1_IDENTITY_ACCESS_MODE(1)); + WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); + WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | +- L2_CACHE_BIGK_FRAGMENT_SIZE(6)); ++ BANK_SELECT(4) | ++ L2_CACHE_BIGK_FRAGMENT_SIZE(4)); + /* setup context0 */ + WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); + WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12); +diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c +index 2443d11..a3c7826 100644 +--- a/drivers/gpu/drm/radeon/ni.c ++++ b/drivers/gpu/drm/radeon/ni.c +@@ -1227,12 +1227,14 @@ static int cayman_pcie_gart_enable(struct radeon_device *rdev) + SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); + /* Setup L2 cache */ + WREG32(VM_L2_CNTL, ENABLE_L2_CACHE | ++ ENABLE_L2_FRAGMENT_PROCESSING | + ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE | + ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE | + EFFECTIVE_L2_QUEUE_SIZE(7) | + CONTEXT1_IDENTITY_ACCESS_MODE(1)); + WREG32(VM_L2_CNTL2, 
INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); + WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | ++ BANK_SELECT(6) | + L2_CACHE_BIGK_FRAGMENT_SIZE(6)); + /* setup context0 */ + WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); +diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h +index d478c28..e796f9a 100644 +--- a/drivers/gpu/drm/radeon/radeon.h ++++ b/drivers/gpu/drm/radeon/radeon.h +@@ -852,6 +852,11 @@ struct radeon_mec { + #define R600_PTE_READABLE (1 << 5) + #define R600_PTE_WRITEABLE (1 << 6) + ++/* PTE (Page Table Entry) fragment field for different page sizes */ ++#define R600_PTE_FRAG_4KB (0 << 7) ++#define R600_PTE_FRAG_64KB (4 << 7) ++#define R600_PTE_FRAG_256KB (6 << 7) ++ + struct radeon_vm { + struct list_head list; + struct list_head va; +diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c +index 9ceabdf..f960ce6 100644 +--- a/drivers/gpu/drm/radeon/radeon_gart.c ++++ b/drivers/gpu/drm/radeon/radeon_gart.c +@@ -1021,6 +1021,84 @@ retry: + } + + /** ++ * radeon_vm_frag_ptes - add fragment information to PTEs ++ * ++ * @rdev: radeon_device pointer ++ * @ib: IB for the update ++ * @pe_start: first PTE to handle ++ * @pe_end: last PTE to handle ++ * @addr: addr those PTEs should point to ++ * @flags: hw mapping flags ++ * ++ * Global and local mutex must be locked! ++ */ ++static void radeon_vm_frag_ptes(struct radeon_device *rdev, ++ struct radeon_ib *ib, ++ uint64_t pe_start, uint64_t pe_end, ++ uint64_t addr, uint32_t flags) ++{ ++ /** ++ * The MC L1 TLB supports variable sized pages, based on a fragment ++ * field in the PTE. When this field is set to a non-zero value, page ++ * granularity is increased from 4KB to (1 << (12 + frag)). The PTE ++ * flags are considered valid for all PTEs within the fragment range ++ * and corresponding mappings are assumed to be physically contiguous. 
++ * ++ * The L1 TLB can store a single PTE for the whole fragment, ++ * significantly increasing the space available for translation ++ * caching. This leads to large improvements in throughput when the ++ * TLB is under pressure. ++ * ++ * The L2 cache distributes small and large fragments into two ++ * asymmetric partitions. The large fragment cache is significantly ++ * larger. Thus, we try to use large fragments wherever possible. ++ * Userspace can support this by aligning virtual base address and ++ * allocation size to the fragment size. ++ */ ++ ++ /* NI is optimized for 256KB fragments, SI and newer for 64KB */ ++ uint64_t frag_flags = rdev->family == CHIP_CAYMAN ? ++ R600_PTE_FRAG_256KB : R600_PTE_FRAG_64KB; ++ uint64_t frag_align = rdev->family == CHIP_CAYMAN ? 0x200 : 0x80; ++ ++ uint64_t frag_start = ALIGN(pe_start, frag_align); ++ uint64_t frag_end = pe_end & ~(frag_align - 1); ++ ++ unsigned count; ++ ++ /* system pages are non continuously */ ++ if ((flags & R600_PTE_SYSTEM) || !(flags & R600_PTE_VALID) || ++ (frag_start >= frag_end)) { ++ ++ count = (pe_end - pe_start) / 8; ++ radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count, ++ RADEON_GPU_PAGE_SIZE, flags); ++ return; ++ } ++ ++ /* handle the 4K area at the beginning */ ++ if (pe_start != frag_start) { ++ count = (frag_start - pe_start) / 8; ++ radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count, ++ RADEON_GPU_PAGE_SIZE, flags); ++ addr += RADEON_GPU_PAGE_SIZE * count; ++ } ++ ++ /* handle the area in the middle */ ++ count = (frag_end - frag_start) / 8; ++ radeon_asic_vm_set_page(rdev, ib, frag_start, addr, count, ++ RADEON_GPU_PAGE_SIZE, flags | frag_flags); ++ ++ /* handle the 4K area at the end */ ++ if (frag_end != pe_end) { ++ addr += RADEON_GPU_PAGE_SIZE * count; ++ count = (pe_end - frag_end) / 8; ++ radeon_asic_vm_set_page(rdev, ib, frag_end, addr, count, ++ RADEON_GPU_PAGE_SIZE, flags); ++ } ++} ++ ++/** + * radeon_vm_update_ptes - make sure that page tables are valid + * + * 
@rdev: radeon_device pointer +@@ -1066,10 +1144,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev, + if ((last_pte + 8 * count) != pte) { + + if (count) { +- radeon_asic_vm_set_page(rdev, ib, last_pte, +- last_dst, count, +- RADEON_GPU_PAGE_SIZE, +- flags); ++ radeon_vm_frag_ptes(rdev, ib, last_pte, ++ last_pte + 8 * count, ++ last_dst, flags); + } + + count = nptes; +@@ -1084,9 +1161,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev, + } + + if (count) { +- radeon_asic_vm_set_page(rdev, ib, last_pte, +- last_dst, count, +- RADEON_GPU_PAGE_SIZE, flags); ++ radeon_vm_frag_ptes(rdev, ib, last_pte, ++ last_pte + 8 * count, ++ last_dst, flags); + } + } + +diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c +index 8a0bc79..46b57e1 100644 +--- a/drivers/gpu/drm/radeon/si.c ++++ b/drivers/gpu/drm/radeon/si.c +@@ -3942,18 +3942,21 @@ static int si_pcie_gart_enable(struct radeon_device *rdev) + WREG32(MC_VM_MX_L1_TLB_CNTL, + (0xA << 7) | + ENABLE_L1_TLB | ++ ENABLE_L1_FRAGMENT_PROCESSING | + SYSTEM_ACCESS_MODE_NOT_IN_SYS | + ENABLE_ADVANCED_DRIVER_MODEL | + SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); + /* Setup L2 cache */ + WREG32(VM_L2_CNTL, ENABLE_L2_CACHE | ++ ENABLE_L2_FRAGMENT_PROCESSING | + ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE | + ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE | + EFFECTIVE_L2_QUEUE_SIZE(7) | + CONTEXT1_IDENTITY_ACCESS_MODE(1)); + WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); + WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | +- L2_CACHE_BIGK_FRAGMENT_SIZE(0)); ++ BANK_SELECT(4) | ++ L2_CACHE_BIGK_FRAGMENT_SIZE(4)); + /* setup context0 */ + WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); + WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12); +-- +1.9.1 + |