aboutsummaryrefslogtreecommitdiffstats
path: root/common/recipes-kernel/linux/linux-amd/0018-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch
diff options
context:
space:
mode:
Diffstat (limited to 'common/recipes-kernel/linux/linux-amd/0018-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch')
-rw-r--r--common/recipes-kernel/linux/linux-amd/0018-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch236
1 files changed, 236 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-amd/0018-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch b/common/recipes-kernel/linux/linux-amd/0018-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch
new file mode 100644
index 00000000..f93f84ad
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-amd/0018-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch
@@ -0,0 +1,236 @@
+From c3679d52cb42a2cc76c0c893ad364157dc3699dc Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
+Date: Fri, 25 Oct 2013 18:07:55 +0200
+Subject: [PATCH 18/60] drm/radeon: add large PTE support for NI, SI and CIK v3
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This patch implements support for VRAM page table entry compression.
+PTE construction is enhanced to identify physically contiguous page
+ranges and mark them in the PTE fragment field. L1 TLB and L2 cache
+support is enabled for 64KB (SI/CIK) and 256KB (NI) PTE fragments,
+significantly improving TLB utilization for VRAM allocations.
+
+Linear store bandwidth is improved from 60GB/s to 125GB/s on Pitcairn.
+Unigine Heaven 3.0 sees an average improvement from 24.7 to 27.7 FPS
+on default settings at 1920x1200 resolution with vsync disabled.
+
+The main comment in radeon_gart.c gives a technical description.
+
+v2 (chk): rebased and simplified.
+v3 (chk): add missing hw setup
+
+Signed-off-by: Jay Cornwall <jay@jcornwall.me>
+Signed-off-by: Christian König <christian.koenig@amd.com>
+---
+ drivers/gpu/drm/radeon/cik.c | 4 +-
+ drivers/gpu/drm/radeon/ni.c | 2 +
+ drivers/gpu/drm/radeon/radeon.h | 5 ++
+ drivers/gpu/drm/radeon/radeon_gart.c | 91 +++++++++++++++++++++++++++++++++---
+ drivers/gpu/drm/radeon/si.c | 5 +-
+ 5 files changed, 98 insertions(+), 9 deletions(-)
+
+diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
+index e3bec288..e84005a 100644
+--- a/drivers/gpu/drm/radeon/cik.c
++++ b/drivers/gpu/drm/radeon/cik.c
+@@ -4526,6 +4526,7 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev)
+ WREG32(MC_VM_MX_L1_TLB_CNTL,
+ (0xA << 7) |
+ ENABLE_L1_TLB |
++ ENABLE_L1_FRAGMENT_PROCESSING |
+ SYSTEM_ACCESS_MODE_NOT_IN_SYS |
+ ENABLE_ADVANCED_DRIVER_MODEL |
+ SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
+@@ -4538,7 +4539,8 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev)
+ CONTEXT1_IDENTITY_ACCESS_MODE(1));
+ WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE);
+ WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
+- L2_CACHE_BIGK_FRAGMENT_SIZE(6));
++ BANK_SELECT(4) |
++ L2_CACHE_BIGK_FRAGMENT_SIZE(4));
+ /* setup context0 */
+ WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12);
+ WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12);
+diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
+index 2443d11..a3c7826 100644
+--- a/drivers/gpu/drm/radeon/ni.c
++++ b/drivers/gpu/drm/radeon/ni.c
+@@ -1227,12 +1227,14 @@ static int cayman_pcie_gart_enable(struct radeon_device *rdev)
+ SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
+ /* Setup L2 cache */
+ WREG32(VM_L2_CNTL, ENABLE_L2_CACHE |
++ ENABLE_L2_FRAGMENT_PROCESSING |
+ ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE |
+ ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE |
+ EFFECTIVE_L2_QUEUE_SIZE(7) |
+ CONTEXT1_IDENTITY_ACCESS_MODE(1));
+ WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE);
+ WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
++ BANK_SELECT(6) |
+ L2_CACHE_BIGK_FRAGMENT_SIZE(6));
+ /* setup context0 */
+ WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12);
+diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
+index d478c28..e796f9a 100644
+--- a/drivers/gpu/drm/radeon/radeon.h
++++ b/drivers/gpu/drm/radeon/radeon.h
+@@ -852,6 +852,11 @@ struct radeon_mec {
+ #define R600_PTE_READABLE (1 << 5)
+ #define R600_PTE_WRITEABLE (1 << 6)
+
++/* PTE (Page Table Entry) fragment field for different page sizes */
++#define R600_PTE_FRAG_4KB (0 << 7)
++#define R600_PTE_FRAG_64KB (4 << 7)
++#define R600_PTE_FRAG_256KB (6 << 7)
++
+ struct radeon_vm {
+ struct list_head list;
+ struct list_head va;
+diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c
+index 9ceabdf..f960ce6 100644
+--- a/drivers/gpu/drm/radeon/radeon_gart.c
++++ b/drivers/gpu/drm/radeon/radeon_gart.c
+@@ -1021,6 +1021,84 @@ retry:
+ }
+
+ /**
++ * radeon_vm_frag_ptes - add fragment information to PTEs
++ *
++ * @rdev: radeon_device pointer
++ * @ib: IB for the update
++ * @pe_start: start of the PTE range to handle
++ * @pe_end: end of the PTE range to handle (exclusive)
++ * @addr: addr those PTEs should point to
++ * @flags: hw mapping flags
++ *
++ * Global and local mutex must be locked!
++ */
++static void radeon_vm_frag_ptes(struct radeon_device *rdev,
++ struct radeon_ib *ib,
++ uint64_t pe_start, uint64_t pe_end,
++ uint64_t addr, uint32_t flags)
++{
++ /**
++ * The MC L1 TLB supports variable sized pages, based on a fragment
++ * field in the PTE. When this field is set to a non-zero value, page
++ * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
++ * flags are considered valid for all PTEs within the fragment range
++ * and corresponding mappings are assumed to be physically contiguous.
++ *
++ * The L1 TLB can store a single PTE for the whole fragment,
++ * significantly increasing the space available for translation
++ * caching. This leads to large improvements in throughput when the
++ * TLB is under pressure.
++ *
++ * The L2 cache distributes small and large fragments into two
++ * asymmetric partitions. The large fragment cache is significantly
++ * larger. Thus, we try to use large fragments wherever possible.
++ * Userspace can support this by aligning virtual base address and
++ * allocation size to the fragment size.
++ */
++
++ /* NI is optimized for 256KB fragments, SI and newer for 64KB */
++ uint64_t frag_flags = rdev->family == CHIP_CAYMAN ?
++ R600_PTE_FRAG_256KB : R600_PTE_FRAG_64KB;
++ uint64_t frag_align = rdev->family == CHIP_CAYMAN ? 0x200 : 0x80;
++
++ uint64_t frag_start = ALIGN(pe_start, frag_align);
++ uint64_t frag_end = pe_end & ~(frag_align - 1);
++
++ unsigned count;
++
++ /* system pages are not contiguous, so write plain 4K PTEs for them */
++ if ((flags & R600_PTE_SYSTEM) || !(flags & R600_PTE_VALID) ||
++ (frag_start >= frag_end)) {
++
++ count = (pe_end - pe_start) / 8;
++ radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
++ RADEON_GPU_PAGE_SIZE, flags);
++ return;
++ }
++
++ /* handle the 4K area at the beginning */
++ if (pe_start != frag_start) {
++ count = (frag_start - pe_start) / 8;
++ radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
++ RADEON_GPU_PAGE_SIZE, flags);
++ addr += RADEON_GPU_PAGE_SIZE * count;
++ }
++
++ /* handle the area in the middle */
++ count = (frag_end - frag_start) / 8;
++ radeon_asic_vm_set_page(rdev, ib, frag_start, addr, count,
++ RADEON_GPU_PAGE_SIZE, flags | frag_flags);
++
++ /* handle the 4K area at the end */
++ if (frag_end != pe_end) {
++ addr += RADEON_GPU_PAGE_SIZE * count;
++ count = (pe_end - frag_end) / 8;
++ radeon_asic_vm_set_page(rdev, ib, frag_end, addr, count,
++ RADEON_GPU_PAGE_SIZE, flags);
++ }
++}
++
++/**
+ * radeon_vm_update_ptes - make sure that page tables are valid
+ *
+ * @rdev: radeon_device pointer
+@@ -1066,10 +1144,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev,
+ if ((last_pte + 8 * count) != pte) {
+
+ if (count) {
+- radeon_asic_vm_set_page(rdev, ib, last_pte,
+- last_dst, count,
+- RADEON_GPU_PAGE_SIZE,
+- flags);
++ radeon_vm_frag_ptes(rdev, ib, last_pte,
++ last_pte + 8 * count,
++ last_dst, flags);
+ }
+
+ count = nptes;
+@@ -1084,9 +1161,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev,
+ }
+
+ if (count) {
+- radeon_asic_vm_set_page(rdev, ib, last_pte,
+- last_dst, count,
+- RADEON_GPU_PAGE_SIZE, flags);
++ radeon_vm_frag_ptes(rdev, ib, last_pte,
++ last_pte + 8 * count,
++ last_dst, flags);
+ }
+ }
+
+diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c
+index 8a0bc79..46b57e1 100644
+--- a/drivers/gpu/drm/radeon/si.c
++++ b/drivers/gpu/drm/radeon/si.c
+@@ -3942,18 +3942,21 @@ static int si_pcie_gart_enable(struct radeon_device *rdev)
+ WREG32(MC_VM_MX_L1_TLB_CNTL,
+ (0xA << 7) |
+ ENABLE_L1_TLB |
++ ENABLE_L1_FRAGMENT_PROCESSING |
+ SYSTEM_ACCESS_MODE_NOT_IN_SYS |
+ ENABLE_ADVANCED_DRIVER_MODEL |
+ SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
+ /* Setup L2 cache */
+ WREG32(VM_L2_CNTL, ENABLE_L2_CACHE |
++ ENABLE_L2_FRAGMENT_PROCESSING |
+ ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE |
+ ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE |
+ EFFECTIVE_L2_QUEUE_SIZE(7) |
+ CONTEXT1_IDENTITY_ACCESS_MODE(1));
+ WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE);
+ WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
+- L2_CACHE_BIGK_FRAGMENT_SIZE(0));
++ BANK_SELECT(4) |
++ L2_CACHE_BIGK_FRAGMENT_SIZE(4));
+ /* setup context0 */
+ WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12);
+ WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12);
+--
+1.9.1
+