diff options
Diffstat (limited to 'meta-baldeagle/recipes-kernel/linux/linux-yocto/0018-yocto-poky-dora-10.0.0-amd-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch')
-rw-r--r-- | meta-baldeagle/recipes-kernel/linux/linux-yocto/0018-yocto-poky-dora-10.0.0-amd-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch | 237 |
1 files changed, 0 insertions, 237 deletions
diff --git a/meta-baldeagle/recipes-kernel/linux/linux-yocto/0018-yocto-poky-dora-10.0.0-amd-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch b/meta-baldeagle/recipes-kernel/linux/linux-yocto/0018-yocto-poky-dora-10.0.0-amd-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch deleted file mode 100644 index 6498c071..00000000 --- a/meta-baldeagle/recipes-kernel/linux/linux-yocto/0018-yocto-poky-dora-10.0.0-amd-drm-radeon-add-large-PTE-support-for-NI-SI-and-CIK-v.patch +++ /dev/null @@ -1,237 +0,0 @@ -From a2bc39a6394bb8e11060df3da33d603a66ccf9f6 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com> -Date: Fri, 25 Oct 2013 18:07:55 +0200 -Subject: [PATCH 18/44] drm/radeon: add large PTE support for NI, SI and CIK - v3 -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -This patch implements support for VRAM page table entry compression. -PTE construction is enhanced to identify physically contiguous page -ranges and mark them in the PTE fragment field. L1 TLB and L2 cache -support is enabled for 64KB (SI/CIK) and 256KB (NI) PTE fragments, -significantly improving TLB utilization for VRAM allocations. - -Linear store bandwidth is improved from 60GB/s to 125GB/s on Pitcairn. -Unigine Heaven 3.0 sees an average improvement from 24.7 to 27.7 FPS -on default settings at 1920x1200 resolution with vsync disabled. - -See main comment in radeon_gart.c gives a technical description. - -v2 (chk): rebased and simplified. -v3 (chk): add missing hw setup - -Signed-off-by: Jay Cornwall <jay@jcornwall.me> -Signed-off-by: Christian König <christian.koenig@amd.com> ---- - drivers/gpu/drm/radeon/cik.c | 4 +- - drivers/gpu/drm/radeon/ni.c | 2 + - drivers/gpu/drm/radeon/radeon.h | 5 ++ - drivers/gpu/drm/radeon/radeon_gart.c | 91 +++++++++++++++++++++++++++++++--- - drivers/gpu/drm/radeon/si.c | 5 +- - 5 files changed, 98 insertions(+), 9 deletions(-) - -diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c -index d7e86ef3..27fa479 100644 ---- a/drivers/gpu/drm/radeon/cik.c -+++ b/drivers/gpu/drm/radeon/cik.c -@@ -4522,6 +4522,7 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) - WREG32(MC_VM_MX_L1_TLB_CNTL, - (0xA << 7) | - ENABLE_L1_TLB | -+ ENABLE_L1_FRAGMENT_PROCESSING | - SYSTEM_ACCESS_MODE_NOT_IN_SYS | - ENABLE_ADVANCED_DRIVER_MODEL | - SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); -@@ -4534,7 +4535,8 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) - CONTEXT1_IDENTITY_ACCESS_MODE(1)); - WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); - WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | -- L2_CACHE_BIGK_FRAGMENT_SIZE(6)); -+ BANK_SELECT(4) | -+ L2_CACHE_BIGK_FRAGMENT_SIZE(4)); - /* setup context0 */ - WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); - WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12); -diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c -index f59a9e9..9291982 100644 ---- a/drivers/gpu/drm/radeon/ni.c -+++ b/drivers/gpu/drm/radeon/ni.c -@@ -1227,12 +1227,14 @@ static int cayman_pcie_gart_enable(struct radeon_device *rdev) - SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); - /* Setup L2 cache */ - WREG32(VM_L2_CNTL, ENABLE_L2_CACHE | -+ ENABLE_L2_FRAGMENT_PROCESSING | - ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE | - ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE | - EFFECTIVE_L2_QUEUE_SIZE(7) | - CONTEXT1_IDENTITY_ACCESS_MODE(1)); - WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); - WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | -+ BANK_SELECT(6) | - L2_CACHE_BIGK_FRAGMENT_SIZE(6)); - /* setup context0 */ - WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); -diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h -index b987f01..06b3375 100644 ---- a/drivers/gpu/drm/radeon/radeon.h -+++ b/drivers/gpu/drm/radeon/radeon.h -@@ -846,6 +846,11 @@ struct radeon_mec { - #define R600_PTE_READABLE (1 << 5) - #define R600_PTE_WRITEABLE (1 << 6) - -+/* PTE (Page Table Entry) fragment field for different page sizes */ -+#define R600_PTE_FRAG_4KB (0 << 7) -+#define R600_PTE_FRAG_64KB (4 << 7) -+#define R600_PTE_FRAG_256KB (6 << 7) -+ - struct radeon_vm { - struct list_head list; - struct list_head va; -diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c -index 9ceabdf..f960ce6 100644 ---- a/drivers/gpu/drm/radeon/radeon_gart.c -+++ b/drivers/gpu/drm/radeon/radeon_gart.c -@@ -1021,6 +1021,84 @@ retry: - } - - /** -+ * radeon_vm_frag_ptes - add fragment information to PTEs -+ * -+ * @rdev: radeon_device pointer -+ * @ib: IB for the update -+ * @pe_start: first PTE to handle -+ * @pe_end: last PTE to handle -+ * @addr: addr those PTEs should point to -+ * @flags: hw mapping flags -+ * -+ * Global and local mutex must be locked! -+ */ -+static void radeon_vm_frag_ptes(struct radeon_device *rdev, -+ struct radeon_ib *ib, -+ uint64_t pe_start, uint64_t pe_end, -+ uint64_t addr, uint32_t flags) -+{ -+ /** -+ * The MC L1 TLB supports variable sized pages, based on a fragment -+ * field in the PTE. When this field is set to a non-zero value, page -+ * granularity is increased from 4KB to (1 << (12 + frag)). The PTE -+ * flags are considered valid for all PTEs within the fragment range -+ * and corresponding mappings are assumed to be physically contiguous. -+ * -+ * The L1 TLB can store a single PTE for the whole fragment, -+ * significantly increasing the space available for translation -+ * caching. This leads to large improvements in throughput when the -+ * TLB is under pressure. -+ * -+ * The L2 cache distributes small and large fragments into two -+ * asymmetric partitions. The large fragment cache is significantly -+ * larger. Thus, we try to use large fragments wherever possible. -+ * Userspace can support this by aligning virtual base address and -+ * allocation size to the fragment size. -+ */ -+ -+ /* NI is optimized for 256KB fragments, SI and newer for 64KB */ -+ uint64_t frag_flags = rdev->family == CHIP_CAYMAN ? -+ R600_PTE_FRAG_256KB : R600_PTE_FRAG_64KB; -+ uint64_t frag_align = rdev->family == CHIP_CAYMAN ? 0x200 : 0x80; -+ -+ uint64_t frag_start = ALIGN(pe_start, frag_align); -+ uint64_t frag_end = pe_end & ~(frag_align - 1); -+ -+ unsigned count; -+ -+ /* system pages are non continuously */ -+ if ((flags & R600_PTE_SYSTEM) || !(flags & R600_PTE_VALID) || -+ (frag_start >= frag_end)) { -+ -+ count = (pe_end - pe_start) / 8; -+ radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count, -+ RADEON_GPU_PAGE_SIZE, flags); -+ return; -+ } -+ -+ /* handle the 4K area at the beginning */ -+ if (pe_start != frag_start) { -+ count = (frag_start - pe_start) / 8; -+ radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count, -+ RADEON_GPU_PAGE_SIZE, flags); -+ addr += RADEON_GPU_PAGE_SIZE * count; -+ } -+ -+ /* handle the area in the middle */ -+ count = (frag_end - frag_start) / 8; -+ radeon_asic_vm_set_page(rdev, ib, frag_start, addr, count, -+ RADEON_GPU_PAGE_SIZE, flags | frag_flags); -+ -+ /* handle the 4K area at the end */ -+ if (frag_end != pe_end) { -+ addr += RADEON_GPU_PAGE_SIZE * count; -+ count = (pe_end - frag_end) / 8; -+ radeon_asic_vm_set_page(rdev, ib, frag_end, addr, count, -+ RADEON_GPU_PAGE_SIZE, flags); -+ } -+} -+ -+/** - * radeon_vm_update_ptes - make sure that page tables are valid - * - * @rdev: radeon_device pointer -@@ -1066,10 +1144,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev, - if ((last_pte + 8 * count) != pte) { - - if (count) { -- radeon_asic_vm_set_page(rdev, ib, last_pte, -- last_dst, count, -- RADEON_GPU_PAGE_SIZE, -- flags); -+ radeon_vm_frag_ptes(rdev, ib, last_pte, -+ last_pte + 8 * count, -+ last_dst, flags); - } - - count = nptes; -@@ -1084,9 +1161,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev, - } - - if (count) { -- radeon_asic_vm_set_page(rdev, ib, last_pte, -- last_dst, count, -- RADEON_GPU_PAGE_SIZE, flags); -+ radeon_vm_frag_ptes(rdev, ib, last_pte, -+ last_pte + 8 * count, -+ last_dst, flags); - } - } - -diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c -index 0334f3e..d9173ef 100644 ---- a/drivers/gpu/drm/radeon/si.c -+++ b/drivers/gpu/drm/radeon/si.c -@@ -3928,18 +3928,21 @@ static int si_pcie_gart_enable(struct radeon_device *rdev) - WREG32(MC_VM_MX_L1_TLB_CNTL, - (0xA << 7) | - ENABLE_L1_TLB | -+ ENABLE_L1_FRAGMENT_PROCESSING | - SYSTEM_ACCESS_MODE_NOT_IN_SYS | - ENABLE_ADVANCED_DRIVER_MODEL | - SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); - /* Setup L2 cache */ - WREG32(VM_L2_CNTL, ENABLE_L2_CACHE | -+ ENABLE_L2_FRAGMENT_PROCESSING | - ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE | - ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE | - EFFECTIVE_L2_QUEUE_SIZE(7) | - CONTEXT1_IDENTITY_ACCESS_MODE(1)); - WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); - WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | -- L2_CACHE_BIGK_FRAGMENT_SIZE(0)); -+ BANK_SELECT(4) | -+ L2_CACHE_BIGK_FRAGMENT_SIZE(4)); - /* setup context0 */ - WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); - WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12); --- -1.7.9.5 - |