Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch')
-rw-r--r-- common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch | 311
1 file changed, 311 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch b/common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch
new file mode 100644
index 00000000..a64f09ea
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch
@@ -0,0 +1,311 @@
+From 09a25a2290321c636b800bf671328c7fe57e110b Mon Sep 17 00:00:00 2001
+From: Huang Rui <ray.huang@amd.com>
+Date: Mon, 6 Aug 2018 10:57:08 +0800
+Subject: [PATCH 0187/2940] drm/amdgpu: use bulk moves for efficient VM LRU
+ handling (v6)
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+I am continuing the work on bulk moves based on the proposal by Christian.
+
+Background:
+The amdgpu driver moves all PD/PT and PerVM BOs onto the idle list and then
+moves each of them to the end of the LRU list one by one. Moving that many BOs
+to the end of the LRU individually hurts performance seriously.
+
+Christian then provided a workaround that avoids moving PD/PT BOs on the LRU,
+in the following patch:
+Commit 0bbf32026cf5ba41e9922b30e26e1bed1ecd38ae ("drm/amdgpu: band aid
+validating VM PTs")
+
+However, the proper solution is to bulk move all PD/PT and PerVM BOs on the LRU
+instead of moving them one by one.
+
+Whenever amdgpu_vm_validate_pt_bos() is called and we have BOs which need to be
+validated, we move all of them together to the end of the LRU without dropping
+the LRU lock.
+
+While doing so, we note the beginning and end of this block in the LRU list.
+
+Now when amdgpu_vm_validate_pt_bos() is called and there is nothing to
+validate, we no longer move every BO one by one; instead we cut the LRU list
+into pieces so that everything is bulk moved to the end in just one operation
+(see the sketch below).
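+
+To illustrate the cut, here is a minimal sketch of the idea on a plain
+list_head, using the standard kernel list helpers from <linux/list.h>. The
+helper name bulk_move_tail is invented for illustration only; in this patch
+the real work is done per LRU by ttm_bo_bulk_move_lru_tail(), which is only
+declared here and implemented separately on the TTM side:
+
+/* Sketch only: move the contiguous block [first, last] to the tail of
+ * @lru in O(1), without touching the nodes in between. Assumes the block
+ * is still contiguous on @lru (which is what bulk_moveable tracks) and
+ * that the caller holds the LRU lock.
+ */
+static void bulk_move_tail(struct list_head *lru,
+			   struct list_head *first, struct list_head *last)
+{
+	LIST_HEAD(entries);
+	LIST_HEAD(before);
+
+	/* Cut everything from the front up to and including @last. */
+	list_cut_position(&entries, lru, last);
+	/* Split off what precedes @first and put it back on @lru. */
+	list_cut_position(&before, &entries, first->prev);
+	list_splice(&before, lru);
+	/* Splice the whole [first, last] block to the tail in one step. */
+	list_splice_tail(&entries, lru);
+}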
+
+Test data:
++--------------+-----------------+-----------+---------------------------------------+
+|              |The Talos        |Clpeak(OCL)|BusSpeedReadback(OCL)                  |
+|              |Principle(Vulkan)|           |                                       |
++------------------------------------------------------------------------------------+
+|              |                 |           |0.319 ms(1K) 0.314 ms(2K) 0.308 ms(4K) |
+| Original     |    147.7 FPS    |  76.86 us |0.307 ms(8K) 0.310 ms(16K)             |
++------------------------------------------------------------------------------------+
+| Original + WA|                 |           |0.254 ms(1K) 0.241 ms(2K)              |
+|(don't move   |    162.1 FPS    |  42.15 us |0.230 ms(4K) 0.223 ms(8K) 0.204 ms(16K)|
+|PT BOs on LRU)|                 |           |                                       |
++------------------------------------------------------------------------------------+
+| Bulk move    |    163.1 FPS    |  40.52 us |0.244 ms(1K) 0.252 ms(2K) 0.213 ms(4K) |
+|              |                 |           |0.214 ms(8K) 0.225 ms(16K)             |
++--------------+-----------------+-----------+---------------------------------------+
+
+Testing with the three benchmarks above, covering Vulkan and OpenCL, shows a
+visible improvement over the original code, and results even better than the
+original with the workaround.
+
+v2: move all BOs, including those on the idle, relocated, and moved lists, to
+the end of the LRU and keep them together.
+v3: remove an unused parameter and use list_for_each_entry instead of the
+_safe variant.
+v4: call amdgpu_vm_move_to_lru_tail() after command submission; at that point
+all BOs are back on the idle list.
+v5: remove amdgpu_vm_move_to_lru_tail_by_list(), use bulk_moveable instead of
+validated, and also move ttm_bo_bulk_move_lru_tail() into
+amdgpu_vm_move_to_lru_tail().
+v6: clean up and fix the return value.
+
+Change-Id: I51908d5affe817219d24ad0a1f5da289b2e9f560
+Signed-off-by: Christian König <christian.koenig@amd.com>
+Signed-off-by: Huang Rui <ray.huang@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Reviewed-by: Junwei Zhang <Jerry.Zhang@amd.com>
+Acked-by: Chunming Zhou <david1.zhou@amd.com>
+Tested-by: Mike Lothian <mike@fireburn.co.uk>
+Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
+Signed-off-by: Kalyan Alle <kalyan.alle@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 3 ++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 66 +++++++++++++++++---------
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 11 ++++-
+ include/drm/ttm/ttm_bo_api.h | 10 ++++
+ include/drm/ttm/ttm_bo_driver.h | 28 +++++++++++
+ 5 files changed, 95 insertions(+), 23 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+index 3c1443d7fb84..8875c18dcfc0 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+@@ -1274,6 +1274,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
+ union drm_amdgpu_cs *cs = data;
+ struct amdgpu_cs_parser parser = {};
+ bool reserved_buffers = false;
++ struct amdgpu_fpriv *fpriv;
+ int i, r;
+
+ if (!adev->accel_working)
+@@ -1318,6 +1319,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
+
+ r = amdgpu_cs_submit(&parser, cs);
+
++ fpriv = filp->driver_priv;
++ amdgpu_vm_move_to_lru_tail(adev, &fpriv->vm);
+ out:
+ amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
+ return r;
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+index 52b1cdea845a..617085b6e22c 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+@@ -268,6 +268,47 @@ void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
+ list_add(&entry->tv.head, validated);
+ }
+
++/**
++ * amdgpu_vm_move_to_lru_tail - move all BOs to the end of LRU
++ *
++ * @adev: amdgpu device pointer
++ * @vm: vm providing the BOs
++ *
++ * Move all BOs to the end of the LRU and remember their positions so that
++ * they can be bulk moved together later.
++ */
++void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
++ struct amdgpu_vm *vm)
++{
++ struct ttm_bo_global *glob = adev->mman.bdev.glob;
++ struct amdgpu_vm_bo_base *bo_base;
++
++ if (vm->bulk_moveable) {
++ spin_lock(&glob->lru_lock);
++ ttm_bo_bulk_move_lru_tail(&vm->lru_bulk_move);
++ spin_unlock(&glob->lru_lock);
++ return;
++ }
++
++ memset(&vm->lru_bulk_move, 0, sizeof(vm->lru_bulk_move));
++
++ spin_lock(&glob->lru_lock);
++ list_for_each_entry(bo_base, &vm->idle, vm_status) {
++ struct amdgpu_bo *bo = bo_base->bo;
++
++ if (!bo->parent)
++ continue;
++
++ ttm_bo_move_to_lru_tail(&bo->tbo, &vm->lru_bulk_move);
++ if (bo->shadow)
++ ttm_bo_move_to_lru_tail(&bo->shadow->tbo,
++ &vm->lru_bulk_move);
++ }
++ spin_unlock(&glob->lru_lock);
++
++ vm->bulk_moveable = true;
++}
++
+ /**
+ * amdgpu_vm_validate_pt_bos - validate the page table BOs
+ *
+@@ -285,10 +326,11 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ int (*validate)(void *p, struct amdgpu_bo *bo),
+ void *param)
+ {
+- struct ttm_bo_global *glob = adev->mman.bdev.glob;
+ struct amdgpu_vm_bo_base *bo_base, *tmp;
+ int r = 0;
+
++ vm->bulk_moveable &= list_empty(&vm->evicted);
++
+ list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) {
+ struct amdgpu_bo *bo = bo_base->bo;
+
+@@ -296,14 +338,6 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ if (r)
+ break;
+
+- if (bo->parent) {
+- spin_lock(&glob->lru_lock);
+- ttm_bo_move_to_lru_tail(&bo->tbo, NULL);
+- if (bo->shadow)
+- ttm_bo_move_to_lru_tail(&bo->shadow->tbo, NULL);
+- spin_unlock(&glob->lru_lock);
+- }
+-
+ if (bo->tbo.type != ttm_bo_type_kernel) {
+ spin_lock(&vm->moved_lock);
+ list_move(&bo_base->vm_status, &vm->moved);
+@@ -313,19 +347,6 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ }
+ }
+
+- spin_lock(&glob->lru_lock);
+- list_for_each_entry(bo_base, &vm->idle, vm_status) {
+- struct amdgpu_bo *bo = bo_base->bo;
+-
+- if (!bo->parent)
+- continue;
+-
+- ttm_bo_move_to_lru_tail(&bo->tbo, NULL);
+- if (bo->shadow)
+- ttm_bo_move_to_lru_tail(&bo->shadow->tbo, NULL);
+- }
+- spin_unlock(&glob->lru_lock);
+-
+ return r;
+ }
+
+@@ -2639,6 +2660,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ return r;
+
+ vm->pte_support_ats = false;
++ vm->bulk_moveable = true;
+
+ if (vm_context == AMDGPU_VM_CONTEXT_COMPUTE) {
+ vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+index ab1d23e4b8ad..7a461eb76d44 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+@@ -29,6 +29,7 @@
+ #include <linux/rbtree.h>
+ #include <drm/gpu_scheduler.h>
+ #include <drm/drm_file.h>
++#include <drm/ttm/ttm_bo_driver.h>
+
+ #include "amdgpu_sync.h"
+ #include "amdgpu_ring.h"
+@@ -247,6 +248,11 @@ struct amdgpu_vm {
+
+ /* Some basic info about the task */
+ struct amdgpu_task_info task_info;
++
++ /* Store the positions of a group of BOs */
++ struct ttm_lru_bulk_move lru_bulk_move;
++ /* mark whether we can do the bulk move */
++ bool bulk_moveable;
+ };
+
+ struct amdgpu_vm_manager {
+@@ -354,8 +360,11 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
+ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
+
+ void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
+- struct amdgpu_task_info *task_info);
++ struct amdgpu_task_info *task_info);
+
+ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
+
++void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
++ struct amdgpu_vm *vm);
++
+ #endif
+diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
+index 92d0a1e892c3..e6955ce24d73 100644
+--- a/include/drm/ttm/ttm_bo_api.h
++++ b/include/drm/ttm/ttm_bo_api.h
+@@ -417,6 +417,16 @@ void ttm_bo_del_from_lru(struct ttm_buffer_object *bo);
+ void ttm_bo_move_to_lru_tail(struct ttm_buffer_object *bo,
+ struct ttm_lru_bulk_move *bulk);
+
++/**
++ * ttm_bo_bulk_move_lru_tail
++ *
++ * @bulk: bulk move structure
++ *
++ * Bulk move BOs to the LRU tail, only valid to use when the driver makes sure
++ * that BO order never changes. Should be called with ttm_bo_global::lru_lock held.
++ */
++void ttm_bo_bulk_move_lru_tail(struct ttm_lru_bulk_move *bulk);
++
+ /**
+ * ttm_bo_lock_delayed_workqueue
+ *
+diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
+index 3234cc322e70..17e20781ca7a 100644
+--- a/include/drm/ttm/ttm_bo_driver.h
++++ b/include/drm/ttm/ttm_bo_driver.h
+@@ -490,6 +490,34 @@ struct ttm_bo_device {
+ bool no_retry;
+ };
+
++/**
++ * struct ttm_lru_bulk_move_pos
++ *
++ * @first: first BO in the bulk move range
++ * @last: last BO in the bulk move range
++ *
++ * Positions for an LRU bulk move.
++ */
++struct ttm_lru_bulk_move_pos {
++ struct ttm_buffer_object *first;
++ struct ttm_buffer_object *last;
++};
++
++/**
++ * struct ttm_lru_bulk_move
++ *
++ * @tt: first/last lru entry for BOs in the TT domain
++ * @vram: first/last lru entry for BOs in the VRAM domain
++ * @swap: first/last lru entry for BOs on the swap list
++ *
++ * Helper structure for bulk moves on the LRU list.
++ */
++struct ttm_lru_bulk_move {
++ struct ttm_lru_bulk_move_pos tt[TTM_MAX_BO_PRIORITY];
++ struct ttm_lru_bulk_move_pos vram[TTM_MAX_BO_PRIORITY];
++ struct ttm_lru_bulk_move_pos swap[TTM_MAX_BO_PRIORITY];
++};
++
+ /**
+ * ttm_flag_masked
+ *
+--
+2.17.1
+