Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch | 311 |
1 files changed, 311 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch b/common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch
new file mode 100644
index 00000000..a64f09ea
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch
@@ -0,0 +1,311 @@
+From 09a25a2290321c636b800bf671328c7fe57e110b Mon Sep 17 00:00:00 2001
+From: Huang Rui <ray.huang@amd.com>
+Date: Mon, 6 Aug 2018 10:57:08 +0800
+Subject: [PATCH 0187/2940] drm/amdgpu: use bulk moves for efficient VM LRU
+ handling (v6)
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This continues the bulk-move work based on Christian's proposal.
+
+Background:
+The amdgpu driver moves all PD/PT and per-VM BOs onto the idle list and then
+moves each of them to the end of the LRU list one by one. Moving that many
+BOs individually to the end of the LRU seriously hurts performance.
+
+Christian then provided a workaround that avoids moving PD/PT BOs on the LRU
+with the patch below:
+Commit 0bbf32026cf5ba41e9922b30e26e1bed1ecd38ae ("drm/amdgpu: band aid
+validating VM PTs")
+
+However, the proper solution is to bulk move all PD/PT and per-VM BOs on the
+LRU instead of moving them one by one.
+
+Whenever amdgpu_vm_validate_pt_bos() is called and we have BOs which need to
+be validated, we move all BOs together to the end of the LRU without dropping
+the LRU lock.
+
+While doing so we note the beginning and end of this block in the LRU list.
+
+Now when amdgpu_vm_validate_pt_bos() is called and we don't have anything to do,
+we don't move every BO one by one, but instead cut the LRU list into pieces so
+that we bulk move everything to the end in just one operation.
+
+Test data:
++--------------+-----------------+-----------+---------------------------------------+
+|              |The Talos        |Clpeak(OCL)|BusSpeedReadback(OCL)                  |
+|              |Principle(Vulkan)|           |                                       |
++------------------------------------------------------------------------------------+
+|              |                 |           |0.319 ms(1K) 0.314 ms(2K) 0.308 ms(4K) |
+|  Original    |  147.7 FPS      |  76.86 us |0.307 ms(8K) 0.310 ms(16K)             |
++------------------------------------------------------------------------------------+
+| Original + WA|                 |           |0.254 ms(1K) 0.241 ms(2K)              |
+|(don't move   |  162.1 FPS      |  42.15 us |0.230 ms(4K) 0.223 ms(8K) 0.204 ms(16K)|
+|PT BOs on LRU)|                 |           |                                       |
++------------------------------------------------------------------------------------+
+|  Bulk move   |  163.1 FPS      |  40.52 us |0.244 ms(1K) 0.252 ms(2K) 0.213 ms(4K) |
+|              |                 |           |0.214 ms(8K) 0.225 ms(16K)             |
++--------------+-----------------+-----------+---------------------------------------+
+
+Testing with the three benchmarks above (Vulkan and OpenCL) shows a visible
+improvement over the original, and even better results than the original with
+the workaround.
+
+v2: move all BOs, including those on the idle, relocated, and moved lists, to
+the end of the LRU and keep them together.
+v3: remove unused parameter and use list_for_each_entry instead of the safe
+variant.
+v4: call amdgpu_vm_move_to_lru_tail() after command submission; at that time
+all BOs are back on the idle list.
+v5: remove amdgpu_vm_move_to_lru_tail_by_list(), use bulk_moveable instead of
+validated, and also move ttm_bo_bulk_move_lru_tail() into
+amdgpu_vm_move_to_lru_tail().
+v6: clean up and fix return value.
+
+Change-Id: I51908d5affe817219d24ad0a1f5da289b2e9f560
+Signed-off-by: Christian König <christian.koenig@amd.com>
+Signed-off-by: Huang Rui <ray.huang@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Reviewed-by: Junwei Zhang <Jerry.Zhang@amd.com>
+Acked-by: Chunming Zhou <david1.zhou@amd.com>
+Tested-by: Mike Lothian <mike@fireburn.co.uk>
+Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
+Signed-off-by: Kalyan Alle <kalyan.alle@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  3 ++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 66 +++++++++++++++++---------
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 11 ++++-
+ include/drm/ttm/ttm_bo_api.h           | 10 ++++
+ include/drm/ttm/ttm_bo_driver.h        | 28 +++++++++++
+ 5 files changed, 95 insertions(+), 23 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+index 3c1443d7fb84..8875c18dcfc0 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+@@ -1274,6 +1274,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
+ 	union drm_amdgpu_cs *cs = data;
+ 	struct amdgpu_cs_parser parser = {};
+ 	bool reserved_buffers = false;
++	struct amdgpu_fpriv *fpriv;
+ 	int i, r;
+ 
+ 	if (!adev->accel_working)
+@@ -1318,6 +1319,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
+ 
+ 	r = amdgpu_cs_submit(&parser, cs);
+ 
++	fpriv = filp->driver_priv;
++	amdgpu_vm_move_to_lru_tail(adev, &fpriv->vm);
+ out:
+ 	amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
+ 	return r;
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+index 52b1cdea845a..617085b6e22c 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+@@ -268,6 +268,47 @@ void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
+ 	list_add(&entry->tv.head, validated);
+ }
+ 
++/**
++ * amdgpu_vm_move_to_lru_tail - move all BOs to the end of LRU
++ *
++ * @adev: amdgpu device pointer
++ * @vm: vm providing the BOs
++ *
++ * Move all BOs to the end of LRU and remember their positions to put them
++ * together.
++ */
++void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
++				struct amdgpu_vm *vm)
++{
++	struct ttm_bo_global *glob = adev->mman.bdev.glob;
++	struct amdgpu_vm_bo_base *bo_base;
++
++	if (vm->bulk_moveable) {
++		spin_lock(&glob->lru_lock);
++		ttm_bo_bulk_move_lru_tail(&vm->lru_bulk_move);
++		spin_unlock(&glob->lru_lock);
++		return;
++	}
++
++	memset(&vm->lru_bulk_move, 0, sizeof(vm->lru_bulk_move));
++
++	spin_lock(&glob->lru_lock);
++	list_for_each_entry(bo_base, &vm->idle, vm_status) {
++		struct amdgpu_bo *bo = bo_base->bo;
++
++		if (!bo->parent)
++			continue;
++
++		ttm_bo_move_to_lru_tail(&bo->tbo, &vm->lru_bulk_move);
++		if (bo->shadow)
++			ttm_bo_move_to_lru_tail(&bo->shadow->tbo,
++						&vm->lru_bulk_move);
++	}
++	spin_unlock(&glob->lru_lock);
++
++	vm->bulk_moveable = true;
++}
++
+ /**
+  * amdgpu_vm_validate_pt_bos - validate the page table BOs
+  *
+@@ -285,10 +326,11 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ 			      int (*validate)(void *p, struct amdgpu_bo *bo),
+ 			      void *param)
+ {
+-	struct ttm_bo_global *glob = adev->mman.bdev.glob;
+ 	struct amdgpu_vm_bo_base *bo_base, *tmp;
+ 	int r = 0;
+ 
++	vm->bulk_moveable &= list_empty(&vm->evicted);
++
+ 	list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) {
+ 		struct amdgpu_bo *bo = bo_base->bo;
+ 
+@@ -296,14 +338,6 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ 		if (r)
+ 			break;
+ 
+-		if (bo->parent) {
+-			spin_lock(&glob->lru_lock);
+-			ttm_bo_move_to_lru_tail(&bo->tbo, NULL);
+-			if (bo->shadow)
+-				ttm_bo_move_to_lru_tail(&bo->shadow->tbo, NULL);
+-			spin_unlock(&glob->lru_lock);
+-		}
+-
+ 		if (bo->tbo.type != ttm_bo_type_kernel) {
+ 			spin_lock(&vm->moved_lock);
+ 			list_move(&bo_base->vm_status, &vm->moved);
+@@ -313,19 +347,6 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ 		}
+ 	}
+ 
+-	spin_lock(&glob->lru_lock);
+-	list_for_each_entry(bo_base, &vm->idle, vm_status) {
+-		struct amdgpu_bo *bo = bo_base->bo;
+-
+-		if (!bo->parent)
+-			continue;
+-
+-		ttm_bo_move_to_lru_tail(&bo->tbo, NULL);
+-		if (bo->shadow)
+-			ttm_bo_move_to_lru_tail(&bo->shadow->tbo, NULL);
+-	}
+-	spin_unlock(&glob->lru_lock);
+-
+ 	return r;
+ }
+ 
+@@ -2639,6 +2660,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ 		return r;
+ 
+ 	vm->pte_support_ats = false;
++	vm->bulk_moveable = true;
+ 
+ 	if (vm_context == AMDGPU_VM_CONTEXT_COMPUTE) {
+ 		vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+index ab1d23e4b8ad..7a461eb76d44 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+@@ -29,6 +29,7 @@
+ #include <linux/rbtree.h>
+ #include <drm/gpu_scheduler.h>
+ #include <drm/drm_file.h>
++#include <drm/ttm/ttm_bo_driver.h>
+ 
+ #include "amdgpu_sync.h"
+ #include "amdgpu_ring.h"
+@@ -247,6 +248,11 @@ struct amdgpu_vm {
+ 
+ 	/* Some basic info about the task */
+ 	struct amdgpu_task_info task_info;
++
++	/* Store positions of group of BOs */
++	struct ttm_lru_bulk_move lru_bulk_move;
++	/* mark whether can do the bulk move */
++	bool bulk_moveable;
+ };
+ 
+ struct amdgpu_vm_manager {
+@@ -354,8 +360,11 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
+ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
+ 
+ void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
+-			 struct amdgpu_task_info *task_info);
++			     struct amdgpu_task_info *task_info);
+ 
+ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
+ 
++void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
++				struct amdgpu_vm *vm);
++
+ #endif
+diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
+index 92d0a1e892c3..e6955ce24d73 100644
+--- a/include/drm/ttm/ttm_bo_api.h
++++ b/include/drm/ttm/ttm_bo_api.h
+@@ -417,6 +417,16 @@ void ttm_bo_del_from_lru(struct ttm_buffer_object *bo);
+ void ttm_bo_move_to_lru_tail(struct ttm_buffer_object *bo,
+ 			     struct ttm_lru_bulk_move *bulk);
+ 
++/**
++ * ttm_bo_bulk_move_lru_tail
++ *
++ * @bulk: bulk move structure
++ *
++ * Bulk move BOs to the LRU tail, only valid to use when driver makes sure that
++ * BO order never changes. Should be called with ttm_bo_global::lru_lock held.
++ */
++void ttm_bo_bulk_move_lru_tail(struct ttm_lru_bulk_move *bulk);
++
+ /**
+  * ttm_bo_lock_delayed_workqueue
+  *
+diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
+index 3234cc322e70..17e20781ca7a 100644
+--- a/include/drm/ttm/ttm_bo_driver.h
++++ b/include/drm/ttm/ttm_bo_driver.h
+@@ -490,6 +490,34 @@ struct ttm_bo_device {
+ 	bool no_retry;
+ };
+ 
++/**
++ * struct ttm_lru_bulk_move_pos
++ *
++ * @first: first BO in the bulk move range
++ * @last: last BO in the bulk move range
++ *
++ * Positions for a lru bulk move.
++ */
++struct ttm_lru_bulk_move_pos {
++	struct ttm_buffer_object *first;
++	struct ttm_buffer_object *last;
++};
++
++/**
++ * struct ttm_lru_bulk_move
++ *
++ * @tt: first/last lru entry for BOs in the TT domain
++ * @vram: first/last lru entry for BOs in the VRAM domain
++ * @swap: first/last lru entry for BOs on the swap list
++ *
++ * Helper structure for bulk moves on the LRU list.
++ */
++struct ttm_lru_bulk_move {
++	struct ttm_lru_bulk_move_pos tt[TTM_MAX_BO_PRIORITY];
++	struct ttm_lru_bulk_move_pos vram[TTM_MAX_BO_PRIORITY];
++	struct ttm_lru_bulk_move_pos swap[TTM_MAX_BO_PRIORITY];
++};
++
+ /**
+  * ttm_flag_masked
+  *
+-- 
+2.17.1
+
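
Note on the mechanism: the structures added to ttm_bo_driver.h above only remember the first and last buffer object of a contiguous block, per LRU (TT, VRAM, swap) and per priority. The speed-up comes from the replay path: instead of touching every BO again, the whole [first, last] range is spliced to the tail of the LRU in one step. The following is a minimal, self-contained userspace sketch of that splicing idea, given purely as an illustration under stated assumptions; it is not the TTM implementation, and the names node, bulk_pos, list_add_tail and lru_splice_to_tail are invented for the example, standing in loosely for list_head, ttm_lru_bulk_move_pos and ttm_bo_bulk_move_lru_tail().

/*
 * Illustrative sketch only - NOT the real TTM code. A contiguous block of
 * entries is recorded once as [first, last] and later spliced to the tail
 * of a circular doubly linked list in a single operation.
 */
#include <stdio.h>
#include <stddef.h>

struct node {
        int id;
        struct node *prev, *next;
};

/* circular list with a dummy head, similar in spirit to the kernel's list_head */
static void list_init(struct node *head)
{
        head->prev = head->next = head;
}

static void list_add_tail(struct node *n, struct node *head)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

/* remembered begin/end of a block, analogous to ttm_lru_bulk_move_pos */
struct bulk_pos {
        struct node *first;
        struct node *last;
};

/*
 * Splice the whole [first, last] range to the tail of the list in one step.
 * Only valid if the relative order of the range has not changed since it was
 * recorded - the role played by the driver's bulk_moveable flag above.
 */
static void lru_splice_to_tail(struct bulk_pos *pos, struct node *head)
{
        if (!pos->first || !pos->last)
                return;

        /* unlink the range from its current position */
        pos->first->prev->next = pos->last->next;
        pos->last->next->prev = pos->first->prev;

        /* re-link it just before the dummy head, i.e. at the tail */
        pos->first->prev = head->prev;
        pos->last->next = head;
        head->prev->next = pos->first;
        head->prev = pos->last;
}

int main(void)
{
        struct node head, nodes[6];
        struct bulk_pos pos = { NULL, NULL };
        struct node *n;
        int i;

        list_init(&head);
        for (i = 0; i < 6; i++) {
                nodes[i].id = i;
                list_add_tail(&nodes[i], &head);
        }

        /* pretend nodes 1..3 are one VM's page table BOs: record the block once */
        pos.first = &nodes[1];
        pos.last = &nodes[3];

        /* later, move the whole block to the LRU tail in a single operation */
        lru_splice_to_tail(&pos, &head);

        for (n = head.next; n != &head; n = n->next)
                printf("%d ", n->id);   /* prints: 0 4 5 1 2 3 */
        printf("\n");
        return 0;
}

The splice updates only six pointers no matter how many BOs lie between first and last, which is why replaying the bulk move (the vm->bulk_moveable fast path in amdgpu_vm_move_to_lru_tail() above) is so much cheaper than moving every BO to the LRU tail individually.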