Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch | 311 |
1 files changed, 311 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch b/common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch
new file mode 100644
index 00000000..a64f09ea
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.19.8/0187-drm-amdgpu-use-bulk-moves-for-efficient-VM-LRU-handl.patch
@@ -0,0 +1,311 @@
+From 09a25a2290321c636b800bf671328c7fe57e110b Mon Sep 17 00:00:00 2001
+From: Huang Rui <ray.huang@amd.com>
+Date: Mon, 6 Aug 2018 10:57:08 +0800
+Subject: [PATCH 0187/2940] drm/amdgpu: use bulk moves for efficient VM LRU
+ handling (v6)
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This continues the bulk-move work based on Christian's proposal.
+
+Background:
+The amdgpu driver moves all PD/PT and per-VM BOs onto the idle list and then
+moves each of them to the end of the LRU list one by one. Moving that many
+BOs individually to the end of the LRU seriously hurts performance.
+
+Christian then provided a workaround that avoids moving PD/PT BOs on the LRU
+with the patch below:
+Commit 0bbf32026cf5ba41e9922b30e26e1bed1ecd38ae ("drm/amdgpu: band aid
+validating VM PTs")
+
+However, the proper solution is to bulk move all PD/PT and per-VM BOs on the
+LRU instead of moving them one by one.
+
+Whenever amdgpu_vm_validate_pt_bos() is called and we have BOs which need to
+be validated, we move all BOs together to the end of the LRU without dropping
+the LRU lock.
+
+While doing so we note the beginning and end of this block in the LRU list.
+
+Now when amdgpu_vm_validate_pt_bos() is called and we don't have anything to do,
+we don't move every BO one by one, but instead cut the LRU list into pieces so
+that we bulk move everything to the end in just one operation.
+
+Test data:
++--------------+-----------------+-----------+---------------------------------------+
+|              |The Talos        |Clpeak(OCL)|BusSpeedReadback(OCL)                  |
+|              |Principle(Vulkan)|           |                                       |
++------------------------------------------------------------------------------------+
+|              |                 |           |0.319 ms(1K) 0.314 ms(2K) 0.308 ms(4K) |
+|  Original    |  147.7 FPS      |  76.86 us |0.307 ms(8K) 0.310 ms(16K)             |
++------------------------------------------------------------------------------------+
+| Original + WA|                 |           |0.254 ms(1K) 0.241 ms(2K)              |
+|(don't move   |  162.1 FPS      |  42.15 us |0.230 ms(4K) 0.223 ms(8K) 0.204 ms(16K)|
+|PT BOs on LRU)|                 |           |                                       |
++------------------------------------------------------------------------------------+
+|  Bulk move   |  163.1 FPS      |  40.52 us |0.244 ms(1K) 0.252 ms(2K) 0.213 ms(4K) |
+|              |                 |           |0.214 ms(8K) 0.225 ms(16K)             |
++--------------+-----------------+-----------+---------------------------------------+
+
+Testing with the three benchmarks above (Vulkan and OpenCL) shows a visible
+improvement over the original, and even better results than the original with
+the workaround.
+
+v2: move all BOs, including those on the idle, relocated, and moved lists, to
+the end of the LRU and keep them together.
+v3: remove unused parameter and use list_for_each_entry instead of the safe
+variant.
+v4: call amdgpu_vm_move_to_lru_tail() after command submission; at that time
+all BOs are back on the idle list.
+v5: remove amdgpu_vm_move_to_lru_tail_by_list(), use bulk_moveable instead of
+validated, and also move ttm_bo_bulk_move_lru_tail() into
+amdgpu_vm_move_to_lru_tail().
+v6: clean up and fix return value.
+
+Change-Id: I51908d5affe817219d24ad0a1f5da289b2e9f560
+Signed-off-by: Christian König <christian.koenig@amd.com>
+Signed-off-by: Huang Rui <ray.huang@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Reviewed-by: Junwei Zhang <Jerry.Zhang@amd.com>
+Acked-by: Chunming Zhou <david1.zhou@amd.com>
+Tested-by: Mike Lothian <mike@fireburn.co.uk>
+Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
+Signed-off-by: Kalyan Alle <kalyan.alle@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  3 ++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 66 +++++++++++++++++---------
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 11 ++++-
+ include/drm/ttm/ttm_bo_api.h           | 10 ++++
+ include/drm/ttm/ttm_bo_driver.h        | 28 +++++++++++
+ 5 files changed, 95 insertions(+), 23 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+index 3c1443d7fb84..8875c18dcfc0 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+@@ -1274,6 +1274,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
+ 	union drm_amdgpu_cs *cs = data;
+ 	struct amdgpu_cs_parser parser = {};
+ 	bool reserved_buffers = false;
++	struct amdgpu_fpriv *fpriv;
+ 	int i, r;
+ 
+ 	if (!adev->accel_working)
+@@ -1318,6 +1319,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
+ 
+ 	r = amdgpu_cs_submit(&parser, cs);
+ 
++	fpriv = filp->driver_priv;
++	amdgpu_vm_move_to_lru_tail(adev, &fpriv->vm);
+ out:
+ 	amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
+ 	return r;
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+index 52b1cdea845a..617085b6e22c 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+@@ -268,6 +268,47 @@ void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
+ 	list_add(&entry->tv.head, validated);
+ }
+ 
++/**
++ * amdgpu_vm_move_to_lru_tail - move all BOs to the end of LRU
++ *
++ * @adev: amdgpu device pointer
++ * @vm: vm providing the BOs
++ *
++ * Move all BOs to the end of LRU and remember their positions to put them
++ * together.
++ */
++void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
++				struct amdgpu_vm *vm)
++{
++	struct ttm_bo_global *glob = adev->mman.bdev.glob;
++	struct amdgpu_vm_bo_base *bo_base;
++
++	if (vm->bulk_moveable) {
++		spin_lock(&glob->lru_lock);
++		ttm_bo_bulk_move_lru_tail(&vm->lru_bulk_move);
++		spin_unlock(&glob->lru_lock);
++		return;
++	}
++
++	memset(&vm->lru_bulk_move, 0, sizeof(vm->lru_bulk_move));
++
++	spin_lock(&glob->lru_lock);
++	list_for_each_entry(bo_base, &vm->idle, vm_status) {
++		struct amdgpu_bo *bo = bo_base->bo;
++
++		if (!bo->parent)
++			continue;
++
++		ttm_bo_move_to_lru_tail(&bo->tbo, &vm->lru_bulk_move);
++		if (bo->shadow)
++			ttm_bo_move_to_lru_tail(&bo->shadow->tbo,
++						&vm->lru_bulk_move);
++	}
++	spin_unlock(&glob->lru_lock);
++
++	vm->bulk_moveable = true;
++}
++
+ /**
+  * amdgpu_vm_validate_pt_bos - validate the page table BOs
+  *
+@@ -285,10 +326,11 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ 			      int (*validate)(void *p, struct amdgpu_bo *bo),
+ 			      void *param)
+ {
+-	struct ttm_bo_global *glob = adev->mman.bdev.glob;
+ 	struct amdgpu_vm_bo_base *bo_base, *tmp;
+ 	int r = 0;
+ 
++	vm->bulk_moveable &= list_empty(&vm->evicted);
++
+ 	list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) {
+ 		struct amdgpu_bo *bo = bo_base->bo;
+ 
+@@ -296,14 +338,6 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ 		if (r)
+ 			break;
+ 
+-		if (bo->parent) {
+-			spin_lock(&glob->lru_lock);
+-			ttm_bo_move_to_lru_tail(&bo->tbo, NULL);
+-			if (bo->shadow)
+-				ttm_bo_move_to_lru_tail(&bo->shadow->tbo, NULL);
+-			spin_unlock(&glob->lru_lock);
+-		}
+-
+ 		if (bo->tbo.type != ttm_bo_type_kernel) {
+ 			spin_lock(&vm->moved_lock);
+ 			list_move(&bo_base->vm_status, &vm->moved);
+@@ -313,19 +347,6 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ 		}
+ 	}
+ 
+-	spin_lock(&glob->lru_lock);
+-	list_for_each_entry(bo_base, &vm->idle, vm_status) {
+-		struct amdgpu_bo *bo = bo_base->bo;
+-
+-		if (!bo->parent)
+-			continue;
+-
+-		ttm_bo_move_to_lru_tail(&bo->tbo, NULL);
+-		if (bo->shadow)
+-			ttm_bo_move_to_lru_tail(&bo->shadow->tbo, NULL);
+-	}
+-	spin_unlock(&glob->lru_lock);
+-
+ 	return r;
+ }
+ 
+@@ -2639,6 +2660,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ 		return r;
+ 
+ 	vm->pte_support_ats = false;
++	vm->bulk_moveable = true;
+ 
+ 	if (vm_context == AMDGPU_VM_CONTEXT_COMPUTE) {
+ 		vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+index ab1d23e4b8ad..7a461eb76d44 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+@@ -29,6 +29,7 @@
+ #include <linux/rbtree.h>
+ #include <drm/gpu_scheduler.h>
+ #include <drm/drm_file.h>
++#include <drm/ttm/ttm_bo_driver.h>
+ 
+ #include "amdgpu_sync.h"
+ #include "amdgpu_ring.h"
+@@ -247,6 +248,11 @@ struct amdgpu_vm {
+ 
+ 	/* Some basic info about the task */
+ 	struct amdgpu_task_info task_info;
++
++	/* Store positions of group of BOs */
++	struct ttm_lru_bulk_move lru_bulk_move;
++	/* mark whether can do the bulk move */
++	bool bulk_moveable;
+ };
+ 
+ struct amdgpu_vm_manager {
+@@ -354,8 +360,11 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
+ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
+ 
+ void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
+-			 struct amdgpu_task_info *task_info);
++			     struct amdgpu_task_info *task_info);
+ 
+ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
+ 
++void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
++				struct amdgpu_vm *vm);
++
+ #endif
+diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
+index 92d0a1e892c3..e6955ce24d73 100644
+--- a/include/drm/ttm/ttm_bo_api.h
++++ b/include/drm/ttm/ttm_bo_api.h
+@@ -417,6 +417,16 @@ void ttm_bo_del_from_lru(struct ttm_buffer_object *bo);
+ void ttm_bo_move_to_lru_tail(struct ttm_buffer_object *bo,
+ 			     struct ttm_lru_bulk_move *bulk);
+ 
++/**
++ * ttm_bo_bulk_move_lru_tail
++ *
++ * @bulk: bulk move structure
++ *
++ * Bulk move BOs to the LRU tail, only valid to use when driver makes sure that
++ * BO order never changes. Should be called with ttm_bo_global::lru_lock held.
++ */
++void ttm_bo_bulk_move_lru_tail(struct ttm_lru_bulk_move *bulk);
++
+ /**
+  * ttm_bo_lock_delayed_workqueue
+  *
+diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
+index 3234cc322e70..17e20781ca7a 100644
+--- a/include/drm/ttm/ttm_bo_driver.h
++++ b/include/drm/ttm/ttm_bo_driver.h
+@@ -490,6 +490,34 @@ struct ttm_bo_device {
+ 	bool no_retry;
+ };
+ 
++/**
++ * struct ttm_lru_bulk_move_pos
++ *
++ * @first: first BO in the bulk move range
++ * @last: last BO in the bulk move range
++ *
++ * Positions for a lru bulk move.
++ */
++struct ttm_lru_bulk_move_pos {
++	struct ttm_buffer_object *first;
++	struct ttm_buffer_object *last;
++};
++
++/**
++ * struct ttm_lru_bulk_move
++ *
++ * @tt: first/last lru entry for BOs in the TT domain
++ * @vram: first/last lru entry for BOs in the VRAM domain
++ * @swap: first/last lru entry for BOs on the swap list
++ *
++ * Helper structure for bulk moves on the LRU list.
++ */
++struct ttm_lru_bulk_move {
++	struct ttm_lru_bulk_move_pos tt[TTM_MAX_BO_PRIORITY];
++	struct ttm_lru_bulk_move_pos vram[TTM_MAX_BO_PRIORITY];
++	struct ttm_lru_bulk_move_pos swap[TTM_MAX_BO_PRIORITY];
++};
++
+ /**
+  * ttm_flag_masked
+  *
+-- 
+2.17.1
+
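
Note on the mechanism: the structures added to ttm_bo_driver.h above only remember the first and last buffer object of a contiguous block, per LRU (TT, VRAM, swap) and per priority. The speed-up comes from the replay path: instead of touching every BO again, the whole [first, last] range is spliced to the tail of the LRU in one step. The following is a minimal, self-contained userspace sketch of that splicing idea, given purely as an illustration under stated assumptions; it is not the TTM implementation, and the names node, bulk_pos, list_add_tail and lru_splice_to_tail are invented for the example, standing in loosely for list_head, ttm_lru_bulk_move_pos and ttm_bo_bulk_move_lru_tail().

/*
 * Illustrative sketch only - NOT the real TTM code. A contiguous block of
 * entries is recorded once as [first, last] and later spliced to the tail
 * of a circular doubly linked list in a single operation.
 */
#include <stdio.h>
#include <stddef.h>

struct node {
        int id;
        struct node *prev, *next;
};

/* circular list with a dummy head, similar in spirit to the kernel's list_head */
static void list_init(struct node *head)
{
        head->prev = head->next = head;
}

static void list_add_tail(struct node *n, struct node *head)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

/* remembered begin/end of a block, analogous to ttm_lru_bulk_move_pos */
struct bulk_pos {
        struct node *first;
        struct node *last;
};

/*
 * Splice the whole [first, last] range to the tail of the list in one step.
 * Only valid if the relative order of the range has not changed since it was
 * recorded - the role played by the driver's bulk_moveable flag above.
 */
static void lru_splice_to_tail(struct bulk_pos *pos, struct node *head)
{
        if (!pos->first || !pos->last)
                return;

        /* unlink the range from its current position */
        pos->first->prev->next = pos->last->next;
        pos->last->next->prev = pos->first->prev;

        /* re-link it just before the dummy head, i.e. at the tail */
        pos->first->prev = head->prev;
        pos->last->next = head;
        head->prev->next = pos->first;
        head->prev = pos->last;
}

int main(void)
{
        struct node head, nodes[6];
        struct bulk_pos pos = { NULL, NULL };
        struct node *n;
        int i;

        list_init(&head);
        for (i = 0; i < 6; i++) {
                nodes[i].id = i;
                list_add_tail(&nodes[i], &head);
        }

        /* pretend nodes 1..3 are one VM's page table BOs: record the block once */
        pos.first = &nodes[1];
        pos.last = &nodes[3];

        /* later, move the whole block to the LRU tail in a single operation */
        lru_splice_to_tail(&pos, &head);

        for (n = head.next; n != &head; n = n->next)
                printf("%d ", n->id);   /* prints: 0 4 5 1 2 3 */
        printf("\n");
        return 0;
}

The splice updates only six pointers no matter how many BOs lie between first and last, which is why replaying the bulk move (the vm->bulk_moveable fast path in amdgpu_vm_move_to_lru_tail() above) is so much cheaper than moving every BO to the LRU tail individually.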