Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2131-drm-amdkfd-fix-hang-issue-if-KFD-memory-failed-to-re.patch')
-rw-r--r--  meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2131-drm-amdkfd-fix-hang-issue-if-KFD-memory-failed-to-re.patch | 82
1 file changed, 82 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2131-drm-amdkfd-fix-hang-issue-if-KFD-memory-failed-to-re.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2131-drm-amdkfd-fix-hang-issue-if-KFD-memory-failed-to-re.patch
new file mode 100644
index 00000000..bf2ad659
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2131-drm-amdkfd-fix-hang-issue-if-KFD-memory-failed-to-re.patch
@@ -0,0 +1,82 @@
+From b7cfd306079e1746315d6d2ad5c837623a07cbaf Mon Sep 17 00:00:00 2001
+From: Philip Yang <Philip.Yang@amd.com>
+Date: Fri, 20 Oct 2017 14:32:27 -0400
+Subject: [PATCH 2131/4131] drm/amdkfd: fix hang issue if KFD memory failed to
+ restore
+
+In the current logic, if restore fails, the restore work is rescheduled
+and a new fence is created for process->eviction_fence, but that new
+fence gets attached to BOs created before the next restore. As a result,
+the next restore triggers eviction work, because the new fence is not
+signaled while ttm_bo_wait runs on those new BOs. The eviction count can
+detect and skip the eviction, but it schedules the restore work again,
+which results in recursive eviction/restore.
+
+The fix is to create the new fence only after the restore has completed.
+
+BUG: SWDEV-134782
+
+Change-Id: Ib2f25f8a50a5eb8247572a105085482dc7326733
+Signed-off-by: Philip Yang <Philip.Yang@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 30 +++++++++++++-----------
+ 1 file changed, 16 insertions(+), 14 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+index 900019d..5de8398 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+@@ -2287,7 +2287,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info)
+ struct amdkfd_vm *peer_vm;
+ struct kgd_mem *mem;
+ struct bo_vm_reservation_context ctx;
+- struct amdgpu_amdkfd_fence *old_fence;
++ struct amdgpu_amdkfd_fence *new_fence;
+ int ret = 0, i;
+ struct list_head duplicate_save;
+ struct amdgpu_sync sync_obj;
+@@ -2302,19 +2302,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info)
+ if (pd_bo_list == NULL)
+ return -ENOMEM;
+
+- /* Release old eviction fence and create new one. Use context and mm
+- * from the old fence.
+- */
+- old_fence = process_info->eviction_fence;
+- process_info->eviction_fence =
+- amdgpu_amdkfd_fence_create(old_fence->base.context,
+- old_fence->mm);
+- dma_fence_put(&old_fence->base);
+- if (!process_info->eviction_fence) {
+- pr_err("Failed to create eviction fence\n");
+- goto evict_fence_fail;
+- }
+-
+ i = 0;
+ mutex_lock(&process_info->lock);
+ list_for_each_entry(peer_vm, &process_info->vm_list_head,
+@@ -2386,6 +2373,21 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info)
+
+ amdgpu_sync_wait(ctx.sync, false);
+
++ /* Release old eviction fence and create new one, because fence only
++ * goes from unsignaled to signaled, fence cannot be reused.
++ * Use context and mm from the old fence.
++ */
++ new_fence = amdgpu_amdkfd_fence_create(
++ process_info->eviction_fence->base.context,
++ process_info->eviction_fence->mm);
++ if (!new_fence) {
++ pr_err("Failed to create eviction fence\n");
++ ret = -ENOMEM;
++ goto validate_map_fail;
++ }
++ dma_fence_put(&process_info->eviction_fence->base);
++ process_info->eviction_fence = new_fence;
++
+ /* Wait for validate to finish and attach new eviction fence */
+ list_for_each_entry(mem, &process_info->kfd_bo_list,
+ validate_list.head)
+--
+2.7.4
+
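For readers skimming the patch above, here is a hypothetical, self-contained C sketch (plain userspace code, not kernel code) of the ordering change the commit message describes: the shared eviction fence is replaced only after the fallible restore step has succeeded, so a failed restore can never leave an unsignaled replacement fence attached to the process. All names here (fence, process_info, restore_bos, fence_create, fence_put) are illustrative stand-ins, not the real amdgpu/amdkfd APIs.

/*
 * Hypothetical sketch of the "swap the fence only after success" pattern
 * used by the patch above. Not kernel code; all types and helpers are
 * illustrative stand-ins.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

struct fence {
    int refcount;
    unsigned long context;   /* stands in for the dma_fence context */
};

struct process_info {
    struct fence *eviction_fence;
};

static struct fence *fence_create(unsigned long context)
{
    struct fence *f = malloc(sizeof(*f));
    if (!f)
        return NULL;
    f->refcount = 1;
    f->context = context;
    return f;
}

static void fence_put(struct fence *f)
{
    if (f && --f->refcount == 0)
        free(f);
}

/* Stand-in for the fallible restore step (validate and map BOs). */
static int restore_bos(bool simulate_failure)
{
    return simulate_failure ? -1 : 0;
}

/*
 * Mirrors the fixed flow: do the fallible work first, and only swap in
 * the new eviction fence once everything has succeeded. On failure the
 * old fence is left untouched, so a rescheduled restore does not see an
 * unsignaled replacement fence.
 */
static int restore_process(struct process_info *pi, bool simulate_failure)
{
    struct fence *new_fence;
    int ret;

    ret = restore_bos(simulate_failure);
    if (ret)
        return ret;            /* old fence stays in place */

    new_fence = fence_create(pi->eviction_fence->context);
    if (!new_fence)
        return -1;

    fence_put(pi->eviction_fence); /* drop the old fence */
    pi->eviction_fence = new_fence;
    return 0;
}

int main(void)
{
    struct process_info pi = { .eviction_fence = fence_create(42) };

    printf("failed restore -> %d (fence untouched)\n",
           restore_process(&pi, true));
    printf("successful restore -> %d (fence replaced)\n",
           restore_process(&pi, false));

    fence_put(pi.eviction_fence);
    return 0;
}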