aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1654-drm-amdgpu-Allow-get_user_pages-to-fail-in-restore-w.patch
blob: 3dd1057d60c736ccaba3b0b9584ce36535378e79 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
From 7d8e00c65331476edd5611b4ba9d161ac59000d7 Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Wed, 5 Apr 2017 19:45:23 -0400
Subject: [PATCH 1654/4131] drm/amdgpu: Allow get_user_pages to fail in restore
 worker

Avoid stalling queues indefinitely if get_user_pages fails. This type
of failure is indicative of a user mode bug where memory is freed that
is still mapped to the GPU. Let the restore continue and update the
page table with invalid entries for the failed BO. If the GPU tries to
access it, the application will crash with a VM fault.

Bug: SWDEV-117987
Change-Id: Ica23b8d562e8268d80109192f1c5f0c16eb72de0
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 52 ++++++++++++++++++------
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index bde4f6a..3f1b1d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -523,7 +523,7 @@ static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm,
 	ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages);
 	up_read(&mm->mmap_sem);
 	if (ret) {
-		pr_err("%s: Failed to get user pages\n", __func__);
+		pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
 		goto free_out;
 	}
 
@@ -1996,8 +1996,14 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
 						   mem->user_pages);
 		if (ret) {
 			mem->user_pages[0] = NULL;
-			pr_err("%s: Failed to get user pages\n", __func__);
-			goto unlock_mmap_out;
+			pr_info("%s: Failed to get user pages: %d\n",
+				__func__, ret);
+			ret = 0;
+			/* Pretend it succeeded. It will fail later
+			 * with a VM fault if the GPU tries to access
+			 * it. Better than hanging indefinitely with
+			 * stalled user mode queues.
+			 */
 		}
 
 		/* Mark the BO as valid unless it was invalidated
@@ -2064,6 +2070,16 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
 
 	amdgpu_sync_create(&sync);
 
+	/* Avoid triggering eviction fences when unmapping invalid
+	 * userptr BOs (waits for all fences, doesn't use
+	 * FENCE_OWNER_VM)
+	 */
+	list_for_each_entry(peer_vm, &process_info->vm_list_head,
+			    vm_list_node)
+		amdgpu_amdkfd_remove_eviction_fence(peer_vm->base.root.bo,
+						process_info->eviction_fence,
+						NULL, NULL);
+
 	ret = process_validate_vms(process_info);
 	if (ret)
 		goto unreserve_out;
@@ -2076,15 +2092,17 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
 
 		bo = mem->bo;
 
-		/* Copy pages array and validate the BO */
-		memcpy(bo->tbo.ttm->pages, mem->user_pages,
-		       sizeof(struct page *) * bo->tbo.ttm->num_pages);
-		amdgpu_ttm_placement_from_domain(bo, mem->domain);
-		ret = ttm_bo_validate(&bo->tbo, &bo->placement,
-				      false, false);
-		if (ret) {
-			pr_err("%s: failed to validate BO\n", __func__);
-			goto unreserve_out;
+		/* Copy pages array and validate the BO if we got user pages */
+		if (mem->user_pages[0]) {
+			memcpy(bo->tbo.ttm->pages, mem->user_pages,
+			       sizeof(struct page *) * bo->tbo.ttm->num_pages);
+			amdgpu_ttm_placement_from_domain(bo, mem->domain);
+			ret = ttm_bo_validate(&bo->tbo, &bo->placement,
+					      false, false);
+			if (ret) {
+				pr_err("%s: failed to validate BO\n", __func__);
+				goto unreserve_out;
+			}
 		}
 
 		/* Validate succeeded, now the BO owns the pages, free
@@ -2097,6 +2115,12 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
 		list_move_tail(&mem->validate_list.head,
 			       &process_info->userptr_valid_list);
 
+		/* Update mapping. If the BO was not validated
+		 * (because we couldn't get user pages), this will
+		 * clear the page table entries, which will result in
+		 * VM faults if the GPU tries to access the invalid
+		 * memory.
+		 */
 		list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) {
 			if (!bo_va_entry->is_mapped)
 				continue;
@@ -2113,6 +2137,10 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
 		}
 	}
 unreserve_out:
+	list_for_each_entry(peer_vm, &process_info->vm_list_head,
+			    vm_list_node)
+		amdgpu_bo_fence(peer_vm->base.root.bo,
+				&process_info->eviction_fence->base, true);
 	ttm_eu_backoff_reservation(&ticket, &resv_list);
 	amdgpu_sync_wait(&sync);
 	amdgpu_sync_free(&sync);
-- 
2.7.4