aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1579-drm-amdgpu-Avoid-leaking-KFD-userpages.patch
blob: 76f6cf8a86e668564fe1605b1d23771afd3c7faf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
From 93c99331c4fd68d4d8510d193bf7db06ce207f48 Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Wed, 25 Jan 2017 18:42:41 -0500
Subject: [PATCH 1579/4131] drm/amdgpu: Avoid leaking KFD userpages

Validate and invalidate userptr BOs explicitly instead of reference
counting. In particular guarantee that userptr BOs are always
validated (bound) after calling amdgpu_ttm_tt_get_user_pages.
Otherwise the BO would not be unbound later. Releasing the userpages
happens in unbind. Not releasing the userpages leaks them and leads
to lots of kernel messages about bad page state later on.

Change-Id: I2484376895e5c93325c2fcfc9f504d2a3c0a684e
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 72 ++++++++++--------------
 1 file changed, 30 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index c440419..034bf91 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -413,19 +413,13 @@ static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
 							 ef_count);
 		}
 	} else {
-		/* Userptrs are not pinned. Therefore we can use the
-		 * bo->pin_count for our version of pinning without conflict.
-		 */
-		if (bo->pin_count == 0) {
-			amdgpu_ttm_placement_from_domain(bo, domain);
-			ret = ttm_bo_validate(&bo->tbo, &bo->placement,
-					      true, false);
-			if (ret)
-				goto validate_fail;
-			if (wait)
-				ttm_bo_wait(&bo->tbo, false, false);
-		}
-		bo->pin_count++;
+		amdgpu_ttm_placement_from_domain(bo, domain);
+		ret = ttm_bo_validate(&bo->tbo, &bo->placement,
+				      true, false);
+		if (ret)
+			goto validate_fail;
+		if (wait)
+			ttm_bo_wait(&bo->tbo, false, false);
 	}
 
 validate_fail:
@@ -443,8 +437,7 @@ static int amdgpu_amdkfd_bo_invalidate(struct amdgpu_bo *bo)
 {
 	int ret = 0;
 
-	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) &&
-		(--bo->pin_count == 0)) {
+	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
 		amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
 		ret = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
 		if (ret != 0)
@@ -898,7 +891,6 @@ static int unmap_bo_from_gpuvm(struct amdgpu_device *adev,
 {
 	struct amdgpu_bo_va *bo_va = entry->bo_va;
 	struct amdgpu_vm *vm = bo_va->vm;
-	struct amdgpu_bo *bo = bo_va->bo;
 
 	amdgpu_vm_bo_unmap(adev, bo_va, entry->va);
 
@@ -906,8 +898,6 @@ static int unmap_bo_from_gpuvm(struct amdgpu_device *adev,
 
 	amdgpu_sync_fence(adev, sync, bo_va->last_pt_update);
 
-	amdgpu_amdkfd_bo_invalidate(bo);
-
 	return 0;
 }
 
@@ -1216,21 +1206,13 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
 		}
 	}
 
-	if (!mem->evicted) {
+	if (mem->mapped_to_gpu_memory == 0 && !mem->evicted) {
 		ret = update_user_pages(mem, current->mm, &ctx);
 		if (ret != 0) {
 			pr_err("update_user_pages failed\n");
 			goto update_user_pages_failed;
 		}
-	}
 
-	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
-		ret = amdgpu_amdkfd_bo_validate(bo, domain, true);
-		if (ret) {
-			pr_debug("userptr: Validate failed\n");
-			goto map_bo_to_gpuvm_failed;
-		}
-	} else if (mem->mapped_to_gpu_memory == 0) {
 		/* Validate BO only once. The eviction fence gets added to BO
 		 * the first time it is mapped. Validate will wait for all
 		 * background evictions to complete.
@@ -1518,12 +1500,14 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
 	}
 
 	/* If BO is unmapped from all VMs, unfence it. It can be evicted if
-	 * required.
+	 * required. User pages of userptr BOs can be released.
 	 */
-	if (mem->mapped_to_gpu_memory == 0)
+	if (mem->mapped_to_gpu_memory == 0) {
 		amdgpu_amdkfd_remove_eviction_fence(mem->bo,
 						    master_vm->eviction_fence,
 						    NULL, NULL);
+		amdgpu_amdkfd_bo_invalidate(mem->bo);
+	}
 
 	if (mapped_before == mem->mapped_to_gpu_memory) {
 		pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n",
@@ -1885,6 +1869,8 @@ int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm)
 		n_unmapped++;
 	}
 
+	amdgpu_amdkfd_bo_invalidate(mem->bo);
+
 	unreserve_bo_and_vms(&ctx, true);
 
 	return 0;
@@ -1935,17 +1921,30 @@ int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm)
 	if (likely(ret == 0)) {
 		ret = update_user_pages(mem, mm, &ctx);
 		have_pages = !ret;
-		if (!have_pages)
+		if (!have_pages) {
 			unreserve_bo_and_vms(&ctx, false);
+			pr_err("get_user_pages failed. Probably userptr is freed. %d\n",
+			       ret);
+		}
 	}
 
 	/* update_user_pages drops the lock briefly. Check if someone
 	 * else evicted or restored the buffer in the mean time */
 	if (mem->evicted != 1) {
-		unreserve_bo_and_vms(&ctx, false);
+		if (have_pages)
+			unreserve_bo_and_vms(&ctx, false);
 		return 0;
 	}
 
+	if (have_pages) {
+		r = amdgpu_amdkfd_bo_validate(mem->bo, domain, true);
+		if (unlikely(r != 0)) {
+			pr_err("Failed to validate BO %p\n", mem);
+			have_pages = false;
+			unreserve_bo_and_vms(&ctx, false);
+		}
+	}
+
 	/* Try to restore all mappings. Mappings that fail to restore
 	 * will be marked as unmapped. If we failed to get the user
 	 * pages, all mappings will be marked as unmapped. */
@@ -1958,18 +1957,7 @@ int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm)
 		adev = (struct amdgpu_device *)entry->kgd_dev;
 
 		if (unlikely(!have_pages)) {
-			pr_err("get_user_pages failed. Probably userptr is freed. %d\n",
-				ret);
-			entry->map_fail = true;
-			continue;
-		}
-
-		r = amdgpu_amdkfd_bo_validate(mem->bo, domain, true);
-		if (unlikely(r != 0)) {
-			pr_err("Failed to validate BO\n");
 			entry->map_fail = true;
-			if (ret == 0)
-				ret = r;
 			continue;
 		}
 
-- 
2.7.4