aboutsummaryrefslogtreecommitdiffstats
path: root/common/recipes-kernel/linux/linux-yocto-4.14.71/1517-drm-amdkfd-Add-kgd2kfd-schedule_evict_and_restore_pr.patch
blob: a48c5d456d5188283eba3d992b30c2a116e69404 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
From e79bd47c809916d7671512b2bcfbaefe00557375 Mon Sep 17 00:00:00 2001
From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Date: Fri, 2 Sep 2016 15:01:41 -0400
Subject: [PATCH 1517/4131] drm/amdkfd: Add kgd2kfd
 schedule_evict_and_restore_process

Change-Id: I27860af58c54449a9ba1fc0a04e0436edb7fae8b
Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>

 Conflicts:
	drivers/gpu/drm/amd/include/kgd_kfd_interface.h
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c         | 116 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_module.c         |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h           |  11 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c        |   2 +
 drivers/gpu/drm/amd/include/kgd_kfd_interface.h |   6 ++
 5 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0ce9572..6acc5fc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/debugfs.h>
+#include <linux/fence.h>
 #include "kfd_priv.h"
 #include "kfd_device_queue_manager.h"
 #include "kfd_pm4_headers.h"
@@ -745,6 +746,42 @@ int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm)
 	return r;
 }
 
+/* quiesce_process_mm -
+ *  Quiesce all user queues that belong to the given process p
+ */
+static int quiesce_process_mm(struct kfd_process *p)
+{
+	struct kfd_process_device *pdd;
+	int r = 0;
+	unsigned int n_evicted = 0;
+
+	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+		r = process_evict_queues(pdd->dev->dqm, &pdd->qpd);
+		if (r != 0) {
+			pr_err("Failed to evict process queues\n");
+			goto fail;
+		}
+		n_evicted++;
+	}
+
+	return r;
+
+fail:
+	/* To keep state consistent, roll back partial eviction by
+	 * restoring queues
+	 */
+	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+		if (n_evicted == 0)
+			break;
+		if (process_restore_queues(pdd->dev->dqm, &pdd->qpd))
+			pr_err("Failed to restore queues\n");
+
+		n_evicted--;
+	}
+
+	return r;
+}
+
 /* resume_process_mm -
  *  Resume all user queues that belongs to given process p. The caller must
  *  ensure that process p context is valid.
@@ -827,6 +864,85 @@ void kfd_restore_bo_worker(struct work_struct *work)
 		pr_err("Failed to resume user queues\n");
 }
 
+/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will
+ *   prepare for safe eviction of KFD BOs that belong to the specified
+ *   process.
+ *
+ * @mm: mm_struct that identifies the specified KFD process
+ * @fence: eviction fence attached to KFD process BOs
+ *
+ */
+int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
+					       struct fence *fence)
+{
+	struct kfd_process *p;
+
+	if (!fence)
+		return -EINVAL;
+
+	if (fence_is_signaled(fence))
+		return 0;
+
+	p = kfd_lookup_process_by_mm(mm);
+	if (!p)
+		return -ENODEV;
+
+	if (work_pending(&p->eviction_work.work)) {
+		/* TTM may have lined up a couple of BOs of the same process
+		 * to be evicted. If the fence is the same, the previously
+		 * scheduled work item has not completed yet
+		 */
+		if (p->eviction_work.eviction_fence == fence)
+			goto out;
+		else {
+			WARN(1, "Starting new evict with previous evict is not completed\n");
+			cancel_work_sync(&p->eviction_work.work);
+		}
+	}
+
+	/* During process initialization eviction_work.work is initialized
+	 * to kfd_evict_bo_worker
+	 */
+	p->eviction_work.eviction_fence = fence_get(fence);
+	schedule_work(&p->eviction_work.work);
+out:
+	kfd_unref_process(p);
+	return 0;
+}
+
+void kfd_evict_bo_worker(struct work_struct *work)
+{
+	int ret;
+	struct kfd_process *p;
+	struct kfd_eviction_work *eviction_work;
+
+	eviction_work = container_of(work, struct kfd_eviction_work,
+				     work);
+
+	/* Process termination destroys this worker thread. So during the
+	 * lifetime of this thread, kfd_process p will be valid
+	 */
+	p = container_of(eviction_work, struct kfd_process, eviction_work);
+
+	/* A narrow window of overlap between the restore and evict work items
+	 * is possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos unreserves
+	 * the KFD BOs, they can be evicted again, but restore still has a few
+	 * more steps to finish. So wait for the restore work to complete
+	 */
+	if (delayed_work_pending(&p->restore_work))
+		flush_delayed_work(&p->restore_work);
+
+	ret = quiesce_process_mm(p);
+	if (!ret) {
+		fence_signal(eviction_work->eviction_fence);
+		fence_put(eviction_work->eviction_fence);
+		kfd_schedule_restore_bos_and_queues(p);
+	} else {
+		pr_err("Failed to quiesce user queues. Cannot evict BOs\n");
+	}
+
+}
+
 static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
 				unsigned int chunk_size)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index 914a0cd..42c559b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -47,6 +47,8 @@ static const struct kgd2kfd_calls kgd2kfd = {
 	.restore	= kgd2kfd_restore,
 	.quiesce_mm	= kgd2kfd_quiesce_mm,
 	.resume_mm	= kgd2kfd_resume_mm,
+	.schedule_evict_and_restore_process =
+			  kgd2kfd_schedule_evict_and_restore_process,
 };
 
 int sched_policy = KFD_SCHED_POLICY_HWS;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index eb9541f..182d065 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -542,12 +542,20 @@ struct qcm_process_device {
 };
 
 /* KFD Memory Eviction */
+struct kfd_eviction_work {
+	struct work_struct work;
+	struct fence *eviction_fence;
+};
+
 /* Appox. wait time before attempting to restore evicted BOs */
 #define PROCESS_RESTORE_TIME_MS 2000
 /* Approx. back off time if restore fails due to lack of memory */
 #define PROCESS_BACK_OFF_TIME_MS 1000
 
+void kfd_evict_bo_worker(struct work_struct *work);
 void kfd_restore_bo_worker(struct work_struct *work);
+int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
+					       struct fence *fence);
 
 
 /*8 byte handle containing GPU ID in the most significant 4 bytes and
@@ -681,7 +689,8 @@ struct kfd_process {
 
 	void *master_vm;
 
-	/* For restoring BOs after eviction */
+	/* Work items for evicting and restoring BOs */
+	struct kfd_eviction_work eviction_work;
 	struct delayed_work restore_work;
 };
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 542f0df..54ed2a1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -509,6 +509,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
 	p = container_of(mn, struct kfd_process, mmu_notifier);
 	BUG_ON(p->mm != mm);
 
+	cancel_work_sync(&p->eviction_work.work);
 	cancel_delayed_work_sync(&p->restore_work);
 
 	mutex_lock(&kfd_processes_mutex);
@@ -679,6 +680,7 @@ static struct kfd_process *create_process(const struct task_struct *thread,
 	if (err)
 		goto err_init_cwsr;
 
+	INIT_WORK(&process->eviction_work.work, kfd_evict_bo_worker);
 	INIT_DELAYED_WORK(&process->restore_work, kfd_restore_bo_worker);
 	return process;
 
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index c38e707..d344496 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -30,6 +30,7 @@
 
 #include <linux/types.h>
 #include <linux/bitmap.h>
+#include <linux/fence.h>
 
 struct pci_dev;
 
@@ -388,6 +389,9 @@ struct kfd2kgd_calls {
  *
  * @resume_mm: Resume user queue access to specified MM address space
  *
+ * @schedule_evict_and_restore_process: Schedules work queue that will prepare
+ * for safe eviction of KFD BOs that belong to the specified process.
+ *
  * This structure contains function callback pointers so the kgd driver
  * will notify to the amdkfd about certain status changes.
  *
@@ -406,6 +410,8 @@ struct kgd2kfd_calls {
 	int (*restore)(struct kfd_dev *kfd);
 	int (*quiesce_mm)(struct kfd_dev *kfd, struct mm_struct *mm);
 	int (*resume_mm)(struct kfd_dev *kfd, struct mm_struct *mm);
+	int (*schedule_evict_and_restore_process)(struct mm_struct *mm,
+			struct fence *fence);
 };
 
 int kgd2kfd_init(unsigned interface_version,
-- 
2.7.4