aboutsummaryrefslogtreecommitdiffstats
path: root/common/recipes-kernel/linux/linux-yocto-4.14.71/1144-drm-amdkfd-Fix-suspend-resume-issue-on-Carrizo.patch
blob: b1e44a3a4784945edf8f09990a24cc8b8162d049 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
From 284b4b30d1d110d5f11a861981729b0f26e38a73 Mon Sep 17 00:00:00 2001
From: Yong Zhao <yong.zhao@amd.com>
Date: Mon, 30 May 2016 20:47:17 -0400
Subject: [PATCH 1144/4131] drm/amdkfd: Fix suspend/resume issue on Carrizo

When we do suspend/resume through "sudo pm-suspend" while there is
HSA activity running, upon resume we will encounter HWS hanging, which
is caused by memory read/write failures. The root cause is that when
suspend, we neglected to unbind pasid from kfd device.

Another major change is that the bind/unbinding is changed to be
performed on a per process basis, instead of whether there are queues
in dqm.

There are some other small changes as well.

Change-Id: If9ac972fc4309b688f6c1d07e27cede54814410e
Signed-off-by: Yong Zhao <yong.zhao@amd.com>

 Conflicts:
	drivers/gpu/drm/amd/amdkfd/kfd_process.c
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c            | 26 +++---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 13 ---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h              | 14 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c           | 94 ++++++++++++++++++----
 4 files changed, 105 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 8bf209d..aae4e5a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -258,7 +258,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
 	struct kfd_dev *dev = kfd_device_by_pci_dev(pdev);
 
 	if (dev)
-		kfd_unbind_process_from_device(dev, pasid);
+		kfd_process_iommu_unbind_callback(dev, pasid);
 }
 
 /*
@@ -488,14 +488,18 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
 {
 	BUG_ON(kfd == NULL);
 
-	if (kfd->init_complete) {
-		kfd->dqm->ops.stop(kfd->dqm);
-		if (kfd->device_info->is_need_iommu_device) {
-			amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
-			amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
-			amd_iommu_free_device(kfd->pdev);
-		}
-	}
+	if (!kfd->init_complete)
+		return;
+
+	kfd->dqm->ops.stop(kfd->dqm);
+	if (!kfd->device_info->is_need_iommu_device)
+		return;
+
+	kfd_unbind_processes_from_device(kfd);
+
+	amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
+	amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
+	amd_iommu_free_device(kfd->pdev);
 }
 
 int kgd2kfd_evict_bo(struct kfd_dev *dev, void *mem)
@@ -533,6 +537,10 @@ static int kfd_resume(struct kfd_dev *kfd)
 				iommu_pasid_shutdown_callback);
 		amd_iommu_set_invalid_ppr_cb(kfd->pdev,
 				iommu_invalid_ppr_cb);
+
+		err = kfd_bind_processes_to_device(kfd);
+		if (err)
+			return -ENXIO;
 	}
 
 	err = kfd->dqm->ops.start(kfd->dqm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 5fb3c1e..6f01393 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -863,7 +863,6 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 
 static int start_cpsch(struct device_queue_manager *dqm)
 {
-	struct device_process_node *node;
 	int retval;
 
 	BUG_ON(!dqm);
@@ -892,11 +891,6 @@ static int start_cpsch(struct device_queue_manager *dqm)
 
 	init_interrupts(dqm);
 
-	list_for_each_entry(node, &dqm->queues, list)
-		if (node->qpd->pqm->process && dqm->dev)
-			kfd_bind_process_to_device(dqm->dev,
-						node->qpd->pqm->process);
-
 	mutex_lock(&dqm->lock);
 	execute_queues_cpsch(dqm, false);
 	mutex_unlock(&dqm->lock);
@@ -911,9 +905,6 @@ static int start_cpsch(struct device_queue_manager *dqm)
 
 static int stop_cpsch(struct device_queue_manager *dqm)
 {
-	struct device_process_node *node;
-	struct kfd_process_device *pdd;
-
 	BUG_ON(!dqm);
 
 	mutex_lock(&dqm->lock);
@@ -922,10 +913,6 @@ static int stop_cpsch(struct device_queue_manager *dqm)
 
 	mutex_unlock(&dqm->lock);
 
-	list_for_each_entry(node, &dqm->queues, list) {
-		pdd = qpd_to_pdd(node->qpd);
-		pdd->bound = false;
-	}
 	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
 	pm_uninit(&dqm->packets);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index e3335db..bfce9a9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -531,6 +531,12 @@ struct qcm_process_device {
 #define GET_GPU_ID(handle) (handle >> 32)
 #define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF)
 
+enum kfd_pdd_bound {
+	PDD_UNBOUND = 0,
+	PDD_BOUND,
+	PDD_BOUND_SUSPENDED,
+};
+
 /* Data that is per-process-per device. */
 struct kfd_process_device {
 	/*
@@ -564,7 +570,7 @@ struct kfd_process_device {
 	uint64_t sh_hidden_private_base_vmid;
 
 	/* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */
-	bool bound;
+	enum kfd_pdd_bound bound;
 
 	/* VM context for GPUVM allocations */
 	void *vm;
@@ -658,8 +664,10 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
 struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
 
 struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
-							struct kfd_process *p);
-void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid);
+						struct kfd_process *p);
+int kfd_bind_processes_to_device(struct kfd_dev *dev);
+void kfd_unbind_processes_from_device(struct kfd_dev *dev);
+void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid);
 struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
 							struct kfd_process *p);
 struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 2c371cd..3b312b7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -319,8 +319,13 @@ static void kfd_process_wq_release(struct work_struct *work)
 		pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n",
 				pdd->dev->id, p->pasid);
 
-		if (pdd->dev->device_info->is_need_iommu_device)
-			amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
+		if (pdd->dev->device_info->is_need_iommu_device) {
+			if (pdd->bound == PDD_BOUND) {
+				amd_iommu_unbind_pasid(pdd->dev->pdev,
+						p->pasid);
+				pdd->bound = PDD_UNBOUND;
+			}
+		}
 
 		/*
 		 * Remove all handles from idr and release appropriate
@@ -616,9 +621,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
 
 	list_for_each_entry(pdd, &p->per_device_data, per_device_list)
 		if (pdd->dev == dev)
-			break;
+			return pdd;
 
-	return pdd;
+	return NULL;
 }
 
 struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
@@ -636,6 +641,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 		pdd->qpd.evicted = 0;
 		pdd->reset_wavefronts = false;
 		pdd->process = p;
+		pdd->bound = PDD_UNBOUND;
 		list_add(&pdd->per_device_list, &p->per_device_data);
 
 		/* Init idr used for memory handle translation */
@@ -672,21 +678,85 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
 		return ERR_PTR(-ENOMEM);
 	}
 
-	if (pdd->bound)
+	if (pdd->bound == PDD_BOUND)
 		return pdd;
 
+	if (pdd->bound == PDD_BOUND_SUSPENDED) {
+		pr_err("kfd: binding PDD_BOUND_SUSPENDED pdd is unexpected!\n");
+		return ERR_PTR(-EINVAL);
+	}
+
 	if (dev->device_info->is_need_iommu_device) {
 		err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread);
 		if (err < 0)
 			return ERR_PTR(err);
 	}
 
-	pdd->bound = true;
+	pdd->bound = PDD_BOUND;
 
 	return pdd;
 }
 
-void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
+int kfd_bind_processes_to_device(struct kfd_dev *dev)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_process *p;
+	unsigned int temp;
+	int err = 0;
+
+	int idx = srcu_read_lock(&kfd_processes_srcu);
+
+	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+		down_write(&p->lock);
+		pdd = kfd_get_process_device_data(dev, p);
+		if (pdd->bound != PDD_BOUND_SUSPENDED) {
+			up_write(&p->lock);
+			continue;
+		}
+
+		err = amd_iommu_bind_pasid(dev->pdev, p->pasid,
+				p->lead_thread);
+		if (err < 0) {
+			pr_err("unexpected pasid %d binding failure\n",
+					p->pasid);
+			up_write(&p->lock);
+			break;
+		}
+
+		pdd->bound = PDD_BOUND;
+		up_write(&p->lock);
+	}
+
+	srcu_read_unlock(&kfd_processes_srcu, idx);
+
+	return err;
+}
+
+void kfd_unbind_processes_from_device(struct kfd_dev *dev)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_process *p;
+	unsigned int temp, temp_bound, temp_pasid;
+
+	int idx = srcu_read_lock(&kfd_processes_srcu);
+
+	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+		down_write(&p->lock);
+		pdd = kfd_get_process_device_data(dev, p);
+		temp_bound = pdd->bound;
+		temp_pasid = p->pasid;
+		if (pdd->bound == PDD_BOUND)
+			pdd->bound = PDD_BOUND_SUSPENDED;
+		up_write(&p->lock);
+
+		if (temp_bound == PDD_BOUND)
+			amd_iommu_unbind_pasid(dev->pdev, temp_pasid);
+	}
+
+	srcu_read_unlock(&kfd_processes_srcu, idx);
+}
+
+void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid)
 {
 	struct kfd_process *p;
 	struct kfd_process_device *pdd;
@@ -722,16 +792,6 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
 		pdd->reset_wavefronts = false;
 	}
 
-	/*
-	 * Just mark pdd as unbound, because we still need it
-	 * to call amd_iommu_unbind_pasid() in when the
-	 * process exits.
-	 * We don't call amd_iommu_unbind_pasid() here
-	 * because the IOMMU called us.
-	 */
-	if (pdd)
-		pdd->bound = false;
-
 	up_write(&p->lock);
 }
 
-- 
2.7.4