aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3183-drm-amdkfd-Fix-and-simplify-evict-restore-synchroniz.patch
blob: 81eb8ad39fa0805c9d942fa39f5cd347d1f5d815 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
From 625f835c4f75076cc2c6ecb75d484e000e33f264 Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Tue, 16 Jan 2018 17:37:43 -0500
Subject: [PATCH 3183/4131] drm/amdkfd: Fix and simplify evict/restore
 synchronization

Use the fence sequence number to check whether an eviction has already
been scheduled for a given eviction fence. Using the pointer is not
reliable because a new fence can have the same address as a previous
fence.

Remove the quiesce_fence that is no longer needed.

In the eviction worker, flush restore_work unconditionally.
delayed_work_pending return false if the worker is already running.

Change-Id: Id769e456221f47f3d8b308505d475eacaeecaf7c
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

Conflicts:
      drivers/gpu/drm/amd/amdkfd/kfd_process.c
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  | 60 +++++++++++---------------------
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  8 ++---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  7 ++--
 3 files changed, 26 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index eeebaf4..489c4a8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -953,21 +953,10 @@ int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
 	if (!p)
 		return -ENODEV;
 
-	if (delayed_work_pending(&p->eviction_work.dwork)) {
-		/* It is possible has TTM has lined up couple of BOs of the same
-		 * process to be evicted. Check if the fence is same which
-		 * indicates that previous work item scheduled is not completed
-		 */
-		if (p->eviction_work.quiesce_fence == fence)
-			goto out;
-		else {
-			WARN(1, "Starting new evict with previous evict is not completed\n");
-			if (cancel_delayed_work_sync(&p->eviction_work.dwork))
-				dma_fence_put(p->eviction_work.quiesce_fence);
-		}
-	}
+	if (fence->seqno == p->last_eviction_seqno)
+		goto out;
 
-	p->eviction_work.quiesce_fence = dma_fence_get(fence);
+	p->last_eviction_seqno = fence->seqno;
 
 	/* Avoid KFD process starvation. Wait for at least
 	 * PROCESS_ACTIVE_TIME_MS before evicting the process again
@@ -981,7 +970,7 @@ int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
 	/* During process initialization eviction_work.dwork is initialized
 	 * to kfd_evict_bo_worker
 	 */
-	schedule_delayed_work(&p->eviction_work.dwork, delay_jiffies);
+	schedule_delayed_work(&p->eviction_work, delay_jiffies);
 out:
 	kfd_unref_process(p);
 	return 0;
@@ -991,48 +980,39 @@ void kfd_evict_bo_worker(struct work_struct *work)
 {
 	int ret;
 	struct kfd_process *p;
-	struct kfd_eviction_work *eviction_work;
 	struct delayed_work *dwork;
 
 	dwork = to_delayed_work(work);
-	eviction_work = container_of(dwork, struct kfd_eviction_work,
-				     dwork);
 
 	/* Process termination destroys this worker thread. So during the
 	 * lifetime of this thread, kfd_process p will be valid
 	 */
-	p = container_of(eviction_work, struct kfd_process, eviction_work);
-
-	/* Narrow window of overlap between restore and evict work item is
-	 * possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos unreserves
-	 * KFD BOs, it is possible to evicted again. But restore has few more
-	 * steps of finish. So lets wait for the restore work to complete
+	p = container_of(dwork, struct kfd_process, eviction_work);
+	WARN_ONCE(p->last_eviction_seqno != p->ef->seqno,
+		  "Eviction fence mismatch\n");
+
+	/* Narrow window of overlap between restore and evict work
+	 * item is possible. Once
+	 * amdgpu_amdkfd_gpuvm_restore_process_bos unreserves KFD BOs,
+	 * it is possible to evicted again. But restore has few more
+	 * steps of finish. So lets wait for any previous restore work
+	 * to complete
 	 */
-	if (delayed_work_pending(&p->restore_work))
-		flush_delayed_work(&p->restore_work);
+	flush_delayed_work(&p->restore_work);
 
 	pr_info("Started evicting process of pasid %d\n", p->pasid);
 	ret = quiesce_process_mm(p);
 	if (!ret) {
-		dma_fence_signal(eviction_work->quiesce_fence);
-		WARN_ONCE(eviction_work->quiesce_fence != p->ef,
-			 "Eviction fence mismatch\n");
+		dma_fence_signal(p->ef);
 		dma_fence_put(p->ef);
-		/* TODO: quiesce_fence is same as kfd_process->ef. But
-		 * quiesce_fence is also used to avoid starting multiple
-		 * eviction work items. This might not be necessary and
-		 * one of the variables could be removed
-		 */
 		p->ef = NULL;
 		schedule_delayed_work(&p->restore_work,
 				msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
-	} else
-		pr_err("Failed to quiesce user queues. Cannot evict BOs\n");
-
-	dma_fence_put(eviction_work->quiesce_fence);
-
-	pr_info("Finished evicting process of pasid %d\n", p->pasid);
 
+		pr_info("Finished evicting process of pasid %d\n", p->pasid);
+	} else
+		pr_err("Failed to quiesce user queues. Cannot evict pasid %d\n",
+		       p->pasid);
 }
 
 static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 6b2e1e6..e04f6f9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -595,10 +595,6 @@ struct qcm_process_device {
 };
 
 /* KFD Memory Eviction */
-struct kfd_eviction_work {
-	struct delayed_work dwork;
-	struct dma_fence *quiesce_fence;
-};
 
 /* Approx. wait time before attempting to restore evicted BOs */
 #define PROCESS_RESTORE_TIME_MS 100
@@ -743,8 +739,10 @@ struct kfd_process {
 	struct dma_fence *ef;
 
 	/* Work items for evicting and restoring BOs */
-	struct kfd_eviction_work eviction_work;
+	struct delayed_work eviction_work;
 	struct delayed_work restore_work;
+	/* seqno of the last scheduled eviction */
+	unsigned int last_eviction_seqno;
 	/* Approx. the last timestamp (in jiffies) when the process was
 	 * restored after an eviction
 	 */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 3d663d31..64042f1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -458,7 +458,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
 	mutex_unlock(&kfd_processes_mutex);
 	synchronize_srcu(&kfd_processes_srcu);
 
-	cancel_delayed_work_sync(&p->eviction_work.dwork);
+	cancel_delayed_work_sync(&p->eviction_work);
 	cancel_delayed_work_sync(&p->restore_work);
 
 	mutex_lock(&p->mutex);
@@ -607,7 +607,7 @@ static struct kfd_process *create_process(const struct task_struct *thread,
 	if (err != 0)
 		goto err_init_apertures;
 
-	INIT_DELAYED_WORK(&process->eviction_work.dwork, kfd_evict_bo_worker);
+	INIT_DELAYED_WORK(&process->eviction_work, kfd_evict_bo_worker);
 	INIT_DELAYED_WORK(&process->restore_work, kfd_restore_bo_worker);
 	process->last_restore_timestamp = get_jiffies_64();
 
@@ -1051,8 +1051,7 @@ void kfd_suspend_all_processes(void)
 #else
 	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
 #endif
-		if (cancel_delayed_work_sync(&p->eviction_work.dwork))
-			dma_fence_put(p->eviction_work.quiesce_fence);
+		cancel_delayed_work_sync(&p->eviction_work);
 		cancel_delayed_work_sync(&p->restore_work);
 
 		if (quiesce_process_mm(p))
-- 
2.7.4