From e9519126a39f8cfe39323335f35f1d10d22f1702 Mon Sep 17 00:00:00 2001
From: Monk Liu <Monk.Liu@amd.com>
Date: Wed, 25 Oct 2017 16:21:08 +0800
Subject: [PATCH 2085/4131] amd/scheduler: implement job skip feature (v3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

jobs are skipped in two cases:

1) when the entity behind a job is marked guilty, the job
popped from that entity's queue is dropped in the sched_main loop.

2) in job_recovery(), the job being rescheduled is skipped if its
karma is above the limit, and other jobs sharing the same fence
context are skipped as well. this approach is used because
job_recovery() cannot access job->entity, as the entity may
already be dead.
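
for reference, case 2 boils down to the following loop in
amd_sched_job_recovery() (condensed from the scheduler hunk below;
names follow the diff):

	bool found_guilty = false;
	uint64_t guilty_context;

	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
			found_guilty = true;
			guilty_context = s_job->s_fence->scheduled.context;
		}
		/* cancel the guilty job and any job sharing its fence context */
		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
			dma_fence_set_error(&s_job->s_fence->finished, -ECANCELED);
		/* run_job() is then called as usual and skips cancelled jobs */
	}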

v2:
some logic fixes

v3:
when an entity is detected as guilty, don't drop the job at the
popping stage; instead set its fence error to -ECANCELED.

in run_job(), skip scheduling if either: 1) fence->error < 0,
or 2) a VRAM loss occurred for this job.
this way the job skipping logic is unified.
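
condensed from the hunks below, the unified skip path is roughly:

	/* amd_sched_entity_pop_job(): mark jobs from a guilty entity */
	if (entity->guilty && atomic_read(entity->guilty))
		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);

	/* amdgpu_job_run(): one check covers both guilty and VRAM-lost jobs */
	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
		dma_fence_set_error(finished, -ECANCELED);

	if (finished->error < 0)
		DRM_INFO("Skip scheduling IBs!\n");
	else
		r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
				       &fence);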

with this feature we can introduce a new gpu recover feature.

Change-Id: I268b1c752c94e6ecd4ea78c87eb226ea3f52908a
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>

 Conflicts:
        drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       | 11 +++++---
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 39 ++++++++++++++++-----------
 2 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index f60662e..e97713a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -180,7 +180,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job,
 
 static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
 {
-	struct dma_fence *fence = NULL;
+	struct dma_fence *fence = NULL, *finished;
 	struct amdgpu_device *adev;
 	struct amdgpu_job *job;
 	int r;
@@ -190,15 +190,18 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
 		return NULL;
 	}
 	job = to_amdgpu_job(sched_job);
+	finished = &job->base.s_fence->finished;
 	adev = job->adev;
 
 	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
 
 	trace_amdgpu_sched_run_job(job);
 	/* skip ib schedule when vram is lost */
-	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) {
-		dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED);
-		DRM_ERROR("Skip scheduling IBs!\n");
+	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
+		dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
+
+	if (finished->error < 0) {
+		DRM_INFO("Skip scheduling IBs!\n");
 	} else {
 		r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
 				       &fence);
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 903ef8b..6f041e8 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -344,6 +344,10 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity)
 		if (amd_sched_entity_add_dependency_cb(entity))
 			return NULL;
 
+	/* skip jobs from an entity that is marked guilty */
+	if (entity->guilty && atomic_read(entity->guilty))
+		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
+
 	spsc_queue_pop(&entity->job_queue);
 	return sched_job;
 }
@@ -440,14 +444,6 @@ static void amd_sched_job_timedout(struct work_struct *work)
 	job->sched->ops->timedout_job(job);
 }
 
-static void amd_sched_set_guilty(struct amd_sched_job *s_job,
-				 struct amd_sched_entity *s_entity)
-{
-	if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
-		if (s_entity->guilty)
-			atomic_set(s_entity->guilty, 1);
-}
-
 void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
 {
 	struct amd_sched_job *s_job;
@@ -467,21 +463,24 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo
 	spin_unlock(&sched->job_list_lock);
 
 	if (bad) {
-		bool found = false;
-
-		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) {
+		/* don't increase @bad's karma if it's from KERNEL RQ,
+		 * because sometimes a GPU hang would corrupt kernel jobs (like VM updating jobs),
+		 * but keep in mind that kernel jobs are always considered good.
+		 */
+		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++ ) {
 			struct amd_sched_rq *rq = &sched->sched_rq[i];
 
 			spin_lock(&rq->lock);
 			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
 				if (bad->s_fence->scheduled.context == entity->fence_context) {
-					found = true;
-					amd_sched_set_guilty(bad, entity);
+				    if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit)
+						if (entity->guilty)
+							atomic_set(entity->guilty, 1);
 					break;
 				}
 			}
 			spin_unlock(&rq->lock);
-			if (found)
+			if (&entity->list != &rq->entities)
 				break;
 		}
 	}
@@ -499,6 +498,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job)
 void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 {
 	struct amd_sched_job *s_job, *tmp;
+	bool found_guilty = false;
 	int r;
 
 	spin_lock(&sched->job_list_lock);
@@ -510,6 +510,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
 		struct amd_sched_fence *s_fence = s_job->s_fence;
 		struct dma_fence *fence;
+		uint64_t guilty_context;
+
+		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
+			found_guilty = true;
+			guilty_context = s_job->s_fence->scheduled.context;
+		}
+
+		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
+			dma_fence_set_error(&s_fence->finished, -ECANCELED);
 
 		spin_unlock(&sched->job_list_lock);
 		fence = sched->ops->run_job(s_job);
@@ -525,7 +534,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 					  r);
 			dma_fence_put(fence);
 		} else {
-			DRM_ERROR("Failed to run job!\n");
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}
 		spin_lock(&sched->job_list_lock);
@@ -663,7 +671,6 @@ static int amd_sched_main(void *param)
 					  r);
 			dma_fence_put(fence);
 		} else {
-			DRM_ERROR("Failed to run job!\n");
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}
 
-- 
2.7.4