meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4643-drm-scheduler-Avoid-accessing-freed-bad-job.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97

From de6e01403013d5d7ec970b8b4eb4ca07cc736d19 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Date: Mon, 25 Nov 2019 15:51:29 -0500
Subject: [PATCH 4643/4736] drm/scheduler: Avoid accessing freed bad job.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problem:
Due to a race between drm_sched_cleanup_jobs in sched thread and
drm_sched_job_timedout in timeout work there is a possiblity that
bad job was already freed while still being accessed from the
timeout thread.

Fix:
Instead of just peeking at the bad job in the mirror list
remove it from the list under lock and then put it back later when
we are garanteed no race with main sched thread is possible which
is after the thread is parked.

v2: Lock around processing ring_mirror_list in drm_sched_cleanup_jobs.

v3: Rebase on top of drm-misc-next. v2 is not needed anymore as
drm_sched_get_cleanup_job already has a lock there.

v4: Fix comments to relfect latest code in drm-misc.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Emily Deng <Emily.Deng@amd.com>
Tested-by: Emily Deng <Emily.Deng@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Link: https://patchwork.freedesktop.org/patch/342356
Signed-off-by: Rahul Kumar <rahul.kumar1@amd.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 27 ++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 108bac88dedb..0ccce80513e5 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -288,10 +288,21 @@ static void drm_sched_job_timedout(struct work_struct *work)
 	unsigned long flags;
 
 	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
+
+	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
+	spin_lock_irqsave(&sched->job_list_lock, flags);
 	job = list_first_entry_or_null(&sched->ring_mirror_list,
 				       struct drm_sched_job, node);
 
 	if (job) {
+		/*
+		 * Remove the bad job so it cannot be freed by concurrent
+		 * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
+		 * is parked at which point it's safe.
+		 */
+		list_del_init(&job->node);
+		spin_unlock_irqrestore(&sched->job_list_lock, flags);
+
 		job->sched->ops->timedout_job(job);
 
 		/*
@@ -302,6 +313,8 @@ static void drm_sched_job_timedout(struct work_struct *work)
 			job->sched->ops->free_job(job);
 			sched->free_guilty = false;
 		}
+	} else {
+		spin_unlock_irqrestore(&sched->job_list_lock, flags);
 	}
 
 	spin_lock_irqsave(&sched->job_list_lock, flags);
@@ -413,6 +426,20 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 		}
 	}
 
+	/*
+	 * Reinsert back the bad job here - now it's safe as
+	 * drm_sched_get_cleanup_job cannot race against us and release the
+	 * bad job at this point - we parked (waited for) any in progress
+	 * (earlier) cleanups and drm_sched_get_cleanup_job will not be called
+	 * now until the scheduler thread is unparked.
+	 */
+	if (bad && bad->sched == sched)
+		/*
+		 * Add at the head of the queue to reflect it was the earliest
+		 * job extracted.
+		 */
+		list_add(&bad->node, &sched->ring_mirror_list);
+
 	/*
 	 * Stop pending timer in flight as we rearm it in  drm_sched_start. This
 	 * avoids the pending timeout work in progress to fire right away after
-- 
2.17.1