aboutsummaryrefslogtreecommitdiffstats
path: root/common/recipes-kernel/linux/linux-yocto-4.14.71/1477-drm-amdkfd-Don-t-dereference-kfd_process.mm.patch
blob: 0f65397771a0ddf4ceda62c1be55ac94b194e20c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
From 4e615881865564e540f40efb96021093bd42c7a3 Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Tue, 26 Jul 2016 17:30:54 -0400
Subject: [PATCH 1477/4131] drm/amdkfd: Don't dereference kfd_process.mm

The kfd_process doesn't own a reference to the mm_struct, so it can
disappear without warning even while the kfd_process still exists.
In fact, the delayed kfd_process teardown is triggered by an MMU
notifier when the mm_struct is destroyed. Permanently holding a
reference to the mm_struct would prevent this from happening.

Therefore, avoid dereferencing the kfd_process.mm pointer and make
it opaque. Use other ways to access the mm:
 * In process context, use current->mm
 * In calls that know the mm, use it directly
 * Otherwise use get_task_mm to get a reference

Change-Id: Idcea859d0eaa6d62978b3a8ee54d83cbcfc0d7cd
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c               |  9 ++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 ++--------
 drivers/gpu/drm/amd/amdkfd/kfd_events.c               | 17 ++++++++++++++---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h                 |  7 ++++++-
 4 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index af3790f..0111510 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -708,9 +708,16 @@ int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm)
 
 	r = -ENODEV;
 	pdd = kfd_get_process_device_data(kfd, p);
-	if (pdd)
+	if (pdd) {
+		if (kfd->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
+			down_read(&mm->mmap_sem);
+
 		r = process_restore_queues(kfd->dqm, &pdd->qpd);
 
+		if (kfd->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
+			up_read(&mm->mmap_sem);
+	}
+
 	up_read(&p->lock);
 	return r;
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 1506597..df9b3f3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -409,7 +409,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 	BUG_ON(!dqm || !q || !q->mqd);
 
 	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
-		down_read(&q->process->mm->mmap_sem);
+		down_read(&current->mm->mmap_sem);
 	mutex_lock(&dqm->lock);
 
 	pdd = kfd_get_process_device_data(q->device, q->process);
@@ -466,7 +466,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 out_unlock:
 	mutex_unlock(&dqm->lock);
 	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
-		up_read(&q->process->mm->mmap_sem);
+		up_read(&current->mm->mmap_sem);
 
 	return retval;
 }
@@ -541,14 +541,10 @@ int process_restore_queues(struct device_queue_manager *dqm,
 {
 	struct queue *q, *next;
 	struct mqd_manager *mqd;
-	struct kfd_process_device *pdd =
-		container_of(qpd, struct kfd_process_device, qpd);
 	int retval = 0;
 
 	BUG_ON(!dqm || !qpd);
 
-	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
-		down_read(&pdd->process->mm->mmap_sem);
 	mutex_lock(&dqm->lock);
 	if (qpd->evicted == 0) /* already restored, do nothing */
 		goto out_unlock;
@@ -588,8 +584,6 @@ int process_restore_queues(struct device_queue_manager *dqm,
 
 out_unlock:
 	mutex_unlock(&dqm->lock);
-	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
-		up_read(&pdd->process->mm->mmap_sem);
 
 	return retval;
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index eb51873..5f7aa78 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1027,14 +1027,24 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
 	 * running so the lookup function returns a read-locked process.
 	 */
 	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+	struct mm_struct *mm;
 
 	if (!p)
 		return; /* Presumably process exited. */
 
+	/* Take a safe reference to the mm_struct, which may otherwise
+	 * disappear even while the kfd_process is still referenced.
+	 */
+	mm = get_task_mm(p->lead_thread);
+	if (!mm) {
+		up_read(&p->lock);
+		return; /* Process is exiting */
+	}
+
 	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
 
-	down_read(&p->mm->mmap_sem);
-	vma = find_vma(p->mm, address);
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, address);
 
 	memory_exception_data.gpu_id = dev->id;
 	memory_exception_data.va = address;
@@ -1060,7 +1070,8 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
 		}
 	}
 
-	up_read(&p->mm->mmap_sem);
+	up_read(&mm->mmap_sem);
+	mmput(mm); /* pairs with get_task_mm(): drops mm_users, not mm_count */
 
 	mutex_lock(&p->event_mutex);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index f540931..7576799 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -607,7 +607,12 @@ struct kfd_process {
 	 */
 	struct hlist_node kfd_processes;
 
-	struct mm_struct *mm;
+	/*
+	 * Opaque pointer to mm_struct. We don't hold a reference to
+	 * it so it should never be dereferenced from here. This is
+	 * only used for looking up processes by their mm.
+	 */
+	void *mm;
 
 	struct kref ref;
 	struct work_struct release_work;
-- 
2.7.4