aboutsummaryrefslogtreecommitdiffstats
path: root/common/recipes-kernel/linux/linux-yocto-4.14.71/1516-drm-amdkfd-Add-restore-work-item-function.patch
blob: 759b0fbcefd2ffb50618eba32884288747fae19d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
From 4797dfd92f43f3cc68e5fa1e1d280f86c5d2128f Mon Sep 17 00:00:00 2001
From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Date: Tue, 5 Jul 2016 17:32:48 -0400
Subject: [PATCH 1516/4131] drm/amdkfd: Add restore work item function

If any BO from a process is evicted, then a restore work queue is
scheduled to start after a TIMEOUT. This thread will restore all the BOs
and the user queues.

Change-Id: I5591e2665bd8e7617243ad1772d0b057c0ce9f5e
Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  | 82 ++++++++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    | 12 +++++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  3 ++
 3 files changed, 97 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index a47d7f1..0ce9572 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -745,6 +745,88 @@ int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm)
 	return r;
 }
 
+/* resume_process_mm -
+ *  Resume all user queues that belong to the given process p. The caller must
+ *  ensure that the process p context is valid.
+ */
+static int resume_process_mm(struct kfd_process *p)
+{
+	struct kfd_process_device *pdd;
+	struct mm_struct *mm = (struct mm_struct *)p->mm;
+	int r, ret = 0;
+
+	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+		if (pdd->dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
+			down_read(&mm->mmap_sem);
+
+		r = process_restore_queues(pdd->dev->dqm, &pdd->qpd);
+		if (r != 0) {
+			pr_err("Failed to restore process queues\n");
+			if (ret == 0)
+				ret = r;
+		}
+
+		if (pdd->dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
+			up_read(&mm->mmap_sem);
+	}
+
+	return ret;
+}
+
+/** kfd_schedule_restore_bos_and_queues - Schedules work queue that will
+ *   restore all BOs that belong to the given process and then restore its queues
+ *
+ * @p: kfd_process whose BOs and user queues are to be restored
+ *
+ */
+static int kfd_schedule_restore_bos_and_queues(struct kfd_process *p)
+{
+	if (delayed_work_pending(&p->restore_work)) {
+		WARN(1, "Trying to evict an unrestored process\n");
+		cancel_delayed_work_sync(&p->restore_work);
+	}
+
+	/* During process initialization restore_work is initialized
+	 * to kfd_restore_bo_worker
+	 */
+	schedule_delayed_work(&p->restore_work, PROCESS_RESTORE_TIME_MS);
+	return 0;
+}
+
+void kfd_restore_bo_worker(struct work_struct *work)
+{
+	struct delayed_work *dwork;
+	struct kfd_process *p;
+	struct kfd_process_device *pdd;
+	int ret = 0;
+
+	dwork = to_delayed_work(work);
+
+	/* Process termination destroys this worker thread. So during the
+	 * lifetime of this thread, kfd_process p will be valid
+	 */
+	p = container_of(dwork, struct kfd_process, restore_work);
+
+	/* Call restore_process_bos on the first KGD device. This function
+	 * takes care of restoring the whole process including other devices.
+	 * Restore can fail if enough memory is not available. If so,
+	 * reschedule again.
+	 */
+	pdd = list_first_entry(&p->per_device_data,
+			       struct kfd_process_device,
+			       per_device_list);
+
+	ret = pdd->dev->kfd2kgd->restore_process_bos(p->master_vm);
+	if (ret) {
+		kfd_schedule_restore_bos_and_queues(p);
+		return;
+	}
+
+	ret = resume_process_mm(p);
+	if (ret)
+		pr_err("Failed to resume user queues\n");
+}
+
 static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
 				unsigned int chunk_size)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a672a72..eb9541f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -541,6 +541,15 @@ struct qcm_process_device {
 	void *ib_kaddr;
 };
 
+/* KFD Memory Eviction */
+/* Approx. wait time before attempting to restore evicted BOs */
+#define PROCESS_RESTORE_TIME_MS 2000
+/* Approx. back off time if restore fails due to lack of memory */
+#define PROCESS_BACK_OFF_TIME_MS 1000
+
+void kfd_restore_bo_worker(struct work_struct *work);
+
+
 /*8 byte handle containing GPU ID in the most significant 4 bytes and
  * idr_handle in the least significant 4 bytes*/
 #define MAKE_HANDLE(gpu_id, idr_handle) (((uint64_t)(gpu_id) << 32) + idr_handle)
@@ -671,6 +680,9 @@ struct kfd_process {
 	struct rb_root bo_interval_tree;
 
 	void *master_vm;
+
+	/* For restoring BOs after eviction */
+	struct delayed_work restore_work;
 };
 
 /**
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 19e3af9..542f0df 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -509,6 +509,8 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
 	p = container_of(mn, struct kfd_process, mmu_notifier);
 	BUG_ON(p->mm != mm);
 
+	cancel_delayed_work_sync(&p->restore_work);
+
 	mutex_lock(&kfd_processes_mutex);
 	hash_del_rcu(&p->kfd_processes);
 	mutex_unlock(&kfd_processes_mutex);
@@ -677,6 +679,7 @@ static struct kfd_process *create_process(const struct task_struct *thread,
 	if (err)
 		goto err_init_cwsr;
 
+	INIT_DELAYED_WORK(&process->restore_work, kfd_restore_bo_worker);
 	return process;
 
 err_init_cwsr:
-- 
2.7.4