aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3738-dmr-amdgpu-Add-system-auto-reboot-to-RAS.patch
blob: 6ad7b6ae6f002a58b7c6a7fefb9d15447a8607d5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
From 013f63d334471e85b469aad0bba8ed3c2d256cd0 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Date: Thu, 22 Aug 2019 15:01:37 -0400
Subject: [PATCH 3738/4256] dmr/amdgpu: Add system auto reboot to RAS.

In case of RAS error allow user configure auto system
reboot through ras_ctrl.
This is also part of the temproray work around for the RAS
hang problem.

v4: Use latest kernel API for disk sync.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 ++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    |  9 ++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  2 +-
 include/linux/suspend.h                    |  3 +++
 include/linux/syscalls.h                   |  2 +-
 kernel/power/main.c                        | 11 ++++++++++-
 6 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e30f7ba53aab..b29b26098b8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -64,6 +64,8 @@
 #include "amdgpu_ras.h"
 #include "amdgpu_pmu.h"
 
+#include <linux/suspend.h>
+
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
@@ -3758,6 +3760,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool in_ras_intr = amdgpu_ras_intr_triggered();
 
+	/*
+	 * Flush RAM to disk so that after reboot
+	 * the user can read log and see why the system rebooted.
+	 */
+	if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
+
+		DRM_WARN("Emergency reboot.");
+
+		ksys_sync_helper();
+		emergency_restart();
+	}
+
 	need_full_reset = job_signaled = false;
 	INIT_LIST_HEAD(&device_list);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 01a66559f04e..5c2276bb8325 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -154,6 +154,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 		op = 1;
 	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
 		op = 2;
+	else if (sscanf(str, "reboot %32s", block_name) == 1)
+		op = 3;
 	else if (str[0] && str[1] && str[2] && str[3])
 		/* ascii string, but commands are not matched. */
 		return -EINVAL;
@@ -287,6 +289,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 		/* data.inject.address is offset instead of absolute gpu address */
 		ret = amdgpu_ras_error_inject(adev, &data.inject);
 		break;
+	case 3:
+		amdgpu_ras_get_context(adev)->reboot = true;
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -1744,6 +1749,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
 {
 	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
-		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
+		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");
+
+		amdgpu_ras_reset_gpu(adev, false);
 	}
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 6fda96b29f1f..f487038ba331 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -334,7 +334,7 @@ struct amdgpu_ras {
 	struct mutex recovery_lock;
 
 	uint32_t flags;
-
+	bool reboot;
 	struct amdgpu_ras_eeprom_control eeprom_control;
 };
 
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 3f529ad9a9d2..6b3ea9ea6a9e 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -425,6 +425,7 @@ void restore_processor_state(void);
 /* kernel/power/main.c */
 extern int register_pm_notifier(struct notifier_block *nb);
 extern int unregister_pm_notifier(struct notifier_block *nb);
+extern void ksys_sync_helper(void);
 
 #define pm_notifier(fn, pri) {				\
 	static struct notifier_block fn##_nb =			\
@@ -462,6 +463,8 @@ static inline int unregister_pm_notifier(struct notifier_block *nb)
 	return 0;
 }
 
+static inline void ksys_sync_helper(void) {}
+
 #define pm_notifier(fn, pri)	do { (void)(fn); } while (0)
 
 static inline bool pm_wakeup_pending(void) { return false; }
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2ff814c92f7f..9dc129a92b31 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -906,7 +906,7 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
 asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
 			 int flags, uint32_t sig);
-
+void ksys_sync(void);
 /*
  * Architecture-specific system calls
  */
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 35b50823d83b..1f5b1b262ff7 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -16,7 +16,7 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/suspend.h>
-
+#include <linux/syscalls.h>
 #include "power.h"
 
 #ifdef CONFIG_PM_SLEEP
@@ -49,6 +49,15 @@ void unlock_system_sleep(void)
 	current->flags &= ~PF_FREEZER_SKIP;
 	mutex_unlock(&system_transition_mutex);
 }
+
+void ksys_sync_helper(void)
+{
+        pr_info("Syncing filesystems ... ");
+        ksys_sync();
+        pr_cont("done.\n");
+}
+EXPORT_SYMBOL_GPL(ksys_sync_helper);
+
 EXPORT_SYMBOL_GPL(unlock_system_sleep);
 
 /* Routines for PM-transition notifications */
-- 
2.17.1