meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4378-drm-amdgpu-Improve-RAS-documentation-v2.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171

From 3bdc8244e734cff375c70e51b03220787da61eed Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Wed, 30 Oct 2019 14:40:09 -0400
Subject: [PATCH 4378/4736] drm/amdgpu: Improve RAS documentation (v2)

Clarify some areas, clean up formatting, add section for
unrecoverable error handling.

v2: fix grammatical errors

Reviewed-by: Yong Zhao <yong.zhao@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 Documentation/gpu/amdgpu.rst            | 35 ++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 40 ++++++++++++++++++++-----
 2 files changed, 68 insertions(+), 7 deletions(-)

diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst
index 5b9eaf23558e..0efede580039 100644
--- a/Documentation/gpu/amdgpu.rst
+++ b/Documentation/gpu/amdgpu.rst
@@ -82,12 +82,21 @@ AMDGPU XGMI Support
 AMDGPU RAS Support
 ==================
 
+The AMDGPU RAS interfaces are exposed via sysfs (for informational queries) and
+debugfs (for error injection).
+
 RAS debugfs/sysfs Control and Error Injection Interfaces
 --------------------------------------------------------
 
 .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
    :doc: AMDGPU RAS debugfs control interface
 
+RAS Reboot Behavior for Unrecoverable Errors
+--------------------------------------------------------
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+   :doc: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
+
 RAS Error Count sysfs Interface
 -------------------------------
 
@@ -109,6 +118,32 @@ RAS VRAM Bad Pages sysfs Interface
 .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
    :internal:
 
+Sample Code
+-----------
+Sample code for testing error injection can be found here:
+https://cgit.freedesktop.org/mesa/drm/tree/tests/amdgpu/ras_tests.c
+
+This is part of the libdrm amdgpu unit tests which cover several areas of the GPU.
+There are four sets of tests:
+
+RAS Basic Test
+
+The test verifies the RAS feature enabled status and makes sure the necessary sysfs and debugfs files
+are present.
+
+RAS Query Test
+
+This test checks the RAS availability and enablement status for each supported IP block as well as
+the error counts.
+
+RAS Inject Test
+
+This test injects errors for each IP.
+
+RAS Disable Test
+
+This test tests disabling of RAS features for each IP block.
+
 
 GPU Power/Thermal Controls and Monitoring
 =========================================
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index afc3ee47d1b2..399617932427 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -218,7 +218,7 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
  * As their names indicate, inject operation will write the
  * value to the address.
  *
- * Second member: struct ras_debug_if::op.
+ * The second member: struct ras_debug_if::op.
  * It has three kinds of operations.
  *
  * - 0: disable RAS on the block. Take ::head as its data.
@@ -226,14 +226,20 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
  * - 2: inject errors on the block. Take ::inject as its data.
  *
  * How to use the interface?
- * programs:
- * copy the struct ras_debug_if in your codes and initialize it.
- * write the struct to the control node.
+ *
+ * Programs
+ *
+ * Copy the struct ras_debug_if in your codes and initialize it.
+ * Write the struct to the control node.
+ *
+ * Shells
  *
  * .. code-block:: bash
  *
  *	echo op block [error [sub_block address value]] > .../ras/ras_ctrl
  *
+ * Parameters:
+ *
  * op: disable, enable, inject
  *	disable: only block is needed
  *	enable: block and error are needed
@@ -263,8 +269,10 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
  * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
  *
  * .. note::
- *	Operation is only allowed on blocks which are supported.
+ *	Operations are only allowed on blocks which are supported.
  *	Please check ras mask at /sys/module/amdgpu/parameters/ras_mask
+ *	to see which blocks support RAS on a particular asic.
+ *
  */
 static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
 		size_t size, loff_t *pos)
@@ -320,7 +328,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
  * DOC: AMDGPU RAS debugfs EEPROM table reset interface
  *
  * Some boards contain an EEPROM which is used to persistently store a list of
- * bad pages containing ECC errors detected in vram.  This interface provides
+ * bad pages which experiences ECC errors in vram.  This interface provides
  * a way to reset the EEPROM, e.g., after testing error injection.
  *
  * Usage:
@@ -360,7 +368,7 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
 /**
  * DOC: AMDGPU RAS sysfs Error Count Interface
  *
- * It allows user to read the error count for each IP block on the gpu through
+ * It allows the user to read the error count for each IP block on the gpu through
  * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
  *
  * It outputs the multiple lines which report the uncorrected (ue) and corrected
@@ -1025,6 +1033,24 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
 }
 /* sysfs end */
 
+/**
+ * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
+ *
+ * Normally when there is an uncorrectable error, the driver will reset
+ * the GPU to recover.  However, in the event of an unrecoverable error,
+ * the driver provides an interface to reboot the system automatically
+ * in that event.
+ *
+ * The following file in debugfs provides that interface:
+ * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
+ *
+ * Usage:
+ *
+ * .. code-block:: bash
+ *
+ *	echo true > .../ras/auto_reboot
+ *
+ */
 /* debugfs begin */
 static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 {
-- 
2.17.1