aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4378-drm-amdgpu-Improve-RAS-documentation-v2.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4378-drm-amdgpu-Improve-RAS-documentation-v2.patch')
-rw-r--r--meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4378-drm-amdgpu-Improve-RAS-documentation-v2.patch171
1 files changed, 171 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4378-drm-amdgpu-Improve-RAS-documentation-v2.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4378-drm-amdgpu-Improve-RAS-documentation-v2.patch
new file mode 100644
index 00000000..9960cf92
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4378-drm-amdgpu-Improve-RAS-documentation-v2.patch
@@ -0,0 +1,171 @@
+From 3bdc8244e734cff375c70e51b03220787da61eed Mon Sep 17 00:00:00 2001
+From: Alex Deucher <alexander.deucher@amd.com>
+Date: Wed, 30 Oct 2019 14:40:09 -0400
+Subject: [PATCH 4378/4736] drm/amdgpu: Improve RAS documentation (v2)
+
+Clarify some areas, clean up formatting, add section for
+unrecoverable error handling.
+
+v2: fix grammatical errors
+
+Reviewed-by: Yong Zhao <yong.zhao@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+---
+ Documentation/gpu/amdgpu.rst | 35 ++++++++++++++++++++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 40 ++++++++++++++++++++-----
+ 2 files changed, 68 insertions(+), 7 deletions(-)
+
+diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst
+index 5b9eaf23558e..0efede580039 100644
+--- a/Documentation/gpu/amdgpu.rst
++++ b/Documentation/gpu/amdgpu.rst
+@@ -82,12 +82,21 @@ AMDGPU XGMI Support
+ AMDGPU RAS Support
+ ==================
+
++The AMDGPU RAS interfaces are exposed via sysfs (for informational queries) and
++debugfs (for error injection).
++
+ RAS debugfs/sysfs Control and Error Injection Interfaces
+ --------------------------------------------------------
+
+ .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+ :doc: AMDGPU RAS debugfs control interface
+
++RAS Reboot Behavior for Unrecoverable Errors
++--------------------------------------------------------
++
++.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
++ :doc: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
++
+ RAS Error Count sysfs Interface
+ -------------------------------
+
+@@ -109,6 +118,32 @@ RAS VRAM Bad Pages sysfs Interface
+ .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+ :internal:
+
++Sample Code
++-----------
++Sample code for testing error injection can be found here:
++https://cgit.freedesktop.org/mesa/drm/tree/tests/amdgpu/ras_tests.c
++
++This is part of the libdrm amdgpu unit tests which cover several areas of the GPU.
++There are four sets of tests:
++
++RAS Basic Test
++
++This test checks the RAS feature enablement status and makes sure the necessary sysfs and debugfs files
++are present.
++
++RAS Query Test
++
++This test checks the RAS availability and enablement status for each supported IP block as well as
++the error counts.
++
++RAS Inject Test
++
++This test injects errors for each IP.
++
++RAS Disable Test
++
++This test verifies that RAS features can be disabled for each IP block.
++
+
+ GPU Power/Thermal Controls and Monitoring
+ =========================================
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+index afc3ee47d1b2..399617932427 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+@@ -218,7 +218,7 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
+ * As their names indicate, inject operation will write the
+ * value to the address.
+ *
+- * Second member: struct ras_debug_if::op.
++ * The second member: struct ras_debug_if::op.
+ * It has three kinds of operations.
+ *
+ * - 0: disable RAS on the block. Take ::head as its data.
+@@ -226,14 +226,20 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
+ * - 2: inject errors on the block. Take ::inject as its data.
+ *
+ * How to use the interface?
+- * programs:
+- * copy the struct ras_debug_if in your codes and initialize it.
+- * write the struct to the control node.
++ *
++ * Programs
++ *
++ * Copy the struct ras_debug_if into your code and initialize it.
++ * Write the struct to the control node.
++ *
++ * Shells
+ *
+ * .. code-block:: bash
+ *
+ * echo op block [error [sub_block address value]] > .../ras/ras_ctrl
+ *
++ * Parameters:
++ *
+ * op: disable, enable, inject
+ * disable: only block is needed
+ * enable: block and error are needed
+@@ -263,8 +269,10 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
+ * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
+ *
+ * .. note::
+- * Operation is only allowed on blocks which are supported.
++ * Operations are only allowed on blocks which are supported.
+ * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask
++ * to see which blocks support RAS on a particular asic.
++ *
+ */
+ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
+ size_t size, loff_t *pos)
+@@ -320,7 +328,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
+ * DOC: AMDGPU RAS debugfs EEPROM table reset interface
+ *
+ * Some boards contain an EEPROM which is used to persistently store a list of
+- * bad pages containing ECC errors detected in vram. This interface provides
++ * bad pages which experience ECC errors in vram. This interface provides
+ * a way to reset the EEPROM, e.g., after testing error injection.
+ *
+ * Usage:
+@@ -360,7 +368,7 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
+ /**
+ * DOC: AMDGPU RAS sysfs Error Count Interface
+ *
+- * It allows user to read the error count for each IP block on the gpu through
++ * It allows the user to read the error count for each IP block on the gpu through
+ * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
+ *
+ * It outputs the multiple lines which report the uncorrected (ue) and corrected
+@@ -1025,6 +1033,24 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
+ }
+ /* sysfs end */
+
++/**
++ * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
++ *
++ * Normally when there is an uncorrectable error, the driver will reset
++ * the GPU to recover. However, in the event of an unrecoverable error,
++ * the driver provides an interface to reboot the system automatically
++ * when such an error occurs.
++ *
++ * The following file in debugfs provides that interface:
++ * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
++ *
++ * Usage:
++ *
++ * .. code-block:: bash
++ *
++ * echo true > .../ras/auto_reboot
++ *
++ */
+ /* debugfs begin */
+ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
+ {
+--
+2.17.1
+