diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4378-drm-amdgpu-Improve-RAS-documentation-v2.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4378-drm-amdgpu-Improve-RAS-documentation-v2.patch | 171 |
1 files changed, 171 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4378-drm-amdgpu-Improve-RAS-documentation-v2.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4378-drm-amdgpu-Improve-RAS-documentation-v2.patch new file mode 100644 index 00000000..9960cf92 --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4378-drm-amdgpu-Improve-RAS-documentation-v2.patch @@ -0,0 +1,171 @@ +From 3bdc8244e734cff375c70e51b03220787da61eed Mon Sep 17 00:00:00 2001 +From: Alex Deucher <alexander.deucher@amd.com> +Date: Wed, 30 Oct 2019 14:40:09 -0400 +Subject: [PATCH 4378/4736] drm/amdgpu: Improve RAS documentation (v2) + +Clarify some areas, clean up formatting, add section for +unrecoverable error handling. + +v2: fix grammatical errors + +Reviewed-by: Yong Zhao <yong.zhao@amd.com> +Signed-off-by: Alex Deucher <alexander.deucher@amd.com> +--- + Documentation/gpu/amdgpu.rst | 35 ++++++++++++++++++++++ + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 40 ++++++++++++++++++++----- + 2 files changed, 68 insertions(+), 7 deletions(-) + +diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst +index 5b9eaf23558e..0efede580039 100644 +--- a/Documentation/gpu/amdgpu.rst ++++ b/Documentation/gpu/amdgpu.rst +@@ -82,12 +82,21 @@ AMDGPU XGMI Support + AMDGPU RAS Support + ================== + ++The AMDGPU RAS interfaces are exposed via sysfs (for informational queries) and ++debugfs (for error injection). ++ + RAS debugfs/sysfs Control and Error Injection Interfaces + -------------------------------------------------------- + + .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c + :doc: AMDGPU RAS debugfs control interface + ++RAS Reboot Behavior for Unrecoverable Errors ++-------------------------------------------------------- ++ ++.. 
kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c ++ :doc: AMDGPU RAS Reboot Behavior for Unrecoverable Errors ++ + RAS Error Count sysfs Interface + ------------------------------- + +@@ -109,6 +118,32 @@ RAS VRAM Bad Pages sysfs Interface + .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c + :internal: + ++Sample Code ++----------- ++Sample code for testing error injection can be found here: ++https://cgit.freedesktop.org/mesa/drm/tree/tests/amdgpu/ras_tests.c ++ ++This is part of the libdrm amdgpu unit tests which cover several areas of the GPU. ++There are four sets of tests: ++ ++RAS Basic Test ++ ++This test verifies the RAS feature enabled status and makes sure the necessary sysfs and debugfs files ++are present. ++ ++RAS Query Test ++ ++This test checks the RAS availability and enablement status for each supported IP block as well as ++the error counts. ++ ++RAS Inject Test ++ ++This test injects errors for each IP block. ++ ++RAS Disable Test ++ ++This test verifies disabling of RAS features for each IP block. ++ + + GPU Power/Thermal Controls and Monitoring + ========================================= +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +index afc3ee47d1b2..399617932427 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +@@ -218,7 +218,7 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, + * As their names indicate, inject operation will write the + * value to the address. + * +- * Second member: struct ras_debug_if::op. ++ * The second member: struct ras_debug_if::op. + * It has three kinds of operations. + * + * - 0: disable RAS on the block. Take ::head as its data. +@@ -226,14 +226,20 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, + * - 2: inject errors on the block. Take ::inject as its data. + * + * How to use the interface? 
+- * programs: +- * copy the struct ras_debug_if in your codes and initialize it. +- * write the struct to the control node. ++ * ++ * Programs ++ * ++ * Copy the struct ras_debug_if into your code and initialize it. ++ * Write the struct to the control node. ++ * ++ * Shells + * + * .. code-block:: bash + * + * echo op block [error [sub_block address value]] > .../ras/ras_ctrl + * ++ * Parameters: ++ * + * op: disable, enable, inject + * disable: only block is needed + * enable: block and error are needed +@@ -263,8 +269,10 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, + * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count + * + * .. note:: +- * Operation is only allowed on blocks which are supported. ++ * Operations are only allowed on blocks which are supported. + * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask ++ * to see which blocks support RAS on a particular asic. ++ * + */ + static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf, + size_t size, loff_t *pos) +@@ -320,7 +328,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * + * DOC: AMDGPU RAS debugfs EEPROM table reset interface + * + * Some boards contain an EEPROM which is used to persistently store a list of +- * bad pages containing ECC errors detected in vram. This interface provides ++ * bad pages which experienced ECC errors in vram. This interface provides + * a way to reset the EEPROM, e.g., after testing error injection. 
+ * + * Usage: +@@ -360,7 +368,7 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = { + /** + * DOC: AMDGPU RAS sysfs Error Count Interface + * +- * It allows user to read the error count for each IP block on the gpu through ++ * It allows the user to read the error count for each IP block on the gpu through + * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count + * + * It outputs the multiple lines which report the uncorrected (ue) and corrected +@@ -1025,6 +1033,24 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) + } + /* sysfs end */ + ++/** ++ * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors ++ * ++ * Normally when there is an uncorrectable error, the driver will reset ++ * the GPU to recover. However, in the event of an unrecoverable error, ++ * the driver provides an interface to reboot the system automatically ++ * in that event. ++ * ++ * The following file in debugfs provides that interface: ++ * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot ++ * ++ * Usage: ++ * ++ * .. code-block:: bash ++ * ++ * echo true > .../ras/auto_reboot ++ * ++ */ + /* debugfs begin */ + static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) + { +-- +2.17.1 + |