aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux-5.4/linux-yocto-5.4.2/0051-drm-amdgpu-move-the-call-of-ras-recovery_init-and-ba.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux-5.4/linux-yocto-5.4.2/0051-drm-amdgpu-move-the-call-of-ras-recovery_init-and-ba.patch')
-rw-r--r--meta-amd-bsp/recipes-kernel/linux-5.4/linux-yocto-5.4.2/0051-drm-amdgpu-move-the-call-of-ras-recovery_init-and-ba.patch193
1 files changed, 193 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux-5.4/linux-yocto-5.4.2/0051-drm-amdgpu-move-the-call-of-ras-recovery_init-and-ba.patch b/meta-amd-bsp/recipes-kernel/linux-5.4/linux-yocto-5.4.2/0051-drm-amdgpu-move-the-call-of-ras-recovery_init-and-ba.patch
new file mode 100644
index 00000000..bf3777a1
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux-5.4/linux-yocto-5.4.2/0051-drm-amdgpu-move-the-call-of-ras-recovery_init-and-ba.patch
@@ -0,0 +1,193 @@
+From faf960c1ca7572016421f55fdabec5949ae7ea09 Mon Sep 17 00:00:00 2001
+From: Tao Zhou <tao.zhou1@amd.com>
+Date: Fri, 30 Aug 2019 19:50:39 +0800
+Subject: [PATCH 0051/1453] drm/amdgpu: move the call of ras recovery_init and
+ bad page reserve to proper place
+
+ras recovery_init should be called after ttm init, and
+bad page reservation should happen before gpu reset, since i2c
+may be unstable during gpu reset.
+Also add cleanup on failure to recovery_init and recovery_fini.
+
+v2: add more comments and prints.
+ remove cancel_work_sync in recovery_init.
+
+Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
+Reviewed-by: Guchun Chen <guchun.chen@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 ---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 39 ++++++++++++++--------
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 +++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 12 +++++++
+ 4 files changed, 43 insertions(+), 18 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+index 66fc2f38c282..36d5a1f63f14 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+@@ -3637,11 +3637,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+ break;
+ }
+ }
+-
+- list_for_each_entry(tmp_adev, device_list_handle,
+- gmc.xgmi.head) {
+- amdgpu_ras_reserve_bad_pages(tmp_adev);
+- }
+ }
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+index 74cc44359b3d..7240a8f42fa4 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+@@ -1492,16 +1492,17 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
+ return 0;
+ }
+
+-static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
++int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+ {
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_err_handler_data **data = &con->eh_data;
+ int ret;
+
+- *data = kmalloc(sizeof(**data),
+- GFP_KERNEL|__GFP_ZERO);
+- if (!*data)
+- return -ENOMEM;
++ *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
++ if (!*data) {
++ ret = -ENOMEM;
++ goto out;
++ }
+
+ mutex_init(&con->recovery_lock);
+ INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
+@@ -1510,18 +1511,30 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+
+ ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
+ if (ret)
+- return ret;
++ goto free;
+
+ if (adev->psp.ras.ras->eeprom_control.num_recs) {
+ ret = amdgpu_ras_load_bad_pages(adev);
+ if (ret)
+- return ret;
++ goto free;
+ ret = amdgpu_ras_reserve_bad_pages(adev);
+ if (ret)
+- return ret;
++ goto release;
+ }
+
+ return 0;
++
++release:
++ amdgpu_ras_release_bad_pages(adev);
++free:
++ con->eh_data = NULL;
++ kfree((*data)->bps);
++ kfree((*data)->bps_bo);
++ kfree(*data);
++out:
++ DRM_WARN("Failed to initialize ras recovery!\n");
++
++ return ret;
+ }
+
+ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
+@@ -1529,12 +1542,17 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_err_handler_data *data = con->eh_data;
+
++ /* recovery_init failed to init it, fini is useless */
++ if (!data)
++ return 0;
++
+ cancel_work_sync(&con->recovery_work);
+ amdgpu_ras_release_bad_pages(adev);
+
+ mutex_lock(&con->recovery_lock);
+ con->eh_data = NULL;
+ kfree(data->bps);
++ kfree(data->bps_bo);
+ kfree(data);
+ mutex_unlock(&con->recovery_lock);
+
+@@ -1626,9 +1644,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
+ return r;
+ }
+
+- if (amdgpu_ras_recovery_init(adev))
+- goto recovery_out;
+-
+ amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
+
+ if (amdgpu_ras_fs_init(adev))
+@@ -1643,8 +1658,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
+ con->hw_supported, con->supported);
+ return 0;
+ fs_out:
+- amdgpu_ras_recovery_fini(adev);
+-recovery_out:
+ amdgpu_ras_set_context(adev, NULL);
+ kfree(con);
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+index 96210e18191e..012034d2ae06 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+@@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
+ return ras && (ras->supported & (1 << block));
+ }
+
++int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
+ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
+ unsigned int block);
+
+@@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
+ {
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
++ /* save bad page to eeprom before gpu reset,
++ * i2c may be unstable in gpu reset
++ */
++ amdgpu_ras_reserve_bad_pages(adev);
+ if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
+ schedule_work(&ras->recovery_work);
+ return 0;
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+index dff41d0a85fe..d0b783dae6dc 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+@@ -54,6 +54,7 @@
+ #include "amdgpu_trace.h"
+ #include "amdgpu_amdkfd.h"
+ #include "amdgpu_sdma.h"
++#include "amdgpu_ras.h"
+ #include "bif/bif_4_1_d.h"
+
+ static int amdgpu_map_buffer(struct ttm_buffer_object *bo,
+@@ -1762,6 +1763,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
+ adev->gmc.visible_vram_size);
+ #endif
+
++ /*
++ * retired pages will be loaded from eeprom and reserved here,
++ * it should be called after ttm init since new bo may be created,
++ * recovery_init may fail, but it can free all resources allocated by
++ * itself and its failure should not stop amdgpu init process.
++ *
++ * Note: theoretically, this should be called before all vram allocations
++ * to protect retired pages from being allocated and used
++ */
++ amdgpu_ras_recovery_init(adev);
++
+ /*
+ *The reserved vram for firmware must be pinned to the specified
+ *place on the VRAM, so reserve it early.
+--
+2.17.1
+