aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3736-dmr-amdgpu-Avoid-HW-GPU-reset-for-RAS.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3736-dmr-amdgpu-Avoid-HW-GPU-reset-for-RAS.patch')
-rw-r--r--meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3736-dmr-amdgpu-Avoid-HW-GPU-reset-for-RAS.patch512
1 files changed, 512 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3736-dmr-amdgpu-Avoid-HW-GPU-reset-for-RAS.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3736-dmr-amdgpu-Avoid-HW-GPU-reset-for-RAS.patch
new file mode 100644
index 00000000..476de3c3
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3736-dmr-amdgpu-Avoid-HW-GPU-reset-for-RAS.patch
@@ -0,0 +1,512 @@
+From 61d5f95e1eca078269c2b3dc74e18b57ad13a064 Mon Sep 17 00:00:00 2001
+From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
+Date: Thu, 22 Aug 2019 14:40:00 -0400
+Subject: [PATCH 3736/4256] dmr/amdgpu: Avoid HW GPU reset for RAS.
+
+Problem:
+Under certain conditions, when some IP blocks take a RAS error,
+we can get into a situation where a GPU reset is not possible
+due to issues in RAS in SMU/PSP.
+
+Temporary fix until proper solution in PSP/SMU is ready:
+When uncorrectable error happens the DF will unconditionally
+broadcast error event packets to all its clients/slave upon
+receiving fatal error event and freeze all its outbound queues,
+err_event_athub interrupt will be triggered.
+In such a case we use this interrupt
+to issue a GPU reset. The GPU reset code is modified for this case to avoid HW
+reset; it only stops the schedulers, detaches all in-progress and not-yet-scheduled
+jobs' fences, sets an error code on them and signals them.
+Also reject any new incoming job submissions from user space.
+All this is done to notify the applications of the problem.
+
+v2:
+Extract amdgpu_amdkfd_pre/post_reset from amdgpu_device_lock/unlock_adev
+Move amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c
+Remove print param from amdgpu_ras_query_error_count
+
+v3:
+Update based on previous bug-fixing patch to properly call amdgpu_amdkfd_pre_reset
+for other XGMI hive members.
+
+Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
+Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
+Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 +++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 38 ++++++++++++++++------
+ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 +++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 38 ++++++++++++++++++++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 3 ++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 ++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 +++++++++++--
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 10 ++++++
+ drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 +++---
+ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 +++++++-------
+ drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 5 +++
+ drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +++++++++---------
+ 12 files changed, 155 insertions(+), 42 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+index 55282bfcaa45..901ce33cc481 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+@@ -34,6 +34,7 @@
+ #include "amdgpu_gmc.h"
+ #include "amdgpu_gem.h"
+ #include "amdgpu_display.h"
++#include "amdgpu_ras.h"
+
+ static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p,
+ struct drm_amdgpu_cs_chunk_fence *data,
+@@ -1278,6 +1279,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
+ bool reserved_buffers = false;
+ int i, r;
+
++ if (amdgpu_ras_intr_triggered())
++ return -EHWPOISON;
++
+ if (!adev->accel_working)
+ return -EBUSY;
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+index be0a06014037..e30f7ba53aab 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+@@ -3725,25 +3725,18 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
+ adev->mp1_state = PP_MP1_STATE_NONE;
+ break;
+ }
+- /* Block kfd: SRIOV would do it separately */
+- if (!amdgpu_sriov_vf(adev))
+- amdgpu_amdkfd_pre_reset(adev);
+
+ return true;
+ }
+
+ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
+ {
+- /*unlock kfd: SRIOV would do it separately */
+- if (!amdgpu_sriov_vf(adev))
+- amdgpu_amdkfd_post_reset(adev);
+ amdgpu_vf_error_trans_all(adev);
+ adev->mp1_state = PP_MP1_STATE_NONE;
+ adev->in_gpu_reset = 0;
+ mutex_unlock(&adev->lock_reset);
+ }
+
+-
+ /**
+ * amdgpu_device_gpu_recover - reset the asic and recover scheduler
+ *
+@@ -3763,11 +3756,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+ struct amdgpu_hive_info *hive = NULL;
+ struct amdgpu_device *tmp_adev = NULL;
+ int i, r = 0;
++ bool in_ras_intr = amdgpu_ras_intr_triggered();
+
+ need_full_reset = job_signaled = false;
+ INIT_LIST_HEAD(&device_list);
+
+- dev_info(adev->dev, "GPU reset begin!\n");
++ dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset");
+
+ cancel_delayed_work_sync(&adev->delayed_init_work);
+
+@@ -3794,9 +3788,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+ return 0;
+ }
+
++ /* Block kfd: SRIOV would do it separately */
++ if (!amdgpu_sriov_vf(adev))
++ amdgpu_amdkfd_pre_reset(adev);
++
+ /* Build list of devices to reset */
+ if (adev->gmc.xgmi.num_physical_nodes > 1) {
+ if (!hive) {
++ /*unlock kfd: SRIOV would do it separately */
++ if (!amdgpu_sriov_vf(adev))
++ amdgpu_amdkfd_post_reset(adev);
+ amdgpu_device_unlock_adev(adev);
+ return -ENODEV;
+ }
+@@ -3814,8 +3815,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+
+ /* block all schedulers and reset given job's ring */
+ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+- if (tmp_adev != adev)
++ if (tmp_adev != adev) {
+ amdgpu_device_lock_adev(tmp_adev, false);
++ if (!amdgpu_sriov_vf(tmp_adev))
++ amdgpu_amdkfd_pre_reset(tmp_adev);
++ }
++
+ /*
+ * Mark these ASICs to be reseted as untracked first
+ * And add them back after reset completed
+@@ -3823,7 +3828,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+ amdgpu_unregister_gpu_instance(tmp_adev);
+
+ /* disable ras on ALL IPs */
+- if (amdgpu_device_ip_need_full_reset(tmp_adev))
++ if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev))
+ amdgpu_ras_suspend(tmp_adev);
+
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+@@ -3833,10 +3838,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+ continue;
+
+ drm_sched_stop(&ring->sched, job ? &job->base : NULL);
++
++ if (in_ras_intr)
++ amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
+ }
+ }
+
+
++ if (in_ras_intr)
++ goto skip_sched_resume;
++
+ /*
+ * Must check guilty signal here since after this point all old
+ * HW fences are force signaled.
+@@ -3895,6 +3906,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+
+ /* Post ASIC reset for all devs .*/
+ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
++
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+ struct amdgpu_ring *ring = tmp_adev->rings[i];
+
+@@ -3921,7 +3933,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+ } else {
+ dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
+ }
++ }
+
++skip_sched_resume:
++ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
++ /*unlock kfd: SRIOV would do it separately */
++ if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev))
++ amdgpu_amdkfd_post_reset(tmp_adev);
+ amdgpu_device_unlock_adev(tmp_adev);
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+index 2de8db5e864c..3aa7c136d2c3 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+@@ -40,6 +40,8 @@
+
+ #include "amdgpu_amdkfd.h"
+
++#include "amdgpu_ras.h"
++
+ /*
+ * KMS wrapper.
+ * - 3.0.0 - initial driver
+@@ -1144,6 +1146,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
+ struct drm_device *dev = pci_get_drvdata(pdev);
+ struct amdgpu_device *adev = dev->dev_private;
+
++ if (amdgpu_ras_intr_triggered())
++ return;
++
+ /* if we are running in a VM, make sure the device
+ * torn down properly on reboot/shutdown.
+ * unfortunately we can't detect certain
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+index 7ab1241bd9e5..c043d8f6bb8b 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+@@ -246,6 +246,44 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
+ return fence;
+ }
+
++#define to_drm_sched_job(sched_job) \
++ container_of((sched_job), struct drm_sched_job, queue_node)
++
++void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
++{
++ struct drm_sched_job *s_job;
++ struct drm_sched_entity *s_entity = NULL;
++ int i;
++
++ /* Signal all jobs not yet scheduled */
++ for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
++ struct drm_sched_rq *rq = &sched->sched_rq[i];
++
++ if (!rq)
++ continue;
++
++ spin_lock(&rq->lock);
++ list_for_each_entry(s_entity, &rq->entities, list) {
++ while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
++ struct drm_sched_fence *s_fence = s_job->s_fence;
++
++ dma_fence_signal(&s_fence->scheduled);
++ dma_fence_set_error(&s_fence->finished, -EHWPOISON);
++ dma_fence_signal(&s_fence->finished);
++ }
++ }
++ spin_unlock(&rq->lock);
++ }
++
++ /* Signal all jobs already scheduled to HW */
++ list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
++ struct drm_sched_fence *s_fence = s_job->s_fence;
++
++ dma_fence_set_error(&s_fence->finished, -EHWPOISON);
++ dma_fence_signal(&s_fence->finished);
++ }
++}
++
+ const struct drm_sched_backend_ops amdgpu_sched_ops = {
+ .dependency = amdgpu_job_dependency,
+ .run_job = amdgpu_job_run,
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+index 51e62504c279..dc7ee9358dcd 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+@@ -76,4 +76,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
+ void *owner, struct dma_fence **f);
+ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
+ struct dma_fence **fence);
++
++void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched);
++
+ #endif
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+index 751c4c8e1cee..9d4e71ee8791 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+@@ -1030,6 +1030,12 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
+ /* Ensure IB tests are run on ring */
+ flush_delayed_work(&adev->delayed_init_work);
+
++
++ if (amdgpu_ras_intr_triggered()) {
++ DRM_ERROR("RAS Intr triggered, device disabled!!");
++ return -EHWPOISON;
++ }
++
+ file_priv->driver_priv = NULL;
+
+ r = pm_runtime_get_sync(dev->dev);
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+index 2ca3997d4b3a..01a66559f04e 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+@@ -24,6 +24,8 @@
+ #include <linux/debugfs.h>
+ #include <linux/list.h>
+ #include <linux/module.h>
++#include <linux/reboot.h>
++#include <linux/syscalls.h>
+ #include "amdgpu.h"
+ #include "amdgpu_ras.h"
+ #include "amdgpu_atomfirmware.h"
+@@ -64,6 +66,9 @@ const char *ras_block_string[] = {
+ /* inject address is 52 bits */
+ #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
+
++
++atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
++
+ static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
+ uint64_t offset, uint64_t size,
+ struct amdgpu_bo **bo_ptr);
+@@ -188,6 +193,10 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
+
+ return 0;
+ }
++
++static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
++ struct ras_common_if *head);
++
+ /**
+ * DOC: AMDGPU RAS debugfs control interface
+ *
+@@ -627,12 +636,14 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
+ info->ue_count = obj->err_data.ue_count;
+ info->ce_count = obj->err_data.ce_count;
+
+- if (err_data.ce_count)
++ if (err_data.ce_count) {
+ dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
+ obj->err_data.ce_count, ras_block_str(info->head.block));
+- if (err_data.ue_count)
++ }
++ if (err_data.ue_count) {
+ dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
+ obj->err_data.ue_count, ras_block_str(info->head.block));
++ }
+
+ return 0;
+ }
+@@ -1729,3 +1740,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
+
+ return 0;
+ }
++
++void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
++{
++ if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
++ DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
++ }
++}
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+index 66b71525446e..6fda96b29f1f 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+@@ -606,4 +606,14 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
+
+ int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
+ struct ras_dispatch_if *info);
++
++extern atomic_t amdgpu_ras_in_intr;
++
++static inline bool amdgpu_ras_intr_triggered(void)
++{
++ return !!atomic_read(&amdgpu_ras_in_intr);
++}
++
++void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
++
+ #endif
+diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+index 384fc226ecb5..918eaeedb5b9 100644
+--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+@@ -5683,10 +5683,12 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
+ struct amdgpu_iv_entry *entry)
+ {
+ /* TODO ue will trigger an interrupt. */
+- kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+- if (adev->gfx.funcs->query_ras_error_count)
+- adev->gfx.funcs->query_ras_error_count(adev, err_data);
+- amdgpu_ras_reset_gpu(adev, 0);
++ if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
++ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
++ if (adev->gfx.funcs->query_ras_error_count)
++ adev->gfx.funcs->query_ras_error_count(adev, err_data);
++ amdgpu_ras_reset_gpu(adev, 0);
++ }
+ return AMDGPU_RAS_SUCCESS;
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+index 4a19647edfea..617311db7d2e 100644
+--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+@@ -243,18 +243,20 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
+ struct ras_err_data *err_data,
+ struct amdgpu_iv_entry *entry)
+ {
+- kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+- if (adev->umc.funcs->query_ras_error_count)
+- adev->umc.funcs->query_ras_error_count(adev, err_data);
+- /* umc query_ras_error_address is also responsible for clearing
+- * error status
+- */
+- if (adev->umc.funcs->query_ras_error_address)
+- adev->umc.funcs->query_ras_error_address(adev, err_data);
++ if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
++ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
++ if (adev->umc.funcs->query_ras_error_count)
++ adev->umc.funcs->query_ras_error_count(adev, err_data);
++ /* umc query_ras_error_address is also responsible for clearing
++ * error status
++ */
++ if (adev->umc.funcs->query_ras_error_address)
++ adev->umc.funcs->query_ras_error_address(adev, err_data);
+
+- /* only uncorrectable error needs gpu reset */
+- if (err_data->ue_count)
+- amdgpu_ras_reset_gpu(adev, 0);
++ /* only uncorrectable error needs gpu reset */
++ if (err_data->ue_count)
++ amdgpu_ras_reset_gpu(adev, 0);
++ }
+
+ return AMDGPU_RAS_SUCCESS;
+ }
+diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+index 5e784bbd2d7f..27eeab143ad7 100644
+--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
++++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+@@ -30,6 +30,7 @@
+ #include "nbio/nbio_7_4_0_smn.h"
+ #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
+ #include <uapi/linux/kfd_ioctl.h>
++#include "amdgpu_ras.h"
+
+ #define smnNBIF_MGCG_CTRL_LCLK 0x1013a21c
+
+@@ -329,6 +330,8 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
+ BIF_DOORBELL_INT_CNTL,
+ RAS_CNTLR_INTERRUPT_CLEAR, 1);
+ WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
++
++ amdgpu_ras_global_ras_isr(adev);
+ }
+ }
+
+@@ -344,6 +347,8 @@ static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d
+ BIF_DOORBELL_INT_CNTL,
+ RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
+ WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
++
++ amdgpu_ras_global_ras_isr(adev);
+ }
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+index 72840582f716..6424723e1af0 100644
+--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+@@ -1978,24 +1978,26 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
+ uint32_t err_source;
+ int instance;
+
+- instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
+- if (instance < 0)
+- return 0;
++ if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
++ instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
++ if (instance < 0)
++ return 0;
+
+- switch (entry->src_id) {
+- case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
+- err_source = 0;
+- break;
+- case SDMA0_4_0__SRCID__SDMA_ECC:
+- err_source = 1;
+- break;
+- default:
+- return 0;
+- }
++ switch (entry->src_id) {
++ case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
++ err_source = 0;
++ break;
++ case SDMA0_4_0__SRCID__SDMA_ECC:
++ err_source = 1;
++ break;
++ default:
++ return 0;
++ }
+
+- kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
++ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+
+- amdgpu_ras_reset_gpu(adev, 0);
++ amdgpu_ras_reset_gpu(adev, 0);
++ }
+
+ return AMDGPU_RAS_SUCCESS;
+ }
+--
+2.17.1
+