From 62a47e504af42f37ecf5ffef8fd1350276547cfc Mon Sep 17 00:00:00 2001 From: Monk Liu Date: Wed, 8 Nov 2017 14:35:04 +0800 Subject: [PATCH 2156/4131] drm/amdgpu:fix gpu recover missing skipping(v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit if app close CTX right after IB submit, gpu recover will fail to find out the entity behind this guilty job thus lead to no job skipping for this guilty job. to fix this corner case just move the increasement of job->karma out of the entity iteration. v2: only do karma increasment if bad->s_priority != KERNEL because we always consider KERNEL job be correct and always want to recover an unfinished kernel job (sometimes kernel job is interrupted by VF FLR or other GPU hang event) Change-Id: I33e9e959e182d7e002a2108e565cb898acac4f9c Signed-off-by: Monk Liu Reviewed-by: Christian König Reviewed-By: Xiangliang Yu --- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index 74a60c5..537b296 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -464,7 +464,8 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo } spin_unlock(&sched->job_list_lock); - if (bad) { + if (bad && bad->s_priority != AMD_SCHED_PRIORITY_KERNEL) { + atomic_inc(&bad->karma); /* don't increase @bad's karma if it's from KERNEL RQ, * becuase sometimes GPU hang would cause kernel jobs (like VM updating jobs) * corrupt but keep in mind that kernel jobs always considered good. @@ -475,7 +476,7 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo spin_lock(&rq->lock); list_for_each_entry_safe(entity, tmp, &rq->entities, list) { if (bad->s_fence->scheduled.context == entity->fence_context) { - if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit) + if (atomic_read(&bad->karma) > bad->sched->hang_limit) if (entity->guilty) atomic_set(entity->guilty, 1); break; -- 2.7.4