diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2085-amd-scheduler-imple-job-skip-feature-v3.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2085-amd-scheduler-imple-job-skip-feature-v3.patch | 180 |
1 file changed, 180 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2085-amd-scheduler-imple-job-skip-feature-v3.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2085-amd-scheduler-imple-job-skip-feature-v3.patch
new file mode 100644
index 00000000..ca17c44d
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2085-amd-scheduler-imple-job-skip-feature-v3.patch
@@ -0,0 +1,180 @@
+From e9519126a39f8cfe39323335f35f1d10d22f1702 Mon Sep 17 00:00:00 2001
+From: Monk Liu <Monk.Liu@amd.com>
+Date: Wed, 25 Oct 2017 16:21:08 +0800
+Subject: [PATCH 2085/4131] amd/scheduler:imple job skip feature(v3)
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Jobs are skipped in two cases:
+1) when the entity behind this job is marked guilty, the job
+popped from this entity's queue will be dropped in the sched_main loop.
+
+2) in job_recovery(), skip scheduling the job if its karma is
+above the limit, and skip other jobs sharing the same fence
+context as well. This approach is needed because job_recovery()
+cannot access job->entity, since the entity may already be dead.
+
+v2:
+some logic fixes
+
+v3:
+when an entity is detected guilty, don't drop the job at the popping
+stage; instead, set its fence error to -ECANCELED.
+
+in run_job(), skip the scheduling if either: 1) fence->error < 0,
+or 2) a VRAM LOST occurred for this job.
+This way we can unify the job-skipping logic.
+
+With this feature we can introduce the new GPU recovery feature.
+
+Change-Id: I268b1c752c94e6ecd4ea78c87eb226ea3f52908a
+Signed-off-by: Monk Liu <Monk.Liu@amd.com>
+Reviewed-by: Christian König <christian.koenig@amd.com>
+
+ Conflicts:
+	drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       | 11 +++++---
+ drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 39 ++++++++++++++++-----------
+ 2 files changed, 30 insertions(+), 20 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+index f60662e..e97713a 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+@@ -180,7 +180,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job,
+ 
+ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
+ {
+-	struct dma_fence *fence = NULL;
++	struct dma_fence *fence = NULL, *finished;
+ 	struct amdgpu_device *adev;
+ 	struct amdgpu_job *job;
+ 	int r;
+@@ -190,15 +190,18 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
+ 		return NULL;
+ 	}
+ 	job = to_amdgpu_job(sched_job);
++	finished = &job->base.s_fence->finished;
+ 	adev = job->adev;
+ 
+ 	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
+ 
+ 	trace_amdgpu_sched_run_job(job);
+ 	/* skip ib schedule when vram is lost */
+-	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) {
+-		dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED);
+-		DRM_ERROR("Skip scheduling IBs!\n");
++	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
++		dma_fence_set_error(finished, -ECANCELED); /* skip IB as well if VRAM lost */
++
++	if (finished->error < 0) {
++		DRM_INFO("Skip scheduling IBs!\n");
+ 	} else {
+ 		r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
+ 				       &fence);
+diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+index 903ef8b..6f041e8 100644
+--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
++++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+@@ -344,6 +344,10 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity)
+ 	if (amd_sched_entity_add_dependency_cb(entity))
+ 		return NULL;
+ 
++	/* skip jobs from an entity marked guilty */
++	if (entity->guilty && atomic_read(entity->guilty))
++		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
++
+ 	spsc_queue_pop(&entity->job_queue);
+ 	return sched_job;
+ }
+@@ -440,14 +444,6 @@ static void amd_sched_job_timedout(struct work_struct *work)
+ 	job->sched->ops->timedout_job(job);
+ }
+ 
+-static void amd_sched_set_guilty(struct amd_sched_job *s_job,
+-				 struct amd_sched_entity *s_entity)
+-{
+-	if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
+-		if (s_entity->guilty)
+-			atomic_set(s_entity->guilty, 1);
+-}
+-
+ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
+ {
+ 	struct amd_sched_job *s_job;
+@@ -467,21 +463,24 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo
+ 	spin_unlock(&sched->job_list_lock);
+ 
+ 	if (bad) {
+-		bool found = false;
+-
+-		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) {
++		/* don't increase @bad's karma if it's from the KERNEL RQ,
++		 * because sometimes a GPU hang can corrupt kernel jobs (like
++		 * VM updating jobs); keep in mind that kernel jobs are always
++		 * considered good.
++		 */
++		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++ ) {
+ 			struct amd_sched_rq *rq = &sched->sched_rq[i];
+ 
+ 			spin_lock(&rq->lock);
+ 			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
+ 				if (bad->s_fence->scheduled.context == entity->fence_context) {
+-					found = true;
+-					amd_sched_set_guilty(bad, entity);
++					if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit)
++						if (entity->guilty)
++							atomic_set(entity->guilty, 1);
+ 					break;
+ 				}
+ 			}
+ 			spin_unlock(&rq->lock);
+-			if (found)
++			if (&entity->list != &rq->entities)
+ 				break;
+ 		}
+ 	}
+@@ -499,6 +498,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job)
+ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
+ {
+ 	struct amd_sched_job *s_job, *tmp;
++	bool found_guilty = false;
+ 	int r;
+ 
+ 	spin_lock(&sched->job_list_lock);
+@@ -510,6 +510,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
+ 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
+ 		struct amd_sched_fence *s_fence = s_job->s_fence;
+ 		struct dma_fence *fence;
++		uint64_t guilty_context;
++
++		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
++			found_guilty = true;
++			guilty_context = s_job->s_fence->scheduled.context;
++		}
++
++		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
++			dma_fence_set_error(&s_fence->finished, -ECANCELED);
+ 
+ 		spin_unlock(&sched->job_list_lock);
+ 		fence = sched->ops->run_job(s_job);
+@@ -525,7 +534,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
+ 				  r);
+ 			dma_fence_put(fence);
+ 		} else {
+-			DRM_ERROR("Failed to run job!\n");
+ 			amd_sched_process_job(NULL, &s_fence->cb);
+ 		}
+ 		spin_lock(&sched->job_list_lock);
+@@ -663,7 +671,6 @@ static int amd_sched_main(void *param)
+ 				  r);
+ 			dma_fence_put(fence);
+ 		} else {
+-			DRM_ERROR("Failed to run job!\n");
+ 			amd_sched_process_job(NULL, &s_fence->cb);
+ 		}
+ 
+-- 
+2.7.4
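
Editor's note: for readers who want the control flow without the diff noise, below is a minimal userspace C sketch of the pattern this patch describes. Everything in it is hypothetical illustration (the toy_* names, the struct layout, the printf stand-ins for DRM_INFO and amdgpu_ib_schedule); it is not the kernel API. It models the core idea: a job is never dropped, its finished fence is poisoned with -ECANCELED, and a single error check at run time performs the skip.

/*
 * Hypothetical userspace model of the unified job-skip pattern from the
 * patch above. All toy_* names are illustrative, not kernel API.
 */
#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_fence { int error; uint64_t context; };

struct toy_job {
	struct toy_fence scheduled;   /* carries the entity's fence context */
	struct toy_fence finished;    /* error < 0 means "skip this job"    */
	atomic_int karma;             /* bumped each time the job hangs     */
	unsigned vram_lost_snapshot;  /* counter value captured at submit   */
};

/* Pop time (cf. amd_sched_entity_pop_job): mark, don't drop. */
static void toy_pop(struct toy_job *j, atomic_int *entity_guilty)
{
	if (entity_guilty && atomic_load(entity_guilty))
		j->finished.error = -ECANCELED;
}

/* Run time (cf. amdgpu_job_run): one unified skip decision. */
static void toy_run(struct toy_job *j, atomic_uint *vram_lost)
{
	if (j->vram_lost_snapshot != atomic_load(vram_lost))
		j->finished.error = -ECANCELED;   /* VRAM lost: skip too */

	if (j->finished.error < 0)
		printf("Skip scheduling IBs!\n");
	else
		printf("scheduling job (ctx %llu)\n",
		       (unsigned long long)j->scheduled.context);
}

/* Recovery (cf. amd_sched_job_recovery): once one job's karma passes the
 * hang limit, every later job sharing its fence context is cancelled,
 * because the owning entity may already be gone and cannot be consulted. */
static void toy_recover(struct toy_job *jobs, int n, int hang_limit,
			atomic_uint *vram_lost)
{
	bool found_guilty = false;
	uint64_t guilty_ctx = 0;

	for (int i = 0; i < n; i++) {
		struct toy_job *j = &jobs[i];

		if (!found_guilty && atomic_load(&j->karma) > hang_limit) {
			found_guilty = true;
			guilty_ctx = j->scheduled.context;
		}
		if (found_guilty && j->scheduled.context == guilty_ctx)
			j->finished.error = -ECANCELED;
		toy_run(j, vram_lost);
	}
}

int main(void)
{
	atomic_uint vram_lost = 0;
	atomic_int guilty = 0;
	struct toy_job jobs[3] = {
		{ .scheduled = { 0, 7 }, .karma = 3 },  /* karma over limit */
		{ .scheduled = { 0, 7 } },              /* same context     */
		{ .scheduled = { 0, 9 } },              /* innocent job     */
	};

	toy_pop(&jobs[1], &guilty);          /* entity not guilty: no-op  */
	toy_recover(jobs, 3, 2, &vram_lost); /* hang_limit = 2            */
	return 0;
}

The value of the unification shows in toy_run(): guilty entities, VRAM loss, and over-karma jobs found during recovery all funnel into the same finished.error < 0 test, which is what lets the recovery path cancel a guilty context's jobs without dereferencing a possibly-freed entity.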