Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2085-amd-scheduler-imple-job-skip-feature-v3.patch')
-rw-r--r-- meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2085-amd-scheduler-imple-job-skip-feature-v3.patch | 180
1 file changed, 180 insertions(+), 0 deletions(-)
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2085-amd-scheduler-imple-job-skip-feature-v3.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2085-amd-scheduler-imple-job-skip-feature-v3.patch
new file mode 100644
index 00000000..ca17c44d
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2085-amd-scheduler-imple-job-skip-feature-v3.patch
@@ -0,0 +1,180 @@
+From e9519126a39f8cfe39323335f35f1d10d22f1702 Mon Sep 17 00:00:00 2001
+From: Monk Liu <Monk.Liu@amd.com>
+Date: Wed, 25 Oct 2017 16:21:08 +0800
+Subject: [PATCH 2085/4131] amd/scheduler: implement job skip feature (v3)
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+jobs are skipped in two cases:
+1) when the entity behind a job is marked guilty, the job popped
+from that entity's queue is dropped in the sched_main loop.
+
+2) in job_recovery(), a job is skipped when its karma is detected
+above the hang limit, and so are all other jobs sharing the same
+fence context. this approach is needed because job_recovery()
+cannot access job->entity, as the entity may already be dead.
+
+v2:
+some logic fixes
+
+v3:
+when an entity is detected guilty, don't drop the job at the popping
+stage; instead set its fence error to -ECANCELED
+
+in run_job(), skip the scheduling if either: 1) fence->error < 0,
+or 2) a VRAM loss occurred while this job was in flight.
+this way the job skipping logic is unified.
+
+with this feature we can introduce a new GPU recovery feature.
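
The unified rule is easy to model in isolation. Below is a minimal userspace sketch, not the kernel code: struct fake_fence and fence_set_error() are illustrative stand-ins for dma_fence and dma_fence_set_error(). Producers only mark the finished fence; run_job() makes the single skip decision.

    #include <errno.h>
    #include <stdio.h>

    struct fake_fence { int error; };

    static void fence_set_error(struct fake_fence *f, int err)
    {
            f->error = err;         /* record the error before the fence signals */
    }

    static void run_job(struct fake_fence *finished, int vram_lost)
    {
            if (vram_lost)
                    fence_set_error(finished, -ECANCELED);

            if (finished->error < 0)        /* one check covers both skip cases */
                    printf("Skip scheduling IBs!\n");
            else
                    printf("submitting IBs to the ring\n");
    }

    int main(void)
    {
            struct fake_fence guilty = { .error = -ECANCELED }; /* marked at pop time */
            struct fake_fence lost = { 0 };
            struct fake_fence clean = { 0 };

            run_job(&guilty, 0);    /* skipped: entity was marked guilty */
            run_job(&lost, 1);      /* skipped: VRAM was lost since submission */
            run_job(&clean, 0);     /* runs normally */
            return 0;
    }
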
+
+Change-Id: I268b1c752c94e6ecd4ea78c87eb226ea3f52908a
+Signed-off-by: Monk Liu <Monk.Liu@amd.com>
+Reviewed-by: Christian König <christian.koenig@amd.com>
+
+ Conflicts:
+ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 11 +++++---
+ drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 39 ++++++++++++++++-----------
+ 2 files changed, 30 insertions(+), 20 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+index f60662e..e97713a 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+@@ -180,7 +180,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job,
+
+ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
+ {
+- struct dma_fence *fence = NULL;
++ struct dma_fence *fence = NULL, *finished;
+ struct amdgpu_device *adev;
+ struct amdgpu_job *job;
+ int r;
+@@ -190,15 +190,18 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
+ return NULL;
+ }
+ job = to_amdgpu_job(sched_job);
++ finished = &job->base.s_fence->finished;
+ adev = job->adev;
+
+ BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
+
+ trace_amdgpu_sched_run_job(job);
+ /* skip ib schedule when vram is lost */
+- if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) {
+- dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED);
+- DRM_ERROR("Skip scheduling IBs!\n");
++ if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
++ dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
++
++ if (finished->error < 0) {
++ DRM_INFO("Skip scheduling IBs!\n");
+ } else {
+ r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
+ &fence);
+diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+index 903ef8b..6f041e8 100644
+--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
++++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+@@ -344,6 +344,10 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity)
+ if (amd_sched_entity_add_dependency_cb(entity))
+ return NULL;
+
++ /* skip jobs from an entity that is marked guilty */
++ if (entity->guilty && atomic_read(entity->guilty))
++ dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
++
+ spsc_queue_pop(&entity->job_queue);
+ return sched_job;
+ }
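
The hunk above changes pop-time behaviour: a guilty entity's jobs are still handed to the scheduler, only with a poisoned finished fence. A compact userspace model of that flow follows; sched_job, entity, and pop_job() here are assumptions for illustration, not the driver's structures.

    #include <errno.h>
    #include <stdatomic.h>

    struct sched_job { int finished_error; };   /* stands in for s_fence->finished */
    struct entity { atomic_int *guilty; };      /* shared per-context flag, may be NULL */

    static struct sched_job *pop_job(struct entity *e, struct sched_job *job)
    {
            if (e->guilty && atomic_load(e->guilty))
                    job->finished_error = -ECANCELED;   /* mark, don't drop */
            return job;     /* still handed to the scheduler; run_job() skips it */
    }

    int main(void)
    {
            atomic_int guilty_flag = 1;
            struct entity e = { .guilty = &guilty_flag };
            struct sched_job j = { 0 };

            pop_job(&e, &j);
            return j.finished_error == -ECANCELED ? 0 : 1;  /* exits 0: job was marked */
    }
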
+@@ -440,14 +444,6 @@ static void amd_sched_job_timedout(struct work_struct *work)
+ job->sched->ops->timedout_job(job);
+ }
+
+-static void amd_sched_set_guilty(struct amd_sched_job *s_job,
+- struct amd_sched_entity *s_entity)
+-{
+- if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
+- if (s_entity->guilty)
+- atomic_set(s_entity->guilty, 1);
+-}
+-
+ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
+ {
+ struct amd_sched_job *s_job;
+@@ -467,21 +463,24 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo
+ spin_unlock(&sched->job_list_lock);
+
+ if (bad) {
+- bool found = false;
+-
+- for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) {
++ /* don't increase @bad's karma if it's from the KERNEL RQ,
++ * because a GPU hang can corrupt kernel jobs (like VM updating
++ * jobs), but kernel jobs are always considered good.
++ */
++ for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++ ) {
+ struct amd_sched_rq *rq = &sched->sched_rq[i];
+
+ spin_lock(&rq->lock);
+ list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
+ if (bad->s_fence->scheduled.context == entity->fence_context) {
+- found = true;
+- amd_sched_set_guilty(bad, entity);
++ if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit)
++ if (entity->guilty)
++ atomic_set(entity->guilty, 1);
+ break;
+ }
+ }
+ spin_unlock(&rq->lock);
+- if (found)
++ if (&entity->list != &rq->entities)
+ break;
+ }
+ }
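
Two things happen in this hunk. First, the priority loop now stops before AMD_SCHED_PRIORITY_KERNEL, so jobs from the kernel run queue never accrue karma. Second, the `found` flag is replaced by a property of list_for_each_entry_safe(): after the loop, the cursor equals the list head exactly when no break fired, so `&entity->list != &rq->entities` means the guilty entity was found and the remaining run queues need not be scanned. A self-contained userspace toy of that idiom (the list macros are re-implemented here for illustration and only mirror the kernel's):

    #include <stddef.h>
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    #define list_entry(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    #define list_for_each_entry_safe(pos, n, head, member)                  \
            for (pos = list_entry((head)->next, typeof(*pos), member),      \
                 n = list_entry(pos->member.next, typeof(*pos), member);    \
                 &pos->member != (head);                                    \
                 pos = n, n = list_entry(n->member.next, typeof(*n), member))

    struct entity { unsigned int fence_context; struct list_head list; };

    int main(void)
    {
            struct list_head entities = { &entities, &entities };  /* rq head */
            struct entity a = { .fence_context = 1 };
            struct entity b = { .fence_context = 2 };
            struct entity *pos, *n;

            /* wire up: head -> a -> b -> head */
            entities.next = &a.list; a.list.prev = &entities;
            a.list.next = &b.list;   b.list.prev = &a.list;
            b.list.next = &entities; entities.prev = &b.list;

            list_for_each_entry_safe(pos, n, &entities, list)
                    if (pos->fence_context == 2)
                            break;          /* "guilty" entity found */

            /* cursor != head exactly when the break above fired */
            if (&pos->list != &entities)
                    printf("found context %u, stop scanning further rqs\n",
                           pos->fence_context);
            return 0;
    }
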
+@@ -499,6 +498,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job)
+ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
+ {
+ struct amd_sched_job *s_job, *tmp;
++ bool found_guilty = false;
+ int r;
+
+ spin_lock(&sched->job_list_lock);
+@@ -510,6 +510,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
+ list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
+ struct amd_sched_fence *s_fence = s_job->s_fence;
+ struct dma_fence *fence;
++ uint64_t guilty_context;
++
++ if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
++ found_guilty = true;
++ guilty_context = s_job->s_fence->scheduled.context;
++ }
++
++ if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
++ dma_fence_set_error(&s_fence->finished, -ECANCELED);
+
+ spin_unlock(&sched->job_list_lock);
+ fence = sched->ops->run_job(s_job);
+@@ -525,7 +534,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
+ r);
+ dma_fence_put(fence);
+ } else {
+- DRM_ERROR("Failed to run job!\n");
+ amd_sched_process_job(NULL, &s_fence->cb);
+ }
+ spin_lock(&sched->job_list_lock);
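
The recovery path cannot look at job->entity (the entity may already be freed), so it keys off the fence context recorded in the job itself: the first job whose karma exceeds hang_limit defines the guilty context, and every job from that point on in the mirror list with the same context gets its finished fence poisoned, to be skipped later by run_job(). A hypothetical userspace model of just this loop (struct job and recover() are assumptions, not driver code):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct job { int karma; uint64_t context; int error; };

    static void recover(struct job *jobs, int n, int hang_limit)
    {
            bool found_guilty = false;
            uint64_t guilty_context = 0;

            for (int i = 0; i < n; i++) {
                    if (!found_guilty && jobs[i].karma > hang_limit) {
                            found_guilty = true;
                            guilty_context = jobs[i].context;
                    }
                    /* poison the guilty job and everything after it that
                     * shares its fence context; run_job() skips them */
                    if (found_guilty && jobs[i].context == guilty_context)
                            jobs[i].error = -ECANCELED;
            }
    }

    int main(void)
    {
            struct job jobs[] = {
                    { .karma = 0, .context = 7 },   /* innocent, other context */
                    { .karma = 3, .context = 9 },   /* karma > limit: guilty */
                    { .karma = 0, .context = 9 },   /* same context: cancelled too */
                    { .karma = 0, .context = 7 },   /* innocent: re-runs normally */
            };

            recover(jobs, 4, 2);
            for (int i = 0; i < 4; i++)
                    printf("job %d: %s\n", i, jobs[i].error ? "cancelled" : "re-run");
            return 0;
    }
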
+@@ -663,7 +671,6 @@ static int amd_sched_main(void *param)
+ r);
+ dma_fence_put(fence);
+ } else {
+- DRM_ERROR("Failed to run job!\n");
+ amd_sched_process_job(NULL, &s_fence->cb);
+ }
+
+--
+2.7.4
+