Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch')
-rw-r--r--  common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch  398
1 file changed, 398 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch b/common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch
new file mode 100644
index 00000000..b0e7a77c
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch
@@ -0,0 +1,398 @@
+From 6cddd831befde582b4da8634487287642e695a36 Mon Sep 17 00:00:00 2001
+From: Emily Deng <Emily.Deng@amd.com>
+Date: Mon, 16 Apr 2018 10:07:02 +0800
+Subject: [PATCH 4195/5725] drm/gpu-sched: fix force APP kill hang(v4)
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+issue:
+VMC page faults occur if an application is force-killed during a
+3DMark test. The cause: in entity_fini() we manually signal all
+jobs in the entity's queue, which confuses the sync/dependency
+mechanism:
+
+1) A page fault occurs in SDMA's clear job, which operates on a
+shadow buffer whose GART table has already been cleaned by
+ttm_bo_release, because the fence in its reservation was fake-signaled
+by entity_fini() after SIGKILL was received.
+
+2) A page fault occurs in a GFX job because, during that job's
+lifetime, entity_fini() manually fake-signals all jobs from its
+entity; the unmap/clear-PTE job that depends on those result fences
+is therefore considered satisfied, so SDMA starts clearing the PTEs
+and triggers a GFX page fault.
+
+fix:
+1) In entity_fini(), at least wait for all already-scheduled jobs to
+complete when SIGKILL is the case.
+
+2) If a signaled fence is used to clear an entity's dependency, mark
+that entity guilty so its jobs do not actually run, since the
+dependency was only fake-signaled.
+
+v2:
+Split drm_sched_entity_fini() into two functions:
+1) The first one does the waiting, removes the entity from the
+run queue and returns an error when the process was killed.
+2) The second one then goes over the entity, installs completion
+callbacks on the remaining jobs and signals them all with an
+error code.
+
+v3:
+1) Replace the fini1 and fini2 names with better ones.
+2) Call the first part before the VM teardown in
+amdgpu_driver_postclose_kms() and the second part
+after the VM teardown.
+3) Keep the original drm_sched_entity_fini() as a wrapper
+that calls both parts.
+
+v4:
+1) Rename entity->finished to entity->last_scheduled.
+2) Rename drm_sched_entity_fini_job_cb() to
+drm_sched_entity_kill_jobs_cb().
+3) Pass NULL to drm_sched_entity_kill_jobs_cb() on -ENOENT.
+4) Change the type of entity->fini_status to int.
+5) Remove the check on entity->finished.
+
+Signed-off-by: Monk Liu <Monk.Liu@amd.com>
+Signed-off-by: Emily Deng <Emily.Deng@amd.com>
+Reviewed-by: Christian König <christian.koenig@amd.com>
+Signed-off-by: Kalyan Alle <kalyan.alle@amd.com>
+
+Conflicts:
+ drivers/gpu/drm/scheduler/gpu_scheduler.c
+
+Change-Id: I8c859960c8faf8ab36210f098e6514b455bea171
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 64 ++++++++++++++++++++++++----
+ drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 5 ++-
+ drivers/gpu/drm/scheduler/gpu_scheduler.c | 71 ++++++++++++++++++++++++++-----
+ include/drm/gpu_scheduler.h | 7 +++
+ 5 files changed, 128 insertions(+), 21 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+index 526c7a0..01496d7 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+@@ -702,6 +702,8 @@ int amdgpu_ctx_ioctl(struct drm_device *dev, void *data,
+ int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, unsigned ring_id);
+
+ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);
++void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr);
++void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);
+ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
+
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+index 2e705f9..bdeec74 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+@@ -113,8 +113,9 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
+ return r;
+ }
+
+-static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
++static void amdgpu_ctx_fini(struct kref *ref)
+ {
++ struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx, refcount);
+ struct amdgpu_device *adev = ctx->adev;
+ unsigned i, j;
+
+@@ -142,13 +143,11 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
+ kfree(ctx->fences);
+ ctx->fences = NULL;
+
+- for (i = 0; i < adev->num_rings; i++)
+- drm_sched_entity_fini(&adev->rings[i]->sched,
+- &ctx->rings[i].entity);
+-
+ amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
+
+ mutex_destroy(&ctx->lock);
++
++ kfree(ctx);
+ }
+
+ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
+@@ -187,12 +186,15 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
+ static void amdgpu_ctx_do_release(struct kref *ref)
+ {
+ struct amdgpu_ctx *ctx;
++ u32 i;
+
+ ctx = container_of(ref, struct amdgpu_ctx, refcount);
+
+- amdgpu_ctx_fini(ctx);
++ for (i = 0; i < ctx->adev->num_rings; i++)
++ drm_sched_entity_fini(&ctx->adev->rings[i]->sched,
++ &ctx->rings[i].entity);
+
+- kfree(ctx);
++ amdgpu_ctx_fini(ref);
+ }
+
+ static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id)
+@@ -452,16 +454,62 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr)
+ idr_init(&mgr->ctx_handles);
+ }
+
++void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
++{
++ struct amdgpu_ctx *ctx;
++ struct idr *idp;
++ uint32_t id, i;
++
++ idp = &mgr->ctx_handles;
++
++ idr_for_each_entry(idp, ctx, id) {
++
++ if (!ctx->adev)
++ return;
++
++ for (i = 0; i < ctx->adev->num_rings; i++)
++ if (kref_read(&ctx->refcount) == 1)
++ drm_sched_entity_do_release(&ctx->adev->rings[i]->sched,
++ &ctx->rings[i].entity);
++ else
++ DRM_ERROR("ctx %p is still alive\n", ctx);
++ }
++}
++
++void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr)
++{
++ struct amdgpu_ctx *ctx;
++ struct idr *idp;
++ uint32_t id, i;
++
++ idp = &mgr->ctx_handles;
++
++ idr_for_each_entry(idp, ctx, id) {
++
++ if (!ctx->adev)
++ return;
++
++ for (i = 0; i < ctx->adev->num_rings; i++)
++ if (kref_read(&ctx->refcount) == 1)
++ drm_sched_entity_cleanup(&ctx->adev->rings[i]->sched,
++ &ctx->rings[i].entity);
++ else
++ DRM_ERROR("ctx %p is still alive\n", ctx);
++ }
++}
++
+ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
+ {
+ struct amdgpu_ctx *ctx;
+ struct idr *idp;
+ uint32_t id;
+
++ amdgpu_ctx_mgr_entity_cleanup(mgr);
++
+ idp = &mgr->ctx_handles;
+
+ idr_for_each_entry(idp, ctx, id) {
+- if (kref_put(&ctx->refcount, amdgpu_ctx_do_release) != 1)
++ if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)
+ DRM_ERROR("ctx %p is still alive\n", ctx);
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+index efbed62..79d1060 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+@@ -959,8 +959,7 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
+ return;
+
+ pm_runtime_get_sync(dev->dev);
+-
+- amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
++ amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr);
+
+ if (adev->asic_type != CHIP_RAVEN) {
+ amdgpu_uvd_free_handles(adev, file_priv);
+@@ -981,6 +980,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
+ pd = amdgpu_bo_ref(fpriv->vm.root.base.bo);
+
+ amdgpu_vm_fini(adev, &fpriv->vm);
++ amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
++
+ if (pasid)
+ amdgpu_pasid_free_delayed(pd->tbo.resv, pasid);
+ amdgpu_bo_unref(&pd);
+diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
+index 0d95888..f45b4fd 100644
+--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
++++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
+@@ -135,6 +135,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
+ entity->rq = rq;
+ entity->sched = sched;
+ entity->guilty = guilty;
++ entity->fini_status = 0;
++ entity->last_scheduled = NULL;
+
+ spin_lock_init(&entity->rq_lock);
+ spin_lock_init(&entity->queue_lock);
+@@ -196,19 +198,30 @@ static bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
+ return true;
+ }
+
++static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
++ struct dma_fence_cb *cb)
++{
++ struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
++ finish_cb);
++ drm_sched_fence_finished(job->s_fence);
++ WARN_ON(job->s_fence->parent);
++ dma_fence_put(&job->s_fence->finished);
++ job->sched->ops->free_job(job);
++}
++
++
+ /**
+ * Destroy a context entity
+ *
+ * @sched Pointer to scheduler instance
+ * @entity The pointer to a valid scheduler entity
+ *
+- * Cleanup and free the allocated resources.
++ * Splitting drm_sched_entity_fini() into two functions: the first one does the waiting,
++ * removes the entity from the runqueue and returns an error when the process was killed.
+ */
+-void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
++void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
+ struct drm_sched_entity *entity)
+ {
+- int r;
+-
+ if (!drm_sched_entity_is_initialized(sched, entity))
+ return;
+ /**
+@@ -216,13 +229,28 @@ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+ * queued IBs or discard them on SIGKILL
+ */
+ if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
+- r = -ERESTARTSYS;
++ entity->fini_status = -ERESTARTSYS;
+ else
+- r = wait_event_killable(sched->job_scheduled,
++ entity->fini_status = wait_event_killable(sched->job_scheduled,
+ drm_sched_entity_is_idle(entity));
+ drm_sched_entity_set_rq(entity, NULL);
+- if (r) {
++}
++EXPORT_SYMBOL(drm_sched_entity_do_release);
++
++/**
++ * Destroy a context entity
++ *
++ * @sched Pointer to scheduler instance
++ * @entity The pointer to a valid scheduler entity
++ *
++ * The second one then goes over the entity and signals all jobs with an error code.
++ */
++void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
++ struct drm_sched_entity *entity)
++{
++ if (entity->fini_status) {
+ struct drm_sched_job *job;
++ int r;
+
+ /* Park the kernel for a moment to make sure it isn't processing
+ * our enity.
+@@ -240,13 +268,26 @@ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+ struct drm_sched_fence *s_fence = job->s_fence;
+ drm_sched_fence_scheduled(s_fence);
+ dma_fence_set_error(&s_fence->finished, -ESRCH);
+- drm_sched_fence_finished(s_fence);
+- WARN_ON(s_fence->parent);
+- dma_fence_put(&s_fence->finished);
+- sched->ops->free_job(job);
++ r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
++ drm_sched_entity_kill_jobs_cb);
++ if (r == -ENOENT)
++ drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
++ else if (r)
++ DRM_ERROR("fence add callback failed (%d)\n", r);
+ }
++
++ dma_fence_put(entity->last_scheduled);
++ entity->last_scheduled = NULL;
+ }
+ }
++EXPORT_SYMBOL(drm_sched_entity_cleanup);
++
++void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
++ struct drm_sched_entity *entity)
++{
++ drm_sched_entity_do_release(sched, entity);
++ drm_sched_entity_cleanup(sched, entity);
++}
+ EXPORT_SYMBOL(drm_sched_entity_fini);
+
+ static void drm_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
+@@ -529,6 +570,10 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
+ spin_unlock(&sched->job_list_lock);
+ fence = sched->ops->run_job(s_job);
+ atomic_inc(&sched->hw_rq_count);
++
++ dma_fence_put(s_job->entity->last_scheduled);
++ s_job->entity->last_scheduled = dma_fence_get(&s_fence->finished);
++
+ if (fence) {
+ s_fence->parent = dma_fence_get(fence);
+ r = dma_fence_add_callback(fence, &s_fence->cb,
+@@ -555,6 +600,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
+ void *owner)
+ {
+ job->sched = sched;
++ job->entity = entity;
+ job->s_priority = entity->rq - sched->sched_rq;
+ job->s_fence = drm_sched_fence_create(entity, owner);
+ if (!job->s_fence)
+@@ -668,6 +714,9 @@ static int drm_sched_main(void *param)
+ fence = sched->ops->run_job(sched_job);
+ drm_sched_fence_scheduled(s_fence);
+
++ dma_fence_put(entity->last_scheduled);
++ entity->last_scheduled = dma_fence_get(&s_fence->finished);
++
+ if (fence) {
+ s_fence->parent = dma_fence_get(fence);
+ r = dma_fence_add_callback(fence, &s_fence->cb,
+diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
+index dfd54fb..1df6229 100644
+--- a/include/drm/gpu_scheduler.h
++++ b/include/drm/gpu_scheduler.h
+@@ -63,6 +63,8 @@ struct drm_sched_entity {
+ struct dma_fence *dependency;
+ struct dma_fence_cb cb;
+ atomic_t *guilty; /* points to ctx's guilty */
++ int fini_status;
++ struct dma_fence *last_scheduled;
+ };
+
+ /**
+@@ -99,6 +101,7 @@ struct drm_sched_job {
+ uint64_t id;
+ atomic_t karma;
+ enum drm_sched_priority s_priority;
++ struct drm_sched_entity *entity;
+ };
+
+ static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
+@@ -148,6 +151,10 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
+ struct drm_sched_entity *entity,
+ struct drm_sched_rq *rq,
+ uint32_t jobs, atomic_t *guilty);
++void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
++ struct drm_sched_entity *entity);
++void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
++ struct drm_sched_entity *entity);
+ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+ struct drm_sched_entity *entity);
+ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
+--
+2.7.4
+
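
For readers skimming the diff, the sketch below is a minimal, self-contained userspace illustration of the two-phase teardown this patch introduces. It is not kernel code: the toy_* names and fields are invented stand-ins that only mirror the shape of drm_sched_entity_do_release() and drm_sched_entity_cleanup(), and the ordering in main() mirrors the new call sequence in amdgpu_driver_postclose_kms() (release before amdgpu_vm_fini(), cleanup after it).

#include <stdio.h>
#include <stdbool.h>

struct toy_job {
	int id;
	int error;               /* 0 = ran normally, negative = killed */
};

struct toy_entity {
	struct toy_job queue[4];  /* jobs still waiting in the entity queue */
	int queued;
	bool on_runqueue;
	int fini_status;          /* stand-in for entity->fini_status */
};

/* Phase 1 (mirrors drm_sched_entity_do_release): wait for jobs that were
 * already scheduled, then detach the entity from its run queue. When the
 * process was killed we do not wait and record an error status instead. */
static void toy_entity_do_release(struct toy_entity *e, bool sigkill)
{
	e->fini_status = sigkill ? -1 : 0;   /* -1 stands in for -ERESTARTSYS */
	e->on_runqueue = false;
	printf("phase 1: detached, fini_status=%d, %d job(s) still queued\n",
	       e->fini_status, e->queued);
}

/* Phase 2 (mirrors drm_sched_entity_cleanup): after the VM teardown, fail
 * the jobs that never ran instead of pretending they completed. */
static void toy_entity_cleanup(struct toy_entity *e)
{
	if (!e->fini_status)
		return;                      /* clean exit, nothing to fail */
	for (int i = 0; i < e->queued; i++) {
		e->queue[i].error = -3;      /* -3 stands in for -ESRCH */
		printf("phase 2: job %d failed with error %d\n",
		       e->queue[i].id, e->queue[i].error);
	}
	e->queued = 0;
}

int main(void)
{
	struct toy_entity e = {
		.queue = { { .id = 1 }, { .id = 2 } },
		.queued = 2,
		.on_runqueue = true,
	};

	/* Ordering used by amdgpu_driver_postclose_kms() after this patch: */
	toy_entity_do_release(&e, true);     /* before amdgpu_vm_fini()     */
	/* ... VM teardown would happen here ... */
	toy_entity_cleanup(&e);              /* after amdgpu_vm_fini()      */
	return 0;
}

The point of the split is visible in the ordering: work already handed to the hardware is waited on before the VM goes away, while jobs that never ran are failed afterwards with an error instead of being fake-signaled as if they had completed.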