Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch | 398
1 file changed, 398 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch b/common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch
new file mode 100644
index 00000000..b0e7a77c
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch
@@ -0,0 +1,398 @@
+From 6cddd831befde582b4da8634487287642e695a36 Mon Sep 17 00:00:00 2001
+From: Emily Deng <Emily.Deng@amd.com>
+Date: Mon, 16 Apr 2018 10:07:02 +0800
+Subject: [PATCH 4195/5725] drm/gpu-sched: fix force APP kill hang(v4)
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+issue:
+there are VMC page fault occurred if force APP kill during
+3dmark test, the cause is in entity_fini we manually signal
+all those jobs in entity's queue which confuse the sync/dep
+mechanism:
+
+1)page fault occurred in sdma's clear job which operate on
+shadow buffer, and shadow buffer's Gart table is cleaned by
+ttm_bo_release since the fence in its reservation was fake signaled
+by entity_fini() under the case of SIGKILL received.
+
+2)page fault occurred in gfx' job because during the lifetime
+of gfx job we manually fake signal all jobs from its entity
+in entity_fini(), thus the unmapping/clear PTE job depend on those
+result fence is satisfied and sdma start clearing the PTE and lead
+to GFX page fault.
+
+fix:
+1)should at least wait all jobs already scheduled complete in entity_fini()
+if SIGKILL is the case.
+
+2)if a fence signaled and try to clear some entity's dependency, should
+set this entity guilty to prevent its job really run since the dependency
+is fake signaled.
+
+v2:
+splitting drm_sched_entity_fini() into two functions:
+1)The first one is does the waiting, removes the entity from the
+runqueue and returns an error when the process was killed.
+2)The second one then goes over the entity, install it as
+completion signal for the remaining jobs and signals all jobs
+with an error code.
+
+v3:
+1)Replace the fini1 and fini2 with better name
+2)Call the first part before the VM teardown in
+amdgpu_driver_postclose_kms() and the second part
+after the VM teardown
+3)Keep the original function drm_sched_entity_fini to
+refine the code.
+
+v4:
+1)Rename entity->finished to entity->last_scheduled;
+2)Rename drm_sched_entity_fini_job_cb() to
+drm_sched_entity_kill_jobs_cb();
+3)Pass NULL to drm_sched_entity_fini_job_cb() if -ENOENT;
+4)Replace the type of entity->fini_status with "int";
+5)Remove the check about entity->finished.
+
+Signed-off-by: Monk Liu <Monk.Liu@amd.com>
+Signed-off-by: Emily Deng <Emily.Deng@amd.com>
+Reviewed-by: Christian König <christian.koenig@amd.com>
+Signed-off-by: Kalyan Alle <kalyan.alle@amd.com>
+
+Conflicts:
+	drivers/gpu/drm/scheduler/gpu_scheduler.c
+
+Change-Id: I8c859960c8faf8ab36210f098e6514b455bea171
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  2 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 64 ++++++++++++++++++++++++----
+ drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  5 ++-
+ drivers/gpu/drm/scheduler/gpu_scheduler.c | 71 ++++++++++++++++++++++++++-----
+ include/drm/gpu_scheduler.h               |  7 +++
+ 5 files changed, 128 insertions(+), 21 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+index 526c7a0..01496d7 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+@@ -702,6 +702,8 @@ int amdgpu_ctx_ioctl(struct drm_device *dev, void *data,
+ int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, unsigned ring_id);
+
+ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);
++void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr);
++void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);
+ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
+
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+index 2e705f9..bdeec74 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+@@ -113,8 +113,9 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
+ 	return r;
+ }
+
+-static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
++static void amdgpu_ctx_fini(struct kref *ref)
+ {
++	struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx, refcount);
+ 	struct amdgpu_device *adev = ctx->adev;
+ 	unsigned i, j;
+
+@@ -142,13 +143,11 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
+ 	kfree(ctx->fences);
+ 	ctx->fences = NULL;
+
+-	for (i = 0; i < adev->num_rings; i++)
+-		drm_sched_entity_fini(&adev->rings[i]->sched,
+-			&ctx->rings[i].entity);
+-
+ 	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
+
+ 	mutex_destroy(&ctx->lock);
++
++	kfree(ctx);
+ }
+
+ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
+@@ -187,12 +186,15 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
+ static void amdgpu_ctx_do_release(struct kref *ref)
+ {
+ 	struct amdgpu_ctx *ctx;
++	u32 i;
+
+ 	ctx = container_of(ref, struct amdgpu_ctx, refcount);
+
+-	amdgpu_ctx_fini(ctx);
++	for (i = 0; i < ctx->adev->num_rings; i++)
++		drm_sched_entity_fini(&ctx->adev->rings[i]->sched,
++			&ctx->rings[i].entity);
+
+-	kfree(ctx);
++	amdgpu_ctx_fini(ref);
+ }
+
+ static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id)
+@@ -452,16 +454,62 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr)
+ 	idr_init(&mgr->ctx_handles);
+ }
+
++void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
++{
++	struct amdgpu_ctx *ctx;
++	struct idr *idp;
++	uint32_t id, i;
++
++	idp = &mgr->ctx_handles;
++
++	idr_for_each_entry(idp, ctx, id) {
++
++		if (!ctx->adev)
++			return;
++
++		for (i = 0; i < ctx->adev->num_rings; i++)
++			if (kref_read(&ctx->refcount) == 1)
++				drm_sched_entity_do_release(&ctx->adev->rings[i]->sched,
++						&ctx->rings[i].entity);
++			else
++				DRM_ERROR("ctx %p is still alive\n", ctx);
++	}
++}
++
++void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr)
++{
++	struct amdgpu_ctx *ctx;
++	struct idr *idp;
++	uint32_t id, i;
++
++	idp = &mgr->ctx_handles;
++
++	idr_for_each_entry(idp, ctx, id) {
++
++		if (!ctx->adev)
++			return;
++
++		for (i = 0; i < ctx->adev->num_rings; i++)
++			if (kref_read(&ctx->refcount) == 1)
++				drm_sched_entity_cleanup(&ctx->adev->rings[i]->sched,
++						&ctx->rings[i].entity);
++			else
++				DRM_ERROR("ctx %p is still alive\n", ctx);
++	}
++}
++
+ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
+ {
+ 	struct amdgpu_ctx *ctx;
+ 	struct idr *idp;
+ 	uint32_t id;
+
++	amdgpu_ctx_mgr_entity_cleanup(mgr);
++
+ 	idp = &mgr->ctx_handles;
+
+ 	idr_for_each_entry(idp, ctx, id) {
+-		if (kref_put(&ctx->refcount, amdgpu_ctx_do_release) != 1)
++		if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)
+ 			DRM_ERROR("ctx %p is still alive\n", ctx);
+ 	}
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+index efbed62..79d1060 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+@@ -959,8 +959,7 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
+ 		return;
+
+ 	pm_runtime_get_sync(dev->dev);
+-
+-	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
++	amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr);
+
+ 	if (adev->asic_type != CHIP_RAVEN) {
+ 		amdgpu_uvd_free_handles(adev, file_priv);
+@@ -981,6 +980,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
+ 	pd = amdgpu_bo_ref(fpriv->vm.root.base.bo);
+
+ 	amdgpu_vm_fini(adev, &fpriv->vm);
++	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
++
+ 	if (pasid)
+ 		amdgpu_pasid_free_delayed(pd->tbo.resv, pasid);
+ 	amdgpu_bo_unref(&pd);
+diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
+index 0d95888..f45b4fd 100644
+--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
++++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
+@@ -135,6 +135,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
+ 	entity->rq = rq;
+ 	entity->sched = sched;
+ 	entity->guilty = guilty;
++	entity->fini_status = 0;
++	entity->last_scheduled = NULL;
+
+ 	spin_lock_init(&entity->rq_lock);
+ 	spin_lock_init(&entity->queue_lock);
+@@ -196,19 +198,30 @@ static bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
+ 	return true;
+ }
+
++static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
++					  struct dma_fence_cb *cb)
++{
++	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
++						 finish_cb);
++	drm_sched_fence_finished(job->s_fence);
++	WARN_ON(job->s_fence->parent);
++	dma_fence_put(&job->s_fence->finished);
++	job->sched->ops->free_job(job);
++}
++
++
+ /**
+  * Destroy a context entity
+  *
+  * @sched       Pointer to scheduler instance
+  * @entity	The pointer to a valid scheduler entity
+  *
+- * Cleanup and free the allocated resources.
++ * Splitting drm_sched_entity_fini() into two functions, The first one is does the waiting,
++ * removes the entity from the runqueue and returns an error when the process was killed.
+  */
+-void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
++void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
+ 			   struct drm_sched_entity *entity)
+ {
+-	int r;
+-
+ 	if (!drm_sched_entity_is_initialized(sched, entity))
+ 		return;
+ 	/**
+@@ -216,13 +229,28 @@ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+ 	 * queued IBs or discard them on SIGKILL
+ 	 */
+ 	if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
+-		r = -ERESTARTSYS;
++		entity->fini_status = -ERESTARTSYS;
+ 	else
+-		r = wait_event_killable(sched->job_scheduled,
++		entity->fini_status = wait_event_killable(sched->job_scheduled,
+ 				    drm_sched_entity_is_idle(entity));
+ 	drm_sched_entity_set_rq(entity, NULL);
+-	if (r) {
++}
++EXPORT_SYMBOL(drm_sched_entity_do_release);
++
++/**
++ * Destroy a context entity
++ *
++ * @sched       Pointer to scheduler instance
++ * @entity	The pointer to a valid scheduler entity
++ *
++ * The second one then goes over the entity and signals all jobs with an error code.
++ */
++void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
++			   struct drm_sched_entity *entity)
++{
++	if (entity->fini_status) {
+ 		struct drm_sched_job *job;
++		int r;
+
+ 		/* Park the kernel for a moment to make sure it isn't processing
+ 		 * our enity.
+@@ -240,13 +268,26 @@ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+ 			struct drm_sched_fence *s_fence = job->s_fence;
+ 			drm_sched_fence_scheduled(s_fence);
+ 			dma_fence_set_error(&s_fence->finished, -ESRCH);
+-			drm_sched_fence_finished(s_fence);
+-			WARN_ON(s_fence->parent);
+-			dma_fence_put(&s_fence->finished);
+-			sched->ops->free_job(job);
++			r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
++							drm_sched_entity_kill_jobs_cb);
++			if (r == -ENOENT)
++				drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
++			else if (r)
++				DRM_ERROR("fence add callback failed (%d)\n", r);
+ 		}
++
++		dma_fence_put(entity->last_scheduled);
++		entity->last_scheduled = NULL;
+ 	}
+ }
++EXPORT_SYMBOL(drm_sched_entity_cleanup);
++
++void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
++				struct drm_sched_entity *entity)
++{
++	drm_sched_entity_do_release(sched, entity);
++	drm_sched_entity_cleanup(sched, entity);
++}
+ EXPORT_SYMBOL(drm_sched_entity_fini);
+
+ static void drm_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
+@@ -529,6 +570,10 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
+ 		spin_unlock(&sched->job_list_lock);
+ 		fence = sched->ops->run_job(s_job);
+ 		atomic_inc(&sched->hw_rq_count);
++
++		dma_fence_put(s_job->entity->last_scheduled);
++		s_job->entity->last_scheduled = dma_fence_get(&s_fence->finished);
++
+ 		if (fence) {
+ 			s_fence->parent = dma_fence_get(fence);
+ 			r = dma_fence_add_callback(fence, &s_fence->cb,
+@@ -555,6 +600,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
+ 		       void *owner)
+ {
+ 	job->sched = sched;
++	job->entity = entity;
+ 	job->s_priority = entity->rq - sched->sched_rq;
+ 	job->s_fence = drm_sched_fence_create(entity, owner);
+ 	if (!job->s_fence)
+@@ -668,6 +714,9 @@ static int drm_sched_main(void *param)
+ 		fence = sched->ops->run_job(sched_job);
+ 		drm_sched_fence_scheduled(s_fence);
+
++		dma_fence_put(entity->last_scheduled);
++		entity->last_scheduled = dma_fence_get(&s_fence->finished);
++
+ 		if (fence) {
+ 			s_fence->parent = dma_fence_get(fence);
+ 			r = dma_fence_add_callback(fence, &s_fence->cb,
+diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
+index dfd54fb..1df6229 100644
+--- a/include/drm/gpu_scheduler.h
++++ b/include/drm/gpu_scheduler.h
+@@ -63,6 +63,8 @@ struct drm_sched_entity {
+ 	struct dma_fence		*dependency;
+ 	struct dma_fence_cb		cb;
+ 	atomic_t			*guilty; /* points to ctx's guilty */
++	int				fini_status;
++	struct dma_fence		*last_scheduled;
+ };
+
+ /**
+@@ -99,6 +101,7 @@ struct drm_sched_job {
+ 	uint64_t			id;
+ 	atomic_t			karma;
+ 	enum drm_sched_priority		s_priority;
++	struct drm_sched_entity		*entity;
+ };
+
+ static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
+@@ -148,6 +151,10 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
+ 			  struct drm_sched_entity *entity,
+ 			  struct drm_sched_rq *rq,
+ 			  uint32_t jobs, atomic_t *guilty);
++void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
++			   struct drm_sched_entity *entity);
++void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
++			   struct drm_sched_entity *entity);
+ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+ 			   struct drm_sched_entity *entity);
+ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
+-- 
+2.7.4
+