Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch | 398
1 file changed, 398 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch b/common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch
new file mode 100644
index 00000000..b0e7a77c
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.14.71/4195-drm-gpu-sched-fix-force-APP-kill-hang-v4.patch
@@ -0,0 +1,398 @@
+From 6cddd831befde582b4da8634487287642e695a36 Mon Sep 17 00:00:00 2001
+From: Emily Deng <Emily.Deng@amd.com>
+Date: Mon, 16 Apr 2018 10:07:02 +0800
+Subject: [PATCH 4195/5725] drm/gpu-sched: fix force APP kill hang(v4)
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+issue:
+there are VMC page fault occurred if force APP kill during
+3dmark test, the cause is in entity_fini we manually signal
+all those jobs in entity's queue which confuse the sync/dep
+mechanism:
+
+1)page fault occurred in sdma's clear job which operate on
+shadow buffer, and shadow buffer's Gart table is cleaned by
+ttm_bo_release since the fence in its reservation was fake signaled
+by entity_fini() under the case of SIGKILL received.
+
+2)page fault occurred in gfx' job because during the lifetime
+of gfx job we manually fake signal all jobs from its entity
+in entity_fini(), thus the unmapping/clear PTE job depend on those
+result fence is satisfied and sdma start clearing the PTE and lead
+to GFX page fault.
+
+fix:
+1)should at least wait all jobs already scheduled complete in entity_fini()
+if SIGKILL is the case.
+
+2)if a fence signaled and try to clear some entity's dependency, should
+set this entity guilty to prevent its job really run since the dependency
+is fake signaled.
+
+v2:
+splitting drm_sched_entity_fini() into two functions:
+1)The first one is does the waiting, removes the entity from the
+runqueue and returns an error when the process was killed.
+2)The second one then goes over the entity, install it as
+completion signal for the remaining jobs and signals all jobs
+with an error code.
+
+v3:
+1)Replace the fini1 and fini2 with better name
+2)Call the first part before the VM teardown in
+amdgpu_driver_postclose_kms() and the second part
+after the VM teardown
+3)Keep the original function drm_sched_entity_fini to
+refine the code.
+
+v4:
+1)Rename entity->finished to entity->last_scheduled;
+2)Rename drm_sched_entity_fini_job_cb() to
+drm_sched_entity_kill_jobs_cb();
+3)Pass NULL to drm_sched_entity_fini_job_cb() if -ENOENT;
+4)Replace the type of entity->fini_status with "int";
+5)Remove the check about entity->finished.
+
+Signed-off-by: Monk Liu <Monk.Liu@amd.com>
+Signed-off-by: Emily Deng <Emily.Deng@amd.com>
+Reviewed-by: Christian König <christian.koenig@amd.com>
+Signed-off-by: Kalyan Alle <kalyan.alle@amd.com>
+
+Conflicts:
+	drivers/gpu/drm/scheduler/gpu_scheduler.c
+
+Change-Id: I8c859960c8faf8ab36210f098e6514b455bea171
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  2 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 64 ++++++++++++++++++++++++----
+ drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  5 ++-
+ drivers/gpu/drm/scheduler/gpu_scheduler.c | 71 ++++++++++++++++++++++++++-----
+ include/drm/gpu_scheduler.h               |  7 +++
+ 5 files changed, 128 insertions(+), 21 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+index 526c7a0..01496d7 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+@@ -702,6 +702,8 @@ int amdgpu_ctx_ioctl(struct drm_device *dev, void *data,
+ int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, unsigned ring_id);
+
+ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);
++void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr);
++void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);
+ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
+
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+index 2e705f9..bdeec74 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+@@ -113,8 +113,9 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
+ 	return r;
+ }
+
+-static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
++static void amdgpu_ctx_fini(struct kref *ref)
+ {
++	struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx, refcount);
+ 	struct amdgpu_device *adev = ctx->adev;
+ 	unsigned i, j;
+
+@@ -142,13 +143,11 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
+ 	kfree(ctx->fences);
+ 	ctx->fences = NULL;
+
+-	for (i = 0; i < adev->num_rings; i++)
+-		drm_sched_entity_fini(&adev->rings[i]->sched,
+-			&ctx->rings[i].entity);
+-
+ 	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
+
+ 	mutex_destroy(&ctx->lock);
++
++	kfree(ctx);
+ }
+
+ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
+@@ -187,12 +186,15 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
+ static void amdgpu_ctx_do_release(struct kref *ref)
+ {
+ 	struct amdgpu_ctx *ctx;
++	u32 i;
+
+ 	ctx = container_of(ref, struct amdgpu_ctx, refcount);
+
+-	amdgpu_ctx_fini(ctx);
++	for (i = 0; i < ctx->adev->num_rings; i++)
++		drm_sched_entity_fini(&ctx->adev->rings[i]->sched,
++			&ctx->rings[i].entity);
+
+-	kfree(ctx);
++	amdgpu_ctx_fini(ref);
+ }
+
+ static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id)
+@@ -452,16 +454,62 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr)
+ 	idr_init(&mgr->ctx_handles);
+ }
+
++void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
++{
++	struct amdgpu_ctx *ctx;
++	struct idr *idp;
++	uint32_t id, i;
++
++	idp = &mgr->ctx_handles;
++
++	idr_for_each_entry(idp, ctx, id) {
++
++		if (!ctx->adev)
++			return;
++
++		for (i = 0; i < ctx->adev->num_rings; i++)
++			if (kref_read(&ctx->refcount) == 1)
++				drm_sched_entity_do_release(&ctx->adev->rings[i]->sched,
++						&ctx->rings[i].entity);
++			else
++				DRM_ERROR("ctx %p is still alive\n", ctx);
++	}
++}
++
++void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr)
++{
++	struct amdgpu_ctx *ctx;
++	struct idr *idp;
++	uint32_t id, i;
++
++	idp = &mgr->ctx_handles;
++
++	idr_for_each_entry(idp, ctx, id) {
++
++		if (!ctx->adev)
++			return;
++
++		for (i = 0; i < ctx->adev->num_rings; i++)
++			if (kref_read(&ctx->refcount) == 1)
++				drm_sched_entity_cleanup(&ctx->adev->rings[i]->sched,
++						&ctx->rings[i].entity);
++			else
++				DRM_ERROR("ctx %p is still alive\n", ctx);
++	}
++}
++
+ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
+ {
+ 	struct amdgpu_ctx *ctx;
+ 	struct idr *idp;
+ 	uint32_t id;
+
++	amdgpu_ctx_mgr_entity_cleanup(mgr);
++
+ 	idp = &mgr->ctx_handles;
+
+ 	idr_for_each_entry(idp, ctx, id) {
+-		if (kref_put(&ctx->refcount, amdgpu_ctx_do_release) != 1)
++		if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)
+ 			DRM_ERROR("ctx %p is still alive\n", ctx);
+ 	}
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+index efbed62..79d1060 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+@@ -959,8 +959,7 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
+ 		return;
+
+ 	pm_runtime_get_sync(dev->dev);
+-
+-	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
++	amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr);
+
+ 	if (adev->asic_type != CHIP_RAVEN) {
+ 		amdgpu_uvd_free_handles(adev, file_priv);
+@@ -981,6 +980,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
+ 	pd = amdgpu_bo_ref(fpriv->vm.root.base.bo);
+
+ 	amdgpu_vm_fini(adev, &fpriv->vm);
++	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
++
+ 	if (pasid)
+ 		amdgpu_pasid_free_delayed(pd->tbo.resv, pasid);
+ 	amdgpu_bo_unref(&pd);
+diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
+index 0d95888..f45b4fd 100644
+--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
++++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
+@@ -135,6 +135,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
+ 	entity->rq = rq;
+ 	entity->sched = sched;
+ 	entity->guilty = guilty;
++	entity->fini_status = 0;
++	entity->last_scheduled = NULL;
+
+ 	spin_lock_init(&entity->rq_lock);
+ 	spin_lock_init(&entity->queue_lock);
+@@ -196,19 +198,30 @@ static bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
+ 	return true;
+ }
+
++static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
++					  struct dma_fence_cb *cb)
++{
++	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
++						 finish_cb);
++	drm_sched_fence_finished(job->s_fence);
++	WARN_ON(job->s_fence->parent);
++	dma_fence_put(&job->s_fence->finished);
++	job->sched->ops->free_job(job);
++}
++
++
+ /**
+  * Destroy a context entity
+  *
+  * @sched       Pointer to scheduler instance
+  * @entity	The pointer to a valid scheduler entity
+  *
+- * Cleanup and free the allocated resources.
++ * Splitting drm_sched_entity_fini() into two functions, The first one is does the waiting,
++ * removes the entity from the runqueue and returns an error when the process was killed.
+  */
+-void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
++void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
+ 			   struct drm_sched_entity *entity)
+ {
+-	int r;
+-
+ 	if (!drm_sched_entity_is_initialized(sched, entity))
+ 		return;
+ 	/**
+@@ -216,13 +229,28 @@ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+ 	 * queued IBs or discard them on SIGKILL
+ 	 */
+ 	if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
+-		r = -ERESTARTSYS;
++		entity->fini_status = -ERESTARTSYS;
+ 	else
+-		r = wait_event_killable(sched->job_scheduled,
++		entity->fini_status = wait_event_killable(sched->job_scheduled,
+ 				    drm_sched_entity_is_idle(entity));
+ 	drm_sched_entity_set_rq(entity, NULL);
+-	if (r) {
++}
++EXPORT_SYMBOL(drm_sched_entity_do_release);
++
++/**
++ * Destroy a context entity
++ *
++ * @sched       Pointer to scheduler instance
++ * @entity	The pointer to a valid scheduler entity
++ *
++ * The second one then goes over the entity and signals all jobs with an error code.
++ */
++void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
++			   struct drm_sched_entity *entity)
++{
++	if (entity->fini_status) {
+ 		struct drm_sched_job *job;
++		int r;
+
+ 		/* Park the kernel for a moment to make sure it isn't processing
+ 		 * our enity.
+@@ -240,13 +268,26 @@ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+ 			struct drm_sched_fence *s_fence = job->s_fence;
+ 			drm_sched_fence_scheduled(s_fence);
+ 			dma_fence_set_error(&s_fence->finished, -ESRCH);
+-			drm_sched_fence_finished(s_fence);
+-			WARN_ON(s_fence->parent);
+-			dma_fence_put(&s_fence->finished);
+-			sched->ops->free_job(job);
++			r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
++							drm_sched_entity_kill_jobs_cb);
++			if (r == -ENOENT)
++				drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
++			else if (r)
++				DRM_ERROR("fence add callback failed (%d)\n", r);
+ 		}
++
++		dma_fence_put(entity->last_scheduled);
++		entity->last_scheduled = NULL;
+ 	}
+ }
++EXPORT_SYMBOL(drm_sched_entity_cleanup);
++
++void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
++				struct drm_sched_entity *entity)
++{
++	drm_sched_entity_do_release(sched, entity);
++	drm_sched_entity_cleanup(sched, entity);
++}
+ EXPORT_SYMBOL(drm_sched_entity_fini);
+
+ static void drm_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
+@@ -529,6 +570,10 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
+ 		spin_unlock(&sched->job_list_lock);
+ 		fence = sched->ops->run_job(s_job);
+ 		atomic_inc(&sched->hw_rq_count);
++
++		dma_fence_put(s_job->entity->last_scheduled);
++		s_job->entity->last_scheduled = dma_fence_get(&s_fence->finished);
++
+ 		if (fence) {
+ 			s_fence->parent = dma_fence_get(fence);
+ 			r = dma_fence_add_callback(fence, &s_fence->cb,
+@@ -555,6 +600,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
+ 		       void *owner)
+ {
+ 	job->sched = sched;
++	job->entity = entity;
+ 	job->s_priority = entity->rq - sched->sched_rq;
+ 	job->s_fence = drm_sched_fence_create(entity, owner);
+ 	if (!job->s_fence)
+@@ -668,6 +714,9 @@ static int drm_sched_main(void *param)
+ 		fence = sched->ops->run_job(sched_job);
+ 		drm_sched_fence_scheduled(s_fence);
+
++		dma_fence_put(entity->last_scheduled);
++		entity->last_scheduled = dma_fence_get(&s_fence->finished);
++
+ 		if (fence) {
+ 			s_fence->parent = dma_fence_get(fence);
+ 			r = dma_fence_add_callback(fence, &s_fence->cb,
+diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
+index dfd54fb..1df6229 100644
+--- a/include/drm/gpu_scheduler.h
++++ b/include/drm/gpu_scheduler.h
+@@ -63,6 +63,8 @@ struct drm_sched_entity {
+ 	struct dma_fence		*dependency;
+ 	struct dma_fence_cb		cb;
+ 	atomic_t			*guilty; /* points to ctx's guilty */
++	int				fini_status;
++	struct dma_fence		*last_scheduled;
+ };
+
+ /**
+@@ -99,6 +101,7 @@ struct drm_sched_job {
+ 	uint64_t			id;
+ 	atomic_t			karma;
+ 	enum drm_sched_priority		s_priority;
++	struct drm_sched_entity		*entity;
+ };
+
+ static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
+@@ -148,6 +151,10 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
+ 			  struct drm_sched_entity *entity,
+ 			  struct drm_sched_rq *rq,
+ 			  uint32_t jobs, atomic_t *guilty);
++void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
++			   struct drm_sched_entity *entity);
++void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
++			   struct drm_sched_entity *entity);
+ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+ 			   struct drm_sched_entity *entity);
+ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
+-- 
+2.7.4
+