Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2777-drm-amdkfd-Implement-queue-based-suspend-resume.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2777-drm-amdkfd-Implement-queue-based-suspend-resume.patch | 478
1 files changed, 478 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2777-drm-amdkfd-Implement-queue-based-suspend-resume.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2777-drm-amdkfd-Implement-queue-based-suspend-resume.patch
new file mode 100644
index 00000000..5b9121db
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2777-drm-amdkfd-Implement-queue-based-suspend-resume.patch
@@ -0,0 +1,478 @@
+From 7b5a8eee9cf58fbd76caeb49bc137749fc25e476 Mon Sep 17 00:00:00 2001
+From: Philip Cox <Philip.Cox@amd.com>
+Date: Thu, 11 Apr 2019 11:49:22 -0400
+Subject: [PATCH 2777/2940] drm/amdkfd: Implement queue based suspend/resume
+
+Rather than suspending and resuming all the queues on a node,
+we need the ability to specify a list of queues to suspend and resume.
+
+Change-Id: Id7d5a32e3bcc3806ecea3ae9b89cfafb28469beb
+Signed-off-by: Philip Cox <Philip.Cox@amd.com>
+---
+ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  48 ++--
+ .../drm/amd/amdkfd/kfd_device_queue_manager.c | 262 ++++++++++++++----
+ .../drm/amd/amdkfd/kfd_device_queue_manager.h |  14 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   4 +-
+ 4 files changed, 229 insertions(+), 99 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+index 4abf4f462fa1..d5416af01ac4 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+@@ -2555,7 +2555,7 @@ static int kfd_ioctl_dbg_set_debug_trap(struct file *filep,
+ 	struct kfd_ioctl_dbg_trap_args *args = data;
+ 	struct kfd_process_device *pdd = NULL;
+ 	int r = 0;
+-	struct kfd_dev *dev;
++	struct kfd_dev *dev = NULL;
+ 	struct kfd_process *target = NULL;
+ 	struct pid *pid = NULL;
+ 	uint32_t *queue_id_array = NULL;
+@@ -2565,7 +2565,6 @@ static int kfd_ioctl_dbg_set_debug_trap(struct file *filep,
+ 	uint32_t data2;
+ 	uint32_t data3;
+ 	bool is_suspend_or_resume;
+-	uint8_t id;
+ 
+ 	debug_trap_action = args->op;
+ 	gpu_id = args->gpu_id;
+@@ -2720,39 +2719,24 @@ static int kfd_ioctl_dbg_set_debug_trap(struct file *filep,
+ 			data1,
+ 			dev->vm_info.last_vmid_kfd);
+ 		break;
+-	case KFD_IOC_DBG_TRAP_NODE_SUSPEND:
+-		id = 0;
+-		/* We need to loop over all of the topology devices */
+-		while (kfd_topology_enum_kfd_devices(id, &dev) == 0) {
+-			if (!dev) {
+-				/* Not a GPU. Skip it */
+-				id++;
+-				continue;
+-			}
+ 
+-			r = suspend_queues(dev->dqm, target, data1);
+-			if (r)
+-				goto unlock_out;
+-
+-			id++;
+-		}
++	case KFD_IOC_DBG_TRAP_NODE_SUSPEND:
++		r = suspend_queues(target,
++				data2, /* Number of queues */
++				data3, /* Grace Period */
++				data1, /* Flags */
++				queue_id_array); /* array of queue ids */
++		if (r)
++			goto unlock_out;
+ 		break;
+-	case KFD_IOC_DBG_TRAP_NODE_RESUME:
+-		id = 0;
+-		/* We need to loop over all of the topology devices */
+-		while (kfd_topology_enum_kfd_devices(id, &dev) == 0) {
+-			if (!dev) {
+-				/* Not a GPU. Skip it */
+-				id++;
+-				continue;
+-			}
+ 
+-			r = resume_queues(dev->dqm, target);
+-			if (r)
+-				goto unlock_out;
+-
+-			id++;
+-		}
++	case KFD_IOC_DBG_TRAP_NODE_RESUME:
++		r = resume_queues(target,
++				data2, /* Number of queues */
++				data1, /* Flags */
++				queue_id_array); /* array of queue ids */
++		if (r)
++			goto unlock_out;
+ 		break;
+ 	default:
+ 		pr_err("Invalid option: %i\n", debug_trap_action);
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+index d0c316b91af0..497d449fc6d0 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+@@ -600,6 +600,66 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
+ 	return retval;
+ }
+ 
++/* Unlike evict_process_queues_cpsch and evict_process_queues_nocpsch,
++ * suspend_single_queue does not lock the dqm. You should
++ * lock the dqm before calling, and unlock after calling.
++ *
++ * We don't lock the dqm here because this function may be
++ * called on multiple queues in a loop, so rather than locking/unlocking
++ * multiple times, we will just keep the dqm locked for all of the calls.
++ */
++static int suspend_single_queue(struct device_queue_manager *dqm,
++				struct kfd_process_device *pdd,
++				struct queue *q)
++{
++	int retval = 0;
++
++	pr_debug("Suspending PASID %u queue [%i]\n",
++			pdd->process->pasid,
++			q->properties.queue_id);
++
++	q->properties.is_suspended = true;
++	if (q->properties.is_active) {
++		dqm->queue_count--;
++		q->properties.is_active = false;
++	}
++
++	return retval;
++}
++
++/* Unlike restore_process_queues_cpsch and restore_process_queues_nocpsch,
++ * resume_single_queue does not lock the dqm. You should
++ * lock the dqm before calling, and unlock after calling.
++ *
++ * We don't lock the dqm here because this function may be
++ * called on multiple queues in a loop, so rather than locking/unlocking
++ * multiple times, we will just keep the dqm locked for all of the calls.
++ */
++static int resume_single_queue(struct device_queue_manager *dqm,
++			       struct qcm_process_device *qpd,
++			       struct queue *q)
++{
++	struct kfd_process_device *pdd;
++	uint64_t pd_base;
++	int retval = 0;
++
++	pdd = qpd_to_pdd(qpd);
++	/* Retrieve PD base */
++	pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->vm);
++
++	pr_debug("Restoring from suspend PASID %u queue [%i]\n",
++			pdd->process->pasid,
++			q->properties.queue_id);
++
++	q->properties.is_suspended = false;
++
++	if (QUEUE_IS_ACTIVE(q->properties)) {
++		q->properties.is_active = true;
++		dqm->queue_count++;
++	}
++
++	return retval;
++}
+ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
+ 				struct qcm_process_device *qpd)
+ {
+@@ -1227,7 +1287,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
+ 	 * updates the is_evicted flag but is a no-op otherwise.
+ 	 */
+ 	q->properties.is_evicted = !!qpd->evicted;
+-
++	q->properties.is_suspended = false;
+ 	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+ 	q->properties.tba_addr = qpd->tba_addr;
+ 	q->properties.tma_addr = qpd->tma_addr;
+@@ -1979,114 +2039,194 @@ int release_debug_trap_vmid(struct device_queue_manager *dqm)
+ 	return r;
+ }
+ 
++bool queue_id_in_array(unsigned int queue_id,
++		uint32_t num_queues,
++		uint32_t *queue_ids)
++{
++	int i;
++
++	for (i = 0; i < num_queues; i++)
++		if (queue_id == queue_ids[i])
++			return true;
++	return false;
++}
+ 
+ struct copy_context_work_handler_workarea {
+ 	struct work_struct copy_context_work;
+-	struct device_queue_manager *dqm;
+-	struct qcm_process_device *qpd;
+-	struct mm_struct *mm;
++	struct kfd_process *p;
+ };
+ 
+ void copy_context_work_handler (struct work_struct *work)
+ {
+ 	struct copy_context_work_handler_workarea *workarea;
+ 	struct mqd_manager *mqd_mgr;
+-	struct qcm_process_device *qpd;
+-	struct device_queue_manager *dqm;
++	struct kfd_process_device *pdd;
+ 	struct queue *q;
++	struct mm_struct *mm;
++	struct kfd_process *p;
+ 	uint32_t tmp_ctl_stack_used_size, tmp_save_area_used_size;
+ 
+ 	workarea = container_of(work,
+ 			struct copy_context_work_handler_workarea,
+ 			copy_context_work);
+ 
+-	qpd = workarea->qpd;
+-	dqm = workarea->dqm;
+-	use_mm(workarea->mm);
++	p = workarea->p;
++	mm = get_task_mm(p->lead_thread);
++	use_mm(mm);
++	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
++		struct device_queue_manager *dqm = pdd->dev->dqm;
++		struct qcm_process_device *qpd = &pdd->qpd;
+ 
++		dqm_lock(dqm);
+ 
+ 
+-	list_for_each_entry(q, &qpd->queues_list, list) {
+-		mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_COMPUTE];
+ 
+-		/* We ignore the return value from get_wave_state because
+-		 * i) right now, it always returns 0, and
+-		 * ii) if we hit an error, we would continue to the next queue
+-		 * anyway.
+-		 */
+-		mqd_mgr->get_wave_state(mqd_mgr,
+-			q->mqd,
+-			(void __user *) q->properties.ctx_save_restore_area_address,
+-			&tmp_ctl_stack_used_size,
+-			&tmp_save_area_used_size);
+-	}
+-
+-	unuse_mm(workarea->mm);
+-}
++		list_for_each_entry(q, &qpd->queues_list, list) {
++			mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_COMPUTE];
+ 
++			/* We ignore the return value from get_wave_state
++			 * because
++			 * i) right now, it always returns 0, and
++			 * ii) if we hit an error, we would continue to the
++			 * next queue anyway.
++			 */
++			mqd_mgr->get_wave_state(mqd_mgr,
++				q->mqd,
++				(void __user *) q->properties.ctx_save_restore_area_address,
++				&tmp_ctl_stack_used_size,
++				&tmp_save_area_used_size);
++		}
+ 
++		dqm_unlock(dqm);
++	}
++	unuse_mm(mm);
++	mmput(mm);
++}
+ 
+-int suspend_queues(struct device_queue_manager *dqm,
+-		struct kfd_process *p,
+-		uint32_t flags)
++int suspend_queues(struct kfd_process *p,
++		uint32_t num_queues,
++		uint32_t grace_period,
++		uint32_t flags,
++		uint32_t *queue_ids)
+ {
+ 	int r = -ENODEV;
+-	struct kfd_dev *dev;
++	bool any_queues_suspended = false;
+ 	struct kfd_process_device *pdd;
++	struct queue *q;
+ 
+-	bool queues_suspended = false;
+-	struct copy_context_work_handler_workarea copy_context_worker;
++	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
++		bool queues_suspended_on_device = false;
++		struct device_queue_manager *dqm = pdd->dev->dqm;
++		struct qcm_process_device *qpd = &pdd->qpd;
+ 
+-	dev = dqm->dev;
++		dqm_lock(dqm);
+ 
+-	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+-		if (dqm->dev == pdd->dev) {
+-			r = pdd->dev->dqm->ops.evict_process_queues(
+-				pdd->dev->dqm,
+-				&pdd->qpd);
++		/* We need to loop over all of the queues on this
++		 * device and check if each one is in the list passed
++		 * in; if it is, we will suspend it.
++		 */
++		list_for_each_entry(q, &qpd->queues_list, list) {
++			if (queue_id_in_array(q->properties.queue_id,
++						num_queues,
++						queue_ids)) {
++				if (q->properties.is_suspended)
++					continue;
++				r = suspend_single_queue(dqm,
++						pdd,
++						q);
++				if (r) {
++					pr_err("Failed to suspend process queues. queue_id == %i\n",
++						q->properties.queue_id);
++					dqm_unlock(dqm);
++					return r;
++				}
++				queues_suspended_on_device = true;
++				any_queues_suspended = true;
++			}
++		}
++
++		if (queues_suspended_on_device) {
++			r = execute_queues_cpsch(dqm,
++					KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+ 			if (r) {
+-				pr_err("Failed to suspend process queues\n");
+-				break;
++				pr_err("Failed to suspend process queues.\n");
++				dqm_unlock(dqm);
++				return r;
+ 			}
++		}
+ 
+-			copy_context_worker.qpd = &pdd->qpd;
+-			copy_context_worker.dqm = dqm;
+-			copy_context_worker.mm = get_task_mm(p->lead_thread);
+-			queues_suspended = true;
++		dqm_unlock(dqm);
++		amdgpu_amdkfd_debug_mem_fence(dqm->dev->kgd);
++	}
+ 
+-			INIT_WORK_ONSTACK(
+-				&copy_context_worker.copy_context_work,
+-				copy_context_work_handler);
++	if (any_queues_suspended) {
++		struct copy_context_work_handler_workarea copy_context_worker;
++
++		INIT_WORK_ONSTACK(
++			&copy_context_worker.copy_context_work,
++			copy_context_work_handler);
++
++		copy_context_worker.p = p;
++
++		schedule_work(&copy_context_worker.copy_context_work);
+ 
+-		schedule_work(&copy_context_worker.copy_context_work);
+-		break;
+-	}
+-	}
+ 
+-	if (queues_suspended) {
+-		amdgpu_amdkfd_debug_mem_fence(dev->kgd);
+ 		flush_work(&copy_context_worker.copy_context_work);
+-		mmput(copy_context_worker.mm);
+ 		destroy_work_on_stack(&copy_context_worker.copy_context_work);
+ 	}
+ 	return r;
+ }
+ 
+-int resume_queues(struct device_queue_manager *dqm, struct kfd_process *p)
++int resume_queues(struct kfd_process *p,
++		uint32_t num_queues,
++		uint32_t flags,
++		uint32_t *queue_ids)
+ {
+ 	int r = -ENODEV;
+ 	struct kfd_process_device *pdd;
++	struct queue *q;
+ 
+ 	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+-		if (dqm->dev == pdd->dev) {
+-			r = pdd->dev->dqm->ops.restore_process_queues(
+-				pdd->dev->dqm,
+-				&pdd->qpd);
+-			if (r)
++		bool queues_resumed_on_device = false;
++		struct device_queue_manager *dqm = pdd->dev->dqm;
++		struct qcm_process_device *qpd = &pdd->qpd;
++
++		dqm_lock(dqm);
++
++		/* We need to loop over all of the queues on this
++		 * device and check if each one is in the list passed
++		 * in; if it is, we will restore it.
++		 */
++		list_for_each_entry(q, &qpd->queues_list, list) {
++			if (queue_id_in_array(q->properties.queue_id,
++						num_queues,
++						queue_ids)) {
++				if (!q->properties.is_suspended)
++					continue;
++				r = resume_single_queue(dqm,
++						&pdd->qpd,
++						q);
++				if (r) {
++					pr_err("Failed to resume process queues\n");
++					dqm_unlock(dqm);
++					return r;
++				}
++				queues_resumed_on_device = true;
++			}
++		}
++
++		if (queues_resumed_on_device) {
++			r = execute_queues_cpsch(dqm,
++					KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,
++					0);
++			if (r) {
+ 				pr_err("Failed to resume process queues\n");
+-				break;
++				dqm_unlock(dqm);
++				return r;
++			}
+ 		}
+-	}
+ 
++		dqm_unlock(dqm);
++	}
+ 	return r;
+ }
+ 
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+index 01f8249cb2ed..8eb10f610c12 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+@@ -224,11 +224,15 @@ bool check_if_queues_active(struct device_queue_manager *dqm,
+ 		struct qcm_process_device *qpd);
+ int reserve_debug_trap_vmid(struct device_queue_manager *dqm);
+ int release_debug_trap_vmid(struct device_queue_manager *dqm);
+-int suspend_queues(struct device_queue_manager *dqm,
+-		struct kfd_process *p,
+-		uint32_t flags);
+-int resume_queues(struct device_queue_manager *dqm, struct kfd_process *p);
+-
++int suspend_queues(struct kfd_process *p,
++		uint32_t num_queues,
++		uint32_t grace_period,
++		uint32_t flags,
++		uint32_t *queue_ids);
++int resume_queues(struct kfd_process *p,
++		uint32_t num_queues,
++		uint32_t flags,
++		uint32_t *queue_ids);
+ 
+ static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
+ {
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+index a0311b2ed5d6..7cd1404129f4 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+@@ -498,6 +498,7 @@ struct queue_properties {
+ 	uint32_t doorbell_off;
+ 	bool is_interop;
+ 	bool is_evicted;
++	bool is_suspended;
+ 	bool is_active;
+ 	/* Not relevant for user mode queues in cp scheduling */
+ 	unsigned int vmid;
+@@ -521,7 +522,8 @@ struct queue_properties {
+ #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 &&	\
+ 			(q).queue_address != 0 &&	\
+ 			(q).queue_percent > 0 &&	\
+-			!(q).is_evicted)
++			!(q).is_evicted && \
++			!(q).is_suspended)
+ 
+ /**
+  * struct queue
+-- 
+2.17.1
+
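
The core of the rework above is that suspend_queues()/resume_queues() now walk each device's queue list and act only on queues named in a caller-supplied ID array, adjusting the dqm's active-queue count per queue instead of evicting or restoring every queue of the process. The sketch below is a minimal user-space model of that bookkeeping, for illustration only: the struct layouts are simplified stand-ins rather than the kernel's kfd types, and the two helpers mirror queue_id_in_array() and suspend_single_queue() from the patch.

	/* Simplified, user-space model of the per-queue suspend bookkeeping
	 * introduced by this patch. struct queue_model and struct dqm_model
	 * are illustrative stand-ins, not the kernel's definitions.
	 */
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct queue_model {
		uint32_t queue_id;
		bool is_active;
		bool is_suspended;
	};

	struct dqm_model {
		int queue_count;	/* number of active (mapped) queues */
	};

	/* Mirrors queue_id_in_array() from the patch: linear membership test. */
	static bool queue_id_in_array(uint32_t queue_id, uint32_t num_queues,
				      const uint32_t *queue_ids)
	{
		uint32_t i;

		for (i = 0; i < num_queues; i++)
			if (queue_id == queue_ids[i])
				return true;
		return false;
	}

	/* Mirrors suspend_single_queue(): mark the queue suspended and, if it
	 * was active, drop it from the active-queue count so a later remap of
	 * the runlist leaves it unmapped.
	 */
	static void suspend_single_queue(struct dqm_model *dqm, struct queue_model *q)
	{
		q->is_suspended = true;
		if (q->is_active) {
			dqm->queue_count--;
			q->is_active = false;
		}
	}

	int main(void)
	{
		struct dqm_model dqm = { .queue_count = 3 };
		struct queue_model queues[] = {
			{ .queue_id = 1, .is_active = true },
			{ .queue_id = 2, .is_active = true },
			{ .queue_id = 7, .is_active = true },
		};
		uint32_t targets[] = { 2, 7 };	/* only these two are suspended */
		size_t i;

		for (i = 0; i < sizeof(queues) / sizeof(queues[0]); i++) {
			struct queue_model *q = &queues[i];

			if (!queue_id_in_array(q->queue_id, 2, targets))
				continue;	/* not in the caller's list: skip */
			if (q->is_suspended)
				continue;	/* already suspended: skip */
			suspend_single_queue(&dqm, q);
		}

		/* queue 1 is untouched, so one active queue remains. */
		printf("active queues remaining: %d\n", dqm.queue_count);
		return 0;
	}

In the real driver, the per-device pass then calls execute_queues_cpsch() under the dqm lock to remap the runlist, and the extended QUEUE_IS_ACTIVE() macro keeps a suspended queue from being counted active again until resume_single_queue() clears is_suspended.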