1 files changed, 286 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.14.71/1502-drm-amdkfd-Fix-CU-masking-for-more-than-32-CUs.patch b/common/recipes-kernel/linux/linux-yocto-4.14.71/1502-drm-amdkfd-Fix-CU-masking-for-more-than-32-CUs.patch
new file mode 100644
index 00000000..1e4795d1
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.14.71/1502-drm-amdkfd-Fix-CU-masking-for-more-than-32-CUs.patch
@@ -0,0 +1,286 @@
+From c95a7ced9bd72a1f224ed09dac2d5dbe2a75a1c6 Mon Sep 17 00:00:00 2001
+From: Kent Russell <kent.russell@amd.com>
+Date: Thu, 18 Aug 2016 13:15:39 -0400
+Subject: [PATCH 1502/4131] drm/amdkfd Fix CU masking for more than 32 CUs
+
+The initial implementation of CU masking used a single uint32 instead of
+an array, which limited the number of CUs that could be masked to 32.
+Match the thunk/kernel spec and pass in the cu_mask_count (number of
+bits total) and a uint32 array to mask properly.
+BUG:KFD-277
+
+Change-Id: I61d17685809d9beb62fdc9a47a1c19d8a2107a54
+Signed-off-by: Kent Russell <kent.russell@amd.com>
+---
+ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c           | 36 +++++++++++--
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c      |  1 +
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c   | 59 +++++++++++-----------
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c    | 59 +++++++++++-----------
+ drivers/gpu/drm/amd/amdkfd/kfd_priv.h              |  3 +-
+ .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  7 +++
+ 6 files changed, 101 insertions(+), 64 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+index 831f63f..c144752 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+@@ -391,14 +391,44 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
+ 					void *data)
+ {
+ 	int retval;
++	const int max_num_cus = 1024;
+ 	struct kfd_ioctl_set_cu_mask_args *args = data;
+ 	struct queue_properties properties;
+ 	uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr;
++	size_t cu_mask_size = sizeof(uint32_t) * (args->num_cu_mask / 32);
+ 
+-	if (get_user(properties.cu_mask, cu_mask_ptr))
++	if ((args->num_cu_mask % 32) != 0) {
++		pr_debug("kfd: num_cu_mask (0x%x) must be a multiple of 32",
++				args->num_cu_mask);
++		return -EINVAL;
++	}
++
++	properties.cu_mask_count = args->num_cu_mask;
++	if (properties.cu_mask_count == 0) {
++		pr_debug("kfd: CU Mask cannot be 0");
++		return -EINVAL;
++	}
++
++	/* To prevent an unreasonably large CU mask size, set an arbitrary
++	 * limit of max_num_cus bits.  We can then just drop any CU mask bits
++	 * past max_num_cus bits and just use the first max_num_cus bits.
++	 */
++	if (properties.cu_mask_count > max_num_cus) {
++		pr_debug("kfd: CU mask cannot be greater than 1024 bits");
++		properties.cu_mask_count = max_num_cus;
++		cu_mask_size = sizeof(uint32_t) * (max_num_cus/32);
++	}
++
++	properties.cu_mask = kzalloc(cu_mask_size, GFP_KERNEL);
++	if (!properties.cu_mask)
++		return -ENOMEM;
++
++	retval = copy_from_user(properties.cu_mask, cu_mask_ptr, cu_mask_size);
++	if (retval) {
++		pr_debug("kfd: Could not copy cu mask from userspace");
++		kfree(properties.cu_mask);
+ 		return -EFAULT;
+-	if (properties.cu_mask == 0)
+-		return 0;
++	}
+ 
+ 	down_write(&p->lock);
+ 
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+index 162a83f..f19f2b3 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+@@ -123,6 +123,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
+ 	prop.write_ptr = (uint32_t *) kq->wptr_gpu_addr;
+ 	prop.eop_ring_buffer_address = kq->eop_gpu_addr;
+ 	prop.eop_ring_buffer_size = PAGE_SIZE;
++	prop.cu_mask = NULL;
+ 
+ 	if (init_queue(&kq->queue, &prop) != 0)
+ 		goto err_init_queue;
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+index 1badce1..959a7f1 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+@@ -48,12 +48,12 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+ {
+ 	struct cik_mqd *m;
+ 	struct kfd_cu_info cu_info;
+-	uint32_t mgmt_se_mask;
+-	uint32_t cu_sh_mask, cu_sh_shift;
+-	uint32_t cu_mask;
+-	int se, sh;
++	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
++	uint32_t cu_mask_count = q->cu_mask_count;
++	const uint32_t *cu_mask = q->cu_mask;
++	int se, cu_per_sh, cu_index, i;
+ 
+-	if (q->cu_mask == 0)
++	if (WARN_ON(cu_mask_count == 0))
+ 		return;
+ 
+ 	m = get_mqd(mqd);
+@@ -63,32 +63,31 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+ 	m->compute_static_thread_mgmt_se3 = 0;
+ 
+ 	mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info);
+-	cu_mask = q->cu_mask;
+-	for (se = 0; se < cu_info.num_shader_engines && cu_mask; se++) {
+-		mgmt_se_mask = 0;
+-		for (sh = 0; sh < 2 && cu_mask; sh++) {
+-			cu_sh_shift = hweight32(cu_info.cu_bitmap[se][sh]);
+-			cu_sh_mask = (1 << cu_sh_shift) - 1;
+-			mgmt_se_mask |= (cu_mask & cu_sh_mask) << (sh * 16);
+-			cu_mask >>= cu_sh_shift;
+-		}
+-		switch (se) {
+-		case 0:
+-			m->compute_static_thread_mgmt_se0 = mgmt_se_mask;
+-			break;
+-		case 1:
+-			m->compute_static_thread_mgmt_se1 = mgmt_se_mask;
+-			break;
+-		case 2:
+-			m->compute_static_thread_mgmt_se2 = mgmt_se_mask;
+-			break;
+-		case 3:
+-			m->compute_static_thread_mgmt_se3 = mgmt_se_mask;
+-			break;
+-		default:
+-			break;
+-		}
++
++	/* If # CU mask bits > # CUs, set it to the # of CUs */
++	if (cu_mask_count > cu_info.cu_active_number)
++		cu_mask_count = cu_info.cu_active_number;
++
++	cu_index = 0;
++	for (se = 0; se < cu_info.num_shader_engines; se++) {
++		cu_per_sh = 0;
++
++		/* Get the number of CUs on this Shader Engine */
++		for (i = 0; i < 4; i++)
++			cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]);
++
++		se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32);
++		if ((cu_per_sh + (cu_index % 32)) > 32)
++			se_mask[se] |= cu_mask[(cu_index / 32) + 1]
++					<< (32 - (cu_index % 32));
++		se_mask[se] &= (1 << cu_per_sh) - 1;
++		cu_index += cu_per_sh;
+ 	}
++	m->compute_static_thread_mgmt_se0 = se_mask[0];
++	m->compute_static_thread_mgmt_se1 = se_mask[1];
++	m->compute_static_thread_mgmt_se2 = se_mask[2];
++	m->compute_static_thread_mgmt_se3 = se_mask[3];
++
+ 	pr_debug("kfd: update cu mask to %#x %#x %#x %#x\n",
+ 		m->compute_static_thread_mgmt_se0,
+ 		m->compute_static_thread_mgmt_se1,
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+index d78964c..59bc27e 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+@@ -48,12 +48,12 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+ {
+ 	struct vi_mqd *m;
+ 	struct kfd_cu_info cu_info;
+-	uint32_t mgmt_se_mask;
+-	uint32_t cu_sh_mask, cu_sh_shift;
+-	uint32_t cu_mask;
+-	int se, sh;
++	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
++	uint32_t cu_mask_count = q->cu_mask_count;
++	const uint32_t *cu_mask = q->cu_mask;
++	int se, cu_per_sh, cu_index, i;
+ 
+-	if (q->cu_mask == 0)
++	if (WARN_ON(cu_mask_count == 0))
+ 		return;
+ 
+ 	m = get_mqd(mqd);
+@@ -63,32 +63,31 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+ 	m->compute_static_thread_mgmt_se3 = 0;
+ 
+ 	mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info);
+-	cu_mask = q->cu_mask;
+-	for (se = 0; se < cu_info.num_shader_engines && cu_mask; se++) {
+-		mgmt_se_mask = 0;
+-		for (sh = 0; sh < 2 && cu_mask; sh++) {
+-			cu_sh_shift = hweight32(cu_info.cu_bitmap[se][sh]);
+-			cu_sh_mask = (1 << cu_sh_shift) - 1;
+-			mgmt_se_mask |= (cu_mask & cu_sh_mask) << (sh * 16);
+-			cu_mask >>= cu_sh_shift;
+-		}
+-		switch (se) {
+-		case 0:
+-			m->compute_static_thread_mgmt_se0 = mgmt_se_mask;
+-			break;
+-		case 1:
+-			m->compute_static_thread_mgmt_se1 = mgmt_se_mask;
+-			break;
+-		case 2:
+-			m->compute_static_thread_mgmt_se2 = mgmt_se_mask;
+-			break;
+-		case 3:
+-			m->compute_static_thread_mgmt_se3 = mgmt_se_mask;
+-			break;
+-		default:
+-			break;
+-		}
++
++	/* If # CU mask bits > # CUs, set it to the # of CUs */
++	if (cu_mask_count > cu_info.cu_active_number)
++		cu_mask_count = cu_info.cu_active_number;
++
++	cu_index = 0;
++	for (se = 0; se < cu_info.num_shader_engines; se++) {
++		cu_per_sh = 0;
++
++		/* Get the number of CUs on this Shader Engine */
++		for (i = 0; i < 4; i++)
++			cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]);
++
++		se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32);
++		if ((cu_per_sh + (cu_index % 32)) > 32)
++			se_mask[se] |= cu_mask[(cu_index / 32) + 1]
++					<< (32 - (cu_index % 32));
++		se_mask[se] &= (1 << cu_per_sh) - 1;
++		cu_index += cu_per_sh;
+ 	}
++	m->compute_static_thread_mgmt_se0 = se_mask[0];
++	m->compute_static_thread_mgmt_se1 = se_mask[1];
++	m->compute_static_thread_mgmt_se2 = se_mask[2];
++	m->compute_static_thread_mgmt_se3 = se_mask[3];
++
+ 	pr_debug("kfd: update cu mask to %#x %#x %#x %#x\n",
+ 		m->compute_static_thread_mgmt_se0,
+ 		m->compute_static_thread_mgmt_se1,
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+index 2bfe761..0a2afa7 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+@@ -425,7 +425,8 @@ struct queue_properties {
+ 	uint64_t tba_addr;
+ 	uint64_t tma_addr;
+ 	/* Relevant for CU */
+-	uint32_t cu_mask;
++	uint32_t cu_mask_count; /* Must be a multiple of 32 */
++	uint32_t *cu_mask;
+ };
+ 
+ /**
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+index cf08e824..b68776e 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+@@ -337,6 +337,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
+ 
+ 	if (pqn->q) {
+ 		dqm = pqn->q->device->dqm;
++		kfree(pqn->q->properties.cu_mask);
+ 		retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q);
+ 		if (retval != 0) {
+ 			if (retval == -ETIME)
+@@ -400,6 +401,12 @@ int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid,
+ 		return -EFAULT;
+ 	}
+ 
++	/* Free the old CU mask memory if it is already allocated, then
++	 * allocate memory for the new CU mask.
++	 */
++	kfree(pqn->q->properties.cu_mask);
++
++	pqn->q->properties.cu_mask_count = p->cu_mask_count;
+ 	pqn->q->properties.cu_mask = p->cu_mask;
+ 
+ 	retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
+-- 
+2.7.4
+