From aae9664f4449916f2f353727bdddceb1e98c3752 Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Thu, 12 Apr 2018 14:56:17 -0400 Subject: [PATCH 4267/5725] drm/amdkfd: CMA: Support multi device VRAM copy Support copy from VRAM on device1 to VRAM on device2. This is done using an intermediate System BO and double copy. [VRAM]--gpu1-->[System BO]--gpu2-->[VRAM] BUG: SWDEV-150755 Change-Id: I7edf2df3cc1688c1ebd1fa0ea8fa82d39cbf50d1 Signed-off-by: Harish Kasiviswanathan --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 124 +++++++++++++++++++++++-------- 1 file changed, 95 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index b07fe36..66c294a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1831,7 +1831,8 @@ static void kfd_free_cma_bos(struct cma_iter *ci) struct kfd_dev *dev = cma_bo->dev; /* sg table is deleted by free_memory_of_gpu */ - kfd_put_sg_table(cma_bo->sg); + if (cma_bo->sg) + kfd_put_sg_table(cma_bo->sg); dev->kfd2kgd->free_memory_of_gpu(dev->kgd, cma_bo->mem); list_del(&cma_bo->list); kfree(cma_bo); @@ -1867,16 +1868,21 @@ static int kfd_fence_put_wait_if_diff_context(struct dma_fence *cf, return ret; } -/* Create a system BO by pinning underlying system pages of the given userptr - * BO @ubo - * @ubo: Userptr BO - * @offset: Offset into ubo +#define MAX_SYSTEM_BO_SIZE (512*PAGE_SIZE) + +/* Create an equivalent system BO for the given @bo. If @bo is a userptr then + * create a new system BO by pinning underlying system pages of the given + * userptr BO. If @bo is in Local Memory then create an empty system BO and + * then copy @bo into this new BO. + * @bo: Userptr BO or Local Memory BO + * @offset: Offset into bo * @size: in/out: The size of the new BO could be less than requested if all - * the pages couldn't be pinned. 
This would be reflected in @size - * @mm/@task: mm/task to which @ubo belongs to + * the pages couldn't be pinned or size > MAX_SYSTEM_BO_SIZE. This would + * be reflected in @size + * @mm/@task: mm/task to which @bo belongs to * @cma_bo: out: new system BO */ -static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *ubo, +static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *bo, uint64_t *size, uint64_t offset, int cma_write, struct kfd_process *p, struct mm_struct *mm, @@ -1886,7 +1892,8 @@ static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *ubo, int ret; struct kfd_process_device *pdd = NULL; struct cma_system_bo *cbo; - uint64_t sg_size; + uint64_t bo_size = 0; + struct dma_fence *f; uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED | ALLOC_MEM_FLAGS_NO_SUBSTITUTE; @@ -1897,40 +1904,75 @@ static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *ubo, return -ENOMEM; INIT_LIST_HEAD(&cbo->list); - ret = kfd_create_sg_table_from_userptr_bo(ubo, offset, cma_write, mm, - task, size, &sg_size, - &cbo->sg); - if (ret) { - pr_err("Failed to create system BO. 
sg table error %d\n", ret); - return ret; + if (bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) + bo_size = min(*size, MAX_SYSTEM_BO_SIZE); + else if (bo->cpuva) { + ret = kfd_create_sg_table_from_userptr_bo(bo, offset, + cma_write, mm, task, + size, &bo_size, + &cbo->sg); + if (ret) { + pr_err("CMA: BO create with sg failed %d\n", ret); + goto sg_fail; + } + } else { + WARN_ON(1); + ret = -EINVAL; + goto sg_fail; } - mutex_lock(&p->mutex); pdd = kfd_get_process_device_data(kdev, p); if (!pdd) { + mutex_unlock(&p->mutex); pr_err("Process device data doesn't exist\n"); ret = -EINVAL; goto pdd_fail; } - ret = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, 0ULL, sg_size, + ret = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, 0ULL, bo_size, pdd->vm, cbo->sg, &cbo->mem, NULL, flags); + mutex_unlock(&p->mutex); if (ret) { pr_err("Failed to create shadow system BO %d\n", ret); goto pdd_fail; } - mutex_unlock(&p->mutex); + + if (bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { + ret = kdev->kfd2kgd->copy_mem_to_mem(kdev->kgd, bo->mem, + offset, cbo->mem, 0, + bo_size, &f, size); + if (ret) { + pr_err("CMA: Intermediate copy failed %d\n", ret); + goto copy_fail; + } + + /* Wait for the copy to finish as subsequent copy will be done + * by different device + */ + ret = kfd_cma_fence_wait(f); + dma_fence_put(f); + if (ret) { + pr_err("CMA: Intermediate copy timed out %d\n", ret); + goto copy_fail; + } + } + cbo->dev = kdev; *cma_bo = cbo; return ret; +copy_fail: + kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, bo->mem); pdd_fail: - mutex_unlock(&p->mutex); - kfd_put_sg_table(cbo->sg); - sg_free_table(cbo->sg); - kfree(cbo->sg); + if (cbo->sg) { + kfd_put_sg_table(cbo->sg); + sg_free_table(cbo->sg); + kfree(cbo->sg); + } +sg_fail: + kfree(cbo); return ret; } @@ -2153,6 +2195,7 @@ static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di, uint64_t src_offset = si->bo_offset, dst_offset = di->bo_offset; struct kgd_mem *src_mem = src_bo->mem, *dst_mem = dst_bo->mem; struct 
kfd_dev *dev = dst_bo->dev; + struct cma_system_bo *tmp_bo = NULL; *copied = 0; if (f) @@ -2188,11 +2231,22 @@ static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di, dst_offset = di->bo_offset & (PAGE_SIZE - 1); list_add_tail(&di->cma_bo->list, &di->cma_list); } else if (src_bo->dev->kgd != dst_bo->dev->kgd) { - /* This indicates that either or/both BOs are in local mem. */ + /* This indicates that at least one of the BOs is in local mem. + * If both are in local mem of different devices then create an + * intermediate System BO and do a double copy + * [VRAM]--gpu1-->[System BO]--gpu2-->[VRAM]. + * If only one BO is in VRAM then use that GPU to do the copy + */ if (src_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM && dst_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { - pr_err("CMA fail. Local mem & not in same dev\n"); - return -EINVAL; + dev = dst_bo->dev; + err = kfd_create_cma_system_bo(src_bo->dev, src_bo, + &size, si->bo_offset, + cma_write, si->p, + si->mm, si->task, + &tmp_bo); + src_mem = tmp_bo->mem; + src_offset = 0; } else if (src_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) dev = src_bo->dev; /* else already set to dst_bo->dev */ @@ -2203,10 +2257,22 @@ static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di, return -EINVAL; } - err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(dev->kgd, src_mem, - src_offset, dst_mem, - dst_offset, size, f, - copied); + err = dev->kfd2kgd->copy_mem_to_mem(dev->kgd, src_mem, src_offset, + dst_mem, dst_offset, size, f, + copied); + /* The tmp_bo allocates additional memory. So it is better to wait and + * delete. Also since multiple GPUs are involved the copies are + * currently not pipelined. + */ + if (tmp_bo) { + if (!err) { + kfd_cma_fence_wait(*f); + dma_fence_put(*f); + *f = NULL; + } + dev->kfd2kgd->free_memory_of_gpu(dev->kgd, tmp_bo->mem); + kfree(tmp_bo); + } return err; } -- 2.7.4