aboutsummaryrefslogtreecommitdiffstats
path: root/common/recipes-kernel/linux/linux-yocto-4.14.71/4267-drm-amdkfd-CMA-Support-multi-device-VRAM-copy.patch
diff options
context:
space:
mode:
Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.14.71/4267-drm-amdkfd-CMA-Support-multi-device-VRAM-copy.patch')
-rw-r--r--common/recipes-kernel/linux/linux-yocto-4.14.71/4267-drm-amdkfd-CMA-Support-multi-device-VRAM-copy.patch223
1 files changed, 223 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.14.71/4267-drm-amdkfd-CMA-Support-multi-device-VRAM-copy.patch b/common/recipes-kernel/linux/linux-yocto-4.14.71/4267-drm-amdkfd-CMA-Support-multi-device-VRAM-copy.patch
new file mode 100644
index 00000000..33b35239
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.14.71/4267-drm-amdkfd-CMA-Support-multi-device-VRAM-copy.patch
@@ -0,0 +1,223 @@
+From aae9664f4449916f2f353727bdddceb1e98c3752 Mon Sep 17 00:00:00 2001
+From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
+Date: Thu, 12 Apr 2018 14:56:17 -0400
+Subject: [PATCH 4267/5725] drm/amdkfd: CMA: Support multi device VRAM copy
+
+Support copy from VRAM on device1 to VRAM on device2. This is done using
+an intermediate System BO and double copy.
+ [VRAM]--gpu1-->[System BO]--gpu2-->[VRAM]
+
+BUG: SWDEV-150755
+
+Change-Id: I7edf2df3cc1688c1ebd1fa0ea8fa82d39cbf50d1
+Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
+---
+ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 124 +++++++++++++++++++++++--------
+ 1 file changed, 95 insertions(+), 29 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+index b07fe36..66c294a 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+@@ -1831,7 +1831,8 @@ static void kfd_free_cma_bos(struct cma_iter *ci)
+ struct kfd_dev *dev = cma_bo->dev;
+
+ /* sg table is deleted by free_memory_of_gpu */
+- kfd_put_sg_table(cma_bo->sg);
++ if (cma_bo->sg)
++ kfd_put_sg_table(cma_bo->sg);
+ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, cma_bo->mem);
+ list_del(&cma_bo->list);
+ kfree(cma_bo);
+@@ -1867,16 +1868,21 @@ static int kfd_fence_put_wait_if_diff_context(struct dma_fence *cf,
+ return ret;
+ }
+
+-/* Create a system BO by pinning underlying system pages of the given userptr
+- * BO @ubo
+- * @ubo: Userptr BO
+- * @offset: Offset into ubo
++#define MAX_SYSTEM_BO_SIZE (512*PAGE_SIZE)
++
++/* Create an equivalent system BO for the given @bo. If @bo is a userptr then
++ * create a new system BO by pinning underlying system pages of the given
++ * userptr BO. If @bo is in Local Memory then create an empty system BO and
++ * then copy @bo into this new BO.
++ * @bo: Userptr BO or Local Memory BO
++ * @offset: Offset into bo
+ * @size: in/out: The size of the new BO could be less than requested if all
+- * the pages couldn't be pinned. This would be reflected in @size
+- * @mm/@task: mm/task to which @ubo belongs to
++ * the pages couldn't be pinned or size > MAX_SYSTEM_BO_SIZE. This would
++ * be reflected in @size
++ * @mm/@task: mm/task to which @bo belongs to
+ * @cma_bo: out: new system BO
+ */
+-static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *ubo,
++static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *bo,
+ uint64_t *size, uint64_t offset,
+ int cma_write, struct kfd_process *p,
+ struct mm_struct *mm,
+@@ -1886,7 +1892,8 @@ static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *ubo,
+ int ret;
+ struct kfd_process_device *pdd = NULL;
+ struct cma_system_bo *cbo;
+- uint64_t sg_size;
++ uint64_t bo_size = 0;
++ struct dma_fence *f;
+
+ uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED |
+ ALLOC_MEM_FLAGS_NO_SUBSTITUTE;
+@@ -1897,40 +1904,75 @@ static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *ubo,
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&cbo->list);
+- ret = kfd_create_sg_table_from_userptr_bo(ubo, offset, cma_write, mm,
+- task, size, &sg_size,
+- &cbo->sg);
+- if (ret) {
+- pr_err("Failed to create system BO. sg table error %d\n", ret);
+- return ret;
++ if (bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
++ bo_size = min(*size, MAX_SYSTEM_BO_SIZE);
++ else if (bo->cpuva) {
++ ret = kfd_create_sg_table_from_userptr_bo(bo, offset,
++ cma_write, mm, task,
++ size, &bo_size,
++ &cbo->sg);
++ if (ret) {
++ pr_err("CMA: BO create with sg failed %d\n", ret);
++ goto sg_fail;
++ }
++ } else {
++ WARN_ON(1);
++ ret = -EINVAL;
++ goto sg_fail;
+ }
+-
+ mutex_lock(&p->mutex);
+ pdd = kfd_get_process_device_data(kdev, p);
+ if (!pdd) {
++ mutex_unlock(&p->mutex);
+ pr_err("Process device data doesn't exist\n");
+ ret = -EINVAL;
+ goto pdd_fail;
+ }
+
+- ret = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, 0ULL, sg_size,
++ ret = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, 0ULL, bo_size,
+ pdd->vm, cbo->sg,
+ &cbo->mem, NULL, flags);
++ mutex_unlock(&p->mutex);
+ if (ret) {
+ pr_err("Failed to create shadow system BO %d\n", ret);
+ goto pdd_fail;
+ }
+- mutex_unlock(&p->mutex);
++
++ if (bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
++ ret = kdev->kfd2kgd->copy_mem_to_mem(kdev->kgd, bo->mem,
++ offset, cbo->mem, 0,
++ bo_size, &f, size);
++ if (ret) {
++ pr_err("CMA: Intermediate copy failed %d\n", ret);
++ goto copy_fail;
++ }
++
++ /* Wait for the copy to finish as subsequent copy will be done
++ * by different device
++ */
++ ret = kfd_cma_fence_wait(f);
++ dma_fence_put(f);
++ if (ret) {
++ pr_err("CMA: Intermediate copy timed out %d\n", ret);
++ goto copy_fail;
++ }
++ }
++
+ cbo->dev = kdev;
+ *cma_bo = cbo;
+
+ return ret;
+
++copy_fail:
++ kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, bo->mem);
+ pdd_fail:
+- mutex_unlock(&p->mutex);
+- kfd_put_sg_table(cbo->sg);
+- sg_free_table(cbo->sg);
+- kfree(cbo->sg);
++ if (cbo->sg) {
++ kfd_put_sg_table(cbo->sg);
++ sg_free_table(cbo->sg);
++ kfree(cbo->sg);
++ }
++sg_fail:
++ kfree(cbo);
+ return ret;
+ }
+
+@@ -2153,6 +2195,7 @@ static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di,
+ uint64_t src_offset = si->bo_offset, dst_offset = di->bo_offset;
+ struct kgd_mem *src_mem = src_bo->mem, *dst_mem = dst_bo->mem;
+ struct kfd_dev *dev = dst_bo->dev;
++ struct cma_system_bo *tmp_bo = NULL;
+
+ *copied = 0;
+ if (f)
+@@ -2188,11 +2231,22 @@ static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di,
+ dst_offset = di->bo_offset & (PAGE_SIZE - 1);
+ list_add_tail(&di->cma_bo->list, &di->cma_list);
+ } else if (src_bo->dev->kgd != dst_bo->dev->kgd) {
+- /* This indicates that either or/both BOs are in local mem. */
++	/* This indicates that at least one of the BOs is in local mem.
++ * If both are in local mem of different devices then create an
++ * intermediate System BO and do a double copy
++ * [VRAM]--gpu1-->[System BO]--gpu2-->[VRAM].
++ * If only one BO is in VRAM then use that GPU to do the copy
++ */
+ if (src_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM &&
+ dst_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
+- pr_err("CMA fail. Local mem & not in same dev\n");
+- return -EINVAL;
++ dev = dst_bo->dev;
++ err = kfd_create_cma_system_bo(src_bo->dev, src_bo,
++ &size, si->bo_offset,
++ cma_write, si->p,
++ si->mm, si->task,
++ &tmp_bo);
++ src_mem = tmp_bo->mem;
++ src_offset = 0;
+ } else if (src_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+ dev = src_bo->dev;
+ /* else already set to dst_bo->dev */
+@@ -2203,10 +2257,22 @@ static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di,
+ return -EINVAL;
+ }
+
+- err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(dev->kgd, src_mem,
+- src_offset, dst_mem,
+- dst_offset, size, f,
+- copied);
++ err = dev->kfd2kgd->copy_mem_to_mem(dev->kgd, src_mem, src_offset,
++ dst_mem, dst_offset, size, f,
++ copied);
++ /* The tmp_bo allocates additional memory. So it is better to wait and
++ * delete. Also since multiple GPUs are involved the copies are
++ * currently not pipelined.
++ */
++ if (tmp_bo) {
++ if (!err) {
++ kfd_cma_fence_wait(*f);
++ dma_fence_put(*f);
++ *f = NULL;
++ }
++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, tmp_bo->mem);
++ kfree(tmp_bo);
++ }
+ return err;
+ }
+
+--
+2.7.4
+