From aae9664f4449916f2f353727bdddceb1e98c3752 Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Thu, 12 Apr 2018 14:56:17 -0400 Subject: [PATCH 4267/5725] drm/amdkfd: CMA: Support multi device VRAM copy Support copy from VRAM on device1 to VRAM on device2. This is done using an intermediate System BO and double copy. [VRAM]--gpu1-->[System BO]--gpu2-->[VRAM] BUG: SWDEV-150755 Change-Id: I7edf2df3cc1688c1ebd1fa0ea8fa82d39cbf50d1 Signed-off-by: Harish Kasiviswanathan --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 124 +++++++++++++++++++++++-------- 1 file changed, 95 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index b07fe36..66c294a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1831,7 +1831,8 @@ static void kfd_free_cma_bos(struct cma_iter *ci) struct kfd_dev *dev = cma_bo->dev; /* sg table is deleted by free_memory_of_gpu */ - kfd_put_sg_table(cma_bo->sg); + if (cma_bo->sg) + kfd_put_sg_table(cma_bo->sg); dev->kfd2kgd->free_memory_of_gpu(dev->kgd, cma_bo->mem); list_del(&cma_bo->list); kfree(cma_bo); @@ -1867,16 +1868,21 @@ static int kfd_fence_put_wait_if_diff_context(struct dma_fence *cf, return ret; } -/* Create a system BO by pinning underlying system pages of the given userptr - * BO @ubo - * @ubo: Userptr BO - * @offset: Offset into ubo +#define MAX_SYSTEM_BO_SIZE (512*PAGE_SIZE) + +/* Create an equivalent system BO for the given @bo. If @bo is a userptr then + * create a new system BO by pinning underlying system pages of the given + * userptr BO. If @bo is in Local Memory then create an empty system BO and + * then copy @bo into this new BO. + * @bo: Userptr BO or Local Memory BO + * @offset: Offset into bo * @size: in/out: The size of the new BO could be less than requested if all - * the pages couldn't be pinned. 
This would be reflected in @size - * @mm/@task: mm/task to which @ubo belongs to + * the pages couldn't be pinned or size > MAX_SYSTEM_BO_SIZE. This would + * be reflected in @size + * @mm/@task: mm/task to which @bo belongs to * @cma_bo: out: new system BO */ -static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *ubo, +static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *bo, uint64_t *size, uint64_t offset, int cma_write, struct kfd_process *p, struct mm_struct *mm, @@ -1886,7 +1892,8 @@ static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *ubo, int ret; struct kfd_process_device *pdd = NULL; struct cma_system_bo *cbo; - uint64_t sg_size; + uint64_t bo_size = 0; + struct dma_fence *f; uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED | ALLOC_MEM_FLAGS_NO_SUBSTITUTE; @@ -1897,40 +1904,75 @@ static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *ubo, return -ENOMEM; INIT_LIST_HEAD(&cbo->list); - ret = kfd_create_sg_table_from_userptr_bo(ubo, offset, cma_write, mm, - task, size, &sg_size, - &cbo->sg); - if (ret) { - pr_err("Failed to create system BO. 
sg table error %d\n", ret); - return ret; + if (bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) + bo_size = min(*size, MAX_SYSTEM_BO_SIZE); + else if (bo->cpuva) { + ret = kfd_create_sg_table_from_userptr_bo(bo, offset, + cma_write, mm, task, + size, &bo_size, + &cbo->sg); + if (ret) { + pr_err("CMA: BO create with sg failed %d\n", ret); + goto sg_fail; + } + } else { + WARN_ON(1); + ret = -EINVAL; + goto sg_fail; } - mutex_lock(&p->mutex); pdd = kfd_get_process_device_data(kdev, p); if (!pdd) { + mutex_unlock(&p->mutex); pr_err("Process device data doesn't exist\n"); ret = -EINVAL; goto pdd_fail; } - ret = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, 0ULL, sg_size, + ret = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, 0ULL, bo_size, pdd->vm, cbo->sg, &cbo->mem, NULL, flags); + mutex_unlock(&p->mutex); if (ret) { pr_err("Failed to create shadow system BO %d\n", ret); goto pdd_fail; } - mutex_unlock(&p->mutex); + + if (bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { + ret = kdev->kfd2kgd->copy_mem_to_mem(kdev->kgd, bo->mem, + offset, cbo->mem, 0, + bo_size, &f, size); + if (ret) { + pr_err("CMA: Intermediate copy failed %d\n", ret); + goto copy_fail; + } + + /* Wait for the copy to finish as subsequent copy will be done + * by different device + */ + ret = kfd_cma_fence_wait(f); + dma_fence_put(f); + if (ret) { + pr_err("CMA: Intermediate copy timed out %d\n", ret); + goto copy_fail; + } + } + cbo->dev = kdev; *cma_bo = cbo; return ret; +copy_fail: + kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, bo->mem); pdd_fail: - mutex_unlock(&p->mutex); - kfd_put_sg_table(cbo->sg); - sg_free_table(cbo->sg); - kfree(cbo->sg); + if (cbo->sg) { + kfd_put_sg_table(cbo->sg); + sg_free_table(cbo->sg); + kfree(cbo->sg); + } +sg_fail: + kfree(cbo); return ret; } @@ -2153,6 +2195,7 @@ static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di, uint64_t src_offset = si->bo_offset, dst_offset = di->bo_offset; struct kgd_mem *src_mem = src_bo->mem, *dst_mem = dst_bo->mem; struct 
kfd_dev *dev = dst_bo->dev; + struct cma_system_bo *tmp_bo = NULL; *copied = 0; if (f) @@ -2188,11 +2231,22 @@ static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di, dst_offset = di->bo_offset & (PAGE_SIZE - 1); list_add_tail(&di->cma_bo->list, &di->cma_list); } else if (src_bo->dev->kgd != dst_bo->dev->kgd) { - /* This indicates that either or/both BOs are in local mem. */ + /* This indicates that at least one of the BOs is in local mem. + * If both are in local mem of different devices then create an + * intermediate System BO and do a double copy + * [VRAM]--gpu1-->[System BO]--gpu2-->[VRAM]. + * If only one BO is in VRAM then use that GPU to do the copy + */ if (src_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM && dst_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { - pr_err("CMA fail. Local mem & not in same dev\n"); - return -EINVAL; + dev = dst_bo->dev; + err = kfd_create_cma_system_bo(src_bo->dev, src_bo, + &size, si->bo_offset, + cma_write, si->p, + si->mm, si->task, + &tmp_bo); + src_mem = tmp_bo->mem; + src_offset = 0; } else if (src_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) dev = src_bo->dev; /* else already set to dst_bo->dev */ @@ -2203,10 +2257,22 @@ static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di, return -EINVAL; } - err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(dev->kgd, src_mem, - src_offset, dst_mem, - dst_offset, size, f, - copied); + err = dev->kfd2kgd->copy_mem_to_mem(dev->kgd, src_mem, src_offset, + dst_mem, dst_offset, size, f, + copied); + /* The tmp_bo allocates additional memory. So it is better to wait and + * delete. Also since multiple GPUs are involved the copies are + * currently not pipelined. + */ + if (tmp_bo) { + if (!err) { + kfd_cma_fence_wait(*f); + dma_fence_put(*f); + *f = NULL; + } + dev->kfd2kgd->free_memory_of_gpu(dev->kgd, tmp_bo->mem); + kfree(tmp_bo); + } return err; } -- 2.7.4