From 4d98ca2586f4857e43946b29175cb5d953d79b15 Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Mon, 26 Mar 2018 16:45:06 -0400 Subject: [PATCH 4162/5725] drm/amdkfd: CMA: Handle userptr to userptr BO copy The CMA userptr implementation is incomplete because it doesn't properly handle the case where the BO is evicted. This patch handles the case where both source and destination BOs are userptr. It is more efficient to use CPU to do the copy in this case, very similar to process_vm_read/write() functions. Change-Id: I5d01d906f04190d71e8663785718060411dede4e Signed-off-by: Harish Kasiviswanathan Signed-off-by: Kalyan Alle Conflicts: drivers/gpu/drm/amd/amdkfd/kfd_chardev.c --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 179 +++++++++++++++++++++++++++++-- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 + 2 files changed, 172 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index ef1bd27..bd09647 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "kfd_priv.h" #include "kfd_device_queue_manager.h" @@ -1714,6 +1715,12 @@ static int kfd_ioctl_ipc_import_handle(struct file *filep, return r; } +/* Maximum number of entries for process pages array which lives on stack */ +#define MAX_PP_STACK_COUNT 16 +/* Maximum number of pages kmalloc'd to hold struct page's during copy */ +#define MAX_KMALLOC_PAGES (PAGE_SIZE * 2) +#define MAX_PP_KMALLOC_COUNT (MAX_KMALLOC_PAGES/sizeof(struct page *)) + /* Update cma_iter.cur_bo with KFD BO that is associated with * cma_iter.array.va_addr */ @@ -1762,7 +1769,8 @@ static int kfd_cma_iter_advance(struct cma_iter *ci, unsigned long size) } static int kfd_cma_iter_init(struct kfd_memory_range *arr, unsigned long segs, - struct kfd_process *p, struct cma_iter *ci) + struct kfd_process *p, struct mm_struct *mm, + struct task_struct *task, struct cma_iter *ci) { int
ret; int nr; @@ -1775,6 +1783,8 @@ static int kfd_cma_iter_init(struct kfd_memory_range *arr, unsigned long segs, ci->nr_segs = segs; ci->p = p; ci->offset = 0; + ci->mm = mm; + ci->task = task; for (nr = 0; nr < segs; nr++) ci->total += arr[nr].size; @@ -1795,6 +1805,159 @@ static bool kfd_cma_iter_end(struct cma_iter *ci) return false; } +/* Copies @size bytes from si->cur_bo to di->cur_bo BO. The function assumes + * both source and dest. BOs are userptr BOs. Both BOs can either belong to + * current process or one of the BOs can belong to a different + * process. @Returns 0 on success, -ve on failure + * + * @si: Source iter + * @di: Dest. iter + * @cma_write: Indicates if it is write to remote or read from remote + * @size: amount of bytes to be copied + * @copied: Return number of bytes actually copied. + */ +static int kfd_copy_userptr_bos(struct cma_iter *si, struct cma_iter *di, + bool cma_write, uint64_t size, + uint64_t *copied) +{ + int i, ret = 0, locked; + unsigned int nents; int nl; /* signed: get_user_pages_remote() may return -errno */ + unsigned int offset_in_page; + struct page *pp_stack[MAX_PP_STACK_COUNT]; + struct page **process_pages = pp_stack; + unsigned long rva, lva = 0, flags = 0; + uint64_t copy_size, to_copy = size; + struct cma_iter *li, *ri; + + if (cma_write) { + ri = di; + li = si; + flags |= FOLL_WRITE; + } else { + li = di; + ri = si; + } + /* rva: remote virtual address. Page aligned to start page. + * rva + offset_in_page: Points to remote start address + * lva: local virtual address. Points to the start address.
+ * nents: computes number of remote pages to request + */ + offset_in_page = ri->bo_offset & (PAGE_SIZE - 1); + rva = (ri->cur_bo->cpuva + ri->bo_offset) & PAGE_MASK; + lva = li->cur_bo->cpuva + li->bo_offset; + + nents = (size + offset_in_page + PAGE_SIZE - 1) / PAGE_SIZE; + + copy_size = min_t(uint64_t, size, PAGE_SIZE - offset_in_page); + *copied = 0; + + if (nents > MAX_PP_STACK_COUNT) { + /* For reliability kmalloc only 2 pages worth */ + process_pages = kmalloc(min_t(size_t, MAX_KMALLOC_PAGES, + sizeof(struct page *)*nents), + GFP_KERNEL); + + if (!process_pages) + return -ENOMEM; + } + + while (nents && to_copy) { + nl = min_t(unsigned int, MAX_PP_KMALLOC_COUNT, nents); + locked = 1; + down_read(&ri->mm->mmap_sem); + nl = get_user_pages_remote(ri->task, ri->mm, rva, nl, + flags, process_pages, NULL, + &locked); + if (locked) + up_read(&ri->mm->mmap_sem); + if (nl <= 0) { + pr_err("CMA: Invalid virtual address 0x%lx\n", rva); + ret = -EFAULT; + break; + } + + for (i = 0; i < nl; i++) { + unsigned int n; + void *kaddr = kmap_atomic(process_pages[i]); + + if (cma_write) { + n = copy_from_user(kaddr+offset_in_page, + (void *)lva, copy_size); + set_page_dirty(process_pages[i]); + } else { + n = copy_to_user((void *)lva, + kaddr+offset_in_page, + copy_size); + } + kunmap_atomic(kaddr); + if (n) { + ret = -EFAULT; + break; + } + to_copy -= copy_size; + if (!to_copy) + break; + lva += copy_size; + rva += (copy_size + offset_in_page); + WARN_ONCE(rva & (PAGE_SIZE - 1), + "CMA: Error in remote VA computation"); + offset_in_page = 0; + copy_size = min_t(uint64_t, to_copy, PAGE_SIZE); + } + + for (i = 0; i < nl; i++) + put_page(process_pages[i]); + + if (ret) + break; + nents -= nl; + } + + if (process_pages != pp_stack) + kfree(process_pages); + + *copied = (size - to_copy); + return ret; + +} + +/* Copies @size bytes from si->cur_bo to di->cur_bo starting at their + * respective offset. + * @si: Source iter + * @di: Dest.
iter + * @cma_write: Indicates if it is write to remote or read from remote + * @size: amount of bytes to be copied + * @f: Return the last fence if any + * @copied: Return number of bytes actually copied. + */ +static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di, + int cma_write, uint64_t size, + struct dma_fence **f, uint64_t *copied) +{ + int err = 0; + struct kfd_bo *dst_bo = di->cur_bo, *src_bo = si->cur_bo; + uint64_t src_offset = si->bo_offset, dst_offset = di->bo_offset; + struct kgd_mem *src_mem = src_bo->mem, *dst_mem = dst_bo->mem; + + *copied = 0; + if (f) + *f = NULL; + if (src_bo->cpuva && dst_bo->cpuva) + return kfd_copy_userptr_bos(si, di, cma_write, size, copied); + + if (src_bo->dev->kgd != dst_bo->dev->kgd) { + pr_err("CMA %d fail. Not same dev\n", cma_write); + return -EINVAL; + } + + err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(src_bo->dev->kgd, src_mem, + src_offset, dst_mem, + dst_offset, size, f, + copied); + + return err; +} + /* Copy single range from source iterator @si to destination iterator @di. * @si will move to next range and @di will move by bytes copied.
* @return : 0 for success or -ve for failure @@ -1829,11 +1992,9 @@ static int kfd_copy_single_range(struct cma_iter *si, struct cma_iter *di, return -EINVAL; } - err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(src_bo->dev->kgd, - src_bo->mem, si->bo_offset, dst_bo->mem, di->bo_offset, - copy_size, &fence, &n); + err = kfd_copy_bos(si, di, cma_write, copy_size, &fence, &n); if (err) { - pr_err("GPU CMA %d failed\n", err); + pr_err("CMA %d failed\n", err); break; } @@ -1942,11 +2103,11 @@ static int kfd_ioctl_cross_memory_copy(struct file *filep, cma_op = "WRITE"; pr_debug("CMA WRITE: local -> remote\n"); err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size, - remote_p, &di); + remote_p, remote_mm, remote_task, &di); if (err) goto kfd_process_fail; err = kfd_cma_iter_init(src_array, args->src_mem_array_size, - local_p, &si); + local_p, current->mm, current, &si); if (err) goto kfd_process_fail; } else { @@ -1954,11 +2115,11 @@ static int kfd_ioctl_cross_memory_copy(struct file *filep, pr_debug("CMA READ: remote -> local\n"); err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size, - local_p, &di); + local_p, current->mm, current, &di); if (err) goto kfd_process_fail; err = kfd_cma_iter_init(src_array, args->src_mem_array_size, - remote_p, &si); + remote_p, remote_mm, remote_task, &si); if (err) goto kfd_process_fail; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index facd9d9..2744154 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -316,6 +316,8 @@ struct cma_iter { /* offset into the entry pointed by cma_iter.array */ unsigned long offset; struct kfd_process *p; + struct mm_struct *mm; + struct task_struct *task; /* current kfd_bo associated with cma_iter.array.va_addr */ struct kfd_bo *cur_bo; /* offset w.r.t cur_bo */ -- 2.7.4