From de7edd2adbdcbd3a34f3d1df96884b4a59904b29 Mon Sep 17 00:00:00 2001
From: Harish Kasiviswanathan
Date: Mon, 26 Mar 2018 16:45:06 -0400
Subject: [PATCH 5637/5725] drm/amdkfd: CMA: Handle userptr to userptr BO copy

The CMA userptr implementation is incomplete because it does not
properly handle the case where the BO is evicted. This patch handles
the case where both the source and destination BOs are userptr. Here it
is more efficient to use the CPU to do the copy, very similar to the
process_vm_readv()/process_vm_writev() functions.

Change-Id: I5d01d906f04190d71e8663785718060411dede4e
Signed-off-by: Harish Kasiviswanathan
Signed-off-by: Kalyan Alle

Conflicts:
	drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 266 ++++++++++++++++++++++++-------
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |   2 +
 mm/gup.c                                 |  11 ++
 3 files changed, 223 insertions(+), 56 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 91223e2..8941312 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "kfd_priv.h"
 #include "kfd_device_queue_manager.h"
 
@@ -1681,6 +1682,12 @@ static int kfd_ioctl_ipc_import_handle(struct file *filep,
 
 }
 
+/* Maximum number of entries for the process-pages array that lives on the stack */
+#define MAX_PP_STACK_COUNT 16
+/* Maximum number of bytes kmalloc'd to hold struct page pointers during copy */
+#define MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
+#define MAX_PP_KMALLOC_COUNT (MAX_KMALLOC_PAGES/sizeof(struct page *))
+
 /* Update cma_iter.cur_bo with KFD BO that is assocaited with
  * cma_iter.array.va_addr
  */
@@ -1729,7 +1736,8 @@ static int kfd_cma_iter_advance(struct cma_iter *ci, unsigned long size)
 }
 
 static int kfd_cma_iter_init(struct kfd_memory_range *arr, unsigned long segs,
-			     struct kfd_process *p, struct cma_iter *ci)
+			     struct kfd_process *p, struct mm_struct *mm,
+			     struct task_struct *task, struct cma_iter *ci)
 {
 	int ret;
 	int nr;
@@ -1742,6 +1750,8 @@ static int kfd_cma_iter_init(struct kfd_memory_range *arr, unsigned long segs,
 	ci->nr_segs = segs;
 	ci->p = p;
 	ci->offset = 0;
+	ci->mm = mm;
+	ci->task = task;
 	for (nr = 0; nr < segs; nr++)
 		ci->total += arr[nr].size;
 
@@ -1762,6 +1772,159 @@ static bool kfd_cma_iter_end(struct cma_iter *ci)
 	return false;
 }
 
+/* Copies @size bytes from si->cur_bo to di->cur_bo BO. The function assumes
+ * both the source and destination BOs are userptr BOs. Both BOs can belong
+ * to the current process, or one of them can belong to a different
+ * process. Returns 0 on success, a negative error code on failure.
+ *
+ * @si: Source iterator
+ * @di: Destination iterator
+ * @cma_write: Indicates a write to the remote process (else a read from it)
+ * @size: number of bytes to be copied
+ * @copied: Returns the number of bytes actually copied.
+ */
+static int kfd_copy_userptr_bos(struct cma_iter *si, struct cma_iter *di,
+				bool cma_write, uint64_t size,
+				uint64_t *copied)
+{
+	int i, ret = 0, locked;
+	unsigned int nents, nl;
+	unsigned int offset_in_page;
+	struct page *pp_stack[MAX_PP_STACK_COUNT];
+	struct page **process_pages = pp_stack;
+	unsigned long rva, lva = 0, flags = 0;
+	uint64_t copy_size, to_copy = size;
+	struct cma_iter *li, *ri;
+
+	if (cma_write) {
+		ri = di;
+		li = si;
+		flags |= FOLL_WRITE;
+	} else {
+		li = di;
+		ri = si;
+	}
+	/* rva: remote virtual address, aligned down to the start page.
+	 * rva + offset_in_page: the remote start address.
+	 * lva: local virtual address, points at the local start address.
+	 * nents: number of remote pages to request.
+	 */
+	offset_in_page = ri->bo_offset & (PAGE_SIZE - 1);
+	rva = (ri->cur_bo->cpuva + ri->bo_offset) & PAGE_MASK;
+	lva = li->cur_bo->cpuva + li->bo_offset;
+
+	nents = (size + offset_in_page + PAGE_SIZE - 1) / PAGE_SIZE;
+
+	copy_size = min_t(uint64_t, size, PAGE_SIZE - offset_in_page);
+	*copied = 0;
+
+	if (nents > MAX_PP_STACK_COUNT) {
+		/* Cap the allocation at two pages' worth of page pointers */
+		process_pages = kmalloc(min_t(size_t, MAX_KMALLOC_PAGES,
+					      sizeof(struct page *) * nents),
+					GFP_KERNEL);
+
+		if (!process_pages)
+			return -ENOMEM;
+	}
+
+	while (nents && to_copy) {
+		nl = min_t(unsigned int, MAX_PP_KMALLOC_COUNT, nents);
+		locked = 1;
+		down_read(&ri->mm->mmap_sem);
+		nl = get_user_pages_remote_locked(ri->task, ri->mm, rva, nl,
+						  flags, process_pages, NULL,
+						  &locked);
+		if (locked)
+			up_read(&ri->mm->mmap_sem);
+		if (nl <= 0) {
+			pr_err("CMA: Invalid virtual address 0x%lx\n", rva);
+			ret = -EFAULT;
+			break;
+		}
+
+		for (i = 0; i < nl; i++) {
+			unsigned int n;
+			void *kaddr = kmap_atomic(process_pages[i]);
+
+			if (cma_write) {
+				n = copy_from_user(kaddr + offset_in_page,
+						   (void __user *)lva, copy_size);
+				set_page_dirty(process_pages[i]);
+			} else {
+				n = copy_to_user((void __user *)lva,
						 kaddr + offset_in_page,
+						 copy_size);
+			}
+			kunmap_atomic(kaddr);
+			if (n) {
+				ret = -EFAULT;
+				break;
+			}
+			to_copy -= copy_size;
+			if (!to_copy)
+				break;
+			lva += copy_size;
+			rva += (copy_size + offset_in_page);
+			WARN_ONCE(rva & (PAGE_SIZE - 1),
+				  "CMA: Error in remote VA computation");
+			offset_in_page = 0;
+			copy_size = min_t(uint64_t, to_copy, PAGE_SIZE);
+		}
+
+		for (i = 0; i < nl; i++)
+			put_page(process_pages[i]);
+
+		if (ret)
+			break;
+		nents -= nl;
+	}
+
+	if (process_pages != pp_stack)
+		kfree(process_pages);
+
+	*copied = (size - to_copy);
+	return ret;
+
+}
+
+/* Copies @size bytes from si->cur_bo to di->cur_bo starting at their
+ * respective offsets.
+ * @si: Source iterator
+ * @di: Destination iterator
+ * @cma_write: Indicates a write to the remote process (else a read from it)
+ * @size: number of bytes to be copied
+ * @f: Returns the last fence, if any
+ * @copied: Returns the number of bytes actually copied.
+ */
+static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di,
+			int cma_write, uint64_t size,
+			struct dma_fence **f, uint64_t *copied)
+{
+	int err = 0;
+	struct kfd_bo *dst_bo = di->cur_bo, *src_bo = si->cur_bo;
+	uint64_t src_offset = si->bo_offset, dst_offset = di->bo_offset;
+	struct kgd_mem *src_mem = src_bo->mem, *dst_mem = dst_bo->mem;
+
+	*copied = 0;
+	if (f)
+		*f = NULL;
+	if (src_bo->cpuva && dst_bo->cpuva)
+		return kfd_copy_userptr_bos(si, di, cma_write, size, copied);
+
+	if (src_bo->dev->kgd != dst_bo->dev->kgd) {
+		pr_err("CMA %d fail. Not same dev\n", cma_write);
+		return -EINVAL;
+	}
+
+	err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(src_bo->dev->kgd, src_mem,
+						    src_offset, dst_mem,
+						    dst_offset, size, f,
+						    copied);
+
+	return err;
+}
+
 /* Copy single range from source iterator @si to destination iterator @di.
  * @si will move to next range and @di will move by bytes copied.
  * @return : 0 for success or -ve for failure
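For context, what kfd_copy_userptr_bos() implements kernel-side is what the commit message compares to process_vm_readv()/process_vm_writev(). A minimal userspace sketch of the READ direction (hypothetical pid, addresses and lengths; error handling elided):

	#define _GNU_SOURCE
	#include <sys/types.h>
	#include <sys/uio.h>	/* process_vm_readv() */

	/* Read `len` bytes from address `raddr` in process `pid` into the
	 * local buffer `buf` -- the userspace analogue of the CMA READ path.
	 * Returns bytes transferred, or -1 with errno set.
	 */
	static ssize_t cma_like_read(pid_t pid, void *buf, void *raddr, size_t len)
	{
		struct iovec local  = { .iov_base = buf,   .iov_len = len };
		struct iovec remote = { .iov_base = raddr, .iov_len = len };

		return process_vm_readv(pid, &local, 1, &remote, 1, 0);
	}

The kernel path above cannot simply do this, of course: it must pin the remote pages itself (get_user_pages_remote_locked) and bounce through kmap_atomic(), because only one of the two processes is current.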
@@ -1772,57 +1935,48 @@ static int kfd_copy_single_range(struct cma_iter *si, struct cma_iter *di,
 				 bool cma_write, struct dma_fence **f,
 				 uint64_t *copied)
 {
-	int err = 0;
-	uint64_t copy_size, n;
-	uint64_t size = si->array->size;
-	struct kfd_bo *src_bo = si->cur_bo;
-	struct dma_fence *lfence = NULL;
-
-	if (!src_bo || !di || !copied)
-		return -EINVAL;
-	*copied = 0;
-	if (f)
-		*f = NULL;
-
-	while (size && !kfd_cma_iter_end(di)) {
-		struct dma_fence *fence = NULL;
-		struct kfd_bo *dst_bo = di->cur_bo;
-
-		copy_size = min(size, (di->array->size - di->offset));
-
-		/* Check both BOs belong to same device */
-		if (src_bo->dev->kgd != dst_bo->dev->kgd) {
-			pr_err("CMA fail. Not same dev\n");
-			return -EINVAL;
-		}
-
-		err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(src_bo->dev->kgd,
-			src_bo->mem, si->bo_offset, dst_bo->mem, di->bo_offset,
-			copy_size, &fence, &n);
-		if (err) {
-			pr_err("GPU CMA %d failed\n", err);
-			break;
-		}
-
-		if (fence) {
-			dma_fence_put(lfence);
-			lfence = fence;
-		}
-		size -= n;
-		*copied += n;
-		err = kfd_cma_iter_advance(si, n);
-		if (err)
-			break;
-		err = kfd_cma_iter_advance(di, n);
-		if (err)
-			break;
-	}
-
-	if (f)
-		*f = dma_fence_get(lfence);
-	dma_fence_put(lfence);
-
-	return err;
+	int err = 0;
+	uint64_t copy_size, n;
+	uint64_t size = si->array->size;
+	struct kfd_bo *src_bo = si->cur_bo;
+	struct dma_fence *lfence = NULL;
+
+	if (!src_bo || !di || !copied)
+		return -EINVAL;
+	*copied = 0;
+	if (f)
+		*f = NULL;
+
+	while (size && !kfd_cma_iter_end(di)) {
+		struct dma_fence *fence = NULL;
+
+		copy_size = min(size, (di->array->size - di->offset));
+
+		err = kfd_copy_bos(si, di, cma_write, copy_size, &fence, &n);
+		if (err) {
+			pr_err("CMA %d failed\n", err);
+			break;
+		}
+
+		if (fence) {
+			dma_fence_put(lfence);
+			lfence = fence;
+		}
+		size -= n;
+		*copied += n;
+		err = kfd_cma_iter_advance(si, n);
+		if (err)
+			break;
+		err = kfd_cma_iter_advance(di, n);
+		if (err)
+			break;
+	}
+
+	if (f)
+		*f = dma_fence_get(lfence);
+	dma_fence_put(lfence);
+
+	return err;
 }
 
 static int kfd_ioctl_cross_memory_copy(struct file *filep,
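Note the fence handling in the rewritten loop above: only the most recent fence is kept (dma_fence_put() drops the previous one), presumably because copies submitted to the same device complete in order, so the last fence covers all of them; userptr-to-userptr CPU copies return no fence at all. A hedged sketch of what a caller then does with the returned fence (kernel context assumed; `f` and `err` are hypothetical locals):

	/* Wait for the final outstanding GPU copy before declaring the
	 * cross-memory copy complete.  dma_fence_wait() returns 0 once the
	 * fence has signaled, or a negative error such as -ERESTARTSYS.
	 */
	if (f) {
		err = dma_fence_wait(f, true);	/* interruptible wait */
		dma_fence_put(f);		/* drop the reference taken above */
	}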
@@ -1910,22 +2064,22 @@ static int kfd_ioctl_cross_memory_copy(struct file *filep,
 		cma_op = "WRITE";
 		pr_debug("CMA WRITE: local -> remote\n");
 		err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size,
-					remote_p, &di);
+					remote_p, remote_mm, remote_task, &di);
 		if (err)
 			goto kfd_process_fail;
 		err = kfd_cma_iter_init(src_array, args->src_mem_array_size,
-					local_p, &si);
+					local_p, current->mm, current, &si);
 		if (err)
 			goto kfd_process_fail;
 	} else {
 		cma_op = "READ";
 		pr_debug("CMA READ: remote -> local\n");
 		err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size,
-					local_p, &di);
+					local_p, current->mm, current, &di);
 		if (err)
 			goto kfd_process_fail;
 		err = kfd_cma_iter_init(src_array, args->src_mem_array_size,
-					remote_p, &si);
+					remote_p, remote_mm, remote_task, &si);
 		if (err)
 			goto kfd_process_fail;
 	}
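As a standalone illustration of the page-chunking arithmetic used by kfd_copy_userptr_bos() above: the first chunk covers only the tail of the first remote page, every subsequent chunk is a full page, and nents is the number of remote pages that must be pinned. A small userspace sketch (assumes 4 KiB pages and a page-aligned cpuva, which is what the kernel code's use of PAGE_MASK relies on; all values hypothetical):

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096UL
	#define PAGE_MASK (~(PAGE_SIZE - 1))

	int main(void)
	{
		uint64_t cpuva = 0x7f2a00010000ULL;	/* page-aligned BO start */
		uint64_t bo_offset = 0x300;		/* byte offset into the BO */
		uint64_t size = 10000;			/* bytes to copy */

		unsigned long offset_in_page = bo_offset & (PAGE_SIZE - 1);
		unsigned long rva = (cpuva + bo_offset) & PAGE_MASK;
		unsigned int nents =
			(size + offset_in_page + PAGE_SIZE - 1) / PAGE_SIZE;
		uint64_t first_chunk = size < PAGE_SIZE - offset_in_page ?
				size : PAGE_SIZE - offset_in_page;

		/* 0x300 bytes into the first page: the first chunk is
		 * 4096 - 768 = 3328 bytes, and 10000 bytes starting there
		 * span 3 remote pages.
		 */
		printf("rva=0x%lx offset=%lu nents=%u first_chunk=%llu\n",
		       rva, offset_in_page, nents,
		       (unsigned long long)first_chunk);
		return 0;
	}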
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 8adfe21..93462fa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -316,6 +316,8 @@ struct cma_iter {
 	/* offset into the entry pointed by cma_iter.array */
 	unsigned long offset;
 	struct kfd_process *p;
+	struct mm_struct *mm;
+	struct task_struct *task;
 	/* current kfd_bo associated with cma_iter.array.va_addr */
 	struct kfd_bo *cur_bo;
 	/* offset w.r.t cur_bo */
diff --git a/mm/gup.c b/mm/gup.c
index 4cc8a6f..35c9f0b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1081,6 +1081,17 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 }
 EXPORT_SYMBOL(get_user_pages_remote);
 
+long get_user_pages_remote_locked(struct task_struct *tsk, struct mm_struct *mm,
+				  unsigned long start, unsigned long nr_pages,
+				  unsigned int gup_flags, struct page **pages,
+				  struct vm_area_struct **vmas, int *locked)
+{
+	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+				       locked, false,
+				       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
+}
+EXPORT_SYMBOL(get_user_pages_remote_locked);
+
 /*
  * This is the same as get_user_pages_remote(), just with a
  * less-flexible calling convention where we assume that the task
-- 
2.7.4
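A closing note on the new mm/gup.c export: __get_user_pages_locked() may drop mmap_sem while faulting pages in and reports that by clearing *locked, which is why kfd_copy_userptr_bos() above calls up_read() only when `locked` is still set. A minimal caller sketch under the same assumptions (`task`, `mm`, `start`, `nr` and `pages` are hypothetical):

	int locked = 1;
	long pinned;

	down_read(&mm->mmap_sem);
	pinned = get_user_pages_remote_locked(task, mm, start, nr,
					      FOLL_WRITE, pages, NULL,
					      &locked);
	/* GUP may have dropped mmap_sem itself; unlock only if still held */
	if (locked)
		up_read(&mm->mmap_sem);

	/* use pages[0..pinned-1], then drop the pin references */
	for (; pinned > 0; pinned--)
		put_page(pages[pinned - 1]);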