Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5637-drm-amdkfd-CMA-Handle-userptr-to-userptr-BO-copy.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5637-drm-amdkfd-CMA-Handle-userptr-to-userptr-BO-copy.patch | 399 |
1 files changed, 399 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5637-drm-amdkfd-CMA-Handle-userptr-to-userptr-BO-copy.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5637-drm-amdkfd-CMA-Handle-userptr-to-userptr-BO-copy.patch
new file mode 100644
index 00000000..71bba1fa
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5637-drm-amdkfd-CMA-Handle-userptr-to-userptr-BO-copy.patch
@@ -0,0 +1,399 @@
+From de7edd2adbdcbd3a34f3d1df96884b4a59904b29 Mon Sep 17 00:00:00 2001
+From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
+Date: Mon, 26 Mar 2018 16:45:06 -0400
+Subject: [PATCH 5637/5725] drm/amdkfd: CMA: Handle userptr to userptr BO copy
+
+CMA userptr implementations are incomplete because it doesn't properly
+handle if the BO is evicted. This patch handles the case where both
+source and destination BOs are userptr. It is more efficient to use CPU
+to do the copy in this case, very similar to process_vm_read/write()
+functions.
+
+Change-Id: I5d01d906f04190d71e8663785718060411dede4e
+Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
+Signed-off-by: Kalyan Alle <kalyan.alle@amd.com>
+
+Conflicts:
+	drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+---
+ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 273 ++++++++++++++++++++++++-------
+ drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |   2 +
+ mm/gup.c                                 |  11 ++
+ 3 files changed, 230 insertions(+), 56 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+index 91223e2..8941312 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+@@ -35,6 +35,7 @@
+ #include <linux/mman.h>
+ #include <asm/processor.h>
+ #include <linux/ptrace.h>
++#include <linux/pagemap.h>
+ 
+ #include "kfd_priv.h"
+ #include "kfd_device_queue_manager.h"
+@@ -1681,6 +1682,12 @@ static int kfd_ioctl_ipc_import_handle(struct file *filep,
+ }
+ 
+ 
++/* Maximum number of entries for process pages array which lives on stack */
++#define MAX_PP_STACK_COUNT 16
++/* Maximum number of pages kmalloc'd to hold struct page's during copy */
++#define MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
++#define MAX_PP_KMALLOC_COUNT (MAX_KMALLOC_PAGES/sizeof(struct page *))
++
+ /* Update cma_iter.cur_bo with KFD BO that is assocaited with
+  * cma_iter.array.va_addr
+  */
+@@ -1729,7 +1736,8 @@ static int kfd_cma_iter_advance(struct cma_iter *ci, unsigned long size)
+ }
+ 
+ static int kfd_cma_iter_init(struct kfd_memory_range *arr, unsigned long segs,
+-			     struct kfd_process *p, struct cma_iter *ci)
++			     struct kfd_process *p, struct mm_struct *mm,
++			     struct task_struct *task, struct cma_iter *ci)
+ {
+ 	int ret;
+ 	int nr;
+@@ -1742,6 +1750,8 @@ static int kfd_cma_iter_init(struct kfd_memory_range *arr, unsigned long segs,
+ 	ci->nr_segs = segs;
+ 	ci->p = p;
+ 	ci->offset = 0;
++	ci->mm = mm;
++	ci->task = task;
+ 	for (nr = 0; nr < segs; nr++)
+ 		ci->total += arr[nr].size;
+ 
+@@ -1762,6 +1772,159 @@ static bool kfd_cma_iter_end(struct cma_iter *ci)
+ 	return false;
+ }
+ 
++/* Copies @size bytes from si->cur_bo to di->cur_bo BO. The function assumes
++ * both source and dest. BOs are userptr BOs. Both BOs can either belong to
++ * current process or one of the BOs can belong to a differnt
++ * process. @Returns 0 on success, -ve on failure
++ *
++ * @si: Source iter
++ * @di: Dest. iter
++ * @cma_write: Indicates if it is write to remote or read from remote
++ * @size: amount of bytes to be copied
++ * @copied: Return number of bytes actually copied.
++ */
++static int kfd_copy_userptr_bos(struct cma_iter *si, struct cma_iter *di,
++				bool cma_write, uint64_t size,
++				uint64_t *copied)
++{
++	int i, ret = 0, locked;
++	unsigned int nents, nl;
++	unsigned int offset_in_page;
++	struct page *pp_stack[MAX_PP_STACK_COUNT];
++	struct page **process_pages = pp_stack;
++	unsigned long rva, lva = 0, flags = 0;
++	uint64_t copy_size, to_copy = size;
++	struct cma_iter *li, *ri;
++
++	if (cma_write) {
++		ri = di;
++		li = si;
++		flags |= FOLL_WRITE;
++	} else {
++		li = di;
++		ri = si;
++	}
++	/* rva: remote virtual address. Page aligned to start page.
++	 * rva + offset_in_page: Points to remote start address
++	 * lva: local virtual address. Points to the start address.
++	 * nents: computes number of remote pages to request
++	 */
++	offset_in_page = ri->bo_offset & (PAGE_SIZE - 1);
++	rva = (ri->cur_bo->cpuva + ri->bo_offset) & PAGE_MASK;
++	lva = li->cur_bo->cpuva + li->bo_offset;
++
++	nents = (size + offset_in_page + PAGE_SIZE - 1) / PAGE_SIZE;
++
++	copy_size = min_t(uint64_t, size, PAGE_SIZE - offset_in_page);
++	*copied = 0;
++
++	if (nents > MAX_PP_STACK_COUNT) {
++		/* For reliability kmalloc only 2 pages worth */
++		process_pages = kmalloc(min_t(size_t, MAX_KMALLOC_PAGES,
++					      sizeof(struct pages *)*nents),
++					GFP_KERNEL);
++
++		if (!process_pages)
++			return -ENOMEM;
++	}
++
++	while (nents && to_copy) {
++		nl = min_t(unsigned int, MAX_PP_KMALLOC_COUNT, nents);
++		locked = 1;
++		down_read(&ri->mm->mmap_sem);
++		nl = get_user_pages_remote(ri->task, ri->mm, rva, nl,
++					   flags, process_pages, NULL,
++					   &locked);
++		if (locked)
++			up_read(&ri->mm->mmap_sem);
++		if (nl <= 0) {
++			pr_err("CMA: Invalid virtual address 0x%lx\n", rva);
++			ret = -EFAULT;
++			break;
++		}
++
++		for (i = 0; i < nl; i++) {
++			unsigned int n;
++			void *kaddr = kmap_atomic(process_pages[i]);
++
++			if (cma_write) {
++				n = copy_from_user(kaddr+offset_in_page,
++						   (void *)lva, copy_size);
++				set_page_dirty(process_pages[i]);
++			} else {
++				n = copy_to_user((void *)lva,
++						 kaddr+offset_in_page,
++						 copy_size);
++			}
++			kunmap_atomic(kaddr);
++			if (n) {
++				ret = -EFAULT;
++				break;
++			}
++			to_copy -= copy_size;
++			if (!to_copy)
++				break;
++			lva += copy_size;
++			rva += (copy_size + offset_in_page);
++			WARN_ONCE(rva & (PAGE_SIZE - 1),
++				  "CMA: Error in remote VA computation");
++			offset_in_page = 0;
++			copy_size = min_t(uint64_t, to_copy, PAGE_SIZE);
++		}
++
++		for (i = 0; i < nl; i++)
++			put_page(process_pages[i]);
++
++		if (ret)
++			break;
++		nents -= nl;
++	}
++
++	if (process_pages != pp_stack)
++		kfree(process_pages);
++
++	*copied = (size - to_copy);
++	return ret;
++
++}
++
++/* Copies @size bytes from si->cur_bo to di->cur_bo starting at their
++ * respective offset.
++ * @si: Source iter
++ * @di: Dest. iter
++ * @cma_write: Indicates if it is write to remote or read from remote
++ * @size: amount of bytes to be copied
++ * @f: Return the last fence if any
++ * @copied: Return number of bytes actually copied.
++ */
++static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di,
++			int cma_write, uint64_t size,
++			struct dma_fence **f, uint64_t *copied)
++{
++	int err = 0;
++	struct kfd_bo *dst_bo = di->cur_bo, *src_bo = si->cur_bo;
++	uint64_t src_offset = si->bo_offset, dst_offset = di->bo_offset;
++	struct kgd_mem *src_mem = src_bo->mem, *dst_mem = dst_bo->mem;
++
++	*copied = 0;
++	if (f)
++		*f = NULL;
++	if (src_bo->cpuva && dst_bo->cpuva)
++		return kfd_copy_userptr_bos(si, di, cma_write, size, copied);
++
++	if (src_bo->dev->kgd != dst_bo->dev->kgd) {
++		pr_err("CMA %d fail. Not same dev\n", cma_write);
++		err = -EINVAL;
++	}
++
++	err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(src_bo->dev->kgd, src_mem,
++						    src_offset, dst_mem,
++						    dst_offset, size, f,
++						    copied);
++
++	return err;
++}
++
+ /* Copy single range from source iterator @si to destination iterator @di.
+  * @si will move to next range and @di will move by bytes copied.
+  * @return : 0 for success or -ve for failure
+@@ -1772,57 +1935,55 @@ static int kfd_copy_single_range(struct cma_iter *si, struct cma_iter *di,
+ 				 bool cma_write, struct dma_fence **f,
+ 				 uint64_t *copied)
+ {
+-	int err = 0;
+-	uint64_t copy_size, n;
+-	uint64_t size = si->array->size;
+-	struct kfd_bo *src_bo = si->cur_bo;
+-	struct dma_fence *lfence = NULL;
+-
+-	if (!src_bo || !di || !copied)
+-		return -EINVAL;
+-	*copied = 0;
+-	if (f)
+-		*f = NULL;
+-
+-	while (size && !kfd_cma_iter_end(di)) {
+-		struct dma_fence *fence = NULL;
+-		struct kfd_bo *dst_bo = di->cur_bo;
+-
+-		copy_size = min(size, (di->array->size - di->offset));
+-
+-		/* Check both BOs belong to same device */
+-		if (src_bo->dev->kgd != dst_bo->dev->kgd) {
+-			pr_err("CMA fail. Not same dev\n");
+-			return -EINVAL;
+-		}
+-
+-		err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(src_bo->dev->kgd,
+-			src_bo->mem, si->bo_offset, dst_bo->mem, di->bo_offset,
+-			copy_size, &fence, &n);
+-		if (err) {
+-			pr_err("GPU CMA %d failed\n", err);
+-			break;
+-		}
+-
+-		if (fence) {
+-			dma_fence_put(lfence);
+-			lfence = fence;
+-		}
+-		size -= n;
+-		*copied += n;
+-		err = kfd_cma_iter_advance(si, n);
+-		if (err)
+-			break;
+-		err = kfd_cma_iter_advance(di, n);
+-		if (err)
+-			break;
+-	}
+-
+-	if (f)
+-		*f = dma_fence_get(lfence);
+-	dma_fence_put(lfence);
+-
+-	return err;
++	int err = 0;
++	uint64_t copy_size, n;
++	uint64_t size = si->array->size;
++	struct kfd_bo *src_bo = si->cur_bo;
++	struct dma_fence *lfence = NULL;
++
++	if (!src_bo || !di || !copied)
++		return -EINVAL;
++	*copied = 0;
++	if (f)
++		*f = NULL;
++
++	while (size && !kfd_cma_iter_end(di)) {
++		struct dma_fence *fence = NULL;
++		struct kfd_bo *dst_bo = di->cur_bo;
++
++		copy_size = min(size, (di->array->size - di->offset));
++
++		/* Check both BOs belong to same device */
++		if (src_bo->dev->kgd != dst_bo->dev->kgd) {
++			pr_err("CMA fail. Not same dev\n");
++			return -EINVAL;
++		}
++
++		err = kfd_copy_bos(si, di, cma_write, copy_size, &fence, &n);
++		if (err) {
++			pr_err("CMA %d failed\n", err);
++			break;
++		}
++
++		if (fence) {
++			dma_fence_put(lfence);
++			lfence = fence;
++		}
++		size -= n;
++		*copied += n;
++		err = kfd_cma_iter_advance(si, n);
++		if (err)
++			break;
++		err = kfd_cma_iter_advance(di, n);
++		if (err)
++			break;
++	}
++
++	if (f)
++		*f = dma_fence_get(lfence);
++	dma_fence_put(lfence);
++
++	return err;
+ }
+ 
+ static int kfd_ioctl_cross_memory_copy(struct file *filep,
+@@ -1910,22 +2071,22 @@ static int kfd_ioctl_cross_memory_copy(struct file *filep,
+ 		cma_op = "WRITE";
+ 		pr_debug("CMA WRITE: local -> remote\n");
+ 		err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size,
+-					remote_p, &di);
++					remote_p, remote_mm, remote_task, &di);
+ 		if (err)
+ 			goto kfd_process_fail;
+ 		err = kfd_cma_iter_init(src_array, args->src_mem_array_size,
+-					local_p, &si);
++					local_p, current->mm, current, &si);
+ 		if (err)
+ 			goto kfd_process_fail;
+ 	} else {
+ 		cma_op = "READ";
+ 		pr_debug("CMA READ: remote -> local\n");
+ 		err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size,
+-					local_p, &di);
++					local_p, current->mm, current, &di);
+ 		if (err)
+ 			goto kfd_process_fail;
+ 		err = kfd_cma_iter_init(src_array, args->src_mem_array_size,
+-					remote_p, &si);
++					remote_p, remote_mm, remote_task, &si);
+ 		if (err)
+ 			goto kfd_process_fail;
+ 	}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+index 8adfe21..93462fa 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+@@ -316,6 +316,8 @@ struct cma_iter {
+ 	/* offset into the entry pointed by cma_iter.array */
+ 	unsigned long offset;
+ 	struct kfd_process *p;
++	struct mm_struct *mm;
++	struct task_struct *task;
+ 	/* current kfd_bo associated with cma_iter.array.va_addr */
+ 	struct kfd_bo *cur_bo;
+ 	/* offset w.r.t cur_bo */
+diff --git a/mm/gup.c b/mm/gup.c
+index 4cc8a6f..35c9f0b 100644
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1081,6 +1081,17 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+ }
+ EXPORT_SYMBOL(get_user_pages_remote);
+ 
++long get_user_pages_remote_locked(struct task_struct *tsk, struct mm_struct *mm,
++		unsigned long start, unsigned long nr_pages,
++		unsigned int gup_flags, struct page **pages,
++		struct vm_area_struct **vmas, int *locked)
++{
++	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
++				       locked, false,
++				       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
++}
++EXPORT_SYMBOL(get_user_pages_remote_locked);
++
+ /*
+  * This is the same as get_user_pages_remote(), just with a
+  * less-flexible calling convention where we assume that the task
+-- 
+2.7.4
+
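
Side note (not part of the patch above): the commit message compares the new userptr-to-userptr path to process_vm_read/write(). For readers unfamiliar with that interface, below is a minimal userspace sketch of the same idea using process_vm_readv(); the function name read_remote and the buffers are illustrative, and the "remote" process here is the caller itself so no ptrace permission is needed.

/* Minimal userspace illustration of the process_vm_read/write() model that
 * the patch mirrors in-kernel: one local iovec, one remote iovec, and the
 * kernel copies across address spaces on the CPU.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

static ssize_t read_remote(pid_t pid, void *remote_addr, void *local_buf,
			   size_t len)
{
	struct iovec local = { .iov_base = local_buf, .iov_len = len };
	struct iovec remote = { .iov_base = remote_addr, .iov_len = len };

	/* One local segment, one remote segment, no flags. */
	return process_vm_readv(pid, &local, 1, &remote, 1, 0);
}

int main(void)
{
	char src[] = "copied across address spaces";
	char dst[sizeof(src)] = { 0 };

	/* Reading our own address space keeps the example self-contained. */
	if (read_remote(getpid(), src, dst, sizeof(dst)) < 0) {
		perror("process_vm_readv");
		return 1;
	}
	printf("%s\n", dst);
	return 0;
}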