Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5637-drm-amdkfd-CMA-Handle-userptr-to-userptr-BO-copy.patch')
-rw-r--r-- meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5637-drm-amdkfd-CMA-Handle-userptr-to-userptr-BO-copy.patch | 399
1 file changed, 399 insertions(+), 0 deletions(-)
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5637-drm-amdkfd-CMA-Handle-userptr-to-userptr-BO-copy.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5637-drm-amdkfd-CMA-Handle-userptr-to-userptr-BO-copy.patch
new file mode 100644
index 00000000..71bba1fa
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5637-drm-amdkfd-CMA-Handle-userptr-to-userptr-BO-copy.patch
@@ -0,0 +1,399 @@
+From de7edd2adbdcbd3a34f3d1df96884b4a59904b29 Mon Sep 17 00:00:00 2001
+From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
+Date: Mon, 26 Mar 2018 16:45:06 -0400
+Subject: [PATCH 5637/5725] drm/amdkfd: CMA: Handle userptr to userptr BO copy
+
+The CMA userptr implementation is incomplete because it doesn't properly
+handle the case where the BO is evicted. This patch handles the case
+where both the source and destination BOs are userptr. It is more
+efficient to use the CPU to do the copy in this case, very similar to
+the process_vm_read/write() functions.
+
+Change-Id: I5d01d906f04190d71e8663785718060411dede4e
+Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
+Signed-off-by: Kalyan Alle <kalyan.alle@amd.com>
+
+Conflicts:
+ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+---
+ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 273 ++++++++++++++++++++++++-------
+ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +
+ mm/gup.c | 11 ++
+ 3 files changed, 230 insertions(+), 56 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+index 91223e2..8941312 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+@@ -35,6 +35,7 @@
+ #include <linux/mman.h>
+ #include <asm/processor.h>
+ #include <linux/ptrace.h>
++#include <linux/pagemap.h>
+
+ #include "kfd_priv.h"
+ #include "kfd_device_queue_manager.h"
+@@ -1681,6 +1682,12 @@ static int kfd_ioctl_ipc_import_handle(struct file *filep,
+ }
+
+
++/* Maximum number of entries for process pages array which lives on stack */
++#define MAX_PP_STACK_COUNT 16
++/* Maximum size, in bytes, of the kmalloc'd buffer holding struct page pointers */
++#define MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
++#define MAX_PP_KMALLOC_COUNT (MAX_KMALLOC_PAGES/sizeof(struct page *))
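++/* Illustration, assuming 4 KiB pages and 8-byte pointers:
++ * MAX_KMALLOC_PAGES = 8192, so MAX_PP_KMALLOC_COUNT = 8192 / 8 = 1024 page
++ * pointers per batch, i.e. up to 4 MiB of remote memory pinned at a time.
++ */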
++
+ /* Update cma_iter.cur_bo with KFD BO that is associated with
+ * cma_iter.array.va_addr
+ */
+@@ -1729,7 +1736,8 @@ static int kfd_cma_iter_advance(struct cma_iter *ci, unsigned long size)
+ }
+
+ static int kfd_cma_iter_init(struct kfd_memory_range *arr, unsigned long segs,
+- struct kfd_process *p, struct cma_iter *ci)
++ struct kfd_process *p, struct mm_struct *mm,
++ struct task_struct *task, struct cma_iter *ci)
+ {
+ int ret;
+ int nr;
+@@ -1742,6 +1750,8 @@ static int kfd_cma_iter_init(struct kfd_memory_range *arr, unsigned long segs,
+ ci->nr_segs = segs;
+ ci->p = p;
+ ci->offset = 0;
++ ci->mm = mm;
++ ci->task = task;
+ for (nr = 0; nr < segs; nr++)
+ ci->total += arr[nr].size;
+
+@@ -1762,6 +1772,159 @@ static bool kfd_cma_iter_end(struct cma_iter *ci)
+ return false;
+ }
+
++/* Copies @size bytes from si->cur_bo to di->cur_bo. The function assumes
++ * both source and destination BOs are userptr BOs. Both BOs can belong to
++ * the current process, or one of them can belong to a different process.
++ * Returns 0 on success, negative error code on failure.
++ *
++ * @si: Source iter
++ * @di: Dest. iter
++ * @cma_write: Indicates if it is write to remote or read from remote
++ * @size: amount of bytes to be copied
++ * @copied: Return number of bytes actually copied.
++ */
++static int kfd_copy_userptr_bos(struct cma_iter *si, struct cma_iter *di,
++ bool cma_write, uint64_t size,
++ uint64_t *copied)
++{
++ int i, ret = 0, locked;
++ unsigned int nents, nl;
++ unsigned int offset_in_page;
++ struct page *pp_stack[MAX_PP_STACK_COUNT];
++ struct page **process_pages = pp_stack;
++ unsigned long rva, lva = 0, flags = 0;
++ uint64_t copy_size, to_copy = size;
++ struct cma_iter *li, *ri;
++
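++	/* Pick which iterator is the remote side (accessed by pinning its
++	 * pages) and which is the local side (accessed through its own CPU
++	 * mapping); a write goes local -> remote, a read remote -> local.
++	 */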
++ if (cma_write) {
++ ri = di;
++ li = si;
++ flags |= FOLL_WRITE;
++ } else {
++ li = di;
++ ri = si;
++ }
++ /* rva: remote virtual address. Page aligned to start page.
++ * rva + offset_in_page: Points to remote start address
++ * lva: local virtual address. Points to the start address.
++ * nents: computes number of remote pages to request
++ */
++ offset_in_page = ri->bo_offset & (PAGE_SIZE - 1);
++ rva = (ri->cur_bo->cpuva + ri->bo_offset) & PAGE_MASK;
++ lva = li->cur_bo->cpuva + li->bo_offset;
++
++ nents = (size + offset_in_page + PAGE_SIZE - 1) / PAGE_SIZE;
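++	/* Example: size = 8192 with offset_in_page = 100 spans
++	 * (8192 + 100 + 4095) / 4096 = 3 remote pages on a 4 KiB-page system.
++	 */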
++
++ copy_size = min_t(uint64_t, size, PAGE_SIZE - offset_in_page);
++ *copied = 0;
++
++ if (nents > MAX_PP_STACK_COUNT) {
++		/* For reliability, kmalloc at most 2 pages worth of pointers */
++		process_pages = kmalloc(min_t(size_t, MAX_KMALLOC_PAGES,
++				       sizeof(struct page *)*nents),
++				       GFP_KERNEL);
++
++ if (!process_pages)
++ return -ENOMEM;
++ }
++
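++	/* Pin up to MAX_PP_KMALLOC_COUNT remote pages per pass with
++	 * get_user_pages_remote(), copy each through a temporary kernel
++	 * mapping, then drop the page references. If GUP itself dropped
++	 * mmap_sem (locked == 0), it must not be released again here.
++	 */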
++ while (nents && to_copy) {
++ nl = min_t(unsigned int, MAX_PP_KMALLOC_COUNT, nents);
++ locked = 1;
++ down_read(&ri->mm->mmap_sem);
++ nl = get_user_pages_remote(ri->task, ri->mm, rva, nl,
++ flags, process_pages, NULL,
++ &locked);
++ if (locked)
++ up_read(&ri->mm->mmap_sem);
++ if (nl <= 0) {
++ pr_err("CMA: Invalid virtual address 0x%lx\n", rva);
++ ret = -EFAULT;
++ break;
++ }
++
++ for (i = 0; i < nl; i++) {
++ unsigned int n;
++ void *kaddr = kmap_atomic(process_pages[i]);
++
++ if (cma_write) {
++ n = copy_from_user(kaddr+offset_in_page,
++ (void *)lva, copy_size);
++ set_page_dirty(process_pages[i]);
++ } else {
++ n = copy_to_user((void *)lva,
++ kaddr+offset_in_page,
++ copy_size);
++ }
++ kunmap_atomic(kaddr);
++ if (n) {
++ ret = -EFAULT;
++ break;
++ }
++ to_copy -= copy_size;
++ if (!to_copy)
++ break;
++ lva += copy_size;
++ rva += (copy_size + offset_in_page);
++ WARN_ONCE(rva & (PAGE_SIZE - 1),
++ "CMA: Error in remote VA computation");
++ offset_in_page = 0;
++ copy_size = min_t(uint64_t, to_copy, PAGE_SIZE);
++ }
++
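++		/* Drop the references taken by get_user_pages_remote() */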
++ for (i = 0; i < nl; i++)
++ put_page(process_pages[i]);
++
++ if (ret)
++ break;
++ nents -= nl;
++ }
++
++ if (process_pages != pp_stack)
++ kfree(process_pages);
++
++ *copied = (size - to_copy);
++	return ret;
++}
++
++/* Copies @size bytes from si->cur_bo to di->cur_bo starting at their
++ * respective offsets.
++ * @si: Source iter
++ * @di: Dest. iter
++ * @cma_write: Indicates if it is write to remote or read from remote
++ * @size: amount of bytes to be copied
++ * @f: Return the last fence if any
++ * @copied: Return number of bytes actually copied.
++ */
++static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di,
++ int cma_write, uint64_t size,
++ struct dma_fence **f, uint64_t *copied)
++{
++ int err = 0;
++ struct kfd_bo *dst_bo = di->cur_bo, *src_bo = si->cur_bo;
++ uint64_t src_offset = si->bo_offset, dst_offset = di->bo_offset;
++ struct kgd_mem *src_mem = src_bo->mem, *dst_mem = dst_bo->mem;
++
++ *copied = 0;
++ if (f)
++ *f = NULL;
++ if (src_bo->cpuva && dst_bo->cpuva)
++ return kfd_copy_userptr_bos(si, di, cma_write, size, copied);
++
++ if (src_bo->dev->kgd != dst_bo->dev->kgd) {
++ pr_err("CMA %d fail. Not same dev\n", cma_write);
++		return -EINVAL;
++ }
++
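++	/* Same device: use the kgd copy_mem_to_mem() path to move the data */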
++ err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(src_bo->dev->kgd, src_mem,
++ src_offset, dst_mem,
++ dst_offset, size, f,
++ copied);
++
++ return err;
++}
++
+ /* Copy single range from source iterator @si to destination iterator @di.
+ * @si will move to next range and @di will move by bytes copied.
+ * @return : 0 for success or -ve for failure
+@@ -1772,57 +1935,55 @@ static int kfd_copy_single_range(struct cma_iter *si, struct cma_iter *di,
+ bool cma_write, struct dma_fence **f,
+ uint64_t *copied)
+ {
+- int err = 0;
+- uint64_t copy_size, n;
+- uint64_t size = si->array->size;
+- struct kfd_bo *src_bo = si->cur_bo;
+- struct dma_fence *lfence = NULL;
+-
+- if (!src_bo || !di || !copied)
+- return -EINVAL;
+- *copied = 0;
+- if (f)
+- *f = NULL;
+-
+- while (size && !kfd_cma_iter_end(di)) {
+- struct dma_fence *fence = NULL;
+- struct kfd_bo *dst_bo = di->cur_bo;
+-
+- copy_size = min(size, (di->array->size - di->offset));
+-
+- /* Check both BOs belong to same device */
+- if (src_bo->dev->kgd != dst_bo->dev->kgd) {
+- pr_err("CMA fail. Not same dev\n");
+- return -EINVAL;
+- }
+-
+- err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(src_bo->dev->kgd,
+- src_bo->mem, si->bo_offset, dst_bo->mem, di->bo_offset,
+- copy_size, &fence, &n);
+- if (err) {
+- pr_err("GPU CMA %d failed\n", err);
+- break;
+- }
+-
+- if (fence) {
+- dma_fence_put(lfence);
+- lfence = fence;
+- }
+- size -= n;
+- *copied += n;
+- err = kfd_cma_iter_advance(si, n);
+- if (err)
+- break;
+- err = kfd_cma_iter_advance(di, n);
+- if (err)
+- break;
+- }
+-
+- if (f)
+- *f = dma_fence_get(lfence);
+- dma_fence_put(lfence);
+-
+- return err;
++ int err = 0;
++ uint64_t copy_size, n;
++ uint64_t size = si->array->size;
++ struct kfd_bo *src_bo = si->cur_bo;
++ struct dma_fence *lfence = NULL;
++
++ if (!src_bo || !di || !copied)
++ return -EINVAL;
++ *copied = 0;
++ if (f)
++ *f = NULL;
++
++ while (size && !kfd_cma_iter_end(di)) {
++ struct dma_fence *fence = NULL;
++ struct kfd_bo *dst_bo = di->cur_bo;
++
++ copy_size = min(size, (di->array->size - di->offset));
++
++ /* Check both BOs belong to same device */
++ if (src_bo->dev->kgd != dst_bo->dev->kgd) {
++ pr_err("CMA fail. Not same dev\n");
++ return -EINVAL;
++ }
++
++ err = kfd_copy_bos(si, di, cma_write, copy_size, &fence, &n);
++ if (err) {
++ pr_err("CMA %d failed\n", err);
++ break;
++ }
++
++ if (fence) {
++ dma_fence_put(lfence);
++ lfence = fence;
++ }
++ size -= n;
++ *copied += n;
++ err = kfd_cma_iter_advance(si, n);
++ if (err)
++ break;
++ err = kfd_cma_iter_advance(di, n);
++ if (err)
++ break;
++ }
++
++ if (f)
++ *f = dma_fence_get(lfence);
++ dma_fence_put(lfence);
++
++ return err;
+ }
+
+ static int kfd_ioctl_cross_memory_copy(struct file *filep,
+@@ -1910,22 +2071,22 @@ static int kfd_ioctl_cross_memory_copy(struct file *filep,
+ cma_op = "WRITE";
+ pr_debug("CMA WRITE: local -> remote\n");
+ err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size,
+- remote_p, &di);
++ remote_p, remote_mm, remote_task, &di);
+ if (err)
+ goto kfd_process_fail;
+ err = kfd_cma_iter_init(src_array, args->src_mem_array_size,
+- local_p, &si);
++ local_p, current->mm, current, &si);
+ if (err)
+ goto kfd_process_fail;
+ } else {
+ cma_op = "READ";
+ pr_debug("CMA READ: remote -> local\n");
+ err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size,
+- local_p, &di);
++ local_p, current->mm, current, &di);
+ if (err)
+ goto kfd_process_fail;
+ err = kfd_cma_iter_init(src_array, args->src_mem_array_size,
+- remote_p, &si);
++ remote_p, remote_mm, remote_task, &si);
+ if (err)
+ goto kfd_process_fail;
+ }
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+index 8adfe21..93462fa 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+@@ -316,6 +316,8 @@ struct cma_iter {
+ /* offset into the entry pointed by cma_iter.array */
+ unsigned long offset;
+ struct kfd_process *p;
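++	/* mm and task of the process that owns the BOs being iterated */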
++ struct mm_struct *mm;
++ struct task_struct *task;
+ /* current kfd_bo associated with cma_iter.array.va_addr */
+ struct kfd_bo *cur_bo;
+ /* offset w.r.t cur_bo */
+diff --git a/mm/gup.c b/mm/gup.c
+index 4cc8a6f..35c9f0b 100644
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1081,6 +1081,17 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+ }
+ EXPORT_SYMBOL(get_user_pages_remote);
+
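++/*
++ * Same calling convention as get_user_pages_remote(), with @locked letting
++ * the caller observe whether mmap_sem was dropped during the call.
++ */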
++long get_user_pages_remote_locked(struct task_struct *tsk, struct mm_struct *mm,
++ unsigned long start, unsigned long nr_pages,
++ unsigned int gup_flags, struct page **pages,
++ struct vm_area_struct **vmas, int *locked)
++{
++ return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
++ locked, false,
++ gup_flags | FOLL_TOUCH | FOLL_REMOTE);
++}
++EXPORT_SYMBOL(get_user_pages_remote_locked);
++
+ /*
+ * This is the same as get_user_pages_remote(), just with a
+ * less-flexible calling convention where we assume that the task
+--
+2.7.4
+
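The commit message above likens this kernel-side copy to the
process_vm_read/write() syscalls, which follow the same pattern from
userspace: the kernel pins the remote process's pages and copies through a
temporary kernel mapping, with no shared mapping or GPU involvement. For
reference, below is a minimal, self-contained sketch of process_vm_readv(2);
nothing in it is amdkfd-specific, and for simplicity the "remote" process is
the caller itself.

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/uio.h>
	#include <unistd.h>

	int main(void)
	{
		char src[] = "copied across address spaces";
		char dst[sizeof(src)] = { 0 };

		/* Each side is described by iovecs, much like the
		 * kfd_memory_range arrays passed to the CMA ioctl above. */
		struct iovec local  = { .iov_base = dst, .iov_len = sizeof(dst) };
		struct iovec remote = { .iov_base = src, .iov_len = sizeof(src) };

		/* The pid would normally name another process; the kernel pins
		 * the remote pages and copies them through a temporary kernel
		 * mapping -- the same pattern kfd_copy_userptr_bos() implements
		 * for userptr BOs. */
		ssize_t n = process_vm_readv(getpid(), &local, 1, &remote, 1, 0);
		if (n < 0) {
			perror("process_vm_readv");
			return 1;
		}
		printf("read %zd bytes: %s\n", n, dst);
		return 0;
	}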