Diffstat (limited to 'meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1640-drm-amdgpu-New-HSA-MMU-notifiers-to-work-under-memor.patch')
-rw-r--r--  meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1640-drm-amdgpu-New-HSA-MMU-notifiers-to-work-under-memor.patch | 1434
1 file changed, 1434 insertions, 0 deletions
diff --git a/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1640-drm-amdgpu-New-HSA-MMU-notifiers-to-work-under-memor.patch b/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1640-drm-amdgpu-New-HSA-MMU-notifiers-to-work-under-memor.patch
new file mode 100644
index 00000000..faa756f6
--- /dev/null
+++ b/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1640-drm-amdgpu-New-HSA-MMU-notifiers-to-work-under-memor.patch
@@ -0,0 +1,1434 @@
+From 8683beb2f68e2116a28f5a922ec9d7eb81c35faa Mon Sep 17 00:00:00 2001
+From: Felix Kuehling <Felix.Kuehling@amd.com>
+Date: Tue, 21 Mar 2017 11:55:47 -0400
+Subject: [PATCH 1640/4131] drm/amdgpu: New HSA MMU notifiers to work under
+ memory pressure
+
+MMU notifiers can be invoked in reclaim-fs context under memory
+pressure. Any locks held in this case are subject to limitations to
+prevent potential deadlocks: memory allocations performed while
+holding those locks in any context must not cause memory reclaims.
+
+Particularly reservation objects are problematic in this respect,
+because they are locked (reserved) in too many places, potentially
+even other drivers, to enforce such limitations.
+
+This commit rewrites the HSA MMU notifier to avoid locking reservation
+objects. To allow this, the MMU notifier cannot take the current
+mapping state of a BO into account. MMU notifiers on HSA userptr
+BOs cause evictions on all GPUs. The only locks taken by the new
+MMU notifiers are the rmn->lock and the KFD DQM lock.
+
+Restore is still done by a delayed worker, but handles multiple BOs
+in a single per-process worker, rather than per-BO workers.
+
+Change-Id: I011fd11c95747caa117a592845ba0a85dc6e77be
+Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c       | 170 -----
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h       |  27 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 843 +++++++++++++----------
+ drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c           |   7 +-
+ 4 files changed, 480 insertions(+), 567 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+index 3a1776b..f27d67bc 100755
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+@@ -202,176 +202,6 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev)
+ 	return r;
+ }
+ 
+-/* Cancel any scheduled restore work or wait for it to finish. Must be
+- * called with the mem->lock held. First drop the mm reference. If the
+- * worker has already started, it will detect that mm was dropped and
+- * cancel itself.
+- *
+- * If the worker has already started, it needs to take the
+- * mem->lock. To prevent deadlocks, we need to briefly drop the lock
+- * while waiting. During that time someone else may schedule another
+- * restore. So repeat the process if necessary.
+- *
+- * mmput needs to be called without holding the lock to prevent
+- * circular lock dependencies.
+- */
+-static void cancel_restore_locked(struct kgd_mem *mem)
+-{
+-	struct mm_struct *mm;
+-
+-	while (mem->mm) {
+-		/* update_user_pages needs to drop the lock
+-		 * briefly. Therefore holding the lock is no guarantee
+-		 * that no restore is in progress
+-		 */
+-		if (mem->busy) {
+-			mutex_unlock(&mem->lock);
+-			schedule_timeout_uninterruptible(1);
+-			mutex_lock(&mem->lock);
+-			continue;
+-		}
+-
+-		mm = mem->mm;
+-		mem->mm = NULL;
+-
+-		mutex_unlock(&mem->lock);
+-
+-		mmput(mm);
+-		cancel_delayed_work_sync(&mem->work);
+-
+-		mutex_lock(&mem->lock);
+-	}
+-}
+-
+-void amdgpu_amdkfd_cancel_restore_mem(struct kgd_mem *mem)
+-{
+-	mutex_lock(&mem->lock);
+-	cancel_restore_locked(mem);
+-	mutex_unlock(&mem->lock);
+-}
+-
+-int amdgpu_amdkfd_evict_mem(struct amdgpu_device *adev, struct kgd_mem *mem,
+-			    struct mm_struct *mm)
+-{
+-	int r;
+-
+-	if (!adev->kfd)
+-		return -ENODEV;
+-
+-	mutex_lock(&mem->lock);
+-
+-	if (++mem->evicted > 1) {
+-		/* Memory was already evicted. It may have been
+-		 * scheduled for restoration, but that restoration
+-		 * hasn't happened yet. When the worker starts it will
+-		 * know and abort.
+-		 */
+-		mutex_unlock(&mem->lock);
+-		return 0;
+-	}
+-
+-	r = amdgpu_amdkfd_gpuvm_evict_mem(mem, mm);
+-
+-	if (r != 0)
+-		/* First eviction failed, setting count back to 0 will
+-		 * make the corresponding restore fail gracefully */
+-		mem->evicted = 0;
+-	else
+-		/* First eviction counts as 2. Eviction counter == 1
+-		 * means that restoration is scheduled. */
+-		mem->evicted = 2;
+-
+-	mutex_unlock(&mem->lock);
+-
+-	return r;
+-}
+-
+-static void amdgdu_amdkfd_restore_mem_worker(struct work_struct *work)
+-{
+-	struct delayed_work *dwork = to_delayed_work(work);
+-	struct kgd_mem *mem = container_of(dwork, struct kgd_mem, work);
+-	struct amdgpu_device *adev;
+-	struct mm_struct *mm;
+-	int ret = 0;
+-
+-	mutex_lock(&mem->lock);
+-
+-	adev = amdgpu_ttm_adev(mem->bo->tbo.bdev);
+-	mm = mem->mm;
+-
+-	/* Check if restore was canceled */
+-	if (!mm) {
+-		mutex_unlock(&mem->lock);
+-		return;
+-	}
+-
+-	/* Only restore if no other eviction happened since restore
+-	 * was scheduled.
+-	 */
+-	if (mem->evicted == 1) {
+-		ret = amdgpu_amdkfd_gpuvm_restore_mem(mem, mm);
+-		if (ret != -EBUSY && ret != -EDEADLK)
+-			mem->evicted = 0;
+-	}
+-
+-	/* If restore failed due to the VM being updated concurrently,
+-	 * reschedule restore again in a jiffie
+-	 */
+-	if (ret == -EDEADLK && mem->evicted == 1) {
+-		pr_err("Rescheduling restore\n");
+-		mm = NULL;
+-		schedule_delayed_work(&mem->work, 1);
+-	} else {
+-		BUG_ON(mem->mm != mm);
+-		mem->mm = NULL;
+-	}
+-
+-	mutex_unlock(&mem->lock);
+-
+-	if (mm)
+-		mmput(mm);
+-}
+-
+-int amdgpu_amdkfd_schedule_restore_mem(struct amdgpu_device *adev,
+-				       struct kgd_mem *mem,
+-				       struct mm_struct *mm,
+-				       unsigned long delay)
+-{
+-	int r = 0;
+-
+-	if (!adev->kfd)
+-		return -ENODEV;
+-
+-	mutex_lock(&mem->lock);
+-
+-	if (mem->evicted <= 1) {
+-		/* Buffer is not evicted (== 0) or its restoration is
+-		 * already scheduled (== 1) */
+-		pr_err("Unbalanced restore of evicted buffer %p\n", mem);
+-		mutex_unlock(&mem->lock);
+-		return -EFAULT;
+-	} else if (--mem->evicted > 1) {
+-		mutex_unlock(&mem->lock);
+-		return 0;
+-	}
+-
+-	/* mem->evicted is 1 after decrementing. If a restoration was
+-	 * already scheduled, just let it do its job. Otherwise
+-	 * schedule another one.
+-	 */
+-	if (!mem->mm) {
+-		mem->mm = mm;
+-		atomic_inc(&mm->mm_users);
+-		INIT_DELAYED_WORK(&mem->work,
+-				  amdgdu_amdkfd_restore_mem_worker);
+-		schedule_delayed_work(&mem->work, delay);
+-	}
+-
+-	mutex_unlock(&mem->lock);
+-
+-	return r;
+-}
+-
+ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
+ 			    uint32_t vmid, uint64_t gpu_addr,
+ 			    uint32_t *ib_cmd, uint32_t ib_len)
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+index a6a538ac..9e89aee 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+@@ -57,17 +57,18 @@ struct kgd_mem {
+ 	unsigned int mapped_to_gpu_memory;
+ 	void *kptr;
+ 	uint64_t va;
+-	unsigned int evicted; /* eviction counter */
+-	struct delayed_work work; /* for restore evicted mem */
+-	struct mm_struct *mm; /* for restore */
+ 
+ 	uint32_t mapping_flags;
+ 
++	atomic_t invalid;
++	struct amdkfd_process_info *process_info;
++	struct page **user_pages;
++
++
+ 	/* flags bitfield */
+ 	bool coherent : 1;
+ 	bool no_substitute : 1;
+ 	bool aql_queue : 1;
+-	bool busy : 1;
+ };
+ 
+ 
+@@ -89,6 +90,9 @@ struct amdkfd_process_info {
+ 	struct list_head vm_list_head;
+ 	/* List head for all KFD BOs that belong to a KFD process. */
+ 	struct list_head kfd_bo_list;
++	/* List of userptr BOs that are valid or invalid */
++	struct list_head userptr_valid_list;
++	struct list_head userptr_inval_list;
+ 	/* Lock to protect kfd_bo_list */
+ 	struct mutex lock;
+ 
+@@ -96,6 +100,11 @@ struct amdkfd_process_info {
+ 	unsigned int n_vms;
+ 	/* Eviction Fence */
+ 	struct amdgpu_amdkfd_fence *eviction_fence;
++
++	/* MMU-notifier related fields */
++	atomic_t evicted_bos;
++	struct delayed_work work;
++	struct pid *pid;
+ };
+ 
+ /* struct amdkfd_vm -
+@@ -130,13 +139,9 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
+ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev);
+ void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev);
+ 
+-int amdgpu_amdkfd_evict_mem(struct amdgpu_device *adev, struct kgd_mem *mem,
+-			    struct mm_struct *mm);
+-int amdgpu_amdkfd_schedule_restore_mem(struct amdgpu_device *adev,
+-				       struct kgd_mem *mem,
+-				       struct mm_struct *mm,
+-				       unsigned long delay);
+-void amdgpu_amdkfd_cancel_restore_mem(struct kgd_mem *mem);
++int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm);
++int amdgpu_amdkfd_schedule_restore_userptr(struct kgd_mem *mem,
++					   unsigned long delay);
+ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
+ 			    uint32_t vmid, uint64_t gpu_addr,
+ 			    uint32_t *ib_cmd, uint32_t ib_len);
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+index 29ca428..443348e 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+@@ -74,6 +74,8 @@ static const char * const domain_bit_to_string[] = {
+ 
+ #define domain_string(domain) domain_bit_to_string[ffs(domain)-1]
+ 
++static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work);
++
+ 
+ static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
+ {
+@@ -389,44 +391,28 @@ static void remove_bo_from_vm(struct amdgpu_device *adev,
+ static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
+ 				     bool wait)
+ {
+-	int ret = 0;
++	int ret;
+ 
+-	if (!amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
+-		amdgpu_ttm_placement_from_domain(bo, domain);
++	if (WARN(amdgpu_ttm_tt_get_usermm(bo->tbo.ttm),
++		 "Called with userptr BO"))
++		return -EINVAL;
+ 
+-		ret = ttm_bo_validate(&bo->tbo, &bo->placement,
+-				      false, false);
++	amdgpu_ttm_placement_from_domain(bo, domain);
++
++	ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false);
++	if (ret)
++		goto validate_fail;
++	if (wait) {
++		struct amdgpu_amdkfd_fence **ef_list;
++		unsigned int ef_count;
++
++		ret = amdgpu_amdkfd_remove_eviction_fence(bo, NULL, &ef_list,
++							  &ef_count);
+ 		if (ret)
+ 			goto validate_fail;
+-		if (wait) {
+-			struct amdgpu_amdkfd_fence **ef_list;
+-			unsigned int ef_count;
+-
+-			ret = amdgpu_amdkfd_remove_eviction_fence(bo, NULL,
+-								  &ef_list,
+-								  &ef_count);
+-			if (ret)
+-				goto validate_fail;
+-
+-			ttm_bo_wait(&bo->tbo, false, false);
+-			amdgpu_amdkfd_add_eviction_fence(bo, ef_list,
+-							 ef_count);
+-		}
+-	} else {
+-		amdgpu_ttm_placement_from_domain(bo, domain);
+-		ret = ttm_bo_validate(&bo->tbo, &bo->placement,
+-				      true, false);
+-		if (ret) {
+-			/* Don't leak the pages. If validate failed,
+-			 * the pages aren't bound, and won't be
+-			 * released by unbind later on
+-			 */
+-			release_pages(bo->tbo.ttm->pages,
+-				      bo->tbo.ttm->num_pages, 0);
+-			goto validate_fail;
+-		}
+-		if (wait)
+-			ttm_bo_wait(&bo->tbo, false, false);
++
++		ttm_bo_wait(&bo->tbo, false, false);
++		amdgpu_amdkfd_add_eviction_fence(bo, ef_list, ef_count);
+ 	}
+ 
+ validate_fail:
+@@ -440,19 +426,6 @@ static int amdgpu_amdkfd_validate(void *param, struct amdgpu_bo *bo)
+ 	return amdgpu_amdkfd_bo_validate(bo, p->domain, p->wait);
+ }
+ 
+-static int amdgpu_amdkfd_bo_invalidate(struct amdgpu_bo *bo)
+-{
+-	int ret = 0;
+-
+-	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
+-		amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
+-		ret = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
+-		if (ret != 0)
+-			pr_err("Failed to invalidate userptr BO\n");
+-	}
+-	return ret;
+-}
+-
+ static int validate_pt_pd_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm)
+ {
+ 	struct amdgpu_bo *pd = vm->root.bo;
+@@ -479,7 +452,8 @@ static int validate_pt_pd_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm)
+ }
+ 
+ static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
+-				struct amdkfd_process_info *process_info)
++				struct amdkfd_process_info *process_info,
++				bool userptr)
+ {
+ 	struct ttm_validate_buffer *entry = &mem->validate_list;
+ 	struct amdgpu_bo *bo = mem->bo;
+@@ -488,10 +462,97 @@ static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
+ 	entry->shared = true;
+ 	entry->bo = &bo->tbo;
+ 	mutex_lock(&process_info->lock);
+-	list_add_tail(&entry->head, &process_info->kfd_bo_list);
++	if (userptr)
++		list_add_tail(&entry->head, &process_info->userptr_valid_list);
++	else
++		list_add_tail(&entry->head, &process_info->kfd_bo_list);
+ 	mutex_unlock(&process_info->lock);
+ }
+ 
++/* Initializes user pages. It registers the MMU notifier and validates
++ * the userptr BO in the GTT domain.
++ *
++ * The BO must already be on the userptr_valid_list. Otherwise an
++ * eviction and restore may happen that leaves the new BO unmapped
++ * with the user mode queues running.
++ *
++ * Takes the process_info->lock to protect against concurrent restore
++ * workers.
++ *
++ * Returns 0 for success, negative errno for errors.
++ */
++static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm,
++			   uint64_t user_addr)
++{
++	struct amdkfd_process_info *process_info = mem->process_info;
++	struct amdgpu_bo *bo = mem->bo;
++	int ret = 0;
++
++	mutex_lock(&process_info->lock);
++
++	ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0);
++	if (ret) {
++		pr_err("%s: Failed to set userptr: %d\n", __func__, ret);
++		goto out;
++	}
++
++	ret = amdgpu_mn_register(bo, user_addr);
++	if (ret) {
++		pr_err("%s: Failed to register MMU notifier: %d\n",
++		       __func__, ret);
++		goto out;
++	}
++
++	/* If no restore worker is running concurrently, user_pages
++	 * should not be allocated
++	 */
++	WARN(mem->user_pages, "Leaking user_pages array");
++
++	mem->user_pages = drm_calloc_large(bo->tbo.ttm->num_pages,
++					   sizeof(struct page *));
++	if (!mem->user_pages) {
++		pr_err("%s: Failed to allocate pages array\n", __func__);
++		ret = -ENOMEM;
++		goto unregister_out;
++	}
++
++	down_read(&mm->mmap_sem);
++	ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages);
++	up_read(&mm->mmap_sem);
++	if (ret) {
++		pr_err("%s: Failed to get user pages\n", __func__);
++		goto free_out;
++	}
++
++	memcpy(bo->tbo.ttm->pages, mem->user_pages,
++	       sizeof(struct page *) * bo->tbo.ttm->num_pages);
++
++	ret = amdgpu_bo_reserve(bo, true);
++	if (ret) {
++		pr_err("%s: Failed to reserve BO\n", __func__);
++		goto release_out;
++	}
++	amdgpu_ttm_placement_from_domain(bo, mem->domain);
++	ret = ttm_bo_validate(&bo->tbo, &bo->placement,
++			      true, false);
++	if (ret)
++		pr_err("%s: failed to validate BO\n", __func__);
++	amdgpu_bo_unreserve(bo);
++
++release_out:
++	if (ret)
++		release_pages(mem->user_pages, bo->tbo.ttm->num_pages, 0);
++free_out:
++	drm_free_large(mem->user_pages);
++	mem->user_pages = NULL;
++unregister_out:
++	if (ret)
++		amdgpu_mn_unregister(bo);
++out:
++	mutex_unlock(&process_info->lock);
++	return ret;
++}
++
+ static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va,
+ 		uint64_t size, void *vm, struct kgd_mem **mem,
+ 		uint64_t *offset, void **kptr,
+@@ -578,21 +639,6 @@ static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va,
+ 	if (userptr)
+ 		bo->flags |= AMDGPU_AMDKFD_USERPTR_BO;
+ 
+-	if (userptr) {
+-		ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0);
+-		if (ret) {
+-			pr_err("Failed to set userptr. ret %d\n", ret);
+-			goto allocate_mem_set_userptr_failed;
+-		}
+-
+-		ret = amdgpu_mn_register(bo, user_addr);
+-		if (ret) {
+-			pr_err("Failed to register MMU notifier %d\n",
+-			       ret);
+-			goto allocate_mem_set_userptr_failed;
+-		}
+-	}
+-
+ 	if (kptr) {
+ 		ret = amdgpu_bo_reserve(bo, true);
+ 		if (ret) {
+@@ -621,7 +667,18 @@ static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va,
+ 	(*mem)->va = va;
+ 	(*mem)->domain = domain;
+ 	(*mem)->mapped_to_gpu_memory = 0;
+-	add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info);
++	(*mem)->process_info = kfd_vm->process_info;
++	add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, userptr);
++
++	if (userptr) {
++		ret = init_user_pages(*mem, current->mm, user_addr);
++		if (ret) {
++			mutex_lock(&kfd_vm->process_info->lock);
++			list_del(&(*mem)->validate_list.head);
++			mutex_unlock(&kfd_vm->process_info->lock);
++			goto allocate_init_user_pages_failed;
++		}
++	}
+ 
+ 	if (offset)
+ 		*offset = amdgpu_bo_mmap_offset(bo);
+@@ -633,9 +690,8 @@ static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va,
+ allocate_mem_pin_bo_failed:
+ 	amdgpu_bo_unreserve(bo);
+ allocate_mem_reserve_bo_failed:
+-	if (userptr)
+-		amdgpu_mn_unregister(bo);
+-allocate_mem_set_userptr_failed:
++
++allocate_init_user_pages_failed:
+ 	amdgpu_bo_unref(&bo);
+ err_bo_create:
+ 	kfree(*mem);
+@@ -807,90 +863,6 @@ static void unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx,
+ 	ctx->vm_pd = NULL;
+ }
+ 
+-/* Must be called with mem->lock held and a BO/VM reservation
+- * context. Temporarily drops the lock and reservation for updating
+- * user pointers, to avoid circular lock dependencies between MM locks
+- * and buffer reservations. If user pages are invalidated while the
+- * lock and reservation are dropped, try again. */
+-static int update_user_pages(struct kgd_mem *mem, struct mm_struct *mm,
+-			     struct bo_vm_reservation_context *ctx)
+-{
+-	struct amdgpu_bo *bo;
+-	unsigned tries = 10;
+-	int ret;
+-
+-	bo = mem->bo;
+-	if (!amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
+-		return 0;
+-
+-	if (bo->tbo.ttm->state != tt_bound) {
+-		struct page **pages;
+-		int invalidated;
+-
+-		/* get user pages without locking the BO to avoid
+-		 * circular lock dependency with MMU notifier. Retry
+-		 * until we have the current version. */
+-		ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list);
+-		ctx->reserved = false;
+-		pages = drm_calloc_large(bo->tbo.ttm->num_pages,
+-					 sizeof(struct page *));
+-		if (!pages)
+-			return -ENOMEM;
+-
+-		mem->busy = true;
+-		mutex_unlock(&mem->lock);
+-
+-		while (true) {
+-			down_read(&mm->mmap_sem);
+-			ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, pages);
+-			up_read(&mm->mmap_sem);
+-
+-			mutex_lock(&mem->lock);
+-			mem->busy = false;
+-			if (ret != 0)
+-				return ret;
+-
+-			BUG_ON(bo != mem->bo);
+-
+-			ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
+-						     false, &ctx->duplicates);
+-			if (unlikely(ret != 0)) {
+-				release_pages(pages, bo->tbo.ttm->num_pages, 0);
+-				drm_free_large(pages);
+-				return ret;
+-			}
+-			ctx->reserved = true;
+-			if (!amdgpu_ttm_tt_userptr_invalidated(bo->tbo.ttm,
+-							       &invalidated) ||
+-			    bo->tbo.ttm->state == tt_bound ||
+-			    --tries == 0)
+-				break;
+-
+-			release_pages(pages, bo->tbo.ttm->num_pages, 0);
+-			ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list);
+-			ctx->reserved = false;
+-			mem->busy = true;
+-			mutex_unlock(&mem->lock);
+-		}
+-
+-		/* If someone else already bound it, release our pages
+-		 * array, otherwise copy it into the ttm BO. */
+-		if (bo->tbo.ttm->state == tt_bound || tries == 0)
+-			release_pages(pages, bo->tbo.ttm->num_pages, 0);
+-		else
+-			memcpy(bo->tbo.ttm->pages, pages,
+-			       sizeof(struct page *) * bo->tbo.ttm->num_pages);
+-		drm_free_large(pages);
+-	}
+-
+-	if (tries == 0) {
+-		pr_err("Gave up trying to update user pages\n");
+-		return -EDEADLK;
+-	}
+-
+-	return 0;
+-}
+-
+ static int unmap_bo_from_gpuvm(struct amdgpu_device *adev,
+ 			       struct kfd_bo_va_list *entry,
+ 			       struct amdgpu_sync *sync)
+@@ -976,7 +948,8 @@ static int update_gpuvm_pte(struct amdgpu_device *adev,
+ }
+ 
+ static int map_bo_to_gpuvm(struct amdgpu_device *adev,
+-		struct kfd_bo_va_list *entry, struct amdgpu_sync *sync)
++		struct kfd_bo_va_list *entry, struct amdgpu_sync *sync,
++		bool no_update_pte)
+ {
+ 	int ret;
+ 	struct amdgpu_bo *bo = entry->bo_va->bo;
+@@ -1025,6 +998,9 @@ static int map_bo_to_gpuvm(struct amdgpu_device *adev,
+ 	/* Add the eviction fence back */
+ 	amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true);
+ 
++	if (no_update_pte)
++		return 0;
++
+ 	ret = update_gpuvm_pte(adev, entry, sync);
+ 	if (ret != 0) {
+ 		pr_err("update_gpuvm_pte() failed\n");
+@@ -1169,8 +1145,23 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
+ 	/* lock is not needed after this, since mem is unused and will
+ 	 * be freed anyway */
+ 
++	/* No more MMU notifiers */
+ 	amdgpu_mn_unregister(mem->bo);
+-	amdgpu_amdkfd_cancel_restore_mem(mem);
++
++	/* Make sure restore workers don't access the BO any more */
++	bo_list_entry = &mem->validate_list;
++	mutex_lock(&process_info->lock);
++	list_del(&bo_list_entry->head);
++	mutex_unlock(&process_info->lock);
++
++	/* Free user pages if necessary */
++	if (mem->user_pages) {
++		pr_debug("%s: Freeing user_pages array\n", __func__);
++		if (mem->user_pages[0])
++			release_pages(mem->user_pages,
++				      mem->bo->tbo.ttm->num_pages, 0);
++		drm_free_large(mem->user_pages);
++	}
+ 
+ 	ret = reserve_bo_and_cond_vms(mem, NULL, VA_DO_NOT_CARE, &ctx);
+ 	if (unlikely(ret != 0))
+@@ -1203,11 +1194,6 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
+ 	}
+ 
+ 	/* Free the BO*/
+-	bo_list_entry = &mem->validate_list;
+-	mutex_lock(&process_info->lock);
+-	list_del(&bo_list_entry->head);
+-	mutex_unlock(&process_info->lock);
+-
+ 	amdgpu_bo_unref(&mem->bo);
+ 	kfree(mem);
+ 
+@@ -1226,14 +1212,28 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
+ 	struct kfd_bo_va_list *bo_va_entry = NULL;
+ 	struct kfd_bo_va_list *bo_va_entry_aql = NULL;
+ 	struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm;
+-	int num_to_quiesce = 0;
+ 	unsigned long bo_size;
++	bool is_invalid_userptr;
+ 
+ 	BUG_ON(kgd == NULL);
+ 	BUG_ON(mem == NULL);
+ 
+ 	adev = get_amdgpu_device(kgd);
+ 
++	/* Make sure restore is not running concurrently. Since we
++	 * don't map invalid userptr BOs, we rely on the next restore
++	 * worker to do the mapping
++	 */
++	mutex_lock(&mem->process_info->lock);
++
++	/* Lock mmap-sem. If we find an invalid userptr BO, we can be
++	 * sure that the MMU notifier is no longer running
++	 * concurrently and the queues are actually stopped
++	 */
++	down_read(&current->mm->mmap_sem);
++	is_invalid_userptr = atomic_read(&mem->invalid);
++	up_read(&current->mm->mmap_sem);
++
+ 	mutex_lock(&mem->lock);
+ 
+ 	bo = mem->bo;
+@@ -1252,6 +1252,14 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
+ 	if (unlikely(ret != 0))
+ 		goto bo_reserve_failed;
+ 
++	/* Userptr can be marked as "not invalid", but not actually be
++	 * validated yet (still in the system domain). In that case
++	 * the queues are still stopped and we can leave mapping for
++	 * the next restore worker
++	 */
++	if (bo->tbo.mem.mem_type == TTM_PL_SYSTEM)
++		is_invalid_userptr = true;
++
+ 	if (check_if_add_bo_to_vm((struct amdgpu_vm *)vm, mem)) {
+ 		ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm, false,
+ 				   &bo_va_entry);
+@@ -1265,13 +1273,8 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
+ 		}
+ 	}
+ 
+-	if (mem->mapped_to_gpu_memory == 0 && !mem->evicted) {
+-		ret = update_user_pages(mem, current->mm, &ctx);
+-		if (ret != 0) {
+-			pr_err("update_user_pages failed\n");
+-			goto update_user_pages_failed;
+-		}
+-
++	if (mem->mapped_to_gpu_memory == 0 &&
++	    !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
+ 		/* Validate BO only once. The eviction fence gets added to BO
+ 		 * the first time it is mapped. Validate will wait for all
+ 		 * background evictions to complete.
+@@ -1285,22 +1288,12 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
+ 
+ 	list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
+ 		if (entry->bo_va->vm == vm && !entry->is_mapped) {
+-			if (mem->evicted) {
+-				/* If the BO is evicted, just mark the
+-				 * mapping as mapped and the GPU's queues
+-				 * will be stopped later.
+-				 */
+-				entry->is_mapped = true;
+-				mem->mapped_to_gpu_memory++;
+-				num_to_quiesce++;
+-				continue;
+-			}
+-
+ 			pr_debug("\t map VA 0x%llx - 0x%llx in entry %p\n",
+ 					entry->va, entry->va + bo_size,
+ 					entry);
+ 
+-			ret = map_bo_to_gpuvm(adev, entry, &ctx.sync);
++			ret = map_bo_to_gpuvm(adev, entry, &ctx.sync,
++					      is_invalid_userptr);
+ 			if (ret != 0) {
+ 				pr_err("Failed to map radeon bo to gpuvm\n");
+ 				goto map_bo_to_gpuvm_failed;
+@@ -1318,24 +1311,11 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
+ 			true);
+ 	unreserve_bo_and_vms(&ctx, true);
+ 
+-	while (num_to_quiesce--) {
+-		/* Now stop the GPU's queues while bo and VMs are unreserved.
+-		 * quiesce_mm() is reference counted, and that is why we can
+-		 * call it multiple times.
+-		 */
+-		ret = kgd2kfd->quiesce_mm(adev->kfd, current->mm);
+-		if (ret != 0) {
+-			pr_err("quiesce_mm() failed\n");
+-			reserve_bo_and_vm(mem, vm, &ctx);
+-			goto map_bo_to_gpuvm_failed;
+-		}
+-	}
+-
++	mutex_unlock(&mem->process_info->lock);
+ 	mutex_unlock(&mem->lock);
+ 	return ret;
+ 
+ map_bo_to_gpuvm_failed:
+-update_user_pages_failed:
+ 	if (bo_va_entry_aql)
+ 		remove_bo_from_vm(adev, bo_va_entry_aql, bo_size);
+ add_bo_to_vm_failed_aql:
+@@ -1344,6 +1324,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
+ add_bo_to_vm_failed:
+ 	unreserve_bo_and_vms(&ctx, false);
+ bo_reserve_failed:
++	mutex_unlock(&mem->process_info->lock);
+ 	mutex_unlock(&mem->lock);
+ 	return ret;
+ }
+@@ -1407,6 +1388,8 @@ int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm,
+ 		mutex_init(&info->lock);
+ 		INIT_LIST_HEAD(&info->vm_list_head);
+ 		INIT_LIST_HEAD(&info->kfd_bo_list);
++		INIT_LIST_HEAD(&info->userptr_valid_list);
++		INIT_LIST_HEAD(&info->userptr_inval_list);
+ 
+ 		info->eviction_fence =
+ 			amdgpu_amdkfd_fence_create(fence_context_alloc(1),
+@@ -1416,6 +1399,12 @@ int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm,
+ 			goto create_evict_fence_fail;
+ 		}
+ 
++		info->pid = get_task_pid(current->group_leader,
++					 PIDTYPE_PID);
++		atomic_set(&info->evicted_bos, 0);
++		INIT_DELAYED_WORK(&info->work,
++				  amdgpu_amdkfd_restore_userptr_worker);
++
+ 		*process_info = info;
+ 	}
+ 
+@@ -1468,9 +1457,15 @@ void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm)
+ 	list_del(&kfd_vm->vm_list_node);
+ 	mutex_unlock(&process_info->lock);
+ 
+-	/* Release eviction fence */
++	/* Release per-process resources */
+ 	if (!process_info->n_vms) {
++		WARN_ON(!list_empty(&process_info->kfd_bo_list));
++		WARN_ON(!list_empty(&process_info->userptr_valid_list));
++		WARN_ON(!list_empty(&process_info->userptr_inval_list));
++
+ 		fence_put(&process_info->eviction_fence->base);
++		cancel_delayed_work_sync(&process_info->work);
++		put_pid(process_info->pid);
+ 		kfree(process_info);
+ 	}
+ 
+@@ -1521,7 +1516,6 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
+ 	int ret = 0;
+ 	struct bo_vm_reservation_context ctx;
+ 	struct amdkfd_process_info *process_info;
+-	int num_to_resume = 0;
+ 	unsigned long bo_size;
+ 
+ 	BUG_ON(kgd == NULL);
+@@ -1561,17 +1555,6 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
+ 
+ 	list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
+ 		if (entry->bo_va->vm == vm && entry->is_mapped) {
+-			if (mem->evicted) {
+-				/* If the BO is evicted, just mark the
+-				 * mapping as unmapped and the GPU's queues
+-				 * will be resumed later.
+-				 */
+-				entry->is_mapped = false;
+-				mem->mapped_to_gpu_memory--;
+-				num_to_resume++;
+-				continue;
+-			}
+-
+ 			pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n",
+ 					entry->va,
+ 					entry->va + bo_size,
+@@ -1593,14 +1576,13 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
+ 	}
+ 
+ 	/* If BO is unmapped from all VMs, unfence it. It can be evicted if
+-	 * required. User pages of userptr BOs can be released.
++	 * required.
+ 	 */
+-	if (mem->mapped_to_gpu_memory == 0) {
++	if (mem->mapped_to_gpu_memory == 0 &&
++	    !amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm))
+ 		amdgpu_amdkfd_remove_eviction_fence(mem->bo,
+ 						process_info->eviction_fence,
+ 						NULL, NULL);
+-		amdgpu_amdkfd_bo_invalidate(mem->bo);
+-	}
+ 
+ 	if (mapped_before == mem->mapped_to_gpu_memory) {
+ 		pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n",
+@@ -1610,21 +1592,6 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
+ 
+ unreserve_out:
+ 	unreserve_bo_and_vms(&ctx, false);
+-
+-	while (current->mm && num_to_resume--) {
+-		/* Now resume GPU's queues while bo and VMs are
+-		 * unreserved. This function runs in a work queue
+-		 * during process termination. Only resume queues if
+-		 * we're running in process context. resume_mm() is
+-		 * reference counted, and that is why we can call it
+-		 * multiple times.
+-		 */
+-		ret = kgd2kfd->resume_mm(adev->kfd, current->mm);
+-		if (ret != 0) {
+-			pr_err("resume_mm() failed.\n");
+-			break;
+-		}
+-	}
+ out:
+ 	mutex_unlock(&mem->lock);
+ 	return ret;
+@@ -1875,7 +1842,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
+ 	(*mem)->domain = (bo->prefered_domains & AMDGPU_GEM_DOMAIN_VRAM) ?
+ 		AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;
+ 	(*mem)->mapped_to_gpu_memory = 0;
+-	add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info);
++	(*mem)->process_info = kfd_vm->process_info;
++	add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, false);
+ 
+ 	return 0;
+ }
+@@ -1938,208 +1906,321 @@ static int validate_pd_pt_bos(struct amdkfd_process_info *process_info)
+ 	return 0;
+ }
+ 
+-/* Runs out of process context. mem->lock must be held. */
+-int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm)
++/* Evict a userptr BO by stopping the queues if necessary
++ *
++ * Runs in MMU notifier, may be in RECLAIM_FS context. This means it
++ * cannot do any memory allocations, and cannot take any locks that
++ * are held elsewhere while allocating memory. Therefore this is as
++ * simple as possible, using atomic counters.
++ *
++ * It doesn't do anything to the BO itself. The real work happens in
++ * restore, where we get updated page addresses. This function only
++ * ensures that GPU access to the BO is stopped.
++ */
++int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
++				struct mm_struct *mm)
+ {
+-	struct kfd_bo_va_list *entry;
+-	unsigned int n_evicted = 0, n_unmapped = 0;
++	struct amdkfd_process_info *process_info = mem->process_info;
++	int invalid, evicted_bos;
+ 	int r = 0;
+-	struct bo_vm_reservation_context ctx;
+ 
+-	pr_debug("Evicting buffer %p\n", mem);
+-
+-	if (mem->mapped_to_gpu_memory == 0)
+-		return 0;
++	invalid = atomic_inc_return(&mem->invalid);
++	evicted_bos = atomic_inc_return(&process_info->evicted_bos);
++	if (evicted_bos == 1) {
++		/* First eviction, stop the queues */
++		r = kgd2kfd->quiesce_mm(NULL, mm);
++		if (r != 0)
++			pr_err("Failed to quiesce KFD\n");
++	}
+ 
+-	/* Remove all GPU mappings of the buffer, but don't change any
+-	 * of the is_mapped flags so we can restore it later. The
+-	 * queues of the affected GPUs are quiesced first. Count the
+-	 * number of evicted mappings so we can roll back if something
+-	 * goes wrong. */
++	return r;
++}
+ 
+-	list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
+-		struct amdgpu_device *adev;
++/* Update invalid userptr BOs
++ *
++ * Moves invalidated (evicted) userptr BOs from userptr_valid_list to
++ * userptr_inval_list and updates user pages for all BOs that have
++ * been invalidated since their last update.
++ */
++static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
++				     struct mm_struct *mm)
++{
++	struct kgd_mem *mem, *tmp_mem;
++	struct amdgpu_bo *bo;
++	int invalid, ret = 0;
+ 
+-		if (!entry->is_mapped)
+-			continue;
++	/* Move all invalidated BOs to the userptr_inval_list and
++	 * release their user pages by migration to the CPU domain
++	 */
++	list_for_each_entry_safe(mem, tmp_mem,
++				 &process_info->userptr_valid_list,
++				 validate_list.head) {
++		if (!atomic_read(&mem->invalid))
++			continue; /* BO is still valid */
+ 
+-		adev = (struct amdgpu_device *)entry->kgd_dev;
++		bo = mem->bo;
+ 
+-		r = kgd2kfd->quiesce_mm(adev->kfd, mm);
+-		if (r != 0) {
+-			pr_err("Failed to quiesce KFD\n");
+-			goto fail;
++		if (amdgpu_bo_reserve(bo, true))
++			return -EAGAIN;
++		amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
++		ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false);
++		amdgpu_bo_unreserve(bo);
++		if (ret) {
++			pr_err("%s: Failed to invalidate userptr BO\n",
++			       __func__);
++			return -EAGAIN;
+ 		}
+ 
+-		n_evicted++;
++		list_move_tail(&mem->validate_list.head,
++			       &process_info->userptr_inval_list);
+ 	}
+ 
+-	r = reserve_bo_and_cond_vms(mem, NULL, VA_MAPPED, &ctx);
+-	if (unlikely(r != 0))
+-		goto fail;
+-
+-	list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
+-		struct amdgpu_device *adev;
++	if (list_empty(&process_info->userptr_inval_list))
++		return 0; /* All evicted userptr BOs were freed */
+ 
+-		if (!entry->is_mapped)
++	/* Go through userptr_inval_list and update any invalid user_pages */
++	down_read(&mm->mmap_sem);
++	list_for_each_entry(mem, &process_info->userptr_inval_list,
++			    validate_list.head) {
++		invalid = atomic_read(&mem->invalid);
++		if (!invalid)
++			/* BO hasn't been invalidated since the last
++			 * revalidation attempt. Keep its BO list.
++			 */
+ 			continue;
+ 
+-		adev = (struct amdgpu_device *)entry->kgd_dev;
++		bo = mem->bo;
++
++		if (!mem->user_pages) {
++			mem->user_pages =
++				drm_calloc_large(bo->tbo.ttm->num_pages,
++						 sizeof(struct page *));
++			if (!mem->user_pages) {
++				ret = -ENOMEM;
++				pr_err("%s: Failed to allocate pages array\n",
++				       __func__);
++				goto unlock_mmap_out;
++			}
++		} else if (mem->user_pages[0]) {
++			release_pages(mem->user_pages,
++				      bo->tbo.ttm->num_pages, 0);
++		}
+ 
+-		r = unmap_bo_from_gpuvm(adev, entry, &ctx.sync);
+-		if (r != 0) {
+-			pr_err("Failed unmap VA 0x%llx\n",
+-			       mem->va);
+-			unreserve_bo_and_vms(&ctx, true);
+-			goto fail;
++		/* Get updated user pages */
++		ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm,
++						   mem->user_pages);
++		if (ret) {
++			mem->user_pages[0] = NULL;
++			pr_err("%s: Failed to get user pages\n", __func__);
++			goto unlock_mmap_out;
+ 		}
+ 
+-		n_unmapped++;
++		/* Mark the BO as valid unless it was invalidated
++		 * again concurrently
++		 */
++		if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid) {
++			ret = -EAGAIN;
++			goto unlock_mmap_out;
++		}
+ 	}
++unlock_mmap_out:
++	up_read(&mm->mmap_sem);
++	return ret;
++}
+ 
+-	amdgpu_amdkfd_bo_invalidate(mem->bo);
+-
+-	unreserve_bo_and_vms(&ctx, true);
+-
+-	return 0;
++/* Validate invalid userptr BOs
++ *
++ * Validates BOs on the userptr_inval_list, and moves them back to the
++ * userptr_valid_list. Also updates GPUVM page tables with new page
++ * addresses and waits for the page table updates to complete.
++ */
++static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
++{
++	struct amdgpu_bo_list_entry *pd_bo_list_entries;
++	struct list_head resv_list, duplicates;
++	struct ww_acquire_ctx ticket;
++	struct amdgpu_sync sync;
+ 
+-fail:
+-	/* To avoid hangs and keep state consistent, roll back partial
+-	 * eviction by restoring queues and marking mappings as
+-	 * unmapped. Access to now unmapped buffers will fault. */
+-	list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
+-		struct amdgpu_device *adev;
++	struct amdkfd_vm *peer_vm;
++	struct kgd_mem *mem, *tmp_mem;
++	struct amdgpu_bo *bo;
++	int i, ret;
+ 
+-		if (n_evicted == 0)
+-			break;
+-		if (!entry->is_mapped)
+-			continue;
++	pd_bo_list_entries = kcalloc(process_info->n_vms,
++				     sizeof(struct amdgpu_bo_list_entry),
++				     GFP_KERNEL);
++	if (!pd_bo_list_entries) {
++		pr_err("%s: Failed to allocate PD BO list entries\n", __func__);
++		return -ENOMEM;
++	}
+ 
+-		if (n_unmapped) {
+-			entry->is_mapped = false;
+-			n_unmapped--;
+-		}
++	INIT_LIST_HEAD(&resv_list);
++	INIT_LIST_HEAD(&duplicates);
+ 
+-		adev = (struct amdgpu_device *)entry->kgd_dev;
+-		if (kgd2kfd->resume_mm(adev->kfd, mm))
+-			pr_err("Failed to resume KFD\n");
+-		n_evicted--;
++	/* Get all the page directory BOs that need to be reserved */
++	i = 0;
++	list_for_each_entry(peer_vm, &process_info->vm_list_head,
++			    vm_list_node)
++		amdgpu_vm_get_pd_bo(&peer_vm->base, &resv_list,
++				    &pd_bo_list_entries[i++]);
++	/* Add the userptr_inval_list entries to resv_list */
++	list_for_each_entry(mem, &process_info->userptr_inval_list,
++			    validate_list.head) {
++		list_add_tail(&mem->resv_list.head, &resv_list);
++		mem->resv_list.bo = mem->validate_list.bo;
++		mem->resv_list.shared = mem->validate_list.shared;
+ 	}
+ 
+-	return r;
+-}
++	/* Reserve all BOs and page tables for validation */
++	ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates);
++	WARN(!list_empty(&duplicates), "Duplicates should be empty");
++	if (ret)
++		goto out;
+ 
+-/* Runs out of process context. mem->lock must be held. */
+-int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm)
+-{
+-	struct bo_vm_reservation_context ctx;
+-	struct kfd_bo_va_list *entry;
+-	uint32_t domain;
+-	int r, ret = 0;
+-	bool have_pages = false;
++	amdgpu_sync_create(&sync);
+ 
+-	pr_debug("Restoring buffer %p\n", mem);
++	ret = validate_pd_pt_bos(process_info);
++	if (ret)
++		goto unreserve_out;
+ 
+-	if (mem->mapped_to_gpu_memory == 0)
+-		return 0;
++	/* Validate BOs and update GPUVM page tables */
++	list_for_each_entry_safe(mem, tmp_mem,
++				 &process_info->userptr_inval_list,
++				 validate_list.head) {
++		struct kfd_bo_va_list *bo_va_entry;
+ 
+-	domain = mem->domain;
++		bo = mem->bo;
+ 
+-	ret = reserve_bo_and_cond_vms(mem, NULL, VA_MAPPED, &ctx);
+-	if (likely(ret == 0)) {
+-		ret = update_user_pages(mem, mm, &ctx);
+-		have_pages = !ret;
+-		if (!have_pages) {
+-			unreserve_bo_and_vms(&ctx, false);
+-			if (ret == -ESRCH)
+-				/* process terminating, fail quiet and fast */
+-				return ret;
+-			else if (ret == -EDEADLK)
+-				/* Someone else is still updating the
+-				 * VM, let's try again later
+-				 */
+-				return ret;
+-			pr_err("get_user_pages failed. Probably userptr is freed. %d\n",
+-			       ret);
++		/* Copy pages array and validate the BO */
++		memcpy(bo->tbo.ttm->pages, mem->user_pages,
++		       sizeof(struct page *) * bo->tbo.ttm->num_pages);
++		amdgpu_ttm_placement_from_domain(bo, mem->domain);
++		ret = ttm_bo_validate(&bo->tbo, &bo->placement,
++				      false, false);
++		if (ret) {
++			pr_err("%s: failed to validate BO\n", __func__);
++			goto unreserve_out;
+ 		}
+-		/* update_user_pages drops the lock briefly. Check if
+-		 * someone else evicted or restored the buffer in the
+-		 * mean time. Return -EBUSY to let the caller know.
++
++		/* Validate succeeded, now the BO owns the pages, free
++		 * our copy of the pointer array. Put this BO back on
++		 * the userptr_valid_list. If we need to revalidate
++		 * it, we need to start from scratch.
+ 		 */
+-		if (mem->evicted != 1) {
+-			if (have_pages)
+-				unreserve_bo_and_vms(&ctx, false);
+-			return -EBUSY;
+-		}
+-	}
++		drm_free_large(mem->user_pages);
++		mem->user_pages = NULL;
++		list_move_tail(&mem->validate_list.head,
++			       &process_info->userptr_valid_list);
+ 
+-	if (have_pages) {
+-		r = amdgpu_amdkfd_bo_validate(mem->bo, domain, true);
+-		if (unlikely(r != 0)) {
+-			pr_err("Failed to validate BO %p\n", mem);
+-			have_pages = false;
+-			unreserve_bo_and_vms(&ctx, false);
++		list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) {
++			if (!bo_va_entry->is_mapped)
++				continue;
++
++			ret = update_gpuvm_pte((struct amdgpu_device *)
++					       bo_va_entry->kgd_dev,
++					       bo_va_entry, &sync);
++			if (ret) {
++				pr_err("%s: update PTE failed\n", __func__);
++				/* make sure this gets validated again */
++				atomic_inc(&mem->invalid);
++				goto unreserve_out;
++			}
+ 		}
+ 	}
++unreserve_out:
++	ttm_eu_backoff_reservation(&ticket, &resv_list);
++	amdgpu_sync_wait(&sync);
++	amdgpu_sync_free(&sync);
++out:
++	kfree(pd_bo_list_entries);
+ 
+-	/* Try to restore all mappings. Mappings that fail to restore
+-	 * will be marked as unmapped. If we failed to get the user
+-	 * pages, all mappings will be marked as unmapped. */
+-	list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
+-		struct amdgpu_device *adev;
+-
+-		if (!entry->is_mapped)
+-			continue;
+-
+-		adev = (struct amdgpu_device *)entry->kgd_dev;
++	return ret;
++}
+ 
+-		if (unlikely(!have_pages)) {
+-			entry->map_fail = true;
+-			continue;
+-		}
++/* Worker callback to restore evicted userptr BOs
++ *
++ * Tries to update and validate all userptr BOs. If successful and no
++ * concurrent evictions happened, the queues are restarted. Otherwise,
++ * reschedule for another attempt later.
++ */
++static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
++{
++	struct delayed_work *dwork = to_delayed_work(work);
++	struct amdkfd_process_info *process_info =
++		container_of(dwork, struct amdkfd_process_info, work);
++	struct task_struct *usertask;
++	struct mm_struct *mm;
++	int evicted_bos;
++
++	evicted_bos = atomic_read(&process_info->evicted_bos);
++	if (!evicted_bos)
++		return;
+ 
+-		r = map_bo_to_gpuvm(adev, entry, &ctx.sync);
+-		if (unlikely(r != 0)) {
+-			pr_err("Failed to map BO to gpuvm\n");
+-			entry->map_fail = true;
+-			if (ret == 0)
+-				ret = r;
+-		}
++	/* Reference task and mm in case of concurrent process termination */
++	usertask = get_pid_task(process_info->pid, PIDTYPE_PID);
++	if (!usertask)
++		return;
++	mm = get_task_mm(usertask);
++	if (!mm) {
++		put_task_struct(usertask);
++		return;
+ 	}
+ 
+-	if (have_pages)
+-		unreserve_bo_and_vms(&ctx, true);
++	mutex_lock(&process_info->lock);
+ 
+-	/* Resume queues after unreserving the BOs and most
+-	 * importantly, waiting for the BO fences to guarantee that
+-	 * the page table updates have completed.
++	if (update_invalid_user_pages(process_info, mm))
++		goto unlock_out;
++	/* userptr_inval_list can be empty if all evicted userptr BOs
++	 * have been freed. In that case there is nothing to validate
++	 * and we can just restart the queues.
+ 	 */
+-	list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
+-		struct amdgpu_device *adev;
+-
+-		if (!entry->is_mapped)
+-			continue;
++	if (!list_empty(&process_info->userptr_inval_list)) {
++		if (atomic_read(&process_info->evicted_bos) != evicted_bos)
++			goto unlock_out; /* Concurrent eviction, try again */
+ 
+-		/* Mapping failed. To be in a consistent state, mark the
+-		 * buffer as unmapped, but state of the buffer will be
+-		 * not evicted. A vm fault will generated if user space tries
+-		 * to access this buffer.
++		if (validate_invalid_user_pages(process_info))
++			goto unlock_out;
++	}
++	/* Final check for concurrent evicton and atomic update. If
++	 * another eviction happens after successful update, it will
++	 * be a first eviction that calls quiesce_mm. The eviction
++	 * reference counting inside KFD will handle this case.
++	 */
++	if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) !=
++	    evicted_bos)
++		goto unlock_out;
++	evicted_bos = 0;
++	if (kgd2kfd->resume_mm(NULL, mm)) {
++		pr_err("%s: Failed to resume KFD\n", __func__);
++		/* No recovery from this failure. Probably the CP is
++		 * hanging. No point trying again.
+ 		 */
+-		if (entry->map_fail) {
+-			entry->is_mapped = false;
+-			mem->mapped_to_gpu_memory--;
+-		}
+-		adev = (struct amdgpu_device *)entry->kgd_dev;
+-
+-		r = kgd2kfd->resume_mm(adev->kfd, mm);
+-		if (r != 0) {
+-			pr_err("Failed to resume KFD\n");
+-			if (ret == 0)
+-				ret = r;
+-		}
+ 	}
++unlock_out:
++	mutex_unlock(&process_info->lock);
++	mmput(mm);
++	put_task_struct(usertask);
+ 
+-	return ret;
++	/* If validation failed, reschedule another attempt */
++	if (evicted_bos)
++		schedule_delayed_work(&process_info->work, 1);
++}
++
++/* Schedule delayed restoring of userptr BOs
++ *
++ * This runs in an MMU notifier. See limitations above. The scheduled
++ * worker is free of those limitations. Delaying the restore allows
++ * multiple MMU notifiers to happen in rapid succession, for example
++ * when fork COWs many BOs at once.
++ */
++int amdgpu_amdkfd_schedule_restore_userptr(struct kgd_mem *mem,
++					   unsigned long delay)
++{
++	struct amdkfd_process_info *process_info = mem->process_info;
++
++	schedule_delayed_work(&process_info->work, delay);
++
++	return 0;
+ }
+ 
+ /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+index 7076d08..9d78a4f 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+@@ -88,8 +88,6 @@ static void amdgpu_mn_destroy(struct work_struct *work)
+ 		list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) {
+ 			bo->mn = NULL;
+ 			list_del_init(&bo->mn_list);
+-			if (rmn->type == AMDGPU_MN_TYPE_HSA)
+-				amdgpu_amdkfd_cancel_restore_mem(bo->kfd_bo);
+ 		}
+ 		kfree(node);
+ 	}
+@@ -230,7 +228,7 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
+ 
+ 			if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
+ 							 start, end))
+-				amdgpu_amdkfd_evict_mem(amdgpu_ttm_adev(bo->tbo.bdev), mem, mm);
++				amdgpu_amdkfd_evict_userptr(mem, mm);
+ 		}
+ 	}
+ 
+@@ -277,8 +275,7 @@ static void amdgpu_mn_invalidate_range_end_hsa(struct mmu_notifier *mn,
+ 
+ 			if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
+ 							 start, end))
+-				amdgpu_amdkfd_schedule_restore_mem(amdgpu_ttm_adev(bo->tbo.bdev),
+-								   mem, mm, 1);
++				amdgpu_amdkfd_schedule_restore_userptr(mem, 1);
+ 		}
+ 	}
+ 
+-- 
+2.7.4
+
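The core of the patch is the counting protocol between amdgpu_amdkfd_evict_userptr() and the restore worker: the notifier only bumps atomic counters (reclaim-safe), the first eviction stops the queues, and the worker resumes them only if it wins an atomic compare-and-swap back to zero after revalidation. That protocol can be modeled in ordinary C11 atomics; the sketch below is illustrative only, not kernel code, and quiesce_mm()/resume_mm() are hypothetical stand-ins for the kgd2kfd calls:

	#include <stdatomic.h>
	#include <stdio.h>

	/* Stand-ins for kgd2kfd->quiesce_mm()/resume_mm() */
	static void quiesce_mm(void) { puts("user mode queues stopped"); }
	static void resume_mm(void)  { puts("user mode queues restarted"); }

	/* models process_info->evicted_bos */
	static atomic_int evicted_bos;

	/* Notifier side: may run in reclaim-fs context, so it only
	 * increments a counter; the 0 -> 1 transition stops the queues. */
	static void evict_userptr(void)
	{
		if (atomic_fetch_add(&evicted_bos, 1) == 0)
			quiesce_mm();
	}

	/* Worker side: revalidate, then resume only if no eviction raced
	 * with the revalidation; otherwise leave the counter alone and
	 * let a rescheduled worker run try again. */
	static void restore_userptr_worker(void)
	{
		int seen = atomic_load(&evicted_bos);

		if (!seen)
			return;

		/* ... update and validate user pages here ... */

		if (atomic_compare_exchange_strong(&evicted_bos, &seen, 0))
			resume_mm();
	}

	int main(void)
	{
		evict_userptr();          /* notifier fires: queues stop */
		restore_userptr_worker(); /* worker revalidates, resumes */
		return 0;
	}

The compare-and-swap back to zero is what guarantees that an eviction arriving between revalidation and resume forces another worker pass instead of restarting the queues on stale page addresses; any eviction after a successful swap sees the counter at zero again and becomes a new "first" eviction that quiesces the queues itself.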