From 4727a62ebab2ab4b1476e67c1f8e67a25c70bced Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Fri, 2 Sep 2016 15:01:41 -0400 Subject: [PATCH 1194/4131] drm/amdkfd: Add kgd2kfd schedule_evict_and_restore_process Change-Id: I27860af58c54449a9ba1fc0a04e0436edb7fae8b Signed-off-by: Harish Kasiviswanathan Conflicts: drivers/gpu/drm/amd/include/kgd_kfd_interface.h --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 116 +++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_module.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 11 ++- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 + 4 files changed, 130 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 0ce9572..6acc5fc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "kfd_priv.h" #include "kfd_device_queue_manager.h" #include "kfd_pm4_headers.h" @@ -745,6 +746,42 @@ int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm) return r; } +/* quiesce_process_mm - + * Quiesce all user queues that belongs to given process p + */ +static int quiesce_process_mm(struct kfd_process *p) +{ + struct kfd_process_device *pdd; + int r = 0; + unsigned int n_evicted = 0; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + r = process_evict_queues(pdd->dev->dqm, &pdd->qpd); + if (r != 0) { + pr_err("Failed to evict process queues\n"); + goto fail; + } + n_evicted++; + } + + return r; + +fail: + /* To keep state consistent, roll back partial eviction by + * restoring queues + */ + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + if (n_evicted == 0) + break; + if (process_restore_queues(pdd->dev->dqm, &pdd->qpd)) + pr_err("Failed to restore queues\n"); + + n_evicted--; + } + + return r; +} + /* resume_process_mm - * Resume all user queues that belongs to given process p. The caller must * ensure that process p context is valid. @@ -827,6 +864,85 @@ void kfd_restore_bo_worker(struct work_struct *work) pr_err("Failed to resume user queues\n"); } +/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will + * prepare for safe eviction of KFD BOs that belong to the specified + * process. + * + * @mm: mm_struct that identifies the specified KFD process + * @fence: eviction fence attached to KFD process BOs + * + */ +int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, + struct fence *fence) +{ + struct kfd_process *p; + + if (!fence) + return -EINVAL; + + if (fence_is_signaled(fence)) + return 0; + + p = kfd_lookup_process_by_mm(mm); + if (!p) + return -ENODEV; + + if (work_pending(&p->eviction_work.work)) { + /* It is possible has TTM has lined up couple of BOs of the same + * process to be evicted. Check if the fence is same which + * indicates that previous work item scheduled is not complted + */ + if (p->eviction_work.eviction_fence == fence) + goto out; + else { + WARN(1, "Starting new evict with previous evict is not completed\n"); + cancel_work_sync(&p->eviction_work.work); + } + } + + /* During process initialization eviction_work.work is initialized + * to kfd_evict_bo_worker + */ + p->eviction_work.eviction_fence = fence_get(fence); + schedule_work(&p->eviction_work.work); +out: + kfd_unref_process(p); + return 0; +} + +void kfd_evict_bo_worker(struct work_struct *work) +{ + int ret; + struct kfd_process *p; + struct kfd_eviction_work *eviction_work; + + eviction_work = container_of(work, struct kfd_eviction_work, + work); + + /* Process termination destroys this worker thread. So during the + * lifetime of this thread, kfd_process p will be valid + */ + p = container_of(eviction_work, struct kfd_process, eviction_work); + + /* Narrow window of overlap between restore and evict work item is + * possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos unreserves + * KFD BOs, it is possible to evicted again. But restore has few more + * steps of finish. So lets wait for the restore work to complete + */ + if (delayed_work_pending(&p->restore_work)) + flush_delayed_work(&p->restore_work); + + ret = quiesce_process_mm(p); + if (!ret) { + fence_signal(eviction_work->eviction_fence); + fence_put(eviction_work->eviction_fence); + kfd_schedule_restore_bos_and_queues(p); + } else { + pr_err("Failed to quiesce user queues. Cannot evict BOs\n"); + } + +} + static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, unsigned int chunk_size) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index 914a0cd..42c559b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -47,6 +47,8 @@ static const struct kgd2kfd_calls kgd2kfd = { .restore = kgd2kfd_restore, .quiesce_mm = kgd2kfd_quiesce_mm, .resume_mm = kgd2kfd_resume_mm, + .schedule_evict_and_restore_process = + kgd2kfd_schedule_evict_and_restore_process, }; int sched_policy = KFD_SCHED_POLICY_HWS; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index b3f2665..9dea8f2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -542,12 +542,20 @@ struct qcm_process_device { }; /* KFD Memory Eviction */ +struct kfd_eviction_work { + struct work_struct work; + struct fence *eviction_fence; +}; + /* Appox. wait time before attempting to restore evicted BOs */ #define PROCESS_RESTORE_TIME_MS 2000 /* Approx. back off time if restore fails due to lack of memory */ #define PROCESS_BACK_OFF_TIME_MS 1000 +void kfd_evict_bo_worker(struct work_struct *work); void kfd_restore_bo_worker(struct work_struct *work); +int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, + struct fence *fence); /*8 byte handle containing GPU ID in the most significant 4 bytes and @@ -681,7 +689,8 @@ struct kfd_process { void *master_vm; - /* For restoring BOs after eviction */ + /* Work items for evicting and restoring BOs */ + struct kfd_eviction_work eviction_work; struct delayed_work restore_work; }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 542f0df..54ed2a1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -509,6 +509,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, p = container_of(mn, struct kfd_process, mmu_notifier); BUG_ON(p->mm != mm); + cancel_work_sync(&p->eviction_work.work); cancel_delayed_work_sync(&p->restore_work); mutex_lock(&kfd_processes_mutex); @@ -679,6 +680,7 @@ static struct kfd_process *create_process(const struct task_struct *thread, if (err) goto err_init_cwsr; + INIT_WORK(&process->eviction_work.work, kfd_evict_bo_worker); INIT_DELAYED_WORK(&process->restore_work, kfd_restore_bo_worker); return process; -- 2.7.4