From 909b82ea5625d797f9bde9be6378ba3ee8a55ec5 Mon Sep 17 00:00:00 2001 From: Lan Xiao Date: Fri, 23 Jun 2017 16:06:48 -0400 Subject: [PATCH 1727/4131] drm/amd: Implement parallel memory mapping on mGPUs Alter the KFD-KGD interface to optimize multi-GPU memory mappings to work concurrently instead of sequentially. Return the fences during the process and wait for all of them after the mappings are done. The fences are stored in the associated kgd_mem object. This change also enables interruptible waiting with proper signal handling. Change-Id: I9ae7f4bd54165b14dd5b37df5df6516aa80cba83 Signed-off-by: Lan Xiao --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 4 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 74 +++++++++++++++++------ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 7 +++ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 8 +++ drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 2 + 8 files changed, 78 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index ba1e24c9..924e28a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -64,6 +64,8 @@ struct kgd_mem { struct amdkfd_process_info *process_info; struct page **user_pages; + struct amdgpu_sync sync; + /* flags bitfield */ bool coherent : 1; @@ -190,6 +192,8 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd, }) /* GPUVM API */ +int amdgpu_amdkfd_gpuvm_sync_memory( + struct kgd_dev *kgd, struct kgd_mem *mem, bool intr); int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( struct kgd_dev *kgd, uint64_t va, uint64_t size, void *vm, struct kgd_mem **mem, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 4549dc0..0b2595e 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -216,6 +216,7 @@ static const struct kfd2kgd_calls kfd2kgd = { .read_vmid_from_vmfault_reg = read_vmid_from_vmfault_reg, .write_vmid_invalidate_request = write_vmid_invalidate_request, .invalidate_tlbs = invalidate_tlbs, + .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index 76e3d5d..08da99f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -189,6 +189,7 @@ static const struct kfd2kgd_calls kfd2kgd = { get_atc_vmid_pasid_mapping_valid, .write_vmid_invalidate_request = write_vmid_invalidate_request, .invalidate_tlbs = invalidate_tlbs, + .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index d10d213..42e0094 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -234,6 +234,7 @@ static const struct kfd2kgd_calls kfd2kgd = { get_atc_vmid_pasid_mapping_valid, .write_vmid_invalidate_request = write_vmid_invalidate_request, .invalidate_tlbs = invalidate_tlbs, + .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 8384dfb..475e7fb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -655,6 +655,8 @@ static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, alloc_domain = userptr ? AMDGPU_GEM_DOMAIN_CPU : domain; + amdgpu_sync_create(&(*mem)->sync); + ret = amdgpu_amdkfd_reserve_system_mem_limit(adev, size, alloc_domain); if (ret) { pr_err("Insufficient system memory\n"); @@ -730,7 +732,7 @@ struct bo_vm_reservation_context { struct amdgpu_bo_list_entry *vm_pd; struct ww_acquire_ctx ticket; struct list_head list, duplicates; - struct amdgpu_sync sync; + struct amdgpu_sync *sync; bool reserved; }; @@ -751,7 +753,7 @@ static int reserve_bo_and_vm(struct kgd_mem *mem, ctx->reserved = false; ctx->n_vms = 1; - amdgpu_sync_create(&ctx->sync); + ctx->sync = &mem->sync; INIT_LIST_HEAD(&ctx->list); INIT_LIST_HEAD(&ctx->duplicates); @@ -812,7 +814,7 @@ static int reserve_bo_and_cond_vms(struct kgd_mem *mem, ctx->reserved = false; ctx->n_vms = 0; ctx->vm_pd = NULL; - amdgpu_sync_create(&ctx->sync); + ctx->sync = &mem->sync; INIT_LIST_HEAD(&ctx->list); INIT_LIST_HEAD(&ctx->duplicates); @@ -867,19 +869,27 @@ static int reserve_bo_and_cond_vms(struct kgd_mem *mem, return ret; } -static void unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, - bool wait) +static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, + bool wait, bool intr) { - if (wait) /* FIXME: when called from user context, this needs to be interruptible */ - amdgpu_sync_wait(&ctx->sync, false); + int ret = 0; + + /* An interrupted wait must not skip the cleanup below, or the + * BOs would stay reserved and vm_pd would leak. + */ + if (wait) + ret = amdgpu_sync_wait(ctx->sync, intr); if (ctx->reserved) ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); kfree(ctx->vm_pd); - amdgpu_sync_free(&ctx->sync); + ctx->sync = NULL; + ctx->reserved = false; ctx->vm_pd = NULL; + + return ret; } static int 
unmap_bo_from_gpuvm(struct amdgpu_device *adev, @@ -1051,6 +1061,25 @@ static struct sg_table *create_doorbell_sg(uint64_t addr, uint32_t size) return sg; } +int amdgpu_amdkfd_gpuvm_sync_memory( + struct kgd_dev *kgd, struct kgd_mem *mem, bool intr) +{ + int ret = 0; + struct amdgpu_sync sync; + struct amdgpu_device *adev; + + adev = get_amdgpu_device(kgd); + amdgpu_sync_create(&sync); + + mutex_lock(&mem->lock); + amdgpu_sync_clone(adev, &mem->sync, &sync); + mutex_unlock(&mem->lock); + + ret = amdgpu_sync_wait(&sync, intr); + amdgpu_sync_free(&sync); + return ret; +} + #define BOOL_TO_STR(b) (b == true) ? "true" : "false" int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( @@ -1137,7 +1166,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( struct amdgpu_device *adev; struct kfd_bo_va_list *entry, *tmp; struct bo_vm_reservation_context ctx; - int ret; + int ret = 0; struct ttm_validate_buffer *bo_list_entry; struct amdkfd_process_info *process_info; unsigned long bo_size; @@ -1199,7 +1228,10 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( entry, bo_size); } - unreserve_bo_and_vms(&ctx, false); + ret = unreserve_bo_and_vms(&ctx, false, true); + + /* Free the sync object */ + amdgpu_sync_free(&mem->sync); /* If the SG is not NULL, it's one we created for a doorbell * BO. We need to free it. 
@@ -1213,7 +1245,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( amdgpu_bo_unref(&mem->bo); kfree(mem); - return 0; + return ret; } int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( @@ -1308,7 +1340,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( entry->va, entry->va + bo_size, entry); - ret = map_bo_to_gpuvm(adev, entry, &ctx.sync, + ret = map_bo_to_gpuvm(adev, entry, ctx.sync, is_invalid_userptr); if (ret != 0) { pr_err("Failed to map radeon bo to gpuvm\n"); @@ -1325,7 +1357,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( amdgpu_bo_fence(bo, &kfd_vm->process_info->eviction_fence->base, true); - unreserve_bo_and_vms(&ctx, true); + ret = unreserve_bo_and_vms(&ctx, false, true); mutex_unlock(&mem->process_info->lock); mutex_unlock(&mem->lock); @@ -1338,7 +1370,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( if (bo_va_entry) remove_bo_from_vm(adev, bo_va_entry, bo_size); add_bo_to_vm_failed: - unreserve_bo_and_vms(&ctx, false); + unreserve_bo_and_vms(&ctx, false, false); bo_reserve_failed: mutex_unlock(&mem->process_info->lock); mutex_unlock(&mem->lock); @@ -1569,7 +1601,7 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( entry->va + bo_size, entry); - ret = unmap_bo_from_gpuvm(adev, entry, &ctx.sync); + ret = unmap_bo_from_gpuvm(adev, entry, ctx.sync); if (ret == 0) { entry->is_mapped = false; } else { @@ -1600,7 +1632,7 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( } unreserve_out: - unreserve_bo_and_vms(&ctx, false); + unreserve_bo_and_vms(&ctx, false, false); out: mutex_unlock(&mem->lock); return ret; @@ -2235,6 +2267,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info) struct amdgpu_amdkfd_fence *old_fence; int ret = 0, i; struct list_head duplicate_save; + struct amdgpu_sync sync_obj; INIT_LIST_HEAD(&duplicate_save); INIT_LIST_HEAD(&ctx.list); @@ -2287,7 +2320,8 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info) if (!list_empty(&duplicate_save)) pr_err("BUG: list of BOs to reserve has duplicates!\n"); - amdgpu_sync_create(&ctx.sync); + 
amdgpu_sync_create(&sync_obj); + ctx.sync = &sync_obj; /* Validate PDs and PTs */ ret = process_validate_vms(process_info); @@ -2322,7 +2356,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info) ret = update_gpuvm_pte((struct amdgpu_device *) bo_va_entry->kgd_dev, bo_va_entry, - &ctx.sync); + ctx.sync); if (ret) { pr_debug("Memory eviction: update PTE failed. Try again\n"); goto validate_map_fail; @@ -2330,7 +2364,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info) } } - amdgpu_sync_wait(&ctx.sync, false); + amdgpu_sync_wait(ctx.sync, false); /* Wait for validate to finish and attach new eviction fence */ list_for_each_entry(mem, &process_info->kfd_bo_list, @@ -2350,7 +2384,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info) } validate_map_fail: ttm_eu_backoff_reservation(&ctx.ticket, &ctx.list); - amdgpu_sync_free(&ctx.sync); + amdgpu_sync_free(&sync_obj); ttm_reserve_fail: mutex_unlock(&process_info->lock); evict_fence_fail: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 64a4373..dbc3afd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1398,6 +1398,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, pr_err("Failed to map\n"); } + err = dev->kfd2kgd->sync_memory(dev->kgd, (struct kgd_mem *) mem, true); + if (err) { + pr_debug("Sync memory failed, wait interrupted by user signal\n"); + goto sync_memory_failed; + } + if (args->device_ids_array_size > 0 && devices_arr) kfree(devices_arr); @@ -1407,6 +1413,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, up_write(&p->lock); get_mem_obj_from_handle_failed: copy_from_user_failed: +sync_memory_failed: kfree(devices_arr); return err; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index d1ef118..f5e2282 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ 
-136,6 +136,13 @@ static int kfd_process_alloc_gpuvm(struct kfd_process *p, if (err) goto err_map_mem; + err = kdev->kfd2kgd->sync_memory(kdev->kgd, (struct kgd_mem *) mem, + true); + if (err) { + pr_debug("Sync memory failed, wait interrupted by user signal\n"); + goto sync_memory_failed; + } + /* Create an obj handle so kfd_process_device_remove_obj_handle * will take care of the bo removal when the process finishes. * We do not need to take p->lock, because the process is just @@ -151,6 +158,7 @@ static int kfd_process_alloc_gpuvm(struct kfd_process *p, return err; free_gpuvm: +sync_memory_failed: kfd_process_free_gpuvm(mem, pdd); return err; diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index c0c1cc7..1364429 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -337,6 +337,8 @@ struct kfd2kgd_calls { int (*invalidate_tlbs)(struct kgd_dev *kgd, uint16_t pasid); + int (*sync_memory)(struct kgd_dev *kgd, struct kgd_mem *mem, bool intr); + int (*alloc_memory_of_gpu)(struct kgd_dev *kgd, uint64_t va, uint64_t size, void *vm, struct kgd_mem **mem, uint64_t *offset, -- 2.7.4