From 817ccd6f0987f83ddbf989602f0fbf320157f0a9 Mon Sep 17 00:00:00 2001 From: Chaudhary Amit Kumar Date: Thu, 18 Oct 2018 12:42:04 +0530 Subject: [PATCH 1353/4131] compilation fix for amdkfd porting Signed-off-by: Sanjay R Mehta Signed-off-by: Chaudhary Amit Kumar --- drivers/gpu/drm/amd/amdgpu/Makefile | 8 +- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 346 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 185 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c | 196 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 537 ++++- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 590 ++++- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h | 62 + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 1227 ++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2578 +++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 - drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 3 + drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 46 +- drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h | 2 + drivers/gpu/drm/amd/amdgpu/soc15d.h | 1 + drivers/gpu/drm/amd/amdgpu/vid.h | 2 + drivers/gpu/drm/amd/amdkfd/Makefile | 2 - drivers/gpu/drm/amd/amdkfd/backport/backport.h | 7 - drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 - drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 4 - drivers/gpu/drm/amd/amdkfd/kfd_device.c | 12 - drivers/gpu/drm/amd/amdkfd/kfd_events.c | 28 - drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 10 - drivers/gpu/drm/amd/amdkfd/kfd_ipc.c | 8 - drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 4 - drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 4 - drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 4 - drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 2 - drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 52 +- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 22 - drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 6 - drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 - drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 231 +- drivers/gpu/drm/amd/include/v9_structs.h | 48 +- drivers/gpu/drm/amd/include/vi_structs.h | 2 + drivers/pci/pci.c | 81 + include/drm/amd_rdma.h | 70 + include/linux/pci.h | 1 + include/uapi/linux/kfd_ioctl.h | 442 +++- include/uapi/linux/pci_regs.h | 3 + kernel/fork.c | 1 + 44 files changed, 6315 insertions(+), 537 deletions(-) mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/Makefile mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu.h mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h mode change 100644 => 100755 
drivers/gpu/drm/amd/amdgpu/soc15d.h mode change 100644 => 100755 drivers/gpu/drm/amd/amdgpu/vid.h mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/Makefile mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_priv.h mode change 100644 => 100755 drivers/gpu/drm/amd/include/kgd_kfd_interface.h mode change 100644 => 100755 drivers/gpu/drm/amd/include/v9_structs.h mode change 100644 => 100755 drivers/gpu/drm/amd/include/vi_structs.h mode change 100644 => 100755 drivers/pci/pci.c create mode 100644 include/drm/amd_rdma.h mode change 100644 => 100755 include/linux/pci.h mode change 100644 => 100755 include/uapi/linux/pci_regs.h diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile old mode 100644 new mode 100755 index 57b8d5f..6b373d0 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -32,12 +32,11 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \ amdgpu_prime.o amdgpu_vm.o amdgpu_ib.o amdgpu_pll.o \ amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \ amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \ - amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o + amdgpu_queue_mgr.o amdgpu_vf_error.o amdgpu_sem.o amdgpu_amdkfd_fence.o # add asic specific block amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \ ci_smc.o ci_dpm.o dce_v8_0.o gfx_v7_0.o cik_sdma.o uvd_v4_2.o vce_v2_0.o \ - amdgpu_amdkfd_gfx_v7.o amdgpu-$(CONFIG_DRM_AMDGPU_SI)+= si.o gmc_v6_0.o gfx_v6_0.o si_ih.o si_dma.o dce_v6_0.o si_dpm.o si_smc.o @@ -109,7 +108,10 @@ amdgpu-y += \ # add amdkfd interfaces amdgpu-y += \ amdgpu_amdkfd.o \ - amdgpu_amdkfd_gfx_v8.o + amdgpu_amdkfd_gfx_v7.o \ + amdgpu_amdkfd_gfx_v8.o \ + amdgpu_amdkfd_gfx_v9.o \ + amdgpu_amdkfd_gpuvm.o # add cgs amdgpu-y += amdgpu_cgs.o diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h old mode 100644 new mode 100755 index fe23de8..bcf95e7 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -184,6 +184,7 @@ struct amdgpu_cs_parser; struct amdgpu_job; struct amdgpu_irq_src; struct amdgpu_fpriv; +struct kfd_vm_fault_info; struct amdgpu_bo_va_mapping; enum amdgpu_cp_irq { @@ -403,6 +404,7 @@ struct amdgpu_gem_object { struct amdgpu_bo *bo; }; +struct kgd_mem; #define gem_to_amdgpu_bo(gobj) container_of((gobj), struct amdgpu_gem_object, base)->bo void amdgpu_gem_object_free(struct drm_gem_object *obj); @@ -543,6 +545,9 @@ struct amdgpu_mc { u64 private_aperture_end; /* protects concurrent invalidation */ spinlock_t invalidate_lock; + + struct kfd_vm_fault_info *vm_fault_info; + atomic_t vm_fault_info_updated; }; /* @@ -961,6 +966,7 @@ struct amdgpu_gfx_config { }; struct amdgpu_cu_info { + uint32_t simd_per_cu; uint32_t max_waves_per_simd; uint32_t wave_front_size; uint32_t max_scratch_slots_per_cu; @@ -1649,6 +1655,7 @@ struct amdgpu_device { /* record hw reset is performed */ bool has_hw_reset; u8 reset_magic[AMDGPU_RESET_MAGIC_NUM]; + spinlock_t tlb_invalidation_lock; /* record last mm index being written through WREG32*/ unsigned long last_mm_index; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c old mode 100644 new mode 100755 index 7ec1915..ec8141f --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -20,23 +20,29 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ +#undef pr_fmt +#define pr_fmt(fmt) "kfd2kgd: " fmt + #include "amdgpu_amdkfd.h" -#include "amd_shared.h" +#include #include #include "amdgpu.h" #include "amdgpu_gfx.h" #include -const struct kfd2kgd_calls *kfd2kgd; +#define AMDKFD_SKIP_UNCOMPILED_CODE 1 + const struct kgd2kfd_calls *kgd2kfd; -bool (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**); +bool (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**); + +unsigned int global_compute_vmid_bitmap = 0xFF00; int amdgpu_amdkfd_init(void) { int ret; #if defined(CONFIG_HSA_AMD_MODULE) - int (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**); + int (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**); kgd2kfd_init_p = symbol_request(kgd2kfd_init); @@ -57,56 +63,68 @@ int amdgpu_amdkfd_init(void) #else ret = -ENOENT; #endif - + amdgpu_amdkfd_gpuvm_init_mem_limits(); return ret; } -bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev) +void amdgpu_amdkfd_fini(void) { + if (kgd2kfd) { + kgd2kfd->exit(); + symbol_put(kgd2kfd_init); + } +} + +void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev) +{ + const struct kfd2kgd_calls *kfd2kgd; + + if (!kgd2kfd) + return; + switch (adev->asic_type) { #ifdef CONFIG_DRM_AMDGPU_CIK case CHIP_KAVERI: + case CHIP_HAWAII: kfd2kgd = amdgpu_amdkfd_gfx_7_get_functions(); break; #endif case CHIP_CARRIZO: + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions(); break; + case CHIP_VEGA10: + case CHIP_RAVEN: + kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions(); + break; default: - return false; - } - - return true; -} - -void amdgpu_amdkfd_fini(void) -{ - if (kgd2kfd) { - kgd2kfd->exit(); - symbol_put(kgd2kfd_init); + dev_info(adev->dev, "kfd not supported on this ASIC\n"); + return; } -} -void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev) -{ - if (kgd2kfd) - adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev, - adev->pdev, kfd2kgd); + adev->kfd = kgd2kfd->probe((struct kgd_dev *)adev, + adev->pdev, kfd2kgd); } void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) { int i; int last_valid_bit; + if (adev->kfd) { struct kgd2kfd_shared_resources gpu_resources = { - .compute_vmid_bitmap = 0xFF00, + .compute_vmid_bitmap = global_compute_vmid_bitmap, .num_pipe_per_mec = adev->gfx.mec.num_pipe_per_mec, - .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe + .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe, + .gpuvm_size = (uint64_t)amdgpu_vm_size << 30 }; /* this is going to have a few of the MSBs set that we need to - * clear */ + * clear + */ bitmap_complement(gpu_resources.queue_bitmap, adev->gfx.mec.queue_bitmap, KGD_MAX_QUEUES); @@ -120,7 +138,8 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) gpu_resources.queue_bitmap); /* According to linux/bitmap.h we shouldn't use bitmap_clear if - * nbits is not compile time constant */ + * nbits is not compile time constant + */ last_valid_bit = 1 /* only first MEC can have compute queues */ * adev->gfx.mec.num_pipe_per_mec * adev->gfx.mec.num_queue_per_pipe; @@ -131,6 +150,28 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) &gpu_resources.doorbell_physical_address, &gpu_resources.doorbell_aperture_size, &gpu_resources.doorbell_start_offset); + if (adev->asic_type >= CHIP_VEGA10) { + /* On SOC15 the BIF is involved in routing + * doorbells using the low 12 bits of the + * address. Communicate the assignments to + * KFD. 
KFD uses two doorbell pages per + * process in case of 64-bit doorbells so we + * can use each doorbell assignment twice. + */ + gpu_resources.sdma_doorbell[0][0] = + AMDGPU_DOORBELL64_sDMA_ENGINE0; + gpu_resources.sdma_doorbell[0][1] = + AMDGPU_DOORBELL64_sDMA_ENGINE0 + 0x200; + gpu_resources.sdma_doorbell[1][0] = + AMDGPU_DOORBELL64_sDMA_ENGINE1; + gpu_resources.sdma_doorbell[1][1] = + AMDGPU_DOORBELL64_sDMA_ENGINE1 + 0x200; + /* Doorbells 0x0f0-0ff and 0x2f0-2ff are reserved for + * SDMA, IH and VCN. So don't use them for the CP. + */ + gpu_resources.reserved_doorbell_mask = 0x1f0; + gpu_resources.reserved_doorbell_val = 0x0f0; + } kgd2kfd->device_init(adev->kfd, &gpu_resources); } @@ -167,24 +208,81 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev) return r; } +int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, + uint32_t vmid, uint64_t gpu_addr, + uint32_t *ib_cmd, uint32_t ib_len) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + struct amdgpu_job *job; + struct amdgpu_ib *ib; + struct amdgpu_ring *ring; + struct dma_fence *f = NULL; + int ret; + + switch (engine) { + case KGD_ENGINE_MEC1: + ring = &adev->gfx.compute_ring[0]; + break; + case KGD_ENGINE_SDMA1: + ring = &adev->sdma.instance[0].ring; + break; + case KGD_ENGINE_SDMA2: + ring = &adev->sdma.instance[1].ring; + break; + default: + pr_err("Invalid engine in IB submission: %d\n", engine); + ret = -EINVAL; + goto err; + } + + ret = amdgpu_job_alloc(adev, 1, &job, NULL); + if (ret) + goto err; + + ib = &job->ibs[0]; + memset(ib, 0, sizeof(struct amdgpu_ib)); + + ib->gpu_addr = gpu_addr; + ib->ptr = ib_cmd; + ib->length_dw = ib_len; + /* This works for NO_HWS. TODO: need to handle without knowing VMID */ + job->vm_id = vmid; + + ret = amdgpu_ib_schedule(ring, 1, ib, job, &f); + if (ret) { + DRM_ERROR("amdgpu: failed to schedule IB.\n"); + goto err_ib_sched; + } + + ret = dma_fence_wait(f, false); + +err_ib_sched: + dma_fence_put(f); + amdgpu_job_free(job); +err: + return ret; +} + +u32 pool_to_domain(enum kgd_memory_pool p) +{ + switch (p) { + case KGD_POOL_FRAMEBUFFER: return AMDGPU_GEM_DOMAIN_VRAM; + default: return AMDGPU_GEM_DOMAIN_GTT; + } +} + int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, void **mem_obj, uint64_t *gpu_addr, void **cpu_ptr) { struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - struct kgd_mem **mem = (struct kgd_mem **) mem_obj; + struct amdgpu_bo *bo = NULL; int r; - - BUG_ON(kgd == NULL); - BUG_ON(gpu_addr == NULL); - BUG_ON(cpu_ptr == NULL); - - *mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL); - if ((*mem) == NULL) - return -ENOMEM; + uint64_t gpu_addr_tmp = 0; + void *cpu_ptr_tmp = NULL; r = amdgpu_bo_create(adev, size, PAGE_SIZE, true, AMDGPU_GEM_DOMAIN_GTT, - AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, &(*mem)->bo); + AMDGPU_GEM_CREATE_CPU_GTT_USWC, NULL, NULL, 0, &bo); if (r) { dev_err(adev->dev, "failed to allocate BO for amdkfd (%d)\n", r); @@ -192,64 +290,87 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, } /* map the buffer */ - r = amdgpu_bo_reserve((*mem)->bo, true); + r = amdgpu_bo_reserve(bo, true); if (r) { dev_err(adev->dev, "(%d) failed to reserve bo for amdkfd\n", r); goto allocate_mem_reserve_bo_failed; } - r = amdgpu_bo_pin((*mem)->bo, AMDGPU_GEM_DOMAIN_GTT, - &(*mem)->gpu_addr); + r = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT, + &gpu_addr_tmp); if (r) { dev_err(adev->dev, "(%d) failed to pin bo for amdkfd\n", r); goto allocate_mem_pin_bo_failed; } - *gpu_addr = (*mem)->gpu_addr; - r = amdgpu_bo_kmap((*mem)->bo, 
&(*mem)->cpu_ptr); + r = amdgpu_bo_kmap(bo, &cpu_ptr_tmp); if (r) { dev_err(adev->dev, "(%d) failed to map bo to kernel for amdkfd\n", r); goto allocate_mem_kmap_bo_failed; } - *cpu_ptr = (*mem)->cpu_ptr; - amdgpu_bo_unreserve((*mem)->bo); + *mem_obj = bo; + *gpu_addr = gpu_addr_tmp; + *cpu_ptr = cpu_ptr_tmp; + + amdgpu_bo_unreserve(bo); return 0; allocate_mem_kmap_bo_failed: - amdgpu_bo_unpin((*mem)->bo); + amdgpu_bo_unpin(bo); allocate_mem_pin_bo_failed: - amdgpu_bo_unreserve((*mem)->bo); + amdgpu_bo_unreserve(bo); allocate_mem_reserve_bo_failed: - amdgpu_bo_unref(&(*mem)->bo); + amdgpu_bo_unref(&bo); return r; } void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj) { - struct kgd_mem *mem = (struct kgd_mem *) mem_obj; + struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj; - BUG_ON(mem == NULL); - - amdgpu_bo_reserve(mem->bo, true); - amdgpu_bo_kunmap(mem->bo); - amdgpu_bo_unpin(mem->bo); - amdgpu_bo_unreserve(mem->bo); - amdgpu_bo_unref(&(mem->bo)); - kfree(mem); + amdgpu_bo_reserve(bo, true); + amdgpu_bo_kunmap(bo); + amdgpu_bo_unpin(bo); + amdgpu_bo_unreserve(bo); + amdgpu_bo_unref(&(bo)); } -uint64_t get_vmem_size(struct kgd_dev *kgd) +void get_local_mem_info(struct kgd_dev *kgd, + struct kfd_local_mem_info *mem_info) { - struct amdgpu_device *adev = - (struct amdgpu_device *)kgd; + uint64_t address_mask; + resource_size_t aper_limit; + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - BUG_ON(kgd == NULL); + address_mask = adev->dev->dma_mask ? ~*adev->dev->dma_mask : + ~((1ULL << 32) - 1); + aper_limit = adev->mc.aper_base + adev->mc.aper_size; + + memset(mem_info, 0, sizeof(*mem_info)); + if (!(adev->mc.aper_base & address_mask || + aper_limit & address_mask)) { + mem_info->local_mem_size_public = adev->mc.visible_vram_size; + mem_info->local_mem_size_private = adev->mc.real_vram_size - + adev->mc.visible_vram_size; + } else { + mem_info->local_mem_size_public = 0; + mem_info->local_mem_size_private = adev->mc.real_vram_size; + } + mem_info->vram_width = adev->mc.vram_width; - return adev->mc.real_vram_size; + pr_debug("Address base: 0x%llx limit 0x%llx public 0x%llx private 0x%llx\n", + adev->mc.aper_base, aper_limit, + mem_info->local_mem_size_public, + mem_info->local_mem_size_private); + + if (amdgpu_sriov_vf(adev)) + mem_info->mem_clk_max = adev->clock.default_mclk / 100; + else + mem_info->mem_clk_max = amdgpu_dpm_get_mclk(adev, false) / 100; } uint64_t get_gpu_clock_counter(struct kgd_dev *kgd) @@ -271,3 +392,106 @@ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd) return amdgpu_dpm_get_sclk(adev, false) / 100; } + +void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + struct amdgpu_cu_info acu_info = adev->gfx.cu_info; + + memset(cu_info, 0, sizeof(*cu_info)); + if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap)) + return; + + cu_info->cu_active_number = acu_info.number; + cu_info->cu_ao_mask = acu_info.ao_cu_mask; + memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0], + sizeof(acu_info.bitmap)); + cu_info->num_shader_engines = adev->gfx.config.max_shader_engines; + cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se; + cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh; + cu_info->simd_per_cu = acu_info.simd_per_cu; + cu_info->max_waves_per_simd = acu_info.max_waves_per_simd; + cu_info->wave_front_size = acu_info.wave_front_size; + cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu; + cu_info->lds_size = acu_info.lds_size; +} + 
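With this rework the mem_obj handle returned to callers is simply the struct amdgpu_bo; only alloc_gtt_mem()/free_gtt_mem() look inside it. As a minimal usage sketch, assuming only the declarations from amdgpu_amdkfd.h in this patch (the helper name below is hypothetical, not actual KFD code):

	/* Illustrative only: allocate a page of GTT memory through the
	 * kfd2kgd helpers, touch its CPU mapping, and release it again.
	 */
	static int example_gtt_alloc_release(struct kgd_dev *kgd)
	{
		void *mem_obj;	/* opaque handle; really the amdgpu_bo */
		uint64_t gpu_addr;
		void *cpu_ptr;
		int r;

		r = alloc_gtt_mem(kgd, PAGE_SIZE, &mem_obj, &gpu_addr, &cpu_ptr);
		if (r)
			return r;

		memset(cpu_ptr, 0, PAGE_SIZE);	/* kmap stays valid until freed */
		/* ... program gpu_addr into the hardware as needed ... */

		free_gtt_mem(kgd, mem_obj);
		return 0;
	}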
+int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd, + struct kgd_dev **dma_buf_kgd, + uint64_t *bo_size, void *metadata_buffer, + size_t buffer_size, uint32_t *metadata_size, + uint32_t *flags) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + struct dma_buf *dma_buf; + struct drm_gem_object *obj; + struct amdgpu_bo *bo; + uint64_t metadata_flags; + int r = -EINVAL; + + dma_buf = dma_buf_get(dma_buf_fd); + if (IS_ERR(dma_buf)) + return PTR_ERR(dma_buf); + + if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) + /* Can't handle non-graphics buffers */ + goto out_put; + + obj = dma_buf->priv; + if (obj->dev->driver != adev->ddev->driver) + /* Can't handle buffers from different drivers */ + goto out_put; + + adev = obj->dev->dev_private; + bo = gem_to_amdgpu_bo(obj); + if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | + AMDGPU_GEM_DOMAIN_GTT | + AMDGPU_GEM_DOMAIN_DGMA))) + /* Only VRAM, GTT and DGMA BOs are supported */ + goto out_put; + + r = 0; + if (dma_buf_kgd) + *dma_buf_kgd = (struct kgd_dev *)adev; + if (bo_size) + *bo_size = amdgpu_bo_size(bo); + if (metadata_size) + *metadata_size = bo->metadata_size; + if (metadata_buffer) + r = amdgpu_bo_get_metadata(bo, metadata_buffer, buffer_size, + metadata_size, &metadata_flags); + if (flags) { + /* If the preferred domain is DGMA, set flags to VRAM because + * KFD doesn't support allocating DGMA memory + */ + *flags = (bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | + AMDGPU_GEM_DOMAIN_DGMA)) ? + ALLOC_MEM_FLAGS_VRAM : ALLOC_MEM_FLAGS_GTT; + + if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) + *flags |= ALLOC_MEM_FLAGS_PUBLIC; + } + +out_put: + dma_buf_put(dma_buf); + return r; +} + +uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + uint64_t usage = + amdgpu_vram_mgr_usage(&adev->mman.bdev.man[TTM_PL_VRAM]); + return usage; +} + +bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, + u32 vmid) +{ + if (adev->kfd) { + if ((1 << vmid) & global_compute_vmid_bitmap) + return true; + } + + return false; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h old mode 100644 new mode 100755 index 6d3a10b..b259ba7 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -27,20 +27,109 @@ #include #include +#include +#include #include +#include "amdgpu.h" + +extern const struct kgd2kfd_calls *kgd2kfd; struct amdgpu_device; +struct kfd_bo_va_list { + struct list_head bo_list; + struct amdgpu_bo_va *bo_va; + void *kgd_dev; + bool is_mapped; + bool map_fail; + uint64_t va; + uint64_t pte_flags; +}; + struct kgd_mem { + struct mutex lock; struct amdgpu_bo *bo; - uint64_t gpu_addr; - void *cpu_ptr; + struct list_head bo_va_list; + /* protected by amdkfd_process_info.lock */ + struct ttm_validate_buffer validate_list; + struct ttm_validate_buffer resv_list; + uint32_t domain; + unsigned int mapped_to_gpu_memory; + void *kptr; + uint64_t va; + + uint32_t mapping_flags; + + atomic_t invalid; + struct amdkfd_process_info *process_info; + struct page **user_pages; + + struct amdgpu_sync sync; + + /* flags bitfield */ + bool coherent : 1; + bool no_substitute : 1; + bool aql_queue : 1; +}; + +/* KFD Memory Eviction */ +struct amdgpu_amdkfd_fence { + struct dma_fence base; + void *mm; + spinlock_t lock; + char timeline_name[TASK_COMM_LEN]; +}; + +struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context, + void *mm); +bool 
amd_kfd_fence_check_mm(struct dma_fence *f, void *mm); +struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f); + +struct amdkfd_process_info { + /* List head of all VMs that belong to a KFD process */ + struct list_head vm_list_head; + /* List head for all KFD BOs that belong to a KFD process. */ + struct list_head kfd_bo_list; + /* List of userptr BOs that are valid or invalid */ + struct list_head userptr_valid_list; + struct list_head userptr_inval_list; + /* Lock to protect kfd_bo_list */ + struct mutex lock; + + /* Number of VMs */ + unsigned int n_vms; + /* Eviction Fence */ + struct amdgpu_amdkfd_fence *eviction_fence; + + /* MMU-notifier related fields */ + atomic_t evicted_bos; + struct delayed_work work; + struct pid *pid; +}; + +/* struct amdkfd_vm - + * For Memory Eviction KGD requires a mechanism to keep track of all KFD BOs + * belonging to a KFD process. All the VMs belonging to the same process point + * to the same amdkfd_process_info. + */ +struct amdkfd_vm { + /* Keep base as the first parameter for pointer compatibility between + * amdkfd_vm and amdgpu_vm. + */ + struct amdgpu_vm base; + + /* List node in amdkfd_process_info.vm_list_head*/ + struct list_head vm_list_node; + + struct amdgpu_device *adev; + /* Points to the KFD process VM info*/ + struct amdkfd_process_info *process_info; }; + int amdgpu_amdkfd_init(void); void amdgpu_amdkfd_fini(void); -bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev); void amdgpu_amdkfd_suspend(struct amdgpu_device *adev); int amdgpu_amdkfd_resume(struct amdgpu_device *adev); @@ -50,17 +139,105 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); void amdgpu_amdkfd_device_init(struct amdgpu_device *adev); void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev); +int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm); +int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, + uint32_t vmid, uint64_t gpu_addr, + uint32_t *ib_cmd, uint32_t ib_len); +int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, + struct dma_fence **ef); struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void); struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void); +struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void); +int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem, + uint64_t src_offset, struct kgd_mem *dst_mem, + uint64_t dest_offset, uint64_t size, struct dma_fence **f, + uint64_t *actual_size); + +bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, + u32 vmid); /* Shared API */ +int map_bo(struct amdgpu_device *rdev, uint64_t va, void *vm, + struct amdgpu_bo *bo, struct amdgpu_bo_va **bo_va); int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, void **mem_obj, uint64_t *gpu_addr, void **cpu_ptr); void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj); -uint64_t get_vmem_size(struct kgd_dev *kgd); +void get_local_mem_info(struct kgd_dev *kgd, + struct kfd_local_mem_info *mem_info); uint64_t get_gpu_clock_counter(struct kgd_dev *kgd); uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd); +void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info); +int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd, + struct kgd_dev **dmabuf_kgd, + uint64_t *bo_size, void *metadata_buffer, + size_t buffer_size, uint32_t *metadata_size, + uint32_t *flags); +uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd); + +#define read_user_wptr(mmptr, wptr, dst) \ + ({ \ + bool valid = false; \ 
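+		/* Read *wptr only when it is safe to do so: either the	\
+		 * caller is already running in the target mm, or it is a	\
+		 * kernel thread (current->mm == NULL) that temporarily	\
+		 * adopts mmptr via use_mm()/unuse_mm().			\
+		 */								\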
+ if ((mmptr) && (wptr)) { \ + if ((mmptr) == current->mm) { \ + valid = !get_user((dst), (wptr)); \ + } else if (current->mm == NULL) { \ + use_mm(mmptr); \ + valid = !get_user((dst), (wptr)); \ + unuse_mm(mmptr); \ + } \ + } \ + valid; \ + }) + +/* GPUVM API */ +int amdgpu_amdkfd_gpuvm_sync_memory( + struct kgd_dev *kgd, struct kgd_mem *mem, bool intr); +int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( + struct kgd_dev *kgd, uint64_t va, uint64_t size, + void *vm, struct kgd_mem **mem, + uint64_t *offset, uint32_t flags); +int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( + struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); +int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( + struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); +int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( + struct kgd_dev *kgd, struct kgd_mem *mem, void *vm); +int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm, + void **process_info, + struct dma_fence **ef); +void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm); + +uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm); + +int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, + struct kfd_vm_fault_info *info); + +int amdgpu_amdkfd_gpuvm_mmap_bo( + struct kgd_dev *kgd, struct vm_area_struct *vma); + +int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, + struct kgd_mem *mem, void **kptr); + +int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, + struct kgd_mem *mem, uint64_t offset, + uint64_t size, struct sg_table **ret_sg); +void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( + struct kgd_mem *mem, struct sg_table *sg); +int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, + struct dma_buf *dmabuf, + uint64_t va, void *vm, + struct kgd_mem **mem, uint64_t *size, + uint64_t *mmap_offset); +int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm, + struct kgd_mem *mem, + struct dma_buf **dmabuf); +int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm); +int amdgpu_amdkfd_gpuvm_restore_mem(struct kgd_mem *mem, struct mm_struct *mm); + +void amdgpu_amdkfd_gpuvm_init_mem_limits(void); +void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo); #endif /* AMDGPU_AMDKFD_H_INCLUDED */ + diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c new file mode 100644 index 0000000..3961937 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c @@ -0,0 +1,196 @@ +/* + * Copyright 2016 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "amdgpu_amdkfd.h" + +const struct dma_fence_ops amd_kfd_fence_ops; +static atomic_t fence_seq = ATOMIC_INIT(0); + +static int amd_kfd_fence_signal(struct dma_fence *f); + +/* Eviction Fence + * Fence helper functions to deal with KFD memory eviction. + * Big Idea - Since KFD submissions are done by user queues, a BO cannot be + * evicted unless all the user queues for that process are evicted. + * + * All the BOs in a process share an eviction fence. When process X wants + * to map VRAM memory but TTM can't find enough space, TTM will attempt to + * evict BOs from its LRU list. TTM checks if the BO is valuable to evict + * by calling ttm_bo_driver->eviction_valuable(). + * + * ttm_bo_driver->eviction_valuable() - will return false if the BO belongs + * to process X. Otherwise, it will return true to indicate BO can be + * evicted by TTM. + * + * If ttm_bo_driver->eviction_valuable returns true, then TTM will continue + * the evcition process for that BO by calling ttm_bo_evict --> amdgpu_bo_move + * --> amdgpu_copy_buffer(). This sets up job in GPU scheduler. + * + * GPU Scheduler (amd_sched_main) - sets up a cb (fence_add_callback) to + * nofity when the BO is free to move. fence_add_callback --> enable_signaling + * --> amdgpu_amdkfd_fence.enable_signaling + * + * amdgpu_amdkfd_fence.enable_signaling - Start a work item that will quiesce + * user queues and signal fence. The work item will also start another delayed + * work item to restore BOs + */ + +struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context, + void *mm) +{ + struct amdgpu_amdkfd_fence *fence = NULL; + + fence = kzalloc(sizeof(*fence), GFP_KERNEL); + if (fence == NULL) + return NULL; + + /* mm_struct mm is used as void pointer to identify the parent + * KFD process. Don't dereference it. Fence and any threads using + * mm is guranteed to be released before process termination. + */ + fence->mm = mm; + get_task_comm(fence->timeline_name, current); + spin_lock_init(&fence->lock); + + dma_fence_init(&fence->base, &amd_kfd_fence_ops, &fence->lock, + context, atomic_inc_return(&fence_seq)); + + return fence; +} + +struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f) +{ + struct amdgpu_amdkfd_fence *fence; + + if (!f) + return NULL; + + fence = container_of(f, struct amdgpu_amdkfd_fence, base); + if (fence && f->ops == &amd_kfd_fence_ops) + return fence; + + return NULL; +} + +static const char *amd_kfd_fence_get_driver_name(struct dma_fence *f) +{ + return "amdgpu_amdkfd_fence"; +} + +static const char *amd_kfd_fence_get_timeline_name(struct dma_fence *f) +{ + struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); + + return fence->timeline_name; +} + +/** + * amd_kfd_fence_enable_signaling - This gets called when TTM wants to evict + * a KFD BO and schedules a job to move the BO. + * If fence is already signaled return true. + * If fence is not signaled schedule a evict KFD process work item. 
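+ * The queue quiescing and the later BO restore are driven by
+ * kgd2kfd->schedule_evict_and_restore_process(), which is handed the
+ * fence's mm and the fence itself below.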
+ */ +static bool amd_kfd_fence_enable_signaling(struct dma_fence *f) +{ + struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); + + if (!fence) + return false; + + if (dma_fence_is_signaled(f)) + return true; + + if (!kgd2kfd->schedule_evict_and_restore_process( + (struct mm_struct *)fence->mm, f)) + return true; + + return false; +} + +static int amd_kfd_fence_signal(struct dma_fence *f) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(f->lock, flags); + /* Set enabled bit so cb will called */ + set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &f->flags); + ret = dma_fence_signal_locked(f); + spin_unlock_irqrestore(f->lock, flags); + + return ret; +} + +/** + * amd_kfd_fence_release - callback that fence can be freed + * + * @fence: fence + * + * This function is called when the reference count becomes zero. + * It just RCU schedules freeing up the fence. +*/ +static void amd_kfd_fence_release(struct dma_fence *f) +{ + struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); + /* Unconditionally signal the fence. The process is getting + * terminated. + */ + if (WARN_ON(!fence)) + return; /* Not an amdgpu_amdkfd_fence */ + + amd_kfd_fence_signal(f); + kfree_rcu(f, rcu); +} + +/** + * amd_kfd_fence_check_mm - Check if @mm is same as that of the fence @f + * if same return TRUE else return FALSE. + * + * @f: [IN] fence + * @mm: [IN] mm that needs to be verified +*/ +bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm) +{ + struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); + + if (!fence) + return false; + else if (fence->mm == mm) + return true; + + return false; +} + +const struct dma_fence_ops amd_kfd_fence_ops = { + .get_driver_name = amd_kfd_fence_get_driver_name, + .get_timeline_name = amd_kfd_fence_get_timeline_name, + .enable_signaling = amd_kfd_fence_enable_signaling, + .signaled = NULL, + .wait = dma_fence_default_wait, + .release = amd_kfd_fence_release, +}; + diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c old mode 100644 new mode 100755 index 5748504..6964ece --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -20,6 +20,9 @@ * OTHER DEALINGS IN THE SOFTWARE. */ +#undef pr_fmt +#define pr_fmt(fmt) "kfd2kgd: " fmt + #include #include #include @@ -39,6 +42,14 @@ #include "gmc/gmc_7_1_sh_mask.h" #include "cik_structs.h" +#define AMDKFD_SKIP_UNCOMPILED_CODE 1 + +enum hqd_dequeue_request_type { + NO_ACTION = 0, + DRAIN_PIPE, + RESET_WAVES +}; + enum { MAX_TRAPID = 8, /* 3 bits in the bitfield. 
*/ MAX_WATCH_ADDRESSES = 4 @@ -55,8 +66,8 @@ enum { enum { ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL, ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF, - ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000, - /* extend the mask to 26 bits to match the low address field */ + ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENTION = 0x03000000, + /* extend the mask to 26 bits in order to match the low address field */ ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6, ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF }; @@ -81,30 +92,42 @@ union TCP_WATCH_CNTL_BITS { float f32All; }; +static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, + int fd, uint32_t handle, struct kgd_mem **mem); + +static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); + /* * Register access functions */ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, - uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, - uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); - + uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, + uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, - unsigned int vmid); - + unsigned int vmid); static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t hpd_size, uint64_t hpd_gpu_addr); + uint32_t hpd_size, uint64_t hpd_gpu_addr); static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr); -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); + uint32_t queue_id, uint32_t __user *wptr, + uint32_t wptr_shift, uint32_t wptr_mask, + struct mm_struct *mm); +static int kgd_hqd_dump(struct kgd_dev *kgd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs); +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, + uint32_t __user *wptr, struct mm_struct *mm); +static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, + uint32_t engine_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs); static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - uint32_t pipe_id, uint32_t queue_id); - -static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, + uint32_t pipe_id, uint32_t queue_id); +static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); +static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, + enum kfd_preempt_type reset_type, unsigned int utimeout, uint32_t pipe_id, uint32_t queue_id); -static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, unsigned int utimeout); static int kgd_address_watch_disable(struct kgd_dev *kgd); @@ -124,21 +147,60 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid); static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, uint8_t vmid); static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); +static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); +static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req); +static int alloc_memory_of_scratch(struct kgd_dev *kgd, + uint64_t va, uint32_t vmid); +static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, + uint8_t element_size, uint8_t index_stride, uint8_t mtype); +static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, + uint32_t page_table_base); +static uint32_t 
read_vmid_from_vmfault_reg(struct kgd_dev *kgd); + +/* Because of REG_GET_FIELD() being used, we put this function in the + * asic specific file. + */ +static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, + struct tile_config *config) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; -static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); + config->gb_addr_config = adev->gfx.config.gb_addr_config; + config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, + MC_ARB_RAMCFG, NOOFBANK); + config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, + MC_ARB_RAMCFG, NOOFRANKS); + + config->tile_config_ptr = adev->gfx.config.tile_mode_array; + config->num_tile_configs = + ARRAY_SIZE(adev->gfx.config.tile_mode_array); + config->macro_tile_config_ptr = + adev->gfx.config.macrotile_mode_array; + config->num_macro_tile_configs = + ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); + + + return 0; +} static const struct kfd2kgd_calls kfd2kgd = { .init_gtt_mem_allocation = alloc_gtt_mem, .free_gtt_mem = free_gtt_mem, - .get_vmem_size = get_vmem_size, + .get_local_mem_info = get_local_mem_info, .get_gpu_clock_counter = get_gpu_clock_counter, .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, + .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, + .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, + .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, + .open_graphic_handle = open_graphic_handle, .program_sh_mem_settings = kgd_program_sh_mem_settings, .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, .init_pipeline = kgd_init_pipeline, .init_interrupts = kgd_init_interrupts, .hqd_load = kgd_hqd_load, .hqd_sdma_load = kgd_hqd_sdma_load, + .hqd_dump = kgd_hqd_dump, + .hqd_sdma_dump = kgd_hqd_sdma_dump, .hqd_is_occupied = kgd_hqd_is_occupied, .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, .hqd_destroy = kgd_hqd_destroy, @@ -147,17 +209,50 @@ static const struct kfd2kgd_calls kfd2kgd = { .address_watch_execute = kgd_address_watch_execute, .wave_control_execute = kgd_wave_control_execute, .address_watch_get_offset = kgd_address_watch_get_offset, - .get_atc_vmid_pasid_mapping_pasid = get_atc_vmid_pasid_mapping_pasid, - .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid, + .get_atc_vmid_pasid_mapping_pasid = + get_atc_vmid_pasid_mapping_pasid, + .get_atc_vmid_pasid_mapping_valid = + get_atc_vmid_pasid_mapping_valid, + .read_vmid_from_vmfault_reg = read_vmid_from_vmfault_reg, .write_vmid_invalidate_request = write_vmid_invalidate_request, - .get_fw_version = get_fw_version + .invalidate_tlbs = invalidate_tlbs, + .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, + .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, + .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, + .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, + .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, + .get_fw_version = get_fw_version, + .set_num_of_requests = set_num_of_requests, + .get_cu_info = get_cu_info, + .alloc_memory_of_scratch = alloc_memory_of_scratch, + .write_config_static_mem = write_config_static_mem, + .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, + .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, + .set_vm_context_page_table_base = set_vm_context_page_table_base, + .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, + .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, + .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, + 
.import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, + .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, + .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, + .submit_ib = amdgpu_amdkfd_submit_ib, + .get_tile_config = amdgpu_amdkfd_get_tile_config, + .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos, + .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, + .get_vram_usage = amdgpu_amdkfd_get_vram_usage }; -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void) +struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions() { return (struct kfd2kgd_calls *)&kfd2kgd; } +static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, + int fd, uint32_t handle, struct kgd_mem **mem) +{ + return 0; +} + static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) { return (struct amdgpu_device *)kgd; @@ -186,7 +281,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, { struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); lock_srbm(kgd, mec, pipe, queue_id, 0); @@ -222,12 +317,12 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, /* * We have to assume that there is no outstanding mapping. - * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because - * a mapping is in progress or because a mapping finished and the - * SW cleared it. So the protocol is to always wait & clear. + * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because a + * mapping is in progress or because a mapping finished and the SW + * cleared it. So the protocol is to always wait & clear. */ - uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | - ATC_VMID0_PASID_MAPPING__VALID_MASK; + uint32_t pasid_mapping = (pasid == 0) ? 0 : + (uint32_t)pasid | ATC_VMID0_PASID_MAPPING__VALID_MASK; WREG32(mmATC_VMID0_PASID_MAPPING + vmid, pasid_mapping); @@ -273,8 +368,7 @@ static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET; - - pr_debug("kfd: sdma base address: 0x%x\n", retval); + pr_debug("sdma base address: 0x%x\n", retval); return retval; } @@ -290,26 +384,91 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) } static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr) + uint32_t queue_id, uint32_t __user *wptr, + uint32_t wptr_shift, uint32_t wptr_mask, + struct mm_struct *mm) { struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t wptr_shadow, is_wptr_shadow_valid; struct cik_mqd *m; + uint32_t *mqd_hqd; + uint32_t reg, wptr_val, data; + bool valid_wptr = false; m = get_mqd(mqd); - is_wptr_shadow_valid = !get_user(wptr_shadow, wptr); - if (is_wptr_shadow_valid) - m->cp_hqd_pq_wptr = wptr_shadow; + acquire_queue(kgd, pipe_id, queue_id); + + /* HQD registers extend from CP_MQD_BASE_ADDR to CP_MQD_CONTROL. */ + mqd_hqd = &m->cp_mqd_base_addr_lo; + + for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++) + WREG32(reg, mqd_hqd[reg - mmCP_MQD_BASE_ADDR]); + + /* Copy userspace write pointer value to register. + * Activate doorbell logic to monitor subsequent changes. 
+ */ + data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, + CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); + WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data); + + /* read_user_ptr may take the mm->mmap_sem. + * release srbm_mutex to avoid circular dependency between + * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex. + */ + release_queue(kgd); + valid_wptr = read_user_wptr(mm, wptr, wptr_val); acquire_queue(kgd, pipe_id, queue_id); - gfx_v7_0_mqd_commit(adev, m); + if (valid_wptr) + WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask); + + data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); + WREG32(mmCP_HQD_ACTIVE, data); + + release_queue(kgd); return 0; } -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) +static int kgd_hqd_dump(struct kgd_dev *kgd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t i = 0, reg; +#define HQD_N_REGS (35+4) +#define DUMP_REG(addr) do { \ + if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ + break; \ + (*dump)[i][0] = (addr) << 2; \ + (*dump)[i++][1] = RREG32(addr); \ + } while (0) + + *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); + if (*dump == NULL) + return -ENOMEM; + + acquire_queue(kgd, pipe_id, queue_id); + + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0); + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1); + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2); + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3); + + for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++) + DUMP_REG(reg); + + release_queue(kgd); + + WARN_ON_ONCE(i != HQD_N_REGS); + *n_regs = i; + + return 0; +} + +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, + uint32_t __user *wptr, struct mm_struct *mm) { struct amdgpu_device *adev = get_amdgpu_device(kgd); struct cik_sdma_rlc_registers *m; @@ -320,17 +479,17 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) m = get_sdma_mqd(mqd); sdma_base_addr = get_sdma_base_addr(m); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, - m->sdma_rlc_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, + m->sdma_rlc_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); - end_jiffies = msecs_to_jiffies(2000) + jiffies; while (true) { - data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); - if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) - break; - if (time_after(jiffies, end_jiffies)) - return -ETIME; - usleep_range(500, 1000); + temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); + if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) + break; + if (timeout == 0) + return -ETIME; + msleep(10); + timeout -= 10; } if (m->sdma_engine_id) { data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); @@ -344,25 +503,59 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); } - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, - m->sdma_rlc_doorbell); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, - m->sdma_rlc_virtual_addr); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base); + data = REG_SET_FIELD(m->sdma_rlc_doorbell, SDMA0_RLC0_DOORBELL, + ENABLE, 1); + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdma_rlc_rb_rptr); + if (read_user_wptr(mm, wptr, data)) + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data); + else + WREG32(sdma_base_addr + 
mmSDMA0_RLC0_RB_WPTR, + m->sdma_rlc_rb_rptr); + + WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, + m->sdma_rlc_virtual_addr); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, m->sdma_rlc_rb_base_hi); WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, m->sdma_rlc_rb_rptr_addr_lo); WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, m->sdma_rlc_rb_rptr_addr_hi); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, - m->sdma_rlc_rb_cntl); - + data = REG_SET_FIELD(m->sdma_rlc_rb_cntl, SDMA0_RLC0_RB_CNTL, + RB_ENABLE, 1); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); return 0; } +static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, + uint32_t engine_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET + + queue_id * KFD_CIK_SDMA_QUEUE_OFFSET; + uint32_t i = 0, reg; +#undef HQD_N_REGS +#define HQD_N_REGS (19+4) + + *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); + if (*dump == NULL) + return -ENOMEM; + + for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) + DUMP_REG(sdma_offset + reg); + for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK; + reg++) + DUMP_REG(sdma_offset + reg); + + WARN_ON_ONCE(i != HQD_N_REGS); + *n_regs = i; + + return 0; +} + static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id) { @@ -403,30 +596,99 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) return false; } -static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, +static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, + enum kfd_preempt_type reset_type, unsigned int utimeout, uint32_t pipe_id, uint32_t queue_id) { struct amdgpu_device *adev = get_amdgpu_device(kgd); uint32_t temp; - int timeout = utimeout; + enum hqd_dequeue_request_type type; + unsigned long flags, end_jiffies; + int retry; acquire_queue(kgd, pipe_id, queue_id); WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, 0); - WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type); + switch (reset_type) { + case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: + type = DRAIN_PIPE; + break; + case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: + type = RESET_WAVES; + break; + default: + type = DRAIN_PIPE; + break; + } + + /* Workaround: If IQ timer is active and the wait time is close to or + * equal to 0, dequeueing is not safe. Wait until either the wait time + * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is + * cleared before continuing. Also, ensure wait times are set to at + * least 0x3. 
+ */ + local_irq_save(flags); + preempt_disable(); + retry = 5000; /* wait for 500 usecs at maximum */ + while (true) { + temp = RREG32(mmCP_HQD_IQ_TIMER); + if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { + pr_debug("HW is processing IQ\n"); + goto loop; + } + if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { + if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) + == 3) /* SEM-rearm is safe */ + break; + /* Wait time 3 is safe for CP, but our MMIO read/write + * time is close to 1 microsecond, so check for 10 to + * leave more buffer room + */ + if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) + >= 10) + break; + pr_debug("IQ timer is active\n"); + } else + break; +loop: + if (!retry) { + pr_err("CP HQD IQ timer status time out\n"); + break; + } + ndelay(100); + --retry; + } + retry = 1000; + while (true) { + temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); + if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) + break; + pr_debug("Dequeue request is pending\n"); + if (!retry) { + pr_err("CP HQD dequeue request time out\n"); + break; + } + ndelay(100); + --retry; + } + local_irq_restore(flags); + preempt_enable(); + + WREG32(mmCP_HQD_DEQUEUE_REQUEST, type); + + end_jiffies = (utimeout * HZ / 1000) + jiffies; while (true) { temp = RREG32(mmCP_HQD_ACTIVE); - if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) + if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) break; - if (timeout <= 0) { - pr_err("kfd: cp queue preemption time out.\n"); + if (time_after(jiffies, end_jiffies)) { + pr_err("cp queue preemption time out\n"); release_queue(kgd); return -ETIME; } - msleep(20); - timeout -= 20; + usleep_range(500, 1000); } release_queue(kgd); @@ -440,7 +702,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, struct cik_sdma_rlc_registers *m; uint32_t sdma_base_addr; uint32_t temp; - int timeout = utimeout; + unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; m = get_sdma_mqd(mqd); sdma_base_addr = get_sdma_base_addr(m); @@ -451,12 +713,11 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, while (true) { temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); - if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) + if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) break; - if (timeout <= 0) + if (time_after(jiffies, end_jiffies)) return -ETIME; - msleep(20); - timeout -= 20; + usleep_range(500, 1000); } WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); @@ -464,6 +725,8 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); + m->sdma_rlc_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); + return 0; } @@ -481,8 +744,9 @@ static int kgd_address_watch_disable(struct kgd_dev *kgd) /* Turning off this address until we set all the registers */ for (i = 0; i < MAX_WATCH_ADDRESSES; i++) - WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_CNTL], cntl.u32All); + WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_CNTL], + cntl.u32All); return 0; } @@ -500,20 +764,24 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, /* Turning off this watch point until we set all the registers */ cntl.bitfields.valid = 0; - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_CNTL], cntl.u32All); + WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_CNTL], + cntl.u32All); - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_ADDR_HI], addr_hi); + 
WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_ADDR_HI], + addr_hi); - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_ADDR_LO], addr_lo); + WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_ADDR_LO], + addr_lo); /* Enable the watch point */ cntl.bitfields.valid = 1; - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_CNTL], cntl.u32All); + WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_CNTL], + cntl.u32All); return 0; } @@ -567,7 +835,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, struct amdgpu_device *adev = (struct amdgpu_device *) kgd; reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); - return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; + return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; } static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) @@ -577,52 +845,90 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); } +static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) +{ + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + int vmid; + + for (vmid = 0; vmid < 16; vmid++) { + if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) + continue; + if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & + ATC_VMID0_PASID_MAPPING__VALID_MASK) { + if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & + ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) { + WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); + break; + } + } + } + + return 0; +} + +static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, + uint8_t element_size, uint8_t index_stride, uint8_t mtype) +{ + uint32_t reg; + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + + reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | + element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | + index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | + mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; + + WREG32(mmSH_STATIC_MEM_CONFIG, reg); + return 0; +} +static int alloc_memory_of_scratch(struct kgd_dev *kgd, + uint64_t va, uint32_t vmid) +{ + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + + lock_srbm(kgd, 0, 0, 0, vmid); + WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); + unlock_srbm(kgd); + + return 0; +} + + static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) { struct amdgpu_device *adev = (struct amdgpu_device *) kgd; const union amdgpu_firmware_header *hdr; - BUG_ON(kgd == NULL); - switch (type) { case KGD_ENGINE_PFP: - hdr = (const union amdgpu_firmware_header *) - adev->gfx.pfp_fw->data; + hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; break; case KGD_ENGINE_ME: - hdr = (const union amdgpu_firmware_header *) - adev->gfx.me_fw->data; + hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; break; case KGD_ENGINE_CE: - hdr = (const union amdgpu_firmware_header *) - adev->gfx.ce_fw->data; + hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; break; case KGD_ENGINE_MEC1: - hdr = (const union amdgpu_firmware_header *) - adev->gfx.mec_fw->data; + hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data; break; case KGD_ENGINE_MEC2: - hdr = (const union amdgpu_firmware_header *) - adev->gfx.mec2_fw->data; + hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; break; case KGD_ENGINE_RLC: - hdr = (const union amdgpu_firmware_header 
*) - adev->gfx.rlc_fw->data; + hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; break; case KGD_ENGINE_SDMA1: - hdr = (const union amdgpu_firmware_header *) - adev->sdma.instance[0].fw->data; + hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; break; case KGD_ENGINE_SDMA2: - hdr = (const union amdgpu_firmware_header *) - adev->sdma.instance[1].fw->data; + hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; break; default: @@ -636,3 +942,42 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) return hdr->common.ucode_version; } +static void set_num_of_requests(struct kgd_dev *dev, uint8_t num_of_req) +{ + uint32_t value; + struct amdgpu_device *adev = get_amdgpu_device(dev); + + value = RREG32(mmATC_ATS_DEBUG); + value &= ~ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR_MASK; + value |= (num_of_req << ATC_ATS_DEBUG__NUM_REQUESTS_AT_ERR__SHIFT); + + WREG32(mmATC_ATS_DEBUG, value); +} + +static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, + uint32_t page_table_base) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + /* TODO: Don't use hardcoded VMIDs */ + if (vmid < 8 || vmid > 15) { + pr_err("trying to set page table base for wrong VMID\n"); + return; + } + WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); +} + + /** + * read_vmid_from_vmfault_reg - read vmid from register + * + * adev: amdgpu_device pointer + * @vmid: vmid pointer + * read vmid from register (CIK). + */ +static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + + uint32_t status = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_STATUS); + + return REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, VMID); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c old mode 100644 new mode 100755 index c5044d5..2ff10e9 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -20,6 +20,9 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ +#undef pr_fmt +#define pr_fmt(fmt) "kfd2kgd: " fmt + #include #include #include @@ -28,7 +31,7 @@ #include "amdgpu.h" #include "amdgpu_amdkfd.h" #include "amdgpu_ucode.h" -#include "gfx_v8_0.h" +#include "amdgpu_amdkfd_gfx_v8.h" #include "gca/gfx_8_0_sh_mask.h" #include "gca/gfx_8_0_d.h" #include "gca/gfx_8_0_enum.h" @@ -39,7 +42,31 @@ #include "vi_structs.h" #include "vid.h" -struct cik_sdma_rlc_registers; +enum hqd_dequeue_request_type { + NO_ACTION = 0, + DRAIN_PIPE, + RESET_WAVES, + SAVE_WAVES +}; + +static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { + mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, + mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, + mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, + mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL +}; + + +struct vi_sdma_mqd; + +static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, + void *vm, struct kgd_mem **mem); +static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); + +static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, + int fd, uint32_t handle, struct kgd_mem **mem); + +static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); /* * Register access functions @@ -55,17 +82,26 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, uint32_t hpd_size, uint64_t hpd_gpu_addr); static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr); -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); + uint32_t queue_id, uint32_t __user *wptr, + uint32_t wptr_shift, uint32_t wptr_mask, + struct mm_struct *mm); +static int kgd_hqd_dump(struct kgd_dev *kgd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs); +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, + uint32_t __user *wptr, struct mm_struct *mm); +static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, + uint32_t engine_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs); static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id); static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); -static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, +static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, + enum kfd_preempt_type reset_type, unsigned int utimeout, uint32_t pipe_id, uint32_t queue_id); static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, unsigned int utimeout); -static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); static int kgd_address_watch_disable(struct kgd_dev *kgd); static int kgd_address_watch_execute(struct kgd_dev *kgd, unsigned int watch_point_id, @@ -84,20 +120,61 @@ static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, uint8_t vmid); static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); -static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); +static void set_num_of_requests(struct kgd_dev *kgd, + uint8_t num_of_requests); +static int alloc_memory_of_scratch(struct kgd_dev *kgd, + uint64_t va, uint32_t vmid); +static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, + uint8_t element_size, uint8_t index_stride, uint8_t mtype); 
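/*
 * [Illustrative sketch, not part of the original patch.] The invalidate_tlbs()
 * implementations added in this series resolve a PASID back to the VMID that
 * currently carries it by scanning the ATC_VMID*_PASID_MAPPING registers,
 * using the VALID bit and the PASID field (note the __PASID_MASK fix applied
 * to get_atc_vmid_pasid_mapping_pasid() in this patch). A minimal version of
 * that lookup, expressed with the two helpers declared above, could look like
 * the function below. example_pasid_to_vmid() is a hypothetical name, and the
 * 8..15 range assumes the hardcoded KFD VMID range also used by
 * set_vm_context_page_table_base() in this patch.
 */
static int example_pasid_to_vmid(struct kgd_dev *kgd, uint16_t pasid)
{
	int vmid;

	for (vmid = 8; vmid < 16; vmid++) {
		/* Skip VMIDs whose ATC mapping is not currently valid */
		if (!get_atc_vmid_pasid_mapping_valid(kgd, vmid))
			continue;
		/* Compare only the PASID field of the mapping register */
		if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid) == pasid)
			return vmid;
	}

	return -1;	/* this PASID does not own a VMID right now */
}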
+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, + uint32_t page_table_base); +static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); + +/* Because of REG_GET_FIELD() being used, we put this function in the + * asic specific file. + */ +static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, + struct tile_config *config) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + + config->gb_addr_config = adev->gfx.config.gb_addr_config; + config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, + MC_ARB_RAMCFG, NOOFBANK); + config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, + MC_ARB_RAMCFG, NOOFRANKS); + + config->tile_config_ptr = adev->gfx.config.tile_mode_array; + config->num_tile_configs = + ARRAY_SIZE(adev->gfx.config.tile_mode_array); + config->macro_tile_config_ptr = + adev->gfx.config.macrotile_mode_array; + config->num_macro_tile_configs = + ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); + + return 0; +} static const struct kfd2kgd_calls kfd2kgd = { .init_gtt_mem_allocation = alloc_gtt_mem, .free_gtt_mem = free_gtt_mem, - .get_vmem_size = get_vmem_size, + .get_local_mem_info = get_local_mem_info, .get_gpu_clock_counter = get_gpu_clock_counter, .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, + .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, + .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, + .create_process_gpumem = create_process_gpumem, + .destroy_process_gpumem = destroy_process_gpumem, + .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, + .open_graphic_handle = open_graphic_handle, .program_sh_mem_settings = kgd_program_sh_mem_settings, .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, .init_pipeline = kgd_init_pipeline, .init_interrupts = kgd_init_interrupts, .hqd_load = kgd_hqd_load, .hqd_sdma_load = kgd_hqd_sdma_load, + .hqd_dump = kgd_hqd_dump, + .hqd_sdma_dump = kgd_hqd_sdma_dump, .hqd_is_occupied = kgd_hqd_is_occupied, .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, .hqd_destroy = kgd_hqd_destroy, @@ -111,14 +188,56 @@ static const struct kfd2kgd_calls kfd2kgd = { .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid, .write_vmid_invalidate_request = write_vmid_invalidate_request, - .get_fw_version = get_fw_version + .invalidate_tlbs = invalidate_tlbs, + .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, + .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, + .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, + .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, + .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, + .get_fw_version = get_fw_version, + .set_num_of_requests = set_num_of_requests, + .get_cu_info = get_cu_info, + .alloc_memory_of_scratch = alloc_memory_of_scratch, + .write_config_static_mem = write_config_static_mem, + .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, + .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, + .set_vm_context_page_table_base = set_vm_context_page_table_base, + .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, + .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, + .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, + .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, + .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, + .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, + .submit_ib = amdgpu_amdkfd_submit_ib, + .get_tile_config = amdgpu_amdkfd_get_tile_config, + .restore_process_bos = 
amdgpu_amdkfd_gpuvm_restore_process_bos, + .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem, + .get_vram_usage = amdgpu_amdkfd_get_vram_usage }; -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void) +struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions() { return (struct kfd2kgd_calls *)&kfd2kgd; } +static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, + void *vm, struct kgd_mem **mem) +{ + return 0; +} + +/* Destroys the GPU allocation and frees the kgd_mem structure */ +static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) +{ + +} + +static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, + int fd, uint32_t handle, struct kgd_mem **mem) +{ + return 0; +} + static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) { return (struct amdgpu_device *)kgd; @@ -147,7 +266,7 @@ static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, { struct amdgpu_device *adev = get_amdgpu_device(kgd); - uint32_t mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); lock_srbm(kgd, mec, pipe, queue_id, 0); @@ -216,21 +335,28 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) uint32_t mec; uint32_t pipe; - mec = (++pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); lock_srbm(kgd, mec, pipe, 0, 0); - WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK); + WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | + CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); unlock_srbm(kgd); return 0; } -static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) +static inline uint32_t get_sdma_base_addr(struct vi_sdma_mqd *m) { - return 0; + uint32_t retval; + + retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + + m->sdma_queue_id * KFD_VI_SDMA_QUEUE_OFFSET; + pr_debug("sdma base address: 0x%x\n", retval); + + return retval; } static inline struct vi_mqd *get_mqd(void *mqd) @@ -238,9 +364,9 @@ static inline struct vi_mqd *get_mqd(void *mqd) return (struct vi_mqd *)mqd; } -static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) +static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) { - return (struct cik_sdma_rlc_registers *)mqd; + return (struct vi_sdma_mqd *)mqd; } static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, @@ -252,16 +378,18 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, struct vi_mqd *m; uint32_t *mqd_hqd; uint32_t reg, wptr_val, data; + bool valid_wptr = false; m = get_mqd(mqd); acquire_queue(kgd, pipe_id, queue_id); - /*HIQ is set during driver init period with vmid set to 0. For SRIOV - * world switching support let the RLC know about the HIQ. - * - * Workaround: This causes reboots on CZ. Disable this on CZ, which - * doesn't support SRIOV anyway. - */ + + /* HIQ is set during driver init period with vmid set to 0. For SRIOV + * world switching support let the RLC know about the HIQ. + * + * Workaround: This causes reboots on CZ. Disable this on CZ, which + * doesn't support SRIOV anyway. 
+ */ if (m->cp_hqd_vmid == 0 && adev->asic_type != CHIP_CARRIZO) { uint32_t value, mec, pipe; @@ -304,7 +432,14 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data); - if (read_user_wptr(mm, wptr, wptr_val)) + /* read_user_ptr may take the mm->mmap_sem. + * release srbm_mutex to avoid circular dependency between + * srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex. + */ + release_queue(kgd); + valid_wptr = read_user_wptr(mm, wptr, wptr_val); + acquire_queue(kgd, pipe_id, queue_id); + if (valid_wptr) WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask); data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); @@ -315,8 +450,138 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, return 0; } -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) +static int kgd_hqd_dump(struct kgd_dev *kgd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t i = 0, reg; +#define HQD_N_REGS (54+4) +#define DUMP_REG(addr) do { \ + if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ + break; \ + (*dump)[i][0] = (addr) << 2; \ + (*dump)[i++][1] = RREG32(addr); \ + } while (0) + + *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); + if (*dump == NULL) + return -ENOMEM; + + acquire_queue(kgd, pipe_id, queue_id); + + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0); + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1); + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2); + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3); + + for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_HQD_EOP_DONES; reg++) + DUMP_REG(reg); + + release_queue(kgd); + + WARN_ON_ONCE(i != HQD_N_REGS); + *n_regs = i; + + return 0; +} + +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, + uint32_t __user *wptr, struct mm_struct *mm) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + struct vi_sdma_mqd *m; + uint32_t sdma_base_addr; + uint32_t temp, timeout = 2000; + uint32_t data; + + m = get_sdma_mqd(mqd); + sdma_base_addr = get_sdma_base_addr(m); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, + m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); + + while (true) { + temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); + if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) + break; + if (timeout == 0) + return -ETIME; + msleep(10); + timeout -= 10; + } + if (m->sdma_engine_id) { + data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); + data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, + RESUME_CTX, 0); + WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); + } else { + data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); + data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, + RESUME_CTX, 0); + WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); + } + + data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, + ENABLE, 1); + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); + + if (read_user_wptr(mm, wptr, data)) + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data); + else + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, + m->sdmax_rlcx_rb_rptr); + + WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, + m->sdmax_rlcx_virtual_addr); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, + m->sdmax_rlcx_rb_base_hi); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, + 
m->sdmax_rlcx_rb_rptr_addr_lo); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, + m->sdmax_rlcx_rb_rptr_addr_hi); + + data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, + RB_ENABLE, 1); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); + + return 0; +} + +static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, + uint32_t engine_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs) { + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET + + queue_id * KFD_VI_SDMA_QUEUE_OFFSET; + uint32_t i = 0, reg; +#undef HQD_N_REGS +#define HQD_N_REGS (19+4+2+3+7) + + *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); + if (*dump == NULL) + return -ENOMEM; + + for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) + DUMP_REG(sdma_offset + reg); + for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK; + reg++) + DUMP_REG(sdma_offset + reg); + for (reg = mmSDMA0_RLC0_CSA_ADDR_LO; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; + reg++) + DUMP_REG(sdma_offset + reg); + for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; reg <= mmSDMA0_RLC0_DUMMY_REG; + reg++) + DUMP_REG(sdma_offset + reg); + for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; reg <= mmSDMA0_RLC0_MIDCMD_CNTL; + reg++) + DUMP_REG(sdma_offset + reg); + + WARN_ON_ONCE(i != HQD_N_REGS); + *n_regs = i; + return 0; } @@ -345,7 +610,7 @@ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) { struct amdgpu_device *adev = get_amdgpu_device(kgd); - struct cik_sdma_rlc_registers *m; + struct vi_sdma_mqd *m; uint32_t sdma_base_addr; uint32_t sdma_rlc_rb_cntl; @@ -360,29 +625,102 @@ static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) return false; } -static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type, +static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, + enum kfd_preempt_type reset_type, unsigned int utimeout, uint32_t pipe_id, uint32_t queue_id) { struct amdgpu_device *adev = get_amdgpu_device(kgd); uint32_t temp; - int timeout = utimeout; + enum hqd_dequeue_request_type type; + unsigned long flags, end_jiffies; + int retry; + struct vi_mqd *m = get_mqd(mqd); acquire_queue(kgd, pipe_id, queue_id); - WREG32(mmCP_HQD_DEQUEUE_REQUEST, reset_type); + if (m->cp_hqd_vmid == 0) + WREG32_FIELD(RLC_CP_SCHEDULERS, scheduler1, 0); + switch (reset_type) { + case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: + type = DRAIN_PIPE; + break; + case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: + type = RESET_WAVES; + break; + default: + type = DRAIN_PIPE; + break; + } + + /* Workaround: If IQ timer is active and the wait time is close to or + * equal to 0, dequeueing is not safe. Wait until either the wait time + * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is + * cleared before continuing. Also, ensure wait times are set to at + * least 0x3. 
+ */ + local_irq_save(flags); + preempt_disable(); + retry = 5000; /* wait for 500 usecs at maximum */ + while (true) { + temp = RREG32(mmCP_HQD_IQ_TIMER); + if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { + pr_debug("HW is processing IQ\n"); + goto loop; + } + if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { + if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) + == 3) /* SEM-rearm is safe */ + break; + /* Wait time 3 is safe for CP, but our MMIO read/write + * time is close to 1 microsecond, so check for 10 to + * leave more buffer room + */ + if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) + >= 10) + break; + pr_debug("IQ timer is active\n"); + } else + break; +loop: + if (!retry) { + pr_err("CP HQD IQ timer status time out\n"); + break; + } + ndelay(100); + --retry; + } + retry = 1000; + while (true) { + temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); + if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) + break; + pr_debug("Dequeue request is pending\n"); + + if (!retry) { + pr_err("CP HQD dequeue request time out\n"); + break; + } + ndelay(100); + --retry; + } + local_irq_restore(flags); + preempt_enable(); + + WREG32(mmCP_HQD_DEQUEUE_REQUEST, type); + + end_jiffies = (utimeout * HZ / 1000) + jiffies; while (true) { temp = RREG32(mmCP_HQD_ACTIVE); - if (temp & CP_HQD_ACTIVE__ACTIVE_MASK) + if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) break; - if (timeout <= 0) { - pr_err("kfd: cp queue preemption time out.\n"); + if (time_after(jiffies, end_jiffies)) { + pr_err("cp queue preemption time out.\n"); release_queue(kgd); return -ETIME; } - msleep(20); - timeout -= 20; + usleep_range(500, 1000); } release_queue(kgd); @@ -393,10 +731,10 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, unsigned int utimeout) { struct amdgpu_device *adev = get_amdgpu_device(kgd); - struct cik_sdma_rlc_registers *m; + struct vi_sdma_mqd *m; uint32_t sdma_base_addr; uint32_t temp; - int timeout = utimeout; + unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; m = get_sdma_mqd(mqd); sdma_base_addr = get_sdma_base_addr(m); @@ -407,18 +745,19 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, while (true) { temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); - if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) + if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) break; - if (timeout <= 0) + if (time_after(jiffies, end_jiffies)) return -ETIME; - msleep(20); - timeout -= 20; + usleep_range(500, 1000); } WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, + RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | + SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); + + m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); return 0; } @@ -440,7 +779,7 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, struct amdgpu_device *adev = (struct amdgpu_device *) kgd; reg = RREG32(mmATC_VMID0_PASID_MAPPING + vmid); - return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; + return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; } static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) @@ -450,8 +789,83 @@ static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); } +/* + * FIXME: Poliars test failed with this package, FIJI works fine + * From the CP spec it does not official 
support the invalidation + * with the specified pasid in the package, so disable it for V8 + * + */ +#ifdef V8_SUPPORT_IT_OFFICIAL +static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid) +{ + signed long r; + struct dma_fence *f; + struct amdgpu_ring *ring = &adev->gfx.kiq.ring; + + mutex_lock(&adev->gfx.kiq.ring_mutex); + amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/ + amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0)); + amdgpu_ring_write(ring, + PACKET3_INVALIDATE_TLBS_DST_SEL(1) | + PACKET3_INVALIDATE_TLBS_PASID(pasid)); + amdgpu_fence_emit(ring, &f); + amdgpu_ring_commit(ring); + mutex_unlock(&adev->gfx.kiq.ring_mutex); + + r = dma_fence_wait(f, false); + if (r) + DRM_ERROR("wait for kiq fence error: %ld.\n", r); + dma_fence_put(f); + + return r; +} +#endif +static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) +{ + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + int vmid; + +#ifdef V8_SUPPORT_IT_OFFICIAL + struct amdgpu_ring *ring = &adev->gfx.kiq.ring; + + if (ring->ready) + return invalidate_tlbs_with_kiq(adev, pasid); +#endif + + for (vmid = 0; vmid < 16; vmid++) { + if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) + continue; + if (RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & + ATC_VMID0_PASID_MAPPING__VALID_MASK) { + if ((RREG32(mmATC_VMID0_PASID_MAPPING + vmid) & + ATC_VMID0_PASID_MAPPING__PASID_MASK) == pasid) { + WREG32(mmVM_INVALIDATE_REQUEST, 1 << vmid); + break; + } + } + } + + return 0; +} + static int kgd_address_watch_disable(struct kgd_dev *kgd) { + struct amdgpu_device *adev = get_amdgpu_device(kgd); + union TCP_WATCH_CNTL_BITS cntl; + unsigned int i; + + cntl.u32All = 0; + + cntl.bitfields.valid = 0; + cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; + cntl.bitfields.atc = 1; + + /* Turning off this address until we set all the registers */ + for (i = 0; i < MAX_WATCH_ADDRESSES; i++) + WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_CNTL], + cntl.u32All); + return 0; } @@ -461,6 +875,32 @@ static int kgd_address_watch_execute(struct kgd_dev *kgd, uint32_t addr_hi, uint32_t addr_lo) { + struct amdgpu_device *adev = get_amdgpu_device(kgd); + union TCP_WATCH_CNTL_BITS cntl; + + cntl.u32All = cntl_val; + + /* Turning off this watch point until we set all the registers */ + cntl.bitfields.valid = 0; + WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_CNTL], + cntl.u32All); + + WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_ADDR_HI], + addr_hi); + + WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_ADDR_LO], + addr_lo); + + /* Enable the watch point */ + cntl.bitfields.valid = 1; + + WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_CNTL], + cntl.u32All); + return 0; } @@ -493,6 +933,32 @@ static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, unsigned int watch_point_id, unsigned int reg_offset) { + return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; +} + +static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, + uint8_t element_size, uint8_t index_stride, uint8_t mtype) +{ + uint32_t reg; + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + + reg = swizzle_enable << SH_STATIC_MEM_CONFIG__SWIZZLE_ENABLE__SHIFT | + element_size << SH_STATIC_MEM_CONFIG__ELEMENT_SIZE__SHIFT | + index_stride << SH_STATIC_MEM_CONFIG__INDEX_STRIDE__SHIFT | + mtype << SH_STATIC_MEM_CONFIG__PRIVATE_MTYPE__SHIFT; + + 
WREG32(mmSH_STATIC_MEM_CONFIG, reg); + return 0; +} +static int alloc_memory_of_scratch(struct kgd_dev *kgd, + uint64_t va, uint32_t vmid) +{ + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + + lock_srbm(kgd, 0, 0, 0, vmid); + WREG32(mmSH_HIDDEN_PRIVATE_BASE_VMID, va); + unlock_srbm(kgd); + return 0; } @@ -501,47 +967,45 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) struct amdgpu_device *adev = (struct amdgpu_device *) kgd; const union amdgpu_firmware_header *hdr; - BUG_ON(kgd == NULL); - switch (type) { case KGD_ENGINE_PFP: hdr = (const union amdgpu_firmware_header *) - adev->gfx.pfp_fw->data; + adev->gfx.pfp_fw->data; break; case KGD_ENGINE_ME: hdr = (const union amdgpu_firmware_header *) - adev->gfx.me_fw->data; + adev->gfx.me_fw->data; break; case KGD_ENGINE_CE: hdr = (const union amdgpu_firmware_header *) - adev->gfx.ce_fw->data; + adev->gfx.ce_fw->data; break; case KGD_ENGINE_MEC1: hdr = (const union amdgpu_firmware_header *) - adev->gfx.mec_fw->data; + adev->gfx.mec_fw->data; break; case KGD_ENGINE_MEC2: hdr = (const union amdgpu_firmware_header *) - adev->gfx.mec2_fw->data; + adev->gfx.mec2_fw->data; break; case KGD_ENGINE_RLC: hdr = (const union amdgpu_firmware_header *) - adev->gfx.rlc_fw->data; + adev->gfx.rlc_fw->data; break; case KGD_ENGINE_SDMA1: hdr = (const union amdgpu_firmware_header *) - adev->sdma.instance[0].fw->data; + adev->sdma.instance[0].fw->data; break; case KGD_ENGINE_SDMA2: hdr = (const union amdgpu_firmware_header *) - adev->sdma.instance[1].fw->data; + adev->sdma.instance[1].fw->data; break; default: @@ -554,3 +1018,21 @@ static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) /* Only 12 bit in use*/ return hdr->common.ucode_version; } + +static void set_num_of_requests(struct kgd_dev *kgd, + uint8_t num_of_requests) +{ + pr_debug("This is a stub\n"); +} + +static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, + uint32_t page_table_base) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + /* TODO: Don't use hardcoded VMIDs */ + if (vmid < 8 || vmid > 15) { + pr_err("trying to set page table base for wrong VMID\n"); + return; + } + WREG32(mmVM_CONTEXT8_PAGE_TABLE_BASE_ADDR + vmid - 8, page_table_base); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h new file mode 100644 index 0000000..3c94919 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.h @@ -0,0 +1,62 @@ +/* + * Copyright 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef AMDGPU_AMDKFD_GFX_V8_H_INCLUDED +#define AMDGPU_AMDKFD_GFX_V8_H_INCLUDED + +#include + +enum { + MAX_TRAPID = 8, /* 3 bits in the bitfield. */ + MAX_WATCH_ADDRESSES = 4 +}; + +enum { + ADDRESS_WATCH_REG_ADDR_HI = 0, + ADDRESS_WATCH_REG_ADDR_LO, + ADDRESS_WATCH_REG_CNTL, + ADDRESS_WATCH_REG_MAX +}; + +/* not defined in the VI reg file */ +enum { + ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL, + ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF, + ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000, + /* extend the mask to 26 bits in order to match the low address field */ + ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6, + ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF +}; + +union TCP_WATCH_CNTL_BITS { + struct { + uint32_t mask:24; + uint32_t vmid:4; + uint32_t atc:1; + uint32_t mode:2; + uint32_t valid:1; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; +#endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c new file mode 100644 index 0000000..edbae19 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -0,0 +1,1227 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#undef pr_fmt +#define pr_fmt(fmt) "kfd2kgd: " fmt + +#include +#include +#include +#include +#include +#include "amdgpu.h" +#include "amdgpu_amdkfd.h" +#include "amdgpu_ucode.h" +#include "amdgpu_amdkfd_gfx_v8.h" +#include "vega10/soc15ip.h" +#include "vega10/GC/gc_9_0_offset.h" +#include "vega10/GC/gc_9_0_sh_mask.h" +#include "vega10/vega10_enum.h" +#include "vega10/SDMA0/sdma0_4_0_offset.h" +#include "vega10/SDMA0/sdma0_4_0_sh_mask.h" +#include "vega10/SDMA1/sdma1_4_0_offset.h" +#include "vega10/SDMA1/sdma1_4_0_sh_mask.h" +#include "vega10/ATHUB/athub_1_0_offset.h" +#include "vega10/ATHUB/athub_1_0_sh_mask.h" +#include "vega10/OSSSYS/osssys_4_0_offset.h" +#include "vega10/OSSSYS/osssys_4_0_sh_mask.h" +#include "soc15_common.h" +#include "v9_structs.h" +#include "soc15.h" +#include "soc15d.h" + +/* HACK: MMHUB and GC both have VM-related register with the same + * names but different offsets. Define the MMHUB register we need here + * with a prefix. 
A proper solution would be to move the functions + * programming these registers into gfx_v9_0.c and mmhub_v1_0.c + * respectively. + */ +#define mmMMHUB_VM_INVALIDATE_ENG16_REQ 0x06f3 +#define mmMMHUB_VM_INVALIDATE_ENG16_REQ_BASE_IDX 0 + +#define mmMMHUB_VM_INVALIDATE_ENG16_ACK 0x0705 +#define mmMMHUB_VM_INVALIDATE_ENG16_ACK_BASE_IDX 0 + +#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32 0x072b +#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32_BASE_IDX 0 +#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32 0x072c +#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32_BASE_IDX 0 + +#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32 0x074b +#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32_BASE_IDX 0 +#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32 0x074c +#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32_BASE_IDX 0 + +#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32 0x076b +#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32_BASE_IDX 0 +#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32 0x076c +#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32_BASE_IDX 0 + +#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32 0x0727 +#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32_BASE_IDX 0 +#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32 0x0728 +#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32_BASE_IDX 0 + +enum hqd_dequeue_request_type { + NO_ACTION = 0, + DRAIN_PIPE, + RESET_WAVES, + SAVE_WAVES +}; + +static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { + mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, + mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, + mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, + mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL +}; + + +static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, + void *vm, struct kgd_mem **mem); +static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem); + +static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, + int fd, uint32_t handle, struct kgd_mem **mem); + +static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); + +/* + * Register access functions + */ + +static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, + uint32_t sh_mem_config, + uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, + uint32_t sh_mem_bases); +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, + unsigned int vmid); +static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, + uint32_t hpd_size, uint64_t hpd_gpu_addr); +static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); +static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, + uint32_t queue_id, uint32_t __user *wptr, + uint32_t wptr_shift, uint32_t wptr_mask, + struct mm_struct *mm); +static int kgd_hqd_dump(struct kgd_dev *kgd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs); +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, + uint32_t __user *wptr, struct mm_struct *mm); +static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, + uint32_t engine_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs); +static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, + uint32_t pipe_id, uint32_t queue_id); +static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); +static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, + enum 
kfd_preempt_type reset_type, + unsigned int utimeout, uint32_t pipe_id, + uint32_t queue_id); +static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + unsigned int utimeout); +static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); +static uint32_t get_watch_base_addr(void); +static int kgd_address_watch_disable(struct kgd_dev *kgd); +static int kgd_address_watch_execute(struct kgd_dev *kgd, + unsigned int watch_point_id, + uint32_t cntl_val, + uint32_t addr_hi, + uint32_t addr_lo); +static int kgd_wave_control_execute(struct kgd_dev *kgd, + uint32_t gfx_index_val, + uint32_t sq_cmd); +static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, + unsigned int watch_point_id, + unsigned int reg_offset); + +static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, + uint8_t vmid); +static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + uint8_t vmid); +static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); +static void set_num_of_requests(struct kgd_dev *kgd, + uint8_t num_of_requests); +static int alloc_memory_of_scratch(struct kgd_dev *kgd, + uint64_t va, uint32_t vmid); +static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, + uint8_t element_size, uint8_t index_stride, uint8_t mtype); +static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, + uint32_t page_table_base); +static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); + +/* Because of REG_GET_FIELD() being used, we put this function in the + * asic specific file. + */ +static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, + struct tile_config *config) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + + config->gb_addr_config = adev->gfx.config.gb_addr_config; +#if 0 +/* TODO - confirm REG_GET_FIELD x2, should be OK as is... 
but + * MC_ARB_RAMCFG register doesn't exist on Vega10 - initial amdgpu + * changes commented out related code, doing the same here for now but + * need to sync with Ken et al + */ + config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, + MC_ARB_RAMCFG, NOOFBANK); + config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg, + MC_ARB_RAMCFG, NOOFRANKS); +#endif + + config->tile_config_ptr = adev->gfx.config.tile_mode_array; + config->num_tile_configs = + ARRAY_SIZE(adev->gfx.config.tile_mode_array); + config->macro_tile_config_ptr = + adev->gfx.config.macrotile_mode_array; + config->num_macro_tile_configs = + ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); + + return 0; +} + +static const struct kfd2kgd_calls kfd2kgd = { + .init_gtt_mem_allocation = alloc_gtt_mem, + .free_gtt_mem = free_gtt_mem, + .get_local_mem_info = get_local_mem_info, + .get_gpu_clock_counter = get_gpu_clock_counter, + .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, + .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, + .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, + .create_process_gpumem = create_process_gpumem, + .destroy_process_gpumem = destroy_process_gpumem, + .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, + .open_graphic_handle = open_graphic_handle, + .program_sh_mem_settings = kgd_program_sh_mem_settings, + .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, + .init_pipeline = kgd_init_pipeline, + .init_interrupts = kgd_init_interrupts, + .hqd_load = kgd_hqd_load, + .hqd_sdma_load = kgd_hqd_sdma_load, + .hqd_dump = kgd_hqd_dump, + .hqd_sdma_dump = kgd_hqd_sdma_dump, + .hqd_is_occupied = kgd_hqd_is_occupied, + .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, + .hqd_destroy = kgd_hqd_destroy, + .hqd_sdma_destroy = kgd_hqd_sdma_destroy, + .address_watch_disable = kgd_address_watch_disable, + .address_watch_execute = kgd_address_watch_execute, + .wave_control_execute = kgd_wave_control_execute, + .address_watch_get_offset = kgd_address_watch_get_offset, + .get_atc_vmid_pasid_mapping_pasid = + get_atc_vmid_pasid_mapping_pasid, + .get_atc_vmid_pasid_mapping_valid = + get_atc_vmid_pasid_mapping_valid, + .write_vmid_invalidate_request = write_vmid_invalidate_request, + .invalidate_tlbs = invalidate_tlbs, + .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, + .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, + .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, + .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, + .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, + .get_fw_version = get_fw_version, + .set_num_of_requests = set_num_of_requests, + .get_cu_info = get_cu_info, + .alloc_memory_of_scratch = alloc_memory_of_scratch, + .write_config_static_mem = write_config_static_mem, + .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo, + .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, + .set_vm_context_page_table_base = set_vm_context_page_table_base, + .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table, + .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table, + .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info, + .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf, + .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf, + .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, + .submit_ib = amdgpu_amdkfd_submit_ib, + .get_tile_config = amdgpu_amdkfd_get_tile_config, + .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos, + .copy_mem_to_mem = 
amdgpu_amdkfd_copy_mem_to_mem, + .get_vram_usage = amdgpu_amdkfd_get_vram_usage +}; + +struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions() +{ + return (struct kfd2kgd_calls *)&kfd2kgd; +} + +static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size, + void *vm, struct kgd_mem **mem) +{ + return 0; +} + +/* Destroys the GPU allocation and frees the kgd_mem structure */ +static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem) +{ + +} + +static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm, + int fd, uint32_t handle, struct kgd_mem **mem) +{ + return 0; +} + +static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) +{ + return (struct amdgpu_device *)kgd; +} + +static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, + uint32_t queue, uint32_t vmid) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + + mutex_lock(&adev->srbm_mutex); + soc15_grbm_select(adev, mec, pipe, queue, vmid); +} + +static void unlock_srbm(struct kgd_dev *kgd) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + + soc15_grbm_select(adev, 0, 0, 0, 0); + mutex_unlock(&adev->srbm_mutex); +} + +static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, + uint32_t queue_id) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + + uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); + + lock_srbm(kgd, mec, pipe, queue_id, 0); +} + +static uint32_t get_queue_mask(struct amdgpu_device *adev, + uint32_t pipe_id, uint32_t queue_id) +{ + unsigned int bit = (pipe_id * adev->gfx.mec.num_pipe_per_mec + + queue_id) & 31; + + return ((uint32_t)1) << bit; +} + +static void release_queue(struct kgd_dev *kgd) +{ + unlock_srbm(kgd); +} + +static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, + uint32_t sh_mem_config, + uint32_t sh_mem_ape1_base, + uint32_t sh_mem_ape1_limit, + uint32_t sh_mem_bases) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + + lock_srbm(kgd, 0, 0, 0, vmid); + + WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config); + WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases); + /* APE1 no longer exists on GFX9 */ + + unlock_srbm(kgd); +} + +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, + unsigned int vmid) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + + /* + * We have to assume that there is no outstanding mapping. + * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because + * a mapping is in progress or because a mapping finished + * and the SW cleared it. + * So the protocol is to always wait & clear. + */ + uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | + ATC_VMID0_PASID_MAPPING__VALID_MASK; + + /* + * need to do this twice, once for gfx and once for mmhub + * for ATC add 16 to VMID for mmhub, for IH different registers. + * ATC_VMID0..15 registers are separate from ATC_VMID16..31. 
+ */ + + WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, + pasid_mapping); + + while (!(RREG32(SOC15_REG_OFFSET( + ATHUB, 0, + mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & + (1U << vmid))) + cpu_relax(); + + WREG32(SOC15_REG_OFFSET(ATHUB, 0, + mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), + 1U << vmid); + + /* Mapping vmid to pasid also for IH block */ + WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, + pasid_mapping); + + WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, + pasid_mapping); + + while (!(RREG32(SOC15_REG_OFFSET( + ATHUB, 0, + mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & + (1U << (vmid + 16)))) + cpu_relax(); + + WREG32(SOC15_REG_OFFSET(ATHUB, 0, + mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), + 1U << (vmid + 16)); + + /* Mapping vmid to pasid also for IH block */ + WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, + pasid_mapping); + return 0; +} + +static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, + uint32_t hpd_size, uint64_t hpd_gpu_addr) +{ + /* amdgpu owns the per-pipe state */ + return 0; +} + +/* TODO - RING0 form of field is obsolete, seems to date back to SI + * but still works + */ + +static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t mec; + uint32_t pipe; + + mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); + + lock_srbm(kgd, mec, pipe, 0, 0); + + WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL), + CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | + CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); + + unlock_srbm(kgd); + + return 0; +} + +static uint32_t get_sdma_base_addr(unsigned int engine_id, + unsigned int queue_id) +{ + static const uint32_t base[2] = { + SOC15_REG_OFFSET(SDMA0, 0, + mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL, + SOC15_REG_OFFSET(SDMA1, 0, + mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL + }; + uint32_t retval; + + retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL - + mmSDMA0_RLC0_RB_CNTL); + + pr_debug("sdma base address: 0x%x\n", retval); + + return retval; +} + +static uint32_t get_watch_base_addr(void) +{ + uint32_t retval = SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) - + mmTCP_WATCH0_ADDR_H; + + pr_debug("kfd: reg watch base address: 0x%x\n", retval); + + return retval; +} + +static inline struct v9_mqd *get_mqd(void *mqd) +{ + return (struct v9_mqd *)mqd; +} + +static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct v9_sdma_mqd *)mqd; +} + +static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, + uint32_t queue_id, uint32_t __user *wptr, + uint32_t wptr_shift, uint32_t wptr_mask, + struct mm_struct *mm) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + struct v9_mqd *m; + uint32_t *mqd_hqd; + uint32_t reg, hqd_base, data; + + m = get_mqd(mqd); + + acquire_queue(kgd, pipe_id, queue_id); + + /* HIQ is set during driver init period with vmid set to 0*/ + if (m->cp_hqd_vmid == 0) { + uint32_t value, mec, pipe; + + mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; + pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); + + pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", + mec, pipe, queue_id); + value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS)); + value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, + ((mec << 5) | (pipe << 3) | queue_id | 0x80)); + WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value); + } + + /* HQD registers extend from 
CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ + mqd_hqd = &m->cp_mqd_base_addr_lo; + hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); + + for (reg = hqd_base; + reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) + WREG32(reg, mqd_hqd[reg - hqd_base]); + + + /* Activate doorbell logic before triggering WPTR poll. */ + data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, + CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data); + + if (wptr) { + /* Don't read wptr with get_user because the user + * context may not be accessible (if this function + * runs in a work queue). Instead trigger a one-shot + * polling read from memory in the CP. This assumes + * that wptr is GPU-accessible in the queue's VMID via + * ATC or SVM. WPTR==RPTR before starting the poll so + * the CP starts fetching new commands from the right + * place. + * + * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit + * tricky. Assume that the queue didn't overflow. The + * number of valid bits in the 32-bit RPTR depends on + * the queue size. The remaining bits are taken from + * the saved 64-bit WPTR. If the WPTR wrapped, add the + * queue size. + */ + uint32_t queue_size = + 2 << REG_GET_FIELD(m->cp_hqd_pq_control, + CP_HQD_PQ_CONTROL, QUEUE_SIZE); + uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1); + + if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr) + guessed_wptr += queue_size; + guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1); + guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32; + + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO), + lower_32_bits(guessed_wptr)); + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI), + upper_32_bits(guessed_wptr)); + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR), + lower_32_bits((uint64_t)wptr)); + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI), + upper_32_bits((uint64_t)wptr)); + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1), + get_queue_mask(adev, pipe_id, queue_id)); + } + + /* Start the EOP fetcher */ + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR), + REG_SET_FIELD(m->cp_hqd_eop_rptr, + CP_HQD_EOP_RPTR, INIT_FETCHER, 1)); + + data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data); + + release_queue(kgd); + + return 0; +} + +static int kgd_hqd_dump(struct kgd_dev *kgd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t i = 0, reg; +#define HQD_N_REGS 56 +#define DUMP_REG(addr) do { \ + if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ + break; \ + (*dump)[i][0] = (addr) << 2; \ + (*dump)[i++][1] = RREG32(addr); \ + } while (0) + + *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); + if (*dump == NULL) + return -ENOMEM; + + acquire_queue(kgd, pipe_id, queue_id); + + for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); + reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) + DUMP_REG(reg); + + release_queue(kgd); + + WARN_ON_ONCE(i != HQD_N_REGS); + *n_regs = i; + + return 0; +} + +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, + uint32_t __user *wptr, struct mm_struct *mm) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + struct v9_sdma_mqd *m; + uint32_t sdma_base_addr, sdmax_gfx_context_cntl; + uint32_t temp, timeout = 2000; + uint32_t data; + uint64_t data64; + uint64_t __user *wptr64 = (uint64_t __user *)wptr; + + m = 
get_sdma_mqd(mqd); + sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, + m->sdma_queue_id); + sdmax_gfx_context_cntl = m->sdma_engine_id ? + SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) : + SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL); + + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, + m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); + + while (true) { + temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); + if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) + break; + if (timeout == 0) + return -ETIME; + msleep(10); + timeout -= 10; + } + data = RREG32(sdmax_gfx_context_cntl); + data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, + RESUME_CTX, 0); + WREG32(sdmax_gfx_context_cntl, data); + + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET, + m->sdmax_rlcx_doorbell_offset); + + data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, + ENABLE, 1); + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI, + m->sdmax_rlcx_rb_rptr_hi); + + WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); + if (read_user_wptr(mm, wptr64, data64)) { + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, + lower_32_bits(data64)); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, + upper_32_bits(data64)); + } else { + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, + m->sdmax_rlcx_rb_rptr); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, + m->sdmax_rlcx_rb_rptr_hi); + } + WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); + + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, + m->sdmax_rlcx_rb_base_hi); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, + m->sdmax_rlcx_rb_rptr_addr_lo); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, + m->sdmax_rlcx_rb_rptr_addr_hi); + + data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, + RB_ENABLE, 1); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); + + return 0; +} + +static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, + uint32_t engine_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t sdma_base_addr = get_sdma_base_addr(engine_id, queue_id); + uint32_t i = 0, reg; +#undef HQD_N_REGS +#define HQD_N_REGS (19+6+7+10) + + *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); + if (*dump == NULL) + return -ENOMEM; + + for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) + DUMP_REG(sdma_base_addr + reg); + for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) + DUMP_REG(sdma_base_addr + reg); + for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; + reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) + DUMP_REG(sdma_base_addr + reg); + for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; + reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) + DUMP_REG(sdma_base_addr + reg); + + WARN_ON_ONCE(i != HQD_N_REGS); + *n_regs = i; + + return 0; +} + +static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, + uint32_t pipe_id, uint32_t queue_id) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t act; + bool retval = false; + uint32_t low, high; + + acquire_queue(kgd, pipe_id, queue_id); + act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); + if (act) { + low = lower_32_bits(queue_address >> 8); + high = upper_32_bits(queue_address >> 8); + + if (low == RREG32(SOC15_REG_OFFSET(GC, 0, 
mmCP_HQD_PQ_BASE)) && + high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI))) + retval = true; + } + release_queue(kgd); + return retval; +} + +static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + struct v9_sdma_mqd *m; + uint32_t sdma_base_addr; + uint32_t sdma_rlc_rb_cntl; + + m = get_sdma_mqd(mqd); + sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, + m->sdma_queue_id); + + sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); + + if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) + return true; + + return false; +} + +static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, + enum kfd_preempt_type reset_type, + unsigned int utimeout, uint32_t pipe_id, + uint32_t queue_id) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + enum hqd_dequeue_request_type type; + unsigned long end_jiffies; + uint32_t temp; + struct v9_mqd *m = get_mqd(mqd); + +#if 0 + unsigned long flags; + int retry; +#endif + + acquire_queue(kgd, pipe_id, queue_id); + + if (m->cp_hqd_vmid == 0) + WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0); + + switch (reset_type) { + case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: + type = DRAIN_PIPE; + break; + case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: + type = RESET_WAVES; + break; + default: + type = DRAIN_PIPE; + break; + } + +#if 0 /* Is this still needed? */ + /* Workaround: If IQ timer is active and the wait time is close to or + * equal to 0, dequeueing is not safe. Wait until either the wait time + * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is + * cleared before continuing. Also, ensure wait times are set to at + * least 0x3. + */ + local_irq_save(flags); + preempt_disable(); + retry = 5000; /* wait for 500 usecs at maximum */ + while (true) { + temp = RREG32(mmCP_HQD_IQ_TIMER); + if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) { + pr_debug("HW is processing IQ\n"); + goto loop; + } + if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) { + if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE) + == 3) /* SEM-rearm is safe */ + break; + /* Wait time 3 is safe for CP, but our MMIO read/write + * time is close to 1 microsecond, so check for 10 to + * leave more buffer room + */ + if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME) + >= 10) + break; + pr_debug("IQ timer is active\n"); + } else + break; +loop: + if (!retry) { + pr_err("CP HQD IQ timer status time out\n"); + break; + } + ndelay(100); + --retry; + } + retry = 1000; + while (true) { + temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST); + if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK)) + break; + pr_debug("Dequeue request is pending\n"); + + if (!retry) { + pr_err("CP HQD dequeue request time out\n"); + break; + } + ndelay(100); + --retry; + } + local_irq_restore(flags); + preempt_enable(); +#endif + + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type); + + end_jiffies = (utimeout * HZ / 1000) + jiffies; + while (true) { + temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); + if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) + break; + if (time_after(jiffies, end_jiffies)) { + pr_err("cp queue preemption time out.\n"); + release_queue(kgd); + return -ETIME; + } + usleep_range(500, 1000); + } + + release_queue(kgd); + return 0; +} + +static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, + unsigned int utimeout) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + struct v9_sdma_mqd *m; + uint32_t sdma_base_addr; + uint32_t temp; + unsigned long 
end_jiffies = (utimeout * HZ / 1000) + jiffies; + + m = get_sdma_mqd(mqd); + sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id, + m->sdma_queue_id); + + temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); + temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp); + + while (true) { + temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); + if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) + break; + if (time_after(jiffies, end_jiffies)) + return -ETIME; + usleep_range(500, 1000); + } + + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, + RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | + SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); + + m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); + m->sdmax_rlcx_rb_rptr_hi = + RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI); + + return 0; +} + +static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, + uint8_t vmid) +{ + uint32_t reg; + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + + reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + + vmid); + return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; +} + +static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, + uint8_t vmid) +{ + uint32_t reg; + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + + reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + + vmid); + return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; +} + +static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) +{ + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + uint32_t req = (1 << vmid) | + (1 << VM_INVALIDATE_ENG16_REQ__FLUSH_TYPE__SHIFT) | /* light */ + VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PTES_MASK | + VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE0_MASK | + VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE1_MASK | + VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE2_MASK | + VM_INVALIDATE_ENG16_REQ__INVALIDATE_L1_PTES_MASK; + + spin_lock(&adev->tlb_invalidation_lock); + + /* Use light weight invalidation. + * + * TODO 1: agree on the right set of invalidation registers for + * KFD use. Use the last one for now. Invalidate both GC and + * MMHUB. 
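+ *
+ * Note: the sequence below programs a full address range, issues the
+ * invalidate request to engine 16 of both the GC and MM hubs, and then
+ * polls both ACK registers for this VMID before dropping the lock.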
+ * + * TODO 2: support range-based invalidation, requires kfg2kgd + * interface change + */ + WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_LO32), + 0xffffffff); + WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_HI32), + 0x0000001f); + + WREG32(SOC15_REG_OFFSET(MMHUB, 0, + mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32), + 0xffffffff); + WREG32(SOC15_REG_OFFSET(MMHUB, 0, + mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32), + 0x0000001f); + + WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_REQ), req); + + WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_REQ), + req); + + while (!(RREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ACK)) & + (1 << vmid))) + cpu_relax(); + + while (!(RREG32(SOC15_REG_OFFSET(MMHUB, 0, + mmMMHUB_VM_INVALIDATE_ENG16_ACK)) & + (1 << vmid))) + cpu_relax(); + + spin_unlock(&adev->tlb_invalidation_lock); + +} + +static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid) +{ + signed long r; + struct dma_fence *f; + struct amdgpu_ring *ring = &adev->gfx.kiq.ring; + + mutex_lock(&adev->gfx.kiq.ring_mutex); + amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/ + amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0)); + amdgpu_ring_write(ring, + PACKET3_INVALIDATE_TLBS_DST_SEL(1) | + PACKET3_INVALIDATE_TLBS_ALL_HUB(1) | + PACKET3_INVALIDATE_TLBS_PASID(pasid) | + PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(2)); + amdgpu_fence_emit(ring, &f); + amdgpu_ring_commit(ring); + mutex_unlock(&adev->gfx.kiq.ring_mutex); + + r = dma_fence_wait(f, false); + if (r) + DRM_ERROR("wait for kiq fence error: %ld.\n", r); + dma_fence_put(f); + + return r; +} + +static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) +{ + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + int vmid; + struct amdgpu_ring *ring = &adev->gfx.kiq.ring; + + if (ring->ready) + return invalidate_tlbs_with_kiq(adev, pasid); + + for (vmid = 0; vmid < 16; vmid++) { + if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) + continue; + if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) { + if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid) + == pasid) { + write_vmid_invalidate_request(kgd, vmid); + break; + } + } + } + + return 0; +} + +static int kgd_address_watch_disable(struct kgd_dev *kgd) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + union TCP_WATCH_CNTL_BITS cntl; + unsigned int i; + uint32_t watch_base_addr; + + cntl.u32All = 0; + + cntl.bitfields.valid = 0; + cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; + cntl.bitfields.atc = 1; + + watch_base_addr = get_watch_base_addr(); + /* Turning off this address until we set all the registers */ + for (i = 0; i < MAX_WATCH_ADDRESSES; i++) + WREG32(watch_base_addr + + watchRegs[i * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_CNTL], + cntl.u32All); + + return 0; +} + +static int kgd_address_watch_execute(struct kgd_dev *kgd, + unsigned int watch_point_id, + uint32_t cntl_val, + uint32_t addr_hi, + uint32_t addr_lo) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + union TCP_WATCH_CNTL_BITS cntl; + uint32_t watch_base_addr; + + watch_base_addr = get_watch_base_addr(); + cntl.u32All = cntl_val; + + /* Turning off this watch point until we set all the registers */ + cntl.bitfields.valid = 0; + WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL], + cntl.u32All); + + WREG32(watch_base_addr + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI], + addr_hi); + + WREG32(watch_base_addr + 
watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO], + addr_lo); + + /* Enable the watch point */ + cntl.bitfields.valid = 1; + + WREG32(watch_base_addr + + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + + ADDRESS_WATCH_REG_CNTL], + cntl.u32All); + + return 0; +} + +static int kgd_wave_control_execute(struct kgd_dev *kgd, + uint32_t gfx_index_val, + uint32_t sq_cmd) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t data = 0; + + mutex_lock(&adev->grbm_idx_mutex); + + WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val); + WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd); + + data = REG_SET_FIELD(data, GRBM_GFX_INDEX, + INSTANCE_BROADCAST_WRITES, 1); + data = REG_SET_FIELD(data, GRBM_GFX_INDEX, + SH_BROADCAST_WRITES, 1); + data = REG_SET_FIELD(data, GRBM_GFX_INDEX, + SE_BROADCAST_WRITES, 1); + + WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data); + mutex_unlock(&adev->grbm_idx_mutex); + + return 0; +} + +static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, + unsigned int watch_point_id, + unsigned int reg_offset) +{ + return get_watch_base_addr() + + watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; +} + +static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable, + uint8_t element_size, uint8_t index_stride, uint8_t mtype) +{ + /* No longer needed on GFXv9. These values are now hard-coded, + * except for the MTYPE which comes from the page table. + */ + + return 0; +} +static int alloc_memory_of_scratch(struct kgd_dev *kgd, + uint64_t va, uint32_t vmid) +{ + /* No longer needed on GFXv9. The scratch base address is + * passed to the shader by the CP. It's the user mode driver's + * responsibility. + */ + + return 0; +} + +/* FIXME: Does this need to be ASIC-specific code? 
*/ +static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) +{ + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + const union amdgpu_firmware_header *hdr; + + switch (type) { + case KGD_ENGINE_PFP: + hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; + break; + + case KGD_ENGINE_ME: + hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; + break; + + case KGD_ENGINE_CE: + hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; + break; + + case KGD_ENGINE_MEC1: + hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data; + break; + + case KGD_ENGINE_MEC2: + hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; + break; + + case KGD_ENGINE_RLC: + hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; + break; + + case KGD_ENGINE_SDMA1: + hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; + break; + + case KGD_ENGINE_SDMA2: + hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; + break; + + default: + return 0; + } + + if (hdr == NULL) + return 0; + + /* Only 12 bit in use*/ + return hdr->common.ucode_version; +} + +static void set_num_of_requests(struct kgd_dev *kgd, + uint8_t num_of_requests) +{ + pr_debug("This is a stub\n"); +} + +static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, + uint32_t page_table_base) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint64_t base = (uint64_t)page_table_base << PAGE_SHIFT | + AMDGPU_PTE_VALID; + + /* TODO: Don't use hardcoded VMIDs */ + if (vmid < 8 || vmid > 15) { + pr_err("trying to set page table base for wrong VMID %u\n", + vmid); + return; + } + + /* TODO: take advantage of per-process address space size. For + * now, all processes share the same address space size, like + * on GFX8 and older. 
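+ *
+ * Each VMID owns a pair of 32-bit start/end/base registers, hence the
+ * (vmid*2) offsets below. The same values are programmed into both the
+ * MMHUB and the GC hub so that translations through either hub use the
+ * same page table.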
+ */ + WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); + WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); + + WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), + lower_32_bits(adev->vm_manager.max_pfn - 1)); + WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), + upper_32_bits(adev->vm_manager.max_pfn - 1)); + + WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); + WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); + + WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); + WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); + + WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), + lower_32_bits(adev->vm_manager.max_pfn - 1)); + WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), + upper_32_bits(adev->vm_manager.max_pfn - 1)); + + WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); + WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c new file mode 100644 index 0000000..7df892d --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -0,0 +1,2578 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#undef pr_fmt +#define pr_fmt(fmt) "kfd2kgd: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "amdgpu_amdkfd.h" +#include "amdgpu_ucode.h" +#include "gca/gfx_8_0_sh_mask.h" +#include "gca/gfx_8_0_d.h" +#include "gca/gfx_8_0_enum.h" +#include "oss/oss_3_0_sh_mask.h" +#include "oss/oss_3_0_d.h" +#include "gmc/gmc_8_1_sh_mask.h" +#include "gmc/gmc_8_1_d.h" + +/* Special VM and GART address alignment needed for VI pre-Fiji due to + * a HW bug. 
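+ *
+ * Fiji and Polaris are not affected; __alloc_memory_of_gpu() falls back
+ * to an alignment of 1 on those ASICs.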
+ */ +#define VI_BO_SIZE_ALIGN (0x8000) + +/* BO flag to indicate a KFD userptr BO */ +#define AMDGPU_AMDKFD_USERPTR_BO (1ULL << 63) + +/* Impose limit on how much memory KFD can use */ +struct kfd_mem_usage_limit { + uint64_t max_system_mem_limit; + uint64_t max_userptr_mem_limit; + int64_t system_mem_used; + int64_t userptr_mem_used; + spinlock_t mem_limit_lock; +}; + +static struct kfd_mem_usage_limit kfd_mem_limit; + +/* Struct used for amdgpu_amdkfd_bo_validate */ +struct amdgpu_vm_parser { + uint32_t domain; + bool wait; +}; + +static const char * const domain_bit_to_string[] = { + "CPU", + "GTT", + "VRAM", + "GDS", + "GWS", + "OA" +}; + +#define domain_string(domain) domain_bit_to_string[ffs(domain)-1] + +static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work); + + +static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) +{ + return (struct amdgpu_device *)kgd; +} + +static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm, + struct kgd_mem *mem) +{ + struct kfd_bo_va_list *entry; + + list_for_each_entry(entry, &mem->bo_va_list, bo_list) + if (entry->bo_va->base.vm == avm) + return false; + + return true; +} + +/* Set memory usage limits. Current, limits are + * System (kernel) memory - 15/16th System RAM + * Userptr memory - 15/16th System RAM + */ +void amdgpu_amdkfd_gpuvm_init_mem_limits(void) +{ + struct sysinfo si; + uint64_t mem; + + si_meminfo(&si); + mem = si.totalram - si.totalhigh; + mem *= si.mem_unit; + + spin_lock_init(&kfd_mem_limit.mem_limit_lock); + kfd_mem_limit.max_system_mem_limit = mem - (mem >> 4); /* 15/16 */ + kfd_mem_limit.max_userptr_mem_limit = mem - (mem >> 4); /* 15/16 */ + pr_debug("Kernel memory limit %lluM, userptr limit %lluM\n", + (kfd_mem_limit.max_system_mem_limit >> 20), + (kfd_mem_limit.max_userptr_mem_limit >> 20)); +} + +static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev, + uint64_t size, u32 domain) +{ + size_t acc_size; + int ret = 0; + + acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size, + sizeof(struct amdgpu_bo)); + + spin_lock(&kfd_mem_limit.mem_limit_lock); + if (domain == AMDGPU_GEM_DOMAIN_GTT) { + if (kfd_mem_limit.system_mem_used + (acc_size + size) > + kfd_mem_limit.max_system_mem_limit) { + ret = -ENOMEM; + goto err_no_mem; + } + kfd_mem_limit.system_mem_used += (acc_size + size); + } else if (domain == AMDGPU_GEM_DOMAIN_CPU) { + if ((kfd_mem_limit.system_mem_used + acc_size > + kfd_mem_limit.max_system_mem_limit) || + (kfd_mem_limit.userptr_mem_used + (size + acc_size) > + kfd_mem_limit.max_userptr_mem_limit)) { + ret = -ENOMEM; + goto err_no_mem; + } + kfd_mem_limit.system_mem_used += acc_size; + kfd_mem_limit.userptr_mem_used += size; + } +err_no_mem: + spin_unlock(&kfd_mem_limit.mem_limit_lock); + return ret; +} + +static void unreserve_system_mem_limit(struct amdgpu_device *adev, + uint64_t size, u32 domain) +{ + size_t acc_size; + + acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size, + sizeof(struct amdgpu_bo)); + + spin_lock(&kfd_mem_limit.mem_limit_lock); + if (domain == AMDGPU_GEM_DOMAIN_GTT) { + kfd_mem_limit.system_mem_used -= (acc_size + size); + } else if (domain == AMDGPU_GEM_DOMAIN_CPU) { + kfd_mem_limit.system_mem_used -= acc_size; + kfd_mem_limit.userptr_mem_used -= size; + } + WARN_ONCE(kfd_mem_limit.system_mem_used < 0, + "kfd system memory accounting unbalanced"); + WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0, + "kfd userptr memory accounting unbalanced"); + + spin_unlock(&kfd_mem_limit.mem_limit_lock); +} + +void 
amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo) +{ + spin_lock(&kfd_mem_limit.mem_limit_lock); + + if (bo->flags & AMDGPU_AMDKFD_USERPTR_BO) { + kfd_mem_limit.system_mem_used -= bo->tbo.acc_size; + kfd_mem_limit.userptr_mem_used -= amdgpu_bo_size(bo); + } else if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) { + kfd_mem_limit.system_mem_used -= + (bo->tbo.acc_size + amdgpu_bo_size(bo)); + } + WARN_ONCE(kfd_mem_limit.system_mem_used < 0, + "kfd system memory accounting unbalanced"); + WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0, + "kfd userptr memory accounting unbalanced"); + + spin_unlock(&kfd_mem_limit.mem_limit_lock); +} + + +/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence(s) from BO's + * reservation object. + * + * @bo: [IN] Remove eviction fence(s) from this BO + * @ef: [IN] If ef is specified, then this eviction fence is removed if it + * is present in the shared list. + * @ef_list: [OUT] Returns list of eviction fences. These fences are removed + * from BO's reservation object shared list. + * @ef_count: [OUT] Number of fences in ef_list. + * + * NOTE: If called with ef_list, then amdgpu_amdkfd_add_eviction_fence must be + * called to restore the eviction fences and to avoid memory leak. This is + * useful for shared BOs. + * NOTE: Must be called with BO reserved i.e. bo->tbo.resv->lock held. + */ +static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo, + struct amdgpu_amdkfd_fence *ef, + struct amdgpu_amdkfd_fence ***ef_list, + unsigned int *ef_count) +{ + struct reservation_object_list *fobj; + struct reservation_object *resv; + unsigned int i = 0, j = 0, k = 0, shared_count; + unsigned int count = 0; + struct amdgpu_amdkfd_fence **fence_list; + + if (!ef && !ef_list) + return -EINVAL; + + if (ef_list) { + *ef_list = NULL; + *ef_count = 0; + } + + resv = bo->tbo.resv; + fobj = reservation_object_get_list(resv); + + if (!fobj) + return 0; + + preempt_disable(); + write_seqcount_begin(&resv->seq); + + /* Go through all the shared fences in the resevation object. If + * ef is specified and it exists in the list, remove it and reduce the + * count. If ef is not specified, then get the count of eviction fences + * present. + */ + shared_count = fobj->shared_count; + for (i = 0; i < shared_count; ++i) { + struct dma_fence *f; + + f = rcu_dereference_protected(fobj->shared[i], + reservation_object_held(resv)); + + if (ef) { + if (f->context == ef->base.context) { + dma_fence_put(f); + fobj->shared_count--; + } else + RCU_INIT_POINTER(fobj->shared[j++], f); + + } else if (to_amdgpu_amdkfd_fence(f)) + count++; + } + write_seqcount_end(&resv->seq); + preempt_enable(); + + if (ef || !count) + return 0; + + /* Alloc memory for count number of eviction fence pointers. 
Fill the + * ef_list array and ef_count + */ + + fence_list = kcalloc(count, sizeof(struct amdgpu_amdkfd_fence *), + GFP_KERNEL); + if (!fence_list) + return -ENOMEM; + + preempt_disable(); + write_seqcount_begin(&resv->seq); + + j = 0; + for (i = 0; i < shared_count; ++i) { + struct dma_fence *f; + struct amdgpu_amdkfd_fence *efence; + + f = rcu_dereference_protected(fobj->shared[i], + reservation_object_held(resv)); + + efence = to_amdgpu_amdkfd_fence(f); + if (efence) { + fence_list[k++] = efence; + fobj->shared_count--; + } else + RCU_INIT_POINTER(fobj->shared[j++], f); + } + + write_seqcount_end(&resv->seq); + preempt_enable(); + + *ef_list = fence_list; + *ef_count = k; + + return 0; +} + +/* amdgpu_amdkfd_add_eviction_fence - Adds eviction fence(s) back into BO's + * reservation object. + * + * @bo: [IN] Add eviction fences to this BO + * @ef_list: [IN] List of eviction fences to be added + * @ef_count: [IN] Number of fences in ef_list. + * + * NOTE: Must call amdgpu_amdkfd_remove_eviction_fence before calling this + * function. + */ +static void amdgpu_amdkfd_add_eviction_fence(struct amdgpu_bo *bo, + struct amdgpu_amdkfd_fence **ef_list, + unsigned int ef_count) +{ + int i; + + if (!ef_list || !ef_count) + return; + + for (i = 0; i < ef_count; i++) { + amdgpu_bo_fence(bo, &ef_list[i]->base, true); + /* Readding the fence takes an additional reference. Drop that + * reference. + */ + dma_fence_put(&ef_list[i]->base); + } + + kfree(ef_list); +} + +static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain, + bool wait) +{ + int ret; + + if (WARN(amdgpu_ttm_tt_get_usermm(bo->tbo.ttm), + "Called with userptr BO")) + return -EINVAL; + + amdgpu_ttm_placement_from_domain(bo, domain); + + ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); + if (ret) + goto validate_fail; + if (wait) { + struct amdgpu_amdkfd_fence **ef_list; + unsigned int ef_count; + + ret = amdgpu_amdkfd_remove_eviction_fence(bo, NULL, &ef_list, + &ef_count); + if (ret) + goto validate_fail; + + ttm_bo_wait(&bo->tbo, false, false); + amdgpu_amdkfd_add_eviction_fence(bo, ef_list, ef_count); + } + +validate_fail: + return ret; +} + +static int amdgpu_amdkfd_validate(void *param, struct amdgpu_bo *bo) +{ + struct amdgpu_vm_parser *p = param; + + return amdgpu_amdkfd_bo_validate(bo, p->domain, p->wait); +} + +/* vm_validate_pt_pd_bos - Validate page table and directory BOs + * + * Also updates page directory entries so we don't need to do this + * again later until the page directory is validated again (e.g. after + * an eviction or allocating new page tables). + */ +static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm) +{ + struct amdgpu_bo *pd = vm->root.base.bo; + struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev); + struct amdgpu_vm_parser param; + int ret; + + param.domain = AMDGPU_GEM_DOMAIN_VRAM; + param.wait = false; + + ret = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_amdkfd_validate, + ¶m); + if (ret) { + pr_err("amdgpu: failed to validate PT BOs\n"); + return ret; + } + + ret = amdgpu_amdkfd_validate(¶m, pd); + if (ret) { + pr_err("amdgpu: failed to validate PD\n"); + return ret; + } + + ret = amdgpu_vm_update_directories(adev, vm); + if (ret != 0) + return ret; + + return 0; +} + +/* add_bo_to_vm - Add a BO to a VM + * + * Everything that needs to bo done only once when a BO is first added + * to a VM. It can later be mapped and unmapped many times without + * repeating these steps. + * + * 1. Allocate and initialize BO VA entry data structure + * 2. Add BO to the VM + * 3. 
Determine ASIC-specific PTE flags + * 4. Alloc page tables and directories if needed + * 4a. Validate new page tables and directories and update directories + */ +static int add_bo_to_vm(struct amdgpu_device *adev, struct kgd_mem *mem, + struct amdgpu_vm *avm, bool is_aql, + struct kfd_bo_va_list **p_bo_va_entry) +{ + int ret; + struct kfd_bo_va_list *bo_va_entry; + struct amdkfd_vm *kvm = container_of(avm, + struct amdkfd_vm, base); + struct amdgpu_bo *pd = avm->root.base.bo; + struct amdgpu_bo *bo = mem->bo; + uint64_t va = mem->va; + struct list_head *list_bo_va = &mem->bo_va_list; + unsigned long bo_size = bo->tbo.mem.size; + + if (!va) { + pr_err("Invalid VA when adding BO to VM\n"); + return -EINVAL; + } + + if (is_aql) + va += bo_size; + + bo_va_entry = kzalloc(sizeof(*bo_va_entry), GFP_KERNEL); + if (!bo_va_entry) + return -ENOMEM; + + pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va, + va + bo_size, avm); + + /* Add BO to VM internal data structures*/ + bo_va_entry->bo_va = amdgpu_vm_bo_add(adev, avm, bo); + if (bo_va_entry->bo_va == NULL) { + ret = -EINVAL; + pr_err("Failed to add BO object to VM. ret == %d\n", + ret); + goto err_vmadd; + } + + bo_va_entry->va = va; + bo_va_entry->pte_flags = amdgpu_vm_get_pte_flags(adev, + mem->mapping_flags); + bo_va_entry->kgd_dev = (void *)adev; + list_add(&bo_va_entry->bo_list, list_bo_va); + + if (p_bo_va_entry) + *p_bo_va_entry = bo_va_entry; + + /* Allocate new page tables if neeeded and validate + * them. Clearing of new page tables and validate need to wait + * on move fences. We don't want that to trigger the eviction + * fence, so remove it temporarily. + */ + amdgpu_amdkfd_remove_eviction_fence(pd, + kvm->process_info->eviction_fence, + NULL, NULL); + + ret = amdgpu_vm_alloc_pts(adev, avm, va, amdgpu_bo_size(bo)); + if (ret) { + pr_err("Failed to allocate pts, err=%d\n", ret); + goto err_alloc_pts; + } + + ret = vm_validate_pt_pd_bos(avm); + if (ret != 0) { + pr_err("validate_pt_pd_bos() failed\n"); + goto err_alloc_pts; + } + + /* Add the eviction fence back */ + amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); + + return 0; + +err_alloc_pts: + amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); + amdgpu_vm_bo_rmv(adev, bo_va_entry->bo_va); + list_del(&bo_va_entry->bo_list); +err_vmadd: + kfree(bo_va_entry); + return ret; +} + +static void remove_bo_from_vm(struct amdgpu_device *adev, + struct kfd_bo_va_list *entry, unsigned long size) +{ + pr_debug("\t remove VA 0x%llx - 0x%llx in entry %p\n", + entry->va, + entry->va + size, entry); + amdgpu_vm_bo_rmv(adev, entry->bo_va); + list_del(&entry->bo_list); + kfree(entry); +} + +static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem, + struct amdkfd_process_info *process_info, + bool userptr) +{ + struct ttm_validate_buffer *entry = &mem->validate_list; + struct amdgpu_bo *bo = mem->bo; + + INIT_LIST_HEAD(&entry->head); + entry->shared = true; + entry->bo = &bo->tbo; + mutex_lock(&process_info->lock); + if (userptr) + list_add_tail(&entry->head, &process_info->userptr_valid_list); + else + list_add_tail(&entry->head, &process_info->kfd_bo_list); + mutex_unlock(&process_info->lock); +} + +/* Initializes user pages. It registers the MMU notifier and validates + * the userptr BO in the GTT domain. + * + * The BO must already be on the userptr_valid_list. Otherwise an + * eviction and restore may happen that leaves the new BO unmapped + * with the user mode queues running. 
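+ * (__alloc_memory_of_gpu() adds the BO to that list before calling this
+ * function.)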
+ * + * Takes the process_info->lock to protect against concurrent restore + * workers. + * + * Returns 0 for success, negative errno for errors. + */ +static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm, + uint64_t user_addr) +{ + struct amdkfd_process_info *process_info = mem->process_info; + struct amdgpu_bo *bo = mem->bo; + int ret = 0; + + mutex_lock(&process_info->lock); + + ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0); + if (ret) { + pr_err("%s: Failed to set userptr: %d\n", __func__, ret); + goto out; + } + + ret = amdgpu_mn_register(bo, user_addr); + if (ret) { + pr_err("%s: Failed to register MMU notifier: %d\n", + __func__, ret); + goto out; + } + + /* If no restore worker is running concurrently, user_pages + * should not be allocated + */ + WARN(mem->user_pages, "Leaking user_pages array"); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + mem->user_pages = drm_calloc_large(bo->tbo.ttm->num_pages, + sizeof(struct page *)); +#else + mem->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages, + sizeof(struct page *), + GFP_KERNEL | __GFP_ZERO); +#endif + if (!mem->user_pages) { + pr_err("%s: Failed to allocate pages array\n", __func__); + ret = -ENOMEM; + goto unregister_out; + } + + ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages); + if (ret) { + pr_err("%s: Failed to get user pages: %d\n", __func__, ret); + goto free_out; + } + + amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->user_pages); + + ret = amdgpu_bo_reserve(bo, true); + if (ret) { + pr_err("%s: Failed to reserve BO\n", __func__); + goto release_out; + } + amdgpu_ttm_placement_from_domain(bo, mem->domain); + ret = ttm_bo_validate(&bo->tbo, &bo->placement, + true, false); + if (ret) + pr_err("%s: failed to validate BO\n", __func__); + amdgpu_bo_unreserve(bo); + +release_out: + if (ret) + release_pages(mem->user_pages, bo->tbo.ttm->num_pages, 0); +free_out: +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + drm_free_large(mem->user_pages); +#else + kvfree(mem->user_pages); +#endif + mem->user_pages = NULL; +unregister_out: + if (ret) + amdgpu_mn_unregister(bo); +out: + mutex_unlock(&process_info->lock); + return ret; +} + +static int __map_bo_to_kernel(struct amdgpu_bo *bo, u32 domain, void **kptr) +{ + int ret; + + ret = amdgpu_bo_reserve(bo, true); + if (ret) { + pr_err("Failed to reserve bo. ret %d\n", ret); + return ret; + } + + ret = amdgpu_bo_pin(bo, domain, NULL); + if (ret) { + pr_err("Failed to pin bo. ret %d\n", ret); + goto pin_failed; + } + + ret = amdgpu_bo_kmap(bo, kptr); + if (ret) { + pr_err("Failed to map bo to kernel. 
ret %d\n", ret); + goto kmap_failed; + } + + amdgpu_bo_unreserve(bo); + + return ret; + +kmap_failed: + amdgpu_bo_unpin(bo); +pin_failed: + amdgpu_bo_unreserve(bo); + + return ret; +} + +static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va, + uint64_t size, void *vm, struct kgd_mem **mem, + uint64_t *offset, u32 domain, u64 flags, + struct sg_table *sg, bool aql_queue, + bool readonly, bool execute, bool coherent, bool no_sub, + bool userptr) +{ + struct amdgpu_device *adev; + int ret; + struct amdgpu_bo *bo; + uint64_t user_addr = 0; + int byte_align; + u32 alloc_domain; + uint32_t mapping_flags; + struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; + + if (aql_queue) + size = size >> 1; + if (userptr) { + if (!offset || !*offset) + return -EINVAL; + user_addr = *offset; + } + + adev = get_amdgpu_device(kgd); + byte_align = (adev->family == AMDGPU_FAMILY_VI && + adev->asic_type != CHIP_FIJI && + adev->asic_type != CHIP_POLARIS10 && + adev->asic_type != CHIP_POLARIS11) ? + VI_BO_SIZE_ALIGN : 1; + + *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); + if (*mem == NULL) { + ret = -ENOMEM; + goto err; + } + INIT_LIST_HEAD(&(*mem)->bo_va_list); + mutex_init(&(*mem)->lock); + (*mem)->coherent = coherent; + (*mem)->no_substitute = no_sub; + (*mem)->aql_queue = aql_queue; + + mapping_flags = AMDGPU_VM_PAGE_READABLE; + if (!readonly) + mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE; + if (execute) + mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; + if (coherent) + mapping_flags |= AMDGPU_VM_MTYPE_UC; + else + mapping_flags |= AMDGPU_VM_MTYPE_NC; + + (*mem)->mapping_flags = mapping_flags; + + alloc_domain = userptr ? AMDGPU_GEM_DOMAIN_CPU : domain; + + amdgpu_sync_create(&(*mem)->sync); + + ret = amdgpu_amdkfd_reserve_system_mem_limit(adev, size, alloc_domain); + if (ret) { + pr_err("Insufficient system memory\n"); + goto err_bo_create; + } + + pr_debug("\t create BO VA 0x%llx size 0x%llx domain %s\n", + va, size, domain_string(alloc_domain)); + + /* Allocate buffer object. Userptr objects need to start out + * in the CPU domain, get moved to GTT when pinned. + */ + ret = amdgpu_bo_create(adev, size, byte_align, false, + alloc_domain, + flags, sg, NULL, 0, &bo); + if (ret != 0) { + pr_err("Failed to create BO on domain %s. ret %d\n", + domain_string(alloc_domain), ret); + unreserve_system_mem_limit(adev, size, alloc_domain); + goto err_bo_create; + } + bo->kfd_bo = *mem; + (*mem)->bo = bo; + if (userptr) + bo->flags |= AMDGPU_AMDKFD_USERPTR_BO; + + (*mem)->va = va; + (*mem)->domain = domain; + (*mem)->mapped_to_gpu_memory = 0; + (*mem)->process_info = kfd_vm->process_info; + add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, userptr); + + if (userptr) { + ret = init_user_pages(*mem, current->mm, user_addr); + if (ret) { + mutex_lock(&kfd_vm->process_info->lock); + list_del(&(*mem)->validate_list.head); + mutex_unlock(&kfd_vm->process_info->lock); + goto allocate_init_user_pages_failed; + } + } + + if (offset) + *offset = amdgpu_bo_mmap_offset(bo); + + return 0; + +allocate_init_user_pages_failed: + amdgpu_bo_unref(&bo); +err_bo_create: + kfree(*mem); +err: + return ret; +} + +/* Reserving a BO and its page table BOs must happen atomically to + * avoid deadlocks. When updating userptrs we need to temporarily + * back-off the reservation and then reacquire it. Track all the + * reservation info in a context structure. Buffers can be mapped to + * multiple VMs simultaneously (buffers being restored on multiple + * GPUs). 
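+ *
+ * The context tracks the ww_acquire ticket, the validation lists and the
+ * amdgpu_sync object used while the reservation is held.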
+ */ +struct bo_vm_reservation_context { + struct amdgpu_bo_list_entry kfd_bo; + unsigned int n_vms; + struct amdgpu_bo_list_entry *vm_pd; + struct ww_acquire_ctx ticket; + struct list_head list, duplicates; + struct amdgpu_sync *sync; + bool reserved; +}; + +/** + * reserve_bo_and_vm - reserve a BO and a VM unconditionally. + * @mem: KFD BO structure. + * @vm: the VM to reserve. + * @ctx: the struct that will be used in unreserve_bo_and_vms(). + */ +static int reserve_bo_and_vm(struct kgd_mem *mem, + struct amdgpu_vm *vm, + struct bo_vm_reservation_context *ctx) +{ + struct amdgpu_bo *bo = mem->bo; + int ret; + + WARN_ON(!vm); + + ctx->reserved = false; + ctx->n_vms = 1; + ctx->sync = &mem->sync; + + INIT_LIST_HEAD(&ctx->list); + INIT_LIST_HEAD(&ctx->duplicates); + + ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry) + * ctx->n_vms, GFP_KERNEL); + if (ctx->vm_pd == NULL) + return -ENOMEM; + + ctx->kfd_bo.robj = bo; + ctx->kfd_bo.priority = 0; + ctx->kfd_bo.tv.bo = &bo->tbo; + ctx->kfd_bo.tv.shared = true; + ctx->kfd_bo.user_pages = NULL; + list_add(&ctx->kfd_bo.tv.head, &ctx->list); + + amdgpu_vm_get_pd_bo(vm, &ctx->list, &ctx->vm_pd[0]); + + ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, + false, &ctx->duplicates); + if (!ret) + ctx->reserved = true; + else + pr_err("Failed to reserve buffers in ttm\n"); + + if (ret) { + kfree(ctx->vm_pd); + ctx->vm_pd = NULL; + } + + return ret; +} + +enum VA_TYPE { + VA_NOT_MAPPED = 0, + VA_MAPPED, + VA_DO_NOT_CARE, +}; + +/** + * reserve_bo_and_vm - reserve a BO and some VMs that the BO has been added + * to, conditionally based on map_type. + * @mem: KFD BO structure. + * @vm: the VM to reserve. If NULL, then all VMs associated with the BO + * is used. Otherwise, a single VM associated with the BO. + * @map_type: the mapping status that will be used to filter the VMs. + * @ctx: the struct that will be used in unreserve_bo_and_vms(). 
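+ *
+ * Returns 0 on success, negative errno on failure. On failure the
+ * partially initialized context is cleaned up before returning.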
+ */ +static int reserve_bo_and_cond_vms(struct kgd_mem *mem, + struct amdgpu_vm *vm, enum VA_TYPE map_type, + struct bo_vm_reservation_context *ctx) +{ + struct amdgpu_bo *bo = mem->bo; + struct kfd_bo_va_list *entry; + unsigned int i; + int ret; + + ctx->reserved = false; + ctx->n_vms = 0; + ctx->vm_pd = NULL; + ctx->sync = &mem->sync; + + INIT_LIST_HEAD(&ctx->list); + INIT_LIST_HEAD(&ctx->duplicates); + + list_for_each_entry(entry, &mem->bo_va_list, bo_list) { + if ((vm && vm != entry->bo_va->base.vm) || + (entry->is_mapped != map_type + && map_type != VA_DO_NOT_CARE)) + continue; + + ctx->n_vms++; + } + + if (ctx->n_vms != 0) { + ctx->vm_pd = kzalloc(sizeof(struct amdgpu_bo_list_entry) + * ctx->n_vms, GFP_KERNEL); + if (ctx->vm_pd == NULL) + return -ENOMEM; + } + + ctx->kfd_bo.robj = bo; + ctx->kfd_bo.priority = 0; + ctx->kfd_bo.tv.bo = &bo->tbo; + ctx->kfd_bo.tv.shared = true; + ctx->kfd_bo.user_pages = NULL; + list_add(&ctx->kfd_bo.tv.head, &ctx->list); + + i = 0; + list_for_each_entry(entry, &mem->bo_va_list, bo_list) { + if ((vm && vm != entry->bo_va->base.vm) || + (entry->is_mapped != map_type + && map_type != VA_DO_NOT_CARE)) + continue; + + amdgpu_vm_get_pd_bo(entry->bo_va->base.vm, &ctx->list, + &ctx->vm_pd[i]); + i++; + } + + ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list, + false, &ctx->duplicates); + if (!ret) + ctx->reserved = true; + else + pr_err("Failed to reserve buffers in ttm.\n"); + + if (ret) { + kfree(ctx->vm_pd); + ctx->vm_pd = NULL; + } + + return ret; +} + +static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, + bool wait, bool intr) +{ + int ret = 0; + + if (wait) + ret = amdgpu_sync_wait(ctx->sync, intr); + + if (ctx->reserved) + ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list); + kfree(ctx->vm_pd); + + ctx->sync = NULL; + + ctx->reserved = false; + ctx->vm_pd = NULL; + + return ret; +} + +static int unmap_bo_from_gpuvm(struct amdgpu_device *adev, + struct kfd_bo_va_list *entry, + struct amdgpu_sync *sync) +{ + struct amdgpu_bo_va *bo_va = entry->bo_va; + struct amdgpu_vm *vm = bo_va->base.vm; + struct amdkfd_vm *kvm = container_of(vm, struct amdkfd_vm, base); + struct amdgpu_bo *pd = vm->root.base.bo; + + /* Remove eviction fence from PD (and thereby from PTs too as they + * share the resv. object. Otherwise during PT update job (see + * amdgpu_vm_bo_update_mapping), eviction fence will get added to + * job->sync object + */ + amdgpu_amdkfd_remove_eviction_fence(pd, + kvm->process_info->eviction_fence, + NULL, NULL); + amdgpu_vm_bo_unmap(adev, bo_va, entry->va); + + amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update); + + /* Add the eviction fence back */ + amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true); + + amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); + + /* Sync objects can't handle multiple GPUs (contexts) updating + * sync->last_vm_update. Fortunately we don't need it for + * KFD's purposes, so we can just drop that fence. 
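+ * update_gpuvm_pte() below drops it for the same reason.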
+ */ + if (sync->last_vm_update) { + dma_fence_put(sync->last_vm_update); + sync->last_vm_update = NULL; + } + + return 0; +} + +static int update_gpuvm_pte(struct amdgpu_device *adev, + struct kfd_bo_va_list *entry, + struct amdgpu_sync *sync) +{ + int ret; + struct amdgpu_vm *vm; + struct amdgpu_bo_va *bo_va; + struct amdgpu_bo *bo; + + bo_va = entry->bo_va; + vm = bo_va->base.vm; + bo = bo_va->base.bo; + + /* Update the page tables */ + ret = amdgpu_vm_bo_update(adev, bo_va, false); + if (ret != 0) { + pr_err("amdgpu_vm_bo_update failed\n"); + return ret; + } + + amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); + + /* Sync objects can't handle multiple GPUs (contexts) updating + * sync->last_vm_update. Fortunately we don't need it for + * KFD's purposes, so we can just drop that fence. + */ + if (sync->last_vm_update) { + dma_fence_put(sync->last_vm_update); + sync->last_vm_update = NULL; + } + + return 0; +} + +static int map_bo_to_gpuvm(struct amdgpu_device *adev, + struct kfd_bo_va_list *entry, struct amdgpu_sync *sync, + bool no_update_pte) +{ + int ret; + + /* Set virtual address for the allocation */ + ret = amdgpu_vm_bo_map(adev, entry->bo_va, entry->va, 0, + amdgpu_bo_size(entry->bo_va->base.bo), entry->pte_flags); + if (ret != 0) { + pr_err("Failed to map VA 0x%llx in vm. ret %d\n", + entry->va, ret); + return ret; + } + + if (no_update_pte) + return 0; + + ret = update_gpuvm_pte(adev, entry, sync); + if (ret != 0) { + pr_err("update_gpuvm_pte() failed\n"); + goto update_gpuvm_pte_failed; + } + + return 0; + +update_gpuvm_pte_failed: + unmap_bo_from_gpuvm(adev, entry, sync); + return ret; +} + +static struct sg_table *create_doorbell_sg(uint64_t addr, uint32_t size) +{ + struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL); + + if (!sg) + return NULL; + if (sg_alloc_table(sg, 1, GFP_KERNEL)) { + kfree(sg); + return NULL; + } + sg->sgl->dma_address = addr; + sg->sgl->length = size; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->sgl->dma_length = size; +#endif + return sg; +} + +int amdgpu_amdkfd_gpuvm_sync_memory( + struct kgd_dev *kgd, struct kgd_mem *mem, bool intr) +{ + int ret = 0; + struct amdgpu_sync sync; + struct amdgpu_device *adev; + + adev = get_amdgpu_device(kgd); + amdgpu_sync_create(&sync); + + mutex_lock(&mem->lock); + amdgpu_sync_clone(adev, &mem->sync, &sync); + mutex_unlock(&mem->lock); + + ret = amdgpu_sync_wait(&sync, intr); + amdgpu_sync_free(&sync); + return ret; +} + +#define BOOL_TO_STR(b) (b == true) ? "true" : "false" + +int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( + struct kgd_dev *kgd, uint64_t va, uint64_t size, + void *vm, struct kgd_mem **mem, + uint64_t *offset, uint32_t flags) +{ + bool aql_queue, public, readonly, execute, coherent, no_sub, userptr; + u64 alloc_flag; + uint32_t domain; + uint64_t *temp_offset; + struct sg_table *sg = NULL; + + if (!(flags & ALLOC_MEM_FLAGS_NONPAGED)) { + pr_err("current hw doesn't support paged memory\n"); + return -EINVAL; + } + + domain = 0; + alloc_flag = 0; + temp_offset = NULL; + + aql_queue = (flags & ALLOC_MEM_FLAGS_AQL_QUEUE_MEM) ? true : false; + public = (flags & ALLOC_MEM_FLAGS_PUBLIC) ? true : false; + readonly = (flags & ALLOC_MEM_FLAGS_READONLY) ? true : false; + execute = (flags & ALLOC_MEM_FLAGS_EXECUTE_ACCESS) ? true : false; + coherent = (flags & ALLOC_MEM_FLAGS_COHERENT) ? true : false; + no_sub = (flags & ALLOC_MEM_FLAGS_NO_SUBSTITUTE) ? true : false; + userptr = (flags & ALLOC_MEM_FLAGS_USERPTR) ? 
true : false; + + /* + * Check on which domain to allocate BO + */ + if (flags & ALLOC_MEM_FLAGS_VRAM) { + domain = AMDGPU_GEM_DOMAIN_VRAM; + alloc_flag = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; + if (public) { + alloc_flag = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; + temp_offset = offset; + } + alloc_flag |= AMDGPU_GEM_CREATE_VRAM_CLEARED; + } else if (flags & (ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_USERPTR)) { + domain = AMDGPU_GEM_DOMAIN_GTT; + alloc_flag = 0; + temp_offset = offset; + } else if (flags & ALLOC_MEM_FLAGS_DOORBELL) { + domain = AMDGPU_GEM_DOMAIN_GTT; + alloc_flag = 0; + temp_offset = offset; + if (size > UINT_MAX) + return -EINVAL; + sg = create_doorbell_sg(*offset, size); + if (!sg) + return -ENOMEM; + } + + if (offset && !userptr) + *offset = 0; + + pr_debug("Allocate VA 0x%llx - 0x%llx domain %s aql %s\n", + va, va + size, domain_string(domain), + BOOL_TO_STR(aql_queue)); + + pr_debug("\t alloc_flag 0x%llx public %s readonly %s execute %s coherent %s no_sub %s\n", + alloc_flag, BOOL_TO_STR(public), + BOOL_TO_STR(readonly), BOOL_TO_STR(execute), + BOOL_TO_STR(coherent), BOOL_TO_STR(no_sub)); + + return __alloc_memory_of_gpu(kgd, va, size, vm, mem, + temp_offset, domain, + alloc_flag, sg, + aql_queue, readonly, execute, + coherent, no_sub, userptr); +} + +int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( + struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) +{ + struct amdgpu_device *adev; + struct kfd_bo_va_list *entry, *tmp; + struct bo_vm_reservation_context ctx; + int ret = 0; + struct ttm_validate_buffer *bo_list_entry; + struct amdkfd_process_info *process_info; + unsigned long bo_size; + + adev = get_amdgpu_device(kgd); + process_info = ((struct amdkfd_vm *)vm)->process_info; + + bo_size = mem->bo->tbo.mem.size; + + mutex_lock(&mem->lock); + + if (mem->mapped_to_gpu_memory > 0) { + pr_err("BO VA 0x%llx size 0x%lx is already mapped to vm %p.\n", + mem->va, bo_size, vm); + mutex_unlock(&mem->lock); + return -EBUSY; + } + + mutex_unlock(&mem->lock); + /* lock is not needed after this, since mem is unused and will + * be freed anyway + */ + + /* No more MMU notifiers */ + amdgpu_mn_unregister(mem->bo); + + /* Make sure restore workers don't access the BO any more */ + bo_list_entry = &mem->validate_list; + mutex_lock(&process_info->lock); + list_del(&bo_list_entry->head); + mutex_unlock(&process_info->lock); + + /* Free user pages if necessary */ + if (mem->user_pages) { + pr_debug("%s: Freeing user_pages array\n", __func__); + if (mem->user_pages[0]) + release_pages(mem->user_pages, + mem->bo->tbo.ttm->num_pages, 0); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + drm_free_large(mem->user_pages); +#else + kvfree(mem->user_pages); +#endif + } + + ret = reserve_bo_and_cond_vms(mem, NULL, VA_DO_NOT_CARE, &ctx); + if (unlikely(ret != 0)) + return ret; + + /* The eviction fence should be removed by the last unmap. + * TODO: Log an error condition if the bo still has the eviction fence + * attached + */ + amdgpu_amdkfd_remove_eviction_fence(mem->bo, + process_info->eviction_fence, + NULL, NULL); + pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va, + mem->va + bo_size * (1 + mem->aql_queue)); + + /* Remove from VM internal data structures */ + list_for_each_entry_safe(entry, tmp, &mem->bo_va_list, bo_list) { + remove_bo_from_vm((struct amdgpu_device *)entry->kgd_dev, + entry, bo_size); + } + + ret = unreserve_bo_and_vms(&ctx, false, false); + + /* Free the sync object */ + amdgpu_sync_free(&mem->sync); + + /* If the SG is not NULL, it's one we created for a doorbell + * BO. 
We need to free it. + */ + if (mem->bo->tbo.sg) { + sg_free_table(mem->bo->tbo.sg); + kfree(mem->bo->tbo.sg); + } + + /* Free the BO*/ + amdgpu_bo_unref(&mem->bo); + kfree(mem); + + return ret; +} + +int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( + struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) +{ + struct amdgpu_device *adev; + int ret; + struct amdgpu_bo *bo; + uint32_t domain; + struct kfd_bo_va_list *entry; + struct bo_vm_reservation_context ctx; + struct kfd_bo_va_list *bo_va_entry = NULL; + struct kfd_bo_va_list *bo_va_entry_aql = NULL; + struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; + unsigned long bo_size; + bool is_invalid_userptr; + + adev = get_amdgpu_device(kgd); + + /* Make sure restore is not running concurrently. Since we + * don't map invalid userptr BOs, we rely on the next restore + * worker to do the mapping + */ + mutex_lock(&mem->process_info->lock); + + /* Lock mmap-sem. If we find an invalid userptr BO, we can be + * sure that the MMU notifier is no longer running + * concurrently and the queues are actually stopped + */ + down_read(¤t->mm->mmap_sem); + is_invalid_userptr = atomic_read(&mem->invalid); + up_read(¤t->mm->mmap_sem); + + mutex_lock(&mem->lock); + + bo = mem->bo; + + if (!bo) { + pr_err("Invalid BO when mapping memory to GPU\n"); + return -EINVAL; + } + + domain = mem->domain; + bo_size = bo->tbo.mem.size; + + pr_debug("Map VA 0x%llx - 0x%llx to vm %p domain %s\n", + mem->va, + mem->va + bo_size * (1 + mem->aql_queue), + vm, domain_string(domain)); + + ret = reserve_bo_and_vm(mem, vm, &ctx); + if (unlikely(ret != 0)) + goto bo_reserve_failed; + + /* Userptr can be marked as "not invalid", but not actually be + * validated yet (still in the system domain). In that case + * the queues are still stopped and we can leave mapping for + * the next restore worker + */ + if (bo->tbo.mem.mem_type == TTM_PL_SYSTEM) + is_invalid_userptr = true; + + if (check_if_add_bo_to_vm((struct amdgpu_vm *)vm, mem)) { + ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm, false, + &bo_va_entry); + if (ret != 0) + goto add_bo_to_vm_failed; + if (mem->aql_queue) { + ret = add_bo_to_vm(adev, mem, (struct amdgpu_vm *)vm, + true, &bo_va_entry_aql); + if (ret != 0) + goto add_bo_to_vm_failed_aql; + } + } + + if (mem->mapped_to_gpu_memory == 0 && + !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { + /* Validate BO only once. The eviction fence gets added to BO + * the first time it is mapped. Validate will wait for all + * background evictions to complete. 
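+ * Userptr BOs never reach this path (see the usermm check above); they
+ * are validated by the userptr restore worker instead.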
+ */ + ret = amdgpu_amdkfd_bo_validate(bo, domain, true); + if (ret) { + pr_debug("Validate failed\n"); + goto map_bo_to_gpuvm_failed; + } + } + + list_for_each_entry(entry, &mem->bo_va_list, bo_list) { + if (entry->bo_va->base.vm == vm && !entry->is_mapped) { + pr_debug("\t map VA 0x%llx - 0x%llx in entry %p\n", + entry->va, entry->va + bo_size, + entry); + + ret = map_bo_to_gpuvm(adev, entry, ctx.sync, + is_invalid_userptr); + if (ret != 0) { + pr_err("Failed to map radeon bo to gpuvm\n"); + goto map_bo_to_gpuvm_failed; + } + entry->is_mapped = true; + mem->mapped_to_gpu_memory++; + pr_debug("\t INC mapping count %d\n", + mem->mapped_to_gpu_memory); + } + } + + if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) == NULL) + amdgpu_bo_fence(bo, + &kfd_vm->process_info->eviction_fence->base, + true); + ret = unreserve_bo_and_vms(&ctx, false, false); + + mutex_unlock(&mem->process_info->lock); + mutex_unlock(&mem->lock); + return ret; + +map_bo_to_gpuvm_failed: + if (bo_va_entry_aql) + remove_bo_from_vm(adev, bo_va_entry_aql, bo_size); +add_bo_to_vm_failed_aql: + if (bo_va_entry) + remove_bo_from_vm(adev, bo_va_entry, bo_size); +add_bo_to_vm_failed: + unreserve_bo_and_vms(&ctx, false, false); +bo_reserve_failed: + mutex_unlock(&mem->process_info->lock); + mutex_unlock(&mem->lock); + return ret; +} + +static u64 get_vm_pd_gpu_offset(void *vm) +{ + struct amdgpu_vm *avm = (struct amdgpu_vm *) vm; + struct amdgpu_device *adev = + amdgpu_ttm_adev(avm->root.base.bo->tbo.bdev); + u64 offset; + + BUG_ON(avm == NULL); + + amdgpu_bo_reserve(avm->root.base.bo, false); + + offset = amdgpu_bo_gpu_offset(avm->root.base.bo); + + amdgpu_bo_unreserve(avm->root.base.bo); + + /* On some ASICs the FB doesn't start at 0. Adjust FB offset + * to an actual MC address. + */ + if (adev->gart.gart_funcs->get_vm_pde) + offset = amdgpu_gart_get_vm_pde(adev, offset); + + return offset; +} + +int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm, + void **process_info, + struct dma_fence **ef) +{ + int ret; + struct amdkfd_vm *new_vm; + struct amdkfd_process_info *info; + struct amdgpu_device *adev = get_amdgpu_device(kgd); + + new_vm = kzalloc(sizeof(*new_vm), GFP_KERNEL); + if (new_vm == NULL) + return -ENOMEM; + + /* Initialize the VM context, allocate the page directory and zero it */ + ret = amdgpu_vm_init(adev, &new_vm->base, AMDGPU_VM_CONTEXT_COMPUTE); + if (ret != 0) { + pr_err("Failed init vm ret %d\n", ret); + /* Undo everything related to the new VM context */ + goto vm_init_fail; + } + new_vm->adev = adev; + + if (!*process_info) { + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + pr_err("Failed to create amdkfd_process_info"); + ret = -ENOMEM; + goto alloc_process_info_fail; + } + + mutex_init(&info->lock); + INIT_LIST_HEAD(&info->vm_list_head); + INIT_LIST_HEAD(&info->kfd_bo_list); + INIT_LIST_HEAD(&info->userptr_valid_list); + INIT_LIST_HEAD(&info->userptr_inval_list); + + info->eviction_fence = + amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), + current->mm); + if (info->eviction_fence == NULL) { + pr_err("Failed to create eviction fence\n"); + goto create_evict_fence_fail; + } + + info->pid = get_task_pid(current->group_leader, + PIDTYPE_PID); + atomic_set(&info->evicted_bos, 0); + INIT_DELAYED_WORK(&info->work, + amdgpu_amdkfd_restore_userptr_worker); + + *process_info = info; + *ef = dma_fence_get(&info->eviction_fence->base); + } + + new_vm->process_info = *process_info; + + mutex_lock(&new_vm->process_info->lock); + list_add_tail(&new_vm->vm_list_node, + 
&(new_vm->process_info->vm_list_head)); + new_vm->process_info->n_vms++; + mutex_unlock(&new_vm->process_info->lock); + + *vm = (void *) new_vm; + + pr_debug("Created process vm %p\n", *vm); + + return ret; + +create_evict_fence_fail: + kfree(info); +alloc_process_info_fail: + amdgpu_vm_fini(adev, &new_vm->base); +vm_init_fail: + kfree(new_vm); + return ret; + +} + +void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm) +{ + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; + struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *) vm; + struct amdgpu_vm *avm = &kfd_vm->base; + struct amdgpu_bo *pd; + struct amdkfd_process_info *process_info; + + if (WARN_ON(!kgd || !vm)) + return; + + pr_debug("Destroying process vm %p\n", vm); + /* Release eviction fence from PD */ + pd = avm->root.base.bo; + amdgpu_bo_reserve(pd, false); + amdgpu_bo_fence(pd, NULL, false); + amdgpu_bo_unreserve(pd); + + process_info = kfd_vm->process_info; + + mutex_lock(&process_info->lock); + process_info->n_vms--; + list_del(&kfd_vm->vm_list_node); + mutex_unlock(&process_info->lock); + + /* Release per-process resources */ + if (!process_info->n_vms) { + WARN_ON(!list_empty(&process_info->kfd_bo_list)); + WARN_ON(!list_empty(&process_info->userptr_valid_list)); + WARN_ON(!list_empty(&process_info->userptr_inval_list)); + + dma_fence_put(&process_info->eviction_fence->base); + cancel_delayed_work_sync(&process_info->work); + put_pid(process_info->pid); + kfree(process_info); + } + + /* Release the VM context */ + amdgpu_vm_fini(adev, avm); + kfree(vm); +} + +uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm) +{ + return get_vm_pd_gpu_offset(vm) >> AMDGPU_GPU_PAGE_SHIFT; +} + +int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, + struct kfd_vm_fault_info *mem) +{ + struct amdgpu_device *adev; + + adev = (struct amdgpu_device *) kgd; + if (atomic_read(&adev->mc.vm_fault_info_updated) == 1) { + *mem = *adev->mc.vm_fault_info; + mb(); + atomic_set(&adev->mc.vm_fault_info_updated, 0); + } + return 0; +} + +static bool is_mem_on_local_device(struct kgd_dev *kgd, + struct list_head *bo_va_list, void *vm) +{ + struct kfd_bo_va_list *entry; + + list_for_each_entry(entry, bo_va_list, bo_list) { + if (entry->kgd_dev == kgd && entry->bo_va->base.vm == vm) + return true; + } + + return false; +} + +int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( + struct kgd_dev *kgd, struct kgd_mem *mem, void *vm) +{ + struct kfd_bo_va_list *entry; + struct amdgpu_device *adev; + unsigned int mapped_before; + int ret = 0; + struct bo_vm_reservation_context ctx; + struct amdkfd_process_info *process_info; + unsigned long bo_size; + + adev = (struct amdgpu_device *) kgd; + process_info = ((struct amdkfd_vm *)vm)->process_info; + + bo_size = mem->bo->tbo.mem.size; + + mutex_lock(&mem->lock); + + /* + * Make sure that this BO mapped on KGD before unmappping it + */ + if (!is_mem_on_local_device(kgd, &mem->bo_va_list, vm)) { + ret = -EINVAL; + goto out; + } + + if (mem->mapped_to_gpu_memory == 0) { + pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n", + mem->va, bo_size, vm); + ret = -EINVAL; + goto out; + } + mapped_before = mem->mapped_to_gpu_memory; + + ret = reserve_bo_and_cond_vms(mem, vm, VA_MAPPED, &ctx); + if (unlikely(ret != 0)) + goto out; + + pr_debug("Unmap VA 0x%llx - 0x%llx from vm %p\n", + mem->va, + mem->va + bo_size * (1 + mem->aql_queue), + vm); + + list_for_each_entry(entry, &mem->bo_va_list, bo_list) { + if (entry->bo_va->base.vm == vm && entry->is_mapped) { + pr_debug("\t unmap VA 
0x%llx - 0x%llx from entry %p\n", + entry->va, + entry->va + bo_size, + entry); + + ret = unmap_bo_from_gpuvm(adev, entry, ctx.sync); + if (ret == 0) { + entry->is_mapped = false; + } else { + pr_err("failed to unmap VA 0x%llx\n", + mem->va); + goto unreserve_out; + } + + mem->mapped_to_gpu_memory--; + pr_debug("\t DEC mapping count %d\n", + mem->mapped_to_gpu_memory); + } + } + + /* If BO is unmapped from all VMs, unfence it. It can be evicted if + * required. + */ + if (mem->mapped_to_gpu_memory == 0 && + !amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) + amdgpu_amdkfd_remove_eviction_fence(mem->bo, + process_info->eviction_fence, + NULL, NULL); + + if (mapped_before == mem->mapped_to_gpu_memory) { + pr_debug("BO VA 0x%llx size 0x%lx is not mapped to vm %p\n", + mem->va, bo_size, vm); + ret = -EINVAL; + } + +unreserve_out: + unreserve_bo_and_vms(&ctx, false, false); +out: + mutex_unlock(&mem->lock); + return ret; +} + +int amdgpu_amdkfd_gpuvm_mmap_bo(struct kgd_dev *kgd, struct vm_area_struct *vma) +{ + struct amdgpu_device *adev; + + adev = get_amdgpu_device(kgd); + if (!adev) { + pr_err("Could not get amdgpu device in %s\n", __func__); + return -ENODEV; + } + + return amdgpu_bo_mmap(NULL, vma, &adev->mman.bdev); +} + +int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, + struct kgd_mem *mem, void **kptr) +{ + int ret; + struct amdgpu_bo *bo = mem->bo; + + if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { + pr_err("userptr can't be mapped to kernel\n"); + return -EINVAL; + } + + /* delete kgd_mem from kfd_bo_list to avoid re-validating + * this BO in BO's restoring after eviction. + */ + mutex_lock(&mem->process_info->lock); + + list_del_init(&mem->validate_list.head); + + ret = __map_bo_to_kernel(bo, AMDGPU_GEM_DOMAIN_GTT, kptr); + if (!ret) + mem->kptr = *kptr; + + mutex_unlock(&mem->process_info->lock); + + return ret; +} + +static int pin_bo_wo_map(struct kgd_mem *mem) +{ + struct amdgpu_bo *bo = mem->bo; + int ret = 0; + + ret = amdgpu_bo_reserve(bo, false); + if (unlikely(ret != 0)) + return ret; + + ret = amdgpu_bo_pin(bo, mem->domain, NULL); + amdgpu_bo_unreserve(bo); + + return ret; +} + +static void unpin_bo_wo_map(struct kgd_mem *mem) +{ + struct amdgpu_bo *bo = mem->bo; + int ret = 0; + + ret = amdgpu_bo_reserve(bo, false); + if (unlikely(ret != 0)) + return; + + amdgpu_bo_unpin(bo); + amdgpu_bo_unreserve(bo); +} + +#define AMD_GPU_PAGE_SHIFT PAGE_SHIFT +#define AMD_GPU_PAGE_SIZE (_AC(1, UL) << AMD_GPU_PAGE_SHIFT) + +static int get_sg_table(struct amdgpu_device *adev, + struct kgd_mem *mem, uint64_t offset, + uint64_t size, struct sg_table **ret_sg) +{ + struct amdgpu_bo *bo = mem->bo; + struct sg_table *sg = NULL; + unsigned long bus_addr; + unsigned int chunks; + unsigned int i; + struct scatterlist *s; + uint64_t offset_in_page; + unsigned int page_size; + int ret; + + sg = kmalloc(sizeof(*sg), GFP_KERNEL); + if (!sg) { + ret = -ENOMEM; + goto out; + } + + if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) + page_size = AMD_GPU_PAGE_SIZE; + else + page_size = PAGE_SIZE; + + + offset_in_page = offset & (page_size - 1); + chunks = (size + offset_in_page + page_size - 1) + / page_size; + + ret = sg_alloc_table(sg, chunks, GFP_KERNEL); + if (unlikely(ret)) + goto out; + + if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) { + bus_addr = bo->tbo.offset + adev->mc.aper_base + offset; + + for_each_sg(sg->sgl, s, sg->orig_nents, i) { + uint64_t chunk_size, length; + + chunk_size = page_size - offset_in_page; + length = min(size, chunk_size); + + sg_set_page(s, NULL, 
length, offset_in_page); + s->dma_address = bus_addr; + s->dma_length = length; + + size -= length; + offset_in_page = 0; + bus_addr += length; + } + } else { + struct page **pages; + unsigned int cur_page; + + pages = bo->tbo.ttm->pages; + + cur_page = offset / page_size; + for_each_sg(sg->sgl, s, sg->orig_nents, i) { + uint64_t chunk_size, length; + + chunk_size = page_size - offset_in_page; + length = min(size, chunk_size); + + sg_set_page(s, pages[cur_page], length, offset_in_page); + s->dma_address = page_to_phys(pages[cur_page]); + s->dma_length = length; + + size -= length; + offset_in_page = 0; + cur_page++; + } + } + + *ret_sg = sg; + return 0; +out: + kfree(sg); + *ret_sg = NULL; + return ret; +} + +int amdgpu_amdkfd_gpuvm_pin_get_sg_table(struct kgd_dev *kgd, + struct kgd_mem *mem, uint64_t offset, + uint64_t size, struct sg_table **ret_sg) +{ + int ret; + struct amdgpu_device *adev; + + ret = pin_bo_wo_map(mem); + if (unlikely(ret != 0)) + return ret; + + adev = get_amdgpu_device(kgd); + + ret = get_sg_table(adev, mem, offset, size, ret_sg); + if (ret) + unpin_bo_wo_map(mem); + + return ret; +} + +void amdgpu_amdkfd_gpuvm_unpin_put_sg_table( + struct kgd_mem *mem, struct sg_table *sg) +{ + sg_free_table(sg); + kfree(sg); + + unpin_bo_wo_map(mem); +} + +int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, + struct dma_buf *dma_buf, + uint64_t va, void *vm, + struct kgd_mem **mem, uint64_t *size, + uint64_t *mmap_offset) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + struct drm_gem_object *obj; + struct amdgpu_bo *bo; + struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm; + + if (dma_buf->ops != &drm_gem_prime_dmabuf_ops) + /* Can't handle non-graphics buffers */ + return -EINVAL; + + obj = dma_buf->priv; + if (obj->dev->dev_private != adev) + /* Can't handle buffers from other devices */ + return -EINVAL; + + bo = gem_to_amdgpu_bo(obj); + if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | + AMDGPU_GEM_DOMAIN_GTT | + AMDGPU_GEM_DOMAIN_DGMA))) + /* Only VRAM and GTT BOs are supported */ + return -EINVAL; + + *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); + if (*mem == NULL) + return -ENOMEM; + + if (size) + *size = amdgpu_bo_size(bo); + + if (mmap_offset) + *mmap_offset = amdgpu_bo_mmap_offset(bo); + + INIT_LIST_HEAD(&(*mem)->bo_va_list); + mutex_init(&(*mem)->lock); + (*mem)->mapping_flags = + AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | + AMDGPU_VM_PAGE_EXECUTABLE | AMDGPU_VM_MTYPE_NC; + + (*mem)->bo = amdgpu_bo_ref(bo); + (*mem)->va = va; + if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) + (*mem)->domain = AMDGPU_GEM_DOMAIN_VRAM; + else if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_GTT) + (*mem)->domain = AMDGPU_GEM_DOMAIN_GTT; + else + (*mem)->domain = AMDGPU_GEM_DOMAIN_DGMA; + (*mem)->mapped_to_gpu_memory = 0; + (*mem)->process_info = kfd_vm->process_info; + add_kgd_mem_to_kfd_bo_list(*mem, kfd_vm->process_info, false); + amdgpu_sync_create(&(*mem)->sync); + + return 0; +} + +int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_dev *kgd, void *vm, + struct kgd_mem *mem, + struct dma_buf **dmabuf) +{ + struct amdgpu_device *adev = NULL; + struct amdgpu_bo *bo = NULL; + struct drm_gem_object *gobj = NULL; + + if (!dmabuf || !kgd || !vm || !mem) + return -EINVAL; + + adev = get_amdgpu_device(kgd); + bo = mem->bo; + + gobj = amdgpu_gem_prime_foreign_bo(adev, bo); + if (gobj == NULL) { + pr_err("Export BO failed. 
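A note on the chunking arithmetic in get_sg_table() above: the first scatterlist entry is shortened by the offset within its page, later entries start page aligned, and the last carries the remainder. The standalone userspace sketch below reproduces only that arithmetic with an assumed 4 KiB page size and made-up demo values; it is not part of the patch.

/* Illustrative sketch only, not part of this patch. Reproduces the
 * chunk-splitting arithmetic used by get_sg_table(): the first chunk is
 * shortened by the offset within its page, every later chunk starts
 * page-aligned, and the last chunk carries whatever is left.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE_EXAMPLE 4096ULL   /* assumed page size for the demo */

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
        uint64_t offset = 5000, size = 10000;   /* arbitrary demo values */
        uint64_t offset_in_page = offset & (PAGE_SIZE_EXAMPLE - 1);
        uint64_t chunks = (size + offset_in_page + PAGE_SIZE_EXAMPLE - 1) /
                          PAGE_SIZE_EXAMPLE;
        uint64_t remaining = size, cur = offset, i;

        printf("%llu chunks for offset %llu size %llu\n",
               (unsigned long long)chunks, (unsigned long long)offset,
               (unsigned long long)size);

        for (i = 0; i < chunks; i++) {
                /* Same per-entry length rule as the loop in get_sg_table() */
                uint64_t chunk_size = PAGE_SIZE_EXAMPLE - offset_in_page;
                uint64_t length = min_u64(remaining, chunk_size);

                printf("entry %llu: start %llu len %llu\n",
                       (unsigned long long)i, (unsigned long long)cur,
                       (unsigned long long)length);

                remaining -= length;
                cur += length;
                offset_in_page = 0;     /* only the first chunk is unaligned */
        }
        return 0;
}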
Unable to find/create GEM object\n"); + return -EINVAL; + } + + *dmabuf = amdgpu_gem_prime_export(adev->ddev, gobj, 0); + return 0; +} + +static int process_validate_vms(struct amdkfd_process_info *process_info) +{ + struct amdkfd_vm *peer_vm; + int ret; + + list_for_each_entry(peer_vm, &process_info->vm_list_head, + vm_list_node) { + ret = vm_validate_pt_pd_bos(&peer_vm->base); + if (ret) + return ret; + } + + return 0; +} + +/* Evict a userptr BO by stopping the queues if necessary + * + * Runs in MMU notifier, may be in RECLAIM_FS context. This means it + * cannot do any memory allocations, and cannot take any locks that + * are held elsewhere while allocating memory. Therefore this is as + * simple as possible, using atomic counters. + * + * It doesn't do anything to the BO itself. The real work happens in + * restore, where we get updated page addresses. This function only + * ensures that GPU access to the BO is stopped. + */ +int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, + struct mm_struct *mm) +{ + struct amdkfd_process_info *process_info = mem->process_info; + int invalid, evicted_bos; + int r = 0; + + invalid = atomic_inc_return(&mem->invalid); + evicted_bos = atomic_inc_return(&process_info->evicted_bos); + if (evicted_bos == 1) { + /* First eviction, stop the queues */ + r = kgd2kfd->quiesce_mm(NULL, mm); + if (r != 0) + pr_err("Failed to quiesce KFD\n"); + schedule_delayed_work(&process_info->work, 1); + } + + return r; +} + +/* Update invalid userptr BOs + * + * Moves invalidated (evicted) userptr BOs from userptr_valid_list to + * userptr_inval_list and updates user pages for all BOs that have + * been invalidated since their last update. + */ +static int update_invalid_user_pages(struct amdkfd_process_info *process_info, + struct mm_struct *mm) +{ + struct kgd_mem *mem, *tmp_mem; + struct amdgpu_bo *bo; + int invalid, ret; + + /* Move all invalidated BOs to the userptr_inval_list and + * release their user pages by migration to the CPU domain + */ + list_for_each_entry_safe(mem, tmp_mem, + &process_info->userptr_valid_list, + validate_list.head) { + if (!atomic_read(&mem->invalid)) + continue; /* BO is still valid */ + + bo = mem->bo; + + if (amdgpu_bo_reserve(bo, true)) + return -EAGAIN; + amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); + ret = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); + amdgpu_bo_unreserve(bo); + if (ret) { + pr_err("%s: Failed to invalidate userptr BO\n", + __func__); + return -EAGAIN; + } + + list_move_tail(&mem->validate_list.head, + &process_info->userptr_inval_list); + } + + if (list_empty(&process_info->userptr_inval_list)) + return 0; /* All evicted userptr BOs were freed */ + + /* Go through userptr_inval_list and update any invalid user_pages */ + list_for_each_entry(mem, &process_info->userptr_inval_list, + validate_list.head) { + invalid = atomic_read(&mem->invalid); + if (!invalid) + /* BO hasn't been invalidated since the last + * revalidation attempt. Keep its BO list. 
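amdgpu_amdkfd_evict_userptr() above uses a shared eviction counter so that only the 0 to 1 transition stops the queues; later concurrent evictions are merely counted. A minimal C11 sketch of that pattern follows; the names and the quiesce stand-in are illustrative, not the kernel API.

/* Illustrative sketch only, not part of this patch. Demonstrates the
 * "first eviction quiesces, later evictions just count" pattern used by
 * amdgpu_amdkfd_evict_userptr(): an increment that returns 1 identifies
 * the first eviction in a burst.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int evicted_bos;          /* mirrors process_info->evicted_bos */

static void quiesce_queues(void)        /* stand-in for the quiesce callback */
{
        printf("quiesce user queues (first eviction)\n");
}

static void evict_one_bo(void)
{
        /* fetch_add returns the old value, so old+1 is the inc_return value */
        int count = atomic_fetch_add(&evicted_bos, 1) + 1;

        if (count == 1)
                quiesce_queues();
        else
                printf("eviction #%d, queues already stopped\n", count);
}

int main(void)
{
        evict_one_bo();         /* stops the queues */
        evict_one_bo();         /* only counted */
        evict_one_bo();         /* only counted */
        return 0;
}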
+ */ + continue; + + bo = mem->bo; + + if (!mem->user_pages) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + mem->user_pages = + drm_calloc_large(bo->tbo.ttm->num_pages, + sizeof(struct page *)); +#else + mem->user_pages = + kvmalloc_array(bo->tbo.ttm->num_pages, + sizeof(struct page *), + GFP_KERNEL | __GFP_ZERO); +#endif + if (!mem->user_pages) { + pr_err("%s: Failed to allocate pages array\n", + __func__); + return -ENOMEM; + } + } else if (mem->user_pages[0]) { + release_pages(mem->user_pages, + bo->tbo.ttm->num_pages, 0); + } + + /* Get updated user pages */ + ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, + mem->user_pages); + if (ret) { + mem->user_pages[0] = NULL; + pr_info("%s: Failed to get user pages: %d\n", + __func__, ret); + /* Pretend it succeeded. It will fail later + * with a VM fault if the GPU tries to access + * it. Better than hanging indefinitely with + * stalled user mode queues. + */ + } + + /* Mark the BO as valid unless it was invalidated + * again concurrently + */ + if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid) + return -EAGAIN; + } + return 0; +} + +/* Validate invalid userptr BOs + * + * Validates BOs on the userptr_inval_list, and moves them back to the + * userptr_valid_list. Also updates GPUVM page tables with new page + * addresses and waits for the page table updates to complete. + */ +static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) +{ + struct amdgpu_bo_list_entry *pd_bo_list_entries; + struct list_head resv_list, duplicates; + struct ww_acquire_ctx ticket; + struct amdgpu_sync sync; + + struct amdkfd_vm *peer_vm; + struct kgd_mem *mem, *tmp_mem; + struct amdgpu_bo *bo; + int i, ret; + + pd_bo_list_entries = kcalloc(process_info->n_vms, + sizeof(struct amdgpu_bo_list_entry), + GFP_KERNEL); + if (!pd_bo_list_entries) { + pr_err("%s: Failed to allocate PD BO list entries\n", __func__); + return -ENOMEM; + } + + INIT_LIST_HEAD(&resv_list); + INIT_LIST_HEAD(&duplicates); + + /* Get all the page directory BOs that need to be reserved */ + i = 0; + list_for_each_entry(peer_vm, &process_info->vm_list_head, + vm_list_node) + amdgpu_vm_get_pd_bo(&peer_vm->base, &resv_list, + &pd_bo_list_entries[i++]); + /* Add the userptr_inval_list entries to resv_list */ + list_for_each_entry(mem, &process_info->userptr_inval_list, + validate_list.head) { + list_add_tail(&mem->resv_list.head, &resv_list); + mem->resv_list.bo = mem->validate_list.bo; + mem->resv_list.shared = mem->validate_list.shared; + } + + /* Reserve all BOs and page tables for validation */ + ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates); + WARN(!list_empty(&duplicates), "Duplicates should be empty"); + if (ret) + goto out; + + amdgpu_sync_create(&sync); + + /* Avoid triggering eviction fences when unmapping invalid + * userptr BOs (waits for all fences, doesn't use + * FENCE_OWNER_VM) + */ + list_for_each_entry(peer_vm, &process_info->vm_list_head, + vm_list_node) + amdgpu_amdkfd_remove_eviction_fence(peer_vm->base.root.base.bo, + process_info->eviction_fence, + NULL, NULL); + + ret = process_validate_vms(process_info); + if (ret) + goto unreserve_out; + + /* Validate BOs and update GPUVM page tables */ + list_for_each_entry_safe(mem, tmp_mem, + &process_info->userptr_inval_list, + validate_list.head) { + struct kfd_bo_va_list *bo_va_entry; + + bo = mem->bo; + + /* Copy pages array and validate the BO if we got user pages */ + if (mem->user_pages[0]) { + amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, + mem->user_pages); + 
amdgpu_ttm_placement_from_domain(bo, mem->domain); + ret = ttm_bo_validate(&bo->tbo, &bo->placement, + false, false); + if (ret) { + pr_err("%s: failed to validate BO\n", __func__); + goto unreserve_out; + } + } + + /* Validate succeeded, now the BO owns the pages, free + * our copy of the pointer array. Put this BO back on + * the userptr_valid_list. If we need to revalidate + * it, we need to start from scratch. + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + drm_free_large(mem->user_pages); +#else + kvfree(mem->user_pages); +#endif + mem->user_pages = NULL; + list_move_tail(&mem->validate_list.head, + &process_info->userptr_valid_list); + + /* Update mapping. If the BO was not validated + * (because we couldn't get user pages), this will + * clear the page table entries, which will result in + * VM faults if the GPU tries to access the invalid + * memory. + */ + list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) { + if (!bo_va_entry->is_mapped) + continue; + + ret = update_gpuvm_pte((struct amdgpu_device *) + bo_va_entry->kgd_dev, + bo_va_entry, &sync); + if (ret) { + pr_err("%s: update PTE failed\n", __func__); + /* make sure this gets validated again */ + atomic_inc(&mem->invalid); + goto unreserve_out; + } + } + } +unreserve_out: + list_for_each_entry(peer_vm, &process_info->vm_list_head, + vm_list_node) + amdgpu_bo_fence(peer_vm->base.root.base.bo, + &process_info->eviction_fence->base, true); + ttm_eu_backoff_reservation(&ticket, &resv_list); + amdgpu_sync_wait(&sync, false); + amdgpu_sync_free(&sync); +out: + kfree(pd_bo_list_entries); + + return ret; +} + +/* Worker callback to restore evicted userptr BOs + * + * Tries to update and validate all userptr BOs. If successful and no + * concurrent evictions happened, the queues are restarted. Otherwise, + * reschedule for another attempt later. + */ +static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct amdkfd_process_info *process_info = + container_of(dwork, struct amdkfd_process_info, work); + struct task_struct *usertask; + struct mm_struct *mm; + int evicted_bos; + + evicted_bos = atomic_read(&process_info->evicted_bos); + if (!evicted_bos) + return; + + /* Reference task and mm in case of concurrent process termination */ + usertask = get_pid_task(process_info->pid, PIDTYPE_PID); + if (!usertask) + return; + mm = get_task_mm(usertask); + if (!mm) { + put_task_struct(usertask); + return; + } + + mutex_lock(&process_info->lock); + + if (update_invalid_user_pages(process_info, mm)) + goto unlock_out; + /* userptr_inval_list can be empty if all evicted userptr BOs + * have been freed. In that case there is nothing to validate + * and we can just restart the queues. + */ + if (!list_empty(&process_info->userptr_inval_list)) { + if (atomic_read(&process_info->evicted_bos) != evicted_bos) + goto unlock_out; /* Concurrent eviction, try again */ + + if (validate_invalid_user_pages(process_info)) + goto unlock_out; + } + /* Final check for concurrent evicton and atomic update. If + * another eviction happens after successful update, it will + * be a first eviction that calls quiesce_mm. The eviction + * reference counting inside KFD will handle this case. + */ + if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) != + evicted_bos) + goto unlock_out; + evicted_bos = 0; + if (kgd2kfd->resume_mm(NULL, mm)) { + pr_err("%s: Failed to resume KFD\n", __func__); + /* No recovery from this failure. Probably the CP is + * hanging. 
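The restore worker above resumes the queues only if no eviction raced with it, which it detects by atomically swapping the counter back to zero against the snapshot taken at the start of the attempt. A hedged userspace sketch of that final check, with illustrative names rather than the kernel interfaces:

/* Illustrative sketch only, not part of this patch. Shows the
 * compare-and-swap check the restore worker uses: reset the eviction
 * counter to 0 only if it still holds the snapshot taken at the start
 * of the restore attempt; otherwise a concurrent eviction happened and
 * the queues must not be resumed yet.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int evicted_bos;

/* Returns true if the queues may be resumed. */
static bool try_finish_restore(int snapshot)
{
        int expected = snapshot;

        /* Same effect as atomic_cmpxchg(&evicted_bos, snapshot, 0) == snapshot */
        return atomic_compare_exchange_strong(&evicted_bos, &expected, 0);
}

int main(void)
{
        int snapshot;

        atomic_store(&evicted_bos, 2);          /* two evictions pending */
        snapshot = atomic_load(&evicted_bos);   /* worker samples the count */

        atomic_fetch_add(&evicted_bos, 1);      /* concurrent eviction races in */
        printf("restore #1 may resume: %d\n", try_finish_restore(snapshot));

        snapshot = atomic_load(&evicted_bos);   /* retry with a fresh snapshot */
        printf("restore #2 may resume: %d\n", try_finish_restore(snapshot));
        return 0;
}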
No point trying again. + */ + } +unlock_out: + mutex_unlock(&process_info->lock); + mmput(mm); + put_task_struct(usertask); + + /* If validation failed, reschedule another attempt */ + if (evicted_bos) + schedule_delayed_work(&process_info->work, 1); +} + +/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given + * KFD process identified by process_info + * + * @process_info: amdkfd_process_info of the KFD process + * + * After memory eviction, restore thread calls this function. The function + * should be called when the Process is still valid. BO restore involves - + * + * 1. Release old eviction fence and create new one + * 2. Get two copies of PD BO list from all the VMs. Keep one copy as pd_list. + * 3 Use the second PD list and kfd_bo_list to create a list (ctx.list) of + * BOs that need to be reserved. + * 4. Reserve all the BOs + * 5. Validate of PD and PT BOs. + * 6. Validate all KFD BOs using kfd_bo_list and Map them and add new fence + * 7. Add fence to all PD and PT BOs. + * 8. Unreserve all BOs + */ + +int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef) +{ + struct amdgpu_bo_list_entry *pd_bo_list; + struct amdkfd_process_info *process_info = info; + struct amdkfd_vm *peer_vm; + struct kgd_mem *mem; + struct bo_vm_reservation_context ctx; + struct amdgpu_amdkfd_fence *new_fence; + int ret = 0, i; + struct list_head duplicate_save; + struct amdgpu_sync sync_obj; + + INIT_LIST_HEAD(&duplicate_save); + INIT_LIST_HEAD(&ctx.list); + INIT_LIST_HEAD(&ctx.duplicates); + + pd_bo_list = kcalloc(process_info->n_vms, + sizeof(struct amdgpu_bo_list_entry), + GFP_KERNEL); + if (pd_bo_list == NULL) + return -ENOMEM; + + i = 0; + mutex_lock(&process_info->lock); + list_for_each_entry(peer_vm, &process_info->vm_list_head, + vm_list_node) + amdgpu_vm_get_pd_bo(&peer_vm->base, &ctx.list, + &pd_bo_list[i++]); + + /* Reserve all BOs and page tables/directory. Add all BOs from + * kfd_bo_list to ctx.list + */ + list_for_each_entry(mem, &process_info->kfd_bo_list, + validate_list.head) { + + list_add_tail(&mem->resv_list.head, &ctx.list); + mem->resv_list.bo = mem->validate_list.bo; + mem->resv_list.shared = mem->validate_list.shared; + } + + ret = ttm_eu_reserve_buffers(&ctx.ticket, &ctx.list, + false, &duplicate_save); + if (ret) { + pr_debug("Memory eviction: TTM Reserve Failed. Try again\n"); + goto ttm_reserve_fail; + } + + amdgpu_sync_create(&sync_obj); + ctx.sync = &sync_obj; + + /* Validate PDs and PTs */ + ret = process_validate_vms(process_info); + if (ret) + goto validate_map_fail; + + /* Wait for PD/PTs validate to finish */ + /* FIXME: I think this isn't needed */ + list_for_each_entry(peer_vm, &process_info->vm_list_head, + vm_list_node) { + struct amdgpu_bo *bo = peer_vm->base.root.base.bo; + + ttm_bo_wait(&bo->tbo, false, false); + } + + /* Validate BOs and map them to GPUVM (update VM page tables). */ + list_for_each_entry(mem, &process_info->kfd_bo_list, + validate_list.head) { + + struct amdgpu_bo *bo = mem->bo; + uint32_t domain = mem->domain; + struct kfd_bo_va_list *bo_va_entry; + + ret = amdgpu_amdkfd_bo_validate(bo, domain, false); + if (ret) { + pr_debug("Memory eviction: Validate BOs failed. Try again\n"); + goto validate_map_fail; + } + + list_for_each_entry(bo_va_entry, &mem->bo_va_list, + bo_list) { + ret = update_gpuvm_pte((struct amdgpu_device *) + bo_va_entry->kgd_dev, + bo_va_entry, + ctx.sync); + if (ret) { + pr_debug("Memory eviction: update PTE failed. 
Try again\n"); + goto validate_map_fail; + } + } + } + + amdgpu_sync_wait(ctx.sync, false); + + /* Release old eviction fence and create new one, because fence only + * goes from unsignaled to signaled, fence cannot be reused. + * Use context and mm from the old fence. + */ + new_fence = amdgpu_amdkfd_fence_create( + process_info->eviction_fence->base.context, + process_info->eviction_fence->mm); + if (!new_fence) { + pr_err("Failed to create eviction fence\n"); + ret = -ENOMEM; + goto validate_map_fail; + } + dma_fence_put(&process_info->eviction_fence->base); + process_info->eviction_fence = new_fence; + *ef = dma_fence_get(&new_fence->base); + + /* Wait for validate to finish and attach new eviction fence */ + list_for_each_entry(mem, &process_info->kfd_bo_list, + validate_list.head) + ttm_bo_wait(&mem->bo->tbo, false, false); + list_for_each_entry(mem, &process_info->kfd_bo_list, + validate_list.head) + amdgpu_bo_fence(mem->bo, + &process_info->eviction_fence->base, true); + + /* Attach eviction fence to PD / PT BOs */ + list_for_each_entry(peer_vm, &process_info->vm_list_head, + vm_list_node) { + struct amdgpu_bo *bo = peer_vm->base.root.base.bo; + + amdgpu_bo_fence(bo, &process_info->eviction_fence->base, true); + } +validate_map_fail: + ttm_eu_backoff_reservation(&ctx.ticket, &ctx.list); + amdgpu_sync_free(&sync_obj); +ttm_reserve_fail: + mutex_unlock(&process_info->lock); +evict_fence_fail: + kfree(pd_bo_list); + return ret; +} + +int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem, + uint64_t src_offset, struct kgd_mem *dst_mem, + uint64_t dst_offset, uint64_t size, + struct dma_fence **f, uint64_t *actual_size) +{ + struct amdgpu_device *adev = NULL; + struct ttm_mem_reg *src = NULL, *dst = NULL; + struct ttm_buffer_object *src_ttm_bo, *dst_ttm_bo; + struct drm_mm_node *src_mm, *dst_mm; + struct amdgpu_ring *ring; + struct ww_acquire_ctx ticket; + struct list_head list; + struct ttm_validate_buffer resv_list[2]; + uint64_t src_start, dst_start; + uint64_t src_left, dst_left, cur_copy_size, total_copy_size = 0; + struct dma_fence *fence = NULL; + int r; + + if (!kgd || !src_mem || !dst_mem) + return -EINVAL; + + if (actual_size) + *actual_size = 0; + + adev = get_amdgpu_device(kgd); + src_ttm_bo = &src_mem->bo->tbo; + dst_ttm_bo = &dst_mem->bo->tbo; + src = &src_ttm_bo->mem; + dst = &dst_ttm_bo->mem; + src_mm = (struct drm_mm_node *)src->mm_node; + dst_mm = (struct drm_mm_node *)dst->mm_node; + + ring = adev->mman.buffer_funcs_ring; + + INIT_LIST_HEAD(&list); + + resv_list[0].bo = src_ttm_bo; + resv_list[0].shared = true; + resv_list[1].bo = dst_ttm_bo; + resv_list[1].shared = true; + + list_add_tail(&resv_list[0].head, &list); + list_add_tail(&resv_list[1].head, &list); + + if (!ring->ready) { + pr_err("Trying to move memory with ring turned off.\n"); + return -EINVAL; + } + + r = ttm_eu_reserve_buffers(&ticket, &list, false, NULL); + if (r) { + pr_err("Copy buffer failed. Unable to reserve bo (%d)\n", r); + return r; + } + + switch (src->mem_type) { + case TTM_PL_TT: + r = amdgpu_ttm_bind(src_ttm_bo, src); + if (r) { + DRM_ERROR("Copy failed. Cannot bind to gart\n"); + goto copy_fail; + } + break; + case TTM_PL_VRAM: + /* VRAM could be scattered. 
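restore_process_bos() above replaces the eviction fence on every restore because a fence can only move from unsignaled to signaled once and cannot be rearmed. The toy one-shot fence below illustrates that property; it is a conceptual sketch, not the kernel dma_fence API.

/* Illustrative sketch only, not part of this patch. A toy one-shot
 * "fence" that, like a dma_fence, can never return to the unsignaled
 * state; a restore therefore has to allocate a fresh fence and drop the
 * reference to the old one, as restore_process_bos() does.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_fence {
        unsigned long context;  /* kept identical across replacements */
        bool signaled;          /* one-way flag: false -> true only */
};

static struct toy_fence *toy_fence_create(unsigned long context)
{
        struct toy_fence *f = calloc(1, sizeof(*f));

        if (f)
                f->context = context;
        return f;
}

static void toy_fence_signal(struct toy_fence *f)
{
        f->signaled = true;     /* there is no way to clear this again */
}

int main(void)
{
        struct toy_fence *ef = toy_fence_create(42);
        struct toy_fence *new_ef;

        if (!ef)
                return 1;

        toy_fence_signal(ef);   /* eviction completed, fence signaled */

        /* "Restore": the signaled fence cannot be reused, so create a new
         * one with the same context and drop the old reference.
         */
        new_ef = toy_fence_create(ef->context);
        if (!new_ef) {
                free(ef);
                return 1;
        }
        free(ef);
        ef = new_ef;

        printf("new fence: context %lu, signaled=%d\n", ef->context,
               ef->signaled);
        free(ef);
        return 0;
}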
Find the node in which the offset + * belongs to + */ + while (src_offset >= (src_mm->size << PAGE_SHIFT)) { + src_offset -= (src_mm->size << PAGE_SHIFT); + ++src_mm; + } + break; + default: + DRM_ERROR("Unknown placement %d\n", src->mem_type); + r = -EINVAL; + goto copy_fail; + } + src_start = src_mm->start << PAGE_SHIFT; + src_start += src_ttm_bo->bdev->man[src->mem_type].gpu_offset; + src_start += src_offset; + src_left = (src_mm->size << PAGE_SHIFT) - src_offset; + + switch (dst->mem_type) { + case TTM_PL_TT: + r = amdgpu_ttm_bind(dst_ttm_bo, dst); + if (r) { + DRM_ERROR("Copy failed. Cannot bind to gart\n"); + goto copy_fail; + } + break; + case TTM_PL_VRAM: + while (dst_offset >= (dst_mm->size << PAGE_SHIFT)) { + dst_offset -= (dst_mm->size << PAGE_SHIFT); + ++dst_mm; + } + break; + default: + DRM_ERROR("Unknown placement %d\n", dst->mem_type); + r = -EINVAL; + goto copy_fail; + } + dst_start = dst_mm->start << PAGE_SHIFT; + dst_start += dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset; + dst_start += dst_offset; + dst_left = (dst_mm->size << PAGE_SHIFT) - dst_offset; + + do { + struct dma_fence *next; + + /* src_left/dst_left: amount of space left in the current node + * Copy minimum of (src_left, dst_left, amount of bytes left to + * copy) + */ + cur_copy_size = min3(src_left, dst_left, + (size - total_copy_size)); + + r = amdgpu_copy_buffer(ring, src_start, dst_start, + cur_copy_size, NULL, &next, false, false); + if (r) + break; + + /* Just keep the last fence */ + dma_fence_put(fence); + fence = next; + + total_copy_size += cur_copy_size; + /* Required amount of bytes copied. Done. */ + if (total_copy_size >= size) + break; + + /* If end of src or dst node is reached, move to next node */ + src_left -= cur_copy_size; + if (!src_left) { + ++src_mm; + src_start = src_mm->start << PAGE_SHIFT; + src_start += + src_ttm_bo->bdev->man[src->mem_type].gpu_offset; + src_left = src_mm->size << PAGE_SHIFT; + } else + src_start += cur_copy_size; + + dst_left -= cur_copy_size; + if (!dst_left) { + ++dst_mm; + dst_start = dst_mm->start << PAGE_SHIFT; + dst_start += + dst_ttm_bo->bdev->man[dst->mem_type].gpu_offset; + dst_left = dst_mm->size << PAGE_SHIFT; + } else + dst_start += cur_copy_size; + + } while (total_copy_size < size); + + /* Failure could occur after partial copy. 
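amdgpu_amdkfd_copy_mem_to_mem() above walks two lists of memory nodes and copies min3(src_left, dst_left, bytes remaining) per iteration, advancing whichever node runs out. The userspace sketch below reproduces that loop with made-up node sizes; it is not the driver code.

/* Illustrative sketch only, not part of this patch. Reproduces the
 * chunking loop of amdgpu_amdkfd_copy_mem_to_mem(): each iteration copies
 * the minimum of (space left in the current source node, space left in
 * the current destination node, bytes still to copy), then advances
 * whichever node was exhausted.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t min3_u64(uint64_t a, uint64_t b, uint64_t c)
{
        uint64_t m = a < b ? a : b;

        return m < c ? m : c;
}

int main(void)
{
        /* Demo node sizes in bytes (a scattered VRAM BO might look like this) */
        uint64_t src_nodes[] = { 8192, 4096, 16384 };
        uint64_t dst_nodes[] = { 4096, 4096, 4096, 16384 };
        uint64_t size = 20000, total = 0;
        unsigned int si = 0, di = 0;
        uint64_t src_left = src_nodes[0], dst_left = dst_nodes[0];

        while (total < size) {
                uint64_t cur = min3_u64(src_left, dst_left, size - total);

                printf("copy %llu bytes (src node %u, dst node %u)\n",
                       (unsigned long long)cur, si, di);
                total += cur;
                src_left -= cur;
                dst_left -= cur;

                if (total >= size)
                        break;
                if (!src_left)          /* source node exhausted, take the next */
                        src_left = src_nodes[++si];
                if (!dst_left)          /* destination node exhausted likewise */
                        dst_left = dst_nodes[++di];
        }
        printf("copied %llu of %llu bytes\n", (unsigned long long)total,
               (unsigned long long)size);
        return 0;
}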
So fill in amount copied + * and fence, still fill-in + */ + if (actual_size) + *actual_size = total_copy_size; + + if (fence) { + amdgpu_bo_fence(src_mem->bo, fence, true); + amdgpu_bo_fence(dst_mem->bo, fence, true); + } + + if (f) + *f = fence; + +copy_fail: + ttm_eu_backoff_reservation(&ticket, &list); + return r; +} + diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 9c472c5..2be2e05 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -817,11 +817,7 @@ static struct drm_driver kms_driver = { .driver_features = DRIVER_USE_AGP | DRIVER_HAVE_IRQ | DRIVER_IRQ_SHARED | DRIVER_GEM | -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0) DRIVER_PRIME | DRIVER_RENDER | DRIVER_MODESET | DRIVER_SYNCOBJ, -#else - DRIVER_PRIME | DRIVER_RENDER | DRIVER_MODESET, -#endif .load = amdgpu_driver_load_kms, .open = amdgpu_driver_open_kms, .postclose = amdgpu_driver_postclose_kms, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c old mode 100644 new mode 100755 index 283dc1b..f421505 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -36,6 +36,7 @@ #include #include "amdgpu.h" #include "amdgpu_trace.h" +#include "amdgpu_amdkfd.h" static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo) { @@ -46,6 +47,8 @@ static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo) if (bo->tbo.mem.mem_type == AMDGPU_PL_DGMA_IMPORT) kfree(tbo->mem.bus.addr); + if (bo->kfd_bo) + amdgpu_amdkfd_unreserve_system_memory_limit(bo); amdgpu_bo_kunmap(bo); if (bo->gem_base.import_attach) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h old mode 100644 new mode 100755 index 8a91658..f73dba5 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -89,6 +89,7 @@ struct amdgpu_bo { struct ttm_bo_kmap_obj dma_buf_vmap; struct amdgpu_mn *mn; + struct kgd_mem *kfd_bo; union { struct list_head mn_list; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h old mode 100644 new mode 100755 index 322d2529..af8e544 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -36,6 +36,7 @@ /* some special values for the owner field */ #define AMDGPU_FENCE_OWNER_UNDEFINED ((void*)0ul) #define AMDGPU_FENCE_OWNER_VM ((void*)1ul) +#define AMDGPU_FENCE_OWNER_KFD ((void *)2ul) #define AMDGPU_FENCE_FLAG_64BIT (1 << 0) #define AMDGPU_FENCE_FLAG_INT (1 << 1) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c old mode 100644 new mode 100755 index c586f44..7ee8247 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c @@ -31,6 +31,7 @@ #include #include "amdgpu.h" #include "amdgpu_trace.h" +#include "amdgpu_amdkfd.h" struct amdgpu_sync_entry { struct hlist_node node; @@ -84,11 +85,20 @@ static bool amdgpu_sync_same_dev(struct amdgpu_device *adev, */ static void *amdgpu_sync_get_owner(struct dma_fence *f) { - struct amd_sched_fence *s_fence = to_amd_sched_fence(f); + struct amd_sched_fence *s_fence; + struct amdgpu_amdkfd_fence *kfd_fence; + + if (f == NULL) + return AMDGPU_FENCE_OWNER_UNDEFINED; + s_fence = to_amd_sched_fence(f); if (s_fence) return s_fence->owner; + kfd_fence = to_amdgpu_amdkfd_fence(f); + if (kfd_fence) + return AMDGPU_FENCE_OWNER_KFD; + return AMDGPU_FENCE_OWNER_UNDEFINED; 
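amdgpu_sync_get_owner() above distinguishes owners with small sentinel pointer values (0, 1 and the newly added 2 for KFD) rather than real object pointers. A self-contained sketch of that sentinel-owner pattern, using stand-in types rather than the amdgpu structures:

/* Illustrative sketch only, not part of this patch. Shows the sentinel
 * owner values used by amdgpu_sync_get_owner(): tiny integers cast to
 * void * mark special owners, while ordinary fences carry a real pointer.
 */
#include <stdio.h>

#define OWNER_UNDEFINED ((void *)0ul)
#define OWNER_VM        ((void *)1ul)
#define OWNER_KFD       ((void *)2ul)   /* value introduced by this patch */

struct demo_fence {
        void *owner;            /* NULL, a sentinel, or a real object pointer */
};

static const char *owner_name(const struct demo_fence *f)
{
        if (!f || f->owner == OWNER_UNDEFINED)
                return "undefined";
        if (f->owner == OWNER_VM)
                return "VM update";
        if (f->owner == OWNER_KFD)
                return "KFD eviction fence";
        return "regular submission";
}

int main(void)
{
        int ctx;        /* pretend this is a real owner object */
        struct demo_fence fences[] = {
                { OWNER_UNDEFINED }, { OWNER_VM }, { OWNER_KFD }, { &ctx },
        };

        for (unsigned int i = 0; i < sizeof(fences) / sizeof(fences[0]); i++)
                printf("fence %u owner: %s\n", i, owner_name(&fences[i]));
        return 0;
}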
} @@ -171,7 +181,8 @@ int amdgpu_sync_fence(struct amdgpu_device *adev, struct amdgpu_sync *sync, * @resv: reservation object with embedded fence * @shared: true if we should only sync to the exclusive fence * - * Sync to the fence + * Sync to the fence except if it is KFD eviction fence and owner is + * AMDGPU_FENCE_OWNER_VM. */ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync, @@ -198,11 +209,15 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, for (i = 0; i < flist->shared_count; ++i) { f = rcu_dereference_protected(flist->shared[i], reservation_object_held(resv)); + fence_owner = amdgpu_sync_get_owner(f); + if (fence_owner == AMDGPU_FENCE_OWNER_KFD && + owner != AMDGPU_FENCE_OWNER_UNDEFINED) + continue; + if (amdgpu_sync_same_dev(adev, f)) { /* VM updates are only interesting * for other VM updates and moves. */ - fence_owner = amdgpu_sync_get_owner(f); if ((owner != AMDGPU_FENCE_OWNER_UNDEFINED) && (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED) && ((owner == AMDGPU_FENCE_OWNER_VM) != @@ -297,6 +312,31 @@ struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync) return NULL; } +int amdgpu_sync_clone(struct amdgpu_device *adev, + struct amdgpu_sync *source, + struct amdgpu_sync *clone) +{ + struct amdgpu_sync_entry *e; + struct hlist_node *tmp; + struct dma_fence *f; + int i, r; + + hash_for_each_safe(source->fences, i, tmp, e, node) { + + f = e->fence; + if (!dma_fence_is_signaled(f)) { + r = amdgpu_sync_fence(adev, clone, f); + if (r) + return r; + } else { + hash_del(&e->node); + dma_fence_put(f); + kmem_cache_free(amdgpu_sync_slab, e); + } + } + return 0; +} + int amdgpu_sync_wait(struct amdgpu_sync *sync, bool intr) { struct amdgpu_sync_entry *e; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h old mode 100644 new mode 100755 index dc76879..8e29bc7 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h @@ -49,6 +49,8 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct dma_fence *amdgpu_sync_peek_fence(struct amdgpu_sync *sync, struct amdgpu_ring *ring); struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync); +int amdgpu_sync_clone(struct amdgpu_device *adev, struct amdgpu_sync *source, + struct amdgpu_sync *clone); int amdgpu_sync_wait(struct amdgpu_sync *sync, bool intr); void amdgpu_sync_free(struct amdgpu_sync *sync); int amdgpu_sync_init(void); diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h old mode 100644 new mode 100755 index 9f34fab..f22f7a8 --- a/drivers/gpu/drm/amd/amdgpu/soc15d.h +++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h @@ -272,6 +272,7 @@ # define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0) # define PACKET3_INVALIDATE_TLBS_ALL_HUB(x) ((x) << 4) # define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5) +# define PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(x) ((x) << 29) #define PACKET3_SET_RESOURCES 0xA0 /* 1. header * 2. 
CONTROL diff --git a/drivers/gpu/drm/amd/amdgpu/vid.h b/drivers/gpu/drm/amd/amdgpu/vid.h old mode 100644 new mode 100755 index 323e21c..d09592a --- a/drivers/gpu/drm/amd/amdgpu/vid.h +++ b/drivers/gpu/drm/amd/amdgpu/vid.h @@ -27,6 +27,8 @@ #define SDMA1_REGISTER_OFFSET 0x200 /* not a register */ #define SDMA_MAX_INSTANCE 2 +#define KFD_VI_SDMA_QUEUE_OFFSET 0x80 /* not a register */ + /* crtc instance offsets */ #define CRTC0_REGISTER_OFFSET (0x1b9c - 0x1b9c) #define CRTC1_REGISTER_OFFSET (0x1d9c - 0x1b9c) diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile old mode 100644 new mode 100755 index f55a0f8..dba08ec --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -26,5 +26,3 @@ amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o obj-$(CONFIG_HSA_AMD) += amdkfd.o -AMDKFD_FULL_PATH = $(src) -include $(AMDKFD_FULL_PATH)/backport/Makefile diff --git a/drivers/gpu/drm/amd/amdkfd/backport/backport.h b/drivers/gpu/drm/amd/amdkfd/backport/backport.h index 8b13b98..e1f8c1d 100644 --- a/drivers/gpu/drm/amd/amdkfd/backport/backport.h +++ b/drivers/gpu/drm/amd/amdkfd/backport/backport.h @@ -2,12 +2,5 @@ #define AMDKFD_BACKPORT_H #include -#if defined(BUILD_AS_DKMS) -#include -#endif -#include -#include -#include -#include #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index b2795af..207a05e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -25,9 +25,7 @@ #include #include #include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) #include -#endif #include #include #include diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 5f597a6..4e94081 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -811,11 +811,7 @@ static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, */ pgdat = NODE_DATA(numa_node_id); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) - mem_in_bytes += pgdat->node_zones[zone_type].present_pages; -#else mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; -#endif mem_in_bytes <<= PAGE_SHIFT; sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index c6b447d..6b3a1fa 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -326,11 +326,6 @@ static void kfd_gtt_sa_fini(struct kfd_dev *kfd); static int kfd_resume(struct kfd_dev *kfd); -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -void kfd_init_processes_srcu(void); -void kfd_cleanup_processes_srcu(void); -#endif - static const struct kfd_device_info *lookup_device_info(unsigned short did) { size_t i; @@ -633,10 +628,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, kfd_ib_mem_init(kfd); -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) - kfd_init_processes_srcu(); -#endif - if (kfd_resume(kfd)) { dev_err(kfd_device, "Error resuming kfd\n"); goto kfd_resume_error; @@ -678,9 +669,6 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) { if (kfd->init_complete) { kgd2kfd_suspend(kfd); -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) - kfd_cleanup_processes_srcu(); -#endif kfd_cwsr_fini(kfd); device_queue_manager_uninit(kfd->dqm); kfd_interrupt_exit(kfd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 
8debe6e..7eacf42 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -24,10 +24,8 @@ #include #include #include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) #include #include -#endif #include #include #include "kfd_priv.h" @@ -269,13 +267,7 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) { struct kfd_event *ev; -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) - struct hlist_node *node; - - hash_for_each_possible(p->events, ev, node, events, id) -#else hash_for_each_possible(p->events, ev, events, id) -#endif if (ev->event_id == id) return ev; @@ -420,13 +412,7 @@ static void destroy_events(struct kfd_process *p) struct hlist_node *tmp; unsigned int hash_bkt; -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) - struct hlist_node *node; - - hash_for_each_safe(p->events, hash_bkt, node, tmp, ev, events) -#else hash_for_each_safe(p->events, hash_bkt, tmp, ev, events) -#endif destroy_event(p, ev); } @@ -972,16 +958,9 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, int bkt; bool send_signal = true; -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) - struct hlist_node *node; - ev_data = (struct kfd_hsa_memory_exception_data *) event_data; - - hash_for_each(p->events, bkt, node, ev, events) -#else ev_data = (struct kfd_hsa_memory_exception_data *) event_data; hash_for_each(p->events, bkt, ev, events) -#endif if (ev->type == type) { send_signal = false; dev_dbg(kfd_device, @@ -1114,9 +1093,6 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, int bkt; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); struct kfd_hsa_memory_exception_data memory_exception_data; -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) - struct hlist_node *node; -#endif if (!p) return; /* Presumably process exited. 
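lookup_event_by_id() in the surrounding hunk scans only the hash bucket selected by the event id, which is what hash_for_each_possible() provides. A compact userspace equivalent with a fixed bucket array and illustrative types, not the kfd_event structures:

/* Illustrative sketch only, not part of this patch. Mirrors the
 * hash_for_each_possible() lookup in lookup_event_by_id(): hash the id,
 * then walk just that bucket's chain comparing ids.
 */
#include <stdio.h>
#include <stdint.h>

#define NBUCKETS 16     /* power of two, like the kernel hashtable */

struct demo_event {
        uint32_t event_id;
        struct demo_event *next;        /* bucket chain */
};

static struct demo_event *buckets[NBUCKETS];

static unsigned int hash_id(uint32_t id)
{
        return id & (NBUCKETS - 1);
}

static void add_event(struct demo_event *ev)
{
        unsigned int b = hash_id(ev->event_id);

        ev->next = buckets[b];
        buckets[b] = ev;
}

static struct demo_event *lookup_event_by_id(uint32_t id)
{
        /* Only the one bucket that can hold this id is scanned */
        for (struct demo_event *ev = buckets[hash_id(id)]; ev; ev = ev->next)
                if (ev->event_id == id)
                        return ev;
        return NULL;
}

int main(void)
{
        struct demo_event a = { .event_id = 7 }, b = { .event_id = 23 };

        add_event(&a);
        add_event(&b);
        printf("id 23 -> %p, id 99 -> %p\n",
               (void *)lookup_event_by_id(23), (void *)lookup_event_by_id(99));
        return 0;
}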
*/ @@ -1136,11 +1112,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, } mutex_lock(&p->event_mutex); -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) - hash_for_each(p->events, bkt, node, ev, events) { -#else hash_for_each(p->events, bkt, ev, events) { -#endif if (ev->type == KFD_EVENT_TYPE_MEMORY) { ev->memory_exception_data = memory_exception_data; set_event(ev); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index 4f4392a..47dcf4a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -61,11 +61,7 @@ int kfd_interrupt_init(struct kfd_dev *kfd) return r; } -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) - kfd->ih_wq = create_rt_workqueue("KFD IH"); -#else kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); -#endif spin_lock_init(&kfd->interrupt_lock); INIT_WORK(&kfd->interrupt_work, interrupt_wq); @@ -115,15 +111,9 @@ bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry) count = kfifo_in(&kfd->ih_fifo, ih_ring_entry, kfd->device_info->ih_ring_entry_size); if (count != kfd->device_info->ih_ring_entry_size) { -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) - dev_err(kfd_chardev(), - "Interrupt ring overflow, dropping interrupt %d\n", - count); -#else dev_err_ratelimited(kfd_chardev(), "Interrupt ring overflow, dropping interrupt %d\n", count); -#endif return false; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c index c6be3ba..e67eb9f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c @@ -192,21 +192,13 @@ int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p, { int r; struct kfd_ipc_obj *entry, *found = NULL; -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) - struct hlist_node *tmp_node; -#endif mutex_lock(&kfd_ipc_handles.lock); /* Convert the user provided handle to hash key and search only in that * bucket */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) - hlist_for_each_entry(entry, tmp_node, - &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) { -#else hlist_for_each_entry(entry, &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) { -#endif if (!memcmp(entry->share_handle, share_handle, sizeof(entry->share_handle))) { found = entry; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index 64bf653..5724d33 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -465,19 +465,15 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, static int debugfs_show_mqd(struct seq_file *m, void *data) { -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, data, sizeof(struct cik_mqd), false); -#endif return 0; } static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) { -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, data, sizeof(struct cik_sdma_rlc_registers), false); -#endif return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 0713cac..6c302d2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -455,19 +455,15 @@ static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, 
static int debugfs_show_mqd(struct seq_file *m, void *data) { -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, data, sizeof(struct v9_mqd), false); -#endif return 0; } static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) { -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, data, sizeof(struct v9_sdma_mqd), false); -#endif return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index a5ba6f7..5c26e5a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -468,19 +468,15 @@ static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, static int debugfs_show_mqd(struct seq_file *m, void *data) { -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, data, sizeof(struct vi_mqd), false); -#endif return 0; } static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) { -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, data, sizeof(struct vi_sdma_mqd), false); -#endif return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 9fcb6fb..7cca7b4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -410,10 +410,8 @@ int pm_debugfs_runlist(struct seq_file *m, void *data) return 0; } -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 0, 0) && !defined(OS_NAME_RHEL_7_2) seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false); -#endif return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h old mode 100644 new mode 100755 index ebe311e..88fdfc9 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -36,11 +36,7 @@ #include #include #include -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -#include -#else #include -#endif #include #include @@ -727,7 +723,7 @@ struct kfd_process { size_t signal_event_count; bool signal_event_limit_reached; - struct rb_root bo_interval_tree; + struct rb_root_cached bo_interval_tree; /* Information used for memory eviction */ void *process_info; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index b458995..c798fa3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -23,10 +23,8 @@ #include #include #include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) #include #include -#endif #include #if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) #include @@ -50,20 +48,7 @@ struct mm_struct; static DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); static DEFINE_MUTEX(kfd_processes_mutex); -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) -static struct srcu_struct kfd_processes_srcu; -void kfd_init_processes_srcu(void) -{ - init_srcu_struct(&kfd_processes_srcu); -} - -void kfd_cleanup_processes_srcu(void) -{ - cleanup_srcu_struct(&kfd_processes_srcu); -} -#else DEFINE_STATIC_SRCU(kfd_processes_srcu); -#endif static struct workqueue_struct *kfd_process_wq; @@ -81,11 +66,7 @@ static int kfd_process_init_cwsr(struct kfd_process *p, struct file 
*filep); void kfd_process_create_wq(void) { if (!kfd_process_wq) -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) - kfd_process_wq = create_workqueue("kfd_process_wq"); -#else kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0); -#endif } void kfd_process_destroy_wq(void) @@ -273,15 +254,8 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) { struct kfd_process *process; -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) - struct hlist_node *node; - - hash_for_each_possible_rcu(kfd_processes_table, process, node, - kfd_processes, (uintptr_t)mm) -#else hash_for_each_possible_rcu(kfd_processes_table, process, kfd_processes, (uintptr_t)mm) -#endif if (process->mm == mm) return process; @@ -586,7 +560,7 @@ static struct kfd_process *create_process(const struct task_struct *thread, if (!process) goto err_alloc_process; - process->bo_interval_tree = RB_ROOT; + process->bo_interval_tree = RB_ROOT_CACHED; process->pasid = kfd_pasid_alloc(); if (process->pasid == 0) @@ -1026,13 +1000,7 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) int idx = srcu_read_lock(&kfd_processes_srcu); -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) - struct hlist_node *node; - - hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { -#else hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { -#endif if (p->pasid == pasid) { kref_get(&p->ref); ret_p = p; @@ -1051,13 +1019,7 @@ void kfd_suspend_all_processes(void) unsigned int temp; int idx = srcu_read_lock(&kfd_processes_srcu); -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) - struct hlist_node *node; - - hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { -#else hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { -#endif if (cancel_delayed_work_sync(&p->eviction_work.dwork)) dma_fence_put(p->eviction_work.quiesce_fence); cancel_delayed_work_sync(&p->restore_work); @@ -1077,13 +1039,7 @@ int kfd_resume_all_processes(void) unsigned int temp; int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) - struct hlist_node *node; - - hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { -#else hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { -#endif if (!schedule_delayed_work(&p->restore_work, 0)) { pr_err("Restore process %d failed during resume\n", p->pasid); @@ -1171,13 +1127,7 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) int idx = srcu_read_lock(&kfd_processes_srcu); -#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0) - struct hlist_node *node; - - hash_for_each_rcu(kfd_processes_table, temp, node, p, kfd_processes) { -#else hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { -#endif seq_printf(m, "Process %d PASID %d:\n", p->lead_thread->tgid, p->pasid); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index ffd8e0f..d08e3de 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -122,9 +122,7 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev) struct kfd_mem_properties *mem; struct kfd_cache_properties *cache; struct kfd_iolink_properties *iolink; -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) struct kfd_perf_properties *perf; -#endif list_del(&dev->list); @@ -149,14 +147,12 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev) kfree(iolink); } -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) 
while (dev->perf_props.next != &dev->perf_props) { perf = container_of(dev->perf_props.next, struct kfd_perf_properties, list); list_del(&perf->list); kfree(perf); } -#endif kfree(dev); } @@ -192,9 +188,7 @@ struct kfd_topology_device *kfd_create_topology_device( INIT_LIST_HEAD(&dev->mem_props); INIT_LIST_HEAD(&dev->cache_props); INIT_LIST_HEAD(&dev->io_link_props); -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) INIT_LIST_HEAD(&dev->perf_props); -#endif list_add_tail(&dev->list, device_list); @@ -374,7 +368,6 @@ static struct kobj_type cache_type = { .sysfs_ops = &cache_ops, }; -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) /****** Sysfs of Performance Counters ******/ struct kfd_perf_attr { @@ -407,7 +400,6 @@ static struct kfd_perf_attr perf_attr_iommu[] = { KFD_PERF_DESC(counter_ids, 0), }; /****************************************/ -#endif static ssize_t node_show(struct kobject *kobj, struct attribute *attr, char *buffer) @@ -546,9 +538,7 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) struct kfd_iolink_properties *iolink; struct kfd_cache_properties *cache; struct kfd_mem_properties *mem; -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) struct kfd_perf_properties *perf; -#endif if (dev->kobj_iolink) { list_for_each_entry(iolink, &dev->io_link_props, list) @@ -590,7 +580,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) dev->kobj_mem = NULL; } -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) if (dev->kobj_perf) { list_for_each_entry(perf, &dev->perf_props, list) { kfree(perf->attr_group); @@ -600,7 +589,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) kobject_put(dev->kobj_perf); dev->kobj_perf = NULL; } -#endif if (dev->kobj_node) { sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid); @@ -618,11 +606,9 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, struct kfd_iolink_properties *iolink; struct kfd_cache_properties *cache; struct kfd_mem_properties *mem; -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) struct kfd_perf_properties *perf; uint32_t num_attrs; struct attribute **attrs; -#endif int ret; uint32_t i; @@ -653,11 +639,9 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, if (!dev->kobj_iolink) return -ENOMEM; -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node); if (!dev->kobj_perf) return -ENOMEM; -#endif /* * Creating sysfs files for node properties @@ -749,7 +733,6 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, i++; } -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) /* All hardware blocks have the same number of attributes. */ num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr); list_for_each_entry(perf, &dev->perf_props, list) { @@ -775,7 +758,6 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, if (ret < 0) return ret; } -#endif return 0; } @@ -942,7 +924,6 @@ static void find_system_memory(const struct dmi_header *dm, } } -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) /* * Performance counters information is not part of CRAT but we would like to * put them in the sysfs under topology directory for Thunk to get the data. 
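The perf sysfs code above sizes its attribute set with sizeof(array)/sizeof(element) and registers the attributes as a group per hardware block. The sketch below shows that sizing plus the NULL-terminated attribute array convention with generic stand-ins; it does not use the real kobject/sysfs API.

/* Illustrative sketch only, not part of this patch. Shows the
 * sizeof(arr)/sizeof(arr[0]) sizing used for the perf attribute array and
 * the sysfs convention of a NULL-terminated array of attribute pointers.
 */
#include <stdio.h>
#include <stdlib.h>

struct demo_attr {
        const char *name;
        int value;
};

/* Stand-in attribute list; the names are only loosely modeled on the patch */
static struct demo_attr perf_attrs[] = {
        { "max_concurrent", 0 },
        { "counter_ids", 0 },
};

int main(void)
{
        size_t num_attrs = sizeof(perf_attrs) / sizeof(perf_attrs[0]);
        /* One extra slot so the array is NULL-terminated, as sysfs expects */
        struct demo_attr **attrs = calloc(num_attrs + 1, sizeof(*attrs));

        if (!attrs)
                return 1;
        for (size_t i = 0; i < num_attrs; i++)
                attrs[i] = &perf_attrs[i];

        for (struct demo_attr **a = attrs; *a; a++)
                printf("attribute: %s\n", (*a)->name);

        free(attrs);
        return 0;
}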
@@ -966,7 +947,6 @@ static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev) return 0; } -#endif /* kfd_add_non_crat_information - Add information that is not currently * defined in CRAT but is necessary for KFD topology @@ -1074,11 +1054,9 @@ int kfd_topology_init(void) } } -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) kdev = list_first_entry(&temp_topology_device_list, struct kfd_topology_device, list); kfd_add_perf_to_topology(kdev); -#endif down_write(&topology_lock); kfd_topology_update_device_list(&temp_topology_device_list, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index b59b32c..f22d420 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -141,14 +141,12 @@ struct kfd_iolink_properties { struct attribute attr; }; -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) struct kfd_perf_properties { struct list_head list; char block_name[16]; uint32_t max_concurrent; struct attribute_group *attr_group; }; -#endif struct kfd_topology_device { struct list_head list; @@ -160,17 +158,13 @@ struct kfd_topology_device { struct list_head cache_props; uint32_t io_link_count; struct list_head io_link_props; -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) struct list_head perf_props; -#endif struct kfd_dev *gpu; struct kobject *kobj_node; struct kobject *kobj_mem; struct kobject *kobj_cache; struct kobject *kobj_iolink; -#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 10, 0) struct kobject *kobj_perf; -#endif struct attribute attr_gpuid; struct attribute attr_name; struct attribute attr_props; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 2780641..977b21b 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -707,11 +707,7 @@ static int dm_display_resume(struct drm_device *ddev) err: DRM_ERROR("Restoring old state failed with %i\n", ret); -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) - drm_atomic_state_free(state); -#else drm_atomic_state_put(state); -#endif return ret; } diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h old mode 100644 new mode 100755 index 36f3766..b6cf2d5 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -30,6 +30,7 @@ #include #include +#include struct pci_dev; @@ -40,6 +41,46 @@ struct kfd_dev; struct kgd_dev; struct kgd_mem; +struct kfd_process_device; +struct amdgpu_bo; + +enum kfd_preempt_type { + KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN = 0, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET, +}; + +struct kfd_vm_fault_info { + uint64_t page_addr; + uint32_t vmid; + uint32_t mc_id; + uint32_t status; + bool prot_valid; + bool prot_read; + bool prot_write; + bool prot_exec; +}; + +struct kfd_cu_info { + uint32_t num_shader_engines; + uint32_t num_shader_arrays_per_engine; + uint32_t num_cu_per_sh; + uint32_t cu_active_number; + uint32_t cu_ao_mask; + uint32_t simd_per_cu; + uint32_t max_waves_per_simd; + uint32_t wave_front_size; + uint32_t max_scratch_slots_per_cu; + uint32_t lds_size; + uint32_t cu_bitmap[4][4]; +}; + +/* For getting GPU local memory information from KGD */ +struct kfd_local_mem_info { + uint64_t local_mem_size_private; + uint64_t local_mem_size_public; + uint32_t vram_width; + uint32_t mem_clk_max; +}; enum kgd_memory_pool { KGD_POOL_SYSTEM_CACHEABLE = 1, @@ -72,6 +113,21 @@ struct 
kgd2kfd_shared_resources { /* Bit n == 1 means Queue n is available for KFD */ DECLARE_BITMAP(queue_bitmap, KGD_MAX_QUEUES); + /* Doorbell assignments (SOC15 and later chips only). Only + * specific doorbells are routed to each SDMA engine. Others + * are routed to IH and VCN. They are not usable by the CP. + * + * Any doorbell number D that satisfies the following condition + * is reserved: (D & reserved_doorbell_mask) == reserved_doorbell_val + * + * KFD currently uses 1024 (= 0x3ff) doorbells per process. If + * doorbells 0x0f0-0x0f7 and 0x2f-0x2f7 are reserved, that means + * mask would be set to 0x1f8 and val set to 0x0f0. + */ + unsigned int sdma_doorbell[2][2]; + unsigned int reserved_doorbell_mask; + unsigned int reserved_doorbell_val; + /* Base address of doorbell aperture. */ phys_addr_t doorbell_physical_address; @@ -80,8 +136,41 @@ struct kgd2kfd_shared_resources { /* Number of bytes at start of aperture reserved for KGD. */ size_t doorbell_start_offset; + + /* GPUVM address space size in bytes */ + uint64_t gpuvm_size; }; +struct tile_config { + uint32_t *tile_config_ptr; + uint32_t *macro_tile_config_ptr; + uint32_t num_tile_configs; + uint32_t num_macro_tile_configs; + + uint32_t gb_addr_config; + uint32_t num_banks; + uint32_t num_ranks; +}; + +/* + * Allocation flag domains currently only VRAM and GTT domain supported + */ +#define ALLOC_MEM_FLAGS_VRAM (1 << 0) +#define ALLOC_MEM_FLAGS_GTT (1 << 1) +#define ALLOC_MEM_FLAGS_USERPTR (1 << 2) +#define ALLOC_MEM_FLAGS_DOORBELL (1 << 3) + +/* + * Allocation flags attributes/access options. + */ +#define ALLOC_MEM_FLAGS_NONPAGED (1 << 31) +#define ALLOC_MEM_FLAGS_READONLY (1 << 30) +#define ALLOC_MEM_FLAGS_PUBLIC (1 << 29) +#define ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) +#define ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) +#define ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26) +#define ALLOC_MEM_FLAGS_COHERENT (1 << 25) + /** * struct kfd2kgd_calls * @@ -90,7 +179,7 @@ struct kgd2kfd_shared_resources { * * @free_gtt_mem: Frees a buffer that was allocated on the gart aperture * - * @get_vmem_size: Retrieves (physical) size of VRAM + * @get_local_mem_info: Retrieves information about GPU local memory * * @get_gpu_clock_counter: Retrieves GPU clock counter * @@ -112,6 +201,12 @@ struct kgd2kfd_shared_resources { * @hqd_sdma_load: Loads the SDMA mqd structure to a H/W SDMA hqd slot. * used only for no HWS mode. * + * @hqd_dump: Dumps CPC HQD registers to an array of address-value pairs. + * Array is allocated with kmalloc, needs to be freed with kfree by caller. + * + * @hqd_sdma_dump: Dumps SDMA HQD registers to an array of address-value pairs. + * Array is allocated with kmalloc, needs to be freed with kfree by caller. + * * @hqd_is_occupies: Checks if a hqd slot is occupied. * * @hqd_destroy: Destructs and preempts the queue assigned to that hqd slot. @@ -121,8 +216,34 @@ struct kgd2kfd_shared_resources { * @hqd_sdma_destroy: Destructs and preempts the SDMA queue assigned to that * SDMA hqd slot. 
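The doorbell reservation rule documented above, (D & reserved_doorbell_mask) == reserved_doorbell_val, can be verified with a few lines of arithmetic. A userspace sketch using the mask/val example from the comment (0x1f8 and 0x0f0):

/* Illustrative sketch only, not part of this patch. Evaluates the
 * reservation rule from kgd2kfd_shared_resources: doorbell D is reserved
 * iff (D & reserved_doorbell_mask) == reserved_doorbell_val. With
 * mask 0x1f8 and val 0x0f0 this reserves 0x0f0-0x0f7 and 0x2f0-0x2f7
 * out of the 0x400 doorbells a process uses.
 */
#include <stdio.h>

int main(void)
{
        unsigned int reserved_doorbell_mask = 0x1f8;
        unsigned int reserved_doorbell_val = 0x0f0;
        unsigned int d, reserved = 0;

        for (d = 0; d < 0x400; d++) {
                if ((d & reserved_doorbell_mask) == reserved_doorbell_val) {
                        printf("doorbell 0x%03x is reserved\n", d);
                        reserved++;
                }
        }
        printf("%u of 0x400 doorbells reserved\n", reserved);
        return 0;
}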
* + * @map_memory_to_gpu: Allocates and pins BO, PD and all related PTs + * + * @unmap_memory_to_gpu: Releases and unpins BO, PD and all related PTs + * * @get_fw_version: Returns FW versions from the header * + * @set_num_of_requests: Sets number of Peripheral Page Request (PPR) sent to + * IOMMU when address translation failed + * + * @get_cu_info: Retrieves activated cu info + * + * @get_dmabuf_info: Returns information about a dmabuf if it was + * created by the GPU driver + * + * @import_dmabuf: Imports a DMA buffer, creating a new kgd_mem object + * Supports only DMA buffers created by GPU driver on the same GPU + * + * @export_dmabuf: Emports a KFD BO for sharing with other process + * + * @submit_ib: Submits an IB to the engine specified by inserting the IB to + * the corresonded ring (ring type). + * + * @restore_process_bos: Restore all BOs that belongs to the process + * + * @copy_mem_to_mem: Copies size bytes from source BO to destination BO + * + * @get_vram_usage: Returns current VRAM usage + * * This structure contains function pointers to services that the kgd driver * provides to amdkfd driver. * @@ -134,11 +255,23 @@ struct kfd2kgd_calls { void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj); - uint64_t (*get_vmem_size)(struct kgd_dev *kgd); + void(*get_local_mem_info)(struct kgd_dev *kgd, + struct kfd_local_mem_info *mem_info); uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd); uint32_t (*get_max_engine_clock_in_mhz)(struct kgd_dev *kgd); + int (*create_process_vm)(struct kgd_dev *kgd, void **vm, + void **process_info, struct dma_fence **ef); + void (*destroy_process_vm)(struct kgd_dev *kgd, void *vm); + + int (*create_process_gpumem)(struct kgd_dev *kgd, uint64_t va, size_t size, void *vm, struct kgd_mem **mem); + void (*destroy_process_gpumem)(struct kgd_dev *kgd, struct kgd_mem *mem); + + uint32_t (*get_process_page_dir)(void *vm); + + int (*open_graphic_handle)(struct kgd_dev *kgd, uint64_t va, void *vm, int fd, uint32_t handle, struct kgd_mem **mem); + /* Register access functions */ void (*program_sh_mem_settings)(struct kgd_dev *kgd, uint32_t vmid, uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, @@ -151,16 +284,28 @@ struct kfd2kgd_calls { uint32_t hpd_size, uint64_t hpd_gpu_addr); int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id); + int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr); + uint32_t queue_id, uint32_t __user *wptr, + uint32_t wptr_shift, uint32_t wptr_mask, + struct mm_struct *mm); + + int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd, + uint32_t __user *wptr, struct mm_struct *mm); + + int (*hqd_dump)(struct kgd_dev *kgd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs); - int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd); + int (*hqd_sdma_dump)(struct kgd_dev *kgd, + uint32_t engine_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs); bool (*hqd_is_occupied)(struct kgd_dev *kgd, uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id); - int (*hqd_destroy)(struct kgd_dev *kgd, uint32_t reset_type, + int (*hqd_destroy)(struct kgd_dev *kgd, void *mqd, uint32_t reset_type, unsigned int timeout, uint32_t pipe_id, uint32_t queue_id); @@ -168,7 +313,7 @@ struct kfd2kgd_calls { int (*hqd_sdma_destroy)(struct kgd_dev *kgd, void *mqd, unsigned int timeout); - + int (*address_watch_disable)(struct kgd_dev *kgd); int (*address_watch_execute)(struct kgd_dev *kgd, unsigned int watch_point_id, @@ -187,11 +332,72 @@ struct 
kfd2kgd_calls { uint16_t (*get_atc_vmid_pasid_mapping_pasid)( struct kgd_dev *kgd, uint8_t vmid); + uint32_t (*read_vmid_from_vmfault_reg)(struct kgd_dev *kgd); void (*write_vmid_invalidate_request)(struct kgd_dev *kgd, uint8_t vmid); + int (*invalidate_tlbs)(struct kgd_dev *kgd, uint16_t pasid); + + int (*sync_memory)(struct kgd_dev *kgd, struct kgd_mem *mem, bool intr); + + int (*alloc_memory_of_gpu)(struct kgd_dev *kgd, uint64_t va, + uint64_t size, void *vm, + struct kgd_mem **mem, uint64_t *offset, + uint32_t flags); + int (*free_memory_of_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, + void *vm); + int (*map_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, + void *vm); + int (*unmap_memory_to_gpu)(struct kgd_dev *kgd, struct kgd_mem *mem, + void *vm); + uint16_t (*get_fw_version)(struct kgd_dev *kgd, enum kgd_engine_type type); + + void (*set_num_of_requests)(struct kgd_dev *kgd, + uint8_t num_of_requests); + int (*alloc_memory_of_scratch)(struct kgd_dev *kgd, + uint64_t va, uint32_t vmid); + int (*write_config_static_mem)(struct kgd_dev *kgd, bool swizzle_enable, + uint8_t element_size, uint8_t index_stride, uint8_t mtype); + void (*get_cu_info)(struct kgd_dev *kgd, + struct kfd_cu_info *cu_info); + int (*mmap_bo)(struct kgd_dev *kgd, struct vm_area_struct *vma); + int (*map_gtt_bo_to_kernel)(struct kgd_dev *kgd, + struct kgd_mem *mem, void **kptr); + void (*set_vm_context_page_table_base)(struct kgd_dev *kgd, uint32_t vmid, + uint32_t page_table_base); + + int (*pin_get_sg_table_bo)(struct kgd_dev *kgd, + struct kgd_mem *mem, uint64_t offset, + uint64_t size, struct sg_table **ret_sg); + void (*unpin_put_sg_table_bo)(struct kgd_mem *mem, + struct sg_table *sg); + + int (*get_dmabuf_info)(struct kgd_dev *kgd, int dma_buf_fd, + struct kgd_dev **dma_buf_kgd, uint64_t *bo_size, + void *metadata_buffer, size_t buffer_size, + uint32_t *metadata_size, uint32_t *flags); + int (*import_dmabuf)(struct kgd_dev *kgd, struct dma_buf *dmabuf, + uint64_t va, void *vm, struct kgd_mem **mem, + uint64_t *size, uint64_t *mmap_offset); + int (*export_dmabuf)(struct kgd_dev *kgd, void *vm, struct kgd_mem *mem, + struct dma_buf **dmabuf); + + int (*get_vm_fault_info)(struct kgd_dev *kgd, + struct kfd_vm_fault_info *info); + int (*submit_ib)(struct kgd_dev *kgd, enum kgd_engine_type engine, + uint32_t vmid, uint64_t gpu_addr, + uint32_t *ib_cmd, uint32_t ib_len); + int (*get_tile_config)(struct kgd_dev *kgd, + struct tile_config *config); + + int (*restore_process_bos)(void *process_info, struct dma_fence **ef); + int (*copy_mem_to_mem)(struct kgd_dev *kgd, struct kgd_mem *src_mem, + uint64_t src_offset, struct kgd_mem *dst_mem, + uint64_t dest_offset, uint64_t size, + struct dma_fence **f, uint64_t *actual_size); + uint64_t (*get_vram_usage)(struct kgd_dev *kgd); }; /** @@ -210,6 +416,13 @@ struct kfd2kgd_calls { * * @resume: Notifies amdkfd about a resume action done to a kgd device * + * @quiesce_mm: Quiesce all user queue access to specified MM address space + * + * @resume_mm: Resume user queue access to specified MM address space + * + * @schedule_evict_and_restore_process: Schedules work queue that will prepare + * for safe eviction of KFD BOs that belong to the specified process. + * * This structure contains function callback pointers so the kgd driver * will notify to the amdkfd about certain status changes. 
* @@ -224,9 +437,13 @@ struct kgd2kfd_calls { void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry); void (*suspend)(struct kfd_dev *kfd); int (*resume)(struct kfd_dev *kfd); + int (*quiesce_mm)(struct kfd_dev *kfd, struct mm_struct *mm); + int (*resume_mm)(struct kfd_dev *kfd, struct mm_struct *mm); + int (*schedule_evict_and_restore_process)(struct mm_struct *mm, + struct dma_fence *fence); }; int kgd2kfd_init(unsigned interface_version, const struct kgd2kfd_calls **g2f); -#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ +#endif /* KGD_KFD_INTERFACE_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/include/v9_structs.h b/drivers/gpu/drm/amd/include/v9_structs.h old mode 100644 new mode 100755 index 2fb25ab..ceaf493 --- a/drivers/gpu/drm/amd/include/v9_structs.h +++ b/drivers/gpu/drm/amd/include/v9_structs.h @@ -29,10 +29,10 @@ struct v9_sdma_mqd { uint32_t sdmax_rlcx_rb_base; uint32_t sdmax_rlcx_rb_base_hi; uint32_t sdmax_rlcx_rb_rptr; + uint32_t sdmax_rlcx_rb_rptr_hi; uint32_t sdmax_rlcx_rb_wptr; + uint32_t sdmax_rlcx_rb_wptr_hi; uint32_t sdmax_rlcx_rb_wptr_poll_cntl; - uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi; - uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo; uint32_t sdmax_rlcx_rb_rptr_addr_hi; uint32_t sdmax_rlcx_rb_rptr_addr_lo; uint32_t sdmax_rlcx_ib_cntl; @@ -44,29 +44,29 @@ struct v9_sdma_mqd { uint32_t sdmax_rlcx_skip_cntl; uint32_t sdmax_rlcx_context_status; uint32_t sdmax_rlcx_doorbell; - uint32_t sdmax_rlcx_virtual_addr; - uint32_t sdmax_rlcx_ape1_cntl; + uint32_t sdmax_rlcx_status; uint32_t sdmax_rlcx_doorbell_log; - uint32_t reserved_22; - uint32_t reserved_23; - uint32_t reserved_24; - uint32_t reserved_25; - uint32_t reserved_26; - uint32_t reserved_27; - uint32_t reserved_28; - uint32_t reserved_29; - uint32_t reserved_30; - uint32_t reserved_31; - uint32_t reserved_32; - uint32_t reserved_33; - uint32_t reserved_34; - uint32_t reserved_35; - uint32_t reserved_36; - uint32_t reserved_37; - uint32_t reserved_38; - uint32_t reserved_39; - uint32_t reserved_40; - uint32_t reserved_41; + uint32_t sdmax_rlcx_watermark; + uint32_t sdmax_rlcx_doorbell_offset; + uint32_t sdmax_rlcx_csa_addr_lo; + uint32_t sdmax_rlcx_csa_addr_hi; + uint32_t sdmax_rlcx_ib_sub_remain; + uint32_t sdmax_rlcx_preempt; + uint32_t sdmax_rlcx_dummy_reg; + uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi; + uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo; + uint32_t sdmax_rlcx_rb_aql_cntl; + uint32_t sdmax_rlcx_minor_ptr_update; + uint32_t sdmax_rlcx_midcmd_data0; + uint32_t sdmax_rlcx_midcmd_data1; + uint32_t sdmax_rlcx_midcmd_data2; + uint32_t sdmax_rlcx_midcmd_data3; + uint32_t sdmax_rlcx_midcmd_data4; + uint32_t sdmax_rlcx_midcmd_data5; + uint32_t sdmax_rlcx_midcmd_data6; + uint32_t sdmax_rlcx_midcmd_data7; + uint32_t sdmax_rlcx_midcmd_data8; + uint32_t sdmax_rlcx_midcmd_cntl; uint32_t reserved_42; uint32_t reserved_43; uint32_t reserved_44; diff --git a/drivers/gpu/drm/amd/include/vi_structs.h b/drivers/gpu/drm/amd/include/vi_structs.h old mode 100644 new mode 100755 index 2023482..717fbae --- a/drivers/gpu/drm/amd/include/vi_structs.h +++ b/drivers/gpu/drm/amd/include/vi_structs.h @@ -153,6 +153,8 @@ struct vi_sdma_mqd { uint32_t reserved_125; uint32_t reserved_126; uint32_t reserved_127; + uint32_t sdma_engine_id; + uint32_t sdma_queue_id; }; struct vi_mqd { diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c old mode 100644 new mode 100755 index 2292462..82d97f3 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2983,6 +2983,87 @@ bool pci_acs_path_enabled(struct pci_dev *start, } /** + * 
pci_enable_atomic_ops_to_root - enable AtomicOp requests to root port + * @dev: the PCI device + * + * Return 0 if the device is capable of generating AtomicOp requests, + * all upstream bridges support AtomicOp routing, egress blocking is disabled + * on all upstream ports, and the root port supports 32-bit, 64-bit and/or + * 128-bit AtomicOp completion, or negative otherwise. + */ +int pci_enable_atomic_ops_to_root(struct pci_dev *dev) +{ + struct pci_bus *bus = dev->bus; + + if (!pci_is_pcie(dev)) + return -EINVAL; + + switch (pci_pcie_type(dev)) { + /* + * PCIe 3.0, 6.15 specifies that endpoints and root ports are permitted + * to implement AtomicOp requester capabilities. + */ + case PCI_EXP_TYPE_ENDPOINT: + case PCI_EXP_TYPE_LEG_END: + case PCI_EXP_TYPE_RC_END: + break; + default: + return -EINVAL; + } + + while (bus->parent) { + struct pci_dev *bridge = bus->self; + u32 cap; + + pcie_capability_read_dword(bridge, PCI_EXP_DEVCAP2, &cap); + + switch (pci_pcie_type(bridge)) { + /* + * Upstream, downstream and root ports may implement AtomicOp + * routing capabilities. AtomicOp routing via a root port is + * not considered. + */ + case PCI_EXP_TYPE_UPSTREAM: + case PCI_EXP_TYPE_DOWNSTREAM: + if (!(cap & PCI_EXP_DEVCAP2_ATOMIC_ROUTE)) + return -EINVAL; + break; + + /* + * Root ports are permitted to implement AtomicOp completion + * capabilities. + */ + case PCI_EXP_TYPE_ROOT_PORT: + if (!(cap & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | + PCI_EXP_DEVCAP2_ATOMIC_COMP64 | + PCI_EXP_DEVCAP2_ATOMIC_COMP128))) + return -EINVAL; + break; + } + + /* + * Upstream ports may block AtomicOps on egress. + */ + if (pci_pcie_type(bridge) == PCI_EXP_TYPE_UPSTREAM) { + u32 ctl2; + + pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2, + &ctl2); + if (ctl2 & PCI_EXP_DEVCTL2_ATOMIC_BLOCK) + return -EINVAL; + } + + bus = bus->parent; + } + + pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_ATOMIC_REQ); + + return 0; +} +EXPORT_SYMBOL(pci_enable_atomic_ops_to_root); + +/** * pci_swizzle_interrupt_pin - swizzle INTx for device behind bridge * @dev: the PCI device * @pin: the INTx pin (1=INTA, 2=INTB, 3=INTC, 4=INTD) diff --git a/include/drm/amd_rdma.h b/include/drm/amd_rdma.h new file mode 100644 index 0000000..b0cab3c --- /dev/null +++ b/include/drm/amd_rdma.h @@ -0,0 +1,70 @@ +/* + * Copyright 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +/* @file This file defines kernel interfaces to communicate with amdkfd */ + +#ifndef AMD_RDMA_H_ +#define AMD_RDMA_H_ + + +/** + * Structure describing information needed for P2P access from another device + * to a specific location of GPU memory + */ +struct amd_p2p_info { + uint64_t va; /**< Specify user virt. address + * which this page table + * describes + */ + uint64_t size; /**< Specify total size of + * allocation + */ + struct pid *pid; /**< Specify process pid to which + * virtual address belongs + */ + struct sg_table *pages; /**< Specify DMA/Bus addresses */ + void *priv; /**< Pointer set by AMD kernel + * driver + */ +}; + +/** + * Structure providing function pointers to support RDMA/P2P access + * to a specific location of GPU memory + */ +struct amd_rdma_interface { + int (*get_pages)(uint64_t address, uint64_t length, struct pid *pid, + struct amd_p2p_info **amd_p2p_data, + void (*free_callback)(void *client_priv), + void *client_priv); + int (*put_pages)(struct amd_p2p_info **amd_p2p_data); + int (*is_gpu_address)(uint64_t address, struct pid *pid); + int (*get_page_size)(uint64_t address, uint64_t length, struct pid *pid, + unsigned long *page_size); +}; + + +int amdkfd_query_rdma_interface(const struct amd_rdma_interface **rdma); + + +#endif /* AMD_RDMA_H_ */ + diff --git a/include/linux/pci.h b/include/linux/pci.h old mode 100644 new mode 100755 index b1abbcc..3df545d --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2072,6 +2072,7 @@ void pci_request_acs(void); bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags); bool pci_acs_path_enabled(struct pci_dev *start, struct pci_dev *end, u16 acs_flags); +int pci_enable_atomic_ops_to_root(struct pci_dev *dev); #define PCI_VPD_LRDT 0x80 /* Large Resource Data Type */ #define PCI_VPD_LRDT_ID(x) ((x) | PCI_VPD_LRDT) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 5bb2b45..de5367c 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -23,15 +23,15 @@ #ifndef KFD_IOCTL_H_INCLUDED #define KFD_IOCTL_H_INCLUDED -#include +#include #include #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 1 +#define KFD_IOCTL_MINOR_VERSION 2 struct kfd_ioctl_get_version_args { - __u32 major_version; /* from KFD */ - __u32 minor_version; /* from KFD */ + uint32_t major_version; /* from KFD */ + uint32_t minor_version; /* from KFD */ }; /* For kfd_ioctl_create_queue_args.queue_type.
*/ @@ -43,36 +43,51 @@ struct kfd_ioctl_get_version_args { #define KFD_MAX_QUEUE_PRIORITY 15 struct kfd_ioctl_create_queue_args { - __u64 ring_base_address; /* to KFD */ - __u64 write_pointer_address; /* from KFD */ - __u64 read_pointer_address; /* from KFD */ - __u64 doorbell_offset; /* from KFD */ - - __u32 ring_size; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 queue_type; /* to KFD */ - __u32 queue_percentage; /* to KFD */ - __u32 queue_priority; /* to KFD */ - __u32 queue_id; /* from KFD */ - - __u64 eop_buffer_address; /* to KFD */ - __u64 eop_buffer_size; /* to KFD */ - __u64 ctx_save_restore_address; /* to KFD */ - __u64 ctx_save_restore_size; /* to KFD */ + uint64_t ring_base_address; /* to KFD */ + uint64_t write_pointer_address; /* from KFD */ + uint64_t read_pointer_address; /* from KFD */ + uint64_t doorbell_offset; /* from KFD */ + + uint32_t ring_size; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t queue_type; /* to KFD */ + uint32_t queue_percentage; /* to KFD */ + uint32_t queue_priority; /* to KFD */ + uint32_t queue_id; /* from KFD */ + + uint64_t eop_buffer_address; /* to KFD */ + uint64_t eop_buffer_size; /* to KFD */ + uint64_t ctx_save_restore_address; /* to KFD */ + uint32_t ctx_save_restore_size; /* to KFD */ + uint32_t ctl_stack_size; /* to KFD */ }; struct kfd_ioctl_destroy_queue_args { - __u32 queue_id; /* to KFD */ - __u32 pad; + uint32_t queue_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_update_queue_args { - __u64 ring_base_address; /* to KFD */ + uint64_t ring_base_address; /* to KFD */ + + uint32_t queue_id; /* to KFD */ + uint32_t ring_size; /* to KFD */ + uint32_t queue_percentage; /* to KFD */ + uint32_t queue_priority; /* to KFD */ +}; - __u32 queue_id; /* to KFD */ - __u32 ring_size; /* to KFD */ - __u32 queue_percentage; /* to KFD */ - __u32 queue_priority; /* to KFD */ +struct kfd_ioctl_set_cu_mask_args { + uint32_t queue_id; /* to KFD */ + uint32_t num_cu_mask; /* to KFD */ + uint64_t cu_mask_ptr; /* to KFD */ +}; + +struct kfd_ioctl_get_queue_wave_state_args { + uint64_t ctl_stack_address; /* to KFD */ + uint32_t ctl_stack_used_size; /* from KFD */ + uint32_t save_area_used_size; /* from KFD */ + uint32_t queue_id; /* to KFD */ + uint32_t pad; }; /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ @@ -80,13 +95,20 @@ struct kfd_ioctl_update_queue_args { #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 struct kfd_ioctl_set_memory_policy_args { - __u64 alternate_aperture_base; /* to KFD */ - __u64 alternate_aperture_size; /* to KFD */ + uint64_t alternate_aperture_base; /* to KFD */ + uint64_t alternate_aperture_size; /* to KFD */ + + uint32_t gpu_id; /* to KFD */ + uint32_t default_policy; /* to KFD */ + uint32_t alternate_policy; /* to KFD */ + uint32_t pad; +}; - __u32 gpu_id; /* to KFD */ - __u32 default_policy; /* to KFD */ - __u32 alternate_policy; /* to KFD */ - __u32 pad; +struct kfd_ioctl_set_trap_handler_args { + uint64_t tba_addr; + uint64_t tma_addr; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; /* @@ -97,35 +119,52 @@ struct kfd_ioctl_set_memory_policy_args { */ struct kfd_ioctl_get_clock_counters_args { - __u64 gpu_clock_counter; /* from KFD */ - __u64 cpu_clock_counter; /* from KFD */ - __u64 system_clock_counter; /* from KFD */ - __u64 system_clock_freq; /* from KFD */ + uint64_t gpu_clock_counter; /* from KFD */ + uint64_t cpu_clock_counter; /* from KFD */ + uint64_t system_clock_counter; /* from KFD */ + uint64_t system_clock_freq; /* from KFD */ - __u32 gpu_id; /* to KFD */ - __u32 
pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; #define NUM_OF_SUPPORTED_GPUS 7 struct kfd_process_device_apertures { - __u64 lds_base; /* from KFD */ - __u64 lds_limit; /* from KFD */ - __u64 scratch_base; /* from KFD */ - __u64 scratch_limit; /* from KFD */ - __u64 gpuvm_base; /* from KFD */ - __u64 gpuvm_limit; /* from KFD */ - __u32 gpu_id; /* from KFD */ - __u32 pad; + uint64_t lds_base; /* from KFD */ + uint64_t lds_limit; /* from KFD */ + uint64_t scratch_base; /* from KFD */ + uint64_t scratch_limit; /* from KFD */ + uint64_t gpuvm_base; /* from KFD */ + uint64_t gpuvm_limit; /* from KFD */ + uint32_t gpu_id; /* from KFD */ + uint32_t pad; }; +/* This IOCTL and the limited NUM_OF_SUPPORTED_GPUS is deprecated. Use + * kfd_ioctl_get_process_apertures_new instead, which supports + * arbitrary numbers of GPUs. + */ struct kfd_ioctl_get_process_apertures_args { struct kfd_process_device_apertures process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */ /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ - __u32 num_of_nodes; - __u32 pad; + uint32_t num_of_nodes; + uint32_t pad; +}; + +struct kfd_ioctl_get_process_apertures_new_args { + /* User allocated. Pointer to struct kfd_process_device_apertures + * filled in by Kernel + */ + uint64_t kfd_process_device_apertures_ptr; + /* to KFD - indicates amount of memory present in + * kfd_process_device_apertures_ptr + * from KFD - Number of entries filled by KFD. + */ + uint32_t num_of_nodes; + uint32_t pad; }; #define MAX_ALLOWED_NUM_POINTS 100 @@ -133,103 +172,245 @@ struct kfd_ioctl_get_process_apertures_args { #define MAX_ALLOWED_WAC_BUFF_SIZE 128 struct kfd_ioctl_dbg_register_args { - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_dbg_unregister_args { - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_dbg_address_watch_args { - __u64 content_ptr; /* a pointer to the actual content */ - __u32 gpu_id; /* to KFD */ - __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ + uint64_t content_ptr; /* a pointer to the actual content */ + uint32_t gpu_id; /* to KFD */ + uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ }; struct kfd_ioctl_dbg_wave_control_args { - __u64 content_ptr; /* a pointer to the actual content */ - __u32 gpu_id; /* to KFD */ - __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ + uint64_t content_ptr; /* a pointer to the actual content */ + uint32_t gpu_id; /* to KFD */ + uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ }; /* Matching HSA_EVENTTYPE */ -#define KFD_IOC_EVENT_SIGNAL 0 -#define KFD_IOC_EVENT_NODECHANGE 1 -#define KFD_IOC_EVENT_DEVICESTATECHANGE 2 -#define KFD_IOC_EVENT_HW_EXCEPTION 3 -#define KFD_IOC_EVENT_SYSTEM_EVENT 4 -#define KFD_IOC_EVENT_DEBUG_EVENT 5 -#define KFD_IOC_EVENT_PROFILE_EVENT 6 -#define KFD_IOC_EVENT_QUEUE_EVENT 7 -#define KFD_IOC_EVENT_MEMORY 8 - -#define KFD_IOC_WAIT_RESULT_COMPLETE 0 -#define KFD_IOC_WAIT_RESULT_TIMEOUT 1 -#define KFD_IOC_WAIT_RESULT_FAIL 2 - -#define KFD_SIGNAL_EVENT_LIMIT 256 +#define KFD_IOC_EVENT_SIGNAL 0 +#define KFD_IOC_EVENT_NODECHANGE 1 +#define KFD_IOC_EVENT_DEVICESTATECHANGE 2 +#define KFD_IOC_EVENT_HW_EXCEPTION 3 +#define KFD_IOC_EVENT_SYSTEM_EVENT 4 +#define KFD_IOC_EVENT_DEBUG_EVENT 5 +#define KFD_IOC_EVENT_PROFILE_EVENT 6 +#define KFD_IOC_EVENT_QUEUE_EVENT 7 +#define KFD_IOC_EVENT_MEMORY 8 + +#define KFD_IOC_WAIT_RESULT_COMPLETE 0 +#define KFD_IOC_WAIT_RESULT_TIMEOUT 1 +#define 
KFD_IOC_WAIT_RESULT_FAIL 2 + +#define KFD_SIGNAL_EVENT_LIMIT 4096 struct kfd_ioctl_create_event_args { - __u64 event_page_offset; /* from KFD */ - __u32 event_trigger_data; /* from KFD - signal events only */ - __u32 event_type; /* to KFD */ - __u32 auto_reset; /* to KFD */ - __u32 node_id; /* to KFD - only valid for certain + uint64_t event_page_offset; /* from KFD */ + uint32_t event_trigger_data; /* from KFD - signal events only */ + uint32_t event_type; /* to KFD */ + uint32_t auto_reset; /* to KFD */ + uint32_t node_id; /* to KFD - only valid for certain event types */ - __u32 event_id; /* from KFD */ - __u32 event_slot_index; /* from KFD */ + uint32_t event_id; /* from KFD */ + uint32_t event_slot_index; /* from KFD */ }; struct kfd_ioctl_destroy_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_set_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_reset_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_memory_exception_failure { - __u32 NotPresent; /* Page not present or supervisor privilege */ - __u32 ReadOnly; /* Write access to a read-only page */ - __u32 NoExecute; /* Execute access to a page marked NX */ - __u32 pad; + uint32_t NotPresent; /* Page not present or supervisor privilege */ + uint32_t ReadOnly; /* Write access to a read-only page */ + uint32_t NoExecute; /* Execute access to a page marked NX */ + uint32_t imprecise; /* Can't determine the exact fault address */ }; -/* memory exception data*/ +/* memory exception data */ struct kfd_hsa_memory_exception_data { struct kfd_memory_exception_failure failure; - __u64 va; - __u32 gpu_id; - __u32 pad; + uint64_t va; + uint32_t gpu_id; + uint32_t pad; }; -/* Event data*/ +/* Event data */ struct kfd_event_data { union { struct kfd_hsa_memory_exception_data memory_exception_data; }; /* From KFD */ - __u64 kfd_event_data_ext; /* pointer to an extension structure - for future exception types */ - __u32 event_id; /* to KFD */ - __u32 pad; + uint64_t kfd_event_data_ext; /* pointer to an extension structure + for future exception types */ + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_wait_events_args { - __u64 events_ptr; /* pointed to struct + uint64_t events_ptr; /* pointed to struct kfd_event_data array, to KFD */ - __u32 num_events; /* to KFD */ - __u32 wait_for_all; /* to KFD */ - __u32 timeout; /* to KFD */ - __u32 wait_result; /* from KFD */ + uint32_t num_events; /* to KFD */ + uint32_t wait_for_all; /* to KFD */ + uint32_t timeout; /* to KFD */ + uint32_t wait_result; /* from KFD */ +}; + +struct kfd_ioctl_alloc_memory_of_scratch_args { + uint64_t va_addr; /* to KFD */ + uint64_t size; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t pad; +}; + +/* Allocation flags: memory types */ +#define KFD_IOC_ALLOC_MEM_FLAGS_VRAM (1 << 0) +#define KFD_IOC_ALLOC_MEM_FLAGS_GTT (1 << 1) +#define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR (1 << 2) +#define KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL (1 << 3) +/* Allocation flags: attributes/access options */ +#define KFD_IOC_ALLOC_MEM_FLAGS_NONPAGED (1 << 31) +#define KFD_IOC_ALLOC_MEM_FLAGS_READONLY (1 << 30) +#define KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC (1 << 29) +#define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) +#define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) +#define KFD_IOC_ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26) +#define 
KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 25) + +struct kfd_ioctl_alloc_memory_of_gpu_args { + uint64_t va_addr; /* to KFD */ + uint64_t size; /* to KFD */ + uint64_t handle; /* from KFD */ + uint64_t mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ + uint32_t gpu_id; /* to KFD */ + uint32_t flags; +}; + +struct kfd_ioctl_free_memory_of_gpu_args { + uint64_t handle; /* to KFD */ +}; + +struct kfd_ioctl_map_memory_to_gpu_args { + uint64_t handle; /* to KFD */ + uint64_t device_ids_array_ptr; /* to KFD */ + uint32_t device_ids_array_size; /* to KFD */ + uint32_t pad; +}; + +struct kfd_ioctl_unmap_memory_from_gpu_args { + uint64_t handle; /* to KFD */ + uint64_t device_ids_array_ptr; /* to KFD */ + uint32_t device_ids_array_size; /* to KFD */ + uint32_t pad; +}; + +struct kfd_ioctl_set_process_dgpu_aperture_args { + uint64_t dgpu_base; + uint64_t dgpu_limit; + uint32_t gpu_id; + uint32_t pad; +}; + +struct kfd_ioctl_get_dmabuf_info_args { + uint64_t size; /* from KFD */ + uint64_t metadata_ptr; /* to KFD */ + uint32_t metadata_size; /* to KFD (space allocated by user) + * from KFD (actual metadata size) */ + uint32_t gpu_id; /* from KFD */ + uint32_t flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */ + uint32_t dmabuf_fd; /* to KFD */ +}; + +struct kfd_ioctl_import_dmabuf_args { + uint64_t va_addr; /* to KFD */ + uint64_t handle; /* from KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t dmabuf_fd; /* to KFD */ +}; + +struct kfd_ioctl_ipc_export_handle_args { + uint64_t handle; /* to KFD */ + uint32_t share_handle[4]; /* from KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t pad; +}; + +struct kfd_ioctl_ipc_import_handle_args { + uint64_t handle; /* from KFD */ + uint64_t va_addr; /* to KFD */ + uint64_t mmap_offset; /* from KFD */ + uint32_t share_handle[4]; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t pad; +}; + +struct kfd_ioctl_get_tile_config_args { + /* to KFD: pointer to tile array */ + uint64_t tile_config_ptr; + /* to KFD: pointer to macro tile array */ + uint64_t macro_tile_config_ptr; + /* to KFD: array size allocated by user mode + * from KFD: array size filled by kernel + */ + uint32_t num_tile_configs; + /* to KFD: array size allocated by user mode + * from KFD: array size filled by kernel + */ + uint32_t num_macro_tile_configs; + + uint32_t gpu_id; /* to KFD */ + uint32_t gb_addr_config; /* from KFD */ + uint32_t num_banks; /* from KFD */ + uint32_t num_ranks; /* from KFD */ + /* struct size can be extended later if needed + * without breaking ABI compatibility + */ +}; + +struct kfd_memory_range { + uint64_t va_addr; + uint64_t size; +}; + +/* flags definitions + * BIT0: 0: read operation, 1: write operation. 
+ * This also identifies if the src or dst array belongs to remote process + */ +#define KFD_CROSS_MEMORY_RW_BIT (1 << 0) +#define KFD_SET_CROSS_MEMORY_READ(flags) (flags &= ~KFD_CROSS_MEMORY_RW_BIT) +#define KFD_SET_CROSS_MEMORY_WRITE(flags) (flags |= KFD_CROSS_MEMORY_RW_BIT) +#define KFD_IS_CROSS_MEMORY_WRITE(flags) (flags & KFD_CROSS_MEMORY_RW_BIT) + +struct kfd_ioctl_cross_memory_copy_args { + /* to KFD: Process ID of the remote process */ + uint32_t pid; + /* to KFD: See above definition */ + uint32_t flags; + /* to KFD: Source GPU VM range */ + uint64_t src_mem_range_array; + /* to KFD: Size of above array */ + uint64_t src_mem_array_size; + /* to KFD: Destination GPU VM range */ + uint64_t dst_mem_range_array; + /* to KFD: Size of above array */ + uint64_t dst_mem_array_size; + /* from KFD: Total amount of bytes copied */ + uint64_t bytes_copied; }; @@ -287,7 +468,56 @@ struct kfd_ioctl_wait_events_args { #define AMDKFD_IOC_DBG_WAVE_CONTROL \ AMDKFD_IOW(0x10, struct kfd_ioctl_dbg_wave_control_args) +#define AMDKFD_IOC_ALLOC_MEMORY_OF_GPU \ + AMDKFD_IOWR(0x11, struct kfd_ioctl_alloc_memory_of_gpu_args) + +#define AMDKFD_IOC_FREE_MEMORY_OF_GPU \ + AMDKFD_IOWR(0x12, struct kfd_ioctl_free_memory_of_gpu_args) + +#define AMDKFD_IOC_MAP_MEMORY_TO_GPU \ + AMDKFD_IOWR(0x13, struct kfd_ioctl_map_memory_to_gpu_args) + +#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU \ + AMDKFD_IOWR(0x14, struct kfd_ioctl_unmap_memory_from_gpu_args) + +#define AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH \ + AMDKFD_IOWR(0x15, struct kfd_ioctl_alloc_memory_of_scratch_args) + +#define AMDKFD_IOC_SET_CU_MASK \ + AMDKFD_IOW(0x16, struct kfd_ioctl_set_cu_mask_args) + +#define AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE \ + AMDKFD_IOW(0x17, \ + struct kfd_ioctl_set_process_dgpu_aperture_args) + +#define AMDKFD_IOC_SET_TRAP_HANDLER \ + AMDKFD_IOW(0x18, struct kfd_ioctl_set_trap_handler_args) + +#define AMDKFD_IOC_GET_PROCESS_APERTURES_NEW \ + AMDKFD_IOWR(0x19, struct kfd_ioctl_get_process_apertures_new_args) + +#define AMDKFD_IOC_GET_DMABUF_INFO \ + AMDKFD_IOWR(0x1A, struct kfd_ioctl_get_dmabuf_info_args) + +#define AMDKFD_IOC_IMPORT_DMABUF \ + AMDKFD_IOWR(0x1B, struct kfd_ioctl_import_dmabuf_args) + +#define AMDKFD_IOC_GET_TILE_CONFIG \ + AMDKFD_IOWR(0x1C, struct kfd_ioctl_get_tile_config_args) + +#define AMDKFD_IOC_IPC_IMPORT_HANDLE \ + AMDKFD_IOWR(0x1D, struct kfd_ioctl_ipc_import_handle_args) + +#define AMDKFD_IOC_IPC_EXPORT_HANDLE \ + AMDKFD_IOWR(0x1E, struct kfd_ioctl_ipc_export_handle_args) + +#define AMDKFD_IOC_CROSS_MEMORY_COPY \ + AMDKFD_IOWR(0x1F, struct kfd_ioctl_cross_memory_copy_args) + +#define AMDKFD_IOC_GET_QUEUE_WAVE_STATE \ + AMDKFD_IOWR(0x20, struct kfd_ioctl_get_queue_wave_state_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x11 +#define AMDKFD_COMMAND_END 0x21 #endif diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h old mode 100644 new mode 100755 index 87c2c84..1256851 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -624,7 +624,9 @@ #define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */ #define PCI_EXP_DEVCAP2_ARI 0x00000020 /* Alternative Routing-ID */ #define PCI_EXP_DEVCAP2_ATOMIC_ROUTE 0x00000040 /* Atomic Op routing */ +#define PCI_EXP_DEVCAP2_ATOMIC_COMP32 0x00000080 /* 32b AtomicOp completion */ #define PCI_EXP_DEVCAP2_ATOMIC_COMP64 0x00000100 /* Atomic 64-bit compare */ +#define PCI_EXP_DEVCAP2_ATOMIC_COMP128 0x00000200 /* 128b AtomicOp completion*/ #define PCI_EXP_DEVCAP2_LTR 0x00000800 /* Latency tolerance reporting */ #define 
PCI_EXP_DEVCAP2_OBFF_MASK 0x000c0000 /* OBFF support mechanism */ #define PCI_EXP_DEVCAP2_OBFF_MSG 0x00040000 /* New message signaling */ @@ -634,6 +636,7 @@ #define PCI_EXP_DEVCTL2_ARI 0x0020 /* Alternative Routing-ID */ #define PCI_EXP_DEVCTL2_ATOMIC_REQ 0x0040 /* Set Atomic requests */ #define PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK 0x0080 /* Block atomic egress */ +#define PCI_EXP_DEVCTL2_ATOMIC_BLOCK 0x0080 /* Block AtomicOp on egress */ #define PCI_EXP_DEVCTL2_IDO_REQ_EN 0x0100 /* Allow IDO for requests */ #define PCI_EXP_DEVCTL2_IDO_CMP_EN 0x0200 /* Allow IDO for completions */ #define PCI_EXP_DEVCTL2_LTR_EN 0x0400 /* Enable LTR mechanism */ diff --git a/kernel/fork.c b/kernel/fork.c index a19ee25..70d8d5b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1082,6 +1082,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) return mm; } +EXPORT_SYMBOL_GPL(mm_access); static void complete_vfork_done(struct task_struct *tsk) { -- 2.7.4
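The reserved_doorbell_mask/reserved_doorbell_val pair added to struct kgd2kfd_shared_resources encodes the SOC15 doorbell reservation as a single mask/value test: any doorbell index D with (D & reserved_doorbell_mask) == reserved_doorbell_val is routed to SDMA and must not be handed to the CP. The stand-alone C sketch below only illustrates that rule; the mask 0x1f8 and value 0x0f0 are the example values from the header comment (reserving 0x0f0-0x0f7 and 0x2f0-0x2f7), not values read from hardware.

/* Illustrative check of the KFD doorbell reservation rule documented in
 * kgd2kfd_shared_resources. Builds as ordinary user-space C.
 */
#include <stdbool.h>
#include <stdio.h>

static bool doorbell_is_reserved(unsigned int d,
				 unsigned int reserved_doorbell_mask,
				 unsigned int reserved_doorbell_val)
{
	/* The rule from the header comment: (D & mask) == val */
	return (d & reserved_doorbell_mask) == reserved_doorbell_val;
}

int main(void)
{
	const unsigned int mask = 0x1f8;	/* example values from the comment */
	const unsigned int val  = 0x0f0;
	unsigned int d;

	/* KFD uses 1024 (0x400) doorbells per process */
	for (d = 0; d < 0x400; d++)
		if (doorbell_is_reserved(d, mask, val))
			printf("doorbell 0x%03x is reserved for SDMA\n", d);
	return 0;
}

Running it prints exactly the ranges named in the comment, 0x0f0-0x0f7 and 0x2f0-0x2f7, which is a quick way to sanity-check a chosen mask/val pair.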
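The new pci_enable_atomic_ops_to_root() helper walks every bridge between the endpoint and the root port, requires AtomicOp routing on each switch port, checks that no upstream port blocks AtomicOps on egress and that the root port advertises completer support, and only then sets PCI_EXP_DEVCTL2_ATOMIC_REQ on the device. A hedged sketch of a caller follows; the probe function and the policy of merely logging a failure are illustrative assumptions, not taken from amdgpu.

/* Fragment of a hypothetical PCI driver probe using the helper added by
 * this patch. With this patch the helper takes only the pci_dev.
 */
#include <linux/pci.h>

static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int ret;

	/* Returns 0 only if the whole path to the root port supports
	 * AtomicOp requests; the requester enable bit is then already set. */
	ret = pci_enable_atomic_ops_to_root(pdev);
	if (ret < 0)
		dev_info(&pdev->dev,
			 "PCIe AtomicOps not available on this path (%d)\n", ret);

	/* ... remainder of device initialization ... */
	return 0;
}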
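include/drm/amd_rdma.h gives peer kernel drivers a way to pin a GPU virtual range and obtain its DMA addresses for P2P transfers. The fragment below is a sketch under the assumption of a third-party module: example_free_cb, example_pin_gpu_range and the error handling are hypothetical, and only the amd_rdma_interface calls come from the header.

/* Hypothetical consumer of the amd_rdma interface exported by amdkfd. */
#include <drm/amd_rdma.h>
#include <linux/errno.h>

static void example_free_cb(void *client_priv)
{
	/* amdkfd invokes this when the GPU mapping is about to go away;
	 * the client must stop using the sg_table it was given. */
}

static int example_pin_gpu_range(uint64_t va, uint64_t size, struct pid *pid)
{
	const struct amd_rdma_interface *rdma;
	struct amd_p2p_info *p2p = NULL;
	int ret;

	ret = amdkfd_query_rdma_interface(&rdma);
	if (ret)
		return ret;

	/* is_gpu_address() is assumed to return non-zero for GPU VAs */
	if (!rdma->is_gpu_address(va, pid))
		return -EINVAL;

	ret = rdma->get_pages(va, size, pid, &p2p, example_free_cb, NULL);
	if (ret)
		return ret;

	/* p2p->pages is an sg_table of DMA/bus addresses usable for P2P DMA */

	return rdma->put_pages(&p2p);
}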
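The kfd_ioctl.h additions define the user-visible memory-management path: allocate a BO, map it to one or more GPUs, unmap it, and free it. The user-space sketch below only strings those four ioctls together to show the argument structures; it is not a working client. Real applications go through the ROCm Thunk, the gpu_id and va_addr values are placeholders, and treating device_ids_array_size as a byte count is an assumption.

/* Illustrative sequence of the new KFD memory-management ioctls. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kfd_ioctl.h>

int main(void)
{
	int kfd = open("/dev/kfd", O_RDWR | O_CLOEXEC);
	if (kfd < 0) {
		perror("open /dev/kfd");
		return 1;
	}

	struct kfd_ioctl_alloc_memory_of_gpu_args alloc = {
		.va_addr = 0x200000000ULL,	/* placeholder GPUVM address */
		.size    = 2 * 1024 * 1024,
		.gpu_id  = 0x1002,		/* placeholder, normally read from topology */
		.flags   = KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
			   KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE,
	};

	if (ioctl(kfd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, &alloc) == 0) {
		uint32_t gpu_id = alloc.gpu_id;
		struct kfd_ioctl_map_memory_to_gpu_args map = {
			.handle = alloc.handle,
			.device_ids_array_ptr = (uint64_t)(uintptr_t)&gpu_id,
			.device_ids_array_size = sizeof(gpu_id), /* assumed: bytes */
		};
		struct kfd_ioctl_unmap_memory_from_gpu_args unmap = {
			.handle = alloc.handle,
			.device_ids_array_ptr = (uint64_t)(uintptr_t)&gpu_id,
			.device_ids_array_size = sizeof(gpu_id), /* assumed: bytes */
		};
		struct kfd_ioctl_free_memory_of_gpu_args free_args = {
			.handle = alloc.handle,
		};

		ioctl(kfd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &map);
		ioctl(kfd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &unmap);
		ioctl(kfd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &free_args);
	}

	close(kfd);
	return 0;
}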