diff --git a/common/recipes-kernel/linux/linux-yocto-4.14.71/1623-drm-amdgpu-Add-Vega10-support-for-KFD.patch b/common/recipes-kernel/linux/linux-yocto-4.14.71/1623-drm-amdgpu-Add-Vega10-support-for-KFD.patch
new file mode 100644
index 00000000..3f836132
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.14.71/1623-drm-amdgpu-Add-Vega10-support-for-KFD.patch
@@ -0,0 +1,1544 @@
+From 9256a2b22c170e1dcd239a31628abbbf846f21f2 Mon Sep 17 00:00:00 2001
+From: Felix Kuehling <Felix.Kuehling@amd.com>
+Date: Tue, 14 Mar 2017 23:36:55 -0400
+Subject: [PATCH 1623/4131] drm/amdgpu: Add Vega10 support for KFD
+
+Change-Id: I37da6493d070c7e490e18a478ffeb42c1b158a82
+Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/Makefile | 1 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 48 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 22 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 4 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 4 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 1124 +++++++++++++++++++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 47 +-
+ drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 13 +
+ drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 16 +
+ 9 files changed, 1232 insertions(+), 47 deletions(-)
+ create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
+index 7ff4af3..81423ee 100755
+--- a/drivers/gpu/drm/amd/amdgpu/Makefile
++++ b/drivers/gpu/drm/amd/amdgpu/Makefile
+@@ -110,6 +110,7 @@ amdgpu-y += \
+ amdgpu_amdkfd.o \
+ amdgpu_amdkfd_gfx_v7.o \
+ amdgpu_amdkfd_gfx_v8.o \
++ amdgpu_amdkfd_gfx_v9.o \
+ amdgpu_amdkfd_gpuvm.o
+
+ # add cgs
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+index a30b616..3a1776b 100755
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+@@ -28,7 +28,6 @@
+ #include "amdgpu.h"
+ #include "amdgpu_gfx.h"
+ #include <linux/module.h>
+-#include <linux/mmu_context.h>
+
+ #define AMDKFD_SKIP_UNCOMPILED_CODE 1
+
+@@ -82,6 +81,9 @@ bool amdgpu_amdkfd_load_interface(struct amdgpu_device *adev)
+ case CHIP_POLARIS11:
+ kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions();
+ break;
++ case CHIP_VEGA10:
++ kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions();
++ break;
+ default:
+ return false;
+ }
+@@ -142,6 +144,28 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
+ &gpu_resources.doorbell_physical_address,
+ &gpu_resources.doorbell_aperture_size,
+ &gpu_resources.doorbell_start_offset);
++ if (adev->asic_type >= CHIP_VEGA10) {
++ /* On SOC15 the BIF is involved in routing
++ * doorbells using the low 12 bits of the
++ * address. Communicate the assignments to
++ * KFD. KFD uses two doorbell pages per
++ * process in case of 64-bit doorbells so we
++ * can use each doorbell assignment twice.
++ */
++ gpu_resources.sdma_doorbell[0][0] =
++ AMDGPU_DOORBELL64_sDMA_ENGINE0;
++ gpu_resources.sdma_doorbell[0][1] =
++ AMDGPU_DOORBELL64_sDMA_ENGINE0 + 0x200;
++ gpu_resources.sdma_doorbell[1][0] =
++ AMDGPU_DOORBELL64_sDMA_ENGINE1;
++ gpu_resources.sdma_doorbell[1][1] =
++ AMDGPU_DOORBELL64_sDMA_ENGINE1 + 0x200;
++ /* Doorbells 0x0f0-0ff and 0x2f0-2ff are reserved for
++ * SDMA, IH and VCN. So don't use them for the CP.
++ */
++ gpu_resources.reserved_doorbell_mask = 0x1f0;
++ gpu_resources.reserved_doorbell_val = 0x0f0;
++ }
+
+ kgd2kfd->device_init(adev->kfd, &gpu_resources);
+ }
+@@ -610,28 +634,6 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
+ return r;
+ }
+
+-bool read_user_wptr(struct mm_struct *mm, uint32_t __user *wptr,
+- uint32_t *wptr_val)
+-{
+- bool wptr_valid = false;
+-
+- if (mm && wptr) {
+- if (mm == current->mm) {
+- /* Running in the correct user process context */
+- wptr_valid = !get_user(*wptr_val, wptr);
+- } else if (current->mm == NULL) {
+- /* A kernel thread can temporarily use a user
+- * process context for AIO
+- */
+- use_mm(mm);
+- wptr_valid = !get_user(*wptr_val, wptr);
+- unuse_mm(mm);
+- }
+- }
+-
+- return wptr_valid;
+-}
+-
+ bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev,
+ u32 vmid)
+ {
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+index 262a4fe..23e2f12 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+@@ -28,6 +28,7 @@
+ #include <linux/types.h>
+ #include <linux/mm.h>
+ #include <linux/workqueue.h>
++#include <linux/mmu_context.h>
+ #include <kgd_kfd_interface.h>
+ #include "amdgpu.h"
+
+@@ -59,10 +60,10 @@ struct kgd_mem {
+ struct delayed_work work; /* for restore evicted mem */
+ struct mm_struct *mm; /* for restore */
+
+- uint32_t pte_flags;
+-
++ uint64_t pte_flags;
+
+ /* flags bitfield */
++ bool coherent : 1;
+ bool no_substitute : 1;
+ bool aql_queue : 1;
+ bool busy : 1;
+@@ -141,6 +142,7 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
+ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info);
+ struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void);
+ struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void);
++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void);
+ int amdgpu_amdkfd_copy_mem_to_mem(struct kgd_dev *kgd, struct kgd_mem *src_mem,
+ uint64_t src_offset, struct kgd_mem *dst_mem,
+ uint64_t dest_offset, uint64_t size, struct fence **f,
+@@ -168,8 +170,20 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
+ size_t buffer_size, uint32_t *metadata_size,
+ uint32_t *flags);
+
+-bool read_user_wptr(struct mm_struct *mm, uint32_t __user *wptr,
+- uint32_t *wptr_val);
++#define read_user_wptr(mmptr, wptr, dst) \
++ ({ \
++ bool valid = false; \
++ if ((mmptr) && (wptr)) { \
++ if ((mmptr) == current->mm) { \
++ valid = !get_user((dst), (wptr)); \
++ } else if (current->mm == NULL) { \
++ use_mm(mmptr); \
++ valid = !get_user((dst), (wptr)); \
++ unuse_mm(mmptr); \
++ } \
++ } \
++ valid; \
++ })
+
+ /* GPUVM API */
+ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
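
For readers wondering why read_user_wptr() turned into a macro here: the removed function could only fill a uint32_t, while the GFX9 SDMA path later in this patch reads a 64-bit write pointer (it calls read_user_wptr(mm, wptr64, data64)). As a statement-expression macro, the destination keeps the caller's type and get_user() derives the access size from it, which is also why the existing gfx_v7/v8 call sites drop the address-of in this patch; the use_mm()/unuse_mm() branch still covers kernel threads temporarily borrowing a user process context (e.g. for AIO). The fragment below is only a userspace analog of that shape: it keeps the GNU ({ ... }) statement-expression pattern and the "access size follows the destination" behaviour, but deliberately replaces the kernel-only get_user()/use_mm() calls with a plain dereference.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative analog of read_user_wptr(): evaluates to true and fills
 * `dst` only when the source pointer is non-NULL. Because `dst` is an
 * lvalue of the caller's type, the same macro works for 32- and 64-bit
 * destinations. Uses the GNU statement-expression extension, as the
 * kernel macro does.
 */
#define read_wptr(src, dst)                 \
    ({                                      \
        bool valid = false;                 \
        if ((src) != NULL) {                \
            (dst) = *(src);                 \
            valid = true;                   \
        }                                   \
        valid;                              \
    })

int main(void)
{
    uint32_t w32_src = 0x1234, w32 = 0;
    uint64_t w64_src = 0x100000002ULL, w64 = 0;

    if (read_wptr(&w32_src, w32))
        printf("32-bit wptr: 0x%x\n", w32);
    if (read_wptr(&w64_src, w64))
        printf("64-bit wptr: 0x%llx\n", (unsigned long long)w64);
    return 0;
}
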
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+index ae860ec..ac9d6ec 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+@@ -398,7 +398,7 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+ /* Copy userspace write pointer value to register.
+ * Doorbell logic is active and will monitor subsequent changes.
+ */
+- if (read_user_wptr(mm, wptr, &wptr_val))
++ if (read_user_wptr(mm, wptr, wptr_val))
+ WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask);
+
+ /* Write CP_HQD_ACTIVE last. */
+@@ -485,7 +485,7 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, m->sdma_rlc_doorbell);
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdma_rlc_rb_rptr);
+
+- if (read_user_wptr(mm, wptr, &data))
++ if (read_user_wptr(mm, wptr, data))
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data);
+ else
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+index fbea6a6..2c6a199 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+@@ -403,7 +403,7 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+ /* Copy userspace write pointer value to register.
+ * Doorbell logic is active and will monitor subsequent changes.
+ */
+- if (read_user_wptr(mm, wptr, &wptr_val))
++ if (read_user_wptr(mm, wptr, wptr_val))
+ WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask);
+
+ /* Write CP_HQD_ACTIVE last. */
+@@ -489,7 +489,7 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, m->sdmax_rlcx_doorbell);
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr);
+
+- if (read_user_wptr(mm, wptr, &data))
++ if (read_user_wptr(mm, wptr, data))
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data);
+ else
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+new file mode 100644
+index 0000000..e9b2db9
+--- /dev/null
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+@@ -0,0 +1,1124 @@
++/*
++ * Copyright 2014 Advanced Micro Devices, Inc.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
++ * OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#define pr_fmt(fmt) "kfd2kgd: " fmt
++
++#include <linux/module.h>
++#include <linux/fdtable.h>
++#include <linux/uaccess.h>
++#include <linux/firmware.h>
++#include <drm/drmP.h>
++#include "amdgpu.h"
++#include "amdgpu_amdkfd.h"
++#include "amdgpu_ucode.h"
++#include "amdgpu_amdkfd_gfx_v8.h"
++#include "vega10/soc15ip.h"
++#include "vega10/GC/gc_9_0_offset.h"
++#include "vega10/GC/gc_9_0_sh_mask.h"
++#include "vega10/vega10_enum.h"
++#include "vega10/SDMA0/sdma0_4_0_offset.h"
++#include "vega10/SDMA0/sdma0_4_0_sh_mask.h"
++#include "vega10/SDMA1/sdma1_4_0_offset.h"
++#include "vega10/SDMA1/sdma1_4_0_sh_mask.h"
++#include "vega10/ATHUB/athub_1_0_offset.h"
++#include "vega10/ATHUB/athub_1_0_sh_mask.h"
++#include "vega10/OSSSYS/osssys_4_0_offset.h"
++#include "vega10/OSSSYS/osssys_4_0_sh_mask.h"
++#include "soc15_common.h"
++#include "v9_structs.h"
++#include "soc15.h"
++
++/* HACK: MMHUB and GC both have VM-related register with the same
++ * names but different offsets. Define the MMHUB register we need here
++ * with a prefix. A proper solution would be to move the functions
++ * programming these registers into gfx_v9_0.c and mmhub_v1_0.c
++ * respectively.
++ */
++#define mmMMHUB_VM_INVALIDATE_ENG16_REQ 0x06f3
++#define mmMMHUB_VM_INVALIDATE_ENG16_REQ_BASE_IDX 0
++
++#define mmMMHUB_VM_INVALIDATE_ENG16_ACK 0x0705
++#define mmMMHUB_VM_INVALIDATE_ENG16_ACK_BASE_IDX 0
++
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32 0x072b
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32_BASE_IDX 0
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32 0x072c
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32_BASE_IDX 0
++
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32 0x074b
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32_BASE_IDX 0
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32 0x074c
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32_BASE_IDX 0
++
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32 0x076b
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32_BASE_IDX 0
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32 0x076c
++#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32_BASE_IDX 0
++
++#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32 0x0727
++#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32_BASE_IDX 0
++#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32 0x0728
++#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32_BASE_IDX 0
++
++#define V9_PIPE_PER_MEC (4)
++#define V9_QUEUES_PER_PIPE_MEC (8)
++
++enum hqd_dequeue_request_type {
++ NO_ACTION = 0,
++ DRAIN_PIPE,
++ RESET_WAVES,
++ SAVE_WAVES
++};
++
++static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = {
++ mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL,
++ mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL,
++ mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL,
++ mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL
++};
++
++
++static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size,
++ void *vm, struct kgd_mem **mem);
++static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem);
++
++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
++ int fd, uint32_t handle, struct kgd_mem **mem);
++
++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
++
++/*
++ * Register access functions
++ */
++
++static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t sh_mem_config,
++ uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit,
++ uint32_t sh_mem_bases);
++static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
++ unsigned int vmid);
++static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
++ uint32_t hpd_size, uint64_t hpd_gpu_addr);
++static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
++static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
++ uint32_t queue_id, uint32_t __user *wptr,
++ uint32_t wptr_shift, uint32_t wptr_mask,
++ struct mm_struct *mm);
++static int kgd_hqd_dump(struct kgd_dev *kgd,
++ uint32_t pipe_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs);
++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
++ uint32_t __user *wptr, struct mm_struct *mm);
++static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
++ uint32_t engine_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs);
++static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
++ uint32_t pipe_id, uint32_t queue_id);
++static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
++static int kgd_hqd_destroy(struct kgd_dev *kgd,
++ enum kfd_preempt_type reset_type,
++ unsigned int utimeout, uint32_t pipe_id,
++ uint32_t queue_id);
++static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
++ unsigned int utimeout);
++static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid);
++static int kgd_address_watch_disable(struct kgd_dev *kgd);
++static int kgd_address_watch_execute(struct kgd_dev *kgd,
++ unsigned int watch_point_id,
++ uint32_t cntl_val,
++ uint32_t addr_hi,
++ uint32_t addr_lo);
++static int kgd_wave_control_execute(struct kgd_dev *kgd,
++ uint32_t gfx_index_val,
++ uint32_t sq_cmd);
++static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
++ unsigned int watch_point_id,
++ unsigned int reg_offset);
++
++static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd,
++ uint8_t vmid);
++static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
++ uint8_t vmid);
++static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid);
++static void set_num_of_requests(struct kgd_dev *kgd,
++ uint8_t num_of_requests);
++static int alloc_memory_of_scratch(struct kgd_dev *kgd,
++ uint64_t va, uint32_t vmid);
++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable,
++ uint8_t element_size, uint8_t index_stride, uint8_t mtype);
++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t page_table_base);
++
++/* Because of REG_GET_FIELD() being used, we put this function in the
++ * asic specific file.
++ */
++static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
++ struct tile_config *config)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
++
++ config->gb_addr_config = adev->gfx.config.gb_addr_config;
++#if 0
++/* TODO - confirm REG_GET_FIELD x2, should be OK as is... but
++ * MC_ARB_RAMCFG register doesn't exist on Vega10 - initial amdgpu
++ * changes commented out related code, doing the same here for now but
++ * need to sync with Ken et al
++ */
++ config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
++ MC_ARB_RAMCFG, NOOFBANK);
++ config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
++ MC_ARB_RAMCFG, NOOFRANKS);
++#endif
++
++ config->tile_config_ptr = adev->gfx.config.tile_mode_array;
++ config->num_tile_configs =
++ ARRAY_SIZE(adev->gfx.config.tile_mode_array);
++ config->macro_tile_config_ptr =
++ adev->gfx.config.macrotile_mode_array;
++ config->num_macro_tile_configs =
++ ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);
++
++ return 0;
++}
++
++static const struct kfd2kgd_calls kfd2kgd = {
++ .init_gtt_mem_allocation = alloc_gtt_mem,
++ .free_gtt_mem = free_gtt_mem,
++ .get_local_mem_info = get_local_mem_info,
++ .get_gpu_clock_counter = get_gpu_clock_counter,
++ .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
++ .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm,
++ .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm,
++ .create_process_gpumem = create_process_gpumem,
++ .destroy_process_gpumem = destroy_process_gpumem,
++ .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir,
++ .open_graphic_handle = open_graphic_handle,
++ .program_sh_mem_settings = kgd_program_sh_mem_settings,
++ .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
++ .init_pipeline = kgd_init_pipeline,
++ .init_interrupts = kgd_init_interrupts,
++ .hqd_load = kgd_hqd_load,
++ .hqd_sdma_load = kgd_hqd_sdma_load,
++ .hqd_dump = kgd_hqd_dump,
++ .hqd_sdma_dump = kgd_hqd_sdma_dump,
++ .hqd_is_occupied = kgd_hqd_is_occupied,
++ .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
++ .hqd_destroy = kgd_hqd_destroy,
++ .hqd_sdma_destroy = kgd_hqd_sdma_destroy,
++ .address_watch_disable = kgd_address_watch_disable,
++ .address_watch_execute = kgd_address_watch_execute,
++ .wave_control_execute = kgd_wave_control_execute,
++ .address_watch_get_offset = kgd_address_watch_get_offset,
++ .get_atc_vmid_pasid_mapping_pasid =
++ get_atc_vmid_pasid_mapping_pasid,
++ .get_atc_vmid_pasid_mapping_valid =
++ get_atc_vmid_pasid_mapping_valid,
++ .write_vmid_invalidate_request = write_vmid_invalidate_request,
++ .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu,
++ .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu,
++ .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu,
++ .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu,
++ .get_fw_version = get_fw_version,
++ .set_num_of_requests = set_num_of_requests,
++ .get_cu_info = get_cu_info,
++ .alloc_memory_of_scratch = alloc_memory_of_scratch,
++ .write_config_static_mem = write_config_static_mem,
++ .mmap_bo = amdgpu_amdkfd_gpuvm_mmap_bo,
++ .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel,
++ .set_vm_context_page_table_base = set_vm_context_page_table_base,
++ .pin_get_sg_table_bo = amdgpu_amdkfd_gpuvm_pin_get_sg_table,
++ .unpin_put_sg_table_bo = amdgpu_amdkfd_gpuvm_unpin_put_sg_table,
++ .get_dmabuf_info = amdgpu_amdkfd_get_dmabuf_info,
++ .import_dmabuf = amdgpu_amdkfd_gpuvm_import_dmabuf,
++ .export_dmabuf = amdgpu_amdkfd_gpuvm_export_dmabuf,
++ .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info,
++ .submit_ib = amdgpu_amdkfd_submit_ib,
++ .get_tile_config = amdgpu_amdkfd_get_tile_config,
++ .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos,
++ .copy_mem_to_mem = amdgpu_amdkfd_copy_mem_to_mem
++};
++
++struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions()
++{
++ return (struct kfd2kgd_calls *)&kfd2kgd;
++}
++
++static int create_process_gpumem(struct kgd_dev *kgd, uint64_t va, size_t size,
++ void *vm, struct kgd_mem **mem)
++{
++ return 0;
++}
++
++/* Destroys the GPU allocation and frees the kgd_mem structure */
++static void destroy_process_gpumem(struct kgd_dev *kgd, struct kgd_mem *mem)
++{
++
++}
++
++static int open_graphic_handle(struct kgd_dev *kgd, uint64_t va, void *vm,
++ int fd, uint32_t handle, struct kgd_mem **mem)
++{
++ return 0;
++}
++
++static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
++{
++ return (struct amdgpu_device *)kgd;
++}
++
++static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe,
++ uint32_t queue, uint32_t vmid)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++
++ mutex_lock(&adev->srbm_mutex);
++ soc15_grbm_select(adev, mec, pipe, queue, vmid);
++}
++
++static void unlock_srbm(struct kgd_dev *kgd)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++
++ soc15_grbm_select(adev, 0, 0, 0, 0);
++ mutex_unlock(&adev->srbm_mutex);
++}
++
++static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
++ uint32_t queue_id)
++{
++ uint32_t mec = (++pipe_id / V9_PIPE_PER_MEC) + 1;
++ uint32_t pipe = (pipe_id % V9_PIPE_PER_MEC);
++
++ lock_srbm(kgd, mec, pipe, queue_id, 0);
++}
++
++static uint32_t get_queue_mask(uint32_t pipe_id, uint32_t queue_id)
++{
++ /* assumes that pipe0 is used by graphics and that the correct
++ * MEC is selected by acquire_queue already
++ */
++ unsigned int bit = ((pipe_id+1) * V9_QUEUES_PER_PIPE_MEC +
++ queue_id) & 31;
++
++ return ((uint32_t)1) << bit;
++}
++
++static void release_queue(struct kgd_dev *kgd)
++{
++ unlock_srbm(kgd);
++}
++
++static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t sh_mem_config,
++ uint32_t sh_mem_ape1_base,
++ uint32_t sh_mem_ape1_limit,
++ uint32_t sh_mem_bases)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++
++ lock_srbm(kgd, 0, 0, 0, vmid);
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config);
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases);
++ /* APE1 no longer exists on GFX9 */
++
++ unlock_srbm(kgd);
++}
++
++static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
++ unsigned int vmid)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++
++ /*
++ * We have to assume that there is no outstanding mapping.
++ * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
++ * a mapping is in progress or because a mapping finished
++ * and the SW cleared it.
++ * So the protocol is to always wait & clear.
++ */
++ uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
++ ATC_VMID0_PASID_MAPPING__VALID_MASK;
++
++ /*
++ * need to do this twice, once for gfx and once for mmhub
++ * for ATC add 16 to VMID for mmhub, for IH different registers.
++ * ATC_VMID0..15 registers are separate from ATC_VMID16..31.
++ */
++
++ WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid,
++ pasid_mapping);
++
++ while (!(RREG32(SOC15_REG_OFFSET(
++ ATHUB, 0,
++ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
++ (1U << vmid)))
++ cpu_relax();
++
++ WREG32(SOC15_REG_OFFSET(ATHUB, 0,
++ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
++ 1U << vmid);
++
++ /* Mapping vmid to pasid also for IH block */
++ WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid,
++ pasid_mapping);
++
++ WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid,
++ pasid_mapping);
++
++ while (!(RREG32(SOC15_REG_OFFSET(
++ ATHUB, 0,
++ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
++ (1U << (vmid + 16))))
++ cpu_relax();
++
++ WREG32(SOC15_REG_OFFSET(ATHUB, 0,
++ mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
++ 1U << (vmid + 16));
++
++ /* Mapping vmid to pasid also for IH block */
++ WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid,
++ pasid_mapping);
++ return 0;
++}
++
++static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
++ uint32_t hpd_size, uint64_t hpd_gpu_addr)
++{
++ return 0;
++}
++
++/* TODO - RING0 form of field is obsolete, seems to date back to SI
++ * but still works
++ */
++
++static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t mec;
++ uint32_t pipe;
++
++ mec = (++pipe_id / V9_PIPE_PER_MEC) + 1;
++ pipe = (pipe_id % V9_PIPE_PER_MEC);
++
++ lock_srbm(kgd, mec, pipe, 0, 0);
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL),
++ CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK);
++
++ unlock_srbm(kgd);
++
++ return 0;
++}
++
++static uint32_t get_sdma_base_addr(unsigned int engine_id,
++ unsigned int queue_id)
++{
++ static const uint32_t base[2] = {
++ SOC15_REG_OFFSET(SDMA0, 0,
++ mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL,
++ SOC15_REG_OFFSET(SDMA1, 0,
++ mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL
++ };
++ uint32_t retval;
++
++ BUG_ON(engine_id > 1);
++
++ retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL -
++ mmSDMA0_RLC0_RB_CNTL);
++
++ pr_debug("sdma base address: 0x%x\n", retval);
++
++ return retval;
++}
++
++static inline struct v9_mqd *get_mqd(void *mqd)
++{
++ return (struct v9_mqd *)mqd;
++}
++
++static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
++{
++ return (struct v9_sdma_mqd *)mqd;
++}
++
++static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
++ uint32_t queue_id, uint32_t __user *wptr,
++ uint32_t wptr_shift, uint32_t wptr_mask,
++ struct mm_struct *mm)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ struct v9_mqd *m;
++ uint32_t *mqd_hqd;
++ uint32_t reg, hqd_base;
++
++ m = get_mqd(mqd);
++
++ acquire_queue(kgd, pipe_id, queue_id);
++
++ /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
++ mqd_hqd = &m->cp_mqd_base_addr_lo;
++ hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
++
++ for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_HQD_VMID);
++ reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
++ WREG32(reg, mqd_hqd[reg - hqd_base]);
++
++ if (wptr) {
++ /* Don't read wptr with get_user because the user
++ * context may not be accessible (if this function
++ * runs in a work queue). Instead trigger a one-shot
++ * polling read from memory in the CP. This assumes
++ * that wptr is GPU-accessible in the queue's VMID via
++ * ATC or SVM. WPTR==RPTR before starting the poll so
++ * the CP starts fetching new commands from the right
++ * place.
++ *
++ * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
++ * tricky. Assume that the queue didn't overflow. The
++ * number of valid bits in the 32-bit RPTR depends on
++ * the queue size. The remaining bits are taken from
++ * the saved 64-bit WPTR. If the WPTR wrapped, add the
++ * queue size.
++ */
++ uint32_t queue_size =
++ 2 << REG_GET_FIELD(m->cp_hqd_pq_control,
++ CP_HQD_PQ_CONTROL, QUEUE_SIZE);
++ uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);
++
++ if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
++ guessed_wptr += queue_size;
++ guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
++ guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO),
++ lower_32_bits(guessed_wptr));
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI),
++ upper_32_bits(guessed_wptr));
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR),
++ lower_32_bits((uint64_t)wptr));
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
++ upper_32_bits((uint64_t)wptr));
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1),
++ get_queue_mask(pipe_id, queue_id));
++ }
++
++ /* Start the EOP fetcher */
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR),
++ REG_SET_FIELD(m->cp_hqd_eop_rptr,
++ CP_HQD_EOP_RPTR, INIT_FETCHER, 1));
++
++ /* Write CP_HQD_ACTIVE last. */
++ for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
++ reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE); reg++)
++ WREG32(reg, mqd_hqd[reg - hqd_base]);
++
++ release_queue(kgd);
++
++ return 0;
++}
++
++static int kgd_hqd_dump(struct kgd_dev *kgd,
++ uint32_t pipe_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t i = 0, reg;
++#define HQD_N_REGS 56
++#define DUMP_REG(addr) do { \
++ if (WARN_ON_ONCE(i >= HQD_N_REGS)) \
++ break; \
++ (*dump)[i][0] = (addr) << 2; \
++ (*dump)[i++][1] = RREG32(addr); \
++ } while (0)
++
++ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
++ if (*dump == NULL)
++ return -ENOMEM;
++
++ acquire_queue(kgd, pipe_id, queue_id);
++
++ for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
++ reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
++ DUMP_REG(reg);
++
++ release_queue(kgd);
++
++ WARN_ON_ONCE(i != HQD_N_REGS);
++ *n_regs = i;
++
++ return 0;
++}
++
++static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
++ uint32_t __user *wptr, struct mm_struct *mm)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ struct v9_sdma_mqd *m;
++ uint32_t sdma_base_addr, sdmax_gfx_context_cntl;
++ uint32_t temp, timeout = 2000;
++ uint32_t data;
++ uint64_t data64;
++ uint64_t __user *wptr64 = (uint64_t __user *)wptr;
++
++ m = get_sdma_mqd(mqd);
++ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id,
++ m->sdma_queue_id);
++ sdmax_gfx_context_cntl = m->sdma_engine_id ?
++ SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) :
++ SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL);
++
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
++ m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
++
++ while (true) {
++ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
++ break;
++ if (timeout == 0)
++ return -ETIME;
++ msleep(10);
++ timeout -= 10;
++ }
++ data = RREG32(sdmax_gfx_context_cntl);
++ data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL,
++ RESUME_CTX, 0);
++ WREG32(sdmax_gfx_context_cntl, data);
++
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET,
++ m->sdmax_rlcx_doorbell_offset);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, m->sdmax_rlcx_doorbell);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI, m->sdmax_rlcx_rb_rptr_hi);
++
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
++ if (read_user_wptr(mm, wptr64, data64)) {
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
++ lower_32_bits(data64));
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI,
++ upper_32_bits(data64));
++ } else {
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
++ m->sdmax_rlcx_rb_rptr);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI,
++ m->sdmax_rlcx_rb_rptr_hi);
++ }
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);
++
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, m->sdmax_rlcx_rb_base_hi);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, m->sdmax_rlcx_rb_rptr_addr_lo);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, m->sdmax_rlcx_rb_rptr_addr_hi);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, m->sdmax_rlcx_rb_cntl);
++
++ return 0;
++}
++
++static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
++ uint32_t engine_id, uint32_t queue_id,
++ uint32_t (**dump)[2], uint32_t *n_regs)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t sdma_base_addr = get_sdma_base_addr(engine_id, queue_id);
++ uint32_t i = 0, reg;
++#undef HQD_N_REGS
++#define HQD_N_REGS (19+6+7+10)
++
++ *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
++ if (*dump == NULL)
++ return -ENOMEM;
++
++ for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
++ DUMP_REG(sdma_base_addr + reg);
++ for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
++ DUMP_REG(sdma_base_addr + reg);
++ for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
++ reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
++ DUMP_REG(sdma_base_addr + reg);
++ for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
++ reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
++ DUMP_REG(sdma_base_addr + reg);
++
++ WARN_ON_ONCE(i != HQD_N_REGS);
++ *n_regs = i;
++
++ return 0;
++}
++
++static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
++ uint32_t pipe_id, uint32_t queue_id)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t act;
++ bool retval = false;
++ uint32_t low, high;
++
++ acquire_queue(kgd, pipe_id, queue_id);
++ act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
++ if (act) {
++ low = lower_32_bits(queue_address >> 8);
++ high = upper_32_bits(queue_address >> 8);
++
++ if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) &&
++ high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI)))
++ retval = true;
++ }
++ release_queue(kgd);
++ return retval;
++}
++
++static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ struct v9_sdma_mqd *m;
++ uint32_t sdma_base_addr;
++ uint32_t sdma_rlc_rb_cntl;
++
++ m = get_sdma_mqd(mqd);
++ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id,
++ m->sdma_queue_id);
++
++ sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL);
++
++ if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)
++ return true;
++
++ return false;
++}
++
++static int kgd_hqd_destroy(struct kgd_dev *kgd,
++ enum kfd_preempt_type reset_type,
++ unsigned int utimeout, uint32_t pipe_id,
++ uint32_t queue_id)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ enum hqd_dequeue_request_type type;
++ unsigned long end_jiffies;
++ uint32_t temp;
++#if 0
++ unsigned long flags;
++ int retry;
++#endif
++
++ acquire_queue(kgd, pipe_id, queue_id);
++
++ switch (reset_type) {
++ case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
++ type = DRAIN_PIPE;
++ break;
++ case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
++ type = RESET_WAVES;
++ break;
++ default:
++ type = DRAIN_PIPE;
++ break;
++ }
++
++#if 0 /* Is this still needed? */
++ /* Workaround: If IQ timer is active and the wait time is close to or
++ * equal to 0, dequeueing is not safe. Wait until either the wait time
++ * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is
++ * cleared before continuing. Also, ensure wait times are set to at
++ * least 0x3.
++ */
++ local_irq_save(flags);
++ preempt_disable();
++ retry = 5000; /* wait for 500 usecs at maximum */
++ while (true) {
++ temp = RREG32(mmCP_HQD_IQ_TIMER);
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) {
++ pr_debug("HW is processing IQ\n");
++ goto loop;
++ }
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) {
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE)
++ == 3) /* SEM-rearm is safe */
++ break;
++ /* Wait time 3 is safe for CP, but our MMIO read/write
++ * time is close to 1 microsecond, so check for 10 to
++ * leave more buffer room
++ */
++ if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME)
++ >= 10)
++ break;
++ pr_debug("IQ timer is active\n");
++ } else
++ break;
++loop:
++ if (!retry) {
++ pr_err("CP HQD IQ timer status time out\n");
++ break;
++ }
++ ndelay(100);
++ --retry;
++ }
++ retry = 1000;
++ while (true) {
++ temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST);
++ if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK))
++ break;
++ pr_debug("Dequeue request is pending\n");
++
++ if (!retry) {
++ pr_err("CP HQD dequeue request time out\n");
++ break;
++ }
++ ndelay(100);
++ --retry;
++ }
++ local_irq_restore(flags);
++ preempt_enable();
++#endif
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type);
++
++ end_jiffies = (utimeout * HZ / 1000) + jiffies;
++ while (true) {
++ temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
++ if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
++ break;
++ if (time_after(jiffies, end_jiffies)) {
++ pr_err("cp queue preemption time out.\n");
++ release_queue(kgd);
++ return -ETIME;
++ }
++ usleep_range(500, 1000);
++ }
++
++ release_queue(kgd);
++ return 0;
++}
++
++static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
++ unsigned int utimeout)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ struct v9_sdma_mqd *m;
++ uint32_t sdma_base_addr;
++ uint32_t temp;
++ unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
++
++ m = get_sdma_mqd(mqd);
++ sdma_base_addr = get_sdma_base_addr(m->sdma_engine_id,
++ m->sdma_queue_id);
++
++ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL);
++ temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp);
++
++ while (true) {
++ temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
++ if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
++ break;
++ if (time_after(jiffies, end_jiffies))
++ return -ETIME;
++ usleep_range(500, 1000);
++ }
++
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0);
++ WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
++ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) |
++ SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);
++
++ m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR);
++ m->sdmax_rlcx_rb_rptr_hi =
++ RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI);
++
++ return 0;
++}
++
++static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd,
++ uint8_t vmid)
++{
++ uint32_t reg;
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++
++ reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
++ + vmid);
++ return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK;
++}
++
++static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
++ uint8_t vmid)
++{
++ uint32_t reg;
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++
++ reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
++ + vmid);
++ return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK;
++}
++
++static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++ uint32_t req = (1 << vmid) |
++ (1 << VM_INVALIDATE_ENG16_REQ__FLUSH_TYPE__SHIFT) | /* light */
++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PTES_MASK |
++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE0_MASK |
++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE1_MASK |
++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE2_MASK |
++ VM_INVALIDATE_ENG16_REQ__INVALIDATE_L1_PTES_MASK;
++
++ /* Use light weight invalidation.
++ *
++ * TODO 1: agree on the right set of invalidation registers for
++ * KFD use. Use the last one for now. Invalidate both GC and
++ * MMHUB.
++ *
++ * TODO 2: support range-based invalidation, requires kfd2kgd
++ * interface change
++ */
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_LO32), 0xffffffff);
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_HI32), 0x0000001f);
++
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32), 0xffffffff);
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32), 0x0000001f);
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_REQ), req);
++
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_REQ), req);
++
++ while (!(RREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ACK)) &
++ (1 << vmid)))
++ cpu_relax();
++
++ while (!(RREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_ACK)) &
++ (1 << vmid)))
++ cpu_relax();
++}
++
++static int kgd_address_watch_disable(struct kgd_dev *kgd)
++{
++ WARN_ONCE(1, "Not implemented");
++
++#if 0 /* TODO: Update to SOC15 register */
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ union TCP_WATCH_CNTL_BITS cntl;
++ unsigned int i;
++
++ cntl.u32All = 0;
++
++ cntl.bitfields.valid = 0;
++ cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK;
++ cntl.bitfields.atc = 1;
++
++ /* Turning off this address until we set all the registers */
++ for (i = 0; i < MAX_WATCH_ADDRESSES; i++)
++ WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL],
++ cntl.u32All);
++#endif
++
++ return 0;
++}
++
++static int kgd_address_watch_execute(struct kgd_dev *kgd,
++ unsigned int watch_point_id,
++ uint32_t cntl_val,
++ uint32_t addr_hi,
++ uint32_t addr_lo)
++{
++ WARN_ONCE(1, "Not implemented");
++
++#if 0 /* TODO: Update to SOC15 register */
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ union TCP_WATCH_CNTL_BITS cntl;
++
++ cntl.u32All = cntl_val;
++
++ /* Turning off this watch point until we set all the registers */
++ cntl.bitfields.valid = 0;
++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL],
++ cntl.u32All);
++
++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_HI],
++ addr_hi);
++
++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_ADDR_LO],
++ addr_lo);
++
++ /* Enable the watch point */
++ cntl.bitfields.valid = 1;
++
++ WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + ADDRESS_WATCH_REG_CNTL],
++ cntl.u32All);
++#endif
++
++ return 0;
++}
++
++static int kgd_wave_control_execute(struct kgd_dev *kgd,
++ uint32_t gfx_index_val,
++ uint32_t sq_cmd)
++{
++ WARN_ONCE(1, "Not implemented");
++
++#if 0 /* TODO: Update to SOC15 register */
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint32_t data = 0;
++
++ mutex_lock(&adev->grbm_idx_mutex);
++
++ WREG32(mmGRBM_GFX_INDEX, gfx_index_val);
++ WREG32(mmSQ_CMD, sq_cmd);
++
++ data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
++ INSTANCE_BROADCAST_WRITES, 1);
++ data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
++ SH_BROADCAST_WRITES, 1);
++ data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
++ SE_BROADCAST_WRITES, 1);
++
++ WREG32(mmGRBM_GFX_INDEX, data);
++ mutex_unlock(&adev->grbm_idx_mutex);
++#endif
++
++ return 0;
++}
++
++static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
++ unsigned int watch_point_id,
++ unsigned int reg_offset)
++{
++ return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset];
++}
++
++static int write_config_static_mem(struct kgd_dev *kgd, bool swizzle_enable,
++ uint8_t element_size, uint8_t index_stride, uint8_t mtype)
++{
++ /* No longer needed on GFXv9. These values are now hard-coded,
++ * except for the MTYPE which comes from the page table.
++ */
++
++ return 0;
++}
++static int alloc_memory_of_scratch(struct kgd_dev *kgd,
++ uint64_t va, uint32_t vmid)
++{
++ /* No longer needed on GFXv9. The scratch base address is
++ * passed to the shader by the CP. It's the user mode driver's
++ * responsibility.
++ */
++
++ return 0;
++}
++
++/* FIXME: Does this need to be ASIC-specific code? */
++static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type)
++{
++ struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
++ const union amdgpu_firmware_header *hdr;
++
++ switch (type) {
++ case KGD_ENGINE_PFP:
++ hdr = (const union amdgpu_firmware_header *)
++ adev->gfx.pfp_fw->data;
++ break;
++
++ case KGD_ENGINE_ME:
++ hdr = (const union amdgpu_firmware_header *)
++ adev->gfx.me_fw->data;
++ break;
++
++ case KGD_ENGINE_CE:
++ hdr = (const union amdgpu_firmware_header *)
++ adev->gfx.ce_fw->data;
++ break;
++
++ case KGD_ENGINE_MEC1:
++ hdr = (const union amdgpu_firmware_header *)
++ adev->gfx.mec_fw->data;
++ break;
++
++ case KGD_ENGINE_MEC2:
++ hdr = (const union amdgpu_firmware_header *)
++ adev->gfx.mec2_fw->data;
++ break;
++
++ case KGD_ENGINE_RLC:
++ hdr = (const union amdgpu_firmware_header *)
++ adev->gfx.rlc_fw->data;
++ break;
++
++ case KGD_ENGINE_SDMA1:
++ hdr = (const union amdgpu_firmware_header *)
++ adev->sdma.instance[0].fw->data;
++ break;
++
++ case KGD_ENGINE_SDMA2:
++ hdr = (const union amdgpu_firmware_header *)
++ adev->sdma.instance[1].fw->data;
++ break;
++
++ default:
++ return 0;
++ }
++
++ if (hdr == NULL)
++ return 0;
++
++ /* Only 12 bit in use*/
++ return hdr->common.ucode_version;
++}
++
++static void set_num_of_requests(struct kgd_dev *kgd,
++ uint8_t num_of_requests)
++{
++ pr_debug("in %s this is a stub\n", __func__);
++}
++
++static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
++ uint32_t page_table_base)
++{
++ struct amdgpu_device *adev = get_amdgpu_device(kgd);
++ uint64_t base = (uint64_t)page_table_base << PAGE_SHIFT |
++ AMDGPU_PTE_VALID;
++
++ /* TODO: Don't use hardcoded VMIDs */
++ if (vmid < 8 || vmid > 15) {
++ pr_err("trying to set page table base for wrong VMID %u\n",
++ vmid);
++ return;
++ }
++
++ /* TODO: take advantage of per-process address space size. For
++ * now, all processes share the same address space size, like
++ * on GFX8 and older.
++ */
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0);
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0);
++
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), adev->vm_manager.max_pfn - 1);
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), 0);
++
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base));
++ WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base));
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0);
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0);
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), adev->vm_manager.max_pfn - 1);
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), 0);
++
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base));
++ WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base));
++}
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+index 5853b59..29dcc29 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+@@ -38,8 +38,6 @@
+ #include "oss/oss_3_0_d.h"
+ #include "gmc/gmc_8_1_sh_mask.h"
+ #include "gmc/gmc_8_1_d.h"
+-#include "vi_structs.h"
+-#include "vid.h"
+
+ /* Special VM and GART address alignment needed for VI pre-Fiji due to
+ * a HW bug. */
+@@ -496,7 +494,8 @@ static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va,
+ uint64_t size, void *vm, struct kgd_mem **mem,
+ uint64_t *offset, void **kptr,
+ u32 domain, u64 flags, struct sg_table *sg, bool aql_queue,
+- bool readonly, bool execute, bool no_sub, bool userptr)
++ bool readonly, bool execute, bool coherent, bool no_sub,
++ bool userptr)
+ {
+ struct amdgpu_device *adev;
+ int ret;
+@@ -504,7 +503,7 @@ static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va,
+ uint64_t user_addr = 0;
+ int byte_align;
+ u32 alloc_domain;
+- uint32_t pte_flags;
++ uint32_t get_pte_flags;
+ struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm;
+
+ BUG_ON(kgd == NULL);
+@@ -534,17 +533,21 @@ static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va,
+ }
+ INIT_LIST_HEAD(&(*mem)->bo_va_list);
+ mutex_init(&(*mem)->lock);
+-
++ (*mem)->coherent = coherent;
+ (*mem)->no_substitute = no_sub;
+ (*mem)->aql_queue = aql_queue;
+
+- pte_flags = AMDGPU_PTE_READABLE | AMDGPU_PTE_VALID;
++ get_pte_flags = AMDGPU_VM_PAGE_READABLE;
+ if (!readonly)
+- pte_flags |= AMDGPU_PTE_WRITEABLE;
++ get_pte_flags |= AMDGPU_VM_PAGE_WRITEABLE;
+ if (execute)
+- pte_flags |= AMDGPU_PTE_EXECUTABLE;
++ get_pte_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
++ if (coherent)
++ get_pte_flags |= AMDGPU_VM_MTYPE_UC;
++ else
++ get_pte_flags |= AMDGPU_VM_MTYPE_NC;
+
+- (*mem)->pte_flags = pte_flags;
++ (*mem)->pte_flags = amdgpu_vm_get_pte_flags(adev, get_pte_flags);
+
+ alloc_domain = userptr ? AMDGPU_GEM_DOMAIN_CPU : domain;
+
+@@ -971,7 +974,7 @@ static int update_gpuvm_pte(struct amdgpu_device *adev,
+ }
+
+ static int map_bo_to_gpuvm(struct amdgpu_device *adev,
+- struct kfd_bo_va_list *entry, uint32_t pte_flags,
++ struct kfd_bo_va_list *entry, uint64_t pte_flags,
+ struct amdgpu_sync *sync)
+ {
+ int ret;
+@@ -1060,7 +1063,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
+ uint64_t *offset, void **kptr,
+ uint32_t flags)
+ {
+- bool aql_queue, public, readonly, execute, no_sub, userptr;
++ bool aql_queue, public, readonly, execute, coherent, no_sub, userptr;
+ u64 alloc_flag;
+ uint32_t domain;
+ uint64_t *temp_offset;
+@@ -1079,6 +1082,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
+ public = (flags & ALLOC_MEM_FLAGS_PUBLIC) ? true : false;
+ readonly = (flags & ALLOC_MEM_FLAGS_READONLY) ? true : false;
+ execute = (flags & ALLOC_MEM_FLAGS_EXECUTE_ACCESS) ? true : false;
++ coherent = (flags & ALLOC_MEM_FLAGS_COHERENT) ? true : false;
+ no_sub = (flags & ALLOC_MEM_FLAGS_NO_SUBSTITUTE) ? true : false;
+ userptr = (flags & ALLOC_MEM_FLAGS_USERPTR) ? true : false;
+
+@@ -1119,16 +1123,16 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
+ va, va + size, domain_string(domain),
+ BOOL_TO_STR(aql_queue));
+
+- pr_debug("\t alloc_flag 0x%llx public %s readonly %s execute %s no_sub %s\n",
++ pr_debug("\t alloc_flag 0x%llx public %s readonly %s execute %s coherent %s no_sub %s\n",
+ alloc_flag, BOOL_TO_STR(public),
+ BOOL_TO_STR(readonly), BOOL_TO_STR(execute),
+- BOOL_TO_STR(no_sub));
++ BOOL_TO_STR(coherent), BOOL_TO_STR(no_sub));
+
+ return __alloc_memory_of_gpu(kgd, va, size, vm, mem,
+ temp_offset, kptr, domain,
+ alloc_flag, sg,
+ aql_queue, readonly, execute,
+- no_sub, userptr);
++ coherent, no_sub, userptr);
+ }
+
+ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
+@@ -1347,6 +1351,8 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
+ static u64 get_vm_pd_gpu_offset(void *vm)
+ {
+ struct amdgpu_vm *avm = (struct amdgpu_vm *) vm;
++ struct amdgpu_device *adev =
++ amdgpu_ttm_adev(avm->root.bo->tbo.bdev);
+ u64 offset;
+
+ BUG_ON(avm == NULL);
+@@ -1357,6 +1363,12 @@ static u64 get_vm_pd_gpu_offset(void *vm)
+
+ amdgpu_bo_unreserve(avm->root.bo);
+
++ /* On some ASICs the FB doesn't start at 0. Adjust FB offset
++ * to an actual MC address.
++ */
++ if (adev->gart.gart_funcs->adjust_mc_addr)
++ offset = adev->gart.gart_funcs->adjust_mc_addr(adev, offset);
++
+ return offset;
+ }
+
+@@ -1854,8 +1866,11 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
+
+ INIT_LIST_HEAD(&(*mem)->bo_va_list);
+ mutex_init(&(*mem)->lock);
+- (*mem)->pte_flags = AMDGPU_PTE_READABLE | AMDGPU_PTE_VALID
+- | AMDGPU_PTE_WRITEABLE | AMDGPU_PTE_EXECUTABLE;
++ (*mem)->pte_flags = amdgpu_vm_get_pte_flags(adev,
++ AMDGPU_VM_PAGE_READABLE |
++ AMDGPU_VM_PAGE_WRITEABLE |
++ AMDGPU_VM_PAGE_EXECUTABLE |
++ AMDGPU_VM_MTYPE_NC);
+
+ (*mem)->bo = amdgpu_bo_ref(bo);
+ (*mem)->va = va;
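
The allocation-flag handling reworked above now distinguishes coherent from non-coherent allocations: the new ALLOC_MEM_FLAGS_COHERENT bit selects an uncached memory type (AMDGPU_VM_MTYPE_UC) instead of the default non-coherent one (AMDGPU_VM_MTYPE_NC), and the AMDGPU_VM_PAGE_* request flags are translated into 64-bit PTE flags through amdgpu_vm_get_pte_flags() rather than being hard-coded. The sketch below mirrors only the boolean-to-flag mapping from __alloc_memory_of_gpu(); the flag values are local stand-ins so the example is self-contained, not the real AMDGPU definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in values; the real AMDGPU_VM_* flags live in the amdgpu UAPI
 * headers and are only mimicked here so the sketch compiles on its own.
 */
#define VM_PAGE_READABLE   (1u << 0)
#define VM_PAGE_WRITEABLE  (1u << 1)
#define VM_PAGE_EXECUTABLE (1u << 2)
#define VM_MTYPE_NC        (1u << 3)
#define VM_MTYPE_UC        (2u << 3)

/* Mirrors the flag selection above: readable always, writeable unless
 * readonly, executable on request, and the memory type driven by the
 * new "coherent" allocation flag.
 */
static uint32_t build_vm_flags(bool readonly, bool execute, bool coherent)
{
    uint32_t flags = VM_PAGE_READABLE;

    if (!readonly)
        flags |= VM_PAGE_WRITEABLE;
    if (execute)
        flags |= VM_PAGE_EXECUTABLE;
    flags |= coherent ? VM_MTYPE_UC : VM_MTYPE_NC;

    return flags;
}

int main(void)
{
    printf("default:  0x%x\n", build_vm_flags(false, false, false));
    printf("coherent: 0x%x\n", build_vm_flags(false, false, true));
    return 0;
}
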
+diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+index b33ff31..5ee3429 100644
+--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+@@ -4397,6 +4397,19 @@ static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
+
+ cu_info->number = active_cu_number;
+ cu_info->ao_cu_mask = ao_cu_mask;
++ cu_info->simd_per_cu = NUM_SIMD_PER_CU;
++
++ switch (adev->asic_type) {
++ case CHIP_VEGA10: /* TODO: check if any of this changed */
++ cu_info->max_waves_per_simd = 10;
++ cu_info->max_scratch_slots_per_cu = 32;
++ cu_info->wave_front_size = 64;
++ cu_info->lds_size = 64;
++ break;
++ default:
++ dev_warn(adev->dev, "CU info asic_type [0x%x] not supported\n",
++ adev->asic_type);
++ }
+
+ return 0;
+ }
+diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+index 37e1edb..cff1ee6 100644
+--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
++++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+@@ -114,6 +114,21 @@ struct kgd2kfd_shared_resources {
+ /* Bit n == 1 means Queue n is available for KFD */
+ DECLARE_BITMAP(queue_bitmap, KGD_MAX_QUEUES);
+
++ /* Doorbell assignments (SOC15 and later chips only). Only
++ * specific doorbells are routed to each SDMA engine. Others
++ * are routed to IH and VCN. They are not usable by the CP.
++ *
++ * Any doorbell number D that satisfies the following condition
++ * is reserved: (D & reserved_doorbell_mask) == reserved_doorbell_val
++ *
++ * KFD currently uses 1024 (= 0x3ff) doorbells per process. If
++ * doorbells 0x0f0-0x0f7 and 0x2f0-0x2f7 are reserved, that means
++ * mask would be set to 0x1f8 and val set to 0x0f0.
++ */
++ unsigned int sdma_doorbell[2][2];
++ unsigned int reserved_doorbell_mask;
++ unsigned int reserved_doorbell_val;
++
+ /* Base address of doorbell aperture. */
+ phys_addr_t doorbell_physical_address;
+
+@@ -155,6 +170,7 @@ struct tile_config {
+ #define ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28)
+ #define ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27)
+ #define ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26)
++#define ALLOC_MEM_FLAGS_COHERENT (1 << 25)
+
+ /**
+ * struct kfd2kgd_calls
+--
+2.7.4
+