From 8c8e41b130f06c684c2455d26ff4523264fdfdef Mon Sep 17 00:00:00 2001
From: Felix Kuehling
Date: Wed, 5 Oct 2016 16:25:45 -0400
Subject: [PATCH 1586/4131] drm/amdgpu: Automatic power profile switching

Switch between compute and graphics profiles automatically when KFD
compute work starts and stops. It uses the number of KFD VMs as the
criterion for the existence of KFD compute work.

Change-Id: I11d34f45d901f4dd1e16e4a64c1ad1010088d9b8
Signed-off-by: Felix Kuehling

Conflicts:
        drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c           | 625 +++++++++++------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h           |   5 +
 3 files changed, 311 insertions(+), 321 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 3ec1ff1..155de54 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1327,7 +1327,7 @@ int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm,
                 return -ENOMEM;
 
         /* Initialize the VM context, allocate the page directory and zero it */
-        ret = amdgpu_vm_init(adev, &new_vm->base);
+        ret = amdgpu_vm_init(adev, &new_vm->base, AMDGPU_VM_CONTEXT_COMPUTE);
         if (ret != 0) {
                 pr_err("Failed init vm ret %d\n", ret);
                 /* Undo everything related to the new VM context */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index bdf2d6c..c300397 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -25,8 +25,14 @@
  *          Alex Deucher
  *          Jerome Glisse
  */
+#if defined(BUILD_AS_DKMS)
+#include
+#else
 #include
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0)
 #include
+#endif
 #include
 #include
 #include "amdgpu.h"
@@ -140,7 +146,7 @@ void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
                          struct list_head *validated,
                          struct amdgpu_bo_list_entry *entry)
 {
-        entry->robj = vm->root.base.bo;
+        entry->robj = vm->root.bo;
         entry->priority = 0;
         entry->tv.bo = &entry->robj->tbo;
         entry->tv.shared = true;
@@ -149,6 +155,61 @@ void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
 }
 
 /**
+ * amdgpu_vm_validate_level - validate a single page table level
+ *
+ * @parent: parent page table level
+ * @validate: callback to do the validation
+ * @param: parameter for the validation callback
+ *
+ * Validate the page table BOs on command submission if necessary.
+ */
+static int amdgpu_vm_validate_level(struct amdgpu_vm_pt *parent,
+                                    int (*validate)(void *, struct amdgpu_bo *),
+                                    void *param, bool use_cpu_for_update,
+                                    struct ttm_bo_global *glob)
+{
+        unsigned i;
+        int r;
+
+        if (use_cpu_for_update) {
+                r = amdgpu_bo_kmap(parent->bo, NULL);
+                if (r)
+                        return r;
+        }
+
+        if (!parent->entries)
+                return 0;
+
+        for (i = 0; i <= parent->last_entry_used; ++i) {
+                struct amdgpu_vm_pt *entry = &parent->entries[i];
+
+                if (!entry->bo)
+                        continue;
+
+                r = validate(param, entry->bo);
+                if (r)
+                        return r;
+
+                spin_lock(&glob->lru_lock);
+                ttm_bo_move_to_lru_tail(&entry->bo->tbo);
+                if (entry->bo->shadow)
+                        ttm_bo_move_to_lru_tail(&entry->bo->shadow->tbo);
+                spin_unlock(&glob->lru_lock);
+
+                /*
+                 * Recurse into the sub directory. This is harmless because we
+                 * have only a maximum of 5 layers.
+ */ + r = amdgpu_vm_validate_level(entry, validate, param, + use_cpu_for_update, glob); + if (r) + return r; + } + + return r; +} + +/** * amdgpu_vm_validate_pt_bos - validate the page table BOs * * @adev: amdgpu device pointer @@ -162,47 +223,32 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, int (*validate)(void *p, struct amdgpu_bo *bo), void *param) { - struct ttm_bo_global *glob = adev->mman.bdev.glob; - int r; - - spin_lock(&vm->status_lock); - while (!list_empty(&vm->evicted)) { - struct amdgpu_vm_bo_base *bo_base; - struct amdgpu_bo *bo; - - bo_base = list_first_entry(&vm->evicted, - struct amdgpu_vm_bo_base, - vm_status); - spin_unlock(&vm->status_lock); + uint64_t num_evictions; - bo = bo_base->bo; - BUG_ON(!bo); - if (bo->parent) { - r = validate(param, bo); - if (r) - return r; + /* We only need to validate the page tables + * if they aren't already valid. + */ + num_evictions = atomic64_read(&adev->num_evictions); + if (num_evictions == vm->last_eviction_counter) + return 0; - spin_lock(&glob->lru_lock); - ttm_bo_move_to_lru_tail(&bo->tbo); - if (bo->shadow) - ttm_bo_move_to_lru_tail(&bo->shadow->tbo); - spin_unlock(&glob->lru_lock); - } + return amdgpu_vm_validate_level(&vm->root, validate, param, + vm->use_cpu_for_update, + adev->mman.bdev.glob); +} - if (bo->tbo.type == ttm_bo_type_kernel && - vm->use_cpu_for_update) { - r = amdgpu_bo_kmap(bo, NULL); - if (r) - return r; - } +/** + * amdgpu_vm_check - helper for amdgpu_vm_ready + */ +static int amdgpu_vm_check(void *param, struct amdgpu_bo *bo) +{ + /* if anything is swapped out don't swap it in here, + just abort and wait for the next CS */ + if (!amdgpu_bo_gpu_accessible(bo)) + return -ERESTARTSYS; - spin_lock(&vm->status_lock); - if (bo->tbo.type != ttm_bo_type_kernel) - list_move(&bo_base->vm_status, &vm->moved); - else - list_move(&bo_base->vm_status, &vm->relocated); - } - spin_unlock(&vm->status_lock); + if (bo->shadow && !amdgpu_bo_gpu_accessible(bo->shadow)) + return -ERESTARTSYS; return 0; } @@ -210,19 +256,17 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, /** * amdgpu_vm_ready - check VM is ready for updates * + * @adev: amdgpu device * @vm: VM to check * * Check if all VM PDs/PTs are ready for updates */ -bool amdgpu_vm_ready(struct amdgpu_vm *vm) +bool amdgpu_vm_ready(struct amdgpu_device *adev, struct amdgpu_vm *vm) { - bool ready; - - spin_lock(&vm->status_lock); - ready = list_empty(&vm->evicted); - spin_unlock(&vm->status_lock); + if (amdgpu_vm_check(NULL, vm->root.bo)) + return false; - return ready; + return !amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_vm_check, NULL); } /** @@ -251,9 +295,14 @@ static int amdgpu_vm_alloc_levels(struct amdgpu_device *adev, if (!parent->entries) { unsigned num_entries = amdgpu_vm_num_entries(adev, level); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + parent->entries = drm_calloc_large(num_entries, + sizeof(struct amdgpu_vm_pt)); +#else parent->entries = kvmalloc_array(num_entries, sizeof(struct amdgpu_vm_pt), GFP_KERNEL | __GFP_ZERO); +#endif if (!parent->entries) return -ENOMEM; memset(parent->entries, 0 , sizeof(struct amdgpu_vm_pt)); @@ -288,11 +337,11 @@ static int amdgpu_vm_alloc_levels(struct amdgpu_device *adev, /* walk over the address space and allocate the page tables */ for (pt_idx = from; pt_idx <= to; ++pt_idx) { - struct reservation_object *resv = vm->root.base.bo->tbo.resv; + struct reservation_object *resv = vm->root.bo->tbo.resv; struct amdgpu_vm_pt *entry = 
&parent->entries[pt_idx]; struct amdgpu_bo *pt; - if (!entry->base.bo) { + if (!entry->bo) { r = amdgpu_bo_create(adev, amdgpu_vm_bo_size(adev, level), AMDGPU_GPU_PAGE_SIZE, true, @@ -313,14 +362,9 @@ static int amdgpu_vm_alloc_levels(struct amdgpu_device *adev, /* Keep a reference to the root directory to avoid * freeing them up in the wrong order. */ - pt->parent = amdgpu_bo_ref(parent->base.bo); - - entry->base.vm = vm; - entry->base.bo = pt; - list_add_tail(&entry->base.bo_list, &pt->va); - spin_lock(&vm->status_lock); - list_add(&entry->base.vm_status, &vm->relocated); - spin_unlock(&vm->status_lock); + pt->parent = amdgpu_bo_ref(vm->root.bo); + + entry->bo = pt; entry->addr = 0; } @@ -987,7 +1031,7 @@ static int amdgpu_vm_wait_pd(struct amdgpu_device *adev, struct amdgpu_vm *vm, int r; amdgpu_sync_create(&sync); - amdgpu_sync_resv(adev, &sync, vm->root.base.bo->tbo.resv, owner); + amdgpu_sync_resv(adev, &sync, vm->root.bo->tbo.resv, owner); r = amdgpu_sync_wait(&sync, true); amdgpu_sync_free(&sync); @@ -1006,17 +1050,18 @@ static int amdgpu_vm_wait_pd(struct amdgpu_device *adev, struct amdgpu_vm *vm, */ static int amdgpu_vm_update_level(struct amdgpu_device *adev, struct amdgpu_vm *vm, - struct amdgpu_vm_pt *parent) + struct amdgpu_vm_pt *parent, + unsigned level) { struct amdgpu_bo *shadow; struct amdgpu_ring *ring = NULL; uint64_t pd_addr, shadow_addr = 0; + uint32_t incr = amdgpu_vm_bo_size(adev, level + 1); uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0; unsigned count = 0, pt_idx, ndw = 0; struct amdgpu_job *job; struct amdgpu_pte_update_params params; struct dma_fence *fence = NULL; - uint32_t incr; int r; @@ -1025,10 +1070,10 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev, memset(¶ms, 0, sizeof(params)); params.adev = adev; - shadow = parent->base.bo->shadow; + shadow = parent->bo->shadow; if (vm->use_cpu_for_update) { - pd_addr = (unsigned long)amdgpu_bo_kptr(parent->base.bo); + pd_addr = (unsigned long)amdgpu_bo_kptr(parent->bo); r = amdgpu_vm_wait_pd(adev, vm, AMDGPU_FENCE_OWNER_VM); if (unlikely(r)) return r; @@ -1044,7 +1089,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev, /* assume the worst case */ ndw += parent->last_entry_used * 6; - pd_addr = amdgpu_bo_gpu_offset(parent->base.bo); + pd_addr = amdgpu_bo_gpu_offset(parent->bo); if (shadow) { shadow_addr = amdgpu_bo_gpu_offset(shadow); @@ -1064,17 +1109,12 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev, /* walk over the address space and update the directory */ for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) { - struct amdgpu_vm_pt *entry = &parent->entries[pt_idx]; - struct amdgpu_bo *bo = entry->base.bo; + struct amdgpu_bo *bo = parent->entries[pt_idx].bo; uint64_t pde, pt; if (bo == NULL) continue; - spin_lock(&vm->status_lock); - list_del_init(&entry->base.vm_status); - spin_unlock(&vm->status_lock); - pt = amdgpu_bo_gpu_offset(bo); pt = amdgpu_gart_get_vm_pde(adev, pt); /* Don't update huge pages here */ @@ -1085,7 +1125,6 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev, parent->entries[pt_idx].addr = pt | AMDGPU_PTE_VALID; pde = pd_addr + pt_idx * 8; - incr = amdgpu_bo_size(bo); if (((last_pde + 8 * count) != pde) || ((last_pt + incr * count) != pt) || (count == AMDGPU_VM_MAX_UPDATE_SIZE)) { @@ -1113,7 +1152,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev, } if (count) { - if (vm->root.base.bo->shadow) + if (vm->root.bo->shadow) params.func(¶ms, last_shadow, last_pt, count, incr, AMDGPU_PTE_VALID); @@ 
-1126,8 +1165,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev, amdgpu_job_free(job); } else { amdgpu_ring_pad_ib(ring, params.ib); - amdgpu_sync_resv(adev, &job->sync, - parent->base.bo->tbo.resv, + amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv, AMDGPU_FENCE_OWNER_VM); if (shadow) amdgpu_sync_resv(adev, &job->sync, @@ -1140,11 +1178,26 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev, if (r) goto error_free; - amdgpu_bo_fence(parent->base.bo, fence, true); - dma_fence_put(vm->last_update); - vm->last_update = fence; + amdgpu_bo_fence(parent->bo, fence, true); + dma_fence_put(vm->last_dir_update); + vm->last_dir_update = dma_fence_get(fence); + dma_fence_put(fence); } } + /* + * Recurse into the subdirectories. This recursion is harmless because + * we only have a maximum of 5 layers. + */ + for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) { + struct amdgpu_vm_pt *entry = &parent->entries[pt_idx]; + + if (!entry->bo) + continue; + + r = amdgpu_vm_update_level(adev, vm, entry, level + 1); + if (r) + return r; + } return 0; @@ -1160,8 +1213,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev, * * Mark all PD level as invalid after an error. */ -static void amdgpu_vm_invalidate_level(struct amdgpu_vm *vm, - struct amdgpu_vm_pt *parent) +static void amdgpu_vm_invalidate_level(struct amdgpu_vm_pt *parent) { unsigned pt_idx; @@ -1172,15 +1224,11 @@ static void amdgpu_vm_invalidate_level(struct amdgpu_vm *vm, for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) { struct amdgpu_vm_pt *entry = &parent->entries[pt_idx]; - if (!entry->base.bo) + if (!entry->bo) continue; entry->addr = ~0ULL; - spin_lock(&vm->status_lock); - if (list_empty(&entry->base.vm_status)) - list_add(&entry->base.vm_status, &vm->relocated); - spin_unlock(&vm->status_lock); - amdgpu_vm_invalidate_level(vm, entry); + amdgpu_vm_invalidate_level(entry); } } @@ -1196,40 +1244,11 @@ static void amdgpu_vm_invalidate_level(struct amdgpu_vm *vm, int amdgpu_vm_update_directories(struct amdgpu_device *adev, struct amdgpu_vm *vm) { - int r = 0; - - spin_lock(&vm->status_lock); - while (!list_empty(&vm->relocated)) { - struct amdgpu_vm_bo_base *bo_base; - struct amdgpu_bo *bo; - - bo_base = list_first_entry(&vm->relocated, - struct amdgpu_vm_bo_base, - vm_status); - spin_unlock(&vm->status_lock); - - bo = bo_base->bo->parent; - if (bo) { - struct amdgpu_vm_bo_base *parent; - struct amdgpu_vm_pt *pt; - - parent = list_first_entry(&bo->va, - struct amdgpu_vm_bo_base, - bo_list); - pt = container_of(parent, struct amdgpu_vm_pt, base); + int r; - r = amdgpu_vm_update_level(adev, vm, pt); - if (r) { - amdgpu_vm_invalidate_level(vm, &vm->root); - return r; - } - spin_lock(&vm->status_lock); - } else { - spin_lock(&vm->status_lock); - list_del_init(&bo_base->vm_status); - } - } - spin_unlock(&vm->status_lock); + r = amdgpu_vm_update_level(adev, vm, &vm->root, 0); + if (r) + amdgpu_vm_invalidate_level(&vm->root); if (vm->use_cpu_for_update) { /* Flush HDP */ @@ -1260,7 +1279,7 @@ void amdgpu_vm_get_entry(struct amdgpu_pte_update_params *p, uint64_t addr, *entry = &p->vm->root; while ((*entry)->entries) { idx = addr >> (p->adev->vm_manager.block_size * level--); - idx %= amdgpu_bo_size((*entry)->base.bo) / 8; + idx %= amdgpu_bo_size((*entry)->bo) / 8; *parent = *entry; *entry = &(*entry)->entries[idx]; } @@ -1296,7 +1315,7 @@ static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p, p->src || !(flags & AMDGPU_PTE_VALID)) { - dst = 
amdgpu_bo_gpu_offset(entry->base.bo); + dst = amdgpu_bo_gpu_offset(entry->bo); dst = amdgpu_gart_get_vm_pde(p->adev, dst); flags = AMDGPU_PTE_VALID; } else { @@ -1322,18 +1341,18 @@ static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p, tmp = p->pages_addr; p->pages_addr = NULL; - pd_addr = (unsigned long)amdgpu_bo_kptr(parent->base.bo); + pd_addr = (unsigned long)amdgpu_bo_kptr(parent->bo); pde = pd_addr + (entry - parent->entries) * 8; amdgpu_vm_cpu_set_ptes(p, pde, dst, 1, 0, flags); p->pages_addr = tmp; } else { - if (parent->base.bo->shadow) { - pd_addr = amdgpu_bo_gpu_offset(parent->base.bo->shadow); + if (parent->bo->shadow) { + pd_addr = amdgpu_bo_gpu_offset(parent->bo->shadow); pde = pd_addr + (entry - parent->entries) * 8; amdgpu_vm_do_set_ptes(p, pde, dst, 1, 0, flags); } - pd_addr = amdgpu_bo_gpu_offset(parent->base.bo); + pd_addr = amdgpu_bo_gpu_offset(parent->bo); pde = pd_addr + (entry - parent->entries) * 8; amdgpu_vm_do_set_ptes(p, pde, dst, 1, 0, flags); } @@ -1384,7 +1403,7 @@ static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params, if (entry->addr & AMDGPU_PDE_PTE) continue; - pt = entry->base.bo; + pt = entry->bo; if (use_cpu_update) { pe_start = (unsigned long)amdgpu_bo_kptr(pt); } else { @@ -1420,6 +1439,8 @@ static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params, uint64_t start, uint64_t end, uint64_t dst, uint64_t flags) { + int r; + /** * The MC L1 TLB supports variable sized pages, based on a fragment * field in the PTE. When this field is set to a non-zero value, page @@ -1438,38 +1459,39 @@ static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params, * Userspace can support this by aligning virtual base address and * allocation size to the fragment size. */ - unsigned max_frag = params->adev->vm_manager.fragment_size; - int r; + unsigned pages_per_frag = params->adev->vm_manager.fragment_size; + uint64_t frag_flags = AMDGPU_PTE_FRAG(pages_per_frag); + uint64_t frag_align = 1 << pages_per_frag; + + uint64_t frag_start = ALIGN(start, frag_align); + uint64_t frag_end = end & ~(frag_align - 1); /* system pages are non continuously */ - if (params->src || !(flags & AMDGPU_PTE_VALID)) + if (params->src || !(flags & AMDGPU_PTE_VALID) || + (frag_start >= frag_end)) return amdgpu_vm_update_ptes(params, start, end, dst, flags); - while (start != end) { - uint64_t frag_flags, frag_end; - unsigned frag; - - /* This intentionally wraps around if no bit is set */ - frag = min((unsigned)ffs(start) - 1, - (unsigned)fls64(end - start) - 1); - if (frag >= max_frag) { - frag_flags = AMDGPU_PTE_FRAG(max_frag); - frag_end = end & ~((1ULL << max_frag) - 1); - } else { - frag_flags = AMDGPU_PTE_FRAG(frag); - frag_end = start + (1 << frag); - } - - r = amdgpu_vm_update_ptes(params, start, frag_end, dst, - flags | frag_flags); + /* handle the 4K area at the beginning */ + if (start != frag_start) { + r = amdgpu_vm_update_ptes(params, start, frag_start, + dst, flags); if (r) return r; - - dst += (frag_end - start) * AMDGPU_GPU_PAGE_SIZE; - start = frag_end; + dst += (frag_start - start) * AMDGPU_GPU_PAGE_SIZE; } - return 0; + /* handle the area in the middle */ + r = amdgpu_vm_update_ptes(params, frag_start, frag_end, dst, + flags | frag_flags); + if (r) + return r; + + /* handle the 4K area at the end */ + if (frag_end != end) { + dst += (frag_end - frag_start) * AMDGPU_GPU_PAGE_SIZE; + r = amdgpu_vm_update_ptes(params, frag_end, end, dst, flags); + } + return r; } /** @@ -1477,6 +1499,7 @@ static int 
amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params, * * @adev: amdgpu_device pointer * @exclusive: fence we need to sync to + * @src: address where to copy page table entries from * @pages_addr: DMA addresses to use for mapping * @vm: requested vm * @start: start of mapped range @@ -1490,6 +1513,7 @@ static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params, */ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev, struct dma_fence *exclusive, + uint64_t src, dma_addr_t *pages_addr, struct amdgpu_vm *vm, uint64_t start, uint64_t last, @@ -1507,6 +1531,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev, memset(¶ms, 0, sizeof(params)); params.adev = adev; params.vm = vm; + params.src = src; /* sync to everything on unmapping */ if (!(flags & AMDGPU_PTE_VALID)) @@ -1535,12 +1560,10 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev, nptes = last - start + 1; /* - * reserve space for two commands every (1 << BLOCK_SIZE) + * reserve space for one command every (1 << BLOCK_SIZE) * entries or 2k dwords (whatever is smaller) - * - * The second command is for the shadow pagetables. */ - ncmds = ((nptes >> min(adev->vm_manager.block_size, 11u)) + 1) * 2; + ncmds = (nptes >> min(adev->vm_manager.block_size, 11u)) + 1; /* padding, etc. */ ndw = 64; @@ -1548,9 +1571,15 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev, /* one PDE write for each huge page */ ndw += ((nptes >> adev->vm_manager.block_size) + 1) * 6; - if (pages_addr) { + if (src) { + /* only copy commands needed */ + ndw += ncmds * 7; + + params.func = amdgpu_vm_do_copy_ptes; + + } else if (pages_addr) { /* copy commands needed */ - ndw += ncmds * adev->vm_manager.vm_pte_funcs->copy_pte_num_dw; + ndw += ncmds * 7; /* and also PTEs */ ndw += nptes * 2; @@ -1559,11 +1588,10 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev, } else { /* set page commands needed */ - ndw += ncmds * adev->vm_manager.vm_pte_funcs->set_pte_pde_num_dw; + ndw += ncmds * 10; - /* extra commands for begin/end fragments */ - ndw += 2 * adev->vm_manager.vm_pte_funcs->set_pte_pde_num_dw - * adev->vm_manager.fragment_size; + /* two extra commands for begin/end of fragment */ + ndw += 2 * 10; params.func = amdgpu_vm_do_set_ptes; } @@ -1574,7 +1602,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev, params.ib = &job->ibs[0]; - if (pages_addr) { + if (!src && pages_addr) { uint64_t *pte; unsigned i; @@ -1595,12 +1623,12 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev, if (r) goto error_free; - r = amdgpu_sync_resv(adev, &job->sync, vm->root.base.bo->tbo.resv, + r = amdgpu_sync_resv(adev, &job->sync, vm->root.bo->tbo.resv, owner); if (r) goto error_free; - r = reservation_object_reserve_shared(vm->root.base.bo->tbo.resv); + r = reservation_object_reserve_shared(vm->root.bo->tbo.resv); if (r) goto error_free; @@ -1615,14 +1643,14 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev, if (r) goto error_free; - amdgpu_bo_fence(vm->root.base.bo, f, true); + amdgpu_bo_fence(vm->root.bo, f, true); dma_fence_put(*fence); *fence = f; return 0; error_free: amdgpu_job_free(job); - amdgpu_vm_invalidate_level(vm, &vm->root); + amdgpu_vm_invalidate_level(&vm->root); return r; } @@ -1647,13 +1675,12 @@ static int amdgpu_vm_bo_split_mapping(struct amdgpu_device *adev, dma_addr_t *pages_addr, struct amdgpu_vm *vm, struct amdgpu_bo_va_mapping *mapping, - uint64_t vram_base_offset, uint64_t flags, struct ttm_mem_reg *mem, struct 
dma_fence **fence) { struct drm_mm_node *nodes = mem ? mem->mm_node : NULL; - uint64_t pfn, start = mapping->start; + uint64_t pfn, src = 0, start = mapping->start; int r; /* normally,bo_va->flags only contians READABLE and WIRTEABLE bit go here @@ -1704,12 +1731,12 @@ static int amdgpu_vm_bo_split_mapping(struct amdgpu_device *adev, max_entries = min(max_entries, 16ull * 1024ull); break; case AMDGPU_PL_DGMA: - addr += vram_base_offset + + addr += adev->vm_manager.vram_base_offset + adev->mman.bdev.man[mem->mem_type].gpu_offset - adev->mman.bdev.man[TTM_PL_VRAM].gpu_offset; break; case TTM_PL_VRAM: - addr += vram_base_offset; + addr += adev->vm_manager.vram_base_offset; break; default: break; @@ -1722,7 +1749,8 @@ static int amdgpu_vm_bo_split_mapping(struct amdgpu_device *adev, addr += pfn << PAGE_SHIFT; last = min((uint64_t)mapping->last, start + max_entries - 1); - r = amdgpu_vm_bo_update_mapping(adev, exclusive, pages_addr, vm, + r = amdgpu_vm_bo_update_mapping(adev, exclusive, + src, pages_addr, vm, start, last, flags, addr, fence); if (r) @@ -1760,10 +1788,8 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, dma_addr_t *pages_addr = NULL; struct ttm_mem_reg *mem; struct drm_mm_node *nodes; - struct dma_fence *exclusive, **last_update; + struct dma_fence *exclusive; uint64_t flags; - uint64_t vram_base_offset = adev->vm_manager.vram_base_offset; - struct amdgpu_device *bo_adev; int r; if (clear || !bo_va->base.bo) { @@ -1785,54 +1811,43 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, exclusive = reservation_object_get_excl(bo->tbo.resv); } - if (bo) { + if (bo) flags = amdgpu_ttm_tt_pte_flags(adev, bo->tbo.ttm, mem); - bo_adev = amdgpu_ttm_adev(bo->tbo.bdev); - if (mem && mem->mem_type == TTM_PL_VRAM && - adev != bo_adev) { - flags |= AMDGPU_PTE_SYSTEM; - vram_base_offset = bo_adev->mc.aper_base; - } - } else - flags = 0x0; - - if (clear || (bo && bo->tbo.resv == vm->root.base.bo->tbo.resv)) - last_update = &vm->last_update; else - last_update = &bo_va->last_pt_update; - - if (!clear && bo_va->base.moved) { - bo_va->base.moved = false; - list_splice_init(&bo_va->valids, &bo_va->invalids); + flags = 0x0; - } else if (bo_va->cleared != clear) { + spin_lock(&vm->status_lock); + if (!list_empty(&bo_va->base.vm_status)) list_splice_init(&bo_va->valids, &bo_va->invalids); - } + spin_unlock(&vm->status_lock); list_for_each_entry(mapping, &bo_va->invalids, list) { r = amdgpu_vm_bo_split_mapping(adev, exclusive, pages_addr, vm, - mapping, vram_base_offset, flags, - mem, last_update); + mapping, flags, mem, + &bo_va->last_pt_update); if (r) return r; } - if (vm->use_cpu_for_update) { - /* Flush HDP */ - mb(); - amdgpu_gart_flush_gpu_tlb(adev, 0); + if (trace_amdgpu_vm_bo_mapping_enabled()) { + list_for_each_entry(mapping, &bo_va->valids, list) + trace_amdgpu_vm_bo_mapping(mapping); + + list_for_each_entry(mapping, &bo_va->invalids, list) + trace_amdgpu_vm_bo_mapping(mapping); } spin_lock(&vm->status_lock); + list_splice_init(&bo_va->invalids, &bo_va->valids); list_del_init(&bo_va->base.vm_status); + if (clear) + list_add(&bo_va->base.vm_status, &vm->cleared); spin_unlock(&vm->status_lock); - list_splice_init(&bo_va->invalids, &bo_va->valids); - bo_va->cleared = clear; - - if (trace_amdgpu_vm_bo_mapping_enabled()) { - list_for_each_entry(mapping, &bo_va->valids, list) - trace_amdgpu_vm_bo_mapping(mapping); + if (vm->use_cpu_for_update) { + /* Flush HDP */ + mb(); + amdgpu_gart_flush_gpu_tlb(adev, 0); } return 0; @@ -1940,7 +1955,7 @@ static void amdgpu_vm_free_mapping(struct 
amdgpu_device *adev, */ static void amdgpu_vm_prt_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) { - struct reservation_object *resv = vm->root.base.bo->tbo.resv; + struct reservation_object *resv = vm->root.bo->tbo.resv; struct dma_fence *excl, **shared; unsigned i, shared_count; int r; @@ -1998,7 +2013,7 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev, if (vm->pte_support_ats) init_pte_value = AMDGPU_PTE_SYSTEM; - r = amdgpu_vm_bo_update_mapping(adev, NULL, NULL, vm, + r = amdgpu_vm_bo_update_mapping(adev, NULL, 0, NULL, vm, mapping->start, mapping->last, init_pte_value, 0, &f); amdgpu_vm_free_mapping(adev, vm, mapping, f); @@ -2020,35 +2035,29 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev, } /** - * amdgpu_vm_handle_moved - handle moved BOs in the PT + * amdgpu_vm_clear_moved - clear moved BOs in the PT * * @adev: amdgpu_device pointer * @vm: requested vm - * @sync: sync object to add fences to * - * Make sure all BOs which are moved are updated in the PTs. + * Make sure all moved BOs are cleared in the PT. * Returns 0 for success. * - * PTs have to be reserved! + * PTs have to be reserved and mutex must be locked! */ -int amdgpu_vm_handle_moved(struct amdgpu_device *adev, - struct amdgpu_vm *vm) +int amdgpu_vm_clear_moved(struct amdgpu_device *adev, struct amdgpu_vm *vm, + struct amdgpu_sync *sync) { - bool clear; + struct amdgpu_bo_va *bo_va = NULL; int r = 0; spin_lock(&vm->status_lock); while (!list_empty(&vm->moved)) { - struct amdgpu_bo_va *bo_va; - bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va, base.vm_status); spin_unlock(&vm->status_lock); - /* Per VM BOs never need to bo cleared in the page tables */ - clear = bo_va->base.bo->tbo.resv != vm->root.base.bo->tbo.resv; - - r = amdgpu_vm_bo_update(adev, bo_va, clear); + r = amdgpu_vm_bo_update(adev, bo_va, true); if (r) return r; @@ -2056,6 +2065,9 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev, } spin_unlock(&vm->status_lock); + if (bo_va) + r = amdgpu_sync_fence(adev, sync, bo_va->last_pt_update); + return r; } @@ -2097,39 +2109,6 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev, return bo_va; } - -/** - * amdgpu_vm_bo_insert_mapping - insert a new mapping - * - * @adev: amdgpu_device pointer - * @bo_va: bo_va to store the address - * @mapping: the mapping to insert - * - * Insert a new mapping into all structures. 
- */ -static void amdgpu_vm_bo_insert_map(struct amdgpu_device *adev, - struct amdgpu_bo_va *bo_va, - struct amdgpu_bo_va_mapping *mapping) -{ - struct amdgpu_vm *vm = bo_va->base.vm; - struct amdgpu_bo *bo = bo_va->base.bo; - - mapping->bo_va = bo_va; - list_add(&mapping->list, &bo_va->invalids); - amdgpu_vm_it_insert(mapping, &vm->va); - - if (mapping->flags & AMDGPU_PTE_PRT) - amdgpu_vm_prt_get(adev); - - if (bo && bo->tbo.resv == vm->root.base.bo->tbo.resv) { - spin_lock(&vm->status_lock); - if (list_empty(&bo_va->base.vm_status)) - list_add(&bo_va->base.vm_status, &vm->moved); - spin_unlock(&vm->status_lock); - } - trace_amdgpu_vm_bo_map(bo_va, mapping); -} - /** * amdgpu_vm_bo_map - map bo inside a vm * @@ -2181,12 +2160,18 @@ int amdgpu_vm_bo_map(struct amdgpu_device *adev, if (!mapping) return -ENOMEM; + INIT_LIST_HEAD(&mapping->list); mapping->start = saddr; mapping->last = eaddr; mapping->offset = offset; mapping->flags = flags; - amdgpu_vm_bo_insert_map(adev, bo_va, mapping); + list_add(&mapping->list, &bo_va->invalids); + amdgpu_vm_it_insert(mapping, &vm->va); + + if (flags & AMDGPU_PTE_PRT) + amdgpu_vm_prt_get(adev); + trace_amdgpu_vm_bo_map(bo_va, mapping); return 0; } @@ -2213,6 +2198,7 @@ int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev, { struct amdgpu_bo_va_mapping *mapping; struct amdgpu_bo *bo = bo_va->base.bo; + struct amdgpu_vm *vm = bo_va->base.vm; uint64_t eaddr; int r; @@ -2246,7 +2232,12 @@ int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev, mapping->offset = offset; mapping->flags = flags; - amdgpu_vm_bo_insert_map(adev, bo_va, mapping); + list_add(&mapping->list, &bo_va->invalids); + amdgpu_vm_it_insert(mapping, &vm->va); + + if (flags & AMDGPU_PTE_PRT) + amdgpu_vm_prt_get(adev); + trace_amdgpu_vm_bo_map(bo_va, mapping); return 0; } @@ -2292,7 +2283,6 @@ int amdgpu_vm_bo_unmap(struct amdgpu_device *adev, list_del(&mapping->list); amdgpu_vm_it_remove(mapping, &vm->va); - mapping->bo_va = NULL; trace_amdgpu_vm_bo_unmap(bo_va, mapping); if (valid) @@ -2378,7 +2368,6 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev, if (tmp->last > eaddr) tmp->last = eaddr; - tmp->bo_va = NULL; list_add(&tmp->list, &vm->freed); trace_amdgpu_vm_bo_unmap(NULL, tmp); } @@ -2405,19 +2394,6 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev, } /** - * amdgpu_vm_bo_lookup_mapping - find mapping by address - * - * @vm: the requested VM - * - * Find a mapping by it's address. - */ -struct amdgpu_bo_va_mapping *amdgpu_vm_bo_lookup_mapping(struct amdgpu_vm *vm, - uint64_t addr) -{ - return amdgpu_vm_it_iter_first(&vm->va, addr, addr); -} - -/** * amdgpu_vm_bo_rmv - remove a bo to a specific vm * * @adev: amdgpu_device pointer @@ -2442,7 +2418,6 @@ void amdgpu_vm_bo_rmv(struct amdgpu_device *adev, list_for_each_entry_safe(mapping, next, &bo_va->valids, list) { list_del(&mapping->list); amdgpu_vm_it_remove(mapping, &vm->va); - mapping->bo_va = NULL; trace_amdgpu_vm_bo_unmap(bo_va, mapping); list_add(&mapping->list, &vm->freed); } @@ -2467,36 +2442,15 @@ void amdgpu_vm_bo_rmv(struct amdgpu_device *adev, * Mark @bo as invalid. 
*/ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev, - struct amdgpu_bo *bo, bool evicted) + struct amdgpu_bo *bo) { struct amdgpu_vm_bo_base *bo_base; list_for_each_entry(bo_base, &bo->va, bo_list) { - struct amdgpu_vm *vm = bo_base->vm; - - bo_base->moved = true; - if (evicted && bo->tbo.resv == vm->root.base.bo->tbo.resv) { - spin_lock(&bo_base->vm->status_lock); - if (bo->tbo.type == ttm_bo_type_kernel) - list_move(&bo_base->vm_status, &vm->evicted); - else - list_move_tail(&bo_base->vm_status, - &vm->evicted); - spin_unlock(&bo_base->vm->status_lock); - continue; - } - - if (bo->tbo.type == ttm_bo_type_kernel) { - spin_lock(&bo_base->vm->status_lock); - if (list_empty(&bo_base->vm_status)) - list_add(&bo_base->vm_status, &vm->relocated); - spin_unlock(&bo_base->vm->status_lock); - continue; - } - spin_lock(&bo_base->vm->status_lock); if (list_empty(&bo_base->vm_status)) - list_add(&bo_base->vm_status, &vm->moved); + list_add(&bo_base->vm_status, + &bo_base->vm->moved); spin_unlock(&bo_base->vm->status_lock); } } @@ -2577,14 +2531,13 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, u64 flags; uint64_t init_pde_value = 0; - vm->va = RB_ROOT_CACHED; + vm->va = RB_ROOT; vm->client_id = atomic64_inc_return(&adev->vm_manager.client_counter); for (i = 0; i < AMDGPU_MAX_VMHUBS; i++) vm->reserved_vmid[i] = NULL; spin_lock_init(&vm->status_lock); - INIT_LIST_HEAD(&vm->evicted); - INIT_LIST_HEAD(&vm->relocated); INIT_LIST_HEAD(&vm->moved); + INIT_LIST_HEAD(&vm->cleared); INIT_LIST_HEAD(&vm->freed); /* create scheduler entity for page table updates */ @@ -2615,7 +2568,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, vm->use_cpu_for_update ? "CPU" : "SDMA"); WARN_ONCE((vm->use_cpu_for_update & !amdgpu_vm_is_large_bar(adev)), "CPU update of VM recommended only for large BAR system\n"); - vm->last_update = NULL; + vm->last_dir_update = NULL; flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS | AMDGPU_GEM_CREATE_VRAM_CLEARED; @@ -2628,31 +2581,46 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, r = amdgpu_bo_create(adev, amdgpu_vm_bo_size(adev, 0), align, true, AMDGPU_GEM_DOMAIN_VRAM, flags, - NULL, NULL, init_pde_value, &vm->root.base.bo); + NULL, NULL, init_pde_value, &vm->root.bo); if (r) goto error_free_sched_entity; - vm->root.base.vm = vm; - list_add_tail(&vm->root.base.bo_list, &vm->root.base.bo->va); - INIT_LIST_HEAD(&vm->root.base.vm_status); + r = amdgpu_bo_reserve(vm->root.bo, false); + if (r) + goto error_free_root; + + vm->last_eviction_counter = atomic64_read(&adev->num_evictions); if (vm->use_cpu_for_update) { - r = amdgpu_bo_reserve(vm->root.base.bo, false); + r = amdgpu_bo_kmap(vm->root.bo, NULL); if (r) goto error_free_root; + } - r = amdgpu_bo_kmap(vm->root.base.bo, NULL); - if (r) - goto error_free_root; - amdgpu_bo_unreserve(vm->root.base.bo); + amdgpu_bo_unreserve(vm->root.bo); + + vm->vm_context = vm_context; + if (vm_context == AMDGPU_VM_CONTEXT_COMPUTE) { + mutex_lock(&adev->vm_manager.lock); + + if (adev->vm_manager.n_compute_vms++ == 0) { + /* First Compute VM: enable compute power profile */ + if (adev->pp_enabled) + amdgpu_dpm_switch_power_profile(adev, + AMD_PP_COMPUTE_PROFILE); + else if (adev->pm.funcs->switch_power_profile) + adev->pm.funcs->switch_power_profile(adev, + AMD_PP_COMPUTE_PROFILE); + } + mutex_unlock(&adev->vm_manager.lock); } return 0; error_free_root: - amdgpu_bo_unref(&vm->root.base.bo->shadow); - amdgpu_bo_unref(&vm->root.base.bo); - vm->root.base.bo = NULL; + 
amdgpu_bo_unref(&vm->root.bo->shadow); + amdgpu_bo_unref(&vm->root.bo); + vm->root.bo = NULL; error_free_sched_entity: amd_sched_entity_fini(&ring->sched, &vm->entity); @@ -2671,18 +2639,20 @@ static void amdgpu_vm_free_levels(struct amdgpu_vm_pt *level) { unsigned i; - if (level->base.bo) { - list_del(&level->base.bo_list); - list_del(&level->base.vm_status); - amdgpu_bo_unref(&level->base.bo->shadow); - amdgpu_bo_unref(&level->base.bo); + if (level->bo) { + amdgpu_bo_unref(&level->bo->shadow); + amdgpu_bo_unref(&level->bo); } if (level->entries) for (i = 0; i <= level->last_entry_used; i++) amdgpu_vm_free_levels(&level->entries[i]); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0) + drm_free_large(level->entries); +#else kvfree(level->entries); +#endif } /** @@ -2698,16 +2668,31 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) { struct amdgpu_bo_va_mapping *mapping, *tmp; bool prt_fini_needed = !!adev->gart.gart_funcs->set_prt; - struct amdgpu_bo *root; - int i, r; + int i; + + if (vm->vm_context == AMDGPU_VM_CONTEXT_COMPUTE) { + mutex_lock(&adev->vm_manager.lock); + + WARN(adev->vm_manager.n_compute_vms == 0, "Unbalanced number of Compute VMs"); + + if (--adev->vm_manager.n_compute_vms == 0) { + /* Last Compute VM: enable graphics power profile */ + if (adev->pp_enabled) + amdgpu_dpm_switch_power_profile(adev, + AMD_PP_GFX_PROFILE); + else if (adev->pm.funcs->switch_power_profile) + adev->pm.funcs->switch_power_profile(adev, + AMD_PP_GFX_PROFILE); + } + mutex_unlock(&adev->vm_manager.lock); + } amd_sched_entity_fini(vm->entity.sched, &vm->entity); - if (!RB_EMPTY_ROOT(&vm->va.rb_root)) { + if (!RB_EMPTY_ROOT(&vm->va)) { dev_err(adev->dev, "still active bo inside vm\n"); } - rbtree_postorder_for_each_entry_safe(mapping, tmp, - &vm->va.rb_root, rb) { + rbtree_postorder_for_each_entry_safe(mapping, tmp, &vm->va, rb) { list_del(&mapping->list); amdgpu_vm_it_remove(mapping, &vm->va); kfree(mapping); @@ -2721,9 +2706,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) list_del(&mapping->list); amdgpu_vm_free_mapping(adev, vm, mapping, NULL); } - + amdgpu_vm_free_levels(&vm->root); - dma_fence_put(vm->last_update); + dma_fence_put(vm->last_dir_update); for (i = 0; i < AMDGPU_MAX_VMHUBS; i++) amdgpu_vm_free_reserved_vmid(adev, vm, i); } @@ -2755,8 +2740,7 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev) } } - adev->vm_manager.fence_context = - dma_fence_context_alloc(AMDGPU_MAX_RINGS); + adev->vm_manager.fence_context = kcl_fence_context_alloc(AMDGPU_MAX_RINGS); for (i = 0; i < AMDGPU_MAX_RINGS; ++i) adev->vm_manager.seqno[i] = 0; @@ -2781,6 +2765,7 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev) adev->vm_manager.vm_update_mode = 0; #endif + adev->vm_manager.n_compute_vms = 0; } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 28cf20b..415e659 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -153,6 +153,9 @@ struct amdgpu_vm { /* dedicated to vm */ struct amdgpu_vm_id *reserved_vmid[AMDGPU_MAX_VMHUBS]; + /* Whether this is a Compute or GFX Context */ + int vm_context; + /* Flag to indicate if VM tables are updated by CPU or GPU (SDMA) */ bool use_cpu_for_update; @@ -220,6 +223,8 @@ struct amdgpu_vm_manager { * BIT1[= 0] Compute updated by SDMA [= 1] by CPU */ int vm_update_mode; + /* Number of Compute VMs, used for detecting Compute activity */ + unsigned n_compute_vms; }; void amdgpu_vm_manager_init(struct amdgpu_device 
*adev); -- 2.7.4
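
The heart of the change is the compute-VM counting added to amdgpu_vm_init() and
amdgpu_vm_fini(): n_compute_vms is updated under the VM manager lock, and the power
profile is switched only on the 0->1 and 1->0 transitions. Below is a minimal
standalone sketch of just that transition logic (hypothetical names; a userspace
mutex stands in for vm_manager.lock and printf stands in for
amdgpu_dpm_switch_power_profile()):

/* Standalone model of the 0<->1 transition logic used by the patch.
 * Hypothetical names; this is not the driver's actual API.
 */
#include <pthread.h>
#include <stdio.h>

enum profile { GFX_PROFILE, COMPUTE_PROFILE };

static pthread_mutex_t vm_manager_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned n_compute_vms;

static void switch_power_profile(enum profile p)
{
        printf("switching to %s profile\n",
               p == COMPUTE_PROFILE ? "compute" : "graphics");
}

/* Called when a compute (KFD) VM is created. */
static void compute_vm_created(void)
{
        pthread_mutex_lock(&vm_manager_lock);
        if (n_compute_vms++ == 0)       /* first compute VM */
                switch_power_profile(COMPUTE_PROFILE);
        pthread_mutex_unlock(&vm_manager_lock);
}

/* Called when a compute (KFD) VM is destroyed. */
static void compute_vm_destroyed(void)
{
        pthread_mutex_lock(&vm_manager_lock);
        if (n_compute_vms == 0)
                fprintf(stderr, "unbalanced number of compute VMs\n");
        else if (--n_compute_vms == 0)  /* last compute VM */
                switch_power_profile(GFX_PROFILE);
        pthread_mutex_unlock(&vm_manager_lock);
}

int main(void)
{
        compute_vm_created();   /* -> compute profile */
        compute_vm_created();   /* no switch */
        compute_vm_destroyed(); /* no switch */
        compute_vm_destroyed(); /* -> graphics profile */
        return 0;
}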
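
The reworked amdgpu_vm_frag_ptes() in the hunks above splits every mapping into an
unaligned head, a fragment-aligned middle written with AMDGPU_PTE_FRAG set, and an
unaligned tail, falling back to a single plain update when the range is smaller than
one fragment. The address arithmetic is easy to get wrong, so here is a small
standalone sketch of just that split (hypothetical helper; addresses are GPU page
numbers, the fragment size is passed as a shift):

/* Sketch of the head/middle/tail split performed by amdgpu_vm_frag_ptes()
 * in this patch (hypothetical standalone helper).
 */
#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

static void split_frags(uint64_t start, uint64_t end, unsigned frag_shift)
{
        uint64_t frag_align = 1ull << frag_shift;
        uint64_t frag_start = ALIGN_UP(start, frag_align);
        uint64_t frag_end = end & ~(frag_align - 1);

        if (frag_start >= frag_end) {
                /* too small for a whole fragment: one plain PTE update */
                printf("plain   [%llx, %llx)\n",
                       (unsigned long long)start, (unsigned long long)end);
                return;
        }

        if (start != frag_start)        /* unaligned head, plain PTEs */
                printf("head    [%llx, %llx)\n",
                       (unsigned long long)start, (unsigned long long)frag_start);

        /* aligned middle, written with the fragment bit in the PTE flags */
        printf("middle  [%llx, %llx) + AMDGPU_PTE_FRAG(%u)\n",
               (unsigned long long)frag_start, (unsigned long long)frag_end,
               frag_shift);

        if (frag_end != end)            /* unaligned tail, plain PTEs */
                printf("tail    [%llx, %llx)\n",
                       (unsigned long long)frag_end, (unsigned long long)end);
}

int main(void)
{
        /* head 0x3-0x10, middle 0x10-0x20, tail 0x20-0x25 */
        split_frags(0x3, 0x25, 4);
        return 0;
}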
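
amdgpu_vm_validate_pt_bos() in this version also restores a cheap early-out: the VM
remembers the global eviction counter from the last time its page tables were known
to be valid, and the walk is skipped while that counter has not moved. The driver
resamples the counter elsewhere (at command-submission time); the sketch below folds
the resample into a helper and uses hypothetical standalone names:

/* Generation-counter early-out, modelled on the patch's
 * adev->num_evictions / vm->last_eviction_counter pair.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ullong num_evictions;               /* bumped on every eviction */
static unsigned long long last_eviction_counter;  /* sampled per VM */

/* Returns true if the expensive page-table validation can be skipped. */
static bool page_tables_still_valid(void)
{
        return atomic_load(&num_evictions) == last_eviction_counter;
}

/* Call after the page tables have been (re)validated. */
static void page_tables_validated(void)
{
        last_eviction_counter = atomic_load(&num_evictions);
}

int main(void)
{
        printf("skip validation: %d\n", page_tables_still_valid()); /* 1 */

        atomic_fetch_add(&num_evictions, 1);    /* something got evicted */
        printf("skip validation: %d\n", page_tables_still_valid()); /* 0 */

        page_tables_validated();                /* revalidate, then resample */
        printf("skip validation: %d\n", page_tables_still_valid()); /* 1 */
        return 0;
}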