diff options
Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.14.71/1443-drm-amdkfd-Fix-suspend-resume-issue-on-Carrizo.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.14.71/1443-drm-amdkfd-Fix-suspend-resume-issue-on-Carrizo.patch | 312 |
1 files changed, 312 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.14.71/1443-drm-amdkfd-Fix-suspend-resume-issue-on-Carrizo.patch b/common/recipes-kernel/linux/linux-yocto-4.14.71/1443-drm-amdkfd-Fix-suspend-resume-issue-on-Carrizo.patch new file mode 100644 index 00000000..264ff29d --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.14.71/1443-drm-amdkfd-Fix-suspend-resume-issue-on-Carrizo.patch @@ -0,0 +1,312 @@ +From 45b36801b47893b0e57e8256e733f27d18c19a0a Mon Sep 17 00:00:00 2001 +From: Yong Zhao <yong.zhao@amd.com> +Date: Mon, 30 May 2016 20:47:17 -0400 +Subject: [PATCH 1443/4131] drm/amdkfd: Fix suspend/resume issue on Carrizo + +When we do suspend/resume through "sudo pm-suspend" while there is +HSA activity running, upon resume we will encounter HWS hanging, which +is caused by memory read/write failures. The root cause is that when +suspend, we neglected to unbind pasid from kfd device. + +Another major change is that the bind/unbinding is changed to be +performed on a per process basis, instead of whether there are queues +in dqm. + +There are some other small changes as well. + +Change-Id: If9ac972fc4309b688f6c1d07e27cede54814410e +Signed-off-by: Yong Zhao <yong.zhao@amd.com> + + Conflicts: + drivers/gpu/drm/amd/amdkfd/kfd_process.c +--- + drivers/gpu/drm/amd/amdkfd/kfd_device.c | 26 +++--- + .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 13 --- + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 14 +++- + drivers/gpu/drm/amd/amdkfd/kfd_process.c | 94 ++++++++++++++++++---- + 4 files changed, 105 insertions(+), 42 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c +index 8bf209d..aae4e5a 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c +@@ -258,7 +258,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) + struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); + + if (dev) +- kfd_unbind_process_from_device(dev, pasid); ++ kfd_process_iommu_unbind_callback(dev, pasid); + } + + /* +@@ -488,14 +488,18 @@ void kgd2kfd_suspend(struct kfd_dev *kfd) + { + BUG_ON(kfd == NULL); + +- if (kfd->init_complete) { +- kfd->dqm->ops.stop(kfd->dqm); +- if (kfd->device_info->is_need_iommu_device) { +- amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); +- amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); +- amd_iommu_free_device(kfd->pdev); +- } +- } ++ if (!kfd->init_complete) ++ return; ++ ++ kfd->dqm->ops.stop(kfd->dqm); ++ if (!kfd->device_info->is_need_iommu_device) ++ return; ++ ++ kfd_unbind_processes_from_device(kfd); ++ ++ amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); ++ amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); ++ amd_iommu_free_device(kfd->pdev); + } + + int kgd2kfd_evict_bo(struct kfd_dev *dev, void *mem) +@@ -533,6 +537,10 @@ static int kfd_resume(struct kfd_dev *kfd) + iommu_pasid_shutdown_callback); + amd_iommu_set_invalid_ppr_cb(kfd->pdev, + iommu_invalid_ppr_cb); ++ ++ err = kfd_bind_processes_to_device(kfd); ++ if (err) ++ return -ENXIO; + } + + err = kfd->dqm->ops.start(kfd->dqm); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +index 5fb3c1e..6f01393 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +@@ -863,7 +863,6 @@ static int initialize_cpsch(struct device_queue_manager *dqm) + + static int start_cpsch(struct device_queue_manager *dqm) + { +- struct device_process_node *node; + int retval; + + BUG_ON(!dqm); +@@ -892,11 +891,6 @@ static int start_cpsch(struct device_queue_manager *dqm) + + init_interrupts(dqm); + +- list_for_each_entry(node, &dqm->queues, list) +- if (node->qpd->pqm->process && dqm->dev) +- kfd_bind_process_to_device(dqm->dev, +- node->qpd->pqm->process); +- + mutex_lock(&dqm->lock); + execute_queues_cpsch(dqm, false); + mutex_unlock(&dqm->lock); +@@ -911,9 +905,6 @@ static int start_cpsch(struct device_queue_manager *dqm) + + static int stop_cpsch(struct device_queue_manager *dqm) + { +- struct device_process_node *node; +- struct kfd_process_device *pdd; +- + BUG_ON(!dqm); + + mutex_lock(&dqm->lock); +@@ -922,10 +913,6 @@ static int stop_cpsch(struct device_queue_manager *dqm) + + mutex_unlock(&dqm->lock); + +- list_for_each_entry(node, &dqm->queues, list) { +- pdd = qpd_to_pdd(node->qpd); +- pdd->bound = false; +- } + kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); + pm_uninit(&dqm->packets); + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +index 1715c74..6727e4a 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +@@ -531,6 +531,12 @@ struct qcm_process_device { + #define GET_GPU_ID(handle) (handle >> 32) + #define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF) + ++enum kfd_pdd_bound { ++ PDD_UNBOUND = 0, ++ PDD_BOUND, ++ PDD_BOUND_SUSPENDED, ++}; ++ + /* Data that is per-process-per device. */ + struct kfd_process_device { + /* +@@ -564,7 +570,7 @@ struct kfd_process_device { + uint64_t sh_hidden_private_base_vmid; + + /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ +- bool bound; ++ enum kfd_pdd_bound bound; + + /* VM context for GPUVM allocations */ + void *vm; +@@ -658,8 +664,10 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); + struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); + + struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, +- struct kfd_process *p); +-void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid); ++ struct kfd_process *p); ++int kfd_bind_processes_to_device(struct kfd_dev *dev); ++void kfd_unbind_processes_from_device(struct kfd_dev *dev); ++void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid); + struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process *p); + struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +index 2c371cd..3b312b7 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +@@ -319,8 +319,13 @@ static void kfd_process_wq_release(struct work_struct *work) + pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", + pdd->dev->id, p->pasid); + +- if (pdd->dev->device_info->is_need_iommu_device) +- amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); ++ if (pdd->dev->device_info->is_need_iommu_device) { ++ if (pdd->bound == PDD_BOUND) { ++ amd_iommu_unbind_pasid(pdd->dev->pdev, ++ p->pasid); ++ pdd->bound = PDD_UNBOUND; ++ } ++ } + + /* + * Remove all handles from idr and release appropriate +@@ -616,9 +621,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) + if (pdd->dev == dev) +- break; ++ return pdd; + +- return pdd; ++ return NULL; + } + + struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, +@@ -636,6 +641,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + pdd->qpd.evicted = 0; + pdd->reset_wavefronts = false; + pdd->process = p; ++ pdd->bound = PDD_UNBOUND; + list_add(&pdd->per_device_list, &p->per_device_data); + + /* Init idr used for memory handle translation */ +@@ -672,21 +678,85 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + return ERR_PTR(-ENOMEM); + } + +- if (pdd->bound) ++ if (pdd->bound == PDD_BOUND) + return pdd; + ++ if (pdd->bound == PDD_BOUND_SUSPENDED) { ++ pr_err("kfd: binding PDD_BOUND_SUSPENDED pdd is unexpected!\n"); ++ return ERR_PTR(-EINVAL); ++ } ++ + if (dev->device_info->is_need_iommu_device) { + err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); + if (err < 0) + return ERR_PTR(err); + } + +- pdd->bound = true; ++ pdd->bound = PDD_BOUND; + + return pdd; + } + +-void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) ++int kfd_bind_processes_to_device(struct kfd_dev *dev) ++{ ++ struct kfd_process_device *pdd; ++ struct kfd_process *p; ++ unsigned int temp; ++ int err = 0; ++ ++ int idx = srcu_read_lock(&kfd_processes_srcu); ++ ++ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { ++ down_write(&p->lock); ++ pdd = kfd_get_process_device_data(dev, p); ++ if (pdd->bound != PDD_BOUND_SUSPENDED) { ++ up_write(&p->lock); ++ continue; ++ } ++ ++ err = amd_iommu_bind_pasid(dev->pdev, p->pasid, ++ p->lead_thread); ++ if (err < 0) { ++ pr_err("unexpected pasid %d binding failure\n", ++ p->pasid); ++ up_write(&p->lock); ++ break; ++ } ++ ++ pdd->bound = PDD_BOUND; ++ up_write(&p->lock); ++ } ++ ++ srcu_read_unlock(&kfd_processes_srcu, idx); ++ ++ return err; ++} ++ ++void kfd_unbind_processes_from_device(struct kfd_dev *dev) ++{ ++ struct kfd_process_device *pdd; ++ struct kfd_process *p; ++ unsigned int temp, temp_bound, temp_pasid; ++ ++ int idx = srcu_read_lock(&kfd_processes_srcu); ++ ++ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { ++ down_write(&p->lock); ++ pdd = kfd_get_process_device_data(dev, p); ++ temp_bound = pdd->bound; ++ temp_pasid = p->pasid; ++ if (pdd->bound == PDD_BOUND) ++ pdd->bound = PDD_BOUND_SUSPENDED; ++ up_write(&p->lock); ++ ++ if (temp_bound == PDD_BOUND) ++ amd_iommu_unbind_pasid(dev->pdev, temp_pasid); ++ } ++ ++ srcu_read_unlock(&kfd_processes_srcu, idx); ++} ++ ++void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) + { + struct kfd_process *p; + struct kfd_process_device *pdd; +@@ -722,16 +792,6 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) + pdd->reset_wavefronts = false; + } + +- /* +- * Just mark pdd as unbound, because we still need it +- * to call amd_iommu_unbind_pasid() in when the +- * process exits. +- * We don't call amd_iommu_unbind_pasid() here +- * because the IOMMU called us. +- */ +- if (pdd) +- pdd->bound = false; +- + up_write(&p->lock); + } + +-- +2.7.4 + |