From 284b4b30d1d110d5f11a861981729b0f26e38a73 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Mon, 30 May 2016 20:47:17 -0400 Subject: [PATCH 1144/4131] drm/amdkfd: Fix suspend/resume issue on Carrizo When we do suspend/resume through "sudo pm-suspend" while there is HSA activity running, upon resume we will encounter HWS hanging, which is caused by memory read/write failures. The root cause is that when suspend, we neglected to unbind pasid from kfd device. Another major change is that the bind/unbinding is changed to be performed on a per process basis, instead of whether there are queues in dqm. There are some other small changes as well. Change-Id: If9ac972fc4309b688f6c1d07e27cede54814410e Signed-off-by: Yong Zhao Conflicts: drivers/gpu/drm/amd/amdkfd/kfd_process.c --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 26 +++--- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 13 --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 14 +++- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 94 ++++++++++++++++++---- 4 files changed, 105 insertions(+), 42 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 8bf209d..aae4e5a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -258,7 +258,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); if (dev) - kfd_unbind_process_from_device(dev, pasid); + kfd_process_iommu_unbind_callback(dev, pasid); } /* @@ -488,14 +488,18 @@ void kgd2kfd_suspend(struct kfd_dev *kfd) { BUG_ON(kfd == NULL); - if (kfd->init_complete) { - kfd->dqm->ops.stop(kfd->dqm); - if (kfd->device_info->is_need_iommu_device) { - amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); - amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); - amd_iommu_free_device(kfd->pdev); - } - } + if (!kfd->init_complete) + return; + + kfd->dqm->ops.stop(kfd->dqm); + if (!kfd->device_info->is_need_iommu_device) + return; + + kfd_unbind_processes_from_device(kfd); + + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); + amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); + amd_iommu_free_device(kfd->pdev); } int kgd2kfd_evict_bo(struct kfd_dev *dev, void *mem) @@ -533,6 +537,10 @@ static int kfd_resume(struct kfd_dev *kfd) iommu_pasid_shutdown_callback); amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb); + + err = kfd_bind_processes_to_device(kfd); + if (err) + return -ENXIO; } err = kfd->dqm->ops.start(kfd->dqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 5fb3c1e..6f01393 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -863,7 +863,6 @@ static int initialize_cpsch(struct device_queue_manager *dqm) static int start_cpsch(struct device_queue_manager *dqm) { - struct device_process_node *node; int retval; BUG_ON(!dqm); @@ -892,11 +891,6 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); - list_for_each_entry(node, &dqm->queues, list) - if (node->qpd->pqm->process && dqm->dev) - kfd_bind_process_to_device(dqm->dev, - node->qpd->pqm->process); - mutex_lock(&dqm->lock); execute_queues_cpsch(dqm, false); mutex_unlock(&dqm->lock); @@ -911,9 +905,6 @@ static int start_cpsch(struct device_queue_manager *dqm) static int stop_cpsch(struct device_queue_manager *dqm) { - struct device_process_node *node; - struct kfd_process_device *pdd; - BUG_ON(!dqm); mutex_lock(&dqm->lock); @@ -922,10 +913,6 @@ static int stop_cpsch(struct device_queue_manager *dqm) mutex_unlock(&dqm->lock); - list_for_each_entry(node, &dqm->queues, list) { - pdd = qpd_to_pdd(node->qpd); - pdd->bound = false; - } kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); pm_uninit(&dqm->packets); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index e3335db..bfce9a9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -531,6 +531,12 @@ struct qcm_process_device { #define GET_GPU_ID(handle) (handle >> 32) #define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF) +enum kfd_pdd_bound { + PDD_UNBOUND = 0, + PDD_BOUND, + PDD_BOUND_SUSPENDED, +}; + /* Data that is per-process-per device. */ struct kfd_process_device { /* @@ -564,7 +570,7 @@ struct kfd_process_device { uint64_t sh_hidden_private_base_vmid; /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ - bool bound; + enum kfd_pdd_bound bound; /* VM context for GPUVM allocations */ void *vm; @@ -658,8 +664,10 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, - struct kfd_process *p); -void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid); + struct kfd_process *p); +int kfd_bind_processes_to_device(struct kfd_dev *dev); +void kfd_unbind_processes_from_device(struct kfd_dev *dev); +void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid); struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, struct kfd_process *p); struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 2c371cd..3b312b7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -319,8 +319,13 @@ static void kfd_process_wq_release(struct work_struct *work) pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", pdd->dev->id, p->pasid); - if (pdd->dev->device_info->is_need_iommu_device) - amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); + if (pdd->dev->device_info->is_need_iommu_device) { + if (pdd->bound == PDD_BOUND) { + amd_iommu_unbind_pasid(pdd->dev->pdev, + p->pasid); + pdd->bound = PDD_UNBOUND; + } + } /* * Remove all handles from idr and release appropriate @@ -616,9 +621,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, list_for_each_entry(pdd, &p->per_device_data, per_device_list) if (pdd->dev == dev) - break; + return pdd; - return pdd; + return NULL; } struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, @@ -636,6 +641,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, pdd->qpd.evicted = 0; pdd->reset_wavefronts = false; pdd->process = p; + pdd->bound = PDD_UNBOUND; list_add(&pdd->per_device_list, &p->per_device_data); /* Init idr used for memory handle translation */ @@ -672,21 +678,85 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, return ERR_PTR(-ENOMEM); } - if (pdd->bound) + if (pdd->bound == PDD_BOUND) return pdd; + if (pdd->bound == PDD_BOUND_SUSPENDED) { + pr_err("kfd: binding PDD_BOUND_SUSPENDED pdd is unexpected!\n"); + return ERR_PTR(-EINVAL); + } + if (dev->device_info->is_need_iommu_device) { err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); if (err < 0) return ERR_PTR(err); } - pdd->bound = true; + pdd->bound = PDD_BOUND; return pdd; } -void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) +int kfd_bind_processes_to_device(struct kfd_dev *dev) +{ + struct kfd_process_device *pdd; + struct kfd_process *p; + unsigned int temp; + int err = 0; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + down_write(&p->lock); + pdd = kfd_get_process_device_data(dev, p); + if (pdd->bound != PDD_BOUND_SUSPENDED) { + up_write(&p->lock); + continue; + } + + err = amd_iommu_bind_pasid(dev->pdev, p->pasid, + p->lead_thread); + if (err < 0) { + pr_err("unexpected pasid %d binding failure\n", + p->pasid); + up_write(&p->lock); + break; + } + + pdd->bound = PDD_BOUND; + up_write(&p->lock); + } + + srcu_read_unlock(&kfd_processes_srcu, idx); + + return err; +} + +void kfd_unbind_processes_from_device(struct kfd_dev *dev) +{ + struct kfd_process_device *pdd; + struct kfd_process *p; + unsigned int temp, temp_bound, temp_pasid; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + down_write(&p->lock); + pdd = kfd_get_process_device_data(dev, p); + temp_bound = pdd->bound; + temp_pasid = p->pasid; + if (pdd->bound == PDD_BOUND) + pdd->bound = PDD_BOUND_SUSPENDED; + up_write(&p->lock); + + if (temp_bound == PDD_BOUND) + amd_iommu_unbind_pasid(dev->pdev, temp_pasid); + } + + srcu_read_unlock(&kfd_processes_srcu, idx); +} + +void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) { struct kfd_process *p; struct kfd_process_device *pdd; @@ -722,16 +792,6 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) pdd->reset_wavefronts = false; } - /* - * Just mark pdd as unbound, because we still need it - * to call amd_iommu_unbind_pasid() in when the - * process exits. - * We don't call amd_iommu_unbind_pasid() here - * because the IOMMU called us. - */ - if (pdd) - pdd->bound = false; - up_write(&p->lock); } -- 2.7.4