1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
From 1f8443941d79b921e4860e06bf2ed87ca96e740c Mon Sep 17 00:00:00 2001
From: pding <Pixel.Ding@amd.com>
Date: Mon, 23 Oct 2017 17:22:09 +0800
Subject: [PATCH 2078/4131] drm/amdgpu: retry init if it fails due to exclusive
mode timeout (v3)
Exclusive mode has a real-time limit in practice, e.g. it must be
completed within 300ms. Hitting this limit is easily observed when running
many VFs/VMs on a single host under a heavy CPU workload.
If init fails due to an exclusive mode timeout, retry it (up to 3 times, sleeping 5 seconds between attempts).
v2:
- rewrite the condition for readable value.
v3:
- fix typo, add comments for sleep
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: pding <Pixel.Ding@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 ++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 15 +++++++++++++--
2 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3e45e93..a8f64f82 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2326,6 +2326,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
r = amdgpu_init(adev);
if (r) {
+ /* failed in exclusive mode due to timeout */
+ if (amdgpu_sriov_vf(adev) &&
+ !amdgpu_sriov_runtime(adev) &&
+ amdgpu_virt_mmio_blocked(adev) &&
+ !amdgpu_virt_wait_reset(adev)) {
+ dev_err(adev->dev, "VF exclusive mode timeout\n");
+ r = -EAGAIN;
+ goto failed;
+ }
dev_err(adev->dev, "amdgpu_init failed\n");
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
amdgpu_fini(adev);
@@ -2413,6 +2422,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
amdgpu_vf_error_trans_all(adev);
if (runtime)
vga_switcheroo_fini_domain_pm_ops(adev->dev);
+
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 4a35d1b..9098d89 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -97,7 +97,7 @@ void amdgpu_driver_unload_kms(struct drm_device *dev)
int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
{
struct amdgpu_device *adev;
- int r, acpi_status;
+ int r, acpi_status, retry = 0;
#ifdef CONFIG_DRM_AMDGPU_SI
if (!amdgpu_si_support) {
@@ -130,6 +130,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
}
}
#endif
+retry_init:
adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
if (adev == NULL) {
@@ -156,7 +157,17 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
* VRAM allocation
*/
r = amdgpu_device_init(adev, dev, dev->pdev, flags);
- if (r) {
+ if (r == -EAGAIN && ++retry <= 3) {
+ adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
+ adev->virt.ops = NULL;
+ amdgpu_device_fini(adev);
+ kfree(adev);
+ dev->dev_private = NULL;
+ /* Don't request EX mode too frequently which is attacking */
+ msleep(5000);
+ dev_err(&dev->pdev->dev, "retry init %d\n", retry);
+ goto retry_init;
+ } else if (r) {
dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
goto out;
}
--
2.7.4
|