diff options
Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.19.8/1276-drm-scheduler-Fix-bad-job-be-re-processed-in-TDR.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.19.8/1276-drm-scheduler-Fix-bad-job-be-re-processed-in-TDR.patch | 118 |
1 files changed, 118 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.19.8/1276-drm-scheduler-Fix-bad-job-be-re-processed-in-TDR.patch b/common/recipes-kernel/linux/linux-yocto-4.19.8/1276-drm-scheduler-Fix-bad-job-be-re-processed-in-TDR.patch new file mode 100644 index 00000000..6caacc06 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.19.8/1276-drm-scheduler-Fix-bad-job-be-re-processed-in-TDR.patch @@ -0,0 +1,118 @@ +From fd828b43d9e737642ba124345825ef0149d4d8de Mon Sep 17 00:00:00 2001 +From: Trigger Huang <Trigger.Huang@amd.com> +Date: Wed, 14 Nov 2018 10:44:50 +0800 +Subject: [PATCH 1276/2940] drm/scheduler: Fix bad job be re-processed in TDR +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +A bad job is the one triggered TDR(In the current amdgpu's +implementation, actually all the jobs in the current joq-queue will +be treated as bad jobs). In the recovery process, its fence +will be fake signaled and as a result, the work behind will be scheduled +to delete it from the mirror list, but if the TDR process is invoked +before the work's execution, then this bad job might be processed again +and the call dma_fence_set_error to its fence in TDR process will lead to +kernel warning trace: + +[ 143.033605] WARNING: CPU: 2 PID: 53 at ./include/linux/dma-fence.h:437 amddrm_sched_job_recovery+0x1af/0x1c0 [amd_sched] +kernel: [ 143.033606] Modules linked in: amdgpu(OE) amdchash(OE) amdttm(OE) amd_sched(OE) amdkcl(OE) amd_iommu_v2 drm_kms_helper drm i2c_algo_bit fb_sys_fops syscopyarea sysfillrect sysimgblt kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel aes_x86_64 snd_hda_codec_generic crypto_simd glue_helper cryptd snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq joydev snd_seq_device snd_timer snd soundcore binfmt_misc input_leds mac_hid serio_raw nfsd auth_rpcgss nfs_acl lockd grace sunrpc sch_fq_codel parport_pc ppdev lp parport ip_tables x_tables autofs4 8139too floppy psmouse 8139cp mii i2c_piix4 pata_acpi +[ 143.033649] CPU: 2 PID: 53 Comm: kworker/2:1 Tainted: G OE 4.15.0-20-generic #21-Ubuntu +[ 143.033650] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 +[ 143.033653] Workqueue: events drm_sched_job_timedout [amd_sched] +[ 143.033656] RIP: 0010:amddrm_sched_job_recovery+0x1af/0x1c0 [amd_sched] +[ 143.033657] RSP: 0018:ffffa9f880fe7d48 EFLAGS: 00010202 +[ 143.033659] RAX: 0000000000000007 RBX: ffff9b98f2b24c00 RCX: ffff9b98efef4f08 +[ 143.033660] RDX: ffff9b98f2b27400 RSI: ffff9b98f2b24c50 RDI: ffff9b98efef4f18 +[ 143.033660] RBP: ffffa9f880fe7d98 R08: 0000000000000001 R09: 00000000000002b6 +[ 143.033661] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9b98efef3430 +[ 143.033662] R13: ffff9b98efef4d80 R14: ffff9b98efef4e98 R15: ffff9b98eaf91c00 +[ 143.033663] FS: 0000000000000000(0000) GS:ffff9b98ffd00000(0000) knlGS:0000000000000000 +[ 143.033664] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 143.033665] CR2: 00007fc49c96d470 CR3: 000000001400a005 CR4: 00000000003606e0 +[ 143.033669] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[ 143.033669] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +[ 143.033670] Call Trace: +[ 143.033744] amdgpu_device_gpu_recover+0x144/0x820 [amdgpu] +[ 143.033788] amdgpu_job_timedout+0x9b/0xa0 [amdgpu] +[ 143.033791] drm_sched_job_timedout+0xcc/0x150 [amd_sched] +[ 143.033795] process_one_work+0x1de/0x410 +[ 143.033797] worker_thread+0x32/0x410 +[ 143.033799] kthread+0x121/0x140 +[ 143.033801] ? process_one_work+0x410/0x410 +[ 143.033803] ? kthread_create_worker_on_cpu+0x70/0x70 +[ 143.033806] ret_from_fork+0x35/0x40 + +So just delete the bad job from mirror list directly + +Changes in v3: + - Add a helper function to delete the bad jobs from mirror list and call + it directly *before* the job's fence is signaled + +Changes in v2: + - delete the useless list node check + - also delete bad jobs in drm_sched_main because: + kthread_unpark(ring->sched.thread) will be invoked very early before + amdgpu_device_gpu_recover's return, then drm_sched_main will have + chance to pick up a new job from the job queue. This new job will be + added into the mirror list and processed by amdgpu_job_run, but may + not be deleted from the mirror list on time due to the same reason. + And finally re-processed by drm_sched_job_recovery + +Signed-off-by: Trigger Huang <Trigger.Huang@amd.com> +Reviewed-by: Christian König <chrstian.koenig@amd.com> +Signed-off-by: Alex Deucher <alexander.deucher@amd.com> +--- + drivers/gpu/drm/scheduler/sched_main.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c +index e944cdec9e2d..ac2a35cec00c 100644 +--- a/drivers/gpu/drm/scheduler/sched_main.c ++++ b/drivers/gpu/drm/scheduler/sched_main.c +@@ -60,6 +60,8 @@ + + static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb); + ++static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job); ++ + /** + * drm_sched_rq_init - initialize a given run queue struct + * +@@ -285,7 +287,7 @@ static void drm_sched_job_finish(struct work_struct *work) + + spin_lock_irqsave(&sched->job_list_lock, flags); + /* remove job from ring_mirror_list */ +- list_del(&s_job->node); ++ list_del_init(&s_job->node); + /* queue TDR for next job */ + drm_sched_start_timeout(sched); + spin_unlock_irqrestore(&sched->job_list_lock, flags); +@@ -687,6 +689,8 @@ static int drm_sched_main(void *param) + r); + dma_fence_put(fence); + } else { ++ if (s_fence->finished.error < 0) ++ drm_sched_expel_job_unlocked(s_job); + drm_sched_process_job(NULL, &s_fence->cb); + } + +@@ -695,6 +699,15 @@ static int drm_sched_main(void *param) + return 0; + } + ++static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job) ++{ ++ struct drm_gpu_scheduler *sched = s_job->sched; ++ ++ spin_lock(&sched->job_list_lock); ++ list_del_init(&s_job->node); ++ spin_unlock(&sched->job_list_lock); ++} ++ + /** + * drm_sched_init - Init a gpu scheduler instance + * +-- +2.17.1 + |