Hi, Aureau
Thank you for the fix.
On 4/5/24 08:38, devel-request(a)lists.crash-utility.osci.io wrote:
Date: Thu, 4 Apr 2024 14:39:06 +0000
From: "Aureau, Georges (Kernel Tools ERT)"<georges.aureau(a)hpe.com>
Subject: [Crash-utility] [PATCH] x86_64: rhel 9.3: "crash" doesn't
handle all IRQ exception properly.
To:"devel@lists.crash-utility.osci.io"
<devel(a)lists.crash-utility.osci.io>
Message-ID: <SJ0PR84MB14821A83D9631672B4F0F9449F3C2(a)SJ0PR84MB1482.NAMP
RD84.PROD.OUTLOOK.COM>
Content-Type: text/plain; charset="us-ascii"
Hello,
On x86_64, with rhel 9.3, there are cases where "crash" can't handle IRQ
exception frames properly.
For example, "crash" fails with "WARNING: possibly bogus exception
frame":
crash> bt -c 30
PID: 2898241 TASK: ff4cb0ce0da0c680 CPU: 30 COMMAND: "star-ccm+"
#0 [fffffe4658d88e58] crash_nmi_callback at ffffffffa00675e8
#1 [fffffe4658d88e68] nmi_handle at ffffffffa002ebab
#2 [fffffe4658d88eb0] default_do_nmi at ffffffffa0c56cd0
#3 [fffffe4658d88ed0] exc_nmi at ffffffffa0c56ee1
#4 [fffffe4658d88ef0] end_repeat_nmi at ffffffffa0e01639
[exception RIP: native_queued_spin_lock_slowpath+636]
RIP: ffffffffa0c6b80c RSP: ff5eba269937ce10 RFLAGS: 00000046
RAX: 0000000000000000 RBX: ff4cb12bcfbb2940 RCX: 00000000007c0000
RDX: 0000000000000057 RSI: 0000000001600000 RDI: ff4cb12d67205608
RBP: ff4cb12d67205608 R8: ff90b9c682475940 R9: 0000000000000000
R10: ff4cb0d70ee4e100 R11: 0000000000000000 R12: 0000000000000000
R13: 000000000000001e R14: ff90b9c682474918 R15: ff90b9c682477120
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
--- <NMI exception stack> ---
#5 [ff5eba269937ce10] native_queued_spin_lock_slowpath at ffffffffa0c6b80c
#6 [ff5eba269937ce30] _raw_spin_lock_irqsave at ffffffffa0c6ad90
#7 [ff5eba269937ce40] free_iova at ffffffffa07df716
#8 [ff5eba269937ce68] fq_ring_free at ffffffffa07dba15
#9 [ff5eba269937ce98] fq_flush_timeout at ffffffffa07dc8fe
#10 [ff5eba269937ced0] call_timer_fn at ffffffffa01c00a4
#11 [ff5eba269937cef0] __run_timers at ffffffffa01c03ae
#12 [ff5eba269937cf88] run_timer_softirq at ffffffffa01c0476
#13 [ff5eba269937cf90] __do_softirq at ffffffffa0c6c007
#14 [ff5eba269937cfe0] __irq_exit_rcu at ffffffffa010ef61
#15 [ff5eba269937cff0] sysvec_apic_timer_interrupt at ffffffffa0c58ca2
--- <IRQ stack> ---
RIP: 0000000000000010 RSP: 0000000000000018 RFLAGS: ff5eba26ddc9f7e8
RAX: 0000000000000a20 RBX: ff5eba26ddc9f940 RCX: 0000000000001000
RDX: ffffffb559980000 RSI: ff4cb12d67207400 RDI: ffffffffffffffff
RBP: 0000000000001000 R8: ff5eba26ddc9f940 R9: ff5eba26ddc9f8af
R10: 0000000000000003 R11: 0000000000000a20 R12: ff5eba26ddc9f8b0
R13: 000000283c07f000 R14: ff4cb0f5a29a1c00 R15: 0000000000000001
ORIG_RAX: ffffffffa07c4e60 CS: 0206 SS: 7000001cf0380001
bt: WARNING: possibly bogus exception frame
Running "crash" with "--machdep irq_eframe_link=0xffffffffffffffe8"
(ie. thus irq_eframe_link = -24) works properly:
PID: 2898241 TASK: ff4cb0ce0da0c680 CPU: 30 COMMAND: "star-ccm+"
#0 [fffffe4658d88e58] crash_nmi_callback at ffffffffa00675e8
#1 [fffffe4658d88e68] nmi_handle at ffffffffa002ebab
#2 [fffffe4658d88eb0] default_do_nmi at ffffffffa0c56cd0
#3 [fffffe4658d88ed0] exc_nmi at ffffffffa0c56ee1
#4 [fffffe4658d88ef0] end_repeat_nmi at ffffffffa0e01639
[exception RIP: native_queued_spin_lock_slowpath+636]
RIP: ffffffffa0c6b80c RSP: ff5eba269937ce10 RFLAGS: 00000046
RAX: 0000000000000000 RBX: ff4cb12bcfbb2940 RCX: 00000000007c0000
RDX: 0000000000000057 RSI: 0000000001600000 RDI: ff4cb12d67205608
RBP: ff4cb12d67205608 R8: ff90b9c682475940 R9: 0000000000000000
R10: ff4cb0d70ee4e100 R11: 0000000000000000 R12: 0000000000000000
R13: 000000000000001e R14: ff90b9c682474918 R15: ff90b9c682477120
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
--- <NMI exception stack> ---
#5 [ff5eba269937ce10] native_queued_spin_lock_slowpath at ffffffffa0c6b80c
#6 [ff5eba269937ce30] _raw_spin_lock_irqsave at ffffffffa0c6ad90
#7 [ff5eba269937ce40] free_iova at ffffffffa07df716
#8 [ff5eba269937ce68] fq_ring_free at ffffffffa07dba15
#9 [ff5eba269937ce98] fq_flush_timeout at ffffffffa07dc8fe
#10 [ff5eba269937ced0] call_timer_fn at ffffffffa01c00a4
#11 [ff5eba269937cef0] __run_timers at ffffffffa01c03ae
#12 [ff5eba269937cf88] run_timer_softirq at ffffffffa01c0476
#13 [ff5eba269937cf90] __do_softirq at ffffffffa0c6c007
#14 [ff5eba269937cfe0] __irq_exit_rcu at ffffffffa010ef61
#15 [ff5eba269937cff0] sysvec_apic_timer_interrupt at ffffffffa0c58ca2
--- <IRQ stack> ---
#16 [ff5eba26ddc9f738] asm_sysvec_apic_timer_interrupt at ffffffffa0e00e06
[exception RIP: alloc_pte.constprop.0+32]
RIP: ffffffffa07c4e60 RSP: ff5eba26ddc9f7e8 RFLAGS: 00000206
RAX: ff5eba26ddc9f940 RBX: 0000000000001000 RCX: 0000000000000a20
RDX: 0000000000001000 RSI: ffffffb559980000 RDI: ff4cb12d67207400
RBP: ff5eba26ddc9f8b0 R8: ff5eba26ddc9f8af R9: 0000000000000003
R10: 0000000000000a20 R11: ff5eba26ddc9f940 R12: 000000283c07f000
R13: ff4cb0f5a29a1c00 R14: 0000000000000001 R15: ff4cb0f5a29a1bf8
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#17 [ff5eba26ddc9f830] iommu_v1_map_pages at ffffffffa07c5648
#18 [ff5eba26ddc9f8f8] __iommu_map at ffffffffa07d7803
#19 [ff5eba26ddc9f990] iommu_map_sg at ffffffffa07d7b71
#20 [ff5eba26ddc9f9f0] iommu_dma_map_sg at ffffffffa07ddcc9
#21 [ff5eba26ddc9fa90] __dma_map_sg_attrs at ffffffffa01b5205
#22 [ff5eba26ddc9fa98] dma_map_sgtable at ffffffffa01b526d
#23 [ff5eba26ddc9faa8] ib_umem_get at ffffffffc112f308 [ib_uverbs]
#24 [ff5eba26ddc9fb00] mlx5_ib_reg_user_mr at ffffffffc11eb991 [mlx5_ib]
#25 [ff5eba26ddc9fb40] ib_uverbs_reg_mr at ffffffffc1123bc4 [ib_uverbs]
#26 [ff5eba26ddc9fbb0] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE at ffffffffc112d064
[ib_uverbs]
#27 [ff5eba26ddc9fbe0] ib_uverbs_run_method at ffffffffc112a121 [ib_uverbs]
#28 [ff5eba26ddc9fc30] ib_uverbs_cmd_verbs at ffffffffc112a332 [ib_uverbs]
#29 [ff5eba26ddc9fe68] ib_uverbs_ioctl at ffffffffc112a494 [ib_uverbs]
#30 [ff5eba26ddc9fea8] __x64_sys_ioctl at ffffffffa0438f67
#31 [ff5eba26ddc9fed8] do_syscall_64 at ffffffffa0c55189
#32 [ff5eba26ddc9ff50] entry_SYSCALL_64_after_hwframe at ffffffffa0e000ea
RIP: 0000146ef3a3ec6b RSP: 00007ffe3b7dc5b8 RFLAGS: 00000246
RAX: ffffffffffffffda RBX: 00007ffe3b7dc6a8 RCX: 0000146ef3a3ec6b
RDX: 00007ffe3b7dc690 RSI: 00000000c0181b01 RDI: 0000000000000011
RBP: 00007ffe3b7dc670 R8: 0000146ed9549010 R9: 00007ffe3b7dc7c4
R10: 0000000000000009 R11: 0000000000000246 R12: 00007ffe3b7dc7c4
R13: 000000000000000c R14: 000000000000000c R15: 0000146ed9549150
ORIG_RAX: 0000000000000010 CS: 0033 SS: 002b
Some background:
asm_common_interrupt:
callq error_entry
movq %rax,%rsp
movq %rsp,%rdi
movq 0x78(%rsp),%rsi
movq $-0x1,0x78(%rsp)
call common_interrupt # rsp pointing to regs
common_interrupt
pushq %r12
pushq %rbp
pushq %rbx
[...]
movq hardirq_stack_ptr,%r11
movq %rsp,(%r11)
movq %r11,%rsp
[...]
call __common_interrupt # rip:common_interrupt
So frame_size(rip:common_interrupt) = 32 (3 push + ret).
Hence "machdep->machspec->irq_eframe_link = -32;" (see
x86_64_irq_eframe_link_init()).
Now:
asm_sysvec_apic_timer_interrupt:
pushq $-0x1
callq error_entry
movq %rax,%rsp
movq %rsp,%rdi
callq sysvec_apic_timer_interrupt
sysvec_apic_timer_interrupt
pushq %r12
pushq %rbp
[...]
movq hardirq_stack_ptr,%r11
movq %rsp,(%r11)
movq %r11,%rsp
[...]
call __sysvec_apic_timer_interrupt # rip:sysvec_apic_timer_interrupt
Here frame_size(rip:sysvec_apic_timer_interrupt) = 24 (2 push + ret)
We should also notice that:
rip = *(hardirq_stack_ptr - 8)
rsp = *(hardirq_stack_ptr)
regs = rsp - frame_size(rip)
But x86_64_get_framesize() does not work with IRQ handlers (returns 0).
So not many options other than hardcoding the most likely value and looking around it.
Actually x86_64_irq_eframe_link() was trying -32 (default), and then -40, but not -24 !
Signed-off-by: Georges Aureau<georges.aureau(a)hpe.com>
--
diff --git a/x86_64.c b/x86_64.c
index aec82b0..90c2a91 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -6623,13 +6623,14 @@ x86_64_irq_eframe_link_init(void)
/*
* Calculate and verify the IRQ exception frame location from the
- * stack reference at the top of the IRQ stack, possibly adjusting
- * the ms->irq_eframe_link value.
+ * stack reference at the top of the IRQ stack, keep ms->irq_eframe_link
+ * as the most likely value, and try a few sizes around it.
*/
static ulong
x86_64_irq_eframe_link(ulong stkref, struct bt_info *bt, FILE *ofp)
{
ulong irq_eframe;
+ int i, try[] = { 8, -8, 16, -16 };
if (x86_64_exception_frame(EFRAME_VERIFY, stkref, 0, bt, ofp))
return stkref;
@@ -6639,9 +6640,10 @@ x86_64_irq_eframe_link(ulong stkref, struct bt_info *bt, FILE
*ofp)
if (x86_64_exception_frame(EFRAME_VERIFY, irq_eframe, 0, bt, ofp))
return irq_eframe;
- if (x86_64_exception_frame(EFRAME_VERIFY, irq_eframe+8, 0, bt, ofp)) {
- machdep->machspec->irq_eframe_link -= 8;
- return (irq_eframe + 8);
+ for (i=0; i<sizeof(try)/sizeof(int); i++) {
+ if (x86_64_exception_frame(EFRAME_VERIFY, irq_eframe+try[i], 0, bt, ofp))
{
+ return (irq_eframe + try[i]);
+ }
}
return irq_eframe;
Can you help to try the following changes? It could be good to check the
relevant symbol, but I'm not sure if this can work well for you and
cover some corner cases.
diff --git a/x86_64.c b/x86_64.c
index aec82b03b62e..251e224c013b 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -6574,6 +6574,8 @@ x86_64_irq_eframe_link_init(void)
if (symbol_exists("asm_common_interrupt")) {
if (symbol_exists("asm_call_on_stack"))
machdep->machspec->irq_eframe_link = -64;
+ else if (symbol_exists("sysvec_apic_timer_interrupt"))
+ machdep->machspec->irq_eframe_link = -24;
else
machdep->machspec->irq_eframe_link = -32;
return;
Thanks
Lianbo