Hello,
On x86_64, with rhel 9.3, there are cases where "crash" can't handle IRQ
exception frames properly.
For example, "crash" fails with "WARNING: possibly bogus exception
frame":
crash> bt -c 30
PID: 2898241 TASK: ff4cb0ce0da0c680 CPU: 30 COMMAND: "star-ccm+"
#0 [fffffe4658d88e58] crash_nmi_callback at ffffffffa00675e8
#1 [fffffe4658d88e68] nmi_handle at ffffffffa002ebab
#2 [fffffe4658d88eb0] default_do_nmi at ffffffffa0c56cd0
#3 [fffffe4658d88ed0] exc_nmi at ffffffffa0c56ee1
#4 [fffffe4658d88ef0] end_repeat_nmi at ffffffffa0e01639
[exception RIP: native_queued_spin_lock_slowpath+636]
RIP: ffffffffa0c6b80c RSP: ff5eba269937ce10 RFLAGS: 00000046
RAX: 0000000000000000 RBX: ff4cb12bcfbb2940 RCX: 00000000007c0000
RDX: 0000000000000057 RSI: 0000000001600000 RDI: ff4cb12d67205608
RBP: ff4cb12d67205608 R8: ff90b9c682475940 R9: 0000000000000000
R10: ff4cb0d70ee4e100 R11: 0000000000000000 R12: 0000000000000000
R13: 000000000000001e R14: ff90b9c682474918 R15: ff90b9c682477120
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
--- <NMI exception stack> ---
#5 [ff5eba269937ce10] native_queued_spin_lock_slowpath at ffffffffa0c6b80c
#6 [ff5eba269937ce30] _raw_spin_lock_irqsave at ffffffffa0c6ad90
#7 [ff5eba269937ce40] free_iova at ffffffffa07df716
#8 [ff5eba269937ce68] fq_ring_free at ffffffffa07dba15
#9 [ff5eba269937ce98] fq_flush_timeout at ffffffffa07dc8fe
#10 [ff5eba269937ced0] call_timer_fn at ffffffffa01c00a4
#11 [ff5eba269937cef0] __run_timers at ffffffffa01c03ae
#12 [ff5eba269937cf88] run_timer_softirq at ffffffffa01c0476
#13 [ff5eba269937cf90] __do_softirq at ffffffffa0c6c007
#14 [ff5eba269937cfe0] __irq_exit_rcu at ffffffffa010ef61
#15 [ff5eba269937cff0] sysvec_apic_timer_interrupt at ffffffffa0c58ca2
--- <IRQ stack> ---
RIP: 0000000000000010 RSP: 0000000000000018 RFLAGS: ff5eba26ddc9f7e8
RAX: 0000000000000a20 RBX: ff5eba26ddc9f940 RCX: 0000000000001000
RDX: ffffffb559980000 RSI: ff4cb12d67207400 RDI: ffffffffffffffff
RBP: 0000000000001000 R8: ff5eba26ddc9f940 R9: ff5eba26ddc9f8af
R10: 0000000000000003 R11: 0000000000000a20 R12: ff5eba26ddc9f8b0
R13: 000000283c07f000 R14: ff4cb0f5a29a1c00 R15: 0000000000000001
ORIG_RAX: ffffffffa07c4e60 CS: 0206 SS: 7000001cf0380001
bt: WARNING: possibly bogus exception frame
Running "crash" with "--machdep irq_eframe_link=0xffffffffffffffe8"
(ie. thus irq_eframe_link = -24) works properly:
PID: 2898241 TASK: ff4cb0ce0da0c680 CPU: 30 COMMAND: "star-ccm+"
#0 [fffffe4658d88e58] crash_nmi_callback at ffffffffa00675e8
#1 [fffffe4658d88e68] nmi_handle at ffffffffa002ebab
#2 [fffffe4658d88eb0] default_do_nmi at ffffffffa0c56cd0
#3 [fffffe4658d88ed0] exc_nmi at ffffffffa0c56ee1
#4 [fffffe4658d88ef0] end_repeat_nmi at ffffffffa0e01639
[exception RIP: native_queued_spin_lock_slowpath+636]
RIP: ffffffffa0c6b80c RSP: ff5eba269937ce10 RFLAGS: 00000046
RAX: 0000000000000000 RBX: ff4cb12bcfbb2940 RCX: 00000000007c0000
RDX: 0000000000000057 RSI: 0000000001600000 RDI: ff4cb12d67205608
RBP: ff4cb12d67205608 R8: ff90b9c682475940 R9: 0000000000000000
R10: ff4cb0d70ee4e100 R11: 0000000000000000 R12: 0000000000000000
R13: 000000000000001e R14: ff90b9c682474918 R15: ff90b9c682477120
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
--- <NMI exception stack> ---
#5 [ff5eba269937ce10] native_queued_spin_lock_slowpath at ffffffffa0c6b80c
#6 [ff5eba269937ce30] _raw_spin_lock_irqsave at ffffffffa0c6ad90
#7 [ff5eba269937ce40] free_iova at ffffffffa07df716
#8 [ff5eba269937ce68] fq_ring_free at ffffffffa07dba15
#9 [ff5eba269937ce98] fq_flush_timeout at ffffffffa07dc8fe
#10 [ff5eba269937ced0] call_timer_fn at ffffffffa01c00a4
#11 [ff5eba269937cef0] __run_timers at ffffffffa01c03ae
#12 [ff5eba269937cf88] run_timer_softirq at ffffffffa01c0476
#13 [ff5eba269937cf90] __do_softirq at ffffffffa0c6c007
#14 [ff5eba269937cfe0] __irq_exit_rcu at ffffffffa010ef61
#15 [ff5eba269937cff0] sysvec_apic_timer_interrupt at ffffffffa0c58ca2
--- <IRQ stack> ---
#16 [ff5eba26ddc9f738] asm_sysvec_apic_timer_interrupt at ffffffffa0e00e06
[exception RIP: alloc_pte.constprop.0+32]
RIP: ffffffffa07c4e60 RSP: ff5eba26ddc9f7e8 RFLAGS: 00000206
RAX: ff5eba26ddc9f940 RBX: 0000000000001000 RCX: 0000000000000a20
RDX: 0000000000001000 RSI: ffffffb559980000 RDI: ff4cb12d67207400
RBP: ff5eba26ddc9f8b0 R8: ff5eba26ddc9f8af R9: 0000000000000003
R10: 0000000000000a20 R11: ff5eba26ddc9f940 R12: 000000283c07f000
R13: ff4cb0f5a29a1c00 R14: 0000000000000001 R15: ff4cb0f5a29a1bf8
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#17 [ff5eba26ddc9f830] iommu_v1_map_pages at ffffffffa07c5648
#18 [ff5eba26ddc9f8f8] __iommu_map at ffffffffa07d7803
#19 [ff5eba26ddc9f990] iommu_map_sg at ffffffffa07d7b71
#20 [ff5eba26ddc9f9f0] iommu_dma_map_sg at ffffffffa07ddcc9
#21 [ff5eba26ddc9fa90] __dma_map_sg_attrs at ffffffffa01b5205
#22 [ff5eba26ddc9fa98] dma_map_sgtable at ffffffffa01b526d
#23 [ff5eba26ddc9faa8] ib_umem_get at ffffffffc112f308 [ib_uverbs]
#24 [ff5eba26ddc9fb00] mlx5_ib_reg_user_mr at ffffffffc11eb991 [mlx5_ib]
#25 [ff5eba26ddc9fb40] ib_uverbs_reg_mr at ffffffffc1123bc4 [ib_uverbs]
#26 [ff5eba26ddc9fbb0] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE at ffffffffc112d064
[ib_uverbs]
#27 [ff5eba26ddc9fbe0] ib_uverbs_run_method at ffffffffc112a121 [ib_uverbs]
#28 [ff5eba26ddc9fc30] ib_uverbs_cmd_verbs at ffffffffc112a332 [ib_uverbs]
#29 [ff5eba26ddc9fe68] ib_uverbs_ioctl at ffffffffc112a494 [ib_uverbs]
#30 [ff5eba26ddc9fea8] __x64_sys_ioctl at ffffffffa0438f67
#31 [ff5eba26ddc9fed8] do_syscall_64 at ffffffffa0c55189
#32 [ff5eba26ddc9ff50] entry_SYSCALL_64_after_hwframe at ffffffffa0e000ea
RIP: 0000146ef3a3ec6b RSP: 00007ffe3b7dc5b8 RFLAGS: 00000246
RAX: ffffffffffffffda RBX: 00007ffe3b7dc6a8 RCX: 0000146ef3a3ec6b
RDX: 00007ffe3b7dc690 RSI: 00000000c0181b01 RDI: 0000000000000011
RBP: 00007ffe3b7dc670 R8: 0000146ed9549010 R9: 00007ffe3b7dc7c4
R10: 0000000000000009 R11: 0000000000000246 R12: 00007ffe3b7dc7c4
R13: 000000000000000c R14: 000000000000000c R15: 0000146ed9549150
ORIG_RAX: 0000000000000010 CS: 0033 SS: 002b
Some background:
asm_common_interrupt:
callq error_entry
movq %rax,%rsp
movq %rsp,%rdi
movq 0x78(%rsp),%rsi
movq $-0x1,0x78(%rsp)
call common_interrupt # rsp pointing to regs
common_interrupt
pushq %r12
pushq %rbp
pushq %rbx
[...]
movq hardirq_stack_ptr,%r11
movq %rsp,(%r11)
movq %r11,%rsp
[...]
call __common_interrupt # rip:common_interrupt
So frame_size(rip:common_interrupt) = 32 (3 push + ret).
Hence "machdep->machspec->irq_eframe_link = -32;" (see
x86_64_irq_eframe_link_init()).
Now:
asm_sysvec_apic_timer_interrupt:
pushq $-0x1
callq error_entry
movq %rax,%rsp
movq %rsp,%rdi
callq sysvec_apic_timer_interrupt
sysvec_apic_timer_interrupt
pushq %r12
pushq %rbp
[...]
movq hardirq_stack_ptr,%r11
movq %rsp,(%r11)
movq %r11,%rsp
[...]
call __sysvec_apic_timer_interrupt # rip:sysvec_apic_timer_interrupt
Here frame_size(rip:sysvec_apic_timer_interrupt) = 24 (2 push + ret)
We should also notice that:
rip = *(hardirq_stack_ptr - 8)
rsp = *(hardirq_stack_ptr)
regs = rsp - frame_size(rip)
But x86_64_get_framesize() does not work with IRQ handlers (returns 0).
So not many options other than hardcoding the most likely value and looking around it.
Actually x86_64_irq_eframe_link() was trying -32 (default), and then -40, but not -24 !
Signed-off-by: Georges Aureau <georges.aureau(a)hpe.com>
--
diff --git a/x86_64.c b/x86_64.c
index aec82b0..90c2a91 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -6623,13 +6623,14 @@ x86_64_irq_eframe_link_init(void)
/*
* Calculate and verify the IRQ exception frame location from the
- * stack reference at the top of the IRQ stack, possibly adjusting
- * the ms->irq_eframe_link value.
+ * stack reference at the top of the IRQ stack, keep ms->irq_eframe_link
+ * as the most likely value, and try a few sizes around it.
*/
static ulong
x86_64_irq_eframe_link(ulong stkref, struct bt_info *bt, FILE *ofp)
{
ulong irq_eframe;
+ int i, try[] = { 8, -8, 16, -16 };
if (x86_64_exception_frame(EFRAME_VERIFY, stkref, 0, bt, ofp))
return stkref;
@@ -6639,9 +6640,10 @@ x86_64_irq_eframe_link(ulong stkref, struct bt_info *bt, FILE
*ofp)
if (x86_64_exception_frame(EFRAME_VERIFY, irq_eframe, 0, bt, ofp))
return irq_eframe;
- if (x86_64_exception_frame(EFRAME_VERIFY, irq_eframe+8, 0, bt, ofp)) {
- machdep->machspec->irq_eframe_link -= 8;
- return (irq_eframe + 8);
+ for (i=0; i<sizeof(try)/sizeof(int); i++) {
+ if (x86_64_exception_frame(EFRAME_VERIFY, irq_eframe+try[i], 0, bt, ofp))
{
+ return (irq_eframe + try[i]);
+ }
}
return irq_eframe;