On Mon, Feb 6, 2023 at 2:19 PM HAGIO KAZUHITO(萩尾 一仁) <k-hagio-ab@nec.com> wrote:

On 2023/02/06 11:04, lijiang wrote:

>> I tried to hardcode and get the expected eframe information, but the backtrace is incorrect:

ok, it looks the expected one.  The irq_eframe_link value has been
hardcoded for a long time, it might be hard to calculate it in the
current implementation..

btw, what is the kernel version of your vmcore this case?


My kernel version is upstream kernel 5.7. 
For my side, the irq_eframe_link is set to -56 as below:

diff --git a/x86_64.c b/x86_64.c
index 7a5d6f050c89..61ecf8dbdb65 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -3938,6 +3938,11 @@ in_exception_stack:
         if (irq_eframe) {
                 bt->flags |= BT_EXCEPTION_FRAME;
                 i = (irq_eframe - bt->stackbase)/sizeof(ulong);
+               if (symbol_exists("asm_common_interrupt")) {
+                       i -= 1;
+                       up = (ulong *)(&bt->stackbuf[i*sizeof(ulong)]);
+                       bt->instptr = *up;
+               }
                 x86_64_print_stack_entry(bt, ofp, level, i, bt->instptr);
                 bt->flags &= ~(ulonglong)BT_EXCEPTION_FRAME;
                 cs = x86_64_exception_frame(EFRAME_PRINT|EFRAME_CS, 0,
@@ -6521,6 +6526,11 @@ x86_64_irq_eframe_link_init(void)
        else
                return;
 
+       if (symbol_exists("asm_common_interrupt")) {
+               machdep->machspec->irq_eframe_link = -56;
+               return;
+       }
+
        if (THIS_KERNEL_VERSION < LINUX(2,6,9))
                return;

And get the following results:
crash> bt
PID: 0        TASK: ffff9e7a47e32f00  CPU: 3    COMMAND: "swapper/3"
 #0 [ffffba7900118bb8] machine_kexec at ffffffff87e5c2c7
 #1 [ffffba7900118c08] __crash_kexec at ffffffff87f9500d
 #2 [ffffba7900118cd0] panic at ffffffff87edfff9
 #3 [ffffba7900118d50] sysrq_handle_crash at ffffffff883ce2c1
 #4 [ffffba7900118d58] __handle_sysrq.cold.15 at ffffffff883ceb56
 #5 [ffffba7900118d88] sysrq_filter at ffffffff883ce9a2
 #6 [ffffba7900118dc0] input_to_handler at ffffffff884fb0bf
 #7 [ffffba7900118df8] input_pass_values at ffffffff884fc1b7
 #8 [ffffba7900118e20] input_handle_event at ffffffff884fe278
 #9 [ffffba7900118e50] input_event at ffffffff884fe74b
#10 [ffffba7900118e88] atkbd_interrupt at ffffffff88504e2f
#11 [ffffba7900118ee0] serio_interrupt at ffffffff884f7516
#12 [ffffba7900118f10] i8042_interrupt at ffffffff884f8b04
#13 [ffffba7900118f50] __handle_irq_event_percpu at ffffffff87f51430
#14 [ffffba7900118f90] handle_irq_event_percpu at ffffffff87f51590
#15 [ffffba7900118fb8] handle_irq_event at ffffffff87f51616
#16 [ffffba7900118fd8] handle_edge_irq at ffffffff87f559f2
#17 [ffffba7900118ff0] asm_call_on_stack at ffffffff88800fa2
--- <IRQ stack> ---
#18 [ffffba790008be08] asm_common_interrupt at ffffffff88800c1e
    [exception RIP: default_idle+37]
    RIP: ffffffff8876ffc5  RSP: ffffba790008beb8  RFLAGS: 00000246
    RAX: ffffffff8876ffa0  RBX: 0000000000000003  RCX: 0000000000000001
    RDX: 000000000001a6de  RSI: 0000000000000087  RDI: 0000000000000003
    RBP: 0000000000000003   R8: 000000146ccbc1f0   R9: 0000000000000000
    R10: 0000000000000000  R11: 0000000000000124  R12: ffffffffffffffff
    R13: 0000000000000000  R14: 0000000000000000  R15: 0000000000000000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
#19 [ffffba790008bed8] do_idle at ffffffff87f1982d
#20 [ffffba790008bf20] cpu_startup_entry at ffffffff87f19a29
#21 [ffffba790008bf30] start_secondary at ffffffff87e5118f
#22 [ffffba790008bf50] secondary_startup_64 at ffffffff87e000e6

The above eframe information looks correct, but the backtrace seems problematic, it doesn't display the
common_interrupt() according to the kernel log:

[   57.552655] Call Trace:
[   57.553994]  <IRQ>
[   57.555143]  dump_stack+0x57/0x70
[   57.556930]  panic+0xfb/0x2d7
[   57.558516]  ? printk+0x58/0x6f
[   57.560194]  sysrq_handle_crash+0x11/0x20
[   57.562371]  __handle_sysrq.cold.15+0x48/0x102
[   57.564663]  sysrq_filter+0x312/0x3c0
[   57.566579]  input_to_handler+0x4f/0xf0
[   57.568583]  input_pass_values.part.7+0x117/0x130
[   57.571188]  input_handle_event+0x138/0x5c0
[   57.573380]  input_event+0x4b/0x70
[   57.575173]  atkbd_interrupt+0x61f/0x6a0
[   57.577228]  serio_interrupt+0x46/0x90
[   57.579192]  i8042_interrupt+0x154/0x240
[   57.581328]  __handle_irq_event_percpu+0x40/0x170
[   57.583788]  handle_irq_event_percpu+0x30/0x80
[   57.586081]  handle_irq_event+0x36/0x53
[   57.588071]  handle_edge_irq+0x82/0x190
[   57.590096]  asm_call_on_stack+0x12/0x20
[   57.592142]  </IRQ>
[   57.593264]  common_interrupt+0xb5/0x140
[   57.595311]  asm_common_interrupt+0x1e/0x40

[   57.597474] RIP: 0010:default_idle+0x25/0x150
[   57.599735] Code: cc cc cc cc cc 66 66 66 66 90 41 55 41 54 55 65 8b 2d af 23 8a 77 53 66 66 66 66 90 e9 07 00 00 00 0f 00 2d 9f 37 49 00 fb f4 <65> 8b 2d 94 23 8a 77 66 66 66 66 90 5b 5d 41 5c 41 5d c3 65 8b 05
[   57.609196] RSP: 0018:ffffba790008beb8 EFLAGS: 00000246
[   57.611875] RAX: ffffffff8876ffa0 RBX: 0000000000000003 RCX: 0000000000000001
[   57.615506] RDX: 000000000001a6de RSI: 0000000000000087 RDI: 0000000000000003
[   57.619142] RBP: 0000000000000003 R08: 000000146ccbc1f0 R09: 0000000000000000
[   57.622770] R10: 0000000000000000 R11: 0000000000000124 R12: ffffffffffffffff
[   57.626386] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[   57.630029]  ? __cpuidle_text_start+0x8/0x8
[   57.632209]  do_idle+0x1bd/0x240
[   57.633914]  cpu_startup_entry+0x19/0x20
[   57.635966]  start_secondary+0x15f/0x1b0
[   57.638021]  secondary_startup_64+0xb6/0xc0
crash>
 
But I can see it in the raw stack data:

ffffba790008bdc0:  0000000000000001 0000000000000001
ffffba790008bdd0:  common_interrupt+181 0000000000000000  <---- Why is it not in the backtrace?
ffffba790008bde0:  0000000000000000 0000000000000000
ffffba790008bdf0:  0000000000000000 0000000000000000
ffffba790008be00:  asm_common_interrupt+30 0000000000000000
ffffba790008be10:  0000000000000000 0000000000000000
ffffba790008be20:  ffffffffffffffff 0000000000000003
ffffba790008be30:  0000000000000003 0000000000000124
ffffba790008be40:  0000000000000000 0000000000000000
ffffba790008be50:  000000146ccbc1f0 default_idle    
ffffba790008be60:  0000000000000001 000000000001a6de
ffffba790008be70:  0000000000000087 0000000000000003
ffffba790008be80:  ffffffffffffffff default_idle+37  
ffffba790008be90:  0000000000000010 0000000000000246
ffffba790008bea0:  ffffba790008beb8 0000000000000018
ffffba790008beb0:  0000000000000003 0000000000000003
ffffba790008bec0:  __cpu_online_mask ffffffffffffffff
ffffba790008bed0:  0000000000000000 do_idle+445      
ffffba790008bee0:  ffffffffffffff04 c4a71aecbd899c00
ffffba790008bef0:  000000000000008f 0000000000000000
ffffba790008bf00:  0000000000000000 0000000000000000
ffffba790008bf10:  0000000000000000 0000000000000000
ffffba790008bf20:  cpu_startup_entry+25 ffffba790008bf38
ffffba790008bf30:  start_secondary+351 c4a71aecbd899c00
ffffba790008bf40:  0000000000000000 0000000000000000
ffffba790008bf50:  secondary_startup_64+182 0000000000000000
 

And for RHEL9, this trial patch looks good, how does this work for
your vmcore?
https://github.com/k-hagio/crash/commit/0719360a2b76b3c03b1f7ad06caf2ce14e2bfcbf.patch

 
Unfortunately, it still doesn't work(with the above patch):

crash> bt
PID: 0        TASK: ffff9e7a47e32f00  CPU: 3    COMMAND: "swapper/3"
 #0 [ffffba7900118bb8] machine_kexec at ffffffff87e5c2c7
 #1 [ffffba7900118c08] __crash_kexec at ffffffff87f9500d
 #2 [ffffba7900118cd0] panic at ffffffff87edfff9
 #3 [ffffba7900118d50] sysrq_handle_crash at ffffffff883ce2c1
 #4 [ffffba7900118d58] __handle_sysrq.cold.15 at ffffffff883ceb56
 #5 [ffffba7900118d88] sysrq_filter at ffffffff883ce9a2
 #6 [ffffba7900118dc0] input_to_handler at ffffffff884fb0bf
 #7 [ffffba7900118df8] input_pass_values at ffffffff884fc1b7
 #8 [ffffba7900118e20] input_handle_event at ffffffff884fe278
 #9 [ffffba7900118e50] input_event at ffffffff884fe74b
#10 [ffffba7900118e88] atkbd_interrupt at ffffffff88504e2f
#11 [ffffba7900118ee0] serio_interrupt at ffffffff884f7516
#12 [ffffba7900118f10] i8042_interrupt at ffffffff884f8b04
#13 [ffffba7900118f50] __handle_irq_event_percpu at ffffffff87f51430
#14 [ffffba7900118f90] handle_irq_event_percpu at ffffffff87f51590
#15 [ffffba7900118fb8] handle_irq_event at ffffffff87f51616
#16 [ffffba7900118fd8] handle_edge_irq at ffffffff87f559f2
#17 [ffffba7900118ff0] asm_call_on_stack at ffffffff88800fa2
--- <IRQ stack> ---
    RIP: 000000000001a6de  RSP: ffffffffffffffff  RFLAGS: 00000003
    RAX: 0000000000000124  RBX: 0000000000000000  RCX: 0000000000000000
    RDX: 0000000000000000  RSI: 000000146ccbc1f0  RDI: ffffffff8876ffa0
    RBP: 0000000000000000   R8: 0000000000000003   R9: 0000000000000003
    R10: ffffffffffffffff  R11: 0000000000000000  R12: ffffffff88800c1e
    R13: 0000000000000000  R14: 0000000000000000  R15: 0000000000000000
    ORIG_RAX: 0000000000000001  CS: 0087  SS: ffffffff8876ffc5
bt: WARNING: possibly bogus exception frame
crash>

Thanks.
Lianbo 

Thanks,
Kazu

>>
>> crash> bt
>> PID: 0        TASK: ffff9e7a47e32f00  CPU: 3    COMMAND: "swapper/3"
>>  #0 [ffffba7900118bb8] machine_kexec at ffffffff87e5c2c7
>>  #1 [ffffba7900118c08] __crash_kexec at ffffffff87f9500d
>>  #2 [ffffba7900118cd0] panic at ffffffff87edfff9
>>  #3 [ffffba7900118d50] sysrq_handle_crash at ffffffff883ce2c1
>>  #4 [ffffba7900118d58] __handle_sysrq.cold.15 at ffffffff883ceb56
>>  #5 [ffffba7900118d88] sysrq_filter at ffffffff883ce9a2
>>  #6 [ffffba7900118dc0] input_to_handler at ffffffff884fb0bf
>>  #7 [ffffba7900118df8] input_pass_values at ffffffff884fc1b7
>>  #8 [ffffba7900118e20] input_handle_event at ffffffff884fe278
>>  #9 [ffffba7900118e50] input_event at ffffffff884fe74b
>> #10 [ffffba7900118e88] atkbd_interrupt at ffffffff88504e2f
>> #11 [ffffba7900118ee0] serio_interrupt at ffffffff884f7516
>> #12 [ffffba7900118f10] i8042_interrupt at ffffffff884f8b04
>> #13 [ffffba7900118f50] __handle_irq_event_percpu at ffffffff87f51430
>> #14 [ffffba7900118f90] handle_irq_event_percpu at ffffffff87f51590
>> #15 [ffffba7900118fb8] handle_irq_event at ffffffff87f51616
>> #16 [ffffba7900118fd8] handle_edge_irq at ffffffff87f559f2
>> #17 [ffffba7900118ff0] asm_call_on_stack at ffffffff88800fa2
>> --- <IRQ stack> ---
>> #18 [ffffba790008be08] asm_call_on_stack at ffffffff88800fa2
>>     [exception RIP: default_idle+37]
>>     RIP: ffffffff8876ffc5  RSP: ffffba790008beb8  RFLAGS: 00000246
>>     RAX: ffffffff8876ffa0  RBX: 0000000000000003  RCX: 0000000000000001
>>     RDX: 000000000001a6de  RSI: 0000000000000087  RDI: 0000000000000003
>>     RBP: 0000000000000003   R8: 000000146ccbc1f0   R9: 0000000000000000
>>     R10: 0000000000000000  R11: 0000000000000124  R12: ffffffffffffffff
>>     R13: 0000000000000000  R14: 0000000000000000  R15: 0000000000000000
>>     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
>> #19 [ffffba790008bed8] do_idle at ffffffff87f1982d
>> #20 [ffffba790008bf20] cpu_startup_entry at ffffffff87f19a29
>> #21 [ffffba790008bf30] start_secondary at ffffffff87e5118f
>> #22 [ffffba790008bf50] secondary_startup_64 at ffffffff87e000e6
>> crash>>> Let me debug it more and to see what happened.
>>
>>     What is printed as the panic message in "log" command?
>>
>>
>> The kernel panic message:
>> [   57.542538] Kernel panic - not syncing: sysrq triggered crash
>> [   57.545512] CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Not tainted 5.7.0+ #7
>> [   57.549160] Hardware name: Red Hat KVM/RHEL-AV, BIOS 0.0.0 02/06/2015
>> [   57.552655] Call Trace:
>> [   57.553994]  <IRQ>
>> [   57.555143]  dump_stack+0x57/0x70
>> [   57.556930]  panic+0xfb/0x2d7
>> [   57.558516]  ? printk+0x58/0x6f
>> [   57.560194]  sysrq_handle_crash+0x11/0x20
>> [   57.562371]  __handle_sysrq.cold.15+0x48/0x102
>> [   57.564663]  sysrq_filter+0x312/0x3c0
>> [   57.566579]  input_to_handler+0x4f/0xf0
>> [   57.568583]  input_pass_values.part.7+0x117/0x130
>> [   57.571188]  input_handle_event+0x138/0x5c0
>> [   57.573380]  input_event+0x4b/0x70
>> [   57.575173]  atkbd_interrupt+0x61f/0x6a0
>> [   57.577228]  serio_interrupt+0x46/0x90
>> [   57.579192]  i8042_interrupt+0x154/0x240
>> [   57.581328]  __handle_irq_event_percpu+0x40/0x170
>> [   57.583788]  handle_irq_event_percpu+0x30/0x80
>> [   57.586081]  handle_irq_event+0x36/0x53
>> [   57.588071]  handle_edge_irq+0x82/0x190
>> [   57.590096]  asm_call_on_stack+0x12/0x20
>> [   57.592142]  </IRQ>
>> [   57.593264]  common_interrupt+0xb5/0x140
>> [   57.595311]  asm_common_interrupt+0x1e/0x40
>> [   57.597474] RIP: 0010:default_idle+0x25/0x150
>> [   57.599735] Code: cc cc cc cc cc 66 66 66 66 90 41 55 41 54 55 65 8b 2d af 23 8a 77 53 66 66 66 66 90 e9 07 00 00 00 0f 00 2d 9f 37 49 00 fb f4 <65> 8b 2d 94 23 8a 77 66 66 66 66 90 5b 5d 41 5c 41 5d c3 65 8b 05
>> [   57.609196] RSP: 0018:ffffba790008beb8 EFLAGS: 00000246
>> [   57.611875] RAX: ffffffff8876ffa0 RBX: 0000000000000003 RCX: 0000000000000001
>> [   57.615506] RDX: 000000000001a6de RSI: 0000000000000087 RDI: 0000000000000003
>> [   57.619142] RBP: 0000000000000003 R08: 000000146ccbc1f0 R09: 0000000000000000
>> [   57.622770] R10: 0000000000000000 R11: 0000000000000124 R12: ffffffffffffffff
>> [   57.626386] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
>> [   57.630029]  ? __cpuidle_text_start+0x8/0x8
>> [   57.632209]  do_idle+0x1bd/0x240
>> [   57.633914]  cpu_startup_entry+0x19/0x20
>> [   57.635966]  start_secondary+0x15f/0x1b0
>> [   57.638021]  secondary_startup_64+0xb6/0xc0
>> Thanks.
>> Lianbo
>>