On 2023/02/09 21:35, lijiang wrote:> On Tue, Feb 7, 2023 at 6:08 PM lijiang
<lijiang(a)redhat.com <mailto:lijiang@redhat.com>> wrote:
more important thing is how we determine the irq_eframe_link.
The following patch can work on upstream kernel vmcore and RHEL9 vmcore.
Maybe we can check the symbols asm_common_interrupt and asm_call_on_stack as below:
diff --git a/x86_64.c b/x86_64.c
index 7a5d6f050c89..62036f71f632 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -3938,6 +3938,11 @@ in_exception_stack:
if (irq_eframe) {
bt->flags |= BT_EXCEPTION_FRAME;
i = (irq_eframe - bt->stackbase)/sizeof(ulong);
+ if (symbol_exists("asm_common_interrupt")) {
+ i -= 1;
+ up = (ulong *)(&bt->stackbuf[i*sizeof(ulong)]);
+ bt->instptr = *up;
+ }
x86_64_print_stack_entry(bt, ofp, level, i, bt->instptr);
bt->flags &= ~(ulonglong)BT_EXCEPTION_FRAME;
cs = x86_64_exception_frame(EFRAME_PRINT|EFRAME_CS, 0,
@@ -6521,6 +6526,16 @@ x86_64_irq_eframe_link_init(void)
else
return;
+ if (symbol_exists("asm_common_interrupt") &&
!symbol_exists("asm_call_on_stack")) {
+ machdep->machspec->irq_eframe_link =-32;
+ return;
+ }
+
+ if (symbol_exists("asm_common_interrupt") &&
symbol_exists("asm_call_on_stack")) {
+ machdep->machspec->irq_eframe_link =-56;
+ return;
+ }
+
if (THIS_KERNEL_VERSION < LINUX(2,6,9))
return;
Do you have any other comments about the above changes? Or still looking for a better
solution, any thoughts? Kazu.
Thank you for the information. I've looked for a way to follow
kernel changes, but it would be hard with the current unwinder.
So for now, let's go with this?
+ if (symbol_exists("asm_common_interrupt")) {
+ if (symbol_exists("asm_call_on_stack"))
+ machdep->machspec->irq_eframe_link = -64;
+ else
+ machdep->machspec->irq_eframe_link = -32;
+ return;
+ }
Probably the actual irq_eframe_link for your kernel is -64, I
think it's adjusted in x86_64_irq_eframe_link(). Please check
"help -m". e.g. a 5.8 vmcore on hand:
crash-dev> bt
...
crash-dev> help -m | grep irq_eframe
irq_eframe_link: -64
The following is the reason why the irq_eframe_link is -32 on
RHEL9.1 (5.14.0-162.6.1.el9_1.x86_64).
crash-dev> dis asm_common_interrupt
0xffffffff87a00cc0 <asm_common_interrupt>: clac
0xffffffff87a00cc3 <asm_common_interrupt+3>: cld
0xffffffff87a00cc4 <asm_common_interrupt+4>: call 0xffffffff87a012b0
<error_entry> [1]
0xffffffff87a00cc9 <asm_common_interrupt+9>: mov %rax,%rsp
[2]
0xffffffff87a00ccc <asm_common_interrupt+12>: mov %rsp,%rdi
0xffffffff87a00ccf <asm_common_interrupt+15>: mov 0x78(%rsp),%rsi
0xffffffff87a00cd4 <asm_common_interrupt+20>: movq
$0xffffffffffffffff,0x78(%rsp)
0xffffffff87a00cdd <asm_common_interrupt+29>: call 0xffffffff878d8370
<common_interrupt> [3]
0xffffffff87a00ce2 <asm_common_interrupt+34>: jmp 0xffffffff87a013d0
<error_return>
The asm_common_interrupt calls the error_entry first [1], which pushes
registers (pt_regs) to the stack and returns the address:
leaq 8(%rsp), %rax /* return pt_regs pointer */
Then it writes the address in %rax to %rsp [2]. After that, it calls
the common_interrupt [3], so now *(%rsp) = asm_common_interrupt+34 and
*(%rsp+8) = pt_regs.
crash-dev> dis common_interrupt
0xffffffff878d8370 <common_interrupt>: push %r13 [4]
0xffffffff878d8372 <common_interrupt+2>: push %r12 [4]
0xffffffff878d8374 <common_interrupt+4>: mov %rsi,%r12
0xffffffff878d8377 <common_interrupt+7>: push %rbp [4]
0xffffffff878d8378 <common_interrupt+8>: mov %rdi,%rbp
0xffffffff878d837b <common_interrupt+11>: movzbl %r12b,%r12d
0xffffffff878d837f <common_interrupt+15>: call 0xffffffff878dab00
<irqentry_enter>
0xffffffff878d8384 <common_interrupt+20>: mov %eax,%r13d
0xffffffff878d8387 <common_interrupt+23>: movb $0x1,%gs:0x787522f3(%rip)
# 0x2a682
0xffffffff878d838f <common_interrupt+31>: testb $0x3,0x88(%rbp)
0xffffffff878d8396 <common_interrupt+38>: jne 0xffffffff878d83a3
<common_interrupt+51> [5]
0xffffffff878d8398 <common_interrupt+40>: mov %gs:0x78740b65(%rip),%al
# 0x18f04
0xffffffff878d839f <common_interrupt+47>: test %al,%al
0xffffffff878d83a1 <common_interrupt+49>: je 0xffffffff878d83c8
<common_interrupt+88> [5]
0xffffffff878d83a3 <common_interrupt+51>: call 0xffffffff86efb8a0
<irq_enter_rcu>
0xffffffff878d83a8 <common_interrupt+56>: mov %r12d,%esi
0xffffffff878d83ab <common_interrupt+59>: mov %rbp,%rdi
0xffffffff878d83ae <common_interrupt+62>: call 0xffffffff86e26630
<__common_interrupt>
0xffffffff878d83b3 <common_interrupt+67>: call 0xffffffff86efb920
<irq_exit_rcu>
0xffffffff878d83b8 <common_interrupt+72>: mov %r13d,%esi
0xffffffff878d83bb <common_interrupt+75>: mov %rbp,%rdi
0xffffffff878d83be <common_interrupt+78>: pop %rbp
0xffffffff878d83bf <common_interrupt+79>: pop %r12
0xffffffff878d83c1 <common_interrupt+81>: pop %r13
0xffffffff878d83c3 <common_interrupt+83>: jmp 0xffffffff878dab80
<irqentry_exit>
0xffffffff878d83c8 <common_interrupt+88>: movb $0x1,%gs:0x78740b34(%rip)
# 0x18f04 [6]
0xffffffff878d83d0 <common_interrupt+96>: mov %gs:0x78740b30(%rip),%r11
# 0x18f08 [7]
0xffffffff878d83d8 <common_interrupt+104>: mov %rsp,(%r11)
[7]
0xffffffff878d83db <common_interrupt+107>: mov %r11,%rsp
[7]
0xffffffff878d83de <common_interrupt+110>: call 0xffffffff86efb8a0
<irq_enter_rcu>
0xffffffff878d83e3 <common_interrupt+115>: mov %rbp,%rdi
0xffffffff878d83e6 <common_interrupt+118>: mov %r12d,%esi
0xffffffff878d83e9 <common_interrupt+121>: call 0xffffffff86e26630
<__common_interrupt>
0xffffffff878d83ee <common_interrupt+126>: call 0xffffffff86efb920
<irq_exit_rcu>
0xffffffff878d83f3 <common_interrupt+131>: pop %rsp
0xffffffff878d83f4 <common_interrupt+132>: movb $0x0,%gs:0x78740b08(%rip)
# 0x18f04 [8]
0xffffffff878d83fc <common_interrupt+140>: mov %r13d,%esi
0xffffffff878d83ff <common_interrupt+143>: mov %rbp,%rdi
0xffffffff878d8402 <common_interrupt+146>: pop %rbp
0xffffffff878d8403 <common_interrupt+147>: pop %r12
0xffffffff878d8405 <common_interrupt+149>: pop %r13
0xffffffff878d8407 <common_interrupt+151>: jmp 0xffffffff878dab80
<irqentry_exit>
The common_interrupt pushes 3 registers [4], so now *(%rsp+32) = pt_regs.
[5]s look the following condition, the else block goes to [6]. Then,
writes the current %rsp to the address in hardirq_stack_ptr, which
points to the end of the irq stack, and switches %rsp to it [7].
#define call_on_irqstack_cond(func, regs, asm_call, constr, c_args...) \
{ \
/* \
* User mode entry and interrupt on the irq stack do not \
* switch stacks. If from user mode the task stack is empty. \
*/ \
if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \ [5]
irq_enter_rcu(); \
func(c_args); \
irq_exit_rcu(); \
} else { \
/* \
* Mark the irq stack inuse _before_ and unmark _after_ \
* switching stacks. Interrupts are disabled in both \
* places. Invoke the stack switch macro with the call \
* sequence which matches the above direct invocation. \
*/ \
__this_cpu_write(hardirq_stack_inuse, true); \ [6]
call_on_irqstack(func, asm_call, constr); \ [7]
__this_cpu_write(hardirq_stack_inuse, false); \ [8]
} \
}
crash-dev> sym -l
...
18f04 (D) hardirq_stack_inuse
18f08 (D) hardirq_stack_ptr
...
crash-dev> p hardirq_stack_ptr:a
per_cpu(hardirq_stack_ptr, 0) = $2 = (void *) 0xffffa5ab40003ff8
per_cpu(hardirq_stack_ptr, 1) = $3 = (void *) 0xffffa5ab400ecff8
per_cpu(hardirq_stack_ptr, 2) = $4 = (void *) 0xffffa5ab40118ff8
per_cpu(hardirq_stack_ptr, 3) = $5 = (void *) 0xffffa5ab40144ff8
On the other hand, the current crash reads the saved %rsp here:
up = (ulong *)(&bt->stackbuf[stacktop - bt->stackbase]);
up -= 1; <<<-- the same as hardirq_stack_ptr, the saved %rsp
irq_eframe = rsp = bt->stkptr = x86_64_irq_eframe_link(*up, bt, ofp);
up -= 1;
bt->instptr = *up;
So the irq_eframe_link should be -32, because *(%rsp+32) is pt_regs.
And the return address from the common_interrupt is *(%rsp+24),
the following code gets it.
if (symbol_exists("asm_common_interrupt")) {
i -= 1;
up = (ulong *)(&bt->stackbuf[i*sizeof(ulong)]);
bt->instptr = *up;
}
With the same way, I think the irq_eframe_link for kernel 5.8 to
5.11 will be -64.
Thanks,
Kazu