On Fri, Feb 10, 2023 at 2:43 PM HAGIO KAZUHITO(萩尾 一仁) <k-hagio-ab@nec.com> wrote:
On 2023/02/09 21:35, lijiang wrote:> On Tue, Feb 7, 2023 at 6:08 PM lijiang <lijiang@redhat.com <mailto:lijiang@redhat.com>> wrote:
>
>         more important thing is how we determine the irq_eframe_link.
>
>     The following patch can work on upstream kernel vmcore and RHEL9 vmcore.
>     Maybe we can check the symbols asm_common_interrupt and asm_call_on_stack as below:
>
>     diff --git a/x86_64.c b/x86_64.c
>     index 7a5d6f050c89..62036f71f632 100644
>     --- a/x86_64.c
>     +++ b/x86_64.c
>     @@ -3938,6 +3938,11 @@ in_exception_stack:
>               if (irq_eframe) {
>                       bt->flags |= BT_EXCEPTION_FRAME;
>                       i = (irq_eframe - bt->stackbase)/sizeof(ulong);
>     + if (symbol_exists("asm_common_interrupt")) {
>     + i -= 1;
>     + up = (ulong *)(&bt->stackbuf[i*sizeof(ulong)]);
>     + bt->instptr = *up;
>     + }
>                       x86_64_print_stack_entry(bt, ofp, level, i, bt->instptr);
>                       bt->flags &= ~(ulonglong)BT_EXCEPTION_FRAME;
>                       cs = x86_64_exception_frame(EFRAME_PRINT|EFRAME_CS, 0,
>     @@ -6521,6 +6526,16 @@ x86_64_irq_eframe_link_init(void)
>        else
>        return;
>
>     + if (symbol_exists("asm_common_interrupt") && !symbol_exists("asm_call_on_stack")) {
>     + machdep->machspec->irq_eframe_link =-32;
>     + return;
>     + }
>     +
>     + if (symbol_exists("asm_common_interrupt") && symbol_exists("asm_call_on_stack")) {
>     + machdep->machspec->irq_eframe_link =-56;
>     + return;
>     + }
>     +
>        if (THIS_KERNEL_VERSION < LINUX(2,6,9))
>        return;
>
>
> Do you have any other comments about the above changes? Or still looking for a better solution, any thoughts? Kazu.

Thank you for the information.  I've looked for a way to follow
kernel changes, but it would be hard with the current unwinder.
So for now, let's go with this?


Agree. I will post the v2 later with your signature together.

+       if (symbol_exists("asm_common_interrupt")) {
+               if (symbol_exists("asm_call_on_stack"))
+                       machdep->machspec->irq_eframe_link = -64;
+               else
+                       machdep->machspec->irq_eframe_link = -32;
+               return;
+       }

Probably the actual irq_eframe_link for your kernel is -64, I
think it's adjusted in x86_64_irq_eframe_link().  Please check

You are right. I confirm that the irq_eframe_link will be adjusted in the x86_64_irq_eframe_link().

But it should be better to set the irq_eframe_link to -64 in the x86_64_irq_eframe_link_init(), it doesn't need to be adjusted any more.

"help -m".  e.g. a 5.8 vmcore on hand:

crash-dev> bt
...
crash-dev> help -m | grep irq_eframe
           irq_eframe_link: -64


The following is the reason why the irq_eframe_link is -32 on
RHEL9.1 (5.14.0-162.6.1.el9_1.x86_64).


Thank you for sharing the following process. Kazu.

I get another way to calculate the value of irq eframe, the irq eframe is
located at the stack address where the asm_common_interrupt symbol is stored
plus 8.

Maybe we can use the following formula to calculate the result, but it may need
to walk through the whole raw stack data. At least, we do not need to get the
eframe address from the ASM code. What do you think?

BTW: Maybe we can do it in the future.

crash> bt -r
...
ffffba790008be00:  asm_common_interrupt+30 0000000000000000---> irq eframe = 0xffffba790008be00+8
ffffba790008be10:  0000000000000000 0000000000000000
ffffba790008be20:  ffffffffffffffff 0000000000000003
ffffba790008be30:  0000000000000003 0000000000000124
ffffba790008be40:  0000000000000000 0000000000000000
ffffba790008be50:  000000146ccbc1f0 default_idle    
... 

In the x86_64_low_budget_back_trace_cmd():
...
                 /*
                  *  stack = (unsigned long *) (irqstack_end[-1]);
                  *  (where irqstack_end is 64 bytes below page end)
                  */
                 up = (ulong *)(&bt->stackbuf[stacktop - bt->stackbase]);
                 up -= 1;

                 irq_eframe_link =  (0xffffba790008be00+8) - *up; //*up = 0xffffba790008bdc8

crash> eval 0xffffba790008be08-0xffffba790008bdc8
hexadecimal: 40  
    decimal: 64  
      octal: 100
     binary: 0000000000000000000000000000000000000000000000000000000001000000
crash> 
...
[   57.595311]  asm_common_interrupt+0x1e/0x40
[   57.597474] RIP: 0010:default_idle+0x25/0x150
[   57.599735] Code: cc cc cc cc cc 66 66 66 66 90 41 55 41 54 55 65 8b 2d af 23 8a 77 53 66 66 66 66 90 e9 07 00 00 00 0f 00 2d 9f 37 49 00 fb f4 <65> 8b 2d 94 23 8a 77 66 66 66 66 90 5b 5d 41 5c 41 5d c3 65 8b 05
[   57.609196] RSP: 0018:ffffba790008beb8 EFLAGS: 00000246
[   57.611875] RAX: ffffffff8876ffa0 RBX: 0000000000000003 RCX: 0000000000000001
[   57.615506] RDX: 000000000001a6de RSI: 0000000000000087 RDI: 0000000000000003
[   57.619142] RBP: 0000000000000003 R08: 000000146ccbc1f0 R09: 0000000000000000
[   57.622770] R10: 0000000000000000 R11: 0000000000000124 R12: ffffffffffffffff
[   57.626386] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[   57.630029]  ? __cpuidle_text_start+0x8/0x8
[   57.632209]  do_idle+0x1bd/0x240
[   57.633914]  cpu_startup_entry+0x19/0x20
[   57.635966]  start_secondary+0x15f/0x1b0
[   57.638021]  secondary_startup_64+0xb6/0xc0

crash> pt_regs 0xffffba790008be08 -x
struct pt_regs {
  r15 = 0x0,
  r14 = 0x0,
  r13 = 0x0,
  r12 = 0xffffffffffffffff,
  bp = 0x3,
  bx = 0x3,
  r11 = 0x124,
  r10 = 0x0,
  r9 = 0x0,
  r8 = 0x146ccbc1f0,
  ax = 0xffffffff8876ffa0,
  cx = 0x1,
  dx = 0x1a6de,
  si = 0x87,
  di = 0x3,
  orig_ax = 0xffffffffffffffff,
  ip = 0xffffffff8876ffc5,
  cs = 0x10,
  flags = 0x246,
  sp = 0xffffba790008beb8,
  ss = 0x18
}
crash>
 
Thanks.
Lianbo

crash-dev> dis asm_common_interrupt
0xffffffff87a00cc0 <asm_common_interrupt>:      clac
0xffffffff87a00cc3 <asm_common_interrupt+3>:    cld
0xffffffff87a00cc4 <asm_common_interrupt+4>:    call   0xffffffff87a012b0 <error_entry> [1]
0xffffffff87a00cc9 <asm_common_interrupt+9>:    mov    %rax,%rsp                        [2]
0xffffffff87a00ccc <asm_common_interrupt+12>:   mov    %rsp,%rdi
0xffffffff87a00ccf <asm_common_interrupt+15>:   mov    0x78(%rsp),%rsi
0xffffffff87a00cd4 <asm_common_interrupt+20>:   movq   $0xffffffffffffffff,0x78(%rsp)
0xffffffff87a00cdd <asm_common_interrupt+29>:   call   0xffffffff878d8370 <common_interrupt> [3]
0xffffffff87a00ce2 <asm_common_interrupt+34>:   jmp    0xffffffff87a013d0 <error_return>

The asm_common_interrupt calls the error_entry first [1], which pushes
registers (pt_regs) to the stack and returns the address:

         leaq    8(%rsp), %rax                   /* return pt_regs pointer */

Then it writes the address in %rax to %rsp [2].  After that, it calls
the common_interrupt [3], so now *(%rsp) = asm_common_interrupt+34 and
*(%rsp+8) = pt_regs.

crash-dev> dis common_interrupt
0xffffffff878d8370 <common_interrupt>:          push   %r13       [4]
0xffffffff878d8372 <common_interrupt+2>:        push   %r12       [4]
0xffffffff878d8374 <common_interrupt+4>:        mov    %rsi,%r12
0xffffffff878d8377 <common_interrupt+7>:        push   %rbp       [4]
0xffffffff878d8378 <common_interrupt+8>:        mov    %rdi,%rbp
0xffffffff878d837b <common_interrupt+11>:       movzbl %r12b,%r12d
0xffffffff878d837f <common_interrupt+15>:       call   0xffffffff878dab00 <irqentry_enter>
0xffffffff878d8384 <common_interrupt+20>:       mov    %eax,%r13d
0xffffffff878d8387 <common_interrupt+23>:       movb   $0x1,%gs:0x787522f3(%rip)        # 0x2a682
0xffffffff878d838f <common_interrupt+31>:       testb  $0x3,0x88(%rbp)
0xffffffff878d8396 <common_interrupt+38>:       jne    0xffffffff878d83a3 <common_interrupt+51>  [5]
0xffffffff878d8398 <common_interrupt+40>:       mov    %gs:0x78740b65(%rip),%al        # 0x18f04
0xffffffff878d839f <common_interrupt+47>:       test   %al,%al
0xffffffff878d83a1 <common_interrupt+49>:       je     0xffffffff878d83c8 <common_interrupt+88>  [5]
0xffffffff878d83a3 <common_interrupt+51>:       call   0xffffffff86efb8a0 <irq_enter_rcu>
0xffffffff878d83a8 <common_interrupt+56>:       mov    %r12d,%esi
0xffffffff878d83ab <common_interrupt+59>:       mov    %rbp,%rdi
0xffffffff878d83ae <common_interrupt+62>:       call   0xffffffff86e26630 <__common_interrupt>
0xffffffff878d83b3 <common_interrupt+67>:       call   0xffffffff86efb920 <irq_exit_rcu>
0xffffffff878d83b8 <common_interrupt+72>:       mov    %r13d,%esi
0xffffffff878d83bb <common_interrupt+75>:       mov    %rbp,%rdi
0xffffffff878d83be <common_interrupt+78>:       pop    %rbp
0xffffffff878d83bf <common_interrupt+79>:       pop    %r12
0xffffffff878d83c1 <common_interrupt+81>:       pop    %r13
0xffffffff878d83c3 <common_interrupt+83>:       jmp    0xffffffff878dab80 <irqentry_exit>
0xffffffff878d83c8 <common_interrupt+88>:       movb   $0x1,%gs:0x78740b34(%rip)        # 0x18f04 [6]
0xffffffff878d83d0 <common_interrupt+96>:       mov    %gs:0x78740b30(%rip),%r11        # 0x18f08 [7]
0xffffffff878d83d8 <common_interrupt+104>:      mov    %rsp,(%r11)                                [7]
0xffffffff878d83db <common_interrupt+107>:      mov    %r11,%rsp                                  [7]
0xffffffff878d83de <common_interrupt+110>:      call   0xffffffff86efb8a0 <irq_enter_rcu>
0xffffffff878d83e3 <common_interrupt+115>:      mov    %rbp,%rdi
0xffffffff878d83e6 <common_interrupt+118>:      mov    %r12d,%esi
0xffffffff878d83e9 <common_interrupt+121>:      call   0xffffffff86e26630 <__common_interrupt>
0xffffffff878d83ee <common_interrupt+126>:      call   0xffffffff86efb920 <irq_exit_rcu>
0xffffffff878d83f3 <common_interrupt+131>:      pop    %rsp
0xffffffff878d83f4 <common_interrupt+132>:      movb   $0x0,%gs:0x78740b08(%rip)        # 0x18f04 [8]
0xffffffff878d83fc <common_interrupt+140>:      mov    %r13d,%esi
0xffffffff878d83ff <common_interrupt+143>:      mov    %rbp,%rdi
0xffffffff878d8402 <common_interrupt+146>:      pop    %rbp
0xffffffff878d8403 <common_interrupt+147>:      pop    %r12
0xffffffff878d8405 <common_interrupt+149>:      pop    %r13
0xffffffff878d8407 <common_interrupt+151>:      jmp    0xffffffff878dab80 <irqentry_exit>

The common_interrupt pushes 3 registers [4], so now *(%rsp+32) = pt_regs.
[5]s look the following condition, the else block goes to [6].  Then,
writes the current %rsp to the address in hardirq_stack_ptr, which
points to the end of the irq stack, and switches %rsp to it [7].

#define call_on_irqstack_cond(func, regs, asm_call, constr, c_args...)  \
{                                                                       \
         /*                                                              \
          * User mode entry and interrupt on the irq stack do not        \
          * switch stacks. If from user mode the task stack is empty.    \
          */                                                             \
         if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) {  \ [5]
                 irq_enter_rcu();                                        \
                 func(c_args);                                           \
                 irq_exit_rcu();                                         \
         } else {                                                        \
                 /*                                                      \
                  * Mark the irq stack inuse _before_ and unmark _after_ \
                  * switching stacks. Interrupts are disabled in both    \
                  * places. Invoke the stack switch macro with the call  \
                  * sequence which matches the above direct invocation.  \
                  */                                                     \
                 __this_cpu_write(hardirq_stack_inuse, true);            \ [6]
                 call_on_irqstack(func, asm_call, constr);               \ [7]
                 __this_cpu_write(hardirq_stack_inuse, false);           \ [8]
         }                                                               \
}

crash-dev> sym -l
...
18f04 (D) hardirq_stack_inuse
18f08 (D) hardirq_stack_ptr
...
crash-dev> p hardirq_stack_ptr:a
per_cpu(hardirq_stack_ptr, 0) = $2 = (void *) 0xffffa5ab40003ff8
per_cpu(hardirq_stack_ptr, 1) = $3 = (void *) 0xffffa5ab400ecff8
per_cpu(hardirq_stack_ptr, 2) = $4 = (void *) 0xffffa5ab40118ff8
per_cpu(hardirq_stack_ptr, 3) = $5 = (void *) 0xffffa5ab40144ff8


On the other hand, the current crash reads the saved %rsp here:

     up = (ulong *)(&bt->stackbuf[stacktop - bt->stackbase]);
     up -= 1;     <<<-- the same as hardirq_stack_ptr, the saved %rsp
     irq_eframe = rsp = bt->stkptr = x86_64_irq_eframe_link(*up, bt, ofp);
     up -= 1;
     bt->instptr = *up;

So the irq_eframe_link should be -32, because *(%rsp+32) is pt_regs.
And the return address from the common_interrupt is *(%rsp+24),
the following code gets it.

     if (symbol_exists("asm_common_interrupt")) {
             i -= 1;
             up = (ulong *)(&bt->stackbuf[i*sizeof(ulong)]);
             bt->instptr = *up;
     }


With the same way, I think the irq_eframe_link for kernel 5.8 to
5.11 will be -64.

Thanks,
Kazu