[Crash-utility] Re: [PATCH 1/2] x86_64: Correct the unwind address for orc when "call" instruction encountered

Thursday, 21 May 2026

On 2026/05/20 13:31, Tao Liu wrote:
...
 We have enountered the following edge case for orc unwinder which
results an
 incorrect stack unwinding:
 
 $ objdump -d vmlinux
 ffffffff8100be70 <panic>:
 ffffffff8100be70:       f3 0f 1e fa             endbr64
 ffffffff8100be74:       e8 c7 b9 0d 00          call   ffffffff810e7840
<__fentry__>
 ffffffff8100be79:       55                      push   %rbp
 ffffffff8100be7a:       49 89 fa                mov    %rdi,%r10
 ffffffff8100be7d:       48 89 e5                mov    %rsp,%rbp
 ffffffff8100be80:       48 83 ec 50             sub    $0x50,%rsp
 ...
 ffffffff8100bed2:       48 89 44 24 10          mov    %rax,0x10(%rsp)
 ffffffff8100bed7:       e8 f4 fc ff ff          call   ffffffff8100bbd0 <vpanic>
 
 ffffffff8100bedc <nmi_panic.cold>:
 ffffffff8100bedc:       48 c7 c7 6e c2 8b 82    mov    $0xffffffff828bc26e,%rdi
 ffffffff8100bee3:       e8 88 ff ff ff          call   ffffffff8100be70 <panic>
 
 $ objtool --dump=orc vmlinux
 ...
 ffffffff8100be7a:type:call sp:sp+16 bp:prevsp-16 signal:0
 ffffffff8100be80:type:call sp:bp+16 bp:prevsp-16 signal:0
 ffffffff8100bedc:type:call sp:sp+8 bp:(und) signal:0
 ffffffff8100bee8:type:call sp:sp+16 bp:(und) signal:0
 
 crash> gdb bt
   #0  0xffffffff8126b3e4 in crash_setup_regs
   #1  0xffffffff8126b7a2 in __crash_kexec
   #2  0xffffffff8100bca9 in vpanic
   #3  0xffffffff8100bedc in panic
   #4  0xffffffff81903a9a in sysrq_handle_crash
   ...
 
 crash> bt -r
 ...
 ffffc9000126fc40:  machine_kexec+184 sysrq_showstate_op
 ffffc9000126fc50:  .LC4+109         __crash_kexec+114
 ffffc9000126fc60:  0000000000000000 0000000000000007
 ffffc9000126fc70:  0000000000000063 ffffc9000126fd48
 ffffc9000126fc80:  .LC4+109         sysrq_showstate_op
 ffffc9000126fc90:  0000000000000002 _printk_rb_static_infos+64944
 ffffc9000126fca0:  _printk_rb_static_descs+17712 0000000000000000
 ffffc9000126fcb0:  0000000000000000 0000000000000000
 ffffc9000126fcc0:  0000000000000001 0000000000000000
 ffffc9000126fcd0:  ffffc9000126fc60 0000000000000000
 ffffc9000126fce0:  crash_setup_regs+116 0000000000000010
 ffffc9000126fcf0:  0000000000000046 ffffc9000126fc58
 ffffc9000126fd00:  0000000000000018 a9e74447ad8b9200
 ffffc9000126fd10:  sysrq_showstate_op vpanic+217
 ffffc9000126fd20:  sysrq_crash_op   ffffc9000126fd98
 ffffc9000126fd30:  0000000000000000 0000000000000063
 ffffc9000126fd40:  panic+108        0000000000000008
 ffffc9000126fd50:  ffffc9000126fda8 ffffc9000126fd68
 ffffc9000126fd60:  a9e74447ad8b9200 a9e74447ad8b9200
 ffffc9000126fd70:  ffff88813bc1d1c0 0000000000000000
 ffffc9000126fd80:  0000000000000000 0000000000000000
 ffffc9000126fd90:  _printk_rb_static_descs+786408 0000000000000000
 ffffc9000126fda0:  sysrq_handle_crash+26 __handle_sysrq.cold+159
 
 Within the stack, we can identify the following stackframes:
 
 1. rsp ffffc9000126fc40, rip (machine_kexec+184)
 2. rsp ffffc9000126fc58, rip (__crash_kexec+114)
 3. rsp ffffc9000126fd10, rip (vpanic+217)
 4. rsp ffffc9000126fd40, rip (panic+108)
 
 Before the fix, crash will directly pass rip to orc_find() to calculate
 each framesize, actually this is incorrect, because the rip is what
 we found in stack, as we all know the "call" instruction will push
 the address of the next instruction into stack, rather than the address
 of the "call" instruction itself. So for the (panic+108, or ffffffff8100bedc)
 case, orc winder finds the frame of 0xffffffff8100bedc is sp:sp+8.
 This is incorrect, because 0xffffffff8100bedc belongs to a different
 function, aka nmi_panic(), rather than panic(). So we should use (rip -
 5, or ffffffff8100bed7) for orc_find(), which gives sp:bp+16 for unwind
 stack frame. We can confirm this by reading the disassembly at
 ffffffff8100be70 ~ ffffffff8100be80.
 
 This is an edge case because the address after the "call" instruction
 located to a different function, which lead to orc_find() got a wrong
 framesize.
 
 This patch fix this by check the 1st previous instruction of rip we read
 from stack. If the instruction is "call", then we pass the address of
 the "call" to orc_find(), to ensure we orc unwinding the correct function.
 
Hi Tao, thank you for the work!

I tried the patchset, unfortunately it prints a bit wrong trace,
because the correct .cold symbols are removed, I think.

* SysRq case

crash> bt
PID: 6598     TASK: ffff8c0902d2b680  CPU: 2    COMMAND: "bash"
  #0 [ffffcda8c346b970] machine_kexec at ffffffff908e91e8
  #1 [ffffcda8c346b988] __crash_kexec at ffffffff90a6b7a2
  #2 [ffffcda8c346ba48] vpanic at ffffffff9080bca9
  #3 [ffffcda8c346ba70] panic at ffffffff9080bedc
  #4 [ffffcda8c346bad0] sysrq_handle_crash at ffffffff91103a9a
  #5 [ffffcda8c346bad8] sysrq_handle_loglevel at ffffffff90841048  <<--
  #6 [ffffcda8c346bb08] write_sysrq_trigger at ffffffff91104679
  #7 [ffffcda8c346bb30] proc_reg_write at ffffffff90dd3137
  #8 [ffffcda8c346bb48] vfs_write at ffffffff90d17308
  #9 [ffffcda8c346bbd8] ksys_write at ffffffff90d1783d
#10 [ffffcda8c346bc10] do_syscall_64 at ffffffff9168e96d
#11 [ffffcda8c346bf40] entry_SYSCALL_64_after_hwframe at ffffffff9180012f

* NMI case

crash> bt
PID: 0        TASK: ffffffffa3c12940  CPU: 0    COMMAND: "swapper/0"
  #0 [fffffe7e99f94d50] machine_kexec at ffffffffa1ae91e8
  #1 [fffffe7e99f94d68] __crash_kexec at ffffffffa1c6b7a2
  #2 [fffffe7e99f94e28] vpanic at ffffffffa1a0bca9
  #3 [fffffe7e99f94e50] panic at ffffffffa1a0bedc
  #4 [fffffe7e99f94eb0] panic at ffffffffa1a0bee8    <<--
  #5 [fffffe7e99f94eb8] unknown_nmi_error at ffffffffa1aa39a7
  #6 [fffffe7e99f94ed0] exc_nmi at ffffffffa2891576
  #7 [fffffe7e99f94ef0] end_repeat_nmi at ffffffffa2a01d9d
     [exception RIP: default_idle+15]


On the other hand, kernel's call trace looks correct, so I looked into
how the kernel prints this.

[  133.526125] Call Trace:
[  133.526126]  <NMI>
[  133.526128]  dump_stack_lvl+0x4e/0x70
[  133.526131]  vpanic+0xc2/0x290
[  133.526133]  panic+0x6c/0x6c
[  133.526134]  nmi_panic.cold+0xc/0xc
[  133.526135]  unknown_nmi_error+0x77/0xa0
[  133.526137]  exc_nmi+0x106/0x180
[  133.526140]  end_repeat_nmi+0xf/0x53


* When printing a symbol for call trace, it searches for "address - 1".

static void printk_stack_address(unsigned long address, int reliable,
                                  const char *log_lvl)
{
         touch_nmi_watchdog();
         printk("%s %s%pBb\n", log_lvl, reliable ? "" : "?
", (void *)address);
}                    ^^^^

This "%pBb" gets to the following function.

/**
  * sprint_backtrace_build_id - Look up a backtrace symbol and return it in a text
buffer
  * @buffer: buffer to be stored
  * @address: address to lookup
  *
  * This function is for stack backtrace and does the same thing as
  * sprint_symbol() but with modified/decreased @address. If there is a
  * tail-call to the function marked "noreturn", gcc optimized out code after
  * the call so that the stack-saved return address could point outside of the
<<--
  * caller. This function ensures that kallsyms will find the original caller
  * by decreasing @address. This function also appends the module build ID to
  * the @buffer if @address is within a kernel module.
  *
  * This function returns the number of bytes stored in @buffer.
  */
int sprint_backtrace_build_id(char *buffer, unsigned long address)
{
         return __sprint_symbol(buffer, address, -1, 1, 1);
}                                               ^^^

/* Look up a kernel symbol and return it in a text buffer. */
static int __sprint_symbol(char *buffer, unsigned long address,
                            int symbol_offset, int add_offset, int add_buildid)
{
...
         address += symbol_offset; // search for the symbol with symbol_offset(-1)
         len = kallsyms_lookup_buildid(address, &size, &offset, &modname,
&buildid,
                                        buffer);
         if (!len)
                 return sprintf(buffer, "0x%lx", address - symbol_offset);

         offset -= symbol_offset;  // adjust offset (+1)

         if (add_offset)           // print offset
                 len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);


* When searching for ORC data, usually "ip - 1" is used.

bool unwind_next_frame(struct unwind_state *state)
{
...
         /*
          * Find the orc_entry associated with the text address.
          *
          * For a call frame (as opposed to a signal frame), state->ip points to
          * the instruction after the call.  That instruction's stack layout
          * could be different from the call instruction's layout, for example
          * if the call was to a noreturn function.  So get the ORC data for the 
<<--
          * call instruction itself.
          */
         orc = orc_find(state->signal ? state->ip : state->ip - 1);
                                                    ^^^^^^^^^^^^^

So I tried a simple patch like this at first:

--- a/x86_64.c
+++ b/x86_64.c
@@ -3229,7 +3229,8 @@ x86_64_print_stack_entry(struct bt_info *bt, FILE *ofp, int level,
         if (!(bt->flags & BT_SAVE_EFRAME_IP))
                 bt->eframe_ip = 0;
         offset = 0;
-       sp = value_search(text, &offset);
+       sp = value_search(text-1, &offset);
+       offset++;
         if (!sp)
                 return BACKTRACE_ENTRY_IGNORED;
  
@@ -8811,7 +8812,9 @@ x86_64_get_framesize(struct bt_info *bt, ulong textaddr, ulong rsp,
char *stack_
                         return 0;
         }
  
-        if (!(sp = value_search(textaddr, &offset))) {
+       sp = value_search(textaddr-1, &offset);
+       offset++;
+       if (!sp) {
                 if (!(bt->flags & BT_FRAMESIZE_DEBUG))
                         bt->flags |= BT_FRAMESIZE_DISABLE;
                  return 0;
@@ -8887,7 +8890,7 @@ x86_64_get_framesize(struct bt_info *bt, ulong textaddr, ulong rsp,
char *stack_
         if ((sp->value >= kt->init_begin) && (sp->value <
kt->init_end))
                 return 0;
  
-       if ((machdep->flags & ORC) && (korc = orc_find(textaddr))) {
+       if ((machdep->flags & ORC) && (korc = orc_find(textaddr-1))) {
                 if (CRASHDEBUG(1)) {
                         struct ORC_data *orc = &machdep->machspec->orc;
                         fprintf(fp,


This looks good at first glance.

crash> bt
PID: 0        TASK: ffffffffa3c12940  CPU: 0    COMMAND: "swapper/0"
  #0 [fffffe7e99f94d50] machine_kexec at ffffffffa1ae91e8
  #1 [fffffe7e99f94d68] __crash_kexec at ffffffffa1c6b7a2
  #2 [fffffe7e99f94e28] vpanic at ffffffffa1a0bca9
  #3 [fffffe7e99f94e50] panic at ffffffffa1a0bedc
  #4 [fffffe7e99f94eb0] nmi_panic.cold at ffffffffa1a0bee8
  #5 [fffffe7e99f94eb8] unknown_nmi_error at ffffffffa1aa39a7
  #6 [fffffe7e99f94ed0] exc_nmi at ffffffffa2891576
  #7 [fffffe7e99f94ef0] end_repeat_nmi at ffffffffa2a01d9d
     [exception RIP: default_idle+15]
     RIP: ffffffffa2895c6f  RSP: ffffffffa3c03e88  RFLAGS: 00000256
     RAX: 0000000000000000  RBX: 0000000000000000  RCX: ffff8cb88b0ecf08
     RDX: ffffd2b200dc74a1  RSI: 0000000000000087  RDI: 000000000008086c
     RBP: ffffffffa3c12940   R8: 000000000008086c   R9: 0000000000000000
     R10: 0000000000000001  R11: fefefefefefefeff  R12: 0000000000000000
     R13: 000000007d41f000  R14: 000000000000000c  R15: 000000007d41f000
     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- <NMI exception stack> ---
  #8 [ffffffffa3c03e88] default_idle at ffffffffa2895c6f
  #9 [ffffffffa3c03e88] default_idle_call at ffffffffa2895ef9
#10 [ffffffffa3c03e90] cpuidle_idle_call at ffffffffa1bbf4ba
#11 [ffffffffa3c03ec8] do_idle at ffffffffa1bbf573
...


But there are some issues, which I've found so far:

* How we can emulate state->signal=1 case in crash.
Your method (check if the prev is "call") may be good, is there any
more orc-based or robust way?

* "bt -s" option still prints wrong symbols.

What do you think about this way?

Thanks,
Kazu


> 
> Signed-off-by: Tao Liu <ltao(a)redhat.com&gt;
> ---
>   x86_64.c | 34 +++++++++++++++++++++++++++++++++-
>   1 file changed, 33 insertions(+), 1 deletion(-)
> 
> diff --git a/x86_64.c b/x86_64.c
> index b2cddbf..70bc1da 100644
> --- a/x86_64.c
> +++ b/x86_64.c
> @@ -8887,7 +8887,38 @@ x86_64_get_framesize(struct bt_info *bt, ulong textaddr, ulong
rsp, char *stack_
>   	if ((sp->value >= kt->init_begin) && (sp->value <
kt->init_end))
>   		return 0;
>   
> -	if ((machdep->flags & ORC) && (korc = orc_find(textaddr))) {
> +	if (machdep->flags & ORC) {
> +		/*
> +		* Disassemble & check if the 1st previous instruction of textaddr
> +		* is "call". If it does, the orc should unwind against the
> +		* address of the "call" inst, rather than the textaddr itself.
> +		*/
> +		open_tmpfile2();
> +		sprintf(buf, "x/-1i 0x%lx", textaddr);
> +		if (!gdb_pass_through(buf, pc->tmpfile2, GNU_RETURN_ON_ERROR)) {
> +			close_tmpfile2();
> +			bt->flags |= BT_FRAMESIZE_DISABLE;
> +			return 0;
> +		}
> +		rewind(pc->tmpfile2);
> +		if (fgets(buf, BUFSIZE, pc->tmpfile2)) {
> +			c = parse_line(buf, arglist);
> +			for (int i = 0; i < c; i++) {
> +				if (!strcmp(arglist[i], "call")) {
> +					reterror = 0;
> +					ulong tmp_val = htol(arglist[0],
> +							RETURN_ON_ERROR, &reterror);
> +					if (!reterror)
> +						textaddr = tmp_val;
> +					break;
> +				}
> +			}
> +		}
> +		close_tmpfile2();
> +
> +		if ((korc = orc_find(textaddr)) == NULL)
> +			goto out_orc;
> +
>   		if (CRASHDEBUG(1)) {
>   			struct ORC_data *orc = &machdep->machspec->orc;
>   			fprintf(fp,
> @@ -8939,6 +8970,7 @@ x86_64_get_framesize(struct bt_info *bt, ulong textaddr, ulong
rsp, char *stack_
>   		}
>   	}
>   
> +out_orc:
>   	framesize = max = 0;
>           max_instructions = textaddr - sp->value;
>   	instr = arg = -1;
    

2026

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

2006

2005

[Crash-utility] Re: [PATCH 1/2] x86_64: Correct the unwind address for orc when "call" instruction encountered