On Thu, Jun 27, 2024 at 4:27 PM lijiang <lijiang(a)redhat.com> wrote:
On Fri, May 31, 2024 at 5:33 PM
<devel-request(a)lists.crash-utility.osci.io>
wrote:
> Date: Fri, 31 May 2024 17:19:32 +0800
> From: Tao Liu <ltao(a)redhat.com>
> Subject: [Crash-utility] [PATCH v4 09/16] x86_64: Add gdb stack
> unwinding support
> To: devel(a)lists.crash-utility.osci.io
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>, "Naveen N . Rao"
> <naveen.n.rao(a)linux.vnet.ibm.com>, Lianbo Jiang <
> lijiang(a)redhat.com>,
> Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Message-ID: <20240531091939.97828-10-ltao(a)redhat.com>
> Content-Type: text/plain; charset=UTF-8
>
> This patch will add stack unwinding support for x86_64 CPU arch. It
> follows the tech path of ppc64_get_stack_frame() & ppc64_get_cpu_reg()
> in ppc64.c, to get and consume the cpu regs.
>
> One different case is, we need to handle inactive_task_frame structure
> specifically in x86_64.
>
> If inactive_task_frame structure enabled, for inactive tasks, 7 regs can
> be get from inactive_task_frame in stack, and sp need to rewind back to
> skip inactive_task_frame structure (See code comments in
> x86_64.c:x86_64_get_stack_frame()); for active tasks, we get regs by the
> original way.
>
> If inactive_task_frame structure is not enabled, for inactive tasks, the
> stack frame is organized as linked list, and sp/bp can be get from stack;
> for active tasks, we get regs by the original way.
>
> Cc: Sourabh Jain <sourabhjain(a)linux.ibm.com>
> Cc: Hari Bathini <hbathini(a)linux.ibm.com>
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>
> Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
> Cc: Lianbo Jiang <lijiang(a)redhat.com>
> Cc: HAGIO KAZUHITO(萩尾 一仁) <k-hagio-ab(a)nec.com>
> Cc: Tao Liu <ltao(a)redhat.com>
> Cc: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Signed-off-by: Tao Liu <ltao(a)redhat.com>
> ---
> defs.h | 19 +++-
> x86_64.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
> 2 files changed, 283 insertions(+), 27 deletions(-)
>
> diff --git a/defs.h b/defs.h
> index 8450aea..ed52cc3 100644
> --- a/defs.h
> +++ b/defs.h
> @@ -2241,6 +2241,20 @@ struct offset_table { /* stash
> of commonly-used offsets */
> long mnt_namespace_nr_mounts;
> long mount_mnt_node;
> long log_caller_id;
> + long inactive_task_frame_r15;
> + long inactive_task_frame_r14;
> + long inactive_task_frame_r13;
> + long inactive_task_frame_r12;
> + long inactive_task_frame_flags;
> + long inactive_task_frame_si;
> + long inactive_task_frame_di;
> + long inactive_task_frame_bx;
> + long thread_struct_es;
> + long thread_struct_ds;
> + long thread_struct_fsbase;
> + long thread_struct_gsbase;
> + long thread_struct_fs;
> + long thread_struct_gs;
> };
>
Also please dump them in the dump_offset_table().
Thanks
Lianbo
> struct size_table { /* stash of commonly-used sizes */
> @@ -8008,7 +8022,7 @@ extern int have_full_symbols(void);
>
> /*
> * Register numbers must be in sync with gdb/features/i386/64bit-core.c
> - * to make crash_target->fetch_registers() ---> machdep->get_cpu_reg()
> + * to make crash_target->fetch_registers() --->
> machdep->get_current_task_reg()
> * working properly.
> */
> enum x86_64_regnum {
> @@ -8052,6 +8066,9 @@ enum x86_64_regnum {
> FOSEG_REGNUM,
> FOOFF_REGNUM,
> FOP_REGNUM,
> + FS_BASE_REGNUM = 152,
> + GS_BASE_REGNUM,
> + ORIG_RAX_REGNUM,
> LAST_REGNUM
> };
>
> diff --git a/x86_64.c b/x86_64.c
> index 5ab2852..4ba0b40 100644
> --- a/x86_64.c
> +++ b/x86_64.c
> @@ -126,7 +126,7 @@ static int x86_64_get_framesize(struct bt_info *,
> ulong, ulong, char *);
> static void x86_64_framesize_debug(struct bt_info *);
> static void x86_64_get_active_set(void);
> static int x86_64_get_kvaddr_ranges(struct vaddr_range *);
> -static int x86_64_get_cpu_reg(int, int, const char *, int, void *);
> +static int x86_64_get_current_task_reg(int, const char *, int, void *);
> static int x86_64_verify_paddr(uint64_t);
> static void GART_init(void);
> static void x86_64_exception_stacks_init(void);
> @@ -143,6 +143,29 @@ struct machine_specific x86_64_machine_specific = {
> 0 };
> static const char *exception_functions_orig[];
> static const char *exception_functions_5_8[];
>
> +/* Use this hardwired version -- sometimes the
> + * debuginfo doesn't pick this up even though
> + * it exists in the kernel; it shouldn't change.
> + */
> +struct x86_64_user_regs_struct {
> + unsigned long r15, r14, r13, r12, bp, bx;
> + unsigned long r11, r10, r9, r8, ax, cx, dx;
> + unsigned long si, di, orig_ax, ip, cs;
> + unsigned long flags, sp, ss, fs_base;
> + unsigned long gs_base, ds, es, fs, gs;
> +};
> +
> +struct user_regs_bitmap_struct {
> + struct {
> + unsigned long r15, r14, r13, r12, bp, bx;
> + unsigned long r11, r10, r9, r8, ax, cx, dx;
> + unsigned long si, di, orig_ax, ip, cs;
> + unsigned long flags, sp, ss, fs_base;
> + unsigned long gs_base, ds, es, fs, gs;
> + };
> + ulong bitmap;
> +};
>
The struct user_regs_bitmap_struct can de defined as below:
+struct user_regs_bitmap_struct {
+ struct user_regs_bitmap_struct uregs;
+ ulong bitmap;
+};
+
> /*
> * Do all necessary machine-specific setup here. This is called several
> * times during initialization.
> @@ -195,7 +218,7 @@ x86_64_init(int when)
> machdep->machspec->irq_eframe_link = UNINITIALIZED;
> machdep->machspec->irq_stack_gap = UNINITIALIZED;
> machdep->get_kvaddr_ranges = x86_64_get_kvaddr_ranges;
> - machdep->get_current_task_reg = x86_64_get_cpu_reg;
> + machdep->get_current_task_reg =
> x86_64_get_current_task_reg;
> if (machdep->cmdline_args[0])
> parse_cmdline_args();
> if ((string = pc->read_vmcoreinfo("relocate"))) {
> @@ -500,6 +523,12 @@ x86_64_init(int when)
> MEMBER_OFFSET_INIT(thread_struct_rsp,
> "thread_struct", "sp");
> if (INVALID_MEMBER(thread_struct_rsp0))
> MEMBER_OFFSET_INIT(thread_struct_rsp0,
> "thread_struct", "sp0");
> + MEMBER_OFFSET_INIT(thread_struct_es, "thread_struct",
> "es");
> + MEMBER_OFFSET_INIT(thread_struct_ds, "thread_struct",
> "ds");
> + MEMBER_OFFSET_INIT(thread_struct_fsbase, "thread_struct",
> "fsbase");
> + MEMBER_OFFSET_INIT(thread_struct_gsbase, "thread_struct",
> "gsbase");
> + MEMBER_OFFSET_INIT(thread_struct_fs, "thread_struct",
> "fs");
> + MEMBER_OFFSET_INIT(thread_struct_gs, "thread_struct",
> "gs");
> STRUCT_SIZE_INIT(tss_struct, "tss_struct");
> MEMBER_OFFSET_INIT(tss_struct_ist, "tss_struct",
"ist");
> if (INVALID_MEMBER(tss_struct_ist)) {
> @@ -583,17 +612,6 @@ x86_64_init(int when)
> "user_regs_struct", "r15");
> STRUCT_SIZE_INIT(user_regs_struct, "user_regs_struct");
> if (!VALID_STRUCT(user_regs_struct)) {
> - /* Use this hardwired version -- sometimes the
> - * debuginfo doesn't pick this up even though
> - * it exists in the kernel; it shouldn't change.
> - */
> - struct x86_64_user_regs_struct {
> - unsigned long r15, r14, r13, r12, bp, bx;
> - unsigned long r11, r10, r9, r8, ax, cx,
> dx;
> - unsigned long si, di, orig_ax, ip, cs;
> - unsigned long flags, sp, ss, fs_base;
> - unsigned long gs_base, ds, es, fs, gs;
> - };
> ASSIGN_SIZE(user_regs_struct) =
> sizeof(struct x86_64_user_regs_struct);
> ASSIGN_OFFSET(user_regs_struct_rip) =
> @@ -891,7 +909,7 @@ x86_64_dump_machdep_table(ulong arg)
> fprintf(fp, " is_page_ptr: x86_64_is_page_ptr()\n");
> fprintf(fp, " verify_paddr: x86_64_verify_paddr()\n");
> fprintf(fp, " get_kvaddr_ranges: x86_64_get_kvaddr_ranges()\n");
> - fprintf(fp, " get_cpu_reg: x86_64_get_cpu_reg()\n");
> + fprintf(fp, "get_current_task_reg:
> x86_64_get_current_task_reg()\n");
> fprintf(fp, " init_kernel_pgd: x86_64_init_kernel_pgd()\n");
> fprintf(fp, "clear_machdep_cache:
> x86_64_clear_machdep_cache()\n");
> fprintf(fp, " xendump_p2m_create: %s\n", PVOPS_XEN() ?
> @@ -4971,22 +4989,124 @@ x86_64_eframe_verify(struct bt_info *bt, long
> kvaddr, long cs, long ss,
> return FALSE;
> }
>
> +#define GET_REG_FROM_INACTIVE_TASK_FRAME(reg) \
> +({ \
> + ulong offset, reg_value = 0, rsp; \
> + if (VALID_MEMBER(inactive_task_frame_bp)) { \
> + offset = OFFSET(task_struct_thread) +
> OFFSET(thread_struct_rsp); \
> + readmem(bt->task + offset, KVADDR, &rsp, \
> + sizeof(ulong), "thread_struct.rsp",
> FAULT_ON_ERROR); \
> + readmem(rsp + OFFSET(inactive_task_frame_##reg), KVADDR,
> ®_value, \
> + sizeof(ulong), "inactive_task_frame.##reg",
> FAULT_ON_ERROR); \
> + } \
> + reg_value; \
> +})
>
I would suggest changing the above macro definition to a function, at
least there are two advantages:
[1] once the task_struct or inactive_task_frame is changed in the future,
it is easy to follow its changes.
[2] easy to debug, and more readable.
> +
> /*
> * Get a stack frame combination of pc and ra from the most relevent
> spot.
> */
> static void
> x86_64_get_stack_frame(struct bt_info *bt, ulong *pcp, ulong *spp)
> {
> - if (bt->flags & BT_DUMPFILE_SEARCH)
> - return x86_64_get_dumpfile_stack_frame(bt, pcp, spp);
> + struct user_regs_bitmap_struct *ur_bitmap;
> + ulong pcp_save = *pcp;
> + ulong spp_save = *spp;
> + ulong sp;
>
> if (bt->flags & BT_SKIP_IDLE)
> bt->flags &= ~BT_SKIP_IDLE;
> + if (pcp)
> + *pcp = x86_64_get_pc(bt);
> + if (spp)
> + sp = *spp = x86_64_get_sp(bt);
> + ur_bitmap = (struct user_regs_bitmap_struct
> *)GETBUF(sizeof(*ur_bitmap));
> + memset(ur_bitmap, 0, sizeof(struct user_regs_bitmap_struct));
>
Can you keep the same style?(sorry to be picky :-))
+ ur_bitmap = (struct user_regs_bitmap_struct
*)GETBUF(sizeof(*ur_bitmap));
+ memset(ur_bitmap, 0, sizeof(*ur_bitmap));
or
+ ur_bitmap = (struct user_regs_bitmap_struct *)GETBUF(sizeof(struct
user_regs_bitmap_struct)));
+ memset(ur_bitmap, 0, sizeof(struct user_regs_bitmap_struct));
+
> + if (VALID_MEMBER(inactive_task_frame_bp)) {
> + if (!is_task_active(bt->task)) {
> + /*
> + * For inactive tasks in live and dumpfile, regs
> can be
> + * get from inactive_task_frame struct.
> + */
> + ur_bitmap->r15 =
> GET_REG_FROM_INACTIVE_TASK_FRAME(r15);
> + ur_bitmap->r14 =
> GET_REG_FROM_INACTIVE_TASK_FRAME(r14);
> + ur_bitmap->r13 =
> GET_REG_FROM_INACTIVE_TASK_FRAME(r13);
> + ur_bitmap->r12 =
> GET_REG_FROM_INACTIVE_TASK_FRAME(r12);
> + ur_bitmap->bx =
> GET_REG_FROM_INACTIVE_TASK_FRAME(bx);
> + ur_bitmap->bp =
> GET_REG_FROM_INACTIVE_TASK_FRAME(bp);
> + /*
> + For inactive tasks:
> + crash> task -x 1|grep sp
> + sp = 0xffffc90000013d00
> + crash> rd ffffc90000013d00 32
> + ffffc90000013d00: ffff888104dad4a8
> 0000000000000000 r15,r14
> + ffffc90000013d10: ffff888100280000
> ffff888100216500 r13,r12
> + ffffc90000013d20: ffff888100217018
> ffff88817fd2c800 rbx,rbp
> + ffffc90000013d30: ffffffff81a6a1b3
> ffffc90000013de0 saved_rip,...
> + ffffc90000013d40: ffff888100000004
> 99ccbf53ea493000
> + ffffc90000013d50: ffff888100216500
> ffff888100216500
> +
> + crash> dis __schedule
> + ...
> + 0xffffffff81a6a1ab <__schedule+507>: mov
> %r13,%rsi
> + 0xffffffff81a6a1ae <__schedule+510>: call
> 0xffffffff81003490 <__switch_to_asm>
> + 0xffffffff81a6a1b3 <__schedule+515>: mov
> %rax,%rdi <<=== saved_rip
> + ...
> + crash> dis __switch_to_asm
> + 0xffffffff81003490 <__switch_to_asm>: push
> %rbp
> + 0xffffffff81003491 <__switch_to_asm+1>: push
> %rbx
> + 0xffffffff81003492 <__switch_to_asm+2>: push
> %r12
> + 0xffffffff81003494 <__switch_to_asm+4>: push
> %r13
> + 0xffffffff81003496 <__switch_to_asm+6>: push
> %r14
> + 0xffffffff81003498 <__switch_to_asm+8>: push
> %r15
> + 0xffffffff8100349a <__switch_to_asm+10>:
> mov %rsp,0x14d8(%rdi)
> + ...
> + Now saved_rip = ffffffff81a6a1b3, and we are
> starting
> + the stack unwind at saved_rip, which is function
> __schedule()
> + instead of function __switch_to_asm(), so the
> stack pointer should
> + be rewind from ffffc90000013d00 back to
> ffffc90000013d38,
> + aka *spp += 7 * reg_len. Otherwise we are
> unwinding function
> + __schedule() but with __switch_to_asm()'s stack
> frame, which
> + will fail.
> + */
> + sp += 7 * sizeof(unsigned long);
>
It seems this may still rely on the compiler, have you tried another
compiler? I'm not sure if there are the same results, just my concerns.
Anyway, If there is no better way or solution, at least we should comment
on it and point out the current limitations. What do you think?
+ ur_bitmap->bitmap += 0x3f;
>
Also why is it 0x3f? Can you explain the details?
> + } else {
> + /*
> + * For active tasks in dumpfile, we get regs
> through the
> + * original way. For active tasks in live, we only
> get
> + * ip and sp in the end of the function.
> + */
> + if (bt->flags & BT_DUMPFILE_SEARCH) {
> + FREEBUF(ur_bitmap);
> + bt->need_free = FALSE;
>
See the comments in the previous thread.
> + *pcp = pcp_save;
> + *spp = spp_save;
> + return
> x86_64_get_dumpfile_stack_frame(bt, pcp, spp);
> + }
> + }
> + } else {
> + if (!is_task_active(bt->task)) {
> + readmem(*spp, KVADDR, &(ur_bitmap->bp),
> + sizeof(ulong), "ur_bitmap->bp",
> FAULT_ON_ERROR);
> + ur_bitmap->bitmap += 0x10;
> + } else {
> + if (bt->flags & BT_DUMPFILE_SEARCH) {
> + FREEBUF(ur_bitmap);
> + bt->need_free = FALSE;
> + *pcp = pcp_save;
> + *spp = spp_save;
> + return
> x86_64_get_dumpfile_stack_frame(bt, pcp, spp);
> + }
> + }
> + }
>
> - if (pcp)
> - *pcp = x86_64_get_pc(bt);
> - if (spp)
> - *spp = x86_64_get_sp(bt);
> + ur_bitmap->ip = *pcp;
> + ur_bitmap->sp = sp;
> + ur_bitmap->bitmap += 0x50000;
> +
> + bt->machdep = ur_bitmap;
> + bt->need_free = TRUE;
> }
>
> /*
> @@ -6458,6 +6578,14 @@ x86_64_ORC_init(void)
>
> MEMBER_OFFSET_INIT(inactive_task_frame_bp, "inactive_task_frame",
> "bp");
> MEMBER_OFFSET_INIT(inactive_task_frame_ret_addr,
> "inactive_task_frame", "ret_addr");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r15,
> "inactive_task_frame", "r15");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r14,
> "inactive_task_frame", "r14");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r13,
> "inactive_task_frame", "r13");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r12,
> "inactive_task_frame", "r12");
> + MEMBER_OFFSET_INIT(inactive_task_frame_flags,
> "inactive_task_frame", "flags");
> + MEMBER_OFFSET_INIT(inactive_task_frame_si, "inactive_task_frame",
> "si");
> + MEMBER_OFFSET_INIT(inactive_task_frame_di, "inactive_task_frame",
> "di");
> + MEMBER_OFFSET_INIT(inactive_task_frame_bx, "inactive_task_frame",
> "bx");
>
> orc->has_signal = MEMBER_EXISTS("orc_entry",
"signal"); /* added
> at 6.3 */
> orc->has_end = MEMBER_EXISTS("orc_entry", "end");
/*
> removed at 6.4 */
> @@ -9071,17 +9199,128 @@ x86_64_get_kvaddr_ranges(struct vaddr_range *vrp)
> return cnt;
> }
>
> +#define REG_CASE(R, r) \
> + case R##_REGNUM: \
> + if (size != sizeof(ur_bitmap->r)) { \
> + ret = FALSE; break; \
> + } else { \
> + memcpy(value, &ur_bitmap->r, size); \
> + ret = TRUE; break; \
> + }
> +
> static int
> -x86_64_get_cpu_reg(int cpu, int regno, const char *name,
> +x86_64_get_current_task_reg(int regno, const char *name,
> int size, void *value)
> {
> - if (regno >= LAST_REGNUM)
> - return FALSE;
> + struct bt_info bt_info, bt_setup;
> + struct task_context *tc;
> + struct user_regs_bitmap_struct *ur_bitmap;
> + ulong ip, sp;
> + bool ret = FALSE;
>
> - if (VMSS_DUMPFILE())
> - return vmware_vmss_get_cpu_reg(cpu, regno, name, size,
> value);
> + switch (regno) {
> + case RAX_REGNUM ... GS_REGNUM:
> + case FS_BASE_REGNUM ... ORIG_RAX_REGNUM:
> + break;
> + default:
> + return FALSE;
> + }
>
>
Ditto.
> - return FALSE;
> + tc = CURRENT_CONTEXT();
> + if (!tc)
> + return FALSE;
> +
> + if (VMSS_DUMPFILE())
> + return vmware_vmss_get_cpu_reg(tc->processor, regno,
> name, size, value);
> +
> + BZERO(&bt_setup, sizeof(struct bt_info));
> + clone_bt_info(&bt_setup, &bt_info, tc);
> + fill_stackbuf(&bt_info);
> +
> + // reusing the get_dumpfile_regs function to get pt regs structure
> + get_dumpfile_regs(&bt_info, &sp, &ip);
> + if (bt_info.stackbuf)
> + FREEBUF(bt_info.stackbuf);
> + ur_bitmap = (struct user_regs_bitmap_struct *)bt_info.machdep;
> + if (!ur_bitmap)
> + return FALSE;
> +
> + /* Get all registers from elf notes*/
> + if (!bt_info.need_free) {
> + goto get_all;
> + }
> +
>
Ditto.
> + /* Get subset registers from stack frame*/
> + switch (ur_bitmap->bitmap) {
> + case 0x5003f:
> + switch (regno) {
> + case R12_REGNUM ... R15_REGNUM:
> + case RBP_REGNUM:
> + case RBX_REGNUM:
> + case RIP_REGNUM:
> + case RSP_REGNUM:
> + break;
> + default:
> + return FALSE;
> + }
> + break;
> + case 0x50010:
> + switch (regno) {
> + case RBP_REGNUM:
> + case RIP_REGNUM:
> + case RSP_REGNUM:
> + break;
> + default:
> + return FALSE;
> + }
> + break;
> + case 0x50000:
> + switch (regno) {
> + case RIP_REGNUM:
> + case RSP_REGNUM:
> + break;
> + default:
> + return FALSE;
> + }
> + break;
> + }
>
Can you help to explain more details about several magic number values?
Such as 0x5003f, 0x50000, 0x50010. Or you may point out where they come
from, it may be helpful to add them here as code comment.
Thanks
Lianbo
+
> +get_all:
> + switch (regno) {
> + REG_CASE(RAX, ax);
> + REG_CASE(RBX, bx);
> + REG_CASE(RCX, cx);
> + REG_CASE(RDX, dx);
> + REG_CASE(RSI, si);
> + REG_CASE(RDI, di);
> + REG_CASE(RBP, bp);
> + REG_CASE(RSP, sp);
> + REG_CASE(R8, r8);
> + REG_CASE(R9, r9);
> + REG_CASE(R10, r10);
> + REG_CASE(R11, r11);
> + REG_CASE(R12, r12);
> + REG_CASE(R13, r13);
> + REG_CASE(R14, r14);
> + REG_CASE(R15, r15);
> + REG_CASE(RIP, ip);
> + REG_CASE(EFLAGS, flags);
> + REG_CASE(CS, cs);
> + REG_CASE(SS, ss);
> + REG_CASE(DS, ds);
> + REG_CASE(ES, es);
> + REG_CASE(FS, fs);
> + REG_CASE(GS, gs);
> + REG_CASE(FS_BASE, fs_base);
> + REG_CASE(GS_BASE, gs_base);
> + REG_CASE(ORIG_RAX, orig_ax);
> + }
> +
> + if (bt_info.need_free) {
> + FREEBUF(ur_bitmap);
> + bt_info.need_free = FALSE;
> + }
> + return ret;
> }
>
> /*
> --
> 2.40.1
>