On Fri, May 31, 2024 at 5:33 PM <devel-request(a)lists.crash-utility.osci.io
wrote:
Date: Fri, 31 May 2024 17:19:32 +0800
From: Tao Liu <ltao(a)redhat.com
Subject: [Crash-utility] [PATCH v4
09/16] x86_64: Add gdb stack
unwinding support
To: devel(a)lists.crash-utility.osci.io
Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>, "Naveen N . Rao"
<naveen.n.rao(a)linux.vnet.ibm.com>, Lianbo Jiang <
lijiang(a)redhat.com>,
Alexey Makhalov <alexey.makhalov(a)broadcom.com
Message-ID: <20240531091939.97828-10-ltao(a)redhat.com
Content-Type: text/plain; charset=UTF-8
This patch will add stack unwinding support for x86_64 CPU arch. It
follows the tech path of ppc64_get_stack_frame() & ppc64_get_cpu_reg()
in ppc64.c, to get and consume the cpu regs.
One different case is, we need to handle inactive_task_frame structure
specifically in x86_64.
If inactive_task_frame structure enabled, for inactive tasks, 7 regs can
be get from inactive_task_frame in stack, and sp need to rewind back to
skip inactive_task_frame structure (See code comments in
x86_64.c:x86_64_get_stack_frame()); for active tasks, we get regs by the
original way.
If inactive_task_frame structure is not enabled, for inactive tasks, the
stack frame is organized as linked list, and sp/bp can be get from stack;
for active tasks, we get regs by the original way.
Cc: Sourabh Jain <sourabhjain(a)linux.ibm.com
Cc: Hari
Bathini <hbathini(a)linux.ibm.com
Cc: Mahesh J Salgaonkar
<mahesh(a)linux.ibm.com
Cc: Naveen N. Rao
<naveen.n.rao(a)linux.vnet.ibm.com
Cc:
Lianbo Jiang <lijiang(a)redhat.com
Cc:
HAGIO KAZUHITO(萩尾 一仁) <k-hagio-ab(a)nec.com
Cc: Tao
Liu <ltao(a)redhat.com
Cc: Alexey Makhalov
<alexey.makhalov(a)broadcom.com
Signed-off-by: Tao Liu
<ltao(a)redhat.com
---
defs.h | 19 +++-
x86_64.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 283 insertions(+), 27 deletions(-)
diff --git a/defs.h b/defs.h
index 8450aea..ed52cc3 100644
--- a/defs.h
+++ b/defs.h
@@ -2241,6 +2241,20 @@ struct offset_table { /* stash
of commonly-used offsets */
long mnt_namespace_nr_mounts;
long mount_mnt_node;
long log_caller_id;
+ long inactive_task_frame_r15;
+ long inactive_task_frame_r14;
+ long inactive_task_frame_r13;
+ long inactive_task_frame_r12;
+ long inactive_task_frame_flags;
+ long inactive_task_frame_si;
+ long inactive_task_frame_di;
+ long inactive_task_frame_bx;
+ long thread_struct_es;
+ long thread_struct_ds;
+ long thread_struct_fsbase;
+ long thread_struct_gsbase;
+ long thread_struct_fs;
+ long thread_struct_gs;
};
struct size_table { /* stash of commonly-used sizes */
@@ -8008,7 +8022,7 @@ extern int have_full_symbols(void);
/*
* Register numbers must be in sync with gdb/features/i386/64bit-core.c
- * to make crash_target->fetch_registers() ---> machdep->get_cpu_reg()
+ * to make crash_target->fetch_registers() ---
machdep->get_current_task_reg()
* working properly.
*/
enum x86_64_regnum {
@@ -8052,6 +8066,9 @@ enum x86_64_regnum {
FOSEG_REGNUM,
FOOFF_REGNUM,
FOP_REGNUM,
+ FS_BASE_REGNUM = 152,
+ GS_BASE_REGNUM,
+ ORIG_RAX_REGNUM,
LAST_REGNUM
};
diff --git a/x86_64.c b/x86_64.c
index 5ab2852..4ba0b40 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -126,7 +126,7 @@ static int x86_64_get_framesize(struct bt_info *,
ulong, ulong, char *);
static void x86_64_framesize_debug(struct bt_info *);
static void x86_64_get_active_set(void);
static int x86_64_get_kvaddr_ranges(struct vaddr_range *);
-static int x86_64_get_cpu_reg(int, int, const char *, int, void *);
+static int x86_64_get_current_task_reg(int, const char *, int, void *);
static int x86_64_verify_paddr(uint64_t);
static void GART_init(void);
static void x86_64_exception_stacks_init(void);
@@ -143,6 +143,29 @@ struct machine_specific x86_64_machine_specific = { 0
};
static const char *exception_functions_orig[];
static const char *exception_functions_5_8[];
+/* Use this hardwired version -- sometimes the
+ * debuginfo doesn't pick this up even though
+ * it exists in the kernel; it shouldn't change.
+ */
+struct x86_64_user_regs_struct {
+ unsigned long r15, r14, r13, r12, bp, bx;
+ unsigned long r11, r10, r9, r8, ax, cx, dx;
+ unsigned long si, di, orig_ax, ip, cs;
+ unsigned long flags, sp, ss, fs_base;
+ unsigned long gs_base, ds, es, fs, gs;
+};
+
+struct user_regs_bitmap_struct {
+ struct {
+ unsigned long r15, r14, r13, r12, bp, bx;
+ unsigned long r11, r10, r9, r8, ax, cx, dx;
+ unsigned long si, di, orig_ax, ip, cs;
+ unsigned long flags, sp, ss, fs_base;
+ unsigned long gs_base, ds, es, fs, gs;
+ };
+ ulong bitmap;
+};
The struct user_regs_bitmap_struct can de defined as below:
+struct user_regs_bitmap_struct {
+ struct user_regs_bitmap_struct uregs;
+ ulong bitmap;
+};
+
/*
* Do all necessary machine-specific setup here. This is called several
* times during initialization.
@@ -195,7 +218,7 @@ x86_64_init(int when)
machdep->machspec->irq_eframe_link = UNINITIALIZED;
machdep->machspec->irq_stack_gap = UNINITIALIZED;
machdep->get_kvaddr_ranges = x86_64_get_kvaddr_ranges;
- machdep->get_current_task_reg = x86_64_get_cpu_reg;
+ machdep->get_current_task_reg =
x86_64_get_current_task_reg;
if (machdep->cmdline_args[0])
parse_cmdline_args();
if ((string = pc->read_vmcoreinfo("relocate"))) {
@@ -500,6 +523,12 @@ x86_64_init(int when)
MEMBER_OFFSET_INIT(thread_struct_rsp,
"thread_struct", "sp");
if (INVALID_MEMBER(thread_struct_rsp0))
MEMBER_OFFSET_INIT(thread_struct_rsp0,
"thread_struct", "sp0");
+ MEMBER_OFFSET_INIT(thread_struct_es, "thread_struct",
"es");
+ MEMBER_OFFSET_INIT(thread_struct_ds, "thread_struct",
"ds");
+ MEMBER_OFFSET_INIT(thread_struct_fsbase, "thread_struct",
"fsbase");
+ MEMBER_OFFSET_INIT(thread_struct_gsbase, "thread_struct",
"gsbase");
+ MEMBER_OFFSET_INIT(thread_struct_fs, "thread_struct",
"fs");
+ MEMBER_OFFSET_INIT(thread_struct_gs, "thread_struct",
"gs");
STRUCT_SIZE_INIT(tss_struct, "tss_struct");
MEMBER_OFFSET_INIT(tss_struct_ist, "tss_struct",
"ist");
if (INVALID_MEMBER(tss_struct_ist)) {
@@ -583,17 +612,6 @@ x86_64_init(int when)
"user_regs_struct", "r15");
STRUCT_SIZE_INIT(user_regs_struct, "user_regs_struct");
if (!VALID_STRUCT(user_regs_struct)) {
- /* Use this hardwired version -- sometimes the
- * debuginfo doesn't pick this up even though
- * it exists in the kernel; it shouldn't change.
- */
- struct x86_64_user_regs_struct {
- unsigned long r15, r14, r13, r12, bp, bx;
- unsigned long r11, r10, r9, r8, ax, cx, dx;
- unsigned long si, di, orig_ax, ip, cs;
- unsigned long flags, sp, ss, fs_base;
- unsigned long gs_base, ds, es, fs, gs;
- };
ASSIGN_SIZE(user_regs_struct) =
sizeof(struct x86_64_user_regs_struct);
ASSIGN_OFFSET(user_regs_struct_rip) =
@@ -891,7 +909,7 @@ x86_64_dump_machdep_table(ulong arg)
fprintf(fp, " is_page_ptr: x86_64_is_page_ptr()\n");
fprintf(fp, " verify_paddr: x86_64_verify_paddr()\n");
fprintf(fp, " get_kvaddr_ranges: x86_64_get_kvaddr_ranges()\n");
- fprintf(fp, " get_cpu_reg: x86_64_get_cpu_reg()\n");
+ fprintf(fp, "get_current_task_reg:
x86_64_get_current_task_reg()\n");
fprintf(fp, " init_kernel_pgd: x86_64_init_kernel_pgd()\n");
fprintf(fp, "clear_machdep_cache:
x86_64_clear_machdep_cache()\n");
fprintf(fp, " xendump_p2m_create: %s\n", PVOPS_XEN() ?
@@ -4971,22 +4989,124 @@ x86_64_eframe_verify(struct bt_info *bt, long
kvaddr, long cs, long ss,
return FALSE;
}
+#define GET_REG_FROM_INACTIVE_TASK_FRAME(reg) \
+({ \
+ ulong offset, reg_value = 0, rsp; \
+ if (VALID_MEMBER(inactive_task_frame_bp)) { \
+ offset = OFFSET(task_struct_thread) +
OFFSET(thread_struct_rsp); \
+ readmem(bt->task + offset, KVADDR, &rsp, \
+ sizeof(ulong), "thread_struct.rsp",
FAULT_ON_ERROR); \
+ readmem(rsp + OFFSET(inactive_task_frame_##reg), KVADDR,
®_value, \
+ sizeof(ulong), "inactive_task_frame.##reg",
FAULT_ON_ERROR); \
+ } \
+ reg_value; \
+})
I would suggest changing the above macro definition to a function, at least
there are two advantages:
[1] once the task_struct or inactive_task_frame is changed in the future,
it is easy to follow its changes.
[2] easy to debug, and more readable.
+
/*
* Get a stack frame combination of pc and ra from the most relevent
spot.
*/
static void
x86_64_get_stack_frame(struct bt_info *bt, ulong *pcp, ulong *spp)
{
- if (bt->flags & BT_DUMPFILE_SEARCH)
- return x86_64_get_dumpfile_stack_frame(bt, pcp, spp);
+ struct user_regs_bitmap_struct *ur_bitmap;
+ ulong pcp_save = *pcp;
+ ulong spp_save = *spp;
+ ulong sp;
if (bt->flags & BT_SKIP_IDLE)
bt->flags &= ~BT_SKIP_IDLE;
+ if (pcp)
+ *pcp = x86_64_get_pc(bt);
+ if (spp)
+ sp = *spp = x86_64_get_sp(bt);
+ ur_bitmap = (struct user_regs_bitmap_struct
*)GETBUF(sizeof(*ur_bitmap));
+ memset(ur_bitmap, 0, sizeof(struct user_regs_bitmap_struct));
Can you keep the same style?(sorry to be picky :-))
+ ur_bitmap = (struct user_regs_bitmap_struct
*)GETBUF(sizeof(*ur_bitmap));
+ memset(ur_bitmap, 0, sizeof(*ur_bitmap));
or
+ ur_bitmap = (struct user_regs_bitmap_struct *)GETBUF(sizeof(struct
user_regs_bitmap_struct)));
+ memset(ur_bitmap, 0, sizeof(struct user_regs_bitmap_struct));
+
+ if (VALID_MEMBER(inactive_task_frame_bp)) {
+ if (!is_task_active(bt->task)) {
+ /*
+ * For inactive tasks in live and dumpfile, regs
can be
+ * get from inactive_task_frame struct.
+ */
+ ur_bitmap->r15 =
GET_REG_FROM_INACTIVE_TASK_FRAME(r15);
+ ur_bitmap->r14 =
GET_REG_FROM_INACTIVE_TASK_FRAME(r14);
+ ur_bitmap->r13 =
GET_REG_FROM_INACTIVE_TASK_FRAME(r13);
+ ur_bitmap->r12 =
GET_REG_FROM_INACTIVE_TASK_FRAME(r12);
+ ur_bitmap->bx =
GET_REG_FROM_INACTIVE_TASK_FRAME(bx);
+ ur_bitmap->bp =
GET_REG_FROM_INACTIVE_TASK_FRAME(bp);
+ /*
+ For inactive tasks:
+ crash> task -x 1|grep sp
+ sp = 0xffffc90000013d00
+ crash> rd ffffc90000013d00 32
+ ffffc90000013d00: ffff888104dad4a8
0000000000000000 r15,r14
+ ffffc90000013d10: ffff888100280000
ffff888100216500 r13,r12
+ ffffc90000013d20: ffff888100217018
ffff88817fd2c800 rbx,rbp
+ ffffc90000013d30: ffffffff81a6a1b3
ffffc90000013de0 saved_rip,...
+ ffffc90000013d40: ffff888100000004
99ccbf53ea493000
+ ffffc90000013d50: ffff888100216500
ffff888100216500
+
+ crash> dis __schedule
+ ...
+ 0xffffffff81a6a1ab <__schedule+507>: mov
%r13,%rsi
+ 0xffffffff81a6a1ae <__schedule+510>: call
0xffffffff81003490 <__switch_to_asm
+
0xffffffff81a6a1b3 <__schedule+515>: mov
%rax,%rdi <<=== saved_rip
+ ...
+ crash> dis __switch_to_asm
+ 0xffffffff81003490 <__switch_to_asm>: push %rbp
+ 0xffffffff81003491 <__switch_to_asm+1>: push %rbx
+ 0xffffffff81003492 <__switch_to_asm+2>: push %r12
+ 0xffffffff81003494 <__switch_to_asm+4>: push %r13
+ 0xffffffff81003496 <__switch_to_asm+6>: push %r14
+ 0xffffffff81003498 <__switch_to_asm+8>: push %r15
+ 0xffffffff8100349a <__switch_to_asm+10>:
mov %rsp,0x14d8(%rdi)
+ ...
+ Now saved_rip = ffffffff81a6a1b3, and we are
starting
+ the stack unwind at saved_rip, which is function
__schedule()
+ instead of function __switch_to_asm(), so the
stack pointer should
+ be rewind from ffffc90000013d00 back to
ffffc90000013d38,
+ aka *spp += 7 * reg_len. Otherwise we are
unwinding function
+ __schedule() but with __switch_to_asm()'s stack
frame, which
+ will fail.
+ */
+ sp += 7 * sizeof(unsigned long);
It seems this may still rely on the compiler, have you tried another
compiler? I'm not sure if there are the same results, just my concerns.
Anyway, If there is no better way or solution, at least we should comment
on it and point out the current limitations. What do you think?
+ ur_bitmap->bitmap += 0x3f;
Also why is it 0x3f? Can you explain the details?
> + } else {
> + /*
> + * For active tasks in dumpfile, we get regs
> through the
> + * original way. For active tasks in live, we only
> get
> + * ip and sp in the end of the function.
> + */
> + if (bt->flags & BT_DUMPFILE_SEARCH) {
> + FREEBUF(ur_bitmap);
> + bt->need_free = FALSE;
See the comments in the previous thread.
> + *pcp = pcp_save;
> + *spp = spp_save;
> + return x86_64_get_dumpfile_stack_frame(bt,
> pcp, spp);
> + }
> + }
> + } else {
> + if (!is_task_active(bt->task)) {
> + readmem(*spp, KVADDR, &(ur_bitmap->bp),
> + sizeof(ulong), "ur_bitmap->bp",
> FAULT_ON_ERROR);
> + ur_bitmap->bitmap += 0x10;
> + } else {
> + if (bt->flags & BT_DUMPFILE_SEARCH) {
> + FREEBUF(ur_bitmap);
> + bt->need_free = FALSE;
> + *pcp = pcp_save;
> + *spp = spp_save;
> + return x86_64_get_dumpfile_stack_frame(bt,
> pcp, spp);
> + }
> + }
> + }
> - if (pcp)
> - *pcp = x86_64_get_pc(bt);
> - if (spp)
> - *spp = x86_64_get_sp(bt);
> + ur_bitmap->ip = *pcp;
> + ur_bitmap->sp = sp;
> + ur_bitmap->bitmap += 0x50000;
> +
> + bt->machdep = ur_bitmap;
> + bt->need_free = TRUE;
> }
> /*
> @@ -6458,6 +6578,14 @@ x86_64_ORC_init(void)
> MEMBER_OFFSET_INIT(inactive_task_frame_bp,
"inactive_task_frame",
> "bp");
> MEMBER_OFFSET_INIT(inactive_task_frame_ret_addr,
> "inactive_task_frame", "ret_addr");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r15, "inactive_task_frame",
> "r15");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r14, "inactive_task_frame",
> "r14");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r13, "inactive_task_frame",
> "r13");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r12, "inactive_task_frame",
> "r12");
> + MEMBER_OFFSET_INIT(inactive_task_frame_flags,
> "inactive_task_frame", "flags");
> + MEMBER_OFFSET_INIT(inactive_task_frame_si, "inactive_task_frame",
> "si");
> + MEMBER_OFFSET_INIT(inactive_task_frame_di, "inactive_task_frame",
> "di");
> + MEMBER_OFFSET_INIT(inactive_task_frame_bx, "inactive_task_frame",
> "bx");
> orc->has_signal =
MEMBER_EXISTS("orc_entry", "signal"); /* added
> at 6.3 */
> orc->has_end = MEMBER_EXISTS("orc_entry", "end");
/* removed
> at 6.4 */
> @@ -9071,17 +9199,128 @@ x86_64_get_kvaddr_ranges(struct vaddr_range *vrp)
> return cnt;
> }
> +#define REG_CASE(R, r) \
> + case R##_REGNUM: \
> + if (size != sizeof(ur_bitmap->r)) { \
> + ret = FALSE; break; \
> + } else { \
> + memcpy(value, &ur_bitmap->r, size); \
> + ret = TRUE; break; \
> + }
> +
> static int
> -x86_64_get_cpu_reg(int cpu, int regno, const char *name,
> +x86_64_get_current_task_reg(int regno, const char *name,
> int size, void *value)
> {
> - if (regno >= LAST_REGNUM)
> - return FALSE;
> + struct bt_info bt_info, bt_setup;
> + struct task_context *tc;
> + struct user_regs_bitmap_struct *ur_bitmap;
> + ulong ip, sp;
> + bool ret = FALSE;
> - if (VMSS_DUMPFILE())
> - return vmware_vmss_get_cpu_reg(cpu, regno, name, size,
> value);
> + switch (regno) {
> + case RAX_REGNUM ... GS_REGNUM:
> + case FS_BASE_REGNUM ... ORIG_RAX_REGNUM:
> + break;
> + default:
> + return FALSE;
> + }
Ditto.
> - return FALSE;
> + tc = CURRENT_CONTEXT();
> + if (!tc)
> + return FALSE;
> +
> + if (VMSS_DUMPFILE())
> + return vmware_vmss_get_cpu_reg(tc->processor, regno, name,
> size, value);
> +
> + BZERO(&bt_setup, sizeof(struct bt_info));
> + clone_bt_info(&bt_setup, &bt_info, tc);
> + fill_stackbuf(&bt_info);
> +
> + // reusing the get_dumpfile_regs function to get pt regs structure
> + get_dumpfile_regs(&bt_info, &sp, &ip);
> + if (bt_info.stackbuf)
> + FREEBUF(bt_info.stackbuf);
> + ur_bitmap = (struct user_regs_bitmap_struct *)bt_info.machdep;
> + if (!ur_bitmap)
> + return FALSE;
> +
> + /* Get all registers from elf notes*/
> + if (!bt_info.need_free) {
> + goto get_all;
> + }
> +
Ditto.
> + /* Get subset registers from stack frame*/
> + switch (ur_bitmap->bitmap) {
> + case 0x5003f:
> + switch (regno) {
> + case R12_REGNUM ... R15_REGNUM:
> + case RBP_REGNUM:
> + case RBX_REGNUM:
> + case RIP_REGNUM:
> + case RSP_REGNUM:
> + break;
> + default:
> + return FALSE;
> + }
> + break;
> + case 0x50010:
> + switch (regno) {
> + case RBP_REGNUM:
> + case RIP_REGNUM:
> + case RSP_REGNUM:
> + break;
> + default:
> + return FALSE;
> + }
> + break;
> + case 0x50000:
> + switch (regno) {
> + case RIP_REGNUM:
> + case RSP_REGNUM:
> + break;
> + default:
> + return FALSE;
> + }
> + break;
> + }
Can you help to explain more details about several magic number values?
Such as 0x5003f, 0x50000, 0x50010. Or you may point out where they come
from, it may be helpful to add them here as code comment.
Thanks
Lianbo
+
> +get_all:
> + switch (regno) {
> + REG_CASE(RAX, ax);
> + REG_CASE(RBX, bx);
> + REG_CASE(RCX, cx);
> + REG_CASE(RDX, dx);
> + REG_CASE(RSI, si);
> + REG_CASE(RDI, di);
> + REG_CASE(RBP, bp);
> + REG_CASE(RSP, sp);
> + REG_CASE(R8, r8);
> + REG_CASE(R9, r9);
> + REG_CASE(R10, r10);
> + REG_CASE(R11, r11);
> + REG_CASE(R12, r12);
> + REG_CASE(R13, r13);
> + REG_CASE(R14, r14);
> + REG_CASE(R15, r15);
> + REG_CASE(RIP, ip);
> + REG_CASE(EFLAGS, flags);
> + REG_CASE(CS, cs);
> + REG_CASE(SS, ss);
> + REG_CASE(DS, ds);
> + REG_CASE(ES, es);
> + REG_CASE(FS, fs);
> + REG_CASE(GS, gs);
> + REG_CASE(FS_BASE, fs_base);
> + REG_CASE(GS_BASE, gs_base);
> + REG_CASE(ORIG_RAX, orig_ax);
> + }
> +
> + if (bt_info.need_free) {
> + FREEBUF(ur_bitmap);
> + bt_info.need_free = FALSE;
> + }
> + return ret;
> }
> /*
> --
> 2.40.1