On kernels configured with CONFIG_RANDOMIZE_KSTACK_OFFSET=y and
random_kstack_offset=on, a random offset is added to the stack with
__kstack_alloca() at the beginning of do_syscall_64() and other syscall
entry functions. This function has the following instruction.
<do_syscall_64+32>: sub %rax,%rsp
On the other hand, crash uses only part of the ORC unwinder data to
unwind stacks, and if an ip value doesn't have usable ORC data, it
calculates the frame size by parsing the assembly of the function.
However, crash cannot calculate the frame size correctly with the
instruction above, and prints stale return addresses like this:
crash> bt 1
PID: 1 TASK: ffff9c250023b880 CPU: 0 COMMAND: "systemd"
#0 [ffffb7e5c001fc80] __schedule at ffffffff91ae2b16
#1 [ffffb7e5c001fd00] schedule at ffffffff91ae2ed3
#2 [ffffb7e5c001fd18] schedule_hrtimeout_range_clock at ffffffff91ae7ed8
#3 [ffffb7e5c001fda8] ep_poll at ffffffff913ef828
#4 [ffffb7e5c001fe48] do_epoll_wait at ffffffff913ef943
#5 [ffffb7e5c001fe80] __x64_sys_epoll_wait at ffffffff913f0130
#6 [ffffb7e5c001fed0] do_syscall_64 at ffffffff91ad7169
#7 [ffffb7e5c001fef0] do_syscall_64 at ffffffff91ad7179 <<
#8 [ffffb7e5c001ff10] syscall_exit_to_user_mode at ffffffff91adaab2 << stale entries
#9 [ffffb7e5c001ff20] do_syscall_64 at ffffffff91ad7179 <<
#10 [ffffb7e5c001ff50] entry_SYSCALL_64_after_hwframe at ffffffff91c0009b
RIP: 00007f258d9427ae RSP: 00007fffda631d60 RFLAGS: 00000293
...
To fix this, enhance the usage of ORC data. The ORC unwinder often uses
the %rbp value, so keep it from exception frames and inactive task stacks.
Good understanding, Kazu.
The patch looks good to me. So: Ack.
Thanks.
Lianbo
Signed-off-by: Kazuhito Hagio <k-hagio-ab@nec.com>
---
defs.h | 1 +
symbols.c | 1 +
x86_64.c | 115 ++++++++++++++++++++++++++++++++++++++----------------
3 files changed, 83 insertions(+), 34 deletions(-)
diff --git a/defs.h b/defs.h
index ab4f02cc65cf..e76af3c78b69 100644
--- a/defs.h
+++ b/defs.h
@@ -2207,6 +2207,7 @@ struct offset_table { /* stash of commonly-used offsets */
long sock_sk_common;
long sock_common_skc_v6_daddr;
long sock_common_skc_v6_rcv_saddr;
+ long inactive_task_frame_bp;
};
struct size_table { /* stash of commonly-used sizes */
diff --git a/symbols.c b/symbols.c
index b702b9665ec1..a974fc9141a0 100644
--- a/symbols.c
+++ b/symbols.c
@@ -8822,6 +8822,7 @@ dump_offset_table(char *spec, ulong makestruct)
OFFSET(task_struct_tss_ksp));
fprintf(fp, " task_struct_thread_eip: %ld\n",
OFFSET(task_struct_thread_eip));
+ fprintf(fp, " inactive_task_frame_bp: %ld\n", OFFSET(inactive_task_frame_bp));
fprintf(fp, " inactive_task_frame_ret_addr: %ld\n",
OFFSET(inactive_task_frame_ret_addr));
fprintf(fp, " task_struct_thread_esp: %ld\n",
diff --git a/x86_64.c b/x86_64.c
index 6cac3936b33d..ca14ede52884 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -122,7 +122,7 @@ static int x86_64_do_not_cache_framesize(struct syment *, ulong);
static int x86_64_framesize_cache_func(int, ulong, int *, int, struct syment *);
static ulong x86_64_get_framepointer(struct bt_info *, ulong);
int search_for_eframe_target_caller(struct bt_info *, ulong, int *);
-static int x86_64_get_framesize(struct bt_info *, ulong, ulong);
+static int x86_64_get_framesize(struct bt_info *, ulong, ulong, char *);
static void x86_64_framesize_debug(struct bt_info *);
static void x86_64_get_active_set(void);
static int x86_64_get_kvaddr_ranges(struct vaddr_range *);
@@ -3642,7 +3642,7 @@ in_exception_stack:
bt, ofp);
rsp += SIZE(pt_regs); /* guaranteed kernel mode */
if (bt->eframe_ip && ((framesize = x86_64_get_framesize(bt,
- bt->eframe_ip, rsp)) >= 0))
+ bt->eframe_ip, rsp, NULL)) >= 0))
rsp += framesize;
level++;
irq_eframe = 0;
@@ -3674,7 +3674,7 @@ in_exception_stack:
case BACKTRACE_ENTRY_DISPLAYED:
level++;
if ((framesize = x86_64_get_framesize(bt,
- bt->eframe_ip ? bt->eframe_ip : *up, rsp)) >= 0) {
+ bt->eframe_ip ? bt->eframe_ip : *up, rsp, NULL)) >= 0) {
rsp += framesize;
i += framesize/sizeof(ulong);
}
@@ -3747,7 +3747,7 @@ in_exception_stack:
}
level++;
- if ((framesize = x86_64_get_framesize(bt, bt->instptr, rsp)) >= 0)
+ if ((framesize = x86_64_get_framesize(bt, bt->instptr, rsp, NULL)) >= 0)
rsp += framesize;
}
}
@@ -3799,7 +3799,7 @@ in_exception_stack:
case BACKTRACE_ENTRY_DISPLAYED:
level++;
if ((framesize = x86_64_get_framesize(bt,
- bt->eframe_ip ? bt->eframe_ip : *up, rsp)) >= 0) {
+ bt->eframe_ip ? bt->eframe_ip : *up, rsp, NULL)) >= 0) {
rsp += framesize;
i += framesize/sizeof(ulong);
}
@@ -3909,24 +3909,34 @@ in_exception_stack:
(STREQ(rip_symbol, "thread_return") ||
STREQ(rip_symbol, "schedule") ||
STREQ(rip_symbol, "__schedule"))) {
- if (STREQ(rip_symbol, "__schedule")) {
- i = (rsp - bt->stackbase)/sizeof(ulong);
- x86_64_print_stack_entry(bt, ofp, level,
- i, bt->instptr);
- level++;
- rsp = __schedule_frame_adjust(rsp, bt);
- if (STREQ(closest_symbol(bt->instptr), "schedule"))
+ if ((machdep->flags & ORC) && VALID_MEMBER(inactive_task_frame_ret_addr)) {
+ /*
+ * %rsp should have the address of inactive_task_frame, so
+ * skip the registers before ret_addr to adjust rsp.
+ */
+ if (CRASHDEBUG(1))
+ fprintf(fp, "rsp: %lx rbp: %lx\n", rsp, bt->bptr);
+ rsp += OFFSET(inactive_task_frame_ret_addr);
+ } else {
+ if (STREQ(rip_symbol, "__schedule")) {
+ i = (rsp - bt->stackbase)/sizeof(ulong);
+ x86_64_print_stack_entry(bt, ofp, level,
+ i, bt->instptr);
+ level++;
+ rsp = __schedule_frame_adjust(rsp, bt);
+ if (STREQ(closest_symbol(bt->instptr), "schedule"))
+ bt->flags |= BT_SCHEDULE;
+ } else
bt->flags |= BT_SCHEDULE;
- } else
- bt->flags |= BT_SCHEDULE;
-
- if (bt->flags & BT_SCHEDULE) {
- i = (rsp - bt->stackbase)/sizeof(ulong);
- x86_64_print_stack_entry(bt, ofp, level,
- i, bt->instptr);
- bt->flags &= ~(ulonglong)BT_SCHEDULE;
- rsp += sizeof(ulong);
- level++;
+
+ if (bt->flags & BT_SCHEDULE) {
+ i = (rsp - bt->stackbase)/sizeof(ulong);
+ x86_64_print_stack_entry(bt, ofp, level,
+ i, bt->instptr);
+ bt->flags &= ~(ulonglong)BT_SCHEDULE;
+ rsp += sizeof(ulong);
+ level++;
+ }
}
}
@@ -3957,7 +3967,7 @@ in_exception_stack:
irq_eframe = 0;
bt->flags |= BT_EFRAME_TARGET;
if (bt->eframe_ip && ((framesize = x86_64_get_framesize(bt,
- bt->eframe_ip, rsp)) >= 0))
+ bt->eframe_ip, rsp, NULL)) >= 0))
rsp += framesize;
bt->flags &= ~BT_EFRAME_TARGET;
}
@@ -4044,7 +4054,7 @@ in_exception_stack:
case BACKTRACE_ENTRY_DISPLAYED:
level++;
if ((framesize = x86_64_get_framesize(bt,
- bt->eframe_ip ? bt->eframe_ip : *up, rsp)) >= 0) {
+ bt->eframe_ip ? bt->eframe_ip : *up, rsp, (char *)up)) >= 0) {
rsp += framesize;
i += framesize/sizeof(ulong);
}
@@ -4755,7 +4765,8 @@ x86_64_exception_frame(ulong flags, ulong kvaddr, char *local,
bt->instptr = rip;
bt->stkptr = rsp;
bt->bptr = rbp;
- }
+ } else if (machdep->flags & ORC)
+ bt->bptr = rbp;
if (kvaddr)
FREEBUF(pt_regs_buf);
@@ -5315,6 +5326,10 @@ x86_64_get_sp(struct bt_info *bt)
OFFSET(thread_struct_rsp), KVADDR,
&rsp, sizeof(void *),
"thread_struct rsp", FAULT_ON_ERROR);
+ if ((machdep->flags & ORC) && VALID_MEMBER(inactive_task_frame_bp)) {
+ readmem(rsp + OFFSET(inactive_task_frame_bp), KVADDR, &bt->bptr,
+ sizeof(void *), "inactive_task_frame.bp", FAULT_ON_ERROR);
+ }
return rsp;
}
@@ -6421,6 +6436,9 @@ x86_64_ORC_init(void)
orc->__stop_orc_unwind = symbol_value("__stop_orc_unwind");
orc->orc_lookup = symbol_value("orc_lookup");
+ MEMBER_OFFSET_INIT(inactive_task_frame_bp, "inactive_task_frame", "bp");
+ MEMBER_OFFSET_INIT(inactive_task_frame_ret_addr, "inactive_task_frame", "ret_addr");
+
machdep->flags |= ORC;
}
@@ -8489,7 +8507,7 @@ search_for_eframe_target_caller(struct bt_info *bt, ulong stkptr, int *framesize
(BT_OLD_BACK_TRACE|BT_TEXT_SYMBOLS|BT_TEXT_SYMBOLS_ALL|BT_FRAMESIZE_DISABLE)
static int
-x86_64_get_framesize(struct bt_info *bt, ulong textaddr, ulong rsp)
+x86_64_get_framesize(struct bt_info *bt, ulong textaddr, ulong rsp, char *stack_ptr)
{
int c, framesize, instr, arg, max;
struct syment *sp;
@@ -8590,19 +8608,48 @@ x86_64_get_framesize(struct bt_info *bt, ulong textaddr, ulong rsp)
if ((machdep->flags & ORC) && (korc = orc_find(textaddr))) {
if (CRASHDEBUG(1)) {
fprintf(fp,
- "rsp: %lx textaddr: %lx framesize: %d -> spo: %d bpo: %d spr: %d bpr: %d type: %d %s",
+ "rsp: %lx textaddr: %lx framesize: %d -> spo: %d bpo: %d spr: %d bpr: %d type: %d",
rsp, textaddr, framesize, korc->sp_offset, korc->bp_offset,
- korc->sp_reg, korc->bp_reg, korc->type,
- (korc->type == ORC_TYPE_CALL) && (korc->sp_reg == ORC_REG_SP) ? "" : "(UNUSED)");
+ korc->sp_reg, korc->bp_reg, korc->type);
if (MEMBER_EXISTS("orc_entry", "end"))
fprintf(fp, " end: %d", korc->end);
fprintf(fp, "\n");
}
- if ((korc->type == ORC_TYPE_CALL) && (korc->sp_reg == ORC_REG_SP)) {
- framesize = (korc->sp_offset - 8);
- return (x86_64_framesize_cache_func(FRAMESIZE_ENTER, textaddr,
- &framesize, exception, NULL));
+ if (korc->type == ORC_TYPE_CALL) {
+ ulong prev_sp = 0, prev_bp = 0;
+ framesize = -1;
+
+ if (korc->sp_reg == ORC_REG_SP) {
+ framesize = (korc->sp_offset - 8);
+
+ /* rsp points to a return address, so +8 to use sp_offset */
+ prev_sp = (rsp + 8) + korc->sp_offset;
+ if (CRASHDEBUG(1))
+ fprintf(fp, "rsp: %lx prev_sp: %lx\n", rsp, prev_sp);
+ } else if ((korc->sp_reg == ORC_REG_BP) && bt->bptr) {
+ prev_sp = bt->bptr + korc->sp_offset;
+ framesize = (prev_sp - (rsp + 8) - 8);
+ if (CRASHDEBUG(1))
+ fprintf(fp, "rsp: %lx rbp: %lx prev_sp: %lx framesize: %d\n",
+ rsp, bt->bptr, prev_sp, framesize);
+ }
+
+ if ((korc->bp_reg == ORC_REG_PREV_SP) && prev_sp) {
+ prev_bp = prev_sp + korc->bp_offset;
+ if (stack_ptr && INSTACK(prev_bp, bt)) {
+ bt->bptr = ULONG(stack_ptr + (prev_bp - rsp));
+ if (CRASHDEBUG(1))
+ fprintf(fp, "rsp: %lx prev_sp: %lx prev_bp: %lx -> %lx\n",
+ rsp, prev_sp, prev_bp, bt->bptr);
+ } else
+ bt->bptr = 0;
+ } else if ((korc->bp_reg != ORC_REG_UNDEFINED))
+ bt->bptr = 0;
+
+ if (framesize >= 0)
+ /* Do not cache this, possibly it may be variable. */
+ return framesize;
}
}
@@ -8758,7 +8805,7 @@ x86_64_framesize_debug(struct bt_info *bt)
if (!bt->hp->eip)
error(INFO, "x86_64_framesize_debug: ignoring command\n");
else
- x86_64_get_framesize(bt, bt->hp->eip, 0);
+ x86_64_get_framesize(bt, bt->hp->eip, 0, NULL);
break;
case -3:
--
2.31.1