Hi Tao,
On 24/09/22 10:25PM, Aditya Gupta wrote:
Hi Tao,
On 18/09/24 05:12, Tao Liu wrote:
> A error stack trace of bt cmd observed:
>
> crash> bt 1
> PID: 1 TASK: c000000003714b80 CPU: 2 COMMAND: "systemd"
> #0 [c0000000037735c0] _end at c0000000037154b0 (unreliable)
> #1 [c000000003773770] __switch_to at c00000000001fa9c
> #2 [c0000000037737d0] __schedule at c00000000112e4ec
> #3 [c0000000037738b0] schedule at c00000000112ea80
> ...
>
> The #0 stack trace is incorrect, the function address shouldn't exceed _end.
> The reason is for kernel>=v6.2, the offset of pt_regs to sp changed from
> STACK_FRAME_OVERHEAD, i.e 112, to STACK_SWITCH_FRAME_REGS. For
> CONFIG_PPC64_ELF_ABI_V1, it's 112, for ABI_V2, it's 48. So the nip will read
a
> wrong value from stack when ABI_V2 enabled.
>
> To determine if ABI_V2 enabled is tricky. This patch do it by check the
> following:
>
> In arch/powerpc/include/asm/ppc_asm.h:
> #ifdef CONFIG_PPC64_ELF_ABI_V2
> #define STK_GOT 24
> #else
> #define STK_GOT 40
>
> In arch/powerpc/kernel/tm.S:
> _GLOBAL(tm_reclaim)
> mfcr r5
> mflr r0
> stw r5, 8(r1)
> std r0, 16(r1)
> std r2, STK_GOT(r1)
> ...
>
> So a disassemble on tm_reclaim, and extract the STK_GOT value from std
> instruction is used as the approach.
>
> After the patch:
> crash> bt 1
> PID: 1 TASK: c000000003714b80 CPU: 2 COMMAND: "systemd"
> #0 [c0000000037737d0] __schedule at c00000000112e4ec
> #1 [c0000000037738b0] schedule at c00000000112ea80
> ...
>
> Signed-off-by: Tao Liu <ltao(a)redhat.com>
> ---
> defs.h | 1 +
> ppc64.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
> 2 files changed, 58 insertions(+), 3 deletions(-)
>
> diff --git a/defs.h b/defs.h
> index 2231cb6..d5cb8cc 100644
> --- a/defs.h
> +++ b/defs.h
> @@ -4643,6 +4643,7 @@ struct efi_memory_desc_t {
> #define MSR_PR_LG 14 /* Problem State / Privilege Level */
> /* Used to find the user or kernel-mode frame*/
> +#define STACK_SWITCH_FRAME_REGS 48
> #define STACK_FRAME_OVERHEAD 112
> #define EXCP_FRAME_MARKER 0x7265677368657265
> diff --git a/ppc64.c b/ppc64.c
> index e8930a1..6e5f155 100644
> --- a/ppc64.c
> +++ b/ppc64.c
> @@ -72,6 +72,7 @@ static ulong pud_page_vaddr_l4(ulong pud);
> static ulong pmd_page_vaddr_l4(ulong pmd);
> static int is_opal_context(ulong sp, ulong nip);
> void opalmsg(void);
> +static bool is_ppc64_elf_abi_v2(void);
> static int is_opal_context(ulong sp, ulong nip)
> {
> @@ -2813,6 +2814,51 @@ ppc64_get_sp(ulong task)
> return sp;
> }
> +static bool
> +is_ppc64_elf_abi_v2(void)
> +{
Thanks for fixing these issues on ppc64 !
It is good. I am also checking if there is a way to implement
'is_ppc64_elf_abi_v2' by getting ABI version from the ELF notes, as
'file'
command somehow does know:
$ file vmlinux-little-endian vmlinux-little-endian: ELF 64-bit LSB
executable, 64-bit PowerPC or cisco 7500, OpenPOWER ELF V2 ABI, version 1
(SYSV), statically linked,
BuildID[sha1]=f6449970e6af6cddeed1e260393e2377ccc0f369, not stripped
That might be more dependable (if we can do that).
How about this approach ?
I feel this might be more dependable, as the file command also uses
this, as well as fadump (PowerPC firmware-assisted dump) in the kernel.
As a sidenote, there is still some issue either in patch #1 or #3
causing all secondary CPUs (where swapper task is running) to show all
threads as '_end'
What do you think ?
Either way what you chose, I do realise the idea to utilise the
'std r2, STK_GOT(r1)' in such a way, was interesting and smart.
static bool
is_ppc64_elf_abi_v2(void)
{
const char *kernel_file = pc->namelist;
int fd, swap;
char eheader[BUFSIZE];
Elf64_Ehdr *elf64;
if ((fd = open(kernel_file, O_RDONLY)) < 0) {
error(INFO, "%s: %s\n", kernel_file, strerror(errno));
return FALSE;
}
if (read(fd, eheader, BUFSIZE) != BUFSIZE) {
error(INFO, "%s: %s\n", kernel_file, strerror(errno));
close(fd);
return FALSE;
}
close(fd);
if (!STRNEQ(eheader, ELFMAG) || eheader[EI_VERSION] != EV_CURRENT)
return FALSE;
elf64 = (Elf64_Ehdr *)&eheader[0];
swap = (((eheader[EI_DATA] == ELFDATA2LSB) &&
(__BYTE_ORDER == __BIG_ENDIAN)) ||
((eheader[EI_DATA] == ELFDATA2MSB) &&
(__BYTE_ORDER == __LITTLE_ENDIAN)));
/*
* The ABI version is stored in E_FLAGS in the ELF header, atleast for
* ppc64 binaries.
*
* These logic is also used by the 'file' command, which states this in
* its magic file:
*
* >18 leshort 20 PowerPC or cisco 4500,
* >18 leshort 21 64-bit PowerPC or cisco 7500,
* >>48 lelong 0 Unspecified or Power ELF V1 ABI,
* >>48 lelong 1 Power ELF V1 ABI,
* >>48 lelong 2 OpenPOWER ELF V2 ABI,
*
* The '>18' above means, data at 18 (0x12), which is basically
* 'elf64->e_machine', which will be 20 for PPC, and 21 for PPC64
*
* Similarly, '>>48' is offset 48 (0x30) in the file, which is
* basically 'elf64->e_flags', which has the ELF ABI version
*/
if (elf64->e_flags == 2) {
/* ABI v2 */
return TRUE;
} else if (elf64->e_flags == 1) {
/* ABI v1 */
return FALSE;
}
error(WARNING, "Unstable elf_abi v1/v2 detection.\n");
return FALSE;
}
Also, we can those values in the ELF header:
$ hexdump vmlinux-little-endian
0000000 457f 464c 0102 0001 0000 0000 0000 0000
0000010 0002 0015 (= 21 = E_MACHINE) 0001 0000 0000 0000 0000 c000
0000020 0040 0000 0000 0000 8bf8 0040 0000 0000
0000030 0002 (= E_FLAGS = ABI) 0000 0040 0038 0002 0040 0027 0026
0000040 0001 0000 0007 0000 0000 0001 0000 0000
Also, I see 'pc->namelist' is basically the kernel filename, is that
correct ? Can that be renamed to kernel_filename or something ?
Thanks,
Aditya Gupta
>
> Thanks,
>
> Aditya Gupta
>
> > + char buf1[BUFSIZE];
> > + char *pos1, *pos2;
> > + int errflag = 0;
> > + ulong stk_got = 0;
> > + static bool ret = false;
> > + static bool checked = false;
> > +
> > + if (checked == true || !symbol_exists("tm_reclaim"))
> > + return ret;
> > +
> > + sprintf(buf1, "x/16i tm_reclaim");
> > + open_tmpfile();
> > + if (!gdb_pass_through(buf1, pc->tmpfile, GNU_RETURN_ON_ERROR))
> > + goto out;
> > + checked = true;
> > + rewind(pc->tmpfile);
> > + while (fgets(buf1, BUFSIZE, pc->tmpfile)) {
> > + // "std r2, STK_GOT(r1)" is expected
> > + if (strstr(buf1, "std") &&
> > + strstr(buf1, "(r1)") &&
> > + (pos1 = strstr(buf1, "r2,"))) {
> > + pos1 += strlen("r2,");
> > + for (pos2 = pos1; *pos2 != '\0' && *pos2 != '(';
pos2++);
> > + *pos2 = '\0';
> > + stk_got = stol(pos1, RETURN_ON_ERROR|QUIET, &errflag);
> > + break;
> > + }
> > + }
> > +
> > + if (!errflag) {
> > + switch (stk_got) {
> > + case 24:
> > + ret = true;
> > + case 40:
> > + goto out;
> > + }
> > + }
> > + error(WARNING, "Unstable elf_abi v1/v2 detection.\n");
> > +out:
> > + close_tmpfile();
> > + return ret;
> > +}
> > /*
> > * get the SP and PC values for idle tasks.
> > @@ -2834,9 +2880,17 @@ get_ppc64_frame(struct bt_info *bt, ulong *getpc, ulong
*getsp)
> > sp = ppc64_get_sp(task);
> > if (!INSTACK(sp, bt))
> > goto out;
> > - readmem(sp+STACK_FRAME_OVERHEAD, KVADDR, ®s,
> > - sizeof(struct ppc64_pt_regs),
> > - "PPC64 pt_regs", FAULT_ON_ERROR);
> > +
> > + if (THIS_KERNEL_VERSION >= LINUX(6,2,0) && is_ppc64_elf_abi_v2())
{
> > + readmem(sp+STACK_SWITCH_FRAME_REGS, KVADDR, ®s,
> > + sizeof(struct ppc64_pt_regs),
> > + "PPC64 pt_regs", FAULT_ON_ERROR);
> > + } else {
> > + readmem(sp+STACK_FRAME_OVERHEAD, KVADDR, ®s,
> > + sizeof(struct ppc64_pt_regs),
> > + "PPC64 pt_regs", FAULT_ON_ERROR);
> > + }
> > +
> > ip = regs.nip;
> > closest = closest_symbol(ip);
> > if (STREQ(closest, ".__switch_to") || STREQ(closest,
"__switch_to")) {