On Fri, Oct 11, 2024 at 1:19 PM Tao Liu <ltao(a)redhat.com> wrote:
On Tue, Oct 8, 2024 at 10:46 PM Tao Liu <ltao(a)redhat.com>
wrote:
>
> Hi Aditya,
>
> On Tue, Oct 8, 2024 at 10:18 PM Aditya Gupta <adityag(a)linux.ibm.com>
wrote:
> >
> > Hi Tao,
> >
> >
> > On 04/10/24 10:40, Tao Lou wrote:
> > > Hi Aditya & lianbo,
> > >
> > > On Tue, Sep 24, 2024 at 2:28 AM Aditya Gupta
<adityag(a)linux.ibm.com>
wrote:
> > >> Hi Tao,
> > >>
> > >> On 24/09/22 10:25PM, Aditya Gupta wrote:
> > >>> Hi Tao,
> > >>>
> > >>>
> > >>> On 18/09/24 05:12, Tao Liu wrote:
> > >>>> A error stack trace of bt cmd observed:
> > >>>>
> > >>>> crash> bt 1
> > >>>> PID: 1 TASK: c000000003714b80 CPU: 2 COMMAND:
"systemd"
> > >>>> #0 [c0000000037735c0] _end at c0000000037154b0
(unreliable)
> > >>>> #1 [c000000003773770] __switch_to at c00000000001fa9c
> > >>>> #2 [c0000000037737d0] __schedule at c00000000112e4ec
> > >>>> #3 [c0000000037738b0] schedule at c00000000112ea80
> > >>>> ...
> > >>>>
> > >>>> The #0 stack trace is incorrect, the function address
shouldn't
exceed _end.
> > >>>> The reason is for kernel>=v6.2, the offset of pt_regs to
sp
changed from
> > >>>> STACK_FRAME_OVERHEAD, i.e 112, to STACK_SWITCH_FRAME_REGS.
For
> > >>>> CONFIG_PPC64_ELF_ABI_V1, it's 112, for ABI_V2, it's
48. So the
nip will read a
> > >>>> wrong value from stack when ABI_V2 enabled.
> > >>>>
> > >>>> To determine if ABI_V2 enabled is tricky. This patch do it by
check the
> > >>>> following:
> > >>>>
> > >>>> In arch/powerpc/include/asm/ppc_asm.h:
> > >>>> #ifdef CONFIG_PPC64_ELF_ABI_V2
> > >>>> #define STK_GOT 24
> > >>>> #else
> > >>>> #define STK_GOT 40
> > >>>>
> > >>>> In arch/powerpc/kernel/tm.S:
> > >>>> _GLOBAL(tm_reclaim)
> > >>>> mfcr r5
> > >>>> mflr r0
> > >>>> stw r5, 8(r1)
> > >>>> std r0, 16(r1)
> > >>>> std r2, STK_GOT(r1)
> > >>>> ...
> > >>>>
> > >>>> So a disassemble on tm_reclaim, and extract the STK_GOT value
from std
> > >>>> instruction is used as the approach.
> > >>>>
> > >>>> After the patch:
> > >>>> crash> bt 1
> > >>>> PID: 1 TASK: c000000003714b80 CPU: 2 COMMAND:
"systemd"
> > >>>> #0 [c0000000037737d0] __schedule at c00000000112e4ec
> > >>>> #1 [c0000000037738b0] schedule at c00000000112ea80
> > >>>> ...
> > >>>>
> > >>>> Signed-off-by: Tao Liu <ltao(a)redhat.com>
> > >>>> ---
> > >>>> defs.h | 1 +
> > >>>> ppc64.c | 60
++++++++++++++++++++++++++++++++++++++++++++++++++++++---
> > >>>> 2 files changed, 58 insertions(+), 3 deletions(-)
> > >>>>
> > >>>> diff --git a/defs.h b/defs.h
> > >>>> index 2231cb6..d5cb8cc 100644
> > >>>> --- a/defs.h
> > >>>> +++ b/defs.h
> > >>>> @@ -4643,6 +4643,7 @@ struct efi_memory_desc_t {
> > >>>> #define MSR_PR_LG 14 /* Problem State / Privilege
Level */
> > >>>> /* Used to find the user or
kernel-mode frame*/
> > >>>> +#define STACK_SWITCH_FRAME_REGS 48
> > >>>> #define STACK_FRAME_OVERHEAD 112
> > >>>> #define EXCP_FRAME_MARKER 0x7265677368657265
> > >>>> diff --git a/ppc64.c b/ppc64.c
> > >>>> index e8930a1..6e5f155 100644
> > >>>> --- a/ppc64.c
> > >>>> +++ b/ppc64.c
> > >>>> @@ -72,6 +72,7 @@ static ulong pud_page_vaddr_l4(ulong pud);
> > >>>> static ulong pmd_page_vaddr_l4(ulong pmd);
> > >>>> static int is_opal_context(ulong sp, ulong nip);
> > >>>> void opalmsg(void);
> > >>>> +static bool is_ppc64_elf_abi_v2(void);
> > >>>> static int is_opal_context(ulong sp, ulong nip)
> > >>>> {
> > >>>> @@ -2813,6 +2814,51 @@ ppc64_get_sp(ulong task)
> > >>>> return sp;
> > >>>> }
> > >>>> +static bool
> > >>>> +is_ppc64_elf_abi_v2(void)
> > >>>> +{
> > >>>
> > >>> Thanks for fixing these issues on ppc64 !
> > >>>
> > >>>
> > >>> It is good. I am also checking if there is a way to implement
> > >>> 'is_ppc64_elf_abi_v2' by getting ABI version from the ELF
notes,
as 'file'
> > >>> command somehow does know:
> > >>>
> > >>> $ file vmlinux-little-endian vmlinux-little-endian: ELF 64-bit
LSB
> > >>> executable, 64-bit PowerPC or cisco 7500, OpenPOWER ELF V2 ABI,
version 1
> > >>> (SYSV), statically linked,
> > >>> BuildID[sha1]=f6449970e6af6cddeed1e260393e2377ccc0f369, not
stripped
> > >>>
> > >>> That might be more dependable (if we can do that).
> > >> How about this approach ?
> > >>
> > >> I feel this might be more dependable, as the file command also uses
> > >> this, as well as fadump (PowerPC firmware-assisted dump) in the
kernel.
> > >>
> > >>> As a sidenote, there is still some issue either in patch #1 or #3
> > >>> causing all secondary CPUs (where swapper task is running) to
show
all
> > >>> threads as '_end'
> > >> What do you think ?
> > > I think your approach will work, thanks for your code! Please see my
> > > comments inlined in the code:
> > >
> > >> Either way what you chose, I do realise the idea to utilise the
> > >> 'std r2, STK_GOT(r1)' in such a way, was interesting and
smart.
> > >>
> > >> static bool
> > >> is_ppc64_elf_abi_v2(void)
> > >> {
> > >> const char *kernel_file = pc->namelist;
> > >> int fd, swap;
> > >> char eheader[BUFSIZE];
> > >> Elf64_Ehdr *elf64;
> > >>
> > >> if ((fd = open(kernel_file, O_RDONLY)) < 0) {
> > >> error(INFO, "%s: %s\n", kernel_file,
strerror(errno));
> > >> return FALSE;
> > >> }
> > >> if (read(fd, eheader, BUFSIZE) != BUFSIZE) {
> > >> error(INFO, "%s: %s\n", kernel_file,
strerror(errno));
> > >> close(fd);
> > >> return FALSE;
> > >> }
> > >> close(fd);
> > >>
> > >> if (!STRNEQ(eheader, ELFMAG) || eheader[EI_VERSION] !=
EV_CURRENT)
> > >> return FALSE;
> > >>
> > >> elf64 = (Elf64_Ehdr *)&eheader[0];
> > >>
> > >> swap = (((eheader[EI_DATA] == ELFDATA2LSB) &&
> > >> (__BYTE_ORDER == __BIG_ENDIAN)) ||
> > >> ((eheader[EI_DATA] == ELFDATA2MSB) &&
> > >> (__BYTE_ORDER == __LITTLE_ENDIAN)));
> > > The above code seems duplicated with the one in is_kernel(), in fact,
> > > is_kernel() will be invoked at the very beginning to determine if one
> > > file is vmlinux, so I would suggest to add our abi_v2 check here,
> > > like:
> > >
> > > in ppc64.c:
> > > int ppc64_elf_abi = 0;
> > >
> > > static bool is_ppc64_elf_abi_v2(void)
> > > {
> > > if (ppc64_elf_abi == 1)
> > > ...
> > > else if (ppc64_elf_abi == 2)
> > > ...
> > > else
> > > ...
> > > }
> > >
> > > in symbols.c:is_kernel():
> > > ...
> > > } else if ((elf64->e_ident[EI_CLASS] == ELFCLASS64) &&
> > > ((swap16(elf64->e_type, swap) == ET_EXEC) ||
> > > ...
> > > case EM_PPC64:
> > > if (machine_type_mismatch(file, "PPC64", NULL, 0))
> > > goto bailout;
> > > + else
> > > + ppc64_elf_abi = swap32(elf64->e_flags, swap);
> > > break;
> > >
> > > So we can initial the abi version at crash start only once, and get
> > > rid of the code duplication.
> >
> > Yes, I actually copied pasted the initial part from is_kernel.
> >
> > The approach is good, just to clarify, we know that is_kernel will be
> > called everytime before this check right ?
>
> Actually I used another approach rather than is_kernel(), see [1] for
> v2 patchset of ppc64 fixing.
> The reason is keeping a ppc64 specific variable in the multi-cpu arch
> used function is_kernel(), it is hard to make the code as clean.
>
Hi Aditya,
There is one more question about the abi_v1/2. Say if I have a abi_v2
machine, but I want to run a abi_v1 elf executable file. I don't know
1) is it possible to run v1 elf on a v2 kernel machine; 2) if it can,
which STACK_FRAME_OVERHEAD will the kernel use, 48 or 112? If
STACK_FRAME_OVERHEAD will vary depending on abi_v1/2 of elf executable
file, then our approach needs to change.
That is my concern, as I discussed with Tao offline, is it possible to
allow different ABI versions in a ppc64 system? E.g: kernel tasks or kernel
modules.
Because Tao did the test, and found that there are different values of
e_flags among various tasks, see:
https://www.mail-archive.com/devel@lists.crash-utility.osci.io/msg01168.html
If the above is not true, could you please try to read the value of e_flags
from the kcore/vmcore elf header? (If possible)
Thanks
Lianbo
Currently I have failed to build a abi_v1 elf executable file on the
abi_v2 machine, which reports:
$ gcc test.c -mabi=elfv1
fatal error: gnu/stubs-64-v1.h: No such file or directory
But I don't know where to install the missing header or rpms, or if it
is doable.
Any ideas?
Thanks,
Tao Liu
> [1]:
https://www.mail-archive.com/devel@lists.crash-utility.osci.io/msg01170.html
>
> Thanks,
> Tao Liu
>
> >
> >
> > Thanks,
> > Aditya Gupta
> >
> > >
> > >> /*
> > >> * The ABI version is stored in E_FLAGS in the ELF header,
atleast for
> > >> * ppc64 binaries.
> > >> *
> > >> * These logic is also used by the 'file' command,
which
states this in
> > >> * its magic file:
> > >> *
> > >> * >18 leshort 20 PowerPC
or
cisco 4500,
> > >> * >18 leshort 21 64-bit
PowerPC or cisco 7500,
> > >> * >>48 lelong 0
Unspecified or Power ELF V1 ABI,
> > >> * >>48 lelong 1 Power
ELF
V1 ABI,
> > >> * >>48 lelong 2
OpenPOWER
ELF V2 ABI,
> > >> *
> > >> * The '>18' above means, data at 18 (0x12), which
is
basically
> > >> * 'elf64->e_machine', which will be 20 for PPC,
and 21
for PPC64
> > >> *
> > >> * Similarly, '>>48' is offset 48 (0x30) in the
file,
which is
> > >> * basically 'elf64->e_flags', which has the ELF
ABI
version
> > >> */
> > >> if (elf64->e_flags == 2) {
> > >> /* ABI v2 */
> > >> return TRUE;
> > >> } else if (elf64->e_flags == 1) {
> > >> /* ABI v1 */
> > >> return FALSE;
> > >> }
> > >>
> > >> error(WARNING, "Unstable elf_abi v1/v2
detection.\n");
> > >> return FALSE;
> > >> }
> > >>
> > >> Also, we can those values in the ELF header:
> > >>
> > >> $ hexdump vmlinux-little-endian
> > >> 0000000 457f 464c 0102 0001 0000 0000 0000 0000
> > >> 0000010 0002 0015 (= 21 = E_MACHINE) 0001 0000 0000 0000 0000
c000
> > >> 0000020 0040 0000 0000 0000 8bf8 0040 0000 0000
> > >> 0000030 0002 (= E_FLAGS = ABI) 0000 0040 0038 0002 0040 0027
0026
> > >> 0000040 0001 0000 0007 0000 0000 0001 0000 0000
> > >>
> > > I agree with the elf header check on the vmlinux file, it is much
> > > simpler. Also I compiled the kernel with CONFIG_PPC64_ELF_ABI_V1 on,
> > > the file cmd will give: Power ELF V1 ABI string. So the approach can
> > > work.
> > >
> > >> Also, I see 'pc->namelist' is basically the kernel
filename, is that
> > >> correct ? Can that be renamed to kernel_filename or something ?
> > >>
> > > Yes, by reading through the code, pc->namelist is vmlinux, and
> > > pc->namelist_debug is the debug symbol if vmlinux doesn't have the
> > > symbol. But I didn't see the need for the rename.
> > >
> > > Thanks,
> > > Tao Liu
> > >
> > >> Thanks,
> > >> Aditya Gupta
> > >>
> > >>> Thanks,
> > >>>
> > >>> Aditya Gupta
> > >>>
> > >>>> + char buf1[BUFSIZE];
> > >>>> + char *pos1, *pos2;
> > >>>> + int errflag = 0;
> > >>>> + ulong stk_got = 0;
> > >>>> + static bool ret = false;
> > >>>> + static bool checked = false;
> > >>>> +
> > >>>> + if (checked == true ||
!symbol_exists("tm_reclaim"))
> > >>>> + return ret;
> > >>>> +
> > >>>> + sprintf(buf1, "x/16i tm_reclaim");
> > >>>> + open_tmpfile();
> > >>>> + if (!gdb_pass_through(buf1, pc->tmpfile,
GNU_RETURN_ON_ERROR))
> > >>>> + goto out;
> > >>>> + checked = true;
> > >>>> + rewind(pc->tmpfile);
> > >>>> + while (fgets(buf1, BUFSIZE, pc->tmpfile)) {
> > >>>> + // "std r2, STK_GOT(r1)" is expected
> > >>>> + if (strstr(buf1, "std") &&
> > >>>> + strstr(buf1, "(r1)") &&
> > >>>> + (pos1 = strstr(buf1, "r2,"))) {
> > >>>> + pos1 += strlen("r2,");
> > >>>> + for (pos2 = pos1; *pos2 != '\0'
&& *pos2 !=
'('; pos2++);
> > >>>> + *pos2 = '\0';
> > >>>> + stk_got = stol(pos1,
RETURN_ON_ERROR|QUIET,
&errflag);
> > >>>> + break;
> > >>>> + }
> > >>>> + }
> > >>>> +
> > >>>> + if (!errflag) {
> > >>>> + switch (stk_got) {
> > >>>> + case 24:
> > >>>> + ret = true;
> > >>>> + case 40:
> > >>>> + goto out;
> > >>>> + }
> > >>>> + }
> > >>>> + error(WARNING, "Unstable elf_abi v1/v2
detection.\n");
> > >>>> +out:
> > >>>> + close_tmpfile();
> > >>>> + return ret;
> > >>>> +}
> > >>>> /*
> > >>>> * get the SP and PC values for idle tasks.
> > >>>> @@ -2834,9 +2880,17 @@ get_ppc64_frame(struct bt_info *bt,
ulong
*getpc, ulong *getsp)
> > >>>> sp = ppc64_get_sp(task);
> > >>>> if (!INSTACK(sp, bt))
> > >>>> goto out;
> > >>>> - readmem(sp+STACK_FRAME_OVERHEAD, KVADDR, ®s,
> > >>>> - sizeof(struct ppc64_pt_regs),
> > >>>> - "PPC64 pt_regs", FAULT_ON_ERROR);
> > >>>> +
> > >>>> + if (THIS_KERNEL_VERSION >= LINUX(6,2,0) &&
is_ppc64_elf_abi_v2()) {
> > >>>> + readmem(sp+STACK_SWITCH_FRAME_REGS, KVADDR,
®s,
> > >>>> + sizeof(struct ppc64_pt_regs),
> > >>>> + "PPC64 pt_regs",
FAULT_ON_ERROR);
> > >>>> + } else {
> > >>>> + readmem(sp+STACK_FRAME_OVERHEAD, KVADDR,
®s,
> > >>>> + sizeof(struct ppc64_pt_regs),
> > >>>> + "PPC64 pt_regs",
FAULT_ON_ERROR);
> > >>>> + }
> > >>>> +
> > >>>> ip = regs.nip;
> > >>>> closest = closest_symbol(ip);
> > >>>> if (STREQ(closest, ".__switch_to") ||
STREQ(closest,
"__switch_to")) {
> >