Hi Tao,
On 04/10/24 10:40, Tao Lou wrote:
Hi Aditya & lianbo,
On Tue, Sep 24, 2024 at 2:28 AM Aditya Gupta <adityag(a)linux.ibm.com> wrote:
> Hi Tao,
>
> On 24/09/22 10:25PM, Aditya Gupta wrote:
>> Hi Tao,
>>
>>
>> On 18/09/24 05:12, Tao Liu wrote:
>>> A error stack trace of bt cmd observed:
>>>
>>> crash> bt 1
>>> PID: 1 TASK: c000000003714b80 CPU: 2 COMMAND: "systemd"
>>> #0 [c0000000037735c0] _end at c0000000037154b0 (unreliable)
>>> #1 [c000000003773770] __switch_to at c00000000001fa9c
>>> #2 [c0000000037737d0] __schedule at c00000000112e4ec
>>> #3 [c0000000037738b0] schedule at c00000000112ea80
>>> ...
>>>
>>> The #0 stack trace is incorrect, the function address shouldn't exceed
_end.
>>> The reason is for kernel>=v6.2, the offset of pt_regs to sp changed from
>>> STACK_FRAME_OVERHEAD, i.e 112, to STACK_SWITCH_FRAME_REGS. For
>>> CONFIG_PPC64_ELF_ABI_V1, it's 112, for ABI_V2, it's 48. So the nip
will read a
>>> wrong value from stack when ABI_V2 enabled.
>>>
>>> To determine if ABI_V2 enabled is tricky. This patch do it by check the
>>> following:
>>>
>>> In arch/powerpc/include/asm/ppc_asm.h:
>>> #ifdef CONFIG_PPC64_ELF_ABI_V2
>>> #define STK_GOT 24
>>> #else
>>> #define STK_GOT 40
>>>
>>> In arch/powerpc/kernel/tm.S:
>>> _GLOBAL(tm_reclaim)
>>> mfcr r5
>>> mflr r0
>>> stw r5, 8(r1)
>>> std r0, 16(r1)
>>> std r2, STK_GOT(r1)
>>> ...
>>>
>>> So a disassemble on tm_reclaim, and extract the STK_GOT value from std
>>> instruction is used as the approach.
>>>
>>> After the patch:
>>> crash> bt 1
>>> PID: 1 TASK: c000000003714b80 CPU: 2 COMMAND: "systemd"
>>> #0 [c0000000037737d0] __schedule at c00000000112e4ec
>>> #1 [c0000000037738b0] schedule at c00000000112ea80
>>> ...
>>>
>>> Signed-off-by: Tao Liu <ltao(a)redhat.com>
>>> ---
>>> defs.h | 1 +
>>> ppc64.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>> 2 files changed, 58 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/defs.h b/defs.h
>>> index 2231cb6..d5cb8cc 100644
>>> --- a/defs.h
>>> +++ b/defs.h
>>> @@ -4643,6 +4643,7 @@ struct efi_memory_desc_t {
>>> #define MSR_PR_LG 14 /* Problem State / Privilege Level */
>>> /* Used to find the user or kernel-mode frame*/
>>> +#define STACK_SWITCH_FRAME_REGS 48
>>> #define STACK_FRAME_OVERHEAD 112
>>> #define EXCP_FRAME_MARKER 0x7265677368657265
>>> diff --git a/ppc64.c b/ppc64.c
>>> index e8930a1..6e5f155 100644
>>> --- a/ppc64.c
>>> +++ b/ppc64.c
>>> @@ -72,6 +72,7 @@ static ulong pud_page_vaddr_l4(ulong pud);
>>> static ulong pmd_page_vaddr_l4(ulong pmd);
>>> static int is_opal_context(ulong sp, ulong nip);
>>> void opalmsg(void);
>>> +static bool is_ppc64_elf_abi_v2(void);
>>> static int is_opal_context(ulong sp, ulong nip)
>>> {
>>> @@ -2813,6 +2814,51 @@ ppc64_get_sp(ulong task)
>>> return sp;
>>> }
>>> +static bool
>>> +is_ppc64_elf_abi_v2(void)
>>> +{
>>
>> Thanks for fixing these issues on ppc64 !
>>
>>
>> It is good. I am also checking if there is a way to implement
>> 'is_ppc64_elf_abi_v2' by getting ABI version from the ELF notes, as
'file'
>> command somehow does know:
>>
>> $ file vmlinux-little-endian vmlinux-little-endian: ELF 64-bit LSB
>> executable, 64-bit PowerPC or cisco 7500, OpenPOWER ELF V2 ABI, version 1
>> (SYSV), statically linked,
>> BuildID[sha1]=f6449970e6af6cddeed1e260393e2377ccc0f369, not stripped
>>
>> That might be more dependable (if we can do that).
> How about this approach ?
>
> I feel this might be more dependable, as the file command also uses
> this, as well as fadump (PowerPC firmware-assisted dump) in the kernel.
>
>> As a sidenote, there is still some issue either in patch #1 or #3
>> causing all secondary CPUs (where swapper task is running) to show all
>> threads as '_end'
> What do you think ?
I think your approach will work, thanks for your code! Please see my
comments inlined in the code:
> Either way what you chose, I do realise the idea to utilise the
> 'std r2, STK_GOT(r1)' in such a way, was interesting and smart.
>
> static bool
> is_ppc64_elf_abi_v2(void)
> {
> const char *kernel_file = pc->namelist;
> int fd, swap;
> char eheader[BUFSIZE];
> Elf64_Ehdr *elf64;
>
> if ((fd = open(kernel_file, O_RDONLY)) < 0) {
> error(INFO, "%s: %s\n", kernel_file, strerror(errno));
> return FALSE;
> }
> if (read(fd, eheader, BUFSIZE) != BUFSIZE) {
> error(INFO, "%s: %s\n", kernel_file, strerror(errno));
> close(fd);
> return FALSE;
> }
> close(fd);
>
> if (!STRNEQ(eheader, ELFMAG) || eheader[EI_VERSION] != EV_CURRENT)
> return FALSE;
>
> elf64 = (Elf64_Ehdr *)&eheader[0];
>
> swap = (((eheader[EI_DATA] == ELFDATA2LSB) &&
> (__BYTE_ORDER == __BIG_ENDIAN)) ||
> ((eheader[EI_DATA] == ELFDATA2MSB) &&
> (__BYTE_ORDER == __LITTLE_ENDIAN)));
The above code seems duplicated with the one in is_kernel(), in fact,
is_kernel() will be invoked at the very beginning to determine if one
file is vmlinux, so I would suggest to add our abi_v2 check here,
like:
in ppc64.c:
int ppc64_elf_abi = 0;
static bool is_ppc64_elf_abi_v2(void)
{
if (ppc64_elf_abi == 1)
...
else if (ppc64_elf_abi == 2)
...
else
...
}
in symbols.c:is_kernel():
...
} else if ((elf64->e_ident[EI_CLASS] == ELFCLASS64) &&
((swap16(elf64->e_type, swap) == ET_EXEC) ||
...
case EM_PPC64:
if (machine_type_mismatch(file, "PPC64", NULL, 0))
goto bailout;
+ else
+ ppc64_elf_abi = swap32(elf64->e_flags, swap);
break;
So we can initial the abi version at crash start only once, and get
rid of the code duplication.
Yes, I actually copied pasted the initial part from is_kernel.
The approach is good, just to clarify, we know that is_kernel will be
called everytime before this check right ?
Thanks,
Aditya Gupta
>
>> /*
>> * The ABI version is stored in E_FLAGS in the ELF header, atleast for
>> * ppc64 binaries.
>> *
>> * These logic is also used by the 'file' command, which states
this in
>> * its magic file:
>> *
>> * >18 leshort 20 PowerPC or cisco
4500,
>> * >18 leshort 21 64-bit PowerPC or
cisco 7500,
>> * >>48 lelong 0 Unspecified or
Power ELF V1 ABI,
>> * >>48 lelong 1 Power ELF V1 ABI,
>> * >>48 lelong 2 OpenPOWER ELF V2
ABI,
>> *
>> * The '>18' above means, data at 18 (0x12), which is
basically
>> * 'elf64->e_machine', which will be 20 for PPC, and 21 for
PPC64
>> *
>> * Similarly, '>>48' is offset 48 (0x30) in the file,
which is
>> * basically 'elf64->e_flags', which has the ELF ABI version
>> */
>> if (elf64->e_flags == 2) {
>> /* ABI v2 */
>> return TRUE;
>> } else if (elf64->e_flags == 1) {
>> /* ABI v1 */
>> return FALSE;
>> }
>>
>> error(WARNING, "Unstable elf_abi v1/v2 detection.\n");
>> return FALSE;
>> }
>>
>> Also, we can those values in the ELF header:
>>
>> $ hexdump vmlinux-little-endian
>> 0000000 457f 464c 0102 0001 0000 0000 0000 0000
>> 0000010 0002 0015 (= 21 = E_MACHINE) 0001 0000 0000 0000 0000 c000
>> 0000020 0040 0000 0000 0000 8bf8 0040 0000 0000
>> 0000030 0002 (= E_FLAGS = ABI) 0000 0040 0038 0002 0040 0027 0026
>> 0000040 0001 0000 0007 0000 0000 0001 0000 0000
>>
> I agree with the elf header check on the vmlinux file, it is much
> simpler. Also I compiled the kernel with CONFIG_PPC64_ELF_ABI_V1 on,
> the file cmd will give: Power ELF V1 ABI string. So the approach can
> work.
>
>> Also, I see 'pc->namelist' is basically the kernel filename, is that
>> correct ? Can that be renamed to kernel_filename or something ?
>>
> Yes, by reading through the code, pc->namelist is vmlinux, and
> pc->namelist_debug is the debug symbol if vmlinux doesn't have the
> symbol. But I didn't see the need for the rename.
>
> Thanks,
> Tao Liu
>
>> Thanks,
>> Aditya Gupta
>>
>>> Thanks,
>>>
>>> Aditya Gupta
>>>
>>>> + char buf1[BUFSIZE];
>>>> + char *pos1, *pos2;
>>>> + int errflag = 0;
>>>> + ulong stk_got = 0;
>>>> + static bool ret = false;
>>>> + static bool checked = false;
>>>> +
>>>> + if (checked == true || !symbol_exists("tm_reclaim"))
>>>> + return ret;
>>>> +
>>>> + sprintf(buf1, "x/16i tm_reclaim");
>>>> + open_tmpfile();
>>>> + if (!gdb_pass_through(buf1, pc->tmpfile, GNU_RETURN_ON_ERROR))
>>>> + goto out;
>>>> + checked = true;
>>>> + rewind(pc->tmpfile);
>>>> + while (fgets(buf1, BUFSIZE, pc->tmpfile)) {
>>>> + // "std r2, STK_GOT(r1)" is expected
>>>> + if (strstr(buf1, "std") &&
>>>> + strstr(buf1, "(r1)") &&
>>>> + (pos1 = strstr(buf1, "r2,"))) {
>>>> + pos1 += strlen("r2,");
>>>> + for (pos2 = pos1; *pos2 != '\0' &&
*pos2 != '('; pos2++);
>>>> + *pos2 = '\0';
>>>> + stk_got = stol(pos1, RETURN_ON_ERROR|QUIET,
&errflag);
>>>> + break;
>>>> + }
>>>> + }
>>>> +
>>>> + if (!errflag) {
>>>> + switch (stk_got) {
>>>> + case 24:
>>>> + ret = true;
>>>> + case 40:
>>>> + goto out;
>>>> + }
>>>> + }
>>>> + error(WARNING, "Unstable elf_abi v1/v2 detection.\n");
>>>> +out:
>>>> + close_tmpfile();
>>>> + return ret;
>>>> +}
>>>> /*
>>>> * get the SP and PC values for idle tasks.
>>>> @@ -2834,9 +2880,17 @@ get_ppc64_frame(struct bt_info *bt, ulong *getpc,
ulong *getsp)
>>>> sp = ppc64_get_sp(task);
>>>> if (!INSTACK(sp, bt))
>>>> goto out;
>>>> - readmem(sp+STACK_FRAME_OVERHEAD, KVADDR, ®s,
>>>> - sizeof(struct ppc64_pt_regs),
>>>> - "PPC64 pt_regs", FAULT_ON_ERROR);
>>>> +
>>>> + if (THIS_KERNEL_VERSION >= LINUX(6,2,0) &&
is_ppc64_elf_abi_v2()) {
>>>> + readmem(sp+STACK_SWITCH_FRAME_REGS, KVADDR, ®s,
>>>> + sizeof(struct ppc64_pt_regs),
>>>> + "PPC64 pt_regs", FAULT_ON_ERROR);
>>>> + } else {
>>>> + readmem(sp+STACK_FRAME_OVERHEAD, KVADDR, ®s,
>>>> + sizeof(struct ppc64_pt_regs),
>>>> + "PPC64 pt_regs", FAULT_ON_ERROR);
>>>> + }
>>>> +
>>>> ip = regs.nip;
>>>> closest = closest_symbol(ip);
>>>> if (STREQ(closest, ".__switch_to") || STREQ(closest,
"__switch_to")) {