Re: [PATCH v4 15/16] vmware_guestdump: Various format versions support
by lijiang
On Fri, May 31, 2024 at 5:38 PM <devel-request(a)lists.crash-utility.osci.io>
wrote:
> Date: Fri, 31 May 2024 17:19:38 +0800
> From: Tao Liu <ltao(a)redhat.com>
> Subject: [Crash-utility] [PATCH v4 15/16] vmware_guestdump: Various
> format versions support
> To: devel(a)lists.crash-utility.osci.io
> Cc: Alexey Makhalov <alexey.makhalov(a)broadcom.com>, Mahesh J
> Salgaonkar <mahesh(a)linux.ibm.com>, "Naveen N . Rao"
> <naveen.n.rao(a)linux.vnet.ibm.com>, Lianbo Jiang <
> lijiang(a)redhat.com>
> Message-ID: <20240531091939.97828-16-ltao(a)redhat.com>
> Content-Type: text/plain; charset=UTF-8
>
> From: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
>
> There are several versions of debug.guest format. Current version of
> the code is able to parse only version 4.
>
> Improve parser to support other known versions. Split data structures
> on sub-structures and introduce a helper functions to calculate a gap
> between them based on the version number. Implement additional data
> structure (struct mainmeminfo_old) and logic specifically for original
> (version 1) format support.
>
> Cc: Sourabh Jain <sourabhjain(a)linux.ibm.com>
> Cc: Hari Bathini <hbathini(a)linux.ibm.com>
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>
> Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
> Cc: Lianbo Jiang <lijiang(a)redhat.com>
> Cc: HAGIO KAZUHITO(萩尾 一仁) <k-hagio-ab(a)nec.com>
> Cc: Tao Liu <ltao(a)redhat.com>
> Cc: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Signed-off-by: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> ---
> vmware_guestdump.c | 316 ++++++++++++++++++++++++++++++++-------------
> 1 file changed, 229 insertions(+), 87 deletions(-)
>
>
The current patch is still not improved, commented on it there:
https://www.spinics.net/linux/fedora/redhat-crash-utility/msg11627.html
Thanks
Lianbo
> diff --git a/vmware_guestdump.c b/vmware_guestdump.c
> index 5be26c8..5c7ee4d 100644
> --- a/vmware_guestdump.c
> +++ b/vmware_guestdump.c
> @@ -2,6 +2,8 @@
> * vmware_guestdump.c
> *
> * Copyright (c) 2020 VMware, Inc.
> + * Copyright (c) 2024 Broadcom. All Rights Reserved. The term "Broadcom"
> + * refers to Broadcom Inc. and/or its subsidiaries.
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -13,7 +15,7 @@
> * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> * GNU General Public License for more details.
> *
> - * Author: Alexey Makhalov <amakhalov(a)vmware.com>
> + * Author: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> */
>
> #include "defs.h"
> @@ -21,20 +23,31 @@
>
> #define LOGPRX "vmw: "
>
> -#define GUESTDUMP_VERSION 4
> -#define GUESTDUMP_MAGIC1 1
> -#define GUESTDUMP_MAGIC2 0
> -
> +/*
> + * debug.guest file layout
> + * 00000000: guest dump header, it includes:
> + * 1. Version (4 bytes) \
> + * 2. Number of Virtual CPUs (4 bytes) } - struct
> guestdumpheader
> + * 3. Reserved gap
> + * 4. Main Memory information - struct mainmeminfo{,_old}
> + * (use get_vcpus_offset() to get total size of guestdumpheader)
> + * vcpus_offset: ---------\
> + * 1. struct vcpu_state1 \
> + * 2. reserved gap } num_vcpus times
> + * 3. struct vcpu_state2 /
> + * 4. 4KB of reserved data /
> + * --------/
> + *
> + */
> struct guestdumpheader {
> uint32_t version;
> uint32_t num_vcpus;
> - uint8_t magic1;
> - uint8_t reserved1;
> - uint32_t cpu_vendor;
> - uint64_t magic2;
> +} __attribute__((packed)) hdr;
> +
> +struct mainmeminfo {
> uint64_t last_addr;
> uint64_t memsize_in_pages;
> - uint32_t reserved2;
> + uint32_t reserved1;
> uint32_t mem_holes;
> struct memhole {
> uint64_t ppn;
> @@ -42,14 +55,36 @@ struct guestdumpheader {
> } holes[2];
> } __attribute__((packed));
>
> -struct vcpu_state {
> +/* Used by version 1 only */
> +struct mainmeminfo_old {
> + uint64_t last_addr;
> + uint32_t memsize_in_pages;
> + uint32_t reserved1;
> + uint32_t mem_holes;
> + struct memhole1 {
> + uint32_t ppn;
> + uint32_t pages;
> + } holes[2];
> + /* There are additional fields, see get_vcpus_offset()
> calculation. */
> +} __attribute__((packed));
> +
> +/* First half of vcpu_state */
> +struct vcpu_state1 {
> uint32_t cr0;
> uint64_t cr2;
> uint64_t cr3;
> uint64_t cr4;
> uint64_t reserved1[10];
> uint64_t idt_base;
> - uint16_t reserved2[21];
> +} __attribute__((packed));
> +
> +/*
> + * Unused fields between vcpu_state1 and vcpu_state2 swill be skipped.
> + * See get_vcpu_gapsize() calculation.
> + */
> +
> +/* Second half of vcpu_state */
> +struct vcpu_state2 {
> struct x86_64_pt_regs {
> uint64_t r15;
> uint64_t r14;
> @@ -76,9 +111,41 @@ struct vcpu_state {
> uint8_t reserved3[65];
> } __attribute__((packed));
>
> +/*
> + * Returns the size of the guest dump header.
> + */
> +static inline long
> +get_vcpus_offset(uint32_t version, int mem_holes)
> +{
> + switch (version) {
> + case 1: /* ESXi 6.7 and older */
> + return sizeof(struct guestdumpheader) + 13 +
> sizeof(struct mainmeminfo_old) +
> + (mem_holes == -1 ? 0 : 8 * mem_holes + 4);
> + case 3: /* ESXi 6.8 */
> + return sizeof(struct guestdumpheader) + 14 +
> sizeof(struct mainmeminfo);
> + case 4: /* ESXi 7.0 */
> + case 5: /* ESXi 8.0 */
> + return sizeof(struct guestdumpheader) + 14 +
> sizeof(struct mainmeminfo);
> + case 6: /* ESXi 8.0u2 */
> + return sizeof(struct guestdumpheader) + 15 +
> sizeof(struct mainmeminfo);
> +
> + }
> + return 0;
> +}
> +
> +/*
> + * Returns the size of reserved (unused) fields in the middle of
> vcpu_state structure.
> + */
> +static inline long
> +get_vcpu_gapsize(uint32_t version)
> +{
> + if (version < 4)
> + return 45;
> + return 42;
> +}
>
> /*
> - * vmware_guestdump is extension to vmware_vmss with ability to debug
> + * vmware_guestdump is an extension to the vmware_vmss with ability to
> debug
> * debug.guest and debug.vmem files.
> *
> * debug.guest.gz and debug.vmem.gz can be obtained using following
> @@ -86,73 +153,136 @@ struct vcpu_state {
> * monitor.mini-suspend_on_panic = TRUE
> * monitor.suspend_on_triplefault = TRUE
> *
> - * guestdump (debug.guest) is simplified version of *.vmss which does
> - * not contain full VM state, but minimal guest state, such as memory
> + * guestdump (debug.guest) is a simplified version of the *.vmss which
> does
> + * not contain a full VM state, but minimal guest state, such as a memory
> * layout and CPUs state, needed for debugger. is_vmware_guestdump()
> * and vmware_guestdump_init() functions parse guestdump header and
> - * populate vmss data structure (from vmware_vmss.c). As result, all
> + * populate vmss data structure (from vmware_vmss.c). In result, all
> * handlers (except mempry_dump) from vmware_vmss.c can be reused.
> *
> - * debug.guest does not have dedicated header magic or signature for
> - * its format. To probe debug.guest we need to perform header fields
> - * and file size validity. In addition, check for the filename
> - * extension, which must be ".guest".
> + * debug.guest does not have a dedicated header magic or file format
> signature
> + * To probe debug.guest we need to perform series of validations. In
> addition,
> + * we check for the filename extension, which must be ".guest".
> */
> -
> int
> is_vmware_guestdump(char *filename)
> {
> - struct guestdumpheader hdr;
> + struct mainmeminfo mmi;
> + long vcpus_offset;
> FILE *fp;
> - uint64_t filesize, holes_sum = 0;
> + uint64_t filesize, expected_filesize, holes_sum = 0;
> int i;
>
> if (strcmp(filename + strlen(filename) - 6, ".guest"))
> return FALSE;
>
> - if ((fp = fopen(filename, "r")) == NULL) {
> + if ((fp = fopen(filename, "r")) == NULL) {
> error(INFO, LOGPRX"Failed to open '%s': [Error %d] %s\n",
> - filename, errno, strerror(errno));
> + filename, errno, strerror(errno));
> return FALSE;
> - }
> + }
>
> if (fread(&hdr, sizeof(struct guestdumpheader), 1, fp) != 1) {
> error(INFO, LOGPRX"Failed to read '%s' from file '%s':
> [Error %d] %s\n",
> - "guestdumpheader", filename, errno, strerror(errno));
> + "guestdumpheader", filename, errno,
> strerror(errno));
> + fclose(fp);
> + return FALSE;
> + }
> +
> + vcpus_offset = get_vcpus_offset(hdr.version, -1 /* Unknown yet,
> adjust it later */);
> +
> + if (!vcpus_offset) {
> + if (CRASHDEBUG(1))
> + error(INFO, LOGPRX"Not supported version %d\n",
> hdr.version);
> fclose(fp);
> return FALSE;
> }
>
> + if (hdr.version == 1) {
> + struct mainmeminfo_old tmp;
> + if (fseek(fp, vcpus_offset - sizeof(struct
> mainmeminfo_old), SEEK_SET) == -1) {
> + if (CRASHDEBUG(1))
> + error(INFO, LOGPRX"Failed to fseek '%s':
> [Error %d] %s\n",
> + filename, errno,
> strerror(errno));
> + fclose(fp);
> + return FALSE;
> + }
> +
> + if (fread(&tmp, sizeof(struct mainmeminfo_old), 1, fp) !=
> 1) {
> + if (CRASHDEBUG(1))
> + error(INFO, LOGPRX"Failed to read '%s'
> from file '%s': [Error %d] %s\n",
> + "mainmeminfo_old",
> filename, errno, strerror(errno));
> + fclose(fp);
> + return FALSE;
> + }
> + mmi.last_addr = tmp.last_addr;
> + mmi.memsize_in_pages = tmp.memsize_in_pages;
> + mmi.mem_holes = tmp.mem_holes;
> + mmi.holes[0].ppn = tmp.holes[0].ppn;
> + mmi.holes[0].pages = tmp.holes[0].pages;
> + mmi.holes[1].ppn = tmp.holes[1].ppn;
> + mmi.holes[1].pages = tmp.holes[1].pages;
> + /* vcpu_offset adjustment for mem_holes is required only
> for version 1. */
> + vcpus_offset = get_vcpus_offset(hdr.version,
> mmi.mem_holes);
> + } else {
> + if (fseek(fp, vcpus_offset - sizeof(struct mainmeminfo),
> SEEK_SET) == -1) {
> + if (CRASHDEBUG(1))
> + error(INFO, LOGPRX"Failed to fseek '%s':
> [Error %d] %s\n",
> + filename, errno,
> strerror(errno));
> + fclose(fp);
> + return FALSE;
> + }
> +
> + if (fread(&mmi, sizeof(struct mainmeminfo), 1, fp) != 1) {
> + if (CRASHDEBUG(1))
> + error(INFO, LOGPRX"Failed to read '%s'
> from file '%s': [Error %d] %s\n",
> + "mainmeminfo", filename,
> errno, strerror(errno));
> + fclose(fp);
> + return FALSE;
> + }
> + }
> if (fseek(fp, 0L, SEEK_END) == -1) {
> - error(INFO, LOGPRX"Failed to fseek '%s': [Error %d] %s\n",
> - filename, errno, strerror(errno));
> + if (CRASHDEBUG(1))
> + error(INFO, LOGPRX"Failed to fseek '%s': [Error
> %d] %s\n",
> + filename, errno, strerror(errno));
> fclose(fp);
> return FALSE;
> }
> filesize = ftell(fp);
> fclose(fp);
>
> - if (hdr.mem_holes > 2)
> - goto unrecognized;
> + if (mmi.mem_holes > 2) {
> + if (CRASHDEBUG(1))
> + error(INFO, LOGPRX"Unexpected mmi.mem_holes value
> %d\n",
> + mmi.mem_holes);
> + return FALSE;
> + }
>
> - for (i = 0; i < hdr.mem_holes; i++) {
> + for (i = 0; i < mmi.mem_holes; i++) {
> /* hole start page */
> - vmss.regions[i].startpagenum = hdr.holes[i].ppn;
> + vmss.regions[i].startpagenum = mmi.holes[i].ppn;
> /* hole end page */
> - vmss.regions[i].startppn = hdr.holes[i].ppn +
> hdr.holes[i].pages;
> - holes_sum += hdr.holes[i].pages;
> + vmss.regions[i].startppn = mmi.holes[i].ppn +
> mmi.holes[i].pages;
> + holes_sum += mmi.holes[i].pages;
> + }
> +
> + if ((mmi.last_addr + 1) != ((mmi.memsize_in_pages + holes_sum) <<
> VMW_PAGE_SHIFT)) {
> + if (CRASHDEBUG(1))
> + error(INFO, LOGPRX"Memory size check failed\n");
> + return FALSE;
> }
>
> - if (hdr.version != GUESTDUMP_VERSION ||
> - hdr.magic1 != GUESTDUMP_MAGIC1 ||
> - hdr.magic2 != GUESTDUMP_MAGIC2 ||
> - (hdr.last_addr + 1) != ((hdr.memsize_in_pages + holes_sum) <<
> VMW_PAGE_SHIFT) ||
> - filesize != sizeof(struct guestdumpheader) +
> - hdr.num_vcpus * (sizeof (struct vcpu_state) + VMW_PAGE_SIZE))
> - goto unrecognized;
> + expected_filesize = vcpus_offset + hdr.num_vcpus * (sizeof(struct
> vcpu_state1) +
> + get_vcpu_gapsize(hdr.version) + sizeof(struct vcpu_state2)
> + VMW_PAGE_SIZE);
> + if (filesize != expected_filesize) {
> + if (CRASHDEBUG(1))
> + error(INFO, LOGPRX"Incorrect file size: %d !=
> %d\n",
> + filesize, expected_filesize);
> + return FALSE;
> + }
>
> - vmss.memsize = hdr.memsize_in_pages << VMW_PAGE_SHIFT;
> - vmss.regionscount = hdr.mem_holes + 1;
> + vmss.memsize = mmi.memsize_in_pages << VMW_PAGE_SHIFT;
> + vmss.regionscount = mmi.mem_holes + 1;
> vmss.memoffset = 0;
> vmss.num_vcpus = hdr.num_vcpus;
> return TRUE;
> @@ -169,7 +299,8 @@ vmware_guestdump_init(char *filename, FILE *ofp)
> FILE *fp = NULL;
> int i, result = TRUE;
> char *vmem_filename = NULL;
> - struct vcpu_state vs;
> + struct vcpu_state1 vs1;
> + struct vcpu_state2 vs2;
> char *p;
>
> if (!machine_type("X86") && !machine_type("X86_64")) {
> @@ -180,14 +311,14 @@ vmware_guestdump_init(char *filename, FILE *ofp)
> goto exit;
> }
>
> - if ((fp = fopen(filename, "r")) == NULL) {
> + if ((fp = fopen(filename, "r")) == NULL) {
> error(INFO, LOGPRX"Failed to open '%s': [Error %d] %s\n",
> filename, errno, strerror(errno));
> result = FALSE;
> goto exit;
> - }
> + }
>
> - if (fseek(fp, sizeof(struct guestdumpheader), SEEK_SET) == -1) {
> + if (fseek(fp, get_vcpus_offset(hdr.version, vmss.regionscount -
> 1), SEEK_SET) == -1) {
> error(INFO, LOGPRX"Failed to fseek '%s': [Error %d] %s\n",
> filename, errno, strerror(errno));
> result = FALSE;
> @@ -203,7 +334,19 @@ vmware_guestdump_init(char *filename, FILE *ofp)
> }
>
> for (i = 0; i < vmss.num_vcpus; i++) {
> - if (fread(&vs, sizeof(struct vcpu_state), 1, fp) != 1) {
> + if (fread(&vs1, sizeof(struct vcpu_state1), 1, fp) != 1) {
> + error(INFO, LOGPRX"Failed to read '%s' from file
> '%s': [Error %d] %s\n",
> + "vcpu_state", filename, errno,
> strerror(errno));
> + result = FALSE;
> + goto exit;
> + }
> + if (fseek(fp, get_vcpu_gapsize(hdr.version), SEEK_CUR) ==
> -1) {
> + error(INFO, LOGPRX"Failed to read '%s' from file
> '%s': [Error %d] %s\n",
> + "vcpu_state", filename, errno,
> strerror(errno));
> + result = FALSE;
> + goto exit;
> + }
> + if (fread(&vs2, sizeof(struct vcpu_state2), 1, fp) != 1) {
> error(INFO, LOGPRX"Failed to read '%s' from file
> '%s': [Error %d] %s\n",
> "vcpu_state", filename, errno,
> strerror(errno));
> result = FALSE;
> @@ -217,29 +360,29 @@ vmware_guestdump_init(char *filename, FILE *ofp)
> }
> vmss.vcpu_regs[i] = 0;
>
> - vmss.regs64[i]->rax = vs.regs64.rax;
> - vmss.regs64[i]->rcx = vs.regs64.rcx;
> - vmss.regs64[i]->rdx = vs.regs64.rdx;
> - vmss.regs64[i]->rbx = vs.regs64.rbx;
> - vmss.regs64[i]->rbp = vs.regs64.rbp;
> - vmss.regs64[i]->rsp = vs.regs64.rsp;
> - vmss.regs64[i]->rsi = vs.regs64.rsi;
> - vmss.regs64[i]->rdi = vs.regs64.rdi;
> - vmss.regs64[i]->r8 = vs.regs64.r8;
> - vmss.regs64[i]->r9 = vs.regs64.r9;
> - vmss.regs64[i]->r10 = vs.regs64.r10;
> - vmss.regs64[i]->r11 = vs.regs64.r11;
> - vmss.regs64[i]->r12 = vs.regs64.r12;
> - vmss.regs64[i]->r13 = vs.regs64.r13;
> - vmss.regs64[i]->r14 = vs.regs64.r14;
> - vmss.regs64[i]->r15 = vs.regs64.r15;
> - vmss.regs64[i]->idtr = vs.idt_base;
> - vmss.regs64[i]->cr[0] = vs.cr0;
> - vmss.regs64[i]->cr[2] = vs.cr2;
> - vmss.regs64[i]->cr[3] = vs.cr3;
> - vmss.regs64[i]->cr[4] = vs.cr4;
> - vmss.regs64[i]->rip = vs.regs64.rip;
> - vmss.regs64[i]->rflags = vs.regs64.eflags;
> + vmss.regs64[i]->rax = vs2.regs64.rax;
> + vmss.regs64[i]->rcx = vs2.regs64.rcx;
> + vmss.regs64[i]->rdx = vs2.regs64.rdx;
> + vmss.regs64[i]->rbx = vs2.regs64.rbx;
> + vmss.regs64[i]->rbp = vs2.regs64.rbp;
> + vmss.regs64[i]->rsp = vs2.regs64.rsp;
> + vmss.regs64[i]->rsi = vs2.regs64.rsi;
> + vmss.regs64[i]->rdi = vs2.regs64.rdi;
> + vmss.regs64[i]->r8 = vs2.regs64.r8;
> + vmss.regs64[i]->r9 = vs2.regs64.r9;
> + vmss.regs64[i]->r10 = vs2.regs64.r10;
> + vmss.regs64[i]->r11 = vs2.regs64.r11;
> + vmss.regs64[i]->r12 = vs2.regs64.r12;
> + vmss.regs64[i]->r13 = vs2.regs64.r13;
> + vmss.regs64[i]->r14 = vs2.regs64.r14;
> + vmss.regs64[i]->r15 = vs2.regs64.r15;
> + vmss.regs64[i]->idtr = vs1.idt_base;
> + vmss.regs64[i]->cr[0] = vs1.cr0;
> + vmss.regs64[i]->cr[2] = vs1.cr2;
> + vmss.regs64[i]->cr[3] = vs1.cr3;
> + vmss.regs64[i]->cr[4] = vs1.cr4;
> + vmss.regs64[i]->rip = vs2.regs64.rip;
> + vmss.regs64[i]->rflags = vs2.regs64.eflags;
>
> vmss.vcpu_regs[i] = REGS_PRESENT_ALL;
> }
> @@ -268,9 +411,9 @@ vmware_guestdump_init(char *filename, FILE *ofp)
> fprintf(ofp, LOGPRX"vmem file: %s\n\n", vmem_filename);
>
> if (CRASHDEBUG(1)) {
> - vmware_guestdump_memory_dump(ofp);
> - dump_registers_for_vmss_dump();
> - }
> + vmware_guestdump_memory_dump(ofp);
> + dump_registers_for_vmss_dump();
> + }
>
> exit:
> if (fp)
> @@ -296,24 +439,23 @@ exit:
> int
> vmware_guestdump_memory_dump(FILE *ofp)
> {
> + uint64_t holes_sum = 0;
> + unsigned i;
> +
> fprintf(ofp, "vmware_guestdump:\n");
> fprintf(ofp, " Header: version=%d num_vcpus=%llu\n",
> - GUESTDUMP_VERSION, (ulonglong)vmss.num_vcpus);
> + hdr.version, (ulonglong)vmss.num_vcpus);
> fprintf(ofp, "Total memory: %llu\n", (ulonglong)vmss.memsize);
>
> - if (vmss.regionscount > 1) {
> - uint64_t holes_sum = 0;
> - unsigned i;
>
> - fprintf(ofp, "Memory regions[%d]:\n", vmss.regionscount);
> - fprintf(ofp, " [0x%016x-", 0);
> - for (i = 0; i < vmss.regionscount - 1; i++) {
> - fprintf(ofp, "0x%016llx]\n",
> (ulonglong)vmss.regions[i].startpagenum << VMW_PAGE_SHIFT);
> - fprintf(ofp, " [0x%016llx-",
> (ulonglong)vmss.regions[i].startppn << VMW_PAGE_SHIFT);
> - holes_sum += vmss.regions[i].startppn -
> vmss.regions[i].startpagenum;
> - }
> - fprintf(ofp, "0x%016llx]\n", (ulonglong)vmss.memsize +
> (holes_sum << VMW_PAGE_SHIFT));
> + fprintf(ofp, "Memory regions[%d]:\n", vmss.regionscount);
> + fprintf(ofp, " [0x%016x-", 0);
> + for (i = 0; i < vmss.regionscount - 1; i++) {
> + fprintf(ofp, "0x%016llx]\n",
> (ulonglong)vmss.regions[i].startpagenum << VMW_PAGE_SHIFT);
> + fprintf(ofp, " [0x%016llx-",
> (ulonglong)vmss.regions[i].startppn << VMW_PAGE_SHIFT);
> + holes_sum += vmss.regions[i].startppn -
> vmss.regions[i].startpagenum;
> }
> + fprintf(ofp, "0x%016llx]\n", (ulonglong)vmss.memsize + (holes_sum
> << VMW_PAGE_SHIFT));
>
> return TRUE;
> }
> --
> 2.40.1
>
4 months
Re: [PATCH v4 14/16] x86_64: fix gdb bt for vmware dumps (Tao Liu)
by lijiang
On Fri, May 31, 2024 at 5:38 PM <devel-request(a)lists.crash-utility.osci.io>
wrote:
> Date: Fri, 31 May 2024 17:19:37 +0800
> From: Tao Liu <ltao(a)redhat.com>
> Subject: [Crash-utility] [PATCH v4 14/16] x86_64: fix gdb bt for
> vmware dumps
> To: devel(a)lists.crash-utility.osci.io
> Cc: Alexey Makhalov <alexey.makhalov(a)broadcom.com>, Mahesh J
> Salgaonkar <mahesh(a)linux.ibm.com>, "Naveen N . Rao"
> <naveen.n.rao(a)linux.vnet.ibm.com>, Lianbo Jiang <
> lijiang(a)redhat.com>
> Message-ID: <20240531091939.97828-15-ltao(a)redhat.com>
> Content-Type: text/plain; charset=UTF-8
>
> From: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
>
> vmware_vmss_get_cpu_reg() whould be called only for active tasks
> to get their registers from corresponding CPUs.
> Otherwise, the standard path of fetching pt_regs from the memory
> (inactive_task_frame) should be used.
>
> Cc: Sourabh Jain <sourabhjain(a)linux.ibm.com>
> Cc: Hari Bathini <hbathini(a)linux.ibm.com>
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>
> Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
> Cc: Lianbo Jiang <lijiang(a)redhat.com>
> Cc: HAGIO KAZUHITO(萩尾 一仁) <k-hagio-ab(a)nec.com>
> Cc: Tao Liu <ltao(a)redhat.com>
> Cc: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Signed-off-by: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> ---
> x86_64.c | 5 ++++-
> 1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/x86_64.c b/x86_64.c
> index 47c215f..617a4ab 100644
> --- a/x86_64.c
> +++ b/x86_64.c
> @@ -9232,7 +9232,10 @@ x86_64_get_current_task_reg(int regno, const char
> *name,
> if (!tc)
> return FALSE;
>
> - if (VMSS_DUMPFILE())
> + /*
> + * Task is active, grab CPU's registers
> + */
> + if (is_task_active(tc->task) && VMSS_DUMPFILE())
> return vmware_vmss_get_cpu_reg(tc->processor, regno, name,
> size, value);
>
>
Can you try to fold this change into the [PATCH 09/16]? And add the current
descriptions in the patch log.
Thanks
Lianbo
BZERO(&bt_setup, sizeof(struct bt_info));
> --
> 2.40.1
>
4 months
Re: [PATCH v4 13/16] set_context(): check if context is already current
by lijiang
On Fri, May 31, 2024 at 5:38 PM <devel-request(a)lists.crash-utility.osci.io>
wrote:
> Date: Fri, 31 May 2024 17:19:36 +0800
> From: Tao Liu <ltao(a)redhat.com>
> Subject: [Crash-utility] [PATCH v4 13/16] set_context(): check if
> context is already current
> To: devel(a)lists.crash-utility.osci.io
> Cc: Alexey Makhalov <alexey.makhalov(a)broadcom.com>, Mahesh J
> Salgaonkar <mahesh(a)linux.ibm.com>, "Naveen N . Rao"
> <naveen.n.rao(a)linux.vnet.ibm.com>, Lianbo Jiang <
> lijiang(a)redhat.com>
> Message-ID: <20240531091939.97828-14-ltao(a)redhat.com>
> Content-Type: text/plain; charset=UTF-8
>
> From: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
>
> By doing it we avoid dropping gdb caches unnecessarily.
>
> Cc: Sourabh Jain <sourabhjain(a)linux.ibm.com>
> Cc: Hari Bathini <hbathini(a)linux.ibm.com>
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>
> Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
> Cc: Lianbo Jiang <lijiang(a)redhat.com>
> Cc: HAGIO KAZUHITO(萩尾 一仁) <k-hagio-ab(a)nec.com>
> Cc: Tao Liu <ltao(a)redhat.com>
> Cc: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Signed-off-by: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> ---
> task.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/task.c b/task.c
> index 8c00837..3814f6d 100644
> --- a/task.c
> +++ b/task.c
> @@ -5287,6 +5287,9 @@ set_context(ulong task, ulong pid, uint
> update_gdb_thread)
> struct task_context *tc;
> int found;
>
> + if (CURRENT_CONTEXT() && CURRENT_TASK() == task)
> + return TRUE;
> +
>
Seems it makes sense.
I would suggest also adding the pid checking as below:
+ if (CURRENT_CONTEXT() && (CURRENT_TASK() == task || CURRENT_PID()
== pid))
+ return TRUE;
+
What do you think?
Thanks
Lianbo
tc = FIRST_CONTEXT();
>
> for (i = 0, found = FALSE; i < RUNNING_TASKS(); i++, tc++) {
> --
> 2.40.1
>
4 months
Re: [PATCH v4 11/16] Fix cpumask_t recursive dependence issue
by lijiang
On Fri, May 31, 2024 at 5:33 PM <devel-request(a)lists.crash-utility.osci.io>
wrote:
> Date: Fri, 31 May 2024 17:19:34 +0800
> From: Tao Liu <ltao(a)redhat.com>
> Subject: [Crash-utility] [PATCH v4 11/16] Fix cpumask_t recursive
> dependence issue
> To: devel(a)lists.crash-utility.osci.io
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>, "Naveen N . Rao"
> <naveen.n.rao(a)linux.vnet.ibm.com>, Lianbo Jiang <
> lijiang(a)redhat.com>,
> Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Message-ID: <20240531091939.97828-12-ltao(a)redhat.com>
> Content-Type: text/plain; charset=UTF-8
>
> There is recursive dependence for cpumask_t and will exhause the stack,
> see the following stack trace:
>
> (gdb) bt
> ...snip...
> #61965 0x00000000005de98c in datatype_info (name=name@entry=0xa5b1fd
> "cpumask_t", member=member@entry=0x0, dm=dm@entry=0xfffffffffffffffc) at
> symbols.c:6694
> #61966 0x000000000057e4ea in cpu_map_size ...
> #61967 0x000000000058e7bd in get_cpus_online ...
> #61968 0x000000000061fa4b in diskdump_get_prstatus_percpu ...
> #61969 0x0000000000616d74 in get_netdump_regs_x86_64 ...
> #61970 0x0000000000585290 in get_dumpfile_regs ...
> #61971 0x00000000005b7a3c in x86_64_get_current_task_reg ...
> #61972 0x0000000000650389 in crash_target::fetch_registers ...
> #61973 0x00000000008f385a in target_fetch_registers ...
> #61974 0x000000000086ecda in regcache::raw_update ...
> #61975 regcache::raw_update ...
> #61976 0x000000000086ed7a in readable_regcache::raw_read ...
> #61977 0x000000000086f063 in readable_regcache::cooked_read_value ...
> #61978 0x000000000089c4ee in sentinel_frame_prev_register ...
> #61979 0x0000000000786c76 in frame_unwind_register_value ...
> #61980 0x0000000000786f18 in frame_register_unwind ...
> #61981 0x0000000000787267 in frame_unwind_register ...
> #61982 0x00000000007ad9b0 in i386_unwind_pc ...
> #61983 0x00000000007866c0 in frame_unwind_pc ...
> #61984 0x000000000078679c in get_frame_pc ...
> #61985 get_frame_address_in_block ...
> #61986 0x0000000000786849 in get_frame_address_in_block_if_available ...
> #61987 0x0000000000691466 in get_frame_block ...
> #61988 0x00000000008b9430 in get_selected_block ...
> #61989 0x000000000084f8f2 in parse_exp_in_context ...
> #61990 0x000000000084f9e5 in parse_exp_1 ...
> #61991 parse_expression ...
> #61992 0x00000000008d44da in gdb_get_datatype ...
> #61993 gdb_command_funnel_1 ...
> #61994 0x00000000008d48ae in gdb_command_funnel ...
> #61995 0x000000000059cc42 in gdb_interface ...
> #61996 0x00000000005de98c in datatype_info (name=name@entry=0xa5b1fd
> "cpumask_t", member=member@entry=0x0, dm=dm@entry=0xfffffffffffffffc) at
> symbols.c:6694
> #61997 0x000000000057e4ea in cpu_map_size ...
> #61998 0x000000000058e7bd in get_cpus_online () ...
> #61999 0x000000000061fa4b in diskdump_get_prstatus_percpu ...
> #62000 0x0000000000616d74 in get_netdump_regs_x86_64 ...
> #62001 0x0000000000585290 in get_dumpfile_regs ...
> #62002 0x00000000005b7a3c in x86_64_get_current_task_reg ...
> #62003 0x0000000000650389 in crash_target::fetch_registers ...
>
> The cpumask_t will be recursively evaluated. This patch will
> fix the bug.
>
> Cc: Sourabh Jain <sourabhjain(a)linux.ibm.com>
> Cc: Hari Bathini <hbathini(a)linux.ibm.com>
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>
> Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
> Cc: Lianbo Jiang <lijiang(a)redhat.com>
> Cc: HAGIO KAZUHITO(萩尾 一仁) <k-hagio-ab(a)nec.com>
> Cc: Tao Liu <ltao(a)redhat.com>
> Cc: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Signed-off-by: Tao Liu <ltao(a)redhat.com>
> ---
> defs.h | 1 +
> kernel.c | 17 ++++++++++-------
> 2 files changed, 11 insertions(+), 7 deletions(-)
>
> diff --git a/defs.h b/defs.h
> index ed52cc3..fd00462 100644
> --- a/defs.h
> +++ b/defs.h
> @@ -2429,6 +2429,7 @@ struct size_table { /* stash of
> commonly-used sizes */
> long maple_tree;
> long maple_node;
> long module_memory;
> + long cpumask_t;
> };
>
>
Can you add the cpumask_t into the dump_offset_table()?
struct array_table {
> diff --git a/kernel.c b/kernel.c
> index 3730c55..2cae305 100644
> --- a/kernel.c
> +++ b/kernel.c
> @@ -157,6 +157,7 @@ kernel_init()
> if (!(kt->cpu_flags = (ulong *)calloc(NR_CPUS, sizeof(ulong))))
> error(FATAL, "cannot malloc cpu_flags array");
>
> + STRUCT_SIZE_INIT(cpumask_t, "cpumask_t");
> cpu_maps_init();
>
> kt->stext = symbol_value("_stext");
> @@ -913,9 +914,10 @@ cpu_map_size(const char *type)
> struct gnu_request req;
>
> if (LKCD_KERNTYPES()) {
> - if ((len = STRUCT_SIZE("cpumask_t")) < 0)
> - error(FATAL, "cannot determine type cpumask_t\n");
> - return len;
> + if (INVALID_SIZE(cpumask_t))
> + error(FATAL, "cannot determine type cpumask_t\n");
> + else
> + return SIZE(cpumask_t);
>
Let's remove the 'else', keep the original style:
+ if (INVALID_SIZE(cpumask_t))
+ error(FATAL, "Invalid type cpumask_t\n");
+
+ return SIZE(cpumask_t);
}
>
> sprintf(map_symbol, "cpu_%s_map", type);
> @@ -925,11 +927,10 @@ cpu_map_size(const char *type)
> return len;
> }
>
> - len = STRUCT_SIZE("cpumask_t");
> - if (len < 0)
> + if (INVALID_SIZE(cpumask_t))
> return sizeof(ulong);
> else
> - return len;
> + return SIZE(cpumask_t);
>
Ditto.
+ if (INVALID_SIZE(cpumask_t))
return sizeof(ulong);
- else
- return len;
+
+ return SIZE(cpumask_t);
> }
>
> /*
> @@ -952,8 +953,10 @@ cpu_maps_init(void)
> { ACTIVE_MAP, "active" },
> };
>
> - if ((len = STRUCT_SIZE("cpumask_t")) < 0)
> + if (INVALID_SIZE(cpumask_t))
> len = sizeof(ulong);
> + else
> + len = SIZE(cpumask_t);
>
> buf = GETBUF(len);
>
In addition, I still found two similar calls as below. Can you try to make
similar changes in those two functions?
1. kernel.c:
void
generic_get_irq_affinity(int irq)
{
...
if ((len = STRUCT_SIZE("cpumask_t")) < 0)
len = DIV_ROUND_UP(kt->cpus, BITS_PER_LONG) * sizeof(ulong);
...
}
2. tools.
ulong *
get_cpumask_buf(void)
{
int cpulen;
if ((cpulen = STRUCT_SIZE("cpumask_t")) < 0)
cpulen = DIV_ROUND_UP(kt->cpus, BITS_PER_LONG) *
sizeof(ulong);
return (ulong *)GETBUF(cpulen);
}
Thanks
Lianbo
> --
> 2.40.1
>
4 months
Re: [PATCH v4 10/16] Parse stack by inactive_stack_frame priorily if the struct is valid
by lijiang
On Fri, May 31, 2024 at 5:30 PM <devel-request(a)lists.crash-utility.osci.io>
wrote:
> Date: Fri, 31 May 2024 17:19:33 +0800
> From: Tao Liu <ltao(a)redhat.com>
> Subject: [Crash-utility] [PATCH v4 10/16] Parse stack by
> inactive_stack_frame priorily if the struct is valid
> To: devel(a)lists.crash-utility.osci.io
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>, "Naveen N . Rao"
> <naveen.n.rao(a)linux.vnet.ibm.com>, Lianbo Jiang <
> lijiang(a)redhat.com>,
> Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Message-ID: <20240531091939.97828-11-ltao(a)redhat.com>
> Content-Type: text/plain; charset=UTF-8
>
> Cc: Sourabh Jain <sourabhjain(a)linux.ibm.com>
> Cc: Hari Bathini <hbathini(a)linux.ibm.com>
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>
> Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
> Cc: Lianbo Jiang <lijiang(a)redhat.com>
> Cc: HAGIO KAZUHITO(萩尾 一仁) <k-hagio-ab(a)nec.com>
> Cc: Tao Liu <ltao(a)redhat.com>
> Cc: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Signed-off-by: Tao Liu <ltao(a)redhat.com>
> ---
> x86_64.c | 22 +++++++++++-----------
> 1 file changed, 11 insertions(+), 11 deletions(-)
>
> diff --git a/x86_64.c b/x86_64.c
> index 4ba0b40..54c69fd 100644
> --- a/x86_64.c
> +++ b/x86_64.c
> @@ -6519,6 +6519,17 @@ x86_64_ORC_init(void)
> };
> struct ORC_data *orc;
>
> + MEMBER_OFFSET_INIT(inactive_task_frame_bp, "inactive_task_frame",
> "bp");
> + MEMBER_OFFSET_INIT(inactive_task_frame_ret_addr,
> "inactive_task_frame", "ret_addr");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r15, "inactive_task_frame",
> "r15");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r14, "inactive_task_frame",
> "r14");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r13, "inactive_task_frame",
> "r13");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r12, "inactive_task_frame",
> "r12");
> + MEMBER_OFFSET_INIT(inactive_task_frame_flags,
> "inactive_task_frame", "flags");
> + MEMBER_OFFSET_INIT(inactive_task_frame_si, "inactive_task_frame",
> "si");
> + MEMBER_OFFSET_INIT(inactive_task_frame_di, "inactive_task_frame",
> "di");
> + MEMBER_OFFSET_INIT(inactive_task_frame_bx, "inactive_task_frame",
> "bx");
> +
> if (machdep->flags & FRAMEPOINTER)
> return;
>
> @@ -6576,17 +6587,6 @@ x86_64_ORC_init(void)
> orc->__stop_orc_unwind = symbol_value("__stop_orc_unwind");
> orc->orc_lookup = symbol_value("orc_lookup");
>
> - MEMBER_OFFSET_INIT(inactive_task_frame_bp, "inactive_task_frame",
> "bp");
> - MEMBER_OFFSET_INIT(inactive_task_frame_ret_addr,
> "inactive_task_frame", "ret_addr");
> - MEMBER_OFFSET_INIT(inactive_task_frame_r15, "inactive_task_frame",
> "r15");
> - MEMBER_OFFSET_INIT(inactive_task_frame_r14, "inactive_task_frame",
> "r14");
> - MEMBER_OFFSET_INIT(inactive_task_frame_r13, "inactive_task_frame",
> "r13");
> - MEMBER_OFFSET_INIT(inactive_task_frame_r12, "inactive_task_frame",
> "r12");
> - MEMBER_OFFSET_INIT(inactive_task_frame_flags,
> "inactive_task_frame", "flags");
> - MEMBER_OFFSET_INIT(inactive_task_frame_si, "inactive_task_frame",
> "si");
> - MEMBER_OFFSET_INIT(inactive_task_frame_di, "inactive_task_frame",
> "di");
> - MEMBER_OFFSET_INIT(inactive_task_frame_bx, "inactive_task_frame",
> "bx");
> -
> orc->has_signal = MEMBER_EXISTS("orc_entry", "signal"); /* added
> at 6.3 */
> orc->has_end = MEMBER_EXISTS("orc_entry", "end"); /* removed
> at 6.4 */
>
I would suggest folding the current patch into [PATCH v4 09/16].
Thanks
Lianbo
>
> --
> 2.40.1
>
4 months
Re: [PATCH v4 07/16] Stop stack unwinding at non-kernel address
by lijiang
On Fri, May 31, 2024 at 5:30 PM <devel-request(a)lists.crash-utility.osci.io>
wrote:
> Date: Fri, 31 May 2024 17:19:30 +0800
> From: Tao Liu <ltao(a)redhat.com>
> Subject: [Crash-utility] [PATCH v4 07/16] Stop stack unwinding at
> non-kernel address
> To: devel(a)lists.crash-utility.osci.io
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>, "Naveen N . Rao"
> <naveen.n.rao(a)linux.vnet.ibm.com>, Lianbo Jiang <
> lijiang(a)redhat.com>,
> Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Message-ID: <20240531091939.97828-8-ltao(a)redhat.com>
> Content-Type: text/plain; charset=UTF-8
>
> The stack unwinding is for kernel addresses only. If non-kernel address
> encountered, it is usually a user space address, or non-address value
> like a function call parameter. So stopping stack unwinding at non-kernel
> address will decrease the invalid unwind results.
>
> Before:
> crash> gdb bt
> #0 0xffffffff816a8f65 in context_switch ...
> #1 __schedule () ...
> #2 0xffffffff816a94e9 in schedule ...
> #3 0xffffffff816a86fd in schedule_hrtimeout_range_clock ...
> #4 0xffffffff816a8733 in schedule_hrtimeout_range ...
> #5 0xffffffff8124bb7e in ep_poll ...
> #6 0xffffffff8124d00d in SYSC_epoll_wait ...
> #7 SyS_epoll_wait ...
> #8 <signal handler called>
> #9 0x00007f0449407923 in ?? ()
> #10 0xffff880100000001 in ?? ()
> #11 0xffff880169b3c010 in ?? ()
> #12 0x0000000000000040 in irq_stack_union ()
> #13 0xffff880169b3c058 in ?? ()
> #14 0xffff880169b3c048 in ?? ()
> #15 0xffff880169b3c050 in ?? ()
> #16 0x0000000000000000 in ?? ()
>
> After:
> crash> gdb bt
> #0 0xffffffff816a8f65 in context_switch ...
> #1 __schedule () ...
> #2 0xffffffff816a94e9 in schedule () ...
> #3 0xffffffff816a86fd in schedule_hrtimeout_range_clock ...
> #4 0xffffffff816a8733 in schedule_hrtimeout_range ...
> #5 0xffffffff8124bb7e in ep_poll ...
> #6 0xffffffff8124d00d in SYSC_epoll_wait ...
> #7 SyS_epoll_wait ...
> #8 <signal handler called>
> #9 0x00007f0449407923 in ?? ()
>
>
It seems that there are still some non-kernel addresses that do not get
filtered. Can you help double check?
For example:
crash> gdb bt
#0 crash_setup_regs (newregs=0xffffb5bb4f197938, oldregs=0x0) at
./arch/x86/include/asm/kexec.h:114
#1 0xffffffff8e61e32e in __crash_kexec (regs=regs@entry=0x0) at
kernel/crash_core.c:122
#2 0xffffffff8e51a64d in panic (fmt=fmt@entry=0xffffffff8fa51609 "sysrq
triggered crash\n") at kernel/panic.c:366
#3 0xffffffff8ec21f86 in sysrq_handle_crash (key=<optimized out>) at
drivers/tty/sysrq.c:154
#4 0xffffffff8ec22550 in __handle_sysrq (key=<optimized out>,
check_mask=check_mask@entry=false) at drivers/tty/sysrq.c:612
#5 0xffffffff8ec22bf5 in write_sysrq_trigger (file=<optimized out>,
buf=<optimized out>, count=2, ppos=<optimized out>) at
drivers/tty/sysrq.c:1183
#6 0xffffffff8e935ae5 in pde_write (ppos=<optimized out>, count=<optimized
out>, buf=<optimized out>, file=0xffffb5bb4f197938, pde=0xffff98338b78e0c0)
at fs/proc/inode.c:334
#7 proc_reg_write (file=0xffffb5bb4f197938, buf=0x0, count=1, ppos=0x0) at
fs/proc/inode.c:346
#8 0xffffffff8e88d382 in vfs_write (file=file@entry=0xffff98338b789200,
buf=buf@entry=0x5614d58a22c0 <error: Cannot access memory at address
0x5614d58a22c0>, count=count@entry=2, pos=pos@entry=0xffffb5bb4f197b78) at
fs/read_write.c:588
#9 0xffffffff8e88d9ff in ksys_write (fd=<optimized out>,
buf=0x5614d58a22c0 <error: Cannot access memory at address 0x5614d58a22c0>,
count=2) at fs/read_write.c:643
#10 0xffffffff8f124429 in do_syscall_x64 (nr=1, regs=0xffffb5bb4f197f58) at
arch/x86/entry/common.c:52
#11 do_syscall_64 (regs=0xffffb5bb4f197f58, nr=1) at
arch/x86/entry/common.c:83
#12 0xffffffff8f20012b in entry_SYSCALL_64 () at
arch/x86/entry/entry_64.S:121
#13 0x00007f9a147f69e0 in ?? ()
The frame #13 looks like a non-kernel address.
Thanks
Lianbo
> Cc: Sourabh Jain <sourabhjain(a)linux.ibm.com>
> Cc: Hari Bathini <hbathini(a)linux.ibm.com>
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>
> Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
> Cc: Lianbo Jiang <lijiang(a)redhat.com>
> Cc: HAGIO KAZUHITO(萩尾 一仁) <k-hagio-ab(a)nec.com>
> Cc: Tao Liu <ltao(a)redhat.com>
> Cc: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Signed-off-by: Tao Liu <ltao(a)redhat.com>
> ---
> defs.h | 1 +
> gdb-10.2.patch | 26 ++++++++++++++++++++++++++
> gdb_interface.c | 6 ++++++
> 3 files changed, 33 insertions(+)
>
> diff --git a/defs.h b/defs.h
> index 012ffdc..c0e6a29 100644
> --- a/defs.h
> +++ b/defs.h
> @@ -7902,6 +7902,7 @@ extern unsigned char *gdb_prettyprint_arrays;
> extern unsigned int *gdb_repeat_count_threshold;
> extern unsigned char *gdb_stop_print_at_null;
> extern unsigned int *gdb_output_radix;
> +int is_kvaddr(ulong);
>
> /*
> * gdb/top.c
> diff --git a/gdb-10.2.patch b/gdb-10.2.patch
> index 0bed96a..3ed40c0 100644
> --- a/gdb-10.2.patch
> +++ b/gdb-10.2.patch
> @@ -16171,3 +16171,29 @@ exit 0
> }
>
> /*
> +--- gdb-10.2/gdb/frame.c.orig
> ++++ gdb-10.2/gdb/frame.c
> +@@ -2331,6 +2331,10 @@ inside_entry_func (frame_info *this_frame)
> + This function should not contain target-dependent tests, such as
> + checking whether the program-counter is zero. */
> +
> ++#ifdef CRASH_MERGE
> ++extern "C" int is_kvaddr(ulong);
> ++#endif
> ++
> + struct frame_info *
> + get_prev_frame (struct frame_info *this_frame)
> + {
> +@@ -2353,7 +2357,11 @@ get_prev_frame (struct frame_info *this_frame)
> + get_frame_id (this_frame);
> +
> + frame_pc_p = get_frame_pc_if_available (this_frame, &frame_pc);
> +-
> ++#ifdef CRASH_MERGE
> ++ if (!is_kvaddr(frame_pc)) {
> ++ return NULL;
> ++ }
> ++#endif
> + /* tausq/2004-12-07: Dummy frames are skipped because it doesn't make
> much
> + sense to stop unwinding at a dummy frame. One place where a dummy
> + frame may have an address "inside_main_func" is on HPUX. On HPUX,
> the
> diff --git a/gdb_interface.c b/gdb_interface.c
> index b13d5fd..e76ecc6 100644
> --- a/gdb_interface.c
> +++ b/gdb_interface.c
> @@ -947,6 +947,12 @@ gdb_lookup_module_symbol(ulong addr, ulong *offset)
> }
> }
>
> +int
> +is_kvaddr(ulong addr)
> +{
> + return IS_KVADDR(addr);
> +}
> +
> /*
> * Used by gdb_interface() to catch gdb-related errors, if desired.
> */
> --
> 2.40.1
>
4 months
Re: [PATCH v4 09/16] x86_64: Add gdb stack unwinding support
by lijiang
On Fri, May 31, 2024 at 5:33 PM <devel-request(a)lists.crash-utility.osci.io>
wrote:
> Date: Fri, 31 May 2024 17:19:32 +0800
> From: Tao Liu <ltao(a)redhat.com>
> Subject: [Crash-utility] [PATCH v4 09/16] x86_64: Add gdb stack
> unwinding support
> To: devel(a)lists.crash-utility.osci.io
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>, "Naveen N . Rao"
> <naveen.n.rao(a)linux.vnet.ibm.com>, Lianbo Jiang <
> lijiang(a)redhat.com>,
> Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Message-ID: <20240531091939.97828-10-ltao(a)redhat.com>
> Content-Type: text/plain; charset=UTF-8
>
> This patch will add stack unwinding support for x86_64 CPU arch. It
> follows the tech path of ppc64_get_stack_frame() & ppc64_get_cpu_reg()
> in ppc64.c, to get and consume the cpu regs.
>
> One different case is, we need to handle inactive_task_frame structure
> specifically in x86_64.
>
> If inactive_task_frame structure enabled, for inactive tasks, 7 regs can
> be get from inactive_task_frame in stack, and sp need to rewind back to
> skip inactive_task_frame structure (See code comments in
> x86_64.c:x86_64_get_stack_frame()); for active tasks, we get regs by the
> original way.
>
> If inactive_task_frame structure is not enabled, for inactive tasks, the
> stack frame is organized as linked list, and sp/bp can be get from stack;
> for active tasks, we get regs by the original way.
>
> Cc: Sourabh Jain <sourabhjain(a)linux.ibm.com>
> Cc: Hari Bathini <hbathini(a)linux.ibm.com>
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>
> Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
> Cc: Lianbo Jiang <lijiang(a)redhat.com>
> Cc: HAGIO KAZUHITO(萩尾 一仁) <k-hagio-ab(a)nec.com>
> Cc: Tao Liu <ltao(a)redhat.com>
> Cc: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Signed-off-by: Tao Liu <ltao(a)redhat.com>
> ---
> defs.h | 19 +++-
> x86_64.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
> 2 files changed, 283 insertions(+), 27 deletions(-)
>
> diff --git a/defs.h b/defs.h
> index 8450aea..ed52cc3 100644
> --- a/defs.h
> +++ b/defs.h
> @@ -2241,6 +2241,20 @@ struct offset_table { /* stash
> of commonly-used offsets */
> long mnt_namespace_nr_mounts;
> long mount_mnt_node;
> long log_caller_id;
> + long inactive_task_frame_r15;
> + long inactive_task_frame_r14;
> + long inactive_task_frame_r13;
> + long inactive_task_frame_r12;
> + long inactive_task_frame_flags;
> + long inactive_task_frame_si;
> + long inactive_task_frame_di;
> + long inactive_task_frame_bx;
> + long thread_struct_es;
> + long thread_struct_ds;
> + long thread_struct_fsbase;
> + long thread_struct_gsbase;
> + long thread_struct_fs;
> + long thread_struct_gs;
> };
>
> struct size_table { /* stash of commonly-used sizes */
> @@ -8008,7 +8022,7 @@ extern int have_full_symbols(void);
>
> /*
> * Register numbers must be in sync with gdb/features/i386/64bit-core.c
> - * to make crash_target->fetch_registers() ---> machdep->get_cpu_reg()
> + * to make crash_target->fetch_registers() --->
> machdep->get_current_task_reg()
> * working properly.
> */
> enum x86_64_regnum {
> @@ -8052,6 +8066,9 @@ enum x86_64_regnum {
> FOSEG_REGNUM,
> FOOFF_REGNUM,
> FOP_REGNUM,
> + FS_BASE_REGNUM = 152,
> + GS_BASE_REGNUM,
> + ORIG_RAX_REGNUM,
> LAST_REGNUM
> };
>
> diff --git a/x86_64.c b/x86_64.c
> index 5ab2852..4ba0b40 100644
> --- a/x86_64.c
> +++ b/x86_64.c
> @@ -126,7 +126,7 @@ static int x86_64_get_framesize(struct bt_info *,
> ulong, ulong, char *);
> static void x86_64_framesize_debug(struct bt_info *);
> static void x86_64_get_active_set(void);
> static int x86_64_get_kvaddr_ranges(struct vaddr_range *);
> -static int x86_64_get_cpu_reg(int, int, const char *, int, void *);
> +static int x86_64_get_current_task_reg(int, const char *, int, void *);
> static int x86_64_verify_paddr(uint64_t);
> static void GART_init(void);
> static void x86_64_exception_stacks_init(void);
> @@ -143,6 +143,29 @@ struct machine_specific x86_64_machine_specific = { 0
> };
> static const char *exception_functions_orig[];
> static const char *exception_functions_5_8[];
>
> +/* Use this hardwired version -- sometimes the
> + * debuginfo doesn't pick this up even though
> + * it exists in the kernel; it shouldn't change.
> + */
> +struct x86_64_user_regs_struct {
> + unsigned long r15, r14, r13, r12, bp, bx;
> + unsigned long r11, r10, r9, r8, ax, cx, dx;
> + unsigned long si, di, orig_ax, ip, cs;
> + unsigned long flags, sp, ss, fs_base;
> + unsigned long gs_base, ds, es, fs, gs;
> +};
> +
> +struct user_regs_bitmap_struct {
> + struct {
> + unsigned long r15, r14, r13, r12, bp, bx;
> + unsigned long r11, r10, r9, r8, ax, cx, dx;
> + unsigned long si, di, orig_ax, ip, cs;
> + unsigned long flags, sp, ss, fs_base;
> + unsigned long gs_base, ds, es, fs, gs;
> + };
> + ulong bitmap;
> +};
>
The struct user_regs_bitmap_struct can de defined as below:
+struct user_regs_bitmap_struct {
+ struct user_regs_bitmap_struct uregs;
+ ulong bitmap;
+};
+
> /*
> * Do all necessary machine-specific setup here. This is called several
> * times during initialization.
> @@ -195,7 +218,7 @@ x86_64_init(int when)
> machdep->machspec->irq_eframe_link = UNINITIALIZED;
> machdep->machspec->irq_stack_gap = UNINITIALIZED;
> machdep->get_kvaddr_ranges = x86_64_get_kvaddr_ranges;
> - machdep->get_current_task_reg = x86_64_get_cpu_reg;
> + machdep->get_current_task_reg =
> x86_64_get_current_task_reg;
> if (machdep->cmdline_args[0])
> parse_cmdline_args();
> if ((string = pc->read_vmcoreinfo("relocate"))) {
> @@ -500,6 +523,12 @@ x86_64_init(int when)
> MEMBER_OFFSET_INIT(thread_struct_rsp,
> "thread_struct", "sp");
> if (INVALID_MEMBER(thread_struct_rsp0))
> MEMBER_OFFSET_INIT(thread_struct_rsp0,
> "thread_struct", "sp0");
> + MEMBER_OFFSET_INIT(thread_struct_es, "thread_struct",
> "es");
> + MEMBER_OFFSET_INIT(thread_struct_ds, "thread_struct",
> "ds");
> + MEMBER_OFFSET_INIT(thread_struct_fsbase, "thread_struct",
> "fsbase");
> + MEMBER_OFFSET_INIT(thread_struct_gsbase, "thread_struct",
> "gsbase");
> + MEMBER_OFFSET_INIT(thread_struct_fs, "thread_struct",
> "fs");
> + MEMBER_OFFSET_INIT(thread_struct_gs, "thread_struct",
> "gs");
> STRUCT_SIZE_INIT(tss_struct, "tss_struct");
> MEMBER_OFFSET_INIT(tss_struct_ist, "tss_struct", "ist");
> if (INVALID_MEMBER(tss_struct_ist)) {
> @@ -583,17 +612,6 @@ x86_64_init(int when)
> "user_regs_struct", "r15");
> STRUCT_SIZE_INIT(user_regs_struct, "user_regs_struct");
> if (!VALID_STRUCT(user_regs_struct)) {
> - /* Use this hardwired version -- sometimes the
> - * debuginfo doesn't pick this up even though
> - * it exists in the kernel; it shouldn't change.
> - */
> - struct x86_64_user_regs_struct {
> - unsigned long r15, r14, r13, r12, bp, bx;
> - unsigned long r11, r10, r9, r8, ax, cx, dx;
> - unsigned long si, di, orig_ax, ip, cs;
> - unsigned long flags, sp, ss, fs_base;
> - unsigned long gs_base, ds, es, fs, gs;
> - };
> ASSIGN_SIZE(user_regs_struct) =
> sizeof(struct x86_64_user_regs_struct);
> ASSIGN_OFFSET(user_regs_struct_rip) =
> @@ -891,7 +909,7 @@ x86_64_dump_machdep_table(ulong arg)
> fprintf(fp, " is_page_ptr: x86_64_is_page_ptr()\n");
> fprintf(fp, " verify_paddr: x86_64_verify_paddr()\n");
> fprintf(fp, " get_kvaddr_ranges: x86_64_get_kvaddr_ranges()\n");
> - fprintf(fp, " get_cpu_reg: x86_64_get_cpu_reg()\n");
> + fprintf(fp, "get_current_task_reg:
> x86_64_get_current_task_reg()\n");
> fprintf(fp, " init_kernel_pgd: x86_64_init_kernel_pgd()\n");
> fprintf(fp, "clear_machdep_cache:
> x86_64_clear_machdep_cache()\n");
> fprintf(fp, " xendump_p2m_create: %s\n", PVOPS_XEN() ?
> @@ -4971,22 +4989,124 @@ x86_64_eframe_verify(struct bt_info *bt, long
> kvaddr, long cs, long ss,
> return FALSE;
> }
>
> +#define GET_REG_FROM_INACTIVE_TASK_FRAME(reg) \
> +({ \
> + ulong offset, reg_value = 0, rsp; \
> + if (VALID_MEMBER(inactive_task_frame_bp)) { \
> + offset = OFFSET(task_struct_thread) +
> OFFSET(thread_struct_rsp); \
> + readmem(bt->task + offset, KVADDR, &rsp, \
> + sizeof(ulong), "thread_struct.rsp",
> FAULT_ON_ERROR); \
> + readmem(rsp + OFFSET(inactive_task_frame_##reg), KVADDR,
> ®_value, \
> + sizeof(ulong), "inactive_task_frame.##reg",
> FAULT_ON_ERROR); \
> + } \
> + reg_value; \
> +})
>
I would suggest changing the above macro definition to a function, at least
there are two advantages:
[1] once the task_struct or inactive_task_frame is changed in the future,
it is easy to follow its changes.
[2] easy to debug, and more readable.
> +
> /*
> * Get a stack frame combination of pc and ra from the most relevent
> spot.
> */
> static void
> x86_64_get_stack_frame(struct bt_info *bt, ulong *pcp, ulong *spp)
> {
> - if (bt->flags & BT_DUMPFILE_SEARCH)
> - return x86_64_get_dumpfile_stack_frame(bt, pcp, spp);
> + struct user_regs_bitmap_struct *ur_bitmap;
> + ulong pcp_save = *pcp;
> + ulong spp_save = *spp;
> + ulong sp;
>
> if (bt->flags & BT_SKIP_IDLE)
> bt->flags &= ~BT_SKIP_IDLE;
> + if (pcp)
> + *pcp = x86_64_get_pc(bt);
> + if (spp)
> + sp = *spp = x86_64_get_sp(bt);
> + ur_bitmap = (struct user_regs_bitmap_struct
> *)GETBUF(sizeof(*ur_bitmap));
> + memset(ur_bitmap, 0, sizeof(struct user_regs_bitmap_struct));
>
Can you keep the same style?(sorry to be picky :-))
+ ur_bitmap = (struct user_regs_bitmap_struct
*)GETBUF(sizeof(*ur_bitmap));
+ memset(ur_bitmap, 0, sizeof(*ur_bitmap));
or
+ ur_bitmap = (struct user_regs_bitmap_struct *)GETBUF(sizeof(struct
user_regs_bitmap_struct)));
+ memset(ur_bitmap, 0, sizeof(struct user_regs_bitmap_struct));
+
> + if (VALID_MEMBER(inactive_task_frame_bp)) {
> + if (!is_task_active(bt->task)) {
> + /*
> + * For inactive tasks in live and dumpfile, regs
> can be
> + * get from inactive_task_frame struct.
> + */
> + ur_bitmap->r15 =
> GET_REG_FROM_INACTIVE_TASK_FRAME(r15);
> + ur_bitmap->r14 =
> GET_REG_FROM_INACTIVE_TASK_FRAME(r14);
> + ur_bitmap->r13 =
> GET_REG_FROM_INACTIVE_TASK_FRAME(r13);
> + ur_bitmap->r12 =
> GET_REG_FROM_INACTIVE_TASK_FRAME(r12);
> + ur_bitmap->bx =
> GET_REG_FROM_INACTIVE_TASK_FRAME(bx);
> + ur_bitmap->bp =
> GET_REG_FROM_INACTIVE_TASK_FRAME(bp);
> + /*
> + For inactive tasks:
> + crash> task -x 1|grep sp
> + sp = 0xffffc90000013d00
> + crash> rd ffffc90000013d00 32
> + ffffc90000013d00: ffff888104dad4a8
> 0000000000000000 r15,r14
> + ffffc90000013d10: ffff888100280000
> ffff888100216500 r13,r12
> + ffffc90000013d20: ffff888100217018
> ffff88817fd2c800 rbx,rbp
> + ffffc90000013d30: ffffffff81a6a1b3
> ffffc90000013de0 saved_rip,...
> + ffffc90000013d40: ffff888100000004
> 99ccbf53ea493000
> + ffffc90000013d50: ffff888100216500
> ffff888100216500
> +
> + crash> dis __schedule
> + ...
> + 0xffffffff81a6a1ab <__schedule+507>: mov
> %r13,%rsi
> + 0xffffffff81a6a1ae <__schedule+510>: call
> 0xffffffff81003490 <__switch_to_asm>
> + 0xffffffff81a6a1b3 <__schedule+515>: mov
> %rax,%rdi <<=== saved_rip
> + ...
> + crash> dis __switch_to_asm
> + 0xffffffff81003490 <__switch_to_asm>: push %rbp
> + 0xffffffff81003491 <__switch_to_asm+1>: push %rbx
> + 0xffffffff81003492 <__switch_to_asm+2>: push %r12
> + 0xffffffff81003494 <__switch_to_asm+4>: push %r13
> + 0xffffffff81003496 <__switch_to_asm+6>: push %r14
> + 0xffffffff81003498 <__switch_to_asm+8>: push %r15
> + 0xffffffff8100349a <__switch_to_asm+10>:
> mov %rsp,0x14d8(%rdi)
> + ...
> + Now saved_rip = ffffffff81a6a1b3, and we are
> starting
> + the stack unwind at saved_rip, which is function
> __schedule()
> + instead of function __switch_to_asm(), so the
> stack pointer should
> + be rewind from ffffc90000013d00 back to
> ffffc90000013d38,
> + aka *spp += 7 * reg_len. Otherwise we are
> unwinding function
> + __schedule() but with __switch_to_asm()'s stack
> frame, which
> + will fail.
> + */
> + sp += 7 * sizeof(unsigned long);
>
It seems this may still rely on the compiler, have you tried another
compiler? I'm not sure if there are the same results, just my concerns.
Anyway, If there is no better way or solution, at least we should comment
on it and point out the current limitations. What do you think?
+ ur_bitmap->bitmap += 0x3f;
>
Also why is it 0x3f? Can you explain the details?
> + } else {
> + /*
> + * For active tasks in dumpfile, we get regs
> through the
> + * original way. For active tasks in live, we only
> get
> + * ip and sp in the end of the function.
> + */
> + if (bt->flags & BT_DUMPFILE_SEARCH) {
> + FREEBUF(ur_bitmap);
> + bt->need_free = FALSE;
>
See the comments in the previous thread.
> + *pcp = pcp_save;
> + *spp = spp_save;
> + return x86_64_get_dumpfile_stack_frame(bt,
> pcp, spp);
> + }
> + }
> + } else {
> + if (!is_task_active(bt->task)) {
> + readmem(*spp, KVADDR, &(ur_bitmap->bp),
> + sizeof(ulong), "ur_bitmap->bp",
> FAULT_ON_ERROR);
> + ur_bitmap->bitmap += 0x10;
> + } else {
> + if (bt->flags & BT_DUMPFILE_SEARCH) {
> + FREEBUF(ur_bitmap);
> + bt->need_free = FALSE;
> + *pcp = pcp_save;
> + *spp = spp_save;
> + return x86_64_get_dumpfile_stack_frame(bt,
> pcp, spp);
> + }
> + }
> + }
>
> - if (pcp)
> - *pcp = x86_64_get_pc(bt);
> - if (spp)
> - *spp = x86_64_get_sp(bt);
> + ur_bitmap->ip = *pcp;
> + ur_bitmap->sp = sp;
> + ur_bitmap->bitmap += 0x50000;
> +
> + bt->machdep = ur_bitmap;
> + bt->need_free = TRUE;
> }
>
> /*
> @@ -6458,6 +6578,14 @@ x86_64_ORC_init(void)
>
> MEMBER_OFFSET_INIT(inactive_task_frame_bp, "inactive_task_frame",
> "bp");
> MEMBER_OFFSET_INIT(inactive_task_frame_ret_addr,
> "inactive_task_frame", "ret_addr");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r15, "inactive_task_frame",
> "r15");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r14, "inactive_task_frame",
> "r14");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r13, "inactive_task_frame",
> "r13");
> + MEMBER_OFFSET_INIT(inactive_task_frame_r12, "inactive_task_frame",
> "r12");
> + MEMBER_OFFSET_INIT(inactive_task_frame_flags,
> "inactive_task_frame", "flags");
> + MEMBER_OFFSET_INIT(inactive_task_frame_si, "inactive_task_frame",
> "si");
> + MEMBER_OFFSET_INIT(inactive_task_frame_di, "inactive_task_frame",
> "di");
> + MEMBER_OFFSET_INIT(inactive_task_frame_bx, "inactive_task_frame",
> "bx");
>
> orc->has_signal = MEMBER_EXISTS("orc_entry", "signal"); /* added
> at 6.3 */
> orc->has_end = MEMBER_EXISTS("orc_entry", "end"); /* removed
> at 6.4 */
> @@ -9071,17 +9199,128 @@ x86_64_get_kvaddr_ranges(struct vaddr_range *vrp)
> return cnt;
> }
>
> +#define REG_CASE(R, r) \
> + case R##_REGNUM: \
> + if (size != sizeof(ur_bitmap->r)) { \
> + ret = FALSE; break; \
> + } else { \
> + memcpy(value, &ur_bitmap->r, size); \
> + ret = TRUE; break; \
> + }
> +
> static int
> -x86_64_get_cpu_reg(int cpu, int regno, const char *name,
> +x86_64_get_current_task_reg(int regno, const char *name,
> int size, void *value)
> {
> - if (regno >= LAST_REGNUM)
> - return FALSE;
> + struct bt_info bt_info, bt_setup;
> + struct task_context *tc;
> + struct user_regs_bitmap_struct *ur_bitmap;
> + ulong ip, sp;
> + bool ret = FALSE;
>
> - if (VMSS_DUMPFILE())
> - return vmware_vmss_get_cpu_reg(cpu, regno, name, size,
> value);
> + switch (regno) {
> + case RAX_REGNUM ... GS_REGNUM:
> + case FS_BASE_REGNUM ... ORIG_RAX_REGNUM:
> + break;
> + default:
> + return FALSE;
> + }
>
>
Ditto.
> - return FALSE;
> + tc = CURRENT_CONTEXT();
> + if (!tc)
> + return FALSE;
> +
> + if (VMSS_DUMPFILE())
> + return vmware_vmss_get_cpu_reg(tc->processor, regno, name,
> size, value);
> +
> + BZERO(&bt_setup, sizeof(struct bt_info));
> + clone_bt_info(&bt_setup, &bt_info, tc);
> + fill_stackbuf(&bt_info);
> +
> + // reusing the get_dumpfile_regs function to get pt regs structure
> + get_dumpfile_regs(&bt_info, &sp, &ip);
> + if (bt_info.stackbuf)
> + FREEBUF(bt_info.stackbuf);
> + ur_bitmap = (struct user_regs_bitmap_struct *)bt_info.machdep;
> + if (!ur_bitmap)
> + return FALSE;
> +
> + /* Get all registers from elf notes*/
> + if (!bt_info.need_free) {
> + goto get_all;
> + }
> +
>
Ditto.
> + /* Get subset registers from stack frame*/
> + switch (ur_bitmap->bitmap) {
> + case 0x5003f:
> + switch (regno) {
> + case R12_REGNUM ... R15_REGNUM:
> + case RBP_REGNUM:
> + case RBX_REGNUM:
> + case RIP_REGNUM:
> + case RSP_REGNUM:
> + break;
> + default:
> + return FALSE;
> + }
> + break;
> + case 0x50010:
> + switch (regno) {
> + case RBP_REGNUM:
> + case RIP_REGNUM:
> + case RSP_REGNUM:
> + break;
> + default:
> + return FALSE;
> + }
> + break;
> + case 0x50000:
> + switch (regno) {
> + case RIP_REGNUM:
> + case RSP_REGNUM:
> + break;
> + default:
> + return FALSE;
> + }
> + break;
> + }
>
Can you help to explain more details about several magic number values?
Such as 0x5003f, 0x50000, 0x50010. Or you may point out where they come
from, it may be helpful to add them here as code comment.
Thanks
Lianbo
+
> +get_all:
> + switch (regno) {
> + REG_CASE(RAX, ax);
> + REG_CASE(RBX, bx);
> + REG_CASE(RCX, cx);
> + REG_CASE(RDX, dx);
> + REG_CASE(RSI, si);
> + REG_CASE(RDI, di);
> + REG_CASE(RBP, bp);
> + REG_CASE(RSP, sp);
> + REG_CASE(R8, r8);
> + REG_CASE(R9, r9);
> + REG_CASE(R10, r10);
> + REG_CASE(R11, r11);
> + REG_CASE(R12, r12);
> + REG_CASE(R13, r13);
> + REG_CASE(R14, r14);
> + REG_CASE(R15, r15);
> + REG_CASE(RIP, ip);
> + REG_CASE(EFLAGS, flags);
> + REG_CASE(CS, cs);
> + REG_CASE(SS, ss);
> + REG_CASE(DS, ds);
> + REG_CASE(ES, es);
> + REG_CASE(FS, fs);
> + REG_CASE(GS, gs);
> + REG_CASE(FS_BASE, fs_base);
> + REG_CASE(GS_BASE, gs_base);
> + REG_CASE(ORIG_RAX, orig_ax);
> + }
> +
> + if (bt_info.need_free) {
> + FREEBUF(ur_bitmap);
> + bt_info.need_free = FALSE;
> + }
> + return ret;
> }
>
> /*
> --
> 2.40.1
>
4 months
Re: [PATCH v4 08/16] ppc64: correct gdb passthroughs by implementing machdep->get_cpu_reg
by lijiang
On Fri, May 31, 2024 at 5:33 PM <devel-request(a)lists.crash-utility.osci.io>
wrote:
> Date: Fri, 31 May 2024 17:19:31 +0800
> From: Tao Liu <ltao(a)redhat.com>
> Subject: [Crash-utility] [PATCH v4 08/16] ppc64: correct gdb
> passthroughs by implementing machdep->get_cpu_reg
> To: devel(a)lists.crash-utility.osci.io
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>, "Naveen N . Rao"
> <naveen.n.rao(a)linux.vnet.ibm.com>, Lianbo Jiang <
> lijiang(a)redhat.com>,
> Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Message-ID: <20240531091939.97828-9-ltao(a)redhat.com>
> Content-Type: text/plain; charset=UTF-8
>
> From: Aditya Gupta <adityag(a)linux.ibm.com>
>
> Currently, gdb passthroughs of 'bt', 'frame', 'up', 'down', 'info
> locals' don't work. This is due to gdb not knowing the register values to
> unwind the stack frames
>
> Every gdb passthrough goes through `gdb_interface`. And then, gdb expects
> `crash_target::fetch_registers` to give it the register values, which is
> dependent on `machdep->get_cpu_reg` to read the register values for
> specific architecture.
>
> ----------------------------
> gdb passthrough (eg. "bt") | |
> crash -------------------------> | |
> | gdb_interface |
> | |
> | |
> | ---------------------- |
> fetch_registers | | | |
> crash_target<-------------------------+--| gdb | |
> --------------------------+->| | |
> Registers (SP,NIP, etc.)| | | |
> | | | |
> | ---------------------- |
> ----------------------------
>
> Implement `machdep->get_cpu_reg` on PPC64, so that crash provides the
> register values to gdb to unwind stack frames properly
>
> With these changes, on powerpc, 'bt' command output in gdb mode, will look
> like this:
>
> gdb> bt
> #0 0xc0000000002a53e8 in crash_setup_regs (oldregs=<optimized out>,
> newregs=0xc00000000486f8d8) at ./arch/powerpc/include/asm/kexec.h:69
> #1 __crash_kexec (regs=<optimized out>) at kernel/kexec_core.c:974
> #2 0xc000000000168918 in panic (fmt=<optimized out>) at
> kernel/panic.c:358
> #3 0xc000000000b735f8 in sysrq_handle_crash (key=<optimized out>) at
> drivers/tty/sysrq.c:155
> #4 0xc000000000b742cc in __handle_sysrq (key=key@entry=99,
> check_mask=check_mask@entry=false) at drivers/tty/sysrq.c:602
> #5 0xc000000000b7506c in write_sysrq_trigger (file=<optimized out>,
> buf=<optimized out>, count=2, ppos=<optimized out>) at
> drivers/tty/sysrq.c:1163
> #6 0xc00000000069a7bc in pde_write (ppos=<optimized out>,
> count=<optimized out>, buf=<optimized out>, file=<optimized out>,
> pde=0xc000000009ed3a80) at fs/proc/inode.c:340
> #7 proc_reg_write (file=<optimized out>, buf=<optimized out>,
> count=<optimized out>, ppos=<optimized out>) at fs/proc/inode.c:352
> #8 0xc0000000005b3bbc in vfs_write (file=file@entry=0xc00000009dda7d00,
> buf=buf@entry=0xebcfc7c6040 <error: Cannot access memory at address
> 0xebcfc7c6040>, count=count@entry=2, pos=pos@entry=0xc00000000486fda0) at
> fs/read_write.c:582
>
> instead of earlier output without this patch:
>
> gdb> bt
> #0 <unavailable> in ?? ()
> Backtrace stopped: previous frame identical to this frame (corrupt
> stack?)
>
> Also, 'get_dumpfile_regs' has been introduced to get registers from
> multiple supported vmcore formats. Correspondingly a flag
> 'BT_NO_PRINT_REGS'
> has been introduced to tell helper functions to get registers, to not
> print registers with every call to backtrace in gdb.
>
> Note: This feature to support GDB unwinding doesn't support live debugging
>
> Cc: Sourabh Jain <sourabhjain(a)linux.ibm.com>
> Cc: Hari Bathini <hbathini(a)linux.ibm.com>
> Cc: Mahesh J Salgaonkar <mahesh(a)linux.ibm.com>
> Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
> Cc: Lianbo Jiang <lijiang(a)redhat.com>
> Cc: HAGIO KAZUHITO(萩尾 一仁) <k-hagio-ab(a)nec.com>
> Cc: Tao Liu <ltao(a)redhat.com>
> Cc: Alexey Makhalov <alexey.makhalov(a)broadcom.com>
> Improved-by: Tao Liu <ltao(a)redhat.com>
> Signed-off-by: Aditya Gupta <adityag(a)linux.ibm.com>
> ---
> defs.h | 123 +++++++++++++++++++++++++++++++++++++++++
> kernel.c | 33 +++++++++++
> ppc64.c | 163 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
> 3 files changed, 312 insertions(+), 7 deletions(-)
>
> diff --git a/defs.h b/defs.h
> index c0e6a29..8450aea 100644
> --- a/defs.h
> +++ b/defs.h
> @@ -982,6 +982,7 @@ struct bt_info {
> ulong eframe_ip;
> ulong radix;
> ulong *cpumask;
> + bool need_free;
Comment on it later.
> };
>
> #define STACK_OFFSET_TYPE(OFF) \
> @@ -6096,6 +6097,7 @@ int load_module_symbols_helper(char *);
> void unlink_module(struct load_module *);
> int check_specified_module_tree(char *, char *);
> int is_system_call(char *, ulong);
> +void get_dumpfile_regs(struct bt_info*, ulong*, ulong*);
> void generic_dump_irq(int);
> void generic_get_irq_affinity(int);
> void generic_show_interrupts(int, ulong *);
> @@ -6195,6 +6197,7 @@ ulong cpu_map_addr(const char *type);
> #define BT_REGS_NOT_FOUND (0x4000000000000ULL)
> #define BT_OVERFLOW_STACK (0x8000000000000ULL)
> #define BT_SKIP_IDLE (0x10000000000000ULL)
> +#define BT_NO_PRINT_REGS (0x20000000000000ULL)
> #define BT_SYMBOL_OFFSET (BT_SYMBOLIC_ARGS)
>
> #define BT_REF_HEXVAL (0x1)
> @@ -8052,6 +8055,126 @@ enum x86_64_regnum {
> LAST_REGNUM
> };
>
> +/*
> + * Register numbers to make crash_target->fetch_registers()
> + * ---> machdep->get_current_task_reg() work properly.
> + *
> + * These register numbers and names are given according to output of
> + * `rs6000_register_name`, because that is what was being used by
> + * crash_target::fetch_registers in case of PPC64
> + */
> +enum ppc64_regnum {
> + PPC64_R0_REGNUM = 0,
> + PPC64_R1_REGNUM,
> + PPC64_R2_REGNUM,
> + PPC64_R3_REGNUM,
> + PPC64_R4_REGNUM,
> + PPC64_R5_REGNUM,
> + PPC64_R6_REGNUM,
> + PPC64_R7_REGNUM,
> + PPC64_R8_REGNUM,
> + PPC64_R9_REGNUM,
> + PPC64_R10_REGNUM,
> + PPC64_R11_REGNUM,
> + PPC64_R12_REGNUM,
> + PPC64_R13_REGNUM,
> + PPC64_R14_REGNUM,
> + PPC64_R15_REGNUM,
> + PPC64_R16_REGNUM,
> + PPC64_R17_REGNUM,
> + PPC64_R18_REGNUM,
> + PPC64_R19_REGNUM,
> + PPC64_R20_REGNUM,
> + PPC64_R21_REGNUM,
> + PPC64_R22_REGNUM,
> + PPC64_R23_REGNUM,
> + PPC64_R24_REGNUM,
> + PPC64_R25_REGNUM,
> + PPC64_R26_REGNUM,
> + PPC64_R27_REGNUM,
> + PPC64_R28_REGNUM,
> + PPC64_R29_REGNUM,
> + PPC64_R30_REGNUM,
> + PPC64_R31_REGNUM,
> +
> + PPC64_F0_REGNUM = 32,
> + PPC64_F1_REGNUM,
> + PPC64_F2_REGNUM,
> + PPC64_F3_REGNUM,
> + PPC64_F4_REGNUM,
> + PPC64_F5_REGNUM,
> + PPC64_F6_REGNUM,
> + PPC64_F7_REGNUM,
> + PPC64_F8_REGNUM,
> + PPC64_F9_REGNUM,
> + PPC64_F10_REGNUM,
> + PPC64_F11_REGNUM,
> + PPC64_F12_REGNUM,
> + PPC64_F13_REGNUM,
> + PPC64_F14_REGNUM,
> + PPC64_F15_REGNUM,
> + PPC64_F16_REGNUM,
> + PPC64_F17_REGNUM,
> + PPC64_F18_REGNUM,
> + PPC64_F19_REGNUM,
> + PPC64_F20_REGNUM,
> + PPC64_F21_REGNUM,
> + PPC64_F22_REGNUM,
> + PPC64_F23_REGNUM,
> + PPC64_F24_REGNUM,
> + PPC64_F25_REGNUM,
> + PPC64_F26_REGNUM,
> + PPC64_F27_REGNUM,
> + PPC64_F28_REGNUM,
> + PPC64_F29_REGNUM,
> + PPC64_F30_REGNUM,
> + PPC64_F31_REGNUM,
> +
> + PPC64_PC_REGNUM = 64,
> + PPC64_MSR_REGNUM = 65,
> + PPC64_CR_REGNUM = 66,
> + PPC64_LR_REGNUM = 67,
> + PPC64_CTR_REGNUM = 68,
> + PPC64_XER_REGNUM = 69,
> + PPC64_FPSCR_REGNUM = 70,
> +
> + PPC64_VR0_REGNUM = 106,
> + PPC64_VR1_REGNUM,
> + PPC64_VR2_REGNUM,
> + PPC64_VR3_REGNUM,
> + PPC64_VR4_REGNUM,
> + PPC64_VR5_REGNUM,
> + PPC64_VR6_REGNUM,
> + PPC64_VR7_REGNUM,
> + PPC64_VR8_REGNUM,
> + PPC64_VR9_REGNUM,
> + PPC64_VR10_REGNUM,
> + PPC64_VR11_REGNUM,
> + PPC64_VR12_REGNUM,
> + PPC64_VR13_REGNUM,
> + PPC64_VR14_REGNUM,
> + PPC64_VR15_REGNUM,
> + PPC64_VR16_REGNUM,
> + PPC64_VR17_REGNUM,
> + PPC64_VR18_REGNUM,
> + PPC64_VR19_REGNUM,
> + PPC64_VR20_REGNUM,
> + PPC64_VR21_REGNUM,
> + PPC64_VR22_REGNUM,
> + PPC64_VR23_REGNUM,
> + PPC64_VR24_REGNUM,
> + PPC64_VR25_REGNUM,
> + PPC64_VR26_REGNUM,
> + PPC64_VR27_REGNUM,
> + PPC64_VR28_REGNUM,
> + PPC64_VR29_REGNUM,
> + PPC64_VR30_REGNUM,
> + PPC64_VR31_REGNUM,
> +
> + PPC64_VSCR_REGNUM = 138,
> + PPC64_VRSAVE_REGNU = 139
> +};
> +
>
Is it possible to move the regnum definitions to ppc64.c? I did not see
them used in other modules. But, not sure if there are any compilation
issues, I haven't tried it.
/* crash_target.c */
> extern int gdb_change_thread_context ();
>
> diff --git a/kernel.c b/kernel.c
> index 78b7b1e..3730c55 100644
> --- a/kernel.c
> +++ b/kernel.c
> @@ -3533,6 +3533,39 @@ get_lkcd_regs(struct bt_info *bt, ulong *eip, ulong
> *esp)
> machdep->get_stack_frame(bt, eip, esp);
> }
>
> +void
> +get_dumpfile_regs(struct bt_info *bt, ulong *eip, ulong *esp)
> +{
> + bt->flags |= BT_NO_PRINT_REGS;
> +
> + if (NETDUMP_DUMPFILE())
> + get_netdump_regs(bt, eip, esp);
> + else if (KDUMP_DUMPFILE())
> + get_kdump_regs(bt, eip, esp);
> + else if (DISKDUMP_DUMPFILE())
> + get_diskdump_regs(bt, eip, esp);
> + else if (KVMDUMP_DUMPFILE())
> + get_kvmdump_regs(bt, eip, esp);
> + else if (LKCD_DUMPFILE())
> + get_lkcd_regs(bt, eip, esp);
> + else if (XENDUMP_DUMPFILE())
> + get_xendump_regs(bt, eip, esp);
> + else if (SADUMP_DUMPFILE())
> + get_sadump_regs(bt, eip, esp);
> + else if (VMSS_DUMPFILE())
> + get_vmware_vmss_regs(bt, eip, esp);
> + else if (REMOTE_PAUSED()) {
> + if (!is_task_active(bt->task) || !get_remote_regs(bt, eip,
> esp))
> + machdep->get_stack_frame(bt, eip, esp);
> + } else
> + machdep->get_stack_frame(bt, eip, esp);
> +
> + bt->flags &= ~BT_NO_PRINT_REGS;
> +
> + bt->instptr = *eip;
> + bt->stkptr = *esp;
> +}
> +
>
> /*
> * Store the head of the kernel module list for future use.
> diff --git a/ppc64.c b/ppc64.c
> index e8930a1..ee1995a 100644
> --- a/ppc64.c
> +++ b/ppc64.c
> @@ -55,6 +55,8 @@ static void ppc64_set_bt_emergency_stack(enum
> emergency_stack_type type,
> static char * ppc64_check_eframe(struct ppc64_pt_regs *);
> static void ppc64_print_eframe(char *, struct ppc64_pt_regs *,
> struct bt_info *);
> +static int ppc64_get_current_task_reg(int regno, const char *name, int
> size,
> + void *value);
> static void parse_cmdline_args(void);
> static int ppc64_paca_percpu_offset_init(int);
> static void ppc64_init_cpu_info(void);
> @@ -73,6 +75,26 @@ static ulong pmd_page_vaddr_l4(ulong pmd);
> static int is_opal_context(ulong sp, ulong nip);
> void opalmsg(void);
>
> +struct user_regs_bitmap_struct {
> + struct {
> + long gpr[32];
> + long nip;
> + long msr;
> + long orig_gpr3; /* Used for restarting system calls */
> + long ctr;
> + long link;
> + long xer;
> + long ccr;
> + long mq; /* 601 only (not used at present) */
> + /* Used on APUS to hold IPL value.
> */
> + long trap; /* Reason for being here */
> + long dar; /* Fault registers */
> + long dsisr;
> + long result; /* Result of a system call */
> + };
> + ulong bitmap;
> +};
> +
> static int is_opal_context(ulong sp, ulong nip)
> {
> uint64_t opal_start, opal_end;
> @@ -704,6 +726,8 @@ ppc64_init(int when)
> error(FATAL, "cannot malloc hwirqstack
> buffer space.");
> }
>
> + machdep->get_current_task_reg = ppc64_get_current_task_reg;
> +
> ppc64_init_paca_info();
>
> if (!machdep->hz) {
> @@ -2501,6 +2525,120 @@ ppc64_print_eframe(char *efrm_str, struct
> ppc64_pt_regs *regs,
> ppc64_print_nip_lr(regs, 1);
> }
>
> +static int
> +ppc64_get_current_task_reg(int regno, const char *name, int size,
> + void *value)
> +{
> + struct bt_info bt_info, bt_setup;
> + struct task_context *tc;
> + ulong task;
> + struct user_regs_bitmap_struct *ur_bitmap;
> + ulong ip, sp;
> + bool ret = FALSE;
> +
> + /* Currently only handling registers available in ppc64_pt_regs:
> + *
> + * 0-31: r0-r31
> + * 64: pc/nip
> + * 65: msr
> + *
> + * 67: lr
> + * 68: ctr
> + */
> + switch (regno) {
> + case PPC64_R0_REGNUM ... PPC64_R31_REGNUM:
> +
> + case PPC64_PC_REGNUM:
> + case PPC64_MSR_REGNUM:
> + case PPC64_LR_REGNUM:
> + case PPC64_CTR_REGNUM:
> + break;
> +
> + default:
> + // return false if we can't get that register
> + if (CRASHDEBUG(1))
> + error(WARNING, "unsupported register, regno=%d\n",
> regno);
> + return FALSE;
> + }
> +
>
The above switch-case code block can be defined as a new static function to
check the validity of regnum, for example:
static int sanity_check_regs(int regno)
And then it can be invoked in the current function. Just an idea, maybe
this looks more concise and readable. Any thoughts?
+ tc = CURRENT_CONTEXT();
> + if (!tc)
> + return FALSE;
> + BZERO(&bt_setup, sizeof(struct bt_info));
> + clone_bt_info(&bt_setup, &bt_info, tc);
> + fill_stackbuf(&bt_info);
> +
> + // reusing the get_dumpfile_regs function to get pt regs structure
> + get_dumpfile_regs(&bt_info, &sp, &ip);
> + if (bt_info.stackbuf)
> + FREEBUF(bt_info.stackbuf);
> + ur_bitmap = (struct user_regs_bitmap_struct *)bt_info.machdep;
> +
> + if (!ur_bitmap) {
> + error(WARNING, "pt_regs not available for cpu %d\n",
> tc->processor);
> + return FALSE;
> + }
> + if (!bt_info.need_free) {
> + goto get_all;
> + }
>
Could you please explain why the need_free is used to determine if it
should jump into the get_all code block? The flag looks very strange, or
is there any another solution?
+
> + switch (ur_bitmap->bitmap) {
> + case 0x100000002:
> + switch (regno) {
> + case PPC64_R1_REGNUM:
> + case PPC64_PC_REGNUM:
> + break;
> + default:
> + return FALSE;
> + }
> + break;
> + }
>
The switch-case code block should be equivalent to the implementation
below(if I understood correctly), and looks more readable?
+ /* Only for R1 and PC values */
+ if ((ur_bitmap->bitmap == 0x100000002) &&
+ (regno != PPC64_R1_REGNUM && regno != PPC64_PC_REGNUM))
+ return FALSE;
> +
> +get_all:
> + switch (regno) {
> + case PPC64_R0_REGNUM ... PPC64_R31_REGNUM:
> + if (size != sizeof(ur_bitmap->gpr[regno])) {
> + ret = FALSE; break; // size mismatch
> + }
>
In fact , the variable ret can be initialized to FALSE one time at start of
this function, and then it can be simplified as below:
+ switch (regno) {
+ case PPC64_R0_REGNUM ... PPC64_R31_REGNUM:
+ if (size != sizeof(ur_bitmap->gpr[regno]))
+ break;
+ memcpy(value, &ur_bitmap->gpr[regno], size);
+ ret = TRUE;
+ break;
+ memcpy(value, &ur_bitmap->gpr[regno], size);
> + ret = TRUE; break;
> +
> + case PPC64_PC_REGNUM:
> + if (size != sizeof(ur_bitmap->nip)) {
> + ret = FALSE; break; // size mismatch
> + }
> + memcpy(value, &ur_bitmap->nip, size);
> + ret = TRUE; break;
> +
> + case PPC64_MSR_REGNUM:
> + if (size != sizeof(ur_bitmap->msr)) {
> + ret = FALSE; break; // size mismatch
> + }
> + memcpy(value, &ur_bitmap->msr, size);
> + ret = TRUE; break;
> +
> + case PPC64_LR_REGNUM:
> + if (size != sizeof(ur_bitmap->link)) {
> + ret = FALSE; break; // size mismatch
> + }
> + memcpy(value, &ur_bitmap->link, size);
> + ret = TRUE; break;
> +
> + case PPC64_CTR_REGNUM:
> + if (size != sizeof(ur_bitmap->ctr)) {
> + ret = FALSE; break; // size mismatch
> + }
> + memcpy(value, &ur_bitmap->ctr, size);
> + ret = TRUE; break;
> + }
> + if (bt_info.need_free) {
> + FREEBUF(ur_bitmap);
> + bt_info.need_free = FALSE;
> + }
> +
> + return ret;
> +}
> +
> /*
> * For vmcore typically saved with KDump or FADump, get SP and IP values
> * from the saved ptregs.
> @@ -2613,9 +2751,11 @@ ppc64_get_dumpfile_stack_frame(struct bt_info
> *bt_in, ulong *nip, ulong *ksp)
> pt_regs = (struct ppc64_pt_regs *)bt->machdep;
> ur_nip = pt_regs->nip;
> ur_ksp = pt_regs->gpr[1];
> - /* Print the collected regs for panic task. */
> - ppc64_print_regs(pt_regs);
> - ppc64_print_nip_lr(pt_regs, 1);
> + if (!(bt->flags & BT_NO_PRINT_REGS)) {
> + /* Print the collected regs for panic task. */
> + ppc64_print_regs(pt_regs);
> + ppc64_print_nip_lr(pt_regs, 1);
> + }
> } else if ((pc->flags & KDUMP) ||
> ((pc->flags & DISKDUMP) &&
> (*diskdump_flags & KDUMP_CMPRS_LOCAL))) {
> @@ -2779,19 +2919,28 @@ static void
> ppc64_get_stack_frame(struct bt_info *bt, ulong *pcp, ulong *spp)
> {
> ulong ksp, nip;
> -
> + struct user_regs_bitmap_struct *ur_bitmap;
> +
> nip = ksp = 0;
>
> - if (DUMPFILE() && is_task_active(bt->task))
> + if (DUMPFILE() && is_task_active(bt->task)) {
> ppc64_get_dumpfile_stack_frame(bt, &nip, &ksp);
> - else
> + bt->need_free = FALSE;
> + } else {
> get_ppc64_frame(bt, &nip, &ksp);
> + ur_bitmap = (struct user_regs_bitmap_struct
> *)GETBUF(sizeof(*ur_bitmap));
> + memset(ur_bitmap, 0, sizeof(*ur_bitmap));
> + ur_bitmap->nip = nip;
> + ur_bitmap->gpr[1] = ksp;
> + ur_bitmap->bitmap += 0x100000002;
>
This confused me a little bit. Maybe it's good enough like this?
+ ur_bitmap->bitmap = 0x100000002;
Thanks
Lianbo
+ bt->machdep = ur_bitmap;
> + bt->need_free = TRUE;
> + }
>
> if (pcp)
> *pcp = nip;
> if (spp)
> *spp = ksp;
> -
> }
>
> static ulong
> --
> 2.40.1
>
4 months
[PATCH] Fix a "Bus error" issue caused by 'crash --osrelease' or crash loading
by Lianbo Jiang
Sometimes, in a production environment, there are still some vmcores
that are incomplete, such as partial header or the data is corrupted.
When crash tool attempts to parse such vmcores, it may fail as below:
$ ./crash --osrelease vmcore
Bus error (core dumped)
or
$ crash vmlinux vmcore
...
Bus error (core dumped)
$
The gdb sees that crash tool reads out a null bitmap from the header in
this vmcore, when executing memcpy(), emits a SIGBUS error as below:
$ gdb /home/lijiang/src/crash/crash /tmp/core.126301
Core was generated by `./crash --osrelease /home/lijiang/src/39317/vmcore'.
Program terminated with signal SIGBUS, Bus error.
#0 __memcpy_evex_unaligned_erms () at ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:831
831 LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
(gdb) bt
#0 __memcpy_evex_unaligned_erms () at ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:831
#1 0x0000000000651096 in read_dump_header (file=0x7ffc59ddff5f "/home/lijiang/src/39317/vmcore") at diskdump.c:820
#2 0x0000000000651cf3 in is_diskdump (file=0x7ffc59ddff5f "/home/lijiang/src/39317/vmcore") at diskdump.c:1042
#3 0x0000000000502ac9 in get_osrelease (dumpfile=0x7ffc59ddff5f "/home/lijiang/src/39317/vmcore") at main.c:1938
#4 0x00000000004fb2e8 in main (argc=3, argv=0x7ffc59dde3a8) at main.c:271
(gdb) frame 1
#1 0x0000000000651096 in read_dump_header (file=0x7ffc59ddff5f "/home/lijiang/src/39317/vmcore") at diskdump.c:820
820 memcpy(dd->dumpable_bitmap, dd->bitmap + bitmap_len/2,
(gdb) p dd->dumpable_bitmap
$1 = 0x7f8e89800010 ""
(gdb) p dd->bitmap
$2 = 0x7f8e87e09000 ""
(gdb) p dd->bitmap + bitmap_len/2
$3 = 0x7f8e88a17000 ""
(gdb) p *(char*)(dd->bitmap+bitmap_len/2)
$4 = 0 '\000'
(gdb) p bitmap_len/2
$5 = 12640256
Let's add a sanity check for such cases to avoid causing a SIGBUS error.
With the patch:
$ crash -s vmlinux vmcore
crash: vmcore: not a supported file format
...
Enter "crash -h" for details.
$ crash --osrelease vmcore
unknown
Reported-by: Buland Kumar Singh <bsingh(a)redhat.com>
Signed-off-by: Lianbo Jiang <lijiang(a)redhat.com>
---
diskdump.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/diskdump.c b/diskdump.c
index 1f7118cacfc6..c31eab01aa05 100644
--- a/diskdump.c
+++ b/diskdump.c
@@ -814,10 +814,12 @@ restart:
madvise(dd->bitmap, bitmap_len, MADV_WILLNEED);
}
- if (dump_is_partial(header))
+ if (dump_is_partial(header)) {
+ if (*(char*)(dd->bitmap + bitmap_len/2) == '\0')
+ goto err;
memcpy(dd->dumpable_bitmap, dd->bitmap + bitmap_len/2,
bitmap_len/2);
- else
+ } else
memcpy(dd->dumpable_bitmap, dd->bitmap, bitmap_len);
dd->data_offset
--
2.45.1
4 months, 3 weeks
Re: [PATCH] Fix "kmem -i" and "swap" commands on Linux 6.10-rc1 and later kernels
by lijiang
On Fri, Jun 28, 2024 at 9:46 AM <devel-request(a)lists.crash-utility.osci.io>
wrote:
> Date: Fri, 28 Jun 2024 13:13:57 +1200
> From: Tao Liu <ltao(a)redhat.com>
> Subject: [Crash-utility] Re: [PATCH] Fix "kmem -i" and "swap" commands
> on Linux 6.10-rc1 and later kernels
> To: HAGIO KAZUHITO(萩尾 一仁) <k-hagio-ab(a)nec.com>
> Cc: "devel(a)lists.crash-utility.osci.io"
> <devel(a)lists.crash-utility.osci.io>
> Message-ID:
> <
> CAO7dBbUHmo0tpaFyAGqeYHBai5PqzT+HSHEUsgimtG4asohVgQ(a)mail.gmail.com>
> Content-Type: text/plain; charset="UTF-8"
>
> Hi Kazu,
>
> On Fri, Jun 28, 2024 at 12:41 PM HAGIO KAZUHITO(萩尾 一仁)
> <k-hagio-ab(a)nec.com> wrote:
> >
> > Hi Tao,
> >
> > thank you for your review.
> >
> > On 2024/06/28 9:19, Tao Liu wrote:
> > > Hi Kazu,
> > >
> > > sorry for the late response.
> > >
> > > On Tue, Jun 11, 2024 at 2:41 PM HAGIO KAZUHITO(萩尾 一仁)
> > > <k-hagio-ab(a)nec.com> wrote:
> > >>
> > >> Kernel commit 798cb7f9aec3 ("swapon(2)/swapoff(2): don't bother with
> > >> block size") removed swap_info_struct.old_block_size member at Linux
> > >> 6.10-rc1. The crash-utility has used this to determine whether a swap
> > >> is a partition or file and to determine the way to get the swap path.
> > >>
> > >> Withtout the patch, the "kmem -i" and "swap" commands fail with the
> > >> following error messsage:
> > >>
> > >> crash> kmem -i
> > >> ...
> > >> TOTAL HUGE 13179392 50.3 GB ----
> > >> HUGE FREE 13179392 50.3 GB 100% of TOTAL HUGE
> > >>
> > >> swap: invalid (optional) structure member offsets:
> swap_info_struct_swap_device or swap_info_struct_old_block_size
> > >> FILE: memory.c LINE: 16032 FUNCTION: dump_swap_info()
> > >>
> > >> The swap_file member of recent swap_info_struct is a pointer to a
> > >> struct file (once upon a time it was dentry), use this fact directly.
> > >>
> > >> Signed-off-by: Kazuhito Hagio <k-hagio-ab(a)nec.com>
> > >> ---
> > >> NOTE: This patch can be applied on top of my '[PATCH] Fix "kmem -v"
> > >> option on Linux 6.9 and later kernels' patch. Otherwise, please
> adjust
> > >> the hunk for offset_table.
> > >>
> > >> defs.h | 1 +
> > >> filesys.c | 1 +
> > >> memory.c | 28 +++++++++++++++++++++++-----
> > >> symbols.c | 1 +
> > >> 4 files changed, 26 insertions(+), 5 deletions(-)
> > >>
> > >> diff --git a/defs.h b/defs.h
> > >> index 95de33188070..64bf785b7a1b 100644
> > >> --- a/defs.h
> > >> +++ b/defs.h
> > >> @@ -2242,6 +2242,7 @@ struct offset_table { /*
> stash of commonly-used offsets */
> > >> long log_caller_id;
> > >> long vmap_node_busy;
> > >> long rb_list_head;
> > >> + long file_f_inode;
> > >> };
> > >>
> > >> struct size_table { /* stash of commonly-used sizes */
> > >> diff --git a/filesys.c b/filesys.c
> > >> index 81fe856699e1..406ebb299780 100644
> > >> --- a/filesys.c
> > >> +++ b/filesys.c
> > >> @@ -2038,6 +2038,7 @@ vfs_init(void)
> > >> MEMBER_OFFSET_INIT(file_f_dentry, "file", "f_dentry");
> > >> MEMBER_OFFSET_INIT(file_f_vfsmnt, "file", "f_vfsmnt");
> > >> MEMBER_OFFSET_INIT(file_f_count, "file", "f_count");
> > >> + MEMBER_OFFSET_INIT(file_f_inode, "file", "f_inode");
> > >> MEMBER_OFFSET_INIT(path_mnt, "path", "mnt");
> > >> MEMBER_OFFSET_INIT(path_dentry, "path", "dentry");
> > >> if (INVALID_MEMBER(file_f_dentry)) {
> > >> diff --git a/memory.c b/memory.c
> > >> index acb8507cfb75..28f901f16adf 100644
> > >> --- a/memory.c
> > >> +++ b/memory.c
> > >> @@ -16075,6 +16075,8 @@ dump_swap_info(ulong swapflags, ulong
> *totalswap_pages, ulong *totalused_pages)
> > >> char buf3[BUFSIZE];
> > >> char buf4[BUFSIZE];
> > >> char buf5[BUFSIZE];
> > >> + int swap_file_is_file =
> > >> + STREQ(MEMBER_TYPE_NAME("swap_info_struct",
> "swap_file"), "file");
> > >>
> > >> if (!symbol_exists("nr_swapfiles"))
> > >> error(FATAL, "nr_swapfiles doesn't exist in this
> kernel!\n");
> > >> @@ -16118,9 +16120,21 @@ dump_swap_info(ulong swapflags, ulong
> *totalswap_pages, ulong *totalused_pages)
> > >> swap_file = ULONG(vt->swap_info_struct +
> > >> OFFSET(swap_info_struct_swap_file));
> > >>
> > >> - swap_device = INT(vt->swap_info_struct +
> > >> - OFFSET_OPTION(swap_info_struct_swap_device,
> > >> - swap_info_struct_old_block_size));
> > >> + /* Linux 6.10 and later */
> > >> + if (INVALID_MEMBER(swap_info_struct_swap_device) &&
> > >> + INVALID_MEMBER(swap_info_struct_old_block_size) &&
> > >> + swap_file_is_file) {
> > >> + ulong inode;
> > >> + ushort mode;
> > >> + readmem(swap_file + OFFSET(file_f_inode),
> KVADDR, &inode,
> > >> + sizeof(ulong), "swap_file.f_inode",
> FAULT_ON_ERROR);
> > >> + readmem(inode + OFFSET(inode_i_mode), KVADDR,
> &mode,
> > >> + sizeof(ushort), "inode.i_mode",
> FAULT_ON_ERROR);
> > >> + swap_device = S_ISBLK(mode);
> > >> + } else
> > >> + swap_device = INT(vt->swap_info_struct +
> > >> +
> OFFSET_OPTION(swap_info_struct_swap_device,
> > >> + swap_info_struct_old_block_size));
> > >>
> > >
> > > The above code change is OK to me, just my curiosity: it looks to me
> > > the swap_file->f_inode->i_mode->S_ISBLK approach will always work, not
> > > only for the Linux 6.10 and later versions right? E.g. I made the
> > > following change which works as expected on a 4.0.4-201 kernel:
> >
> > This S_ISBLK approach will work with kernels that swap_file is 'file',
> > and at least 2.6.0 and later kernels have it, as far as I checked (as
> > written in [1]).
> >
> > but I didn't want to change the current behavior (the else block) for
> > 6.9 and older kernels, because it has worked well with them for a long
> > time and there is no need to change.
> >
>
> OK, ageed. Thanks a lot for your explanation! The patch is good to me, so
> ack.
>
>
Applied:
https://github.com/crash-utility/crash/commit/3452fe802bf94d15879b3c5fd17...
Thanks
Lianbo
> Thanks,
> Tao Liu
>
> > Does this answer your question?
> >
> > [1]
> >
> https://lists.crash-utility.osci.io/archives/list/devel@lists.crash-utili...
> >
> > Thanks,
> > Kazu
> >
> > >
> > > @@ -16121,9 +16121,7 @@ dump_swap_info(ulong swapflags, ulong
> > > *totalswap_pages, ulong *totalused_pages)
> > > OFFSET(swap_info_struct_swap_file));
> > >
> > > /* Linux 6.10 and later */
> > > - if (INVALID_MEMBER(swap_info_struct_swap_device) &&
> > > - INVALID_MEMBER(swap_info_struct_old_block_size) &&
> > > - swap_file_is_file) {
> > > + if (1) {
> > > ulong inode;
> > > ushort mode;
> > > readmem(swap_file + OFFSET(file_f_inode),
> > > KVADDR, &inode,
> > >
> > > KERNEL:
> burn.khw2.lab.eng.bos.redhat.com/4.0.4-201.fc21_sysrq-c/vmlinux.gz
> > > DUMPFILE:
> burn.khw2.lab.eng.bos.redhat.com/4.0.4-201.fc21_sysrq-c/vmcore
> > > [PARTIAL DUMP]
> > > CPUS: 4
> > > DATE: Wed May 27 14:23:22 EDT 2015
> > > UPTIME: 02:47:56
> > > LOAD AVERAGE: 0.00, 0.04, 0.05
> > > TASKS: 135
> > > NODENAME: dell-pec5125-04.lab.bos.redhat.com
> > > RELEASE: 4.0.4-201.fc21.x86_64
> > > VERSION: #1 SMP Thu May 21 15:58:47 UTC 2015
> > > MACHINE: x86_64 (2599 Mhz)
> > > MEMORY: 2 GB
> > > PANIC: "sysrq: SysRq : Trigger a crash"
> > > PID: 14037
> > > COMMAND: "bash"
> > > TASK: ffff88006a03d970 [THREAD_INFO: ffff880074734000]
> > > CPU: 1
> > > STATE: TASK_RUNNING (SYSRQ)
> > >
> > > crash> swap
> > > SWAP_INFO_STRUCT TYPE SIZE USED PCT PRI FILENAME
> > > ffff88007bf62000 PARTITION 2097148k 0k 0% -1 /dev/dm-1
> > > crash> kmem -i
> > > PAGES TOTAL PERCENTAGE
> > > TOTAL MEM 445983 1.7 GB ----
> > > FREE 275307 1.1 GB 61% of TOTAL MEM
> > > ...
> > >
> > > Thanks,
> > > Tao Liu
> > >
> > >> pages = INT(vt->swap_info_struct +
> > >> OFFSET(swap_info_struct_pages));
> > >> @@ -16161,8 +16175,12 @@ dump_swap_info(ulong swapflags, ulong
> *totalswap_pages, ulong *totalused_pages)
> > >>
> OFFSET(swap_info_struct_swap_vfsmnt));
> > >> get_pathname(swap_file, buf, BUFSIZE,
> > >> 1, vfsmnt);
> > >> - } else if (VALID_MEMBER
> > >> - (swap_info_struct_old_block_size)) {
> > >> + } else if
> (VALID_MEMBER(swap_info_struct_old_block_size) ||
> > >> + swap_file_is_file) {
> > >> + /*
> > >> + * Linux 6.10 and later kernels do
> not have old_block_size,
> > >> + * but this still should work, if
> swap_file is file.
> > >> + */
> > >> devname =
> vfsmount_devname(file_to_vfsmnt(swap_file),
> > >> buf1, BUFSIZE);
> > >>
> get_pathname(file_to_dentry(swap_file),
> > >> diff --git a/symbols.c b/symbols.c
> > >> index 283b98a8fbfe..0bf050ab62d0 100644
> > >> --- a/symbols.c
> > >> +++ b/symbols.c
> > >> @@ -10522,6 +10522,7 @@ dump_offset_table(char *spec, ulong
> makestruct)
> > >> OFFSET(file_f_count));
> > >> fprintf(fp, " file_f_path: %ld\n",
> > >> OFFSET(file_f_path));
> > >> + fprintf(fp, " file_f_inode: %ld\n",
> OFFSET(file_f_inode));
> > >> fprintf(fp, " path_mnt: %ld\n",
> > >> OFFSET(path_mnt));
> > >> fprintf(fp, " path_dentry: %ld\n",
> > >> --
> > >> 2.31.1
>
5 months