[PATCH v3 0/4] Generalize KASLR calculation and use it for KDUMPs
by Sergio Lopez
Commit 45b74b89530d611b3fa95a1041e158fbb865fa84 added support for
calculating phys_base and kernel offset for KASLR-enabled kernels on
SADUMPs by using a technique developed by Takao Indoh. Originally, the
patchset included support for KDUMPs, but this was dropped in v2, as it
was deemed unnecessary due to the implementation of the vmcoreinfo
device in QEMU.
Sadly, there are many reasons why the vmcoreinfo device may not be
present at the moment the memory dump is taken from a VM, ranging from
a Host running older QEMU/libvirt versions, to misconfigured VMs, to
environments running Hypervisors that don't support this device.
This patchset generalizes the KASLR-related functions from sadump.c by
moving them to kaslr_helper.c, and makes KDUMP analysis fall back to
KASLR offset calculation if vmcoreinfo data is missing.
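For reference, the technique inherited from the sadump code derives the KASLR
offset from saved CPU state rather than from vmcoreinfo: it locates the IDT
through the dump's IDTR, reassembles the runtime address of the first
exception handler (divide_error), and subtracts the address recorded in the
vmlinux symbol table. A minimal sketch of that arithmetic, assuming the
standard 16-byte x86_64 IDT gate layout; the function and parameter names
below are illustrative, not the actual kaslr_helper.c interface:

#include <stdint.h>
#include <string.h>

/* Reassemble the handler address from a 16-byte x86_64 IDT gate:
 * address bits 0-15 sit at offset 0, bits 16-31 at offset 6, and
 * bits 32-63 at offset 8. */
static uint64_t
idt_gate_handler(const uint8_t gate[16])
{
	uint16_t lo, mid;
	uint32_t hi;

	memcpy(&lo, gate + 0, sizeof(lo));
	memcpy(&mid, gate + 6, sizeof(mid));
	memcpy(&hi, gate + 8, sizeof(hi));
	return (uint64_t)lo | ((uint64_t)mid << 16) | ((uint64_t)hi << 32);
}

/* KASLR shifts the whole kernel text mapping by a single constant, so
 * the offset is the difference between the handler's runtime address
 * (read from the dump through the IDTR) and the address the vmlinux
 * symbol table records for divide_error. */
static uint64_t
infer_kaslr_offset(const uint8_t first_idt_gate[16],
		   uint64_t divide_error_vmlinux)
{
	return idt_gate_handler(first_idt_gate) - divide_error_vmlinux;
}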
These changes have been successfully tested with a 3.10.0-830.el7.x86_64
kernel under the following conditions:
- kdump with KASLR and vmcoreinfo
- kdump with KASLR but no vmcoreinfo
- kdump without KASLR ("nokaslr" kernel command line option)
It was also verified that a "crash" binary patched with these changes still
builds and runs (live and kdump debugging) on an aarch64 machine.
changelog:
v3:
- Merge *get_cr3 and *get_idtr functions and move them to
kaslr_helper.c
- diskdump: drop kaslr_phys_base addition and use
sub_header_kdump->phys_base instead.
- Unconditionally call x86_64_virt_phys_base after grabbing phys_base
v2:
- Limit application to QEMU ELF and QEMU COMPRESSED dumps (thanks Dave)
- Add support for QEMU COMPRESSED dumps (diskdump)
Sergio Lopez (4):
Move kaslr related functions from sadump.c to kaslr_helper.c
Move QEMUCPU* structs from netdump.h to defs.h
netdump: infer kaslr offset for QEMU ELF dumps without vmcoreinfo
diskdump: infer kaslr offset for QEMU COMPRESSED dumps without
vmcoreinfo
Makefile | 7 +-
defs.h | 39 +++++
diskdump.c | 61 ++++++++
kaslr_helper.c | 488 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
netdump.c | 54 +++++++
netdump.h | 24 +--
sadump.c | 486 ++++----------------------------------------------------
symbols.c | 26 ++-
x86_64.c | 18 ++-
9 files changed, 719 insertions(+), 484 deletions(-)
create mode 100644 kaslr_helper.c
--
2.14.3
[PATCH v2] vmware_vmss: read vCPUs regs and show them in 'bt'
by Sergio Lopez
VMSS dump files contain the state of each vCPU at the time the VM was
suspended. This change enables 'crash' to read the relevant registers from
each vCPU state and display them in 'bt', and adds additional output to
the 'help -D', 'help -r' and 'help -p' commands.
This is also the first step towards implementing KASLR offset
calculation for VMSS dump files.
---
defs.h | 5 +
help.c | 3 +
kernel.c | 2 +
main.c | 3 +
memory.c | 2 +
vmware_vmss.c | 375 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
vmware_vmss.h | 31 +++++
x86_64.c | 13 +-
8 files changed, 424 insertions(+), 10 deletions(-)
diff --git a/defs.h b/defs.h
index 7998ebf..44efc8a 100644
--- a/defs.h
+++ b/defs.h
@@ -283,6 +283,7 @@ struct number_option {
#define LKCD_KERNTYPES() (pc->flags & KERNTYPES)
#define KVMDUMP_DUMPFILE() (pc->flags & KVMDUMP)
#define SADUMP_DUMPFILE() (pc->flags & SADUMP)
+#define VMSS_DUMPFILE() (pc->flags & VMWARE_VMSS)
#define NETDUMP_LOCAL (0x1) /* netdump_data flags */
#define NETDUMP_REMOTE (0x2)
@@ -6388,6 +6389,10 @@ int vmware_vmss_init(char *filename, FILE *ofp);
uint vmware_vmss_page_size(void);
int read_vmware_vmss(int, void *, int, ulong, physaddr_t);
int write_vmware_vmss(int, void *, int, ulong, physaddr_t);
+void vmware_vmss_display_regs(int, FILE *);
+void get_vmware_vmss_regs(struct bt_info *, ulong *, ulong *);
+int vmware_vmss_memory_dump(FILE *);
+void dump_registers_for_vmss_dump(void);
/*
* gnu_binutils.c
diff --git a/help.c b/help.c
index 5f6d9be..06b7961 100644
--- a/help.c
+++ b/help.c
@@ -710,6 +710,9 @@ dump_registers(void)
} else if (NETDUMP_DUMPFILE() || KDUMP_DUMPFILE()) {
dump_registers_for_elf_dumpfiles();
return;
+ } else if (VMSS_DUMPFILE()) {
+ dump_registers_for_vmss_dump();
+ return;
}
error(FATAL, "-r option not supported on %s\n",
diff --git a/kernel.c b/kernel.c
index 1bf6251..7642217 100644
--- a/kernel.c
+++ b/kernel.c
@@ -2969,6 +2969,8 @@ back_trace(struct bt_info *bt)
get_xendump_regs(bt, &eip, &esp);
else if (SADUMP_DUMPFILE())
get_sadump_regs(bt, &eip, &esp);
+ else if (VMSS_DUMPFILE())
+ get_vmware_vmss_regs(bt, &eip, &esp);
else if (REMOTE_PAUSED()) {
if (!is_task_active(bt->task) || !get_remote_regs(bt, &eip, &esp))
machdep->get_stack_frame(bt, &eip, &esp);
diff --git a/main.c b/main.c
index 2aae0c6..15834cb 100644
--- a/main.c
+++ b/main.c
@@ -1361,6 +1361,9 @@ dump_program_context(void)
if (pc->flags & DISKDUMP)
sprintf(&buf[strlen(buf)],
"%sDISKDUMP", others++ ? "|" : "");
+ if (pc->flags & VMWARE_VMSS)
+ sprintf(&buf[strlen(buf)],
+ "%sVMWARE_VMSS", others++ ? "|" : "");
if (pc->flags & SYSMAP)
sprintf(&buf[strlen(buf)],
"%sSYSMAP", others++ ? "|" : "");
diff --git a/memory.c b/memory.c
index 0669276..9f752c2 100644
--- a/memory.c
+++ b/memory.c
@@ -16909,6 +16909,8 @@ dumpfile_memory(int cmd)
retval = kcore_memory_dump(fp);
else if (pc->flags & SADUMP)
retval = sadump_memory_dump(fp);
+ else if (pc->flags & VMWARE_VMSS)
+ retval = vmware_vmss_memory_dump(fp);
break;
case DUMPFILE_ENVIRONMENT:
diff --git a/vmware_vmss.c b/vmware_vmss.c
index 667676a..a97a545 100644
--- a/vmware_vmss.c
+++ b/vmware_vmss.c
@@ -25,6 +25,8 @@
#define VMW_PAGE_SIZE (4096)
#define VMW_PAGE_SHIFT (12)
+#define MAX_BLOCK_DUMP (128)
+
static vmssdata vmss = { 0 };
int
@@ -128,7 +130,8 @@ vmware_vmss_init(char *filename, FILE *ofp)
DEBUG_PARSE_PRINT((ofp, LOGPRX"Group: %-20s offset=%#llx size=0x%#llx.\n",
grps[i].name, (ulonglong)grps[i].position, (ulonglong)grps[i].size));
- if (strcmp(grps[i].name, "memory") != 0) {
+ if (strcmp(grps[i].name, "memory") != 0 &&
+ (strcmp(grps[i].name, "cpu") != 0 || !machine_type("X86_64"))) {
continue;
}
@@ -198,12 +201,6 @@ vmware_vmss_init(char *filename, FILE *ofp)
}
blockpos += padsize;
- if (fseek(fp, blockpos + nbytes, SEEK_SET) == -1) {
- error(INFO, LOGPRX"Cannot seek past block at %#llx.\n",
- (ulonglong)(blockpos + nbytes));
- break;
- }
-
if (strcmp(name, "Memory") == 0) {
/* The things that we really care about...*/
vmss.memoffset = blockpos;
@@ -217,11 +214,44 @@ vmware_vmss_init(char *filename, FILE *ofp)
result = FALSE;
goto exit;
}
+
+ if (fseek(fp, blockpos + nbytes, SEEK_SET) == -1) {
+ error(INFO, LOGPRX"Cannot seek past block at %#llx.\n",
+ (ulonglong)(blockpos + nbytes));
+ break;
+ }
+ } else if (strcmp(name, "gpregs") == 0 &&
+ nbytes == VMW_GPREGS_SIZE &&
+ idx[0] < vmss.num_vcpus) {
+ int cpu = idx[0];
+
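+ /* The 16 leading uint64_t fields of vmssregs64 mirror the 128-byte gpregs block, so it is read straight in. */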
+ fread(vmss.regs64[cpu], nbytes, 1, fp);
+ } else if (strcmp(name, "CR64") == 0 &&
+ nbytes == VMW_CR64_SIZE &&
+ idx[0] < vmss.num_vcpus) {
+ int cpu = idx[0];
+
+ fread(&vmss.regs64[cpu]->cr[0], nbytes, 1, fp);
+ } else if (strcmp(name, "IDTR") == 0 &&
+ nbytes == VMW_IDTR_SIZE &&
+ idx[0] < vmss.num_vcpus) {
+ int cpu = idx[0];
+ char buf[10];
+
+ fread(&buf[0], nbytes, 1, fp);
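+ /* The 10-byte IDTR image is a 16-bit limit followed by the 64-bit base; keep only the base. */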
+ vmss.regs64[cpu]->idtr = *((uint64_t *)(&buf[0] + 2));
+ } else {
+ if (fseek(fp, blockpos + nbytes, SEEK_SET) == -1) {
+ error(INFO, LOGPRX"Cannot seek past block at %#llx.\n",
+ (ulonglong)(blockpos + nbytes));
+ break;
+ }
}
} else {
union {
uint8_t val[TAG_VALSIZE_MASK];
uint32_t val32;
+ uint64_t val64;
} u;
unsigned k;
unsigned valsize = TAG_VALSIZE(tag);
@@ -253,6 +283,30 @@ vmware_vmss_init(char *filename, FILE *ofp)
if (strcmp(name, "align_mask") == 0) {
vmss.alignmask = u.val32;
}
+ } else if (strcmp(grps[i].name, "cpu") == 0) {
+ if (strcmp(name, "cpu:numVCPUs") == 0) {
+ if (vmss.regs64 != NULL) {
+ error(INFO, LOGPRX"Duplicated cpu:numVCPUs entry.\n");
+ break;
+ }
+
+ vmss.num_vcpus = u.val32;
+ vmss.regs64 = malloc(vmss.num_vcpus * sizeof(void *));
+
+ for (k = 0; k < vmss.num_vcpus; k++) {
+ vmss.regs64[k] = malloc(sizeof(vmssregs64));
+ memset(vmss.regs64[k], 0, sizeof(vmssregs64));
+ }
+ } else if (strcmp(name, "rip") == 0) {
+ int cpu = idx[0];
+ vmss.regs64[cpu]->rip = u.val64;
+ } else if (strcmp(name, "eflags") == 0) {
+ int cpu = idx[0];
+ vmss.regs64[cpu]->rflags |= u.val32;
+ } else if (strcmp(name, "EFLAGS") == 0) {
+ int cpu = idx[0];
+ vmss.regs64[cpu]->rflags |= u.val32;
+ }
}
DEBUG_PARSE_PRINT((ofp, "\n"));
@@ -350,3 +404,310 @@ write_vmware_vmss(int fd, void *bufptr, int cnt, ulong addr, physaddr_t paddr)
return SEEK_ERROR;
}
+void
+vmware_vmss_display_regs(int cpu, FILE *ofp)
+{
+ if (cpu >= vmss.num_vcpus)
+ return;
+
+ if (machine_type("X86_64")) {
+ fprintf(ofp,
+ " RIP: %016llx RSP: %016llx RFLAGS: %08llx\n"
+ " RAX: %016llx RBX: %016llx RCX: %016llx\n"
+ " RDX: %016llx RSI: %016llx RDI: %016llx\n"
+ " RBP: %016llx R8: %016llx R9: %016llx\n"
+ " R10: %016llx R11: %016llx R12: %016llx\n"
+ " R13: %016llx R14: %016llx R15: %016llx\n",
+ vmss.regs64[cpu]->rip,
+ vmss.regs64[cpu]->rsp,
+ vmss.regs64[cpu]->rflags,
+ vmss.regs64[cpu]->rax,
+ vmss.regs64[cpu]->rbx,
+ vmss.regs64[cpu]->rcx,
+ vmss.regs64[cpu]->rdx,
+ vmss.regs64[cpu]->rsi,
+ vmss.regs64[cpu]->rdi,
+ vmss.regs64[cpu]->rbp,
+ vmss.regs64[cpu]->r8,
+ vmss.regs64[cpu]->r9,
+ vmss.regs64[cpu]->r10,
+ vmss.regs64[cpu]->r11,
+ vmss.regs64[cpu]->r12,
+ vmss.regs64[cpu]->r13,
+ vmss.regs64[cpu]->r14,
+ vmss.regs64[cpu]->r15
+ );
+ }
+}
+
+void
+get_vmware_vmss_regs(struct bt_info *bt, ulong *ipp, ulong *spp)
+{
+ ulong ip, sp;
+
+ ip = sp = 0;
+
+ if (!is_task_active(bt->task)) {
+ machdep->get_stack_frame(bt, ipp, spp);
+ return;
+ }
+
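+ /* Try the exhaustive dumpfile stack search first; x86_64.c clears BT_DUMPFILE_SEARCH when that search fails, and we then fall back to the saved vCPU registers below. */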
+ bt->flags |= BT_DUMPFILE_SEARCH;
+ if (machine_type("X86_64"))
+ machdep->get_stack_frame(bt, ipp, spp);
+ else if (machine_type("X86"))
+ get_netdump_regs_x86(bt, ipp, spp);
+ if (bt->flags & BT_DUMPFILE_SEARCH)
+ return;
+
+ if ((vmss.regs64 == NULL) ||
+ (bt->tc->processor >= vmss.num_vcpus))
+ return;
+
+ ip = (ulong)vmss.regs64[bt->tc->processor]->rip;
+ sp = (ulong)vmss.regs64[bt->tc->processor]->rsp;
+ if (is_kernel_text(ip) &&
+ (((sp >= GET_STACKBASE(bt->task)) &&
+ (sp < GET_STACKTOP(bt->task))) ||
+ in_alternate_stack(bt->tc->processor, sp))) {
+ *ipp = ip;
+ *spp = sp;
+ bt->flags |= BT_KERNEL_SPACE;
+ return;
+ }
+
+ if (!is_kernel_text(ip) &&
+ in_user_stack(bt->tc->task, sp))
+ bt->flags |= BT_USER_SPACE;
+}
+
+int
+vmware_vmss_memory_dump(FILE *fp)
+{
+ cptdumpheader hdr;
+ cptgroupdesc *grps = NULL;
+ unsigned grpsize;
+ unsigned i;
+ int result = TRUE;
+
+ if (fseek(vmss.dfp, 0, SEEK_SET) != 0) {
+ fprintf(fp, "Error seeking to position 0.\n");
+ return FALSE;
+ }
+
+ if (fread(&hdr, sizeof(cptdumpheader), 1, vmss.dfp) != 1) {
+ fprintf(fp, "Failed to read vmss file: %s [Error %d] %s\n",
+ errno, strerror(errno));
+ return FALSE;
+ }
+
+ fprintf(fp, "vmware_vmss:\n");
+ fprintf(fp, " Header: id=%x version=%d numgroups=%d\n",
+ hdr.id, hdr.version, hdr.numgroups);
+
+ vmss.cpt64bit = (hdr.id != CPTDUMP_OLD_MAGIC_NUMBER);
+ fprintf(fp, " Checkpoint is %d-bit\n", vmss.cpt64bit ? 64 : 32);
+
+ grpsize = hdr.numgroups * sizeof(cptgroupdesc);
+ grps = (cptgroupdesc *) malloc(grpsize);
+ if (grps == NULL) {
+ fprintf(fp, "Failed to allocate memory! [Error %d] %s\n",
+ errno, strerror(errno));
+ return FALSE;
+ }
+
+ if (fread(grps, sizeof(cptgroupdesc), hdr.numgroups, vmss.dfp) != hdr.numgroups) {
+ fprintf(fp, "Failed to read vmss file: [Error %d] %s\n",
+ errno, strerror(errno));
+ result = FALSE;
+ goto exit;
+ }
+
+ for (i = 0; i < hdr.numgroups; i++) {
+ if (fseek(vmss.dfp, grps[i].position, SEEK_SET) == -1) {
+ fprintf(fp, "Bad offset of VMSS Group['%s'] in vmss file at %#llx.\n",
+ grps[i].name, (ulonglong)grps[i].position);
+ continue;
+ }
+ fprintf(fp, "\nGroup: %s offset=%#llx size=0x%#llx\n",
+ grps[i].name, (ulonglong)grps[i].position, (ulonglong)grps[i].size);
+
+ for (;;) {
+ uint16_t tag;
+ char name[TAG_NAMELEN_MASK + 1];
+ unsigned nameLen;
+ unsigned nindx;
+ int idx[3];
+ unsigned j;
+ int nextgroup = FALSE;
+
+ if (fread(&tag, sizeof(tag), 1, vmss.dfp) != 1) {
+ fprintf(fp, "Cannot read tag.\n");
+ break;
+ }
+ if (tag == NULL_TAG)
+ break;
+
+ nameLen = TAG_NAMELEN(tag);
+ if (fread(name, nameLen, 1, vmss.dfp) != 1) {
+ fprintf(fp, "Cannot read tag name.\n");
+ break;
+ }
+ name[nameLen] = 0;
+ fprintf(fp, " Item %20s", name);
+
+ nindx = TAG_NINDX(tag);
+ if (nindx > 3) {
+ fprintf(fp, "Too many indexes %d (> 3).\n", nindx);
+ break;
+ }
+ idx[0] = idx[1] = idx[2] = NO_INDEX;
+ for (j= 0; j < 3; j++) {
+ if (j < nindx) {
+ if (fread(&idx[j], sizeof(idx[0]), 1, vmss.dfp) != 1) {
+ fprintf(fp, "Cannot read index.\n");
+ nextgroup = TRUE;
+ break;
+ }
+ fprintf(fp, "[%d]", idx[j]);
+ } else
+ fprintf(fp, " ");
+ }
+ if (nextgroup)
+ break;
+
+ if (IS_BLOCK_TAG(tag)) {
+ uint64_t nbytes;
+ uint64_t blockpos;
+ uint64_t nbytesinmem;
+ int compressed = IS_BLOCK_COMPRESSED_TAG(tag);
+ uint16_t padsize;
+ unsigned k, l;
+ char byte;
+
+ if (fread(&nbytes, sizeof(nbytes), 1, vmss.dfp) != 1) {
+ fprintf(fp, "Cannot read block size.\n");
+ break;
+ }
+ if (fread(&nbytesinmem, sizeof(nbytesinmem), 1, vmss.dfp) != 1) {
+ fprintf(fp, "Cannot read block memory size.\n");
+ break;
+ }
+ if (fread(&padsize, sizeof(padsize), 1, vmss.dfp) != 1) {
+ fprintf(fp, "Cannot read block padding size.\n");
+ break;
+ }
+ if ((blockpos = ftell(vmss.dfp)) == -1) {
+ fprintf(fp, "Cannot determine location within VMSS file.\n");
+ break;
+ }
+ blockpos += padsize;
+
+ fprintf(fp, " => %sBLOCK: position=%#llx size=%#llx memsize=%#llx\n",
+ compressed ? "COMPRESSED " : "",
+ (ulonglong)blockpos, (ulonglong)nbytes, (ulonglong)nbytesinmem);
+
+ if (nbytes && nbytes <= MAX_BLOCK_DUMP && !compressed) {
+ fprintf(fp, "Hex dump: \n");
+ l = 0;
+ for (k = 0; k < nbytes; k++) {
+ if (fread(&byte, 1, 1, vmss.dfp) != 1) {
+ fprintf(fp, "Cannot read byte.\n");
+ result = FALSE;
+ goto exit;
+ }
+
+ fprintf(fp, " %02hhX", byte);
+
+ if (l++ == 15) {
+ fprintf(fp, "\n");
+ l = 0;
+ }
+ }
+ if (l)
+ fprintf(fp, "\n\n");
+ else
+ fprintf(fp, "\n");
+ } else {
+ if (fseek(vmss.dfp, blockpos + nbytes, SEEK_SET) == -1) {
+ fprintf(fp, "Cannot seek past block at %#llx.\n",
+ (ulonglong)(blockpos + nbytes));
+ result = FALSE;
+ goto exit;
+ }
+ }
+ } else {
+ union {
+ uint8_t val[TAG_VALSIZE_MASK];
+ uint32_t val32;
+ uint64_t val64;
+ } u;
+ unsigned k;
+ unsigned valsize = TAG_VALSIZE(tag);
+ uint64_t blockpos = ftell(vmss.dfp);
+
+ fprintf(fp, " => position=%#llx size=%#x: ",
+ (ulonglong)blockpos, valsize);
+
+ if (fread(u.val, sizeof(u.val[0]), valsize, vmss.dfp) != valsize) {
+ fprintf(fp, "Cannot read item.\n");
+ break;
+ }
+ for (k = 0; k < valsize; k++) {
+ /* Assume Little Endian */
+ fprintf(fp, "%02X", u.val[valsize - k - 1]);
+ }
+
+
+ fprintf(fp, "\n");
+ }
+ }
+ }
+
+exit:
+ if (grps)
+ free(grps);
+
+ return result;
+}
+
+void
+dump_registers_for_vmss_dump(void)
+{
+ int i;
+ vmssregs64 *regs;
+
+ if (!machine_type("X86_64")) {
+ fprintf(fp, "-r option not supported on this dumpfile type\n");
+ return;
+ }
+
+ for (i = 0; i < vmss.num_vcpus; i++) {
+ regs = vmss.regs64[i];
+
+ if (i)
+ fprintf(fp, "\n");
+
+ fprintf(fp, "CPU %d:\n", i);
+
+ fprintf(fp, " RAX: %016llx RBX: %016llx RCX: %016llx\n",
+ regs->rax, regs->rbx, regs->rcx);
+ fprintf(fp, " RDX: %016llx RSI: %016llx RDI: %016llx\n",
+ regs->rdx, regs->rsi, regs->rdi);
+ fprintf(fp, " RSP: %016llx RBP: %016llx R8: %016llx\n",
+ regs->rsp, regs->rbp, regs->r8);
+ fprintf(fp, " R9: %016llx R10: %016llx R11: %016llx\n",
+ regs->r9, regs->r10, regs->r11);
+ fprintf(fp, " R12: %016llx R13: %016llx R14: %016llx\n",
+ regs->r12, regs->r13, regs->r14);
+ fprintf(fp, " R15: %016llx RIP: %016llx RFLAGS: %08llx\n",
+ regs->r15, regs->rip, regs->rflags);
+ fprintf(fp, " IDT: base: %016llx\n",
+ regs->idtr);
+ fprintf(fp, " CR0: %016llx CR1: %016llx CR2: %016llx\n",
+ regs->cr[0], regs->cr[1], regs->cr[2]);
+ fprintf(fp, " CR3: %016llx CR4: %016llx\n",
+ regs->cr[3], regs->cr[4]);
+ }
+}
diff --git a/vmware_vmss.h b/vmware_vmss.h
index a4b8937..41d14c3 100644
--- a/vmware_vmss.h
+++ b/vmware_vmss.h
@@ -89,6 +89,35 @@ struct memregion {
};
typedef struct memregion memregion;
+#define VMW_GPREGS_SIZE (128)
+#define VMW_CR64_SIZE (72)
+#define VMW_IDTR_SIZE (10)
+struct vmssregs64 {
+ /* read from vmss */
+ uint64_t rax;
+ uint64_t rcx;
+ uint64_t rdx;
+ uint64_t rbx;
+ uint64_t rbp;
+ uint64_t rsp;
+ uint64_t rsi;
+ uint64_t rdi;
+ uint64_t r8;
+ uint64_t r9;
+ uint64_t r10;
+ uint64_t r11;
+ uint64_t r12;
+ uint64_t r13;
+ uint64_t r14;
+ uint64_t r15;
+ /* manually managed */
+ uint64_t idtr;
+ uint64_t cr[VMW_CR64_SIZE / 8];
+ uint64_t rip;
+ uint64_t rflags;
+};
+typedef struct vmssregs64 vmssregs64;
+
#define MAX_REGIONS 3
struct vmssdata {
int32_t cpt64bit;
@@ -99,6 +128,8 @@ struct vmssdata {
memregion regions[MAX_REGIONS];
uint64_t memoffset;
uint64_t memsize;
+ uint64_t num_vcpus;
+ vmssregs64 **regs64;
};
typedef struct vmssdata vmssdata;
diff --git a/x86_64.c b/x86_64.c
index 0d5e150..7b02761 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -3273,6 +3273,8 @@ x86_64_low_budget_back_trace_cmd(struct bt_info *bt_in)
diskdump_display_regs(bt->tc->processor, ofp);
else if (SADUMP_DUMPFILE())
sadump_display_regs(bt->tc->processor, ofp);
+ else if (VMSS_DUMPFILE())
+ vmware_vmss_display_regs(bt->tc->processor, ofp);
return;
}
@@ -3295,13 +3297,16 @@ x86_64_low_budget_back_trace_cmd(struct bt_info *bt_in)
diskdump_display_regs(bt->tc->processor, ofp);
else if (SADUMP_DUMPFILE())
sadump_display_regs(bt->tc->processor, ofp);
+ else if (VMSS_DUMPFILE())
+ vmware_vmss_display_regs(bt->tc->processor, ofp);
else if (pc->flags2 & QEMU_MEM_DUMP_ELF)
display_regs_from_elf_notes(bt->tc->processor, ofp);
return;
} else if ((bt->flags & BT_KERNEL_SPACE) &&
(KVMDUMP_DUMPFILE() ||
(ELF_NOTES_VALID() && DISKDUMP_DUMPFILE()) ||
- SADUMP_DUMPFILE() || (pc->flags2 & QEMU_MEM_DUMP_ELF))) {
+ SADUMP_DUMPFILE() || (pc->flags2 & QEMU_MEM_DUMP_ELF) ||
+ VMSS_DUMPFILE())) {
fprintf(ofp, " [exception RIP: ");
if ((sp = value_search(bt->instptr, &offset))) {
fprintf(ofp, "%s", sp->name);
@@ -3317,6 +3322,8 @@ x86_64_low_budget_back_trace_cmd(struct bt_info *bt_in)
diskdump_display_regs(bt->tc->processor, ofp);
else if (SADUMP_DUMPFILE())
sadump_display_regs(bt->tc->processor, ofp);
+ else if (VMSS_DUMPFILE())
+ vmware_vmss_display_regs(bt->tc->processor, ofp);
else if (pc->flags2 & QEMU_MEM_DUMP_ELF)
display_regs_from_elf_notes(bt->tc->processor, ofp);
@@ -4941,7 +4948,7 @@ skip_stage:
if (halt_rip && halt_rsp) {
*rip = halt_rip;
*rsp = halt_rsp;
- if (KVMDUMP_DUMPFILE() || SADUMP_DUMPFILE())
+ if (KVMDUMP_DUMPFILE() || SADUMP_DUMPFILE() || VMSS_DUMPFILE())
bt_in->flags &= ~(ulonglong)BT_DUMPFILE_SEARCH;
return;
}
@@ -4986,7 +4993,7 @@ skip_stage:
machdep->get_stack_frame(bt, rip, rsp);
- if (KVMDUMP_DUMPFILE() || SADUMP_DUMPFILE())
+ if (KVMDUMP_DUMPFILE() || SADUMP_DUMPFILE() || VMSS_DUMPFILE())
bt_in->flags &= ~(ulonglong)BT_DUMPFILE_SEARCH;
}
--
2.14.3
[PATCH] vmware_vmss: read vCPUs regs and show them in 'bt'
by Sergio Lopez
VMSS dump files contain the state of each vCPU at the time the VM was
suspended. This change enables 'crash' to read the relevant registers from
each vCPU state and display them in 'bt'.
This is also the first step towards implementing KASLR offset
calculation for VMSS dump files.
---
defs.h | 3 ++
kernel.c | 2 +
vmware_vmss.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---
vmware_vmss.h | 28 +++++++++++
x86_64.c | 13 ++++--
5 files changed, 184 insertions(+), 10 deletions(-)
diff --git a/defs.h b/defs.h
index 7998ebf..0ebd38b 100644
--- a/defs.h
+++ b/defs.h
@@ -283,6 +283,7 @@ struct number_option {
#define LKCD_KERNTYPES() (pc->flags & KERNTYPES)
#define KVMDUMP_DUMPFILE() (pc->flags & KVMDUMP)
#define SADUMP_DUMPFILE() (pc->flags & SADUMP)
+#define VMSS_DUMPFILE() (pc->flags & VMWARE_VMSS)
#define NETDUMP_LOCAL (0x1) /* netdump_data flags */
#define NETDUMP_REMOTE (0x2)
@@ -6388,6 +6389,8 @@ int vmware_vmss_init(char *filename, FILE *ofp);
uint vmware_vmss_page_size(void);
int read_vmware_vmss(int, void *, int, ulong, physaddr_t);
int write_vmware_vmss(int, void *, int, ulong, physaddr_t);
+void vmware_vmss_display_regs(int, FILE *);
+void get_vmware_vmss_regs(struct bt_info *, ulong *, ulong *);
/*
* gnu_binutils.c
diff --git a/kernel.c b/kernel.c
index 1bf6251..7642217 100644
--- a/kernel.c
+++ b/kernel.c
@@ -2969,6 +2969,8 @@ back_trace(struct bt_info *bt)
get_xendump_regs(bt, &eip, &esp);
else if (SADUMP_DUMPFILE())
get_sadump_regs(bt, &eip, &esp);
+ else if (VMSS_DUMPFILE())
+ get_vmware_vmss_regs(bt, &eip, &esp);
else if (REMOTE_PAUSED()) {
if (!is_task_active(bt->task) || !get_remote_regs(bt, &eip, &esp))
machdep->get_stack_frame(bt, &eip, &esp);
diff --git a/vmware_vmss.c b/vmware_vmss.c
index 667676a..10fbe9e 100644
--- a/vmware_vmss.c
+++ b/vmware_vmss.c
@@ -24,6 +24,9 @@
/* VMware only supports X86/X86_64 virtual machines. */
#define VMW_PAGE_SIZE (4096)
#define VMW_PAGE_SHIFT (12)
+#define VMW_GPREGS_SIZE (128)
+#define VMW_CR64_SIZE (72)
+#define VMW_IDTR_SIZE (10)
static vmssdata vmss = { 0 };
@@ -128,7 +131,8 @@ vmware_vmss_init(char *filename, FILE *ofp)
DEBUG_PARSE_PRINT((ofp, LOGPRX"Group: %-20s offset=%#llx size=0x%#llx.\n",
grps[i].name, (ulonglong)grps[i].position, (ulonglong)grps[i].size));
- if (strcmp(grps[i].name, "memory") != 0) {
+ if (strcmp(grps[i].name, "memory") != 0 &&
+ (strcmp(grps[i].name, "cpu") != 0 || !machine_type("X86_64"))) {
continue;
}
@@ -198,12 +202,6 @@ vmware_vmss_init(char *filename, FILE *ofp)
}
blockpos += padsize;
- if (fseek(fp, blockpos + nbytes, SEEK_SET) == -1) {
- error(INFO, LOGPRX"Cannot seek past block at %#llx.\n",
- (ulonglong)(blockpos + nbytes));
- break;
- }
-
if (strcmp(name, "Memory") == 0) {
/* The things that we really care about...*/
vmss.memoffset = blockpos;
@@ -217,11 +215,46 @@ vmware_vmss_init(char *filename, FILE *ofp)
result = FALSE;
goto exit;
}
+
+ if (fseek(fp, blockpos + nbytes, SEEK_SET) == -1) {
+ error(INFO, LOGPRX"Cannot seek past block at %#llx.\n",
+ (ulonglong)(blockpos + nbytes));
+ break;
+ }
+ } else if (strcmp(name, "gpregs") == 0 &&
+ nbytes == VMW_GPREGS_SIZE &&
+ idx[0] < vmss.num_vcpus) {
+ int cpu = idx[0];
+
+ fread(vmss.regs64[cpu], nbytes, 1, fp);
+ } else if (strcmp(name, "CR64") == 0 &&
+ nbytes == VMW_CR64_SIZE &&
+ idx[0] < vmss.num_vcpus) {
+ int cpu = idx[0];
+ uint64_t regs[9];
+
+ fread(&regs[0], nbytes, 1, fp);
+ vmss.regs64[cpu]->cr3 = regs[3];
+ } else if (strcmp(name, "IDTR") == 0 &&
+ nbytes == VMW_IDTR_SIZE &&
+ idx[0] < vmss.num_vcpus) {
+ int cpu = idx[0];
+ char buf[10];
+
+ fread(&buf[0], nbytes, 1, fp);
+ vmss.regs64[cpu]->idtr = *((uint64_t *)(&buf[0] + 2));
+ } else {
+ if (fseek(fp, blockpos + nbytes, SEEK_SET) == -1) {
+ error(INFO, LOGPRX"Cannot seek past block at %#llx.\n",
+ (ulonglong)(blockpos + nbytes));
+ break;
+ }
}
} else {
union {
uint8_t val[TAG_VALSIZE_MASK];
uint32_t val32;
+ uint64_t val64;
} u;
unsigned k;
unsigned valsize = TAG_VALSIZE(tag);
@@ -253,6 +286,30 @@ vmware_vmss_init(char *filename, FILE *ofp)
if (strcmp(name, "align_mask") == 0) {
vmss.alignmask = u.val32;
}
+ } else if (strcmp(grps[i].name, "cpu") == 0) {
+ if (strcmp(name, "cpu:numVCPUs") == 0) {
+ if (vmss.regs64 != NULL) {
+ error(INFO, LOGPRX"Duplicated cpu:numVCPUs entry.\n");
+ break;
+ }
+
+ vmss.num_vcpus = u.val32;
+ vmss.regs64 = malloc(vmss.num_vcpus * sizeof(void *));
+
+ for (k = 0; k < vmss.num_vcpus; k++) {
+ vmss.regs64[k] = malloc(sizeof(vmssregs64));
+ memset(vmss.regs64[k], 0, sizeof(vmssregs64));
+ }
+ } else if (strcmp(name, "rip") == 0) {
+ int cpu = idx[0];
+ vmss.regs64[cpu]->rip = u.val64;
+ } else if (strcmp(name, "eflags") == 0) {
+ int cpu = idx[0];
+ vmss.regs64[cpu]->eflags |= u.val32;
+ } else if (strcmp(name, "EFLAGS") == 0) {
+ int cpu = idx[0];
+ vmss.regs64[cpu]->eflags |= u.val32;
+ }
}
DEBUG_PARSE_PRINT((ofp, "\n"));
@@ -350,3 +407,80 @@ write_vmware_vmss(int fd, void *bufptr, int cnt, ulong addr, physaddr_t paddr)
return SEEK_ERROR;
}
+void
+vmware_vmss_display_regs(int cpu, FILE *ofp)
+{
+ if (cpu >= vmss.num_vcpus)
+ return;
+
+ if (machine_type("X86_64")) {
+ fprintf(ofp,
+ " RIP: %016llx RSP: %016llx RFLAGS: %08llx\n"
+ " RAX: %016llx RBX: %016llx RCX: %016llx\n"
+ " RDX: %016llx RSI: %016llx RDI: %016llx\n"
+ " RBP: %016llx R8: %016llx R9: %016llx\n"
+ " R10: %016llx R11: %016llx R12: %016llx\n"
+ " R13: %016llx R14: %016llx R15: %016llx\n",
+ vmss.regs64[cpu]->rip,
+ vmss.regs64[cpu]->rsp,
+ vmss.regs64[cpu]->eflags,
+ vmss.regs64[cpu]->rax,
+ vmss.regs64[cpu]->rbx,
+ vmss.regs64[cpu]->rcx,
+ vmss.regs64[cpu]->rdx,
+ vmss.regs64[cpu]->rsi,
+ vmss.regs64[cpu]->rdi,
+ vmss.regs64[cpu]->rbp,
+ vmss.regs64[cpu]->r8,
+ vmss.regs64[cpu]->r9,
+ vmss.regs64[cpu]->r10,
+ vmss.regs64[cpu]->r11,
+ vmss.regs64[cpu]->r12,
+ vmss.regs64[cpu]->r13,
+ vmss.regs64[cpu]->r14,
+ vmss.regs64[cpu]->r15
+ );
+ }
+}
+
+void
+get_vmware_vmss_regs(struct bt_info *bt, ulong *ipp, ulong *spp)
+{
+ ulong ip, sp;
+
+ ip = sp = 0;
+
+ if (!is_task_active(bt->task)) {
+ machdep->get_stack_frame(bt, ipp, spp);
+ return;
+ }
+
+ bt->flags |= BT_DUMPFILE_SEARCH;
+ if (machine_type("X86_64"))
+ machdep->get_stack_frame(bt, ipp, spp);
+ else if (machine_type("X86"))
+ get_netdump_regs_x86(bt, ipp, spp);
+ if (bt->flags & BT_DUMPFILE_SEARCH)
+ return;
+
+ if ((vmss.regs64 == NULL) ||
+ (bt->tc->processor >= vmss.num_vcpus))
+ return;
+
+ ip = (ulong)vmss.regs64[bt->tc->processor]->rip;
+ sp = (ulong)vmss.regs64[bt->tc->processor]->rsp;
+ if (is_kernel_text(ip) &&
+ (((sp >= GET_STACKBASE(bt->task)) &&
+ (sp < GET_STACKTOP(bt->task))) ||
+ in_alternate_stack(bt->tc->processor, sp))) {
+ *ipp = ip;
+ *spp = sp;
+ bt->flags |= BT_KERNEL_SPACE;
+ return;
+ }
+
+ if (!is_kernel_text(ip) &&
+ in_user_stack(bt->tc->task, sp))
+ bt->flags |= BT_USER_SPACE;
+}
diff --git a/vmware_vmss.h b/vmware_vmss.h
index a4b8937..3c69a82 100644
--- a/vmware_vmss.h
+++ b/vmware_vmss.h
@@ -90,6 +90,32 @@ struct memregion {
typedef struct memregion memregion;
#define MAX_REGIONS 3
+struct vmssregs64 {
+ /* read from vmss */
+ uint64_t rax;
+ uint64_t rcx;
+ uint64_t rdx;
+ uint64_t rbx;
+ uint64_t rbp;
+ uint64_t rsp;
+ uint64_t rsi;
+ uint64_t rdi;
+ uint64_t r8;
+ uint64_t r9;
+ uint64_t r10;
+ uint64_t r11;
+ uint64_t r12;
+ uint64_t r13;
+ uint64_t r14;
+ uint64_t r15;
+ /* manually managed */
+ uint64_t idtr;
+ uint64_t cr3;
+ uint64_t rip;
+ uint64_t eflags;
+};
+typedef struct vmssregs64 vmssregs64;
+
struct vmssdata {
int32_t cpt64bit;
FILE *dfp;
@@ -99,6 +125,8 @@ struct vmssdata {
memregion regions[MAX_REGIONS];
uint64_t memoffset;
uint64_t memsize;
+ uint64_t num_vcpus;
+ vmssregs64 **regs64;
};
typedef struct vmssdata vmssdata;
diff --git a/x86_64.c b/x86_64.c
index 0d5e150..7b02761 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -3273,6 +3273,8 @@ x86_64_low_budget_back_trace_cmd(struct bt_info *bt_in)
diskdump_display_regs(bt->tc->processor, ofp);
else if (SADUMP_DUMPFILE())
sadump_display_regs(bt->tc->processor, ofp);
+ else if (VMSS_DUMPFILE())
+ vmware_vmss_display_regs(bt->tc->processor, ofp);
return;
}
@@ -3295,13 +3297,16 @@ x86_64_low_budget_back_trace_cmd(struct bt_info *bt_in)
diskdump_display_regs(bt->tc->processor, ofp);
else if (SADUMP_DUMPFILE())
sadump_display_regs(bt->tc->processor, ofp);
+ else if (VMSS_DUMPFILE())
+ vmware_vmss_display_regs(bt->tc->processor, ofp);
else if (pc->flags2 & QEMU_MEM_DUMP_ELF)
display_regs_from_elf_notes(bt->tc->processor, ofp);
return;
} else if ((bt->flags & BT_KERNEL_SPACE) &&
(KVMDUMP_DUMPFILE() ||
(ELF_NOTES_VALID() && DISKDUMP_DUMPFILE()) ||
- SADUMP_DUMPFILE() || (pc->flags2 & QEMU_MEM_DUMP_ELF))) {
+ SADUMP_DUMPFILE() || (pc->flags2 & QEMU_MEM_DUMP_ELF) ||
+ VMSS_DUMPFILE())) {
fprintf(ofp, " [exception RIP: ");
if ((sp = value_search(bt->instptr, &offset))) {
fprintf(ofp, "%s", sp->name);
@@ -3317,6 +3322,8 @@ x86_64_low_budget_back_trace_cmd(struct bt_info *bt_in)
diskdump_display_regs(bt->tc->processor, ofp);
else if (SADUMP_DUMPFILE())
sadump_display_regs(bt->tc->processor, ofp);
+ else if (VMSS_DUMPFILE())
+ vmware_vmss_display_regs(bt->tc->processor, ofp);
else if (pc->flags2 & QEMU_MEM_DUMP_ELF)
display_regs_from_elf_notes(bt->tc->processor, ofp);
@@ -4941,7 +4948,7 @@ skip_stage:
if (halt_rip && halt_rsp) {
*rip = halt_rip;
*rsp = halt_rsp;
- if (KVMDUMP_DUMPFILE() || SADUMP_DUMPFILE())
+ if (KVMDUMP_DUMPFILE() || SADUMP_DUMPFILE() || VMSS_DUMPFILE())
bt_in->flags &= ~(ulonglong)BT_DUMPFILE_SEARCH;
return;
}
@@ -4986,7 +4993,7 @@ skip_stage:
machdep->get_stack_frame(bt, rip, rsp);
- if (KVMDUMP_DUMPFILE() || SADUMP_DUMPFILE())
+ if (KVMDUMP_DUMPFILE() || SADUMP_DUMPFILE() || VMSS_DUMPFILE())
bt_in->flags &= ~(ulonglong)BT_DUMPFILE_SEARCH;
}
--
2.14.3
[PATCH v2 0/4] Generalize KASLR calculation and use it for KDUMPs
by Sergio Lopez
Commit 45b74b89530d611b3fa95a1041e158fbb865fa84 added support for
calculating phys_base and kernel offset for KASLR-enabled kernels on
SADUMPs by using a technique developed by Takao Indoh. Originally, the
patchset included support for KDUMPs, but this was dropped in v2, as it
was deemed unnecessary due to the implementation of the vmcoreinfo
device in QEMU.
Sadly, there are many reasons why the vmcoreinfo device may not be
present at the moment the memory dump is taken from a VM, ranging from
a Host running older QEMU/libvirt versions, to misconfigured VMs, to
environments running Hypervisors that don't support this device.
This patchset generalizes the KASLR-related functions from sadump.c by
moving them to kaslr_helper.c, and makes KDUMP analysis fall back to
KASLR offset calculation if vmcoreinfo data is missing.
These changes have been successfully tested with a 3.10.0-830.el7.x86_64
kernel under the following conditions:
- kdump with KASLR and vmcoreinfo
- kdump with KASLR but no vmcoreinfo
- kdump without KASLR ("nokaslr" kernel command line option)
It was also verified that a "crash" binary patched with these changes still
builds and runs (live and kdump debugging) on an aarch64 machine.
changelog:
v2:
- Limit application to QEMU ELF and QEMU COMPRESSED dumps (thanks Dave)
- Add support for QEMU COMPRESSED dumps (diskdump)
Sergio Lopez (4):
Move kaslr related functions from sadump.c to kaslr_helper.c
Move QEMUCPU* structs from netdump.h to defs.h
netdump: infer kaslr offset for QEMU ELF dumps without vmcoreinfo
diskdump: infer kaslr offset for QEMU COMPRESSED dumps without
vmcoreinfo
Makefile | 7 +-
defs.h | 43 +++++
diskdump.c | 96 ++++++++++++
kaslr_helper.c | 473 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
netdump.c | 79 ++++++++++
netdump.h | 24 +--
sadump.c | 487 +++++----------------------------------------------------
symbols.c | 26 ++-
x86_64.c | 29 +++-
9 files changed, 782 insertions(+), 482 deletions(-)
create mode 100644 kaslr_helper.c
--
2.14.3
modules loaded from wrong directory
by Olaf Hering
When trying to run crash 7.2 like this, the 'mod -s $mod' command loads
the modules from the running system instead of the specified directory:
d=${0%/*}
tee $t <<_EOF_
mod -s xen-kbdfront
mod -s xen_kbdfront
exit
_EOF_
strace -f -s 123 -tt -o /dev/shm/crash.trace.txt \
crash \
-i $t \
--mod "${d}" \
"${d}"/boot/vmlinux-*-default.gz \
"$1" \
"${d}"/usr/lib/debug/boot/vmlinux-*-default.debug
In the strace log I see a search for xen_kbdfront.ko, later for
xen-kbdfront.ko. Unfortunately /lib/modules/`uname -r` is searched before $d.
Now that I've read the man page once more, I notice that --mod is just for
debug info?
It seems $d/boot is considered as well, likely because the kernel is
stored in that directory.
I wonder why there is no option to tell crash to only operate below a
certain directory when looking for module-related things.
"--mod" seems to have an effect, if the directory exists it is searched.
How can I tell crash to only operate below $d?
Olaf
[PATCH 0/2] Generalize KASLR calculation and use it for KDUMPs
by Sergio Lopez
Commit 45b74b89530d611b3fa95a1041e158fbb865fa84 added support for
calculating phys_base and kernel offset for KASLR-enabled kernels on
SADUMPs by using a technique developed by Takao Indoh. Originally, the
patchset included support for KDUMPs, but this was dropped in v2, as it
was deemed unnecessary due to the implementation of the vmcoreinfo
device in QEMU.
Sadly, there are many reasons why the vmcoreinfo device may not be
present at the moment the memory dump is taken from a VM, ranging from
a Host running older QEMU/libvirt versions, to misconfigured VMs, to
environments running Hypervisors that don't support this device.
This patchset generalizes the KASLR-related functions from sadump.c by
moving them to kaslr_helper.c, and makes KDUMP analysis fall back to
KASLR offset calculation if vmcoreinfo data is missing.
These changes have been successfully tested with a 3.10.0-830.el7.x86_64
kernel under the following conditions:
- kdump with KASLR and vmcoreinfo
- kdump with KASLR but no vmcoreinfo
- kdump without KASLR ("nokaslr" kernel command line option)
It was also verified that a "crash" binary patched with these changes still
builds and runs (live and kdump debugging) on an aarch64 machine.
Sergio Lopez (2):
Move kaslr related functions from sadump.c to kaslr_helper.c
kdump: if vmcoreinfo is missing, try to infer kaslr offset
Makefile | 7 +-
defs.h | 15 ++
kaslr_helper.c | 466 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
netdump.c | 73 +++++++++
netdump.h | 1 +
sadump.c | 487 +++++----------------------------------------------------
symbols.c | 29 ++--
x86_64.c | 14 +-
8 files changed, 629 insertions(+), 463 deletions(-)
create mode 100644 kaslr_helper.c
--
2.14.3
Re: [Crash-utility] [PATCH v3 0/2] Fix KASLR problem on sadump
by Sergio Lopez
Hi,
I know support for guessing the KASLR offset for kdump/disk images was
deliberately dropped from this patchset, because upstream was also working on
the vmcoreinfo device, but I think having that would be *really* useful.
The vmcoreinfo device solution requires explicit support in the virtualization
Hosts, and it'll take a while until its presence has been generalized among
products and users. Meanwhile, we already have VMs with KASLR-enabled kernels.
So I'd like to ask if a patchset extending the current state of KASLR offset
guessing for sadump to kdump/diskimages would be considered for merging.
Thanks,
Sergio.
[PATCH v3] Speed up "kmem -[sS]" by optimizing is_page_ptr() for x86_64
by Kazuhito Hagio
changes v2 -> v3:
- move the setting point of machdep->is_page_ptr to machdep_init(SETUP_ENV)
changes v1 -> v2:
- rewrite based on the per-architecture function call Dave provided
- remove the part which used page.flags for non-VMEMMAP kernels
- add address range/position check first
- remove/optimize the calculations of mem_map and phys address
- modify the patch description
The "kmem -[sS]" commands can take several minutes to complete with
the following conditions:
- The system has a lot of memory sections with CONFIG_SPARSEMEM, and
- The kernel uses SLUB and it has a very long partial slab list.
crash> kmem -s dentry
CACHE NAME OBJSIZE ALLOCATED TOTAL SLABS SSIZE
ffff88017fc78a00 dentry 192 9038949 10045728 239184 8k
crash> kmem -s dentry | bash -c 'cat >/dev/null ; echo $SECONDS'
133
crash> kmem -S dentry | bash -c 'cat >/dev/null ; echo $SECONDS'
656
One of the causes is that is_page_ptr(), called from count_partial(), determines
whether a given SLUB page address is a page struct by linearly searching all
available mem_sections for the one which includes it.
With CONFIG_SPARSEMEM_VMEMMAP on x86_64, we can do that by checking
its address range and whether its calculated mem_section is valid.
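Concretely, the new check is just a range test, an alignment test, and a
shift. A small standalone illustration follows; the constants are the usual
x86_64 values and are assumptions of this sketch, while the real code uses
VMEMMAP_VADDR, SIZE(page) and pfn_to_section_nr() from crash's machine
tables:

#define VMEMMAP_BASE		0xffffea0000000000UL	/* illustrative */
#define VMEMMAP_LIMIT		0xffffeaffffffffffUL	/* illustrative */
#define STRUCT_PAGE_SIZE	64UL	/* typical sizeof(struct page) */
#define PAGE_SHIFT_X86_64	12
#define SECTION_SIZE_BITS	27	/* 128 MB sections on x86_64 */

static int
looks_like_page_ptr(unsigned long addr, unsigned long *section_nr)
{
	unsigned long pfn;

	/* Must sit inside the vmemmap array, on a struct page boundary. */
	if (addr < VMEMMAP_BASE || addr > VMEMMAP_LIMIT ||
	    (addr - VMEMMAP_BASE) % STRUCT_PAGE_SIZE)
		return 0;

	pfn = (addr - VMEMMAP_BASE) / STRUCT_PAGE_SIZE;
	*section_nr = pfn >> (SECTION_SIZE_BITS - PAGE_SHIFT_X86_64);
	return 1;	/* the caller still checks valid_section_nr() */
}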
With this patch, the amount of computation is significantly reduced
in that case.
crash> kmem -s dentry | bash -c 'cat >/dev/null ; echo $SECONDS'
1
crash> kmem -S dentry | bash -c 'cat >/dev/null ; echo $SECONDS'
1
Signed-off-by: Kazuhito Hagio <k-hagio(a)ab.jp.nec.com>
---
defs.h | 1 +
x86_64.c | 23 +++++++++++++++++++++++
2 files changed, 24 insertions(+)
diff --git a/defs.h b/defs.h
index 9663bd8..7998ebf 100644
--- a/defs.h
+++ b/defs.h
@@ -5133,6 +5133,7 @@ int vaddr_type(ulong, struct task_context *);
char *format_stack_entry(struct bt_info *bt, char *, ulong, ulong);
int in_user_stack(ulong, ulong);
int dump_inode_page(ulong);
+ulong valid_section_nr(ulong);
/*
diff --git a/x86_64.c b/x86_64.c
index 7449571..0d5e150 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -77,6 +77,7 @@ static void x86_64_calc_phys_base(void);
static int x86_64_is_module_addr(ulong);
static int x86_64_is_kvaddr(ulong);
static int x86_64_is_uvaddr(ulong, struct task_context *);
+static int x86_64_is_page_ptr(ulong, physaddr_t *);
static ulong *x86_64_kpgd_offset(ulong, int, int);
static ulong x86_64_upgd_offset(struct task_context *, ulong, int, int);
static ulong x86_64_upgd_offset_legacy(struct task_context *, ulong, int, int);
@@ -156,6 +157,7 @@ x86_64_init(int when)
{
case SETUP_ENV:
machdep->process_elf_notes = x86_process_elf_notes;
+ machdep->is_page_ptr = x86_64_is_page_ptr;
break;
case PRE_SYMTAB:
machdep->verify_symbol = x86_64_verify_symbol;
@@ -802,6 +804,7 @@ x86_64_dump_machdep_table(ulong arg)
fprintf(fp, " get_smp_cpus: x86_64_get_smp_cpus()\n");
fprintf(fp, " is_kvaddr: x86_64_is_kvaddr()\n");
fprintf(fp, " is_uvaddr: x86_64_is_uvaddr()\n");
+ fprintf(fp, " is_page_ptr: x86_64_is_page_ptr()\n");
fprintf(fp, " verify_paddr: x86_64_verify_paddr()\n");
fprintf(fp, " get_kvaddr_ranges: x86_64_get_kvaddr_ranges()\n");
fprintf(fp, " init_kernel_pgd: x86_64_init_kernel_pgd()\n");
@@ -1594,6 +1597,26 @@ x86_64_is_uvaddr(ulong addr, struct task_context *tc)
return (addr < USERSPACE_TOP);
}
+static int
+x86_64_is_page_ptr(ulong addr, physaddr_t *phys)
+{
+ ulong pfn, nr;
+
+ if (IS_SPARSEMEM() && (machdep->flags & VMEMMAP) &&
+ (addr >= VMEMMAP_VADDR && addr <= VMEMMAP_END) &&
+ !((addr - VMEMMAP_VADDR) % SIZE(page))) {
+
+ pfn = (addr - VMEMMAP_VADDR) / SIZE(page);
+ nr = pfn_to_section_nr(pfn);
+ if (valid_section_nr(nr)) {
+ if (phys)
+ *phys = PTOB(pfn);
+ return TRUE;
+ }
+ }
+ return FALSE;
+}
+
/*
* Find the kernel pgd entry..
* pgd = pgd_offset_k(addr);
--
1.8.3.1
[PATCH v2] Speed up "kmem -[sS]" by optimizing is_page_ptr() for x86_64
by Kazuhito Hagio
Hi Dave,
Thank you very much for merging the infrastructure.
I rewrote the patch based on it and tested this with some dumpfiles.
---
Changes from v1:
- rewrite based on the per-architecture function call Dave provided
- remove the part which used page.flags for non-VMEMMAP kernels
- add address range/position check first
- remove/optimize the calculations of mem_map and phys address
- modify the patch description
The "kmem -[sS]" commands can take several minutes to complete with
the following conditions:
- The system has a lot of memory sections with CONFIG_SPARSEMEM, and
- The kernel uses SLUB and it has a very long partial slab list.
crash> kmem -s dentry
CACHE NAME OBJSIZE ALLOCATED TOTAL SLABS SSIZE
ffff88017fc78a00 dentry 192 9038949 10045728 239184 8k
crash> kmem -s dentry | bash -c 'cat >/dev/null ; echo $SECONDS'
133
crash> kmem -S dentry | bash -c 'cat >/dev/null ; echo $SECONDS'
656
One of the causes is that is_page_ptr(), called from count_partial(), determines
whether a given SLUB page address is a page struct by linearly searching all
available mem_sections for the one which includes it.
With CONFIG_SPARSEMEM_VMEMMAP on x86_64, we can do that by checking
its address range and whether its calculated mem_section is valid.
With this patch, the amount of computation is significantly reduced
in that case.
crash> kmem -s dentry | bash -c 'cat >/dev/null ; echo $SECONDS'
1
crash> kmem -S dentry | bash -c 'cat >/dev/null ; echo $SECONDS'
1
Signed-off-by: Kazuhito Hagio <k-hagio(a)ab.jp.nec.com>
---
defs.h | 1 +
x86_64.c | 23 +++++++++++++++++++++++
2 files changed, 24 insertions(+)
diff --git a/defs.h b/defs.h
index 9663bd8..7998ebf 100644
--- a/defs.h
+++ b/defs.h
@@ -5133,6 +5133,7 @@ int vaddr_type(ulong, struct task_context *);
char *format_stack_entry(struct bt_info *bt, char *, ulong, ulong);
int in_user_stack(ulong, ulong);
int dump_inode_page(ulong);
+ulong valid_section_nr(ulong);
/*
diff --git a/x86_64.c b/x86_64.c
index 7449571..67cc528 100644
--- a/x86_64.c
+++ b/x86_64.c
@@ -77,6 +77,7 @@ static void x86_64_calc_phys_base(void);
static int x86_64_is_module_addr(ulong);
static int x86_64_is_kvaddr(ulong);
static int x86_64_is_uvaddr(ulong, struct task_context *);
+static int x86_64_is_page_ptr(ulong, physaddr_t *);
static ulong *x86_64_kpgd_offset(ulong, int, int);
static ulong x86_64_upgd_offset(struct task_context *, ulong, int, int);
static ulong x86_64_upgd_offset_legacy(struct task_context *, ulong, int, int);
@@ -624,6 +625,7 @@ x86_64_init(int when)
_MAX_PHYSMEM_BITS_2_6_26;
}
}
+ machdep->is_page_ptr = x86_64_is_page_ptr;
if (XEN()) {
if (kt->xen_flags & WRITABLE_PAGE_TABLES) {
@@ -802,6 +804,7 @@ x86_64_dump_machdep_table(ulong arg)
fprintf(fp, " get_smp_cpus: x86_64_get_smp_cpus()\n");
fprintf(fp, " is_kvaddr: x86_64_is_kvaddr()\n");
fprintf(fp, " is_uvaddr: x86_64_is_uvaddr()\n");
+ fprintf(fp, " is_page_ptr: x86_64_is_page_ptr()\n");
fprintf(fp, " verify_paddr: x86_64_verify_paddr()\n");
fprintf(fp, " get_kvaddr_ranges: x86_64_get_kvaddr_ranges()\n");
fprintf(fp, " init_kernel_pgd: x86_64_init_kernel_pgd()\n");
@@ -1594,6 +1597,26 @@ x86_64_is_uvaddr(ulong addr, struct task_context *tc)
return (addr < USERSPACE_TOP);
}
+static int
+x86_64_is_page_ptr(ulong addr, physaddr_t *phys)
+{
+ ulong pfn, nr;
+
+ if (IS_SPARSEMEM() && (machdep->flags & VMEMMAP) &&
+ (addr >= VMEMMAP_VADDR && addr <= VMEMMAP_END) &&
+ !((addr - VMEMMAP_VADDR) % SIZE(page))) {
+
+ pfn = (addr - VMEMMAP_VADDR) / SIZE(page);
+ nr = pfn_to_section_nr(pfn);
+ if (valid_section_nr(nr)) {
+ if (phys)
+ *phys = PTOB(pfn);
+ return TRUE;
+ }
+ }
+ return FALSE;
+}
+
/*
* Find the kernel pgd entry..
* pgd = pgd_offset_k(addr);
--
1.8.3.1
[PATCH] Speed up "kmem -[sS]" by optimizing is_page_ptr()
by k-hagio@ab.jp.nec.com
Hi,
The "kmem -[sS]" commands can take several minutes to complete with
the following conditions:
* The system has a lot of memory sections with CONFIG_SPARSEMEM.
* The kernel uses SLUB and it has a very long partial slab list.
crash> kmem -s dentry | awk '{print strftime("%T"), $0}'
10:18:34 CACHE NAME OBJSIZE ALLOCATED TOTAL SLABS SSIZE
10:19:41 ffff88017fc78a00 dentry 192 9038949 10045728 239184 8k
crash> kmem -S dentry | bash -c 'cat >/dev/null ; echo $SECONDS'
334
One of the causes is that is_page_ptr() in count_partial() checks if
a given slub page address is a page struct by searching all memory
sections linearly for the one which includes it.
nr_mem_sections = NR_MEM_SECTIONS();
for (nr = 0; nr < nr_mem_sections ; nr++) {
if ((sec_addr = valid_section_nr(nr))) {
...
With CONFIG_SPARSEMEM{_VMEMMAP}, we can calculate the memory section
which includes a page struct from its page.flags, or from its address and
VMEMMAP_VADDR. With this patch doing exactly that, the amount of computation
is significantly reduced in that case.
crash> kmem -s dentry | awk '{print strftime("%T"), $0}'
10:34:55 CACHE NAME OBJSIZE ALLOCATED TOTAL SLABS SSIZE
10:34:55 ffff88017fc78a00 dentry 192 9038949 10045728 239184 8k
crash> kmem -S dentry | bash -c 'cat >/dev/null ; echo $SECONDS'
2
This patch uses VMEMMAP_VADDR. It is not defined on PPC64, but it looks
like PPC64 supports the VMEMMAP flag and machdep->machspec->vmemmap_base holds
that address, so this patch also defines it for PPC64. This might need some
review from the PPC folks.
Signed-off-by: Kazuhito Hagio <k-hagio(a)ab.jp.nec.com>
---
defs.h | 2 ++
memory.c | 15 +++++++++++++++
2 files changed, 17 insertions(+)
diff --git a/defs.h b/defs.h
index aa17792..84e68ca 100644
--- a/defs.h
+++ b/defs.h
@@ -3861,6 +3861,8 @@ struct efi_memory_desc_t {
#define IS_VMALLOC_ADDR(X) machdep->machspec->is_vmaddr(X)
#define KERNELBASE machdep->pageoffset
+#define VMEMMAP_VADDR (machdep->machspec->vmemmap_base)
+
#define PGDIR_SHIFT (machdep->pageshift + (machdep->pageshift -3) + (machdep->pageshift - 2))
#define PMD_SHIFT (machdep->pageshift + (machdep->pageshift - 3))
diff --git a/memory.c b/memory.c
index 0df8ecc..0696763 100644
--- a/memory.c
+++ b/memory.c
@@ -13348,10 +13348,25 @@ is_page_ptr(ulong addr, physaddr_t *phys)
ulong nr_mem_sections;
ulong coded_mem_map, mem_map, end_mem_map;
physaddr_t section_paddr;
+#ifdef VMEMMAP_VADDR
+ ulong flags;
+#endif
if (IS_SPARSEMEM()) {
nr_mem_sections = NR_MEM_SECTIONS();
+#ifdef VMEMMAP_VADDR
+ nr = nr_mem_sections;
+ if (machdep->flags & VMEMMAP)
+ nr = pfn_to_section_nr((addr - VMEMMAP_VADDR) / SIZE(page));
+ else if (readmem(addr + OFFSET(page_flags), KVADDR, &flags,
+ sizeof(ulong), "page.flags", RETURN_ON_ERROR|QUIET))
+ nr = (flags >> (SIZE(page_flags)*8 - SECTIONS_SHIFT())
+ & ((1UL << SECTIONS_SHIFT()) - 1));
+
+ if (nr < nr_mem_sections) {
+#else
for (nr = 0; nr < nr_mem_sections ; nr++) {
+#endif
if ((sec_addr = valid_section_nr(nr))) {
coded_mem_map = section_mem_map_addr(sec_addr);
mem_map = sparse_decode_mem_map(coded_mem_map, nr);
--
1.8.3.1