[PATCH] ppc64: increase MAX_PHYSMEM_BITS to 128TB
by Hari Bathini
With kernel commit 7d4340bb92a9 ("powerpc/mm: Increase MAX_PHYSMEM_BITS
to 128TB with SPARSEMEM_VMEMMAP config"), MAX_PHYSMEM_BITS is bumped up
to 47. Make the appropriate update here.
Signed-off-by: Hari Bathini <hbathini(a)linux.ibm.com>
---
defs.h | 1 +
ppc64.c | 5 ++++-
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/defs.h b/defs.h
index 80c61ef..5b64bb7 100644
--- a/defs.h
+++ b/defs.h
@@ -4073,6 +4073,7 @@ struct efi_memory_desc_t {
#define _SECTION_SIZE_BITS 24
#define _MAX_PHYSMEM_BITS 44
#define _MAX_PHYSMEM_BITS_3_7 46
+#define _MAX_PHYSMEM_BITS_4_19 47
#endif /* PPC64 */
diff --git a/ppc64.c b/ppc64.c
index 8badcde..ee2f76f 100644
--- a/ppc64.c
+++ b/ppc64.c
@@ -554,7 +554,10 @@ ppc64_init(int when)
ppc64_vmemmap_init();
machdep->section_size_bits = _SECTION_SIZE_BITS;
- if (THIS_KERNEL_VERSION >= LINUX(3,7,0))
+ if ((machdep->flags & VMEMMAP) &&
+ (THIS_KERNEL_VERSION >= LINUX(4,19,0)))
+ machdep->max_physmem_bits = _MAX_PHYSMEM_BITS_4_19;
+ else if (THIS_KERNEL_VERSION >= LINUX(3,7,0))
machdep->max_physmem_bits = _MAX_PHYSMEM_BITS_3_7;
else
machdep->max_physmem_bits = _MAX_PHYSMEM_BITS;
6 years, 2 months
[PATCH] dev: add PCI information in recently kernel.
by Masayoshi Mizuma
From: Masayoshi Mizuma <m.mizuma(a)jp.fujitsu.com>
dev -p is meant to show the PCI information; however, it works
on old kernels only. This patch makes it available on recent kernels.
And also it will show the PCI BUS information. The BUS information
may be useful for investigation of PCI hotplug issue to track the
PCI bridge.
Signed-off-by: Masayoshi Mizuma <m.mizuma(a)jp.fujitsu.com>
---
defs.h | 11 ++
dev.c | 333 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 338 insertions(+), 6 deletions(-)
diff --git a/defs.h b/defs.h
index d6492c5..e7d9bb2 100644
--- a/defs.h
+++ b/defs.h
@@ -1624,11 +1624,20 @@ struct offset_table { /* stash of commonly-used offsets */
long pci_dev_global_list;
long pci_dev_next;
long pci_dev_bus;
+ long pci_dev_dev;
long pci_dev_devfn;
long pci_dev_class;
long pci_dev_device;
+ long pci_dev_hdr_type;
+ long pci_dev_pcie_flags_reg;
long pci_dev_vendor;
long pci_bus_number;
+ long pci_bus_node;
+ long pci_bus_devices;
+ long pci_bus_dev;
+ long pci_bus_children;
+ long pci_bus_parent;
+ long pci_bus_self;
long resource_entry_t_from;
long resource_entry_t_num;
long resource_entry_t_name;
@@ -1832,6 +1841,7 @@ struct offset_table { /* stash of commonly-used offsets */
long class_private_devices;
long device_knode_class;
long device_node;
+ long device_kobj;
long gendisk_dev;
long gendisk_kobj;
long gendisk_part0;
@@ -1841,6 +1851,7 @@ struct offset_table { /* stash of commonly-used offsets */
long klist_node_n_klist;
long klist_node_n_node;
long kobject_entry;
+ long kobject_name;
long kset_list;
long request_list_count;
long request_queue_in_flight;
diff --git a/dev.c b/dev.c
index 3db898a..7ce2422 100644
--- a/dev.c
+++ b/dev.c
@@ -24,6 +24,7 @@ static void dump_blkdevs_v3(ulong);
static ulong search_cdev_map_probes(char *, int, int, ulong *);
static ulong search_bdev_map_probes(char *, int, int, ulong *);
static void do_pci(void);
+static void do_pci2(void);
static void do_io(void);
static void do_resource_list(ulong, char *, int);
@@ -51,11 +52,23 @@ dev_init(void)
MEMBER_OFFSET_INIT(pci_dev_global_list, "pci_dev", "global_list");
MEMBER_OFFSET_INIT(pci_dev_next, "pci_dev", "next");
MEMBER_OFFSET_INIT(pci_dev_bus, "pci_dev", "bus");
+ MEMBER_OFFSET_INIT(pci_dev_dev, "pci_dev", "dev");
MEMBER_OFFSET_INIT(pci_dev_devfn, "pci_dev", "devfn");
MEMBER_OFFSET_INIT(pci_dev_class, "pci_dev", "class");
MEMBER_OFFSET_INIT(pci_dev_device, "pci_dev", "device");
+ MEMBER_OFFSET_INIT(pci_dev_hdr_type, "pci_dev", "hdr_type");
+ MEMBER_OFFSET_INIT(pci_dev_pcie_flags_reg, "pci_dev", "pcie_flags_reg");
MEMBER_OFFSET_INIT(pci_dev_vendor, "pci_dev", "vendor");
MEMBER_OFFSET_INIT(pci_bus_number, "pci_bus", "number");
+ MEMBER_OFFSET_INIT(pci_bus_node, "pci_bus", "node");
+ MEMBER_OFFSET_INIT(pci_bus_devices, "pci_bus", "devices");
+ MEMBER_OFFSET_INIT(pci_bus_dev, "pci_bus", "dev");
+ MEMBER_OFFSET_INIT(pci_bus_children, "pci_bus", "children");
+ MEMBER_OFFSET_INIT(pci_bus_parent, "pci_bus", "parent");
+ MEMBER_OFFSET_INIT(pci_bus_self, "pci_bus", "self");
+
+ MEMBER_OFFSET_INIT(device_kobj, "device", "kobj");
+ MEMBER_OFFSET_INIT(kobject_name, "kobject", "name");
STRUCT_SIZE_INIT(resource, "resource");
if ((VALID_STRUCT(resource) && symbol_exists("do_resource_list")) ||
@@ -114,10 +127,14 @@ cmd_dev(void)
return;
case 'p':
- if (machine_type("S390X") ||
- (THIS_KERNEL_VERSION >= LINUX(2,6,26)))
+ if (machine_type("S390X"))
+ option_not_supported(c);
+ if (symbol_exists("pci_devices"))
+ do_pci();
+ else if (symbol_exists("pci_root_buses"))
+ do_pci2();
+ else
option_not_supported(c);
- do_pci();
return;
default:
@@ -2217,6 +2234,313 @@ do_resource_list(ulong first_entry, char *resource_buf, int size)
#endif /* USE_2_2_17_PCI_H */
+#define PCI_EXP_FLAGS_TYPE 0x00f0 /* Device/Port type */
+#define PCI_EXP_TYPE_ENDPOINT 0x0 /* Express Endpoint */
+#define PCI_EXP_TYPE_LEG_END 0x1 /* Legacy Endpoint */
+#define PCI_EXP_TYPE_ROOT_PORT 0x4 /* Root Port */
+#define PCI_EXP_TYPE_UPSTREAM 0x5 /* Upstream Port */
+#define PCI_EXP_TYPE_DOWNSTREAM 0x6 /* Downstream Port */
+#define PCI_EXP_TYPE_PCI_BRIDGE 0x7 /* PCIe to PCI/PCI-X Bridge */
+#define PCI_EXP_TYPE_PCIE_BRIDGE 0x8 /* PCI/PCI-X to PCIe Bridge */
+#define PCI_EXP_TYPE_RC_END 0x9 /* Root Complex Integrated Endpoint */
+#define PCI_EXP_TYPE_RC_EC 0xa /* Root Complex Event Collector */
+
+static void
+fill_dev_name(ulong pci_dev, char *name)
+{
+ ulong kobj, value;
+
+ memset(name, 0, sizeof(*name) * BUFSIZE);
+
+ kobj = pci_dev + OFFSET(pci_dev_dev) + OFFSET(device_kobj);
+
+ readmem(kobj + OFFSET(kobject_name),
+ KVADDR, &value, sizeof(void *), "kobject name",
+ FAULT_ON_ERROR);
+
+ read_string(value, name, BUFSIZE-1);
+}
+
+static void
+fill_bus_name(ulong pci_bus, char *name)
+{
+ ulong kobj, value;
+
+ memset(name, 0, sizeof(*name) * BUFSIZE);
+
+ kobj = pci_bus + OFFSET(pci_bus_dev) + OFFSET(device_kobj);
+
+ readmem(kobj + OFFSET(kobject_name),
+ KVADDR, &value, sizeof(void *), "kobject name",
+ FAULT_ON_ERROR);
+
+ read_string(value, name, BUFSIZE-1);
+}
+
+static void
+fill_dev_id(ulong pci_dev, char *id)
+{
+ unsigned short device, vendor;
+
+ memset(id, 0, sizeof(*id) * BUFSIZE);
+
+ readmem(pci_dev + OFFSET(pci_dev_device),
+ KVADDR, &device, sizeof(short), "pci dev device",
+ FAULT_ON_ERROR);
+ readmem(pci_dev + OFFSET(pci_dev_vendor), KVADDR,
+ &vendor, sizeof(short), "pci dev vendor", FAULT_ON_ERROR);
+
+ sprintf(id, "%x:%x", vendor, device);
+}
+
+static void
+fill_dev_class(ulong pci_dev, char *c)
+{
+ unsigned int class;
+
+ memset(c, 0, sizeof(*c) * BUFSIZE);
+ readmem(pci_dev + OFFSET(pci_dev_class), KVADDR,
+ &class, sizeof(int), "pci class", FAULT_ON_ERROR);
+
+ class >>= 8;
+
+ sprintf(c, "%04x", class);
+}
+
+static int
+pci_pcie_type(ulong cap)
+{
+ return (cap & PCI_EXP_FLAGS_TYPE) >> 4;
+}
+
+static int
+pci_is_bridge(unsigned char hdr_type)
+{
+ return hdr_type == PCI_HEADER_TYPE_BRIDGE ||
+ hdr_type == PCI_HEADER_TYPE_CARDBUS;
+}
+
+static void
+fill_pcie_type(ulong pcidev, char *t)
+{
+ int type, bufidx = 0;
+ unsigned short pciecap;
+ unsigned char hdr_type;
+
+ memset(t, 0, sizeof(*t) * BUFSIZE);
+
+ readmem(pcidev + OFFSET(pci_dev_hdr_type), KVADDR, &hdr_type,
+ sizeof(char), "pci dev hdr_type", FAULT_ON_ERROR);
+
+ if (!VALID_MEMBER(pci_dev_pcie_flags_reg))
+ goto bridge_chk;
+
+ readmem(pcidev + OFFSET(pci_dev_pcie_flags_reg), KVADDR, &pciecap,
+ sizeof(unsigned short), "pci dev pcie_flags_reg", FAULT_ON_ERROR);
+
+ type = pci_pcie_type(pciecap);
+
+ if (type == PCI_EXP_TYPE_ENDPOINT)
+ bufidx = sprintf(t, "ENDPOINT");
+ else if (type == PCI_EXP_TYPE_LEG_END)
+ bufidx = sprintf(t, "LEG_END");
+ else if (type == PCI_EXP_TYPE_ROOT_PORT)
+ bufidx = sprintf(t, "ROOT_PORT");
+ else if (type == PCI_EXP_TYPE_UPSTREAM)
+ bufidx = sprintf(t, "UPSTREAM");
+ else if (type == PCI_EXP_TYPE_DOWNSTREAM)
+ bufidx = sprintf(t, "DOWNSTREAM");
+ else if (type == PCI_EXP_TYPE_PCI_BRIDGE)
+ bufidx = sprintf(t, "PCI_BRIDGE");
+ else if (type == PCI_EXP_TYPE_PCIE_BRIDGE)
+ bufidx = sprintf(t, "PCIE_BRIDGE");
+ else if (type == PCI_EXP_TYPE_RC_END)
+ bufidx = sprintf(t, "RC_END");
+ else if (type == PCI_EXP_TYPE_RC_EC)
+ bufidx = sprintf(t, "RC_EC");
+
+bridge_chk:
+ if (pci_is_bridge(hdr_type))
+ sprintf(t + bufidx, " [BRIDGE]");
+}
+
+static void
+walk_devices(ulong pci_bus)
+{
+ struct list_data list_data, *ld;
+ int devcnt, i;
+ ulong *devlist, self;
+ char name[BUFSIZE], class[BUFSIZE], id[BUFSIZE], type[BUFSIZE];
+ char pcidev_hdr[BUFSIZE];
+ char buf1[BUFSIZE];
+ char buf2[BUFSIZE];
+ char buf3[BUFSIZE];
+ char buf4[BUFSIZE];
+ char buf5[BUFSIZE];
+
+ ld = &list_data;
+
+ BZERO(ld, sizeof(struct list_data));
+
+ readmem(pci_bus + OFFSET(pci_bus_devices), KVADDR,
+ &ld->start, sizeof(void *), "pci bus devices",
+ FAULT_ON_ERROR);
+
+ if (VALID_MEMBER(pci_dev_pcie_flags_reg))
+ snprintf(pcidev_hdr, sizeof(pcidev_hdr), "%s %s %s %s %s\n",
+ mkstring(buf1, VADDR_PRLEN, CENTER, "PCI DEV"),
+ mkstring(buf2, strlen("0000:00:00.0"), CENTER, "DO:BU:SL.FN"),
+ mkstring(buf3, strlen("0000") + 2, CENTER, "CLASS"),
+ mkstring(buf4, strlen("0000:0000"), CENTER, "PCI_ID"),
+ mkstring(buf5, 10, CENTER, "TYPE"));
+ else
+ snprintf(pcidev_hdr, sizeof(pcidev_hdr), "%s %s %s %s\n",
+ mkstring(buf1, VADDR_PRLEN, CENTER, "PCI DEV"),
+ mkstring(buf2, strlen("0000:00:00.0"), CENTER, "DO:BU:SL.FN"),
+ mkstring(buf3, strlen("0000") + 2, CENTER, "CLASS"),
+ mkstring(buf4, strlen("0000:0000"), CENTER, "PCI_ID"));
+
+ fprintf(fp, " %s", pcidev_hdr);
+
+ readmem(pci_bus + OFFSET(pci_bus_self), KVADDR, &self,
+ sizeof(void *), "pci bus self", FAULT_ON_ERROR);
+ if (self) {
+ fill_dev_name(self, name);
+ fill_dev_class(self, class);
+ fill_dev_id(self, id);
+ fill_pcie_type(self, type);
+ fprintf(fp, " %s %s %s %s %s\n",
+ mkstring(buf1, VADDR_PRLEN, LJUST|LONG_HEX,
+ MKSTR(self)),
+ mkstring(buf2, strlen("0000:00:00.0"), CENTER, name),
+ mkstring(buf3, strlen("0000") + 2, CENTER, class),
+ mkstring(buf4, strlen("0000:0000"), CENTER, id),
+ mkstring(buf5, 10, CENTER, type));
+ }
+
+ if (ld->start == (pci_bus + OFFSET(pci_bus_devices)))
+ return;
+
+ ld->end = pci_bus + OFFSET(pci_bus_devices);
+ hq_open();
+ devcnt = do_list(ld);
+ devlist = (ulong *)GETBUF(devcnt * sizeof(ulong));
+ devcnt = retrieve_list(devlist, devcnt);
+ hq_close();
+
+ for (i = 0; i < devcnt; i++) {
+ fill_dev_name(devlist[i], name);
+ fill_dev_class(devlist[i], class);
+ fill_dev_id(devlist[i], id);
+ fill_pcie_type(devlist[i], type);
+ fprintf(fp, " %s %s %s %s %s\n",
+ mkstring(buf1, VADDR_PRLEN, LJUST|LONG_HEX,
+ MKSTR(devlist[i])),
+ mkstring(buf2, strlen("0000:00:00.0"), CENTER, name),
+ mkstring(buf3, strlen("0000") + 2, CENTER, class),
+ mkstring(buf4, strlen("0000:0000"), CENTER, id),
+ mkstring(buf5, 10, CENTER, type));
+ }
+ FREEBUF(devlist);
+}
+
+static void
+walk_buses(ulong pci_bus)
+{
+ struct list_data list_data, *ld;
+ int buscnt, i;
+ ulong *buslist, parent;
+ char pcibus_hdr[BUFSIZE];
+ char buf1[BUFSIZE];
+ char buf2[BUFSIZE];
+
+ ld = &list_data;
+
+ BZERO(ld, sizeof(struct list_data));
+
+ readmem(pci_bus + OFFSET(pci_bus_children), KVADDR,
+ &ld->start, sizeof(void *), "pci bus children",
+ FAULT_ON_ERROR);
+
+ if (ld->start == (pci_bus + OFFSET(pci_bus_children)))
+ return;
+
+ ld->end = pci_bus + OFFSET(pci_bus_children);
+ hq_open();
+ buscnt = do_list(ld);
+ buslist = (ulong *)GETBUF(buscnt * sizeof(ulong));
+ buscnt = retrieve_list(buslist, buscnt);
+ hq_close();
+
+ snprintf(pcibus_hdr, sizeof(pcibus_hdr), "%s %s\n",
+ mkstring(buf1, VADDR_PRLEN, CENTER, "PCI BUS"),
+ mkstring(buf2, VADDR_PRLEN, CENTER, "PARENT BUS"));
+
+ for (i = 0; i < buscnt; i++) {
+ readmem(buslist[i] + OFFSET(pci_bus_parent), KVADDR, &parent,
+ sizeof(void *), "pci bus parent", FAULT_ON_ERROR);
+
+ fprintf(fp, " %s", pcibus_hdr);
+
+ fprintf(fp, " %s %s\n",
+ mkstring(buf1, VADDR_PRLEN, LJUST|LONG_HEX,
+ MKSTR(buslist[i])),
+ mkstring(buf2, VADDR_PRLEN, LJUST|LONG_HEX,
+ MKSTR(parent)));
+ walk_devices(buslist[i]);
+ fprintf(fp, "\n");
+ walk_buses(buslist[i]);
+ }
+ FREEBUF(buslist);
+}
+
+static void
+do_pci2(void)
+{
+ struct list_data list_data, *ld;
+ int rootbuscnt, i;
+ ulong *rootbuslist;
+ unsigned long pci_root_bus_addr = symbol_value("pci_root_buses");
+ char name[BUFSIZE];
+ char pcirootbus_hdr[BUFSIZE];
+ char buf1[BUFSIZE];
+ char buf2[BUFSIZE];
+
+ ld = &list_data;
+ BZERO(ld, sizeof(struct list_data));
+
+ get_symbol_data("pci_root_buses", sizeof(void *), &ld->start);
+
+ if (ld->start == pci_root_bus_addr)
+ error(FATAL, "no PCI devices found on this system.\n");
+
+ ld->end = pci_root_bus_addr;
+
+ hq_open();
+ rootbuscnt = do_list(ld);
+ rootbuslist = (ulong *)GETBUF(rootbuscnt * sizeof(ulong));
+ rootbuscnt = retrieve_list(rootbuslist, rootbuscnt);
+ hq_close();
+
+ snprintf(pcirootbus_hdr, sizeof(pcirootbus_hdr), "%s %s\n",
+ mkstring(buf1, VADDR_PRLEN, CENTER, "ROOT BUS"),
+ mkstring(buf2, strlen("0000:00"), CENTER, "BUSNAME"));
+
+ for (i = 0; i < rootbuscnt; i++) {
+ fprintf(fp, "%s", pcirootbus_hdr);
+ fill_bus_name(rootbuslist[i], name);
+ fprintf(fp, "%s %s\n",
+ mkstring(buf1, VADDR_PRLEN, LJUST|LONG_HEX,
+ MKSTR(rootbuslist[i])),
+ mkstring(buf2, strlen("0000:00"), CENTER, name));
+ walk_devices(rootbuslist[i]);
+ walk_buses(rootbuslist[i]);
+
+ fprintf(fp, "\n");
+ }
+ FREEBUF(rootbuslist);
+}
+
static void
do_pci(void)
{
@@ -2230,9 +2554,6 @@ do_pci(void)
char buf2[BUFSIZE];
char buf3[BUFSIZE];
- if (!symbol_exists("pci_devices"))
- error(FATAL, "no PCI devices found on this system.\n");
-
BZERO(&pcilist_data, sizeof(struct list_data));
if (VALID_MEMBER(pci_dev_global_list)) {
--
2.19.0
6 years, 2 months
[ANNOUNCE] crash version 7.2.4 is available
by Dave Anderson
Download from: http://people.redhat.com/anderson
or
https://github.com/crash-utility/crash/releases
The github master branch serves as a development branch that will contain
all patches that are queued for the next release:
$ git clone git://github.com/crash-utility/crash.git
Changelog:
- Fix for the "timer -r" command on Linux 4.10 and later kernels that
contain commit 2456e855354415bfaeb7badaa14e11b3e02c8466, titled
"ktime: Get rid of the union". Without the patch, the command fails
with the error message "timer: invalid structure member offset:
ktime_t_sec".
(k-hagio(a)ab.jp.nec.com)
- Fix for the x86 and x86_64 "mach -m" option on Linux 4.12 and later
kernels to account for the structure name changes "e820map" to
"e820_table", and "e820entry" to "e820_entry", and for the symbol
name change from "e820" to "e820_table". Also updated the display
output to properly translate E820_PRAM and E820_RESERVED_KERN entries.
Without the patch on all kernels, E820_PRAM and E820_RESERVED_KERN
entries show "type 12" and "type 128" respectively. Without the
patch on Linux 4.12 and later kernels, the command fails with the
error message "mach: cannot resolve e820".
(anderson(a)redhat.com)
- Update for the recognition of the new x86_64 CPU_ENTRY_AREA virtual
address range introduced in Linux 4.15. The memory range exists
above the vmemmap range and below the mapped kernel static text/data
region, and where all of the x86_64 exception stacks have been moved.
Without the patch, reads from the new memory region fail because the
address range is not recognized as a legitimate virtual address.
Most notable is the failure of "bt" on tasks whose backtraces
originate from any of the exception stacks, which fail with the two
error messages "bt: seek error: kernel virtual address: <address>
type: stack contents" followed by "bt: read of stack at <address>
failed".
(anderson(a)redhat.com)
- Fix to address a "__builtin___snprintf_chk" compiler warning if bpf.c
is compiled with -D_FORTIFY_SOURCE=2.
(anderson(a)redhat.com)
- Fix for the "bpf -t" option. Although highly unlikely, without the
patch, the target function name of a BPF bytecode call instruction
may fail to be resolved correctly.
(anderson(a)redhat.com)
- If /proc/kcore gets selected for the live memory source because
/dev/mem was configured with CONFIG_STRICT_DEVMEM, its ELF header
contents are not displayed by "help -[dD]", and are not displayed
when the crash session is invoked with "-d<number>". Without the
patch, the ELF contents are only displayed in those two situations
if "/proc/kcore" is explicitly entered on the crash command line.
(anderson(a)redhat.com)
- If the default live memory source /dev/mem is determined to be
unusable because the kernel was configured with CONFIG_STRICT_DEVMEM,
the first memory read during session initialization will fail. The
current behavior results in a readmem() error message, followed by two
notification messages that indicate that /dev/mem is restricted and
a switch to using /proc/kcore will be attempted; the readmem is
reattempted from /proc/kcore, and if successful, the session will
continue initialization. With this patch, the behavior will change
such that if the switch to /proc/kcore and the reattempted readmem()
are successful, no messages will be displayed unless the crash
session is invoked with "crash -d<number>".
(anderson(a)redhat.com)
- Fix for the ppc64/ppc64le "bt" command on Linux 4.7 and later kernels
that contain commit d8bff643d81a58181356c0aa3ab771ac10da6894,
titled "[x86] asm: Make sure verify_cpu() has a good stack", which
inadvertently breaks the ppc64/ppc64le kernel stack size calculation
when running with crash-7.2.2 or later. Without the patch, "bt" may
fail with a filtered kdump dumpfile with the two error messages
"bt: page excluded: kernel virtual address: <address> type: stack
contents" and "bt: read of stack at <address> failed".
(anderson(a)redhat.com)
- Fix for PPC64 kernel virtual address translation in Linux 4.17 and
later kernels with commit c2b4d8b7417a59b7f9a52d0d8402f5257cbbd398,
titled "powerpc/mm/hash64: Increase the VA range", in which the
maximum virtual address value has been increased to 4PB. Without
the patch, the translation/access of high vmalloc space addresses
fails; for example, the "kmem -[sS]" option fails the translation
of per-cpu kmem_cache_cpu addresses located in vmalloc space, with
the error messages "kmem: invalid kernel virtual address: <address>
type: kmem_cache_cpu.freelist" and "kmem: invalid kernel virtual
address: <address> type: kmem_cache_cpu.page", and the "vtop"
command shows the addresses as "(not mapped)".
(hbathini(a)linux.ibm.com)
- Fix for the x86_64 "bt" command in which a legitimate exception
frame is appended with the message "bt: WARNING: possibly bogus
exception frame". This only happens in KASLR-enabled kernels when
the text address that was executing when the exception occurred
is marked as a "weak" symbol (type "W") instead of a text symbol
(type "T" or "t"). As a result, the exception frame's RIP is not
recognized as a text symbol, and the warning message is displayed.
(anderson(a)redhat.com)
- Fix for the x86_64 "bt" command in Linux 4.16 and later kernels
containing commit 3aa99fc3e708b9cd9b4cfe2df0b7a66cf293e3cf, titled
"x86/entry/64: Remove 'interrupt' macro". Without the patch, the
exception frame display generated by an interrupt exception will
show incorrect contents, and be followed by the message "bt: WARNING:
possibly bogus exception frame".
(anderson(a)redhat.com)
- Fix for the failure of several "kmem" command options, most notably
seen if the command is piped directly into a crash session, or if
the command is contained in an input file. For example:
$ echo "kmem -i" | crash ...
$ crash -i <input-file> ...
Without the patch, the kmem command may fail with the error message
"<segmentation violation in gdb>". While the bug is due to a buffer
overflow that has always existed, it is only triggered by certain
kernel configurations.
(anderson(a)redhat.com)
- Update for the "kmem -V" option to also dump the global entries that
are contained in the "vm_numa_stat" array that was introduced in
Linux 4.14. Also, the command output separates the "vm_zone_stat",
"vm_node_stat" and "vm_numa_stat" entries into separate sections with
"VM_ZONE_STAT", "VM_NODE_STAT" and "VM_NUMA_STAT" headers. Without
the patch, the "vm_zone_stat" and "vm_node_stat" entries are listed
together under a "VM_STAT" header.
(anderson(a)redhat.com)
- Support for the "bpf" command on RHEL 3.10.0-913.el7 and later
3.10-based RHEL7 kernels, which contain a backport of the upstream
eBPF code, but still use the older, pre-4.11, IDR facility that does
not use radix trees for linking the active bpf_prog and bpf_map
structures. Without the patch, the command indicates "bpf: command
not supported or applicable on this architecture or kernel".
(anderson(a)redhat.com)
- Third phase of support for x86_64 5-level page tables in Linux 4.17
and later kernels. With this patch, the usage of 5-level page tables
is automatically detected on live systems and when running against
vmcores that contain the new "NUMBER(pgtable_l5_enabled)" VMCOREINFO
entry. Without the patch, the "--machdep vm=5level" command line
option is required.
(douly.fnst(a)cn.fujitsu.com, anderson(a)redhat.com)
- The existing "list" command uses a hash table to detect duplicate
items as it traverses the list. The hash table approach has worked
well for many years. However, with increasing memory sizes and list
sizes, the overhead of the hash table can be substantial, often
leading to commands running for a very long time. For large lists,
we have found that the existing hash based approach may slow the
system to a crawl and possibly never complete. You can turn off
the hash with "set hash off" but then there is no loop detection; in
that case, loop detection must be done manually after dumping the
list to disk or some other method. This patch is an implementation
of the cycle detection algorithm from R. P. Brent as an alternative
algorithm for the "list" command. The algorithm both avoids the
overhead of the hash table and yet is able to detect a loop. In
addition, further loop characteristics are printed, such as the
distance to the start of the loop as well as the loop length.
An excellent description of the algorithm can be found here on
the crash-utility mailing list:
https://www.redhat.com/archives/crash-utility/2018-July/msg00019.html
A new "list -B" option has been added to the "list" command to
invoke this new algorithm rather than using the hash table. In
addition to low memory usage, the output of the list command is
slightly different when a loop is detected. In addition to printing
the first duplicate entry, the length of the loop, and the distance
to the loop are output.
(dwysocha(a)redhat.com)
- Fix for x86_64 "bt" command to prevent an in-kernel exception frame
from not being displayed. Without the patch, if the RIP in a pt_regs
structure on the stack is not a kernel text address, such as a NULL
pointer, it is not recognized as an exception frame and the register
set is not displayed.
(anderson(a)redhat.com)
- Fix for the "repeat" command when the argument consists of an input
file construct, for example, "repeat -1 < input_file". Without the
patch, only the first command line in the input file is executed
each time.
(anderson(a)redhat.com)
- Fourth phase of support for x86_64 5-level page tables in Linux 4.17
and later kernels. This patch adds support for user virtual address
translation when the kernel is configured with CONFIG_X86_5LEVEL.
(douly.fnst(a)cn.fujitsu.com)
- Fix to prevent an unnecessary "read error" message during session
initialization on live systems running a kernel that is configured
with CONFIG_X86_5LEVEL. Without the patch, a message indicating
"crash: read error: kernel virtual address: <address> type:
__pgtable_l5_enabled" will be displayed if /proc/kcore gets
selected as the live memory source after /dev/mem is determined
to be unusable.
(anderson(a)redhat.com)
- Update for "ps" and "foreach" commands to display and recognize two
new process states, "ID" for the TASK_IDLE macro introduced in
Linux 4.2, and "NE" for the TASK_NEW bit introduced in Linux 4.8.
(k-hagio(a)ab.jp.nec.com)
- Fix for running live on ARM64 kernels against /proc/kcore on kernels
configured with CONFIG_RANDOMIZE_BASE. Without the patch, depending
upon the hardware platform, the session may fail with the error message
"crash: vmlinux and /proc/kcore do not match!".
(anderson(a)redhat.com)
- Modify the output of the "kmem -[sS]" header and contents such that
the slab cache "NAME" string is moved from the second column to the
last column. Since the slab cache name strings have become
increasingly longer over time, without the patch, the numerical
column contents may be skewed so far to the right that the output
becomes difficult to read.
(k-hagio(a)ab.jp.nec.com)
- Fix for the "files" and "net -s" commands when a task has an open
files count that exceeds 1024 (FD_SETSIZE) file descriptors. Without
the patch, the commands may omit the display of open file descriptors.
(tan.hu(a)zte.com.cn)
- As an addendum to the new "kmem -[sS]" output format, align the slab
cache name string so that it is beneath the "NAME" header column when
the "kmem -I <slab-cache>" option is used to ignore a slab cache,
or if the scan of the metadata of a slab cache encounters corruption.
Also remove a superfluous line from the "help kmem" description of
the "kmem -I" option.
(k-hagio(a)ab.jp.nec.com, anderson(a)redhat.com)
- Account for the addition of the new ORC unwinder "orc_entry.end"
member in kernel commit d31a580266eeb1f355df90fde8a71f480e30ad70,
titled "x86/unwind/orc: Detect the end of the stack".
(anderson(a)redhat.com)
- Fix for the "trace.c" extension module for RHEL7.6, which moved the
ftrace_event_call.data member into a new structure contained within
an anonymous union. Without the patch, the module fails to load,
indicating "no commands registered: shared object unloaded".
(xuhuan.fnst(a)cn.fujitsu.com)
- Fix for the "vm -p", user-space "vtop", and "pte" commands in kernels
where the dimension of the static swap_info[] array is not contained
in the vmlinux file's debuginfo data. Without the patch, the
translation of a swapped-out PTE entry fails to determine the swap
device, and the commands display "cannot determine swap location".
(anderson(a)redhat.com)
- Fix for the swap offset calculation in the x86_64 "vm -p", "pte", and
user-space "vtop" commands. The swap offset bits in an x86_64 PTE
were changed in Linux 4.6, and then again in Linux 4.18.1 with the
new L1TF security patchset. Without the patch, the offset value
in the later kernels, or in older kernels with an L1TF backport,
show an incorrect swap offset value.
(anderson(a)redhat.com)
- Fix for the "kmem -V" option on Linux 4.14 and later kernels that are
configured without CONFIG_NUMA, and therefore do not contain the
"numa_stat_item" enumeration. Without the patch, the command causes
the crash session to abort with the error messages "double free or
corruption (!prev)" followed by "Aborted (core dumped)".
(k-hagio(a)ab.jp.nec.com)
- Introduction of a new "kmem -r" option. With the implementation of
per-cgroup kmem_cache slabs, the number of slab caches displayed by
"kmem -s" can number into the thousands. Similar to /proc/slabinfo,
this new option displays the accumulated data of the root cache and
its children. It is limited to Linux 4.11 and later kernels that
contain the "slab_root_caches" list. Currently the command option
is restricted to kernels configured with CONFIG_SLUB.
(k-hagio(a)ab.jp.nec.com)
- Fix for Linux 4.19-rc1 and later kernels that contain kernel commit
2c4704756cab7cfa031ada4dab361562f0e357c0, titled "pids: Move the pgrp
and session pid pointers from task_struct to signal_struct". Without
the patch, the crash session fails during initialization with the
message "crash: invalid structure member offset: task_struct_pids".
(anderson(a)redhat.com)
- Fix for Linux 4.19-rc1 and later kernels that contain kernel commit
7290d58095712a89f845e1bca05334796dd49ed2, titled "module: use
relative references for __ksymtab entries". Without the patch,
kernels configured with CONFIG_HAVE_ARCH_PREL32_RELOCATIONS fail
during session initialization, with a dump of the internal buffer
allocation stats followed by the message "crash: cannot allocate
any more memory!"
(asmadeus(a)codewreck.org)
- Fix a cut-and-paste error in the previous patch application.
(anderson(a)redhat.com)
- Fix for the "files" command in Linux 4.17 and later kernels that
contain commit b93b016313b3ba8003c3b8bb71f569af91f19fc7, titled
"page cache: use xa_lock". Without the patch, the "files -c" option
fails with the message "files: -c option not supported or applicable
on this architecture or kernel", and the "files -p <inode>" option
fails in a similar manner.
(k-hagio(a)ab.jp.nec.com)
- Fix for the "files -p <inode>" option. Without the patch, the
command attempts to translate radix tree node slot entries that
are RADIX_TREE_EXCEPTIONAL_ENTRY types, and as a result may fail
prematurely with an error message of the sort "files: do_radix_tree:
callback operation failed: entry: 5 item: 44788c5000a".
(anderson(a)redhat.com)
- Commit 3db3d3992d781c1e42587d2d2bf81e785408e0c2 in crash-7.1.8 was
aimed at making the PPC64 "bt" command work for dumpfiles saved
with the FADUMP facility, but it introduced a bit of unwarranted
complexity in "bt" command processing. Reworked the "bt" command
processing for PPC64 arch to make it a little less complicated and
also to print symbols for NIP and LR registers in exception frames.
Without the patch, "bt" on non-panic active tasks may fail with
the message "bt: invalid kernel virtual address: <address>
type: Regs NIP value".
(hbathini(a)linux.ibm.com)
- An addendum to crash commit 5fe78861ea1589084f6a2956a6ff63677c9269e1,
this patch for the PPC64 "bt" command prevents an invalid error
message from being displayed when an active non-panic task is
interrupted while running in user space. Without the patch, the
command correctly indicates "Task is running in user space", dumps
the user-space exception frame, but then prints the invalid error
message "bt: invalid kernel virtual address: ffffffffffffff90 type:
Regs NIP value".
(anderson(a)redhat.com)
6 years, 2 months
[PATCH] ppc64: rework bt command
by Hari Bathini
Commit 3db3d3992d78 was aimed at making 'bt' work for dumpfiles saved
with fadump but it introduced a bit of unwarranted complexity in 'bt'
command processing. Rework 'bt' command processing for PPC64 arch to
make it a little less complicated and also print symbols for NIP and
LR registers on exception frames.
Signed-off-by: Hari Bathini <hbathini(a)linux.ibm.com>
---
ppc64.c | 140 +++++++++++++++++++++++++++++++++++++++++++++------------------
1 file changed, 99 insertions(+), 41 deletions(-)
diff --git a/ppc64.c b/ppc64.c
index f5d0dac..03fecd3 100644
--- a/ppc64.c
+++ b/ppc64.c
@@ -2093,15 +2093,10 @@ ppc64_print_stack_entry(int frame,
lr);
return;
}
- if (req->pc != lr) {
- fprintf(fp, "\n%s[Link Register] ",
- frame < 10 ? " " : "");
- fprintf(fp, "[%lx] %s at %lx",
- req->sp, lrname, lr);
- }
req->ra = lr;
}
- if (!req->name || STREQ(req->name,lrname))
+ if (!req->name || STREQ(req->name, lrname) ||
+ !is_kernel_text(req->pc))
fprintf(fp, " (unreliable)");
fprintf(fp, "\n");
@@ -2219,6 +2214,22 @@ ppc64_print_regs(struct ppc64_pt_regs *regs)
fprintf(fp, " Syscall Result: %016lx\n", regs->result);
}
+static void ppc64_print_nip_lr(struct ppc64_pt_regs *regs, int print_lr)
+{
+ char buf[BUFSIZE];
+ char *sym_buf;
+
+ sym_buf = value_to_symstr(regs->nip, buf, 0);
+ if (sym_buf[0] != NULLCHAR)
+ fprintf(fp, " [NIP : %s]\n", sym_buf);
+
+ if (print_lr) {
+ sym_buf = value_to_symstr(regs->link, buf, 0);
+ if (sym_buf[0] != NULLCHAR)
+ fprintf(fp, " [LR : %s]\n", sym_buf);
+ }
+}
+
/*
* Print the exception frame information
*/
@@ -2231,6 +2242,59 @@ ppc64_print_eframe(char *efrm_str, struct ppc64_pt_regs *regs,
fprintf(fp, " %s [%lx] exception frame:\n", efrm_str, regs->trap);
ppc64_print_regs(regs);
+ ppc64_print_nip_lr(regs, 1);
+}
+
+/*
+ * For vmcore typically saved with KDump or FADump, get SP and IP values
+ * from the saved ptregs.
+ */
+static int
+ppc64_vmcore_stack_frame(struct bt_info *bt_in, ulong *nip, ulong *ksp)
+{
+ struct ppc64_pt_regs *pt_regs;
+ unsigned long unip;
+
+ pt_regs = (struct ppc64_pt_regs *)bt_in->machdep;
+ if (!pt_regs || !pt_regs->gpr[1]) {
+ /*
+ * Not collected regs. May be the corresponding CPU did not
+ * respond to an IPI in case of KDump OR f/w has
+ * not provided the register info in case of FADump.
+ */
+ fprintf(fp, "%0lx: GPR1 register value (SP) was not saved\n",
+ bt_in->task);
+ return FALSE;
+ }
+ *ksp = pt_regs->gpr[1];
+ if (IS_KVADDR(*ksp)) {
+ readmem(*ksp+16, KVADDR, &unip, sizeof(ulong), "Regs NIP value",
+ FAULT_ON_ERROR);
+ *nip = unip;
+ } else {
+ if (IN_TASK_VMA(bt_in->task, *ksp))
+ fprintf(fp, "%0lx: Task is running in user space\n",
+ bt_in->task);
+ else
+ fprintf(fp, "%0lx: Invalid Stack Pointer %0lx\n",
+ bt_in->task, *ksp);
+ *nip = pt_regs->nip;
+ }
+
+ if (bt_in->flags &&
+ ((BT_TEXT_SYMBOLS|BT_TEXT_SYMBOLS_PRINT|BT_TEXT_SYMBOLS_NOPRINT)))
+ return TRUE;
+
+ /*
+ * Print the collected regs for the active task
+ */
+ ppc64_print_regs(pt_regs);
+ if (!IS_KVADDR(*ksp))
+ return FALSE;
+
+ ppc64_print_nip_lr(pt_regs, (unip != pt_regs->link) ? 1 : 0);
+
+ return TRUE;
}
/*
@@ -2239,7 +2303,7 @@ ppc64_print_eframe(char *efrm_str, struct ppc64_pt_regs *regs,
static int
ppc64_get_dumpfile_stack_frame(struct bt_info *bt_in, ulong *nip, ulong *ksp)
{
- int i;
+ int i, ret, panic_task;
char *sym;
ulong *up;
struct bt_info bt_local, *bt;
@@ -2251,11 +2315,29 @@ ppc64_get_dumpfile_stack_frame(struct bt_info *bt_in, ulong *nip, ulong *ksp)
struct ppc64_pt_regs *pt_regs;
struct syment *sp;
- bt = &bt_local;
- BCOPY(bt_in, bt, sizeof(struct bt_info));
- ms = machdep->machspec;
+ bt = &bt_local;
+ BCOPY(bt_in, bt, sizeof(struct bt_info));
+ ms = machdep->machspec;
+ ur_nip = ur_ksp = 0;
+
+ panic_task = tt->panic_task == bt->task ? TRUE : FALSE;
check_hardirq = check_softirq = tt->flags & IRQSTACKS ? TRUE : FALSE;
+ if (panic_task && bt->machdep) {
+ pt_regs = (struct ppc64_pt_regs *)bt->machdep;
+ ur_nip = pt_regs->nip;
+ ur_ksp = pt_regs->gpr[1];
+ } else if ((pc->flags & KDUMP) ||
+ ((pc->flags & DISKDUMP) &&
+ (*diskdump_flags & KDUMP_CMPRS_LOCAL))) {
+ /*
+ * For the KDump or FADump vmcore, use SP and IP values
+ * that are saved in ptregs.
+ */
+ ret = ppc64_vmcore_stack_frame(bt_in, nip, ksp);
+ if (ret)
+ return TRUE;
+ }
if (bt->task != tt->panic_task) {
char cpu_frozen = FALSE;
@@ -2385,38 +2467,14 @@ retry:
check_intrstack = FALSE;
goto retry;
}
-
/*
- * We didn't find what we were looking for, so try to use
- * the SP and IP values saved in ptregs.
+ * We didn't find what we were looking for, so just use what was
+ * passed in the ELF header.
*/
- pt_regs = (struct ppc64_pt_regs *)bt_in->machdep;
- if (!pt_regs || !pt_regs->gpr[1]) {
- /*
- * Not collected regs. May be the corresponding CPU did not
- * respond to an IPI.
- */
- if (CRASHDEBUG(1))
- fprintf(fp, "%0lx: GPR1(SP) register value not saved\n",
- bt_in->task);
- } else {
- *ksp = pt_regs->gpr[1];
- if (IS_KVADDR(*ksp)) {
- readmem(*ksp+16, KVADDR, nip, sizeof(ulong),
- "Regs NIP value", FAULT_ON_ERROR);
- ppc64_print_regs(pt_regs);
- return TRUE;
- } else {
- if (IN_TASK_VMA(bt_in->task, *ksp))
- fprintf(fp, "%0lx: Task is running in user space\n",
- bt_in->task);
- else
- fprintf(fp, "%0lx: Invalid Stack Pointer %0lx\n",
- bt_in->task, *ksp);
- *nip = pt_regs->nip;
- ppc64_print_regs(pt_regs);
- return FALSE;
- }
+ if (ur_nip && ur_ksp) {
+ *nip = ur_nip;
+ *ksp = ur_ksp;
+ return TRUE;
}
console("ppc64_get_dumpfile_stack_frame: cannot find SP for panic task\n");
6 years, 2 months
[PATCH] Fix for "files -[cp]" options on Linux 4.17
by Kazuhito Hagio
Since the kernel commit b93b016313b3ba8003c3b8bb71f569af91f19fc7
("page cache: use xa_lock") renamed the address_space ->page_tree
to ->i_pages, without this patch, the "files -[cp]" options don't
work on Linux 4.17 and later kernels.
(In this case, is it OK not to add a new member to offset_table?)
Also, it looks like the address_space's member that the "files -c"
option really requires is ->nrpages, not ->page_tree.
Signed-off-by: Kazuhito Hagio <k-hagio(a)ab.jp.nec.com>
---
filesys.c | 2 +-
memory.c | 2 ++
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/filesys.c b/filesys.c
index 47f5a24..527b3f6 100644
--- a/filesys.c
+++ b/filesys.c
@@ -2275,7 +2275,7 @@ cmd_files(void)
return;
case 'c':
- if (VALID_MEMBER(address_space_page_tree) &&
+ if (VALID_MEMBER(address_space_nrpages) &&
VALID_MEMBER(inode_i_mapping))
open_flags |= PRINT_NRPAGES;
else
diff --git a/memory.c b/memory.c
index 24fce5e..ea25047 100644
--- a/memory.c
+++ b/memory.c
@@ -487,6 +487,8 @@ vm_init(void)
MEMBER_OFFSET_INIT(block_device_bd_disk, "block_device", "bd_disk");
MEMBER_OFFSET_INIT(inode_i_mapping, "inode", "i_mapping");
MEMBER_OFFSET_INIT(address_space_page_tree, "address_space", "page_tree");
+ if (INVALID_MEMBER(address_space_page_tree))
+ MEMBER_OFFSET_INIT(address_space_page_tree, "address_space", "i_pages");
MEMBER_OFFSET_INIT(address_space_nrpages, "address_space", "nrpages");
if (INVALID_MEMBER(address_space_nrpages))
MEMBER_OFFSET_INIT(address_space_nrpages, "address_space", "__nrpages");
--
1.8.3.1
6 years, 2 months