Hi Dave,
I hope I have captured everything you asked for here. If remote debugging over e-mail is
too tedious, I can arrange to post a vmlinux/vmcore on our FTP site (roughly 600MB
together).
*** Set up some breakpoints to watch bt->machdep:
get_netdump_regs_x86_64(struct bt_info *bt, ulong *ripp, ulong *rspp)
{
...
if (((NETDUMP_DUMPFILE() || KDUMP_DUMPFILE()) &&
VALID_STRUCT(user_regs_struct) && (bt->task == tt->panic_task))
||
(KDUMP_DUMPFILE() && (kt->flags & DWARF_UNWIND) &&
(bt->flags & BT_DUMPFILE_SEARCH))) {
...
2287 bt->machdep = (void *)user_regs;
...
if (ELF_NOTES_VALID() &&
(bt->flags & BT_DUMPFILE_SEARCH) && DISKDUMP_DUMPFILE() &&
(note = (Elf64_Nhdr *)
diskdump_get_prstatus_percpu(bt->tc->processor))) {
...
2306 bt->machdep = (void *)user_regs;
...
(gdb) break get_netdump_regs_x86_64
Breakpoint 1 at 0x519740: file netdump.c, line 2238.
(gdb) break netdump.c:2287
Breakpoint 2 at 0x519970: file netdump.c, line 2287.
(gdb) break netdump.c:2306
Breakpoint 3 at 0x5199e7: file netdump.c, line 2306.
(gdb) r
please wait... (determining panic task)
Breakpoint 1, get_netdump_regs_x86_64 (bt=0x7fffffffcd70, ripp=0x7fffffffcce0,
rspp=0x7fffffffcce8) at netdump.c:2238
2238 {
(gdb) c
Continuing.
Breakpoint 3, get_netdump_regs_x86_64 (bt=0x7fffffffcd70, ripp=0x7fffffffcce0,
rspp=0x7fffffffcce8) at netdump.c:2306
2306 bt->machdep = (void *)user_regs;
(gdb) p user_regs
$1 = 0xd14084 ""
(gdb) c
Continuing.
Breakpoint 1, get_netdump_regs_x86_64 (bt=0x7fffffffcd70, ripp=0x7fffffffcce0,
rspp=0x7fffffffcce8) at netdump.c:2238
2238 {
(gdb) c
Continuing.
Program received signal SIGSEGV, Segmentation fault.
x86_64_get_dumpfile_stack_frame (rsp=0x7fffffffcce8, rip=0x7fffffffcce0,
bt_in=0x7fffffffcd70) at x86_64.c:4183
4183 ur_rip = ULONG(user_regs +
*** So in its second invocation, get_netdump_regs_x86_64() never sets bt->machdep (only
breakpoint 1 fired; neither assignment at netdump.c:2287 nor netdump.c:2306 was reached)
*** Let's see what diskdump_get_prstatus_percpu() is returning
(gdb) break diskdump_get_prstatus_percpu
Breakpoint 1 at 0x526070: file diskdump.c, line 1451.
(gdb) r
please wait... (determining panic task)
Breakpoint 1, diskdump_get_prstatus_percpu (cpu=0) at diskdump.c:1451
1451 return dd->nt_prstatus_percpu[cpu];
(gdb) display dd->nt_prstatus_percpu[0]@16
1: dd->nt_prstatus_percpu[0]@16 = {0xd1c000, 0x0, 0x0, 0xd26472, 0xbf35ab2,
0xd26472, 0x200000012, 0xd1c850, 0xd1c600, 0x1010000012b,
0xffffffff814e4fa0, 0x14e4fa0, 0x4270, 0x0, 0x0, 0x0}
(gdb) c
Continuing.
Breakpoint 1, diskdump_get_prstatus_percpu (cpu=1) at diskdump.c:1451
1451 return dd->nt_prstatus_percpu[cpu];
1: dd->nt_prstatus_percpu[0]@16 = {0xd1c000, 0x0, 0x0, 0xd26472, 0xbf35ab2,
0xd26472, 0x200000012, 0xd1c850, 0xd1c600, 0x1010000012b,
0xffffffff814e4fa0, 0x14e4fa0, 0x4270, 0x0, 0x0, 0x0}
*** See crash -d1 vmlinux vmcore output at the bottom of the mail, particularly the part
that says...
crash: page excluded: kernel virtual address: ffffffff81bb3b00 type: "cpu number
(per_cpu)"
crash: get_cpus_present: present: 16
*** Bogus note->n_descsz value
*** Apply first patch to get us further into ELF Note processing
From inside netdump.c :: get_regs_from_note() at the point of the
fault, I don't see dd->nt_prstatus[], because dd is now of type struct diskdump_data *.
However, the *note passed in can be found in dd->nt_prstatus_percpu[]:
please wait... (determining panic task)
Program received signal SIGSEGV, Segmentation fault.
get_regs_from_note (note=0xd26472 "\b", ip=0x7fffffffc590, sp=0x7fffffffc598)
at netdump.c:2221
2221 *sp = ULONG(user_regs + offset_sp);
(gdb) p/x *((Elf64_Nhdr *)note)
$1 = {n_namesz = 0x8, n_descsz = 0xccf80000, n_type = 0x8}
(gdb) p dd->nt_prstatus_percpu[0]@16
$2 = {0xd1c000, 0x0, 0x0, 0xd26472, 0xbf35ab2, 0xd26472, 0x200000012,
0xd1c850, 0xd1c600, 0x1010000012b, 0xffffffff814e4fa0, 0x14e4fa0, 0x4270,
0x0, 0x0, 0x0}
(gdb) ptype dd
type = struct diskdump_data {
char *filename;
ulong flags;
int dfd;
FILE *ofp;
int machine_type;
struct disk_dump_header *header;
struct disk_dump_sub_header *sub_header;
struct kdump_sub_header *sub_header_kdump;
size_t data_offset;
int block_size;
int block_shift;
char *bitmap;
int bitmap_len;
char *dumpable_bitmap;
int byte;
int bit;
char *compressed_page;
char *curbufptr;
unsigned char *notes_buf;
void **nt_prstatus_percpu;
uint num_prstatus_notes;
struct page_cache_hdr page_cache_hdr[16];
char *page_cache_buf;
int evict_index;
ulong evictions;
ulong cached_reads;
ulong *valid_pages;
ulong accesses;
} *
*** Unpatched crash -d1 vmlinux vmcore output:
crash 5.1.8
Copyright (C) 2002-2011 Red Hat, Inc.
Copyright (C) 2004, 2005, 2006 IBM Corporation
Copyright (C) 1999-2006 Hewlett-Packard Co
Copyright (C) 2005, 2006 Fujitsu Limited
Copyright (C) 2006, 2007 VA Linux Systems Japan K.K.
Copyright (C) 2005 NEC Corporation
Copyright (C) 1999, 2002, 2007 Silicon Graphics, Inc.
Copyright (C) 1999, 2000, 2001, 2002 Mission Critical Linux, Inc.
This program is free software, covered by the GNU General Public License,
and you are welcome to change it and/or distribute copies of it under
certain conditions. Enter "help copying" to see the conditions.
This program has absolutely no warranty. Enter "help warranty" for details.
compressed kdump: header->utsname.machine: x86_64
diskdump_data:
filename: vmcore
flags: 6 (KDUMP_CMPRS_LOCAL|ERROR_EXCLUDED)
dfd: 3
ofp: 0
machine_type: 62 (EM_X86_64)
header: 2cc1fe0
signature: "KDUMP "
header_version: 4
utsname:
sysname: Linux
nodename:
bahamut.mno.stratus.com
release: 2.6.32-131.0.15.el6.exp10.bz16586.x86_64
version: #1 SMP Thu Jun 16 13:13:45 EDT 2011
machine: x86_64
domainname: sraeng
timestamp:
tv_sec: 4e4fe6e3
tv_usec: 0
status: 0 ()
block_size: 4096
sub_hdr_size: 1
bitmap_blocks: 288
max_mapnr: 4718592
total_ram_blocks: 0
device_blocks: 0
written_blocks: 0
current_cpu: 0
nr_cpus: 1
tasks[nr_cpus]: 0
sub_header: 0 (n/a)
sub_header_kdump: 2cc2ff0
phys_base: 0
dump_level: 31 (0x1f)
(DUMP_EXCLUDE_ZERO|DUMP_EXCLUDE_CACHE|DUMP_EXCLUDE_CACHE_PRI|DUMP_EXCLUDE_USER_DATA|DUMP_EXCLUDE_FREE)
offset_vmcoreinfo: 11bc
size_vmcoreinfo: 1392
OSRELEASE=2.6.32-131.0.15.el6.exp10.bz16586.x86_64
PAGESIZE=4096
SYMBOL(init_uts_ns)=ffffffff81a2e8c0
SYMBOL(node_online_map)=ffffffff81ba0860
SYMBOL(swapper_pg_dir)=ffffffff81a25000
SYMBOL(_stext)=ffffffff81000198
SYMBOL(vmlist)=ffffffff81ee60b8
SYMBOL(mem_section)=ffffffff81ef03c0
LENGTH(mem_section)=4096
SIZE(mem_section)=32
OFFSET(mem_section.section_mem_map)=0
SIZE(page)=56
SIZE(pglist_data)=212416
SIZE(zone)=34496
SIZE(free_area)=88
SIZE(list_head)=16
SIZE(nodemask_t)=64
OFFSET(page.flags)=0
OFFSET(page._count)=8
OFFSET(page.mapping)=24
OFFSET(page.lru)=40
OFFSET(pglist_data.node_zones)=0
OFFSET(pglist_data.nr_zones)=212288
OFFSET(pglist_data.node_start_pfn)=212312
OFFSET(pglist_data.node_spanned_pages)=212328
OFFSET(pglist_data.node_id)=212336
OFFSET(zone.free_area)=32864
OFFSET(zone.vm_stat)=34032
OFFSET(zone.spanned_pages)=34344
OFFSET(free_area.free_list)=0
OFFSET(list_head.next)=0
OFFSET(list_head.prev)=8
OFFSET(vm_struct.addr)=8
LENGTH(zone.free_area)=11
SYMBOL(log_buf)=ffffffff81a37210
SYMBOL(log_end)=ffffffff81d5b820
SYMBOL(log_buf_len)=ffffffff81a37208
SYMBOL(logged_chars)=ffffffff81ddb920
LENGTH(free_area.free_list)=5
NUMBER(NR_FREE_PAGES)=0
NUMBER(PG_lru)=5
NUMBER(PG_private)=11
NUMBER(PG_swapcache)=16
SYMBOL(phys_base)=ffffffff81a2d010
SYMBOL(init_level4_pgt)=ffffffff81a25000
SYMBOL(node_data)=ffffffff81b9cda0
LENGTH(node_data)=512
CRASHTIME=1313859299
offset_note: 1040
size_note: 1780
num_prstatus_notes: 1
notes_buf: 2cc4000
notes[0]: 2cc4000
NT_PRSTATUS_offset: 1040
data_offset: 122000
block_size: 4096
block_shift: 12
bitmap: 7fa5296fc010
bitmap_len: 1179648
dumpable_bitmap: 7fa528890010
byte: 0
bit: 0
compressed_page: 2cdeb30
curbufptr: 0
page_cache_hdr[0]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cceb20
pg_hit_count: 0
page_cache_hdr[1]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2ccfb20
pg_hit_count: 0
page_cache_hdr[2]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cd0b20
pg_hit_count: 0
page_cache_hdr[3]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cd1b20
pg_hit_count: 0
page_cache_hdr[4]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cd2b20
pg_hit_count: 0
page_cache_hdr[5]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cd3b20
pg_hit_count: 0
page_cache_hdr[6]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cd4b20
pg_hit_count: 0
page_cache_hdr[7]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cd5b20
pg_hit_count: 0
page_cache_hdr[8]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cd6b20
pg_hit_count: 0
page_cache_hdr[9]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cd7b20
pg_hit_count: 0
page_cache_hdr[10]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cd8b20
pg_hit_count: 0
page_cache_hdr[11]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cd9b20
pg_hit_count: 0
page_cache_hdr[12]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cdab20
pg_hit_count: 0
page_cache_hdr[13]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cdbb20
pg_hit_count: 0
page_cache_hdr[14]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cdcb20
pg_hit_count: 0
page_cache_hdr[15]:
pg_flags: 0 ()
pg_addr: 0
pg_bufptr: 2cddb20
pg_hit_count: 0
page_cache_buf: 2cceb20
evict_index: 0
evictions: 0
accesses: 0
cached_reads: 0
valid_pages: 2ccc710
crash: pv_init_ops exists: ARCH_PVOPS
compressed kdump: phys_base: 0
gdb vmlinux
GNU gdb (GDB) 7.0
Copyright (C) 2009 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <
http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-unknown-linux-gnu"...
cpu_possible_map: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
cpu_present_map: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
cpu_online_map: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
base kernel version: 2.6.32
verify_namelist:
dumpfile /proc/version:
Linux version 2.6.32-131.0.15.el6.exp10.bz16586.x86_64 (root(a)druk.mno.stratus.com) (gcc
version 4.4.5 20110214 (Red Hat 4.4.5-6) (GCC) ) #1 SMP Thu Jun 16 13:13:45 EDT 2011
vmlinux:
Linux version 2.6.32-131.0.15.el6.exp10.bz16586.x86_64 (root(a)druk.mno.stratus.com) (gcc
version 4.4.5 20110214 (Red Hat 4.4.5-6) (GCC) ) #1 SMP Thu Jun 16 13:13:45 EDT 2011
crash: page excluded: kernel virtual address: ffffffff81bb3b00 type: "cpu number
(per_cpu)"
crash: get_cpus_present: present: 16
crash: page excluded: kernel virtual address: ffffffff81bb3b00 type: "cpu number
(per_cpu)"
crash: get_cpus_present: present: 16
IRQ stack link register: undetermined
PAGESIZE=4096
mem_section_size = 32768
NR_SECTION_ROOTS = 4096
NR_MEM_SECTIONS = 524288
SECTIONS_PER_ROOT = 128
SECTION_ROOT_MASK = 0x7f
PAGES_PER_SECTION = 32768
node_online_map: [3, 0, 0, 0, 0, 0, 0, 0] -> nodes online: 2
node_table[0]:
id: 0
pgdat: ffff880000020040
size: 0
present: 0
mem_map: ffffea0000000000
start_paddr: 0
start_mapnr: 0
WARNING: sparsemem: invalid section number: 137438888923
WARNING: sparsemem: invalid section number: 137438888923
crash: invalid kernel virtual address: 0 type: "readstring characters"
crash: invalid kernel virtual address: 0 type: "readstring characters"
node_table[1]:
id: 1
pgdat: ffff880280000040
size: 2097152
present: 2097152
mem_map: ffffea0008c00000
start_paddr: 280000000
start_mapnr: 2621440
NOTE: page_hash_table does not exist in this kernel
^Mplease wait... (gathering kmem slab cache data)
kmem_cache_downsize: SIZE(kmem_cache_s): 36968 cache_cache.buffer_size: 32896
kmem_cache_downsize: nr_node_ids: 2
^M ^MNOTE: unwind_table structure has
changed, or does not exist in this kernel
init_unwind_table: DWARF_UNWIND_EH_FRAME
^Mplease wait... (gathering module symbol data)^M
^M^Mplease wait... (gathering task table data)^M
^Mcrash: get_cpus_online: online: 16
^Mplease wait... (determining panic task)
crash: get_active_set_panic_task: failed
Thanks,
-- Joe Lawrence