Re: [PATCH] x86_64: Add top_of_kernel_stack_padding for kernel stack
by lijiang
On Fri, May 31, 2024 at 5:21 PM <devel-request(a)lists.crash-utility.osci.io>
wrote:
> Date: Fri, 31 May 2024 13:14:02 +0800
> From: Tao Liu <ltao(a)redhat.com>
> Subject: [Crash-utility] Re: [PATCH] x86_64: Add
> top_of_kernel_stack_padding for kernel stack
> To: Lianbo Jiang <lijiang(a)redhat.com>
> Cc: devel(a)lists.crash-utility.osci.io
> Message-ID:
> <
> CAO7dBbWZgeiJBN2sv-3e00TSU2uO_kMbSZ_NGVCft6xzdj3EMA(a)mail.gmail.com>
> Content-Type: text/plain; charset="UTF-8"
>
> Hi Lianbo,
>
> On Mon, May 27, 2024 at 11:30 AM Lianbo Jiang <lijiang(a)redhat.com> wrote:
> >
> > Hi, Tao
> >
> > Thank you for the fix.
> >
> > On 5/23/24 12:06 PM, devel-request(a)lists.crash-utility.osci.io wrote:
> > > Date: Thu, 23 May 2024 12:06:03 +0800
> > > From: Tao Liu<ltao(a)redhat.com>
> > > Subject: [Crash-utility] [PATCH] x86_64: Add
> > > top_of_kernel_stack_padding for kernel stack
> > > To:devel@lists.crash-utility.osci.io
> > > Cc: Tao Liu<ltao(a)redhat.com>
> > > Message-ID:<20240523040603.10304-1-ltao(a)redhat.com>
> > > Content-Type: text/plain; charset="US-ASCII"; x-default=true
> > >
> > > > With kernel patch [1], x86_64 adds extra padding to the kernel stack;
> > > > as a result, pt_regs is shifted down by the size of the padding.
> > > > Without the patch, the values of registers read from pt_regs will be
> > > > incorrect.
> > >
> > > > Though currently the TOP_OF_KERNEL_STACK_PADDING is configured by
> > > > Kconfig, according to kernel code comment [2], the value may be made
> > > > dynamic later. In addition, there might be systems compiled without
> > > > Kconfig available. So in this patch, we will calculate the value of
> > > > TOP_OF_KERNEL_STACK_PADDING.
> > >
> > > The calculation is as follows:
> > >
> > > 1) in startup_64(), there is a lea instruction as:
> > > leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING -
> PTREGS_SIZE)(%rip), %rsp
> > >
> > > 2) in rewind_stack_and_make_dead(), there is a lea instruction as:
> > > leaq -PTREGS_SIZE(%rax), %rsp
> > >
> > > The disassembled 2 instructions will be like:
> > >
> > > 1) 0xffffffff93a0007d <startup_64+3>: lea
> 0x1e03ec4(%rip),%rsp # 0xffffffff95803f48
> > >
> ^^^^^^^^^^^^^^^^^^^^
> > > 2) 0xffffffff93a0465a <rewind_stack_and_make_dead+10>: lea
> -0xa8(%rax),%rsp
> > >
> ^^^^
> > > > 0xffffffff95803f48 is the value of (__end_init_task -
> > > > TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE), and 0xa8 is the value of
> > > > PTREGS_SIZE; __end_init_task can be obtained by symbol lookup.
> >
> > Calculating the value of TOP_OF_KERNEL_STACK_PADDING looks good,
> > but it relies heavily on the compiler.
> > Normally we would not use this approach unless there is no other choice.
> >
> > How about the following changes? Although it doesn't handle the case
> that the value is dynamic, let's see
> > how to change in the kernel in future, and then consider how to reflect
> it in crash-utility.
> >
> Sure, looks good to me, so let's go with this, and update it later
> when kernel changes.
>
Ok. Thanks, Tao.
Applied with minor changes:
https://github.com/crash-utility/crash/commit/48764a14bc5856f0b0bb3068533...
Lianbo
>
> Thanks,
> Tao Liu
>
> >
> > diff --git a/defs.h b/defs.h
> > index 01f316e67dde..42d875965256 100644
> > --- a/defs.h
> > +++ b/defs.h
> > @@ -2414,6 +2414,7 @@ struct size_table { /* stash of
> commonly-used sizes */
> > long maple_tree;
> > long maple_node;
> > long module_memory;
> > + long fred_frame;
> > };
> >
> > struct array_table {
> > diff --git a/kernel.c b/kernel.c
> > index 1728b70c1b5c..cd3d6044cc9a 100644
> > --- a/kernel.c
> > +++ b/kernel.c
> > @@ -668,6 +668,7 @@ kernel_init()
> > STRUCT_SIZE_INIT(softirq_state, "softirq_state");
> > STRUCT_SIZE_INIT(softirq_action, "softirq_action");
> > STRUCT_SIZE_INIT(desc_struct, "desc_struct");
> > + STRUCT_SIZE_INIT(fred_frame, "fred_frame");
> >
> > STRUCT_SIZE_INIT(char_device_struct, "char_device_struct");
> > if (VALID_STRUCT(char_device_struct)) {
> > diff --git a/x86_64.c b/x86_64.c
> > index 0c21eb827e4a..6777c93e6b47 100644
> > --- a/x86_64.c
> > +++ b/x86_64.c
> > @@ -4086,10 +4086,11 @@ in_exception_stack:
> >
> > if (!irq_eframe && !is_kernel_thread(bt->tc->task) &&
> > (GET_STACKBASE(bt->tc->task) == bt->stackbase)) {
> > + long stack_padding_size = SIZE(fred_frame) > 0 ? (2*8) :
> 0;
> > user_mode_eframe = bt->stacktop - SIZE(pt_regs);
> > if (last_process_stack_eframe < user_mode_eframe)
> > x86_64_exception_frame(EFRAME_PRINT, 0,
> bt->stackbuf +
> > - (bt->stacktop - bt->stackbase) -
> SIZE(pt_regs),
> > + (bt->stacktop - stack_padding_size -
> bt->stackbase) - SIZE(pt_regs),
> > bt, ofp);
> > }
> >
> > @@ -4407,10 +4408,11 @@ in_exception_stack:
> >
> > if (!irq_eframe && !is_kernel_thread(bt->tc->task) &&
> > (GET_STACKBASE(bt->tc->task) == bt->stackbase)) {
> > + long stack_padding_size = SIZE(fred_frame) > 0 ? (2*8) :
> 0;
> > user_mode_eframe = bt->stacktop - SIZE(pt_regs);
> > if (last_process_stack_eframe < user_mode_eframe)
> > x86_64_exception_frame(EFRAME_PRINT, 0,
> bt->stackbuf +
> > - (bt->stacktop - bt->stackbase) -
> SIZE(pt_regs),
> > + (bt->stacktop - stack_padding_size -
> bt->stackbase) - SIZE(pt_regs),
> > bt, ofp);
> > }
> >
> > Thanks
> > Lianbo
> >
> > > [1]:
> https://lore.kernel.org/all/170668568261.398.10403890006820046961.tip-bot...
> > > [2]:
> https://elixir.bootlin.com/linux/v6.9.1/source/arch/x86/include/asm/threa...
> > >
> > > Signed-off-by: Tao Liu<ltao(a)redhat.com>
> > > ---
> > > x86_64.c | 84
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
> > > 1 file changed, 82 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/x86_64.c b/x86_64.c
> > > index 0c21eb8..43a31c2 100644
> > > --- a/x86_64.c
> > > +++ b/x86_64.c
> > > @@ -137,6 +137,7 @@ static orc_entry *orc_find(ulong);
> > > static orc_entry *orc_module_find(ulong);
> > > static ulong ip_table_to_vaddr(ulong);
> > > static void orc_dump(ulong);
> > > +static long top_of_kernel_stack_padding(void);
> > >
> > > struct machine_specific x86_64_machine_specific = { 0 };
> > >
> > > @@ -4089,7 +4090,8 @@ in_exception_stack:
> > > user_mode_eframe = bt->stacktop - SIZE(pt_regs);
> > > if (last_process_stack_eframe < user_mode_eframe)
> > > x86_64_exception_frame(EFRAME_PRINT, 0,
> bt->stackbuf +
> > > - (bt->stacktop - bt->stackbase) -
> SIZE(pt_regs),
> > > + (bt->stacktop - bt->stackbase) -
> SIZE(pt_regs) -
> > > + top_of_kernel_stack_padding(),
> > > bt, ofp);
> > > }
> > >
> > > @@ -4410,7 +4412,8 @@ in_exception_stack:
> > > user_mode_eframe = bt->stacktop - SIZE(pt_regs);
> > > if (last_process_stack_eframe < user_mode_eframe)
> > > x86_64_exception_frame(EFRAME_PRINT, 0,
> bt->stackbuf +
> > > - (bt->stacktop - bt->stackbase) -
> SIZE(pt_regs),
> > > + (bt->stacktop - bt->stackbase) -
> SIZE(pt_regs) -
> > > + top_of_kernel_stack_padding(),
> > > bt, ofp);
> > > }
> > >
> > > @@ -9541,4 +9544,81 @@ x86_64_swp_offset(ulong entry)
> > > return SWP_OFFSET(entry);
> > > }
> > >
> > > +static long
> > > +top_of_kernel_stack_padding(void)
> > > +{
> > > + char buf1[BUFSIZE];
> > > + char *cursor;
> > > + long final_value, ptregs_size_value;
> > > + char *arglist[MAXARGS];
> > > + bool found = FALSE;
> > > +
> > > + static long kernel_stack_padding = -1;
> > > +
> > > + if (kernel_stack_padding >= 0)
> > > + return kernel_stack_padding;
> > > +
> > > + /*
> > > + * startup_64:
> > > + * ...
> > > + * mov %rsi,%r15
> > > + * leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING -
> PTREGS_SIZE)(%rip), %rsp
> > > + */
> > > + sprintf(buf1, "disass /r startup_64");
> > > + open_tmpfile2();
> > > + if (!gdb_pass_through(buf1, pc->tmpfile2, GNU_RETURN_ON_ERROR)) {
> > > + kernel_stack_padding = 0;
> > > + goto out;
> > > + }
> > > +
> > > + rewind(pc->tmpfile2);
> > > + while (fgets(buf1, BUFSIZE, pc->tmpfile2) && !found) {
> > > + // machine code of "mov %rsi,%r15"
> > > + if (strstr(buf1, "49 89 f7"))
> > > + found = TRUE;
> > > + }
> > > + if (!found || !(cursor = strstr(buf1, "# 0x"))) {
> > > + kernel_stack_padding = 0;
> > > + goto out;
> > > + }
> > > +
> > > + parse_line(cursor, arglist);
> > > + final_value = stol(arglist[1], FAULT_ON_ERROR, NULL);
> > > +
> > > + /*
> > > + * rewind_stack_and_make_dead:
> > > + * ...
> > > + * leaq -PTREGS_SIZE(%rax), %rsp
> > > + */
> > > + found = FALSE;
> > > + rewind(pc->tmpfile2);
> > > + sprintf(buf1, "disass rewind_stack_and_make_dead");
> > > + if (!gdb_pass_through(buf1, pc->tmpfile2, GNU_RETURN_ON_ERROR)) {
> > > + kernel_stack_padding = 0;
> > > + goto out;
> > > + }
> > > + rewind(pc->tmpfile2);
> > > + while (fgets(buf1, BUFSIZE, pc->tmpfile2)) {
> > > + // find leaq -PTREGS_SIZE(%rax), %rsp
> > > + if (strstr(buf1, "lea") && (cursor = strstr(buf1,
> "-0x"))) {
> > > + parse_line(cursor, arglist);
> > > + char *p = strchr(arglist[0], '(');
> > > + *p = '\0';
> > > + ptregs_size_value = stol(arglist[0] + 1,
> FAULT_ON_ERROR, NULL);
> > > + found = TRUE;
> > > + break;
> > > + }
> > > + }
> > > + if (!found) {
> > > + kernel_stack_padding = 0;
> > > + goto out;
> > > + }
> > > +
> > > + struct syment *s = symbol_search("__end_init_task");
> > > + kernel_stack_padding = s->value - final_value -
> ptregs_size_value;
> > > +out:
> > > + close_tmpfile2();
> > > + return kernel_stack_padding;
> > > +}
> > > +
> > > #endif /* X86_64 */
> > > -- 2.40.1
> >
>
6 months, 1 week
[PATCH 1/2] X86 64: fix for crash session loading failure
by Lianbo Jiang
Kernel commit 223b5e57d0d5 ("mm/execmem, arch: convert remaining
overrides of module_alloc to execmem") causes crash session loading
to fail as below:
# ./crash -s
crash: seek error: kernel virtual address: ffffffff826bb418 type: "page_offset_base"
For the x86_64 architecture, crash currently searches for the symbol
"module_load_offset" to determine whether KASLR is enabled, and enters
the relevant code block. But the symbol "module_load_offset"
was removed in Linux v6.10-rc1, which causes the current
failure.
And this issue can occur with live debugging and core dump file
debugging.
Let's check the symbol "kaslr_regions" instead of "module_load_offset"
to fix it.
Signed-off-by: Lianbo Jiang <lijiang(a)redhat.com>
---
symbols.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/symbols.c b/symbols.c
index b7627a83587a..1cdf21d86d69 100644
--- a/symbols.c
+++ b/symbols.c
@@ -619,9 +619,9 @@ strip_symbol_end(const char *name, char *buf)
* or in /proc/kallsyms on a live system.
*
* Setting KASLR_CHECK will trigger a search for "module_load_offset"
- * during the initial symbol sort operation, and if found, will
- * set (RELOC_AUTO|KASLR). On live systems, the search is done
- * here by checking /proc/kallsyms.
+ * or "kaslr_regions" during the initial symbol sort operation, and
+ * if found, will set (RELOC_AUTO|KASLR). On live systems, the search
+ * is done here by checking /proc/kallsyms.
*/
static void
kaslr_init(void)
@@ -646,7 +646,8 @@ kaslr_init(void)
st->_stext_vmlinux = UNINITIALIZED;
if (ACTIVE() && /* Linux 3.15 */
- (symbol_value_from_proc_kallsyms("module_load_offset") != BADVAL)) {
+ ((symbol_value_from_proc_kallsyms("kaslr_regions") != BADVAL) ||
+ (symbol_value_from_proc_kallsyms("module_load_offset") != BADVAL))) {
kt->flags2 |= (RELOC_AUTO|KASLR);
st->_stext_vmlinux = UNINITIALIZED;
}
@@ -14247,7 +14248,9 @@ numeric_forward(const void *P_x, const void *P_y)
st->_stext_vmlinux = valueof(y);
}
if (kt->flags2 & KASLR_CHECK) {
- if (STREQ(x->name, "module_load_offset") ||
+ if (STREQ(x->name, "kaslr_regions") ||
+ STREQ(y->name, "kaslr_regions") ||
+ STREQ(x->name, "module_load_offset") ||
STREQ(y->name, "module_load_offset")) {
kt->flags2 &= ~KASLR_CHECK;
kt->flags2 |= (RELOC_AUTO|KASLR);
--
2.45.1
6 months, 1 week
Re: [PATCH] Fix "kmem -v" option on Linux 6.9 and later kernels
by Lianbo Jiang
Hi, Kazu
Thank you for the fix.
On 6/5/24 6:05 PM, devel-request(a)lists.crash-utility.osci.io wrote:
> Date: Wed, 5 Jun 2024 07:30:03 +0000
> From: HAGIO KAZUHITO(萩尾 一仁)<k-hagio-ab(a)nec.com>
> Subject: [Crash-utility] [PATCH] Fix "kmem -v" option on Linux 6.9 and
> later kernels
> To:"devel(a)lists.crash-utility.osci.io"
> <devel(a)lists.crash-utility.osci.io>
> Message-ID:<1717572599-30426-1-git-send-email-k-hagio-ab(a)nec.com>
> Content-Type: text/plain; charset="iso-2022-jp"
>
> The following kernel commits removed vmap_area_list and vmap_area_root
> rb-tree, and introduced vmap_nodes.
>
> 55c49fee57af mm/vmalloc: remove vmap_area_list
> d093602919ad mm: vmalloc: remove global vmap_area_root rb-tree
>
> Without the patch, the "kmem -v" option and functions that use
> dump_vmlist() fail with or without an error:
>
> crash> kmem -v
> VM_STRUCT ADDRESS RANGE SIZE
> kmem: invalid kernel virtual address: ccccccccccccccd4 type: "vmlist addr"
>
> crash> kmem -v
> crash>
>
> Signed-off-by: Kazuhito Hagio<k-hagio-ab(a)nec.com>
> ---
> defs.h | 4 ++
> memory.c | 135 +++++++++++++++++++++++++++++++++++++++++++++---------
> symbols.c | 3 ++
> 3 files changed, 120 insertions(+), 22 deletions(-)
>
> diff --git a/defs.h b/defs.h
> index 01f316e67dde..95de33188070 100644
> --- a/defs.h
> +++ b/defs.h
> @@ -2240,6 +2240,8 @@ struct offset_table { /* stash of commonly-used offsets */
> long mnt_namespace_nr_mounts;
> long mount_mnt_node;
> long log_caller_id;
> + long vmap_node_busy;
> + long rb_list_head;
> };
>
> struct size_table { /* stash of commonly-used sizes */
> @@ -2414,6 +2416,7 @@ struct size_table { /* stash of commonly-used sizes */
> long maple_tree;
> long maple_node;
> long module_memory;
> + long vmap_node;
> };
>
> struct array_table {
> @@ -2678,6 +2681,7 @@ struct vm_table { /* kernel VM-related data */
> #define SLAB_OVERLOAD_PAGE (0x8000000)
> #define SLAB_CPU_CACHE (0x10000000)
> #define SLAB_ROOT_CACHES (0x20000000)
> +#define USE_VMAP_NODES (0x40000000)
>
> #define IS_FLATMEM() (vt->flags & FLATMEM)
> #define IS_DISCONTIGMEM() (vt->flags & DISCONTIGMEM)
> diff --git a/memory.c b/memory.c
> index 34ed646b5d1e..acb8507cfb75 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -235,6 +235,7 @@ static void dump_slab_objects(struct meminfo *);
> static void dump_slab_objects_percpu(struct meminfo *);
> static void dump_vmlist(struct meminfo *);
> static void dump_vmap_area(struct meminfo *);
> +static int get_vmap_area_list_from_nodes(ulong **);
> static int dump_page_lists(struct meminfo *);
> static void dump_kmeminfo(void);
> static int page_to_phys(ulong, physaddr_t *);
> @@ -433,9 +434,15 @@ vm_init(void)
> if (VALID_MEMBER(vmap_area_va_start) &&
> VALID_MEMBER(vmap_area_va_end) &&
> VALID_MEMBER(vmap_area_list) &&
> - VALID_MEMBER(vmap_area_vm) &&
> - kernel_symbol_exists("vmap_area_list"))
> - vt->flags |= USE_VMAP_AREA;
> + VALID_MEMBER(vmap_area_vm)) {
> + if (kernel_symbol_exists("vmap_nodes")) {
> + STRUCT_SIZE_INIT(vmap_node, "vmap_node");
> + MEMBER_OFFSET_INIT(vmap_node_busy, "vmap_node", "busy");
> + MEMBER_OFFSET_INIT(rb_list_head, "rb_list", "head");
> + vt->flags |= USE_VMAP_NODES;
> + } else if (kernel_symbol_exists("vmap_area_list"))
> + vt->flags |= USE_VMAP_AREA;
> + }
>
> if (kernel_symbol_exists("hstates")) {
> STRUCT_SIZE_INIT(hstate, "hstate");
> @@ -8957,7 +8964,7 @@ dump_vmlist(struct meminfo *vi)
> physaddr_t paddr;
> int mod_vmlist;
>
> - if (vt->flags & USE_VMAP_AREA) {
> + if (vt->flags & (USE_VMAP_AREA|USE_VMAP_NODES)) {
> dump_vmap_area(vi);
> return;
> }
> @@ -9067,6 +9074,77 @@ next_entry:
> vi->retval = verified;
> }
>
> +static int
> +sort_by_va_start(const void *arg1, const void *arg2)
> +{
> + ulong va_start1, va_start2;
> +
> + readmem(*(ulong *)arg1 + OFFSET(vmap_area_va_start), KVADDR, &va_start1,
> + sizeof(void *), "vmap_area.va_start", FAULT_ON_ERROR);
> + readmem(*(ulong *)arg2 + OFFSET(vmap_area_va_start), KVADDR, &va_start2,
> + sizeof(void *), "vmap_area.va_start", FAULT_ON_ERROR);
> +
> + return va_start1 < va_start2 ? -1 : (va_start1 == va_start2 ? 0 : 1);
> +}
> +
> +/* Linux 6.9 and later kernels use "vmap_nodes". */
> +static int
> +get_vmap_area_list_from_nodes(ulong **list_ptr)
> +{
> + int i, cnt, c;
> + struct list_data list_data, *ld = &list_data;
> + uint nr_vmap_nodes;
> + ulong vmap_nodes, list_head;
> + ulong *list, *ptr;
> +
> + get_symbol_data("nr_vmap_nodes", sizeof(uint), &nr_vmap_nodes);
> + get_symbol_data("vmap_nodes", sizeof(ulong), &vmap_nodes);
> +
> + /* count up all vmap_areas. */
> + cnt = 0;
> + for (i = 0; i < nr_vmap_nodes; i++) {
> + BZERO(ld, sizeof(struct list_data));
> + list_head = vmap_nodes + SIZE(vmap_node) * i +
> + OFFSET(vmap_node_busy) + OFFSET(rb_list_head);
> + readmem(list_head, KVADDR, &ld->start, sizeof(void *),
> + "rb_list.head", FAULT_ON_ERROR);
> + ld->list_head_offset = OFFSET(vmap_area_list);
> + ld->end = list_head;
> + c = do_list(ld);
> + if (c < 0)
> + return -1;
> +
> + cnt += c;
> + }
> +
> + list = ptr = (ulong *)GETBUF(sizeof(void *) * cnt);
> +
> + /* gather all vmap_areas into a list. */
> + for (i = 0; i < nr_vmap_nodes; i++) {
> + BZERO(ld, sizeof(struct list_data));
> + ld->flags = LIST_ALLOCATE;
> + list_head = vmap_nodes + SIZE(vmap_node) * i +
> + OFFSET(vmap_node_busy) + OFFSET(rb_list_head);
> + readmem(list_head, KVADDR, &ld->start, sizeof(void *),
> + "rb_list.head", FAULT_ON_ERROR);
> + ld->list_head_offset = OFFSET(vmap_area_list);
> + ld->end = list_head;
> + c = do_list(ld);
> + if (c < 0)
> + return -1;
> +
> + memcpy(ptr, ld->list_ptr, sizeof(void *) * c);
> + ptr += c;
> +
> + FREEBUF(ld->list_ptr);
> + }
The above two for-loop code blocks look somewhat duplicated, but I
don't have a better approach either.
Let's go with this, so for the patch: Ack.
Thanks
Lianbo
> +
> + qsort(list, cnt, sizeof(void *), sort_by_va_start);
> +
> + *list_ptr = list;
> + return cnt;
> +}
> +
> static void
> dump_vmap_area(struct meminfo *vi)
> {
> @@ -9080,26 +9158,37 @@ dump_vmap_area(struct meminfo *vi)
> char buf2[BUFSIZE];
> char buf3[BUFSIZE];
> char buf4[BUFSIZE];
> + ulong *list_ptr;
>
> #define VM_VM_AREA 0x4 /* mm/vmalloc.c */
>
> - vmap_area_buf = GETBUF(SIZE(vmap_area));
> start = count = verified = size = 0;
>
> - ld = &list_data;
> - BZERO(ld, sizeof(struct list_data));
> - ld->flags = LIST_HEAD_FORMAT|LIST_HEAD_POINTER|LIST_ALLOCATE;
> - get_symbol_data("vmap_area_list", sizeof(void *), &ld->start);
> - ld->list_head_offset = OFFSET(vmap_area_list);
> - ld->end = symbol_value("vmap_area_list");
> - cnt = do_list(ld);
> - if (cnt < 0) {
> - FREEBUF(vmap_area_buf);
> - error(WARNING, "invalid/corrupt vmap_area_list\n");
> - vi->retval = 0;
> - return;
> + if (vt->flags & USE_VMAP_NODES) {
> + cnt = get_vmap_area_list_from_nodes(&list_ptr);
> + if (cnt < 0) {
> + error(WARNING, "invalid/corrupt vmap_nodes.busy list\n");
> + vi->retval = 0;
> + return;
> + }
> + } else {
> + ld = &list_data;
> + BZERO(ld, sizeof(struct list_data));
> + ld->flags = LIST_HEAD_FORMAT|LIST_HEAD_POINTER|LIST_ALLOCATE;
> + get_symbol_data("vmap_area_list", sizeof(void *), &ld->start);
> + ld->list_head_offset = OFFSET(vmap_area_list);
> + ld->end = symbol_value("vmap_area_list");
> + cnt = do_list(ld);
> + if (cnt < 0) {
> + error(WARNING, "invalid/corrupt vmap_area_list\n");
> + vi->retval = 0;
> + return;
> + }
> + list_ptr = ld->list_ptr;
> }
>
> + vmap_area_buf = GETBUF(SIZE(vmap_area));
> +
> for (i = 0; i < cnt; i++) {
> if (!(pc->curcmd_flags & HEADER_PRINTED) && (i == 0) &&
> !(vi->flags & (GET_HIGHEST|GET_PHYS_TO_VMALLOC|
> @@ -9116,7 +9205,7 @@ dump_vmap_area(struct meminfo *vi)
> pc->curcmd_flags |= HEADER_PRINTED;
> }
>
> - readmem(ld->list_ptr[i], KVADDR, vmap_area_buf,
> + readmem(list_ptr[i], KVADDR, vmap_area_buf,
> SIZE(vmap_area), "vmap_area struct", FAULT_ON_ERROR);
>
> if (VALID_MEMBER(vmap_area_flags) &&
> @@ -9158,7 +9247,7 @@ dump_vmap_area(struct meminfo *vi)
> }
> fprintf(fp, "%s%s %s%s %s - %s %7ld\n",
> mkstring(buf1,VADDR_PRLEN, LONG_HEX|CENTER|LJUST,
> - MKSTR(ld->list_ptr[i])), space(MINSPACE-1),
> + MKSTR(list_ptr[i])), space(MINSPACE-1),
> mkstring(buf2,VADDR_PRLEN, LONG_HEX|CENTER|LJUST,
> MKSTR(vm_struct)), space(MINSPACE-1),
> mkstring(buf3, VADDR_PRLEN, LONG_HEX|RJUST,
> @@ -9179,14 +9268,14 @@ dump_vmap_area(struct meminfo *vi)
> if (vi->flags & GET_PHYS_TO_VMALLOC) {
> vi->retval = pcheck +
> PAGEOFFSET(vi->spec_addr);
> - FREEBUF(ld->list_ptr);
> + FREEBUF(list_ptr);
> return;
> } else
> fprintf(fp,
> "%s%s %s%s %s - %s %7ld\n",
> mkstring(buf1,VADDR_PRLEN,
> LONG_HEX|CENTER|LJUST,
> - MKSTR(ld->list_ptr[i])),
> + MKSTR(list_ptr[i])),
> space(MINSPACE-1),
> mkstring(buf2, VADDR_PRLEN,
> LONG_HEX|CENTER|LJUST,
> @@ -9204,7 +9293,7 @@ dump_vmap_area(struct meminfo *vi)
> }
>
> FREEBUF(vmap_area_buf);
> - FREEBUF(ld->list_ptr);
> + FREEBUF(list_ptr);
>
> if (vi->flags & GET_HIGHEST)
> vi->retval = start+size;
> @@ -14001,6 +14090,8 @@ dump_vm_table(int verbose)
> fprintf(fp, "%sSLAB_ROOT_CACHES", others++ ? "|" : "");\
> if (vt->flags & USE_VMAP_AREA)
> fprintf(fp, "%sUSE_VMAP_AREA", others++ ? "|" : "");\
> + if (vt->flags & USE_VMAP_NODES)
> + fprintf(fp, "%sUSE_VMAP_NODES", others++ ? "|" : "");\
> if (vt->flags & CONFIG_NUMA)
> fprintf(fp, "%sCONFIG_NUMA", others++ ? "|" : "");\
> if (vt->flags & VM_EVENT)
> diff --git a/symbols.c b/symbols.c
> index b7627a83587a..ded34412ff41 100644
> --- a/symbols.c
> +++ b/symbols.c
> @@ -10167,6 +10167,8 @@ dump_offset_table(char *spec, ulong makestruct)
> fprintf(fp, " vmap_area_flags: %ld\n",
> OFFSET(vmap_area_flags));
> fprintf(fp, " vmap_area_purge_list: %ld\n", OFFSET(vmap_area_purge_list));
> + fprintf(fp, " vmap_node_busy: %ld\n", OFFSET(vmap_node_busy));
> + fprintf(fp, " rb_list_head: %ld\n", OFFSET(rb_list_head));
>
> fprintf(fp, " module_size_of_struct: %ld\n",
> OFFSET(module_size_of_struct));
> @@ -12040,6 +12042,7 @@ dump_offset_table(char *spec, ulong makestruct)
> SIZE(task_group));
> fprintf(fp, " vmap_area: %ld\n",
> SIZE(vmap_area));
> + fprintf(fp, " vmap_node: %ld\n", SIZE(vmap_node));
> fprintf(fp, " hrtimer_clock_base: %ld\n",
> SIZE(hrtimer_clock_base));
> fprintf(fp, " hrtimer_base: %ld\n",
> -- 2.31.1
6 months, 1 week
[PATCH] Fix "kmem -v" option on Linux 6.9 and later kernels
by HAGIO KAZUHITO(萩尾 一仁)
The following kernel commits removed vmap_area_list and vmap_area_root
rb-tree, and introduced vmap_nodes.
55c49fee57af mm/vmalloc: remove vmap_area_list
d093602919ad mm: vmalloc: remove global vmap_area_root rb-tree
Without the patch, the "kmem -v" option and functions that use
dump_vmlist() fail with or without an error:
crash> kmem -v
VM_STRUCT ADDRESS RANGE SIZE
kmem: invalid kernel virtual address: ccccccccccccccd4 type: "vmlist addr"
crash> kmem -v
crash>
Signed-off-by: Kazuhito Hagio <k-hagio-ab(a)nec.com>
---
defs.h | 4 ++
memory.c | 135 +++++++++++++++++++++++++++++++++++++++++++++---------
symbols.c | 3 ++
3 files changed, 120 insertions(+), 22 deletions(-)
diff --git a/defs.h b/defs.h
index 01f316e67dde..95de33188070 100644
--- a/defs.h
+++ b/defs.h
@@ -2240,6 +2240,8 @@ struct offset_table { /* stash of commonly-used offsets */
long mnt_namespace_nr_mounts;
long mount_mnt_node;
long log_caller_id;
+ long vmap_node_busy;
+ long rb_list_head;
};
struct size_table { /* stash of commonly-used sizes */
@@ -2414,6 +2416,7 @@ struct size_table { /* stash of commonly-used sizes */
long maple_tree;
long maple_node;
long module_memory;
+ long vmap_node;
};
struct array_table {
@@ -2678,6 +2681,7 @@ struct vm_table { /* kernel VM-related data */
#define SLAB_OVERLOAD_PAGE (0x8000000)
#define SLAB_CPU_CACHE (0x10000000)
#define SLAB_ROOT_CACHES (0x20000000)
+#define USE_VMAP_NODES (0x40000000)
#define IS_FLATMEM() (vt->flags & FLATMEM)
#define IS_DISCONTIGMEM() (vt->flags & DISCONTIGMEM)
diff --git a/memory.c b/memory.c
index 34ed646b5d1e..acb8507cfb75 100644
--- a/memory.c
+++ b/memory.c
@@ -235,6 +235,7 @@ static void dump_slab_objects(struct meminfo *);
static void dump_slab_objects_percpu(struct meminfo *);
static void dump_vmlist(struct meminfo *);
static void dump_vmap_area(struct meminfo *);
+static int get_vmap_area_list_from_nodes(ulong **);
static int dump_page_lists(struct meminfo *);
static void dump_kmeminfo(void);
static int page_to_phys(ulong, physaddr_t *);
@@ -433,9 +434,15 @@ vm_init(void)
if (VALID_MEMBER(vmap_area_va_start) &&
VALID_MEMBER(vmap_area_va_end) &&
VALID_MEMBER(vmap_area_list) &&
- VALID_MEMBER(vmap_area_vm) &&
- kernel_symbol_exists("vmap_area_list"))
- vt->flags |= USE_VMAP_AREA;
+ VALID_MEMBER(vmap_area_vm)) {
+ if (kernel_symbol_exists("vmap_nodes")) {
+ STRUCT_SIZE_INIT(vmap_node, "vmap_node");
+ MEMBER_OFFSET_INIT(vmap_node_busy, "vmap_node", "busy");
+ MEMBER_OFFSET_INIT(rb_list_head, "rb_list", "head");
+ vt->flags |= USE_VMAP_NODES;
+ } else if (kernel_symbol_exists("vmap_area_list"))
+ vt->flags |= USE_VMAP_AREA;
+ }
if (kernel_symbol_exists("hstates")) {
STRUCT_SIZE_INIT(hstate, "hstate");
@@ -8957,7 +8964,7 @@ dump_vmlist(struct meminfo *vi)
physaddr_t paddr;
int mod_vmlist;
- if (vt->flags & USE_VMAP_AREA) {
+ if (vt->flags & (USE_VMAP_AREA|USE_VMAP_NODES)) {
dump_vmap_area(vi);
return;
}
@@ -9067,6 +9074,77 @@ next_entry:
vi->retval = verified;
}
+static int
+sort_by_va_start(const void *arg1, const void *arg2)
+{
+ ulong va_start1, va_start2;
+
+ readmem(*(ulong *)arg1 + OFFSET(vmap_area_va_start), KVADDR, &va_start1,
+ sizeof(void *), "vmap_area.va_start", FAULT_ON_ERROR);
+ readmem(*(ulong *)arg2 + OFFSET(vmap_area_va_start), KVADDR, &va_start2,
+ sizeof(void *), "vmap_area.va_start", FAULT_ON_ERROR);
+
+ return va_start1 < va_start2 ? -1 : (va_start1 == va_start2 ? 0 : 1);
+}
+
+/* Linux 6.9 and later kernels use "vmap_nodes". */
+static int
+get_vmap_area_list_from_nodes(ulong **list_ptr)
+{
+ int i, cnt, c;
+ struct list_data list_data, *ld = &list_data;
+ uint nr_vmap_nodes;
+ ulong vmap_nodes, list_head;
+ ulong *list, *ptr;
+
+ get_symbol_data("nr_vmap_nodes", sizeof(uint), &nr_vmap_nodes);
+ get_symbol_data("vmap_nodes", sizeof(ulong), &vmap_nodes);
+
+ /* count up all vmap_areas. */
+ cnt = 0;
+ for (i = 0; i < nr_vmap_nodes; i++) {
+ BZERO(ld, sizeof(struct list_data));
+ list_head = vmap_nodes + SIZE(vmap_node) * i +
+ OFFSET(vmap_node_busy) + OFFSET(rb_list_head);
+ readmem(list_head, KVADDR, &ld->start, sizeof(void *),
+ "rb_list.head", FAULT_ON_ERROR);
+ ld->list_head_offset = OFFSET(vmap_area_list);
+ ld->end = list_head;
+ c = do_list(ld);
+ if (c < 0)
+ return -1;
+
+ cnt += c;
+ }
+
+ list = ptr = (ulong *)GETBUF(sizeof(void *) * cnt);
+
+ /* gather all vmap_areas into a list. */
+ for (i = 0; i < nr_vmap_nodes; i++) {
+ BZERO(ld, sizeof(struct list_data));
+ ld->flags = LIST_ALLOCATE;
+ list_head = vmap_nodes + SIZE(vmap_node) * i +
+ OFFSET(vmap_node_busy) + OFFSET(rb_list_head);
+ readmem(list_head, KVADDR, &ld->start, sizeof(void *),
+ "rb_list.head", FAULT_ON_ERROR);
+ ld->list_head_offset = OFFSET(vmap_area_list);
+ ld->end = list_head;
+ c = do_list(ld);
+ if (c < 0)
+ return -1;
+
+ memcpy(ptr, ld->list_ptr, sizeof(void *) * c);
+ ptr += c;
+
+ FREEBUF(ld->list_ptr);
+ }
+
+ qsort(list, cnt, sizeof(void *), sort_by_va_start);
+
+ *list_ptr = list;
+ return cnt;
+}
+
static void
dump_vmap_area(struct meminfo *vi)
{
@@ -9080,26 +9158,37 @@ dump_vmap_area(struct meminfo *vi)
char buf2[BUFSIZE];
char buf3[BUFSIZE];
char buf4[BUFSIZE];
+ ulong *list_ptr;
#define VM_VM_AREA 0x4 /* mm/vmalloc.c */
- vmap_area_buf = GETBUF(SIZE(vmap_area));
start = count = verified = size = 0;
- ld = &list_data;
- BZERO(ld, sizeof(struct list_data));
- ld->flags = LIST_HEAD_FORMAT|LIST_HEAD_POINTER|LIST_ALLOCATE;
- get_symbol_data("vmap_area_list", sizeof(void *), &ld->start);
- ld->list_head_offset = OFFSET(vmap_area_list);
- ld->end = symbol_value("vmap_area_list");
- cnt = do_list(ld);
- if (cnt < 0) {
- FREEBUF(vmap_area_buf);
- error(WARNING, "invalid/corrupt vmap_area_list\n");
- vi->retval = 0;
- return;
+ if (vt->flags & USE_VMAP_NODES) {
+ cnt = get_vmap_area_list_from_nodes(&list_ptr);
+ if (cnt < 0) {
+ error(WARNING, "invalid/corrupt vmap_nodes.busy list\n");
+ vi->retval = 0;
+ return;
+ }
+ } else {
+ ld = &list_data;
+ BZERO(ld, sizeof(struct list_data));
+ ld->flags = LIST_HEAD_FORMAT|LIST_HEAD_POINTER|LIST_ALLOCATE;
+ get_symbol_data("vmap_area_list", sizeof(void *), &ld->start);
+ ld->list_head_offset = OFFSET(vmap_area_list);
+ ld->end = symbol_value("vmap_area_list");
+ cnt = do_list(ld);
+ if (cnt < 0) {
+ error(WARNING, "invalid/corrupt vmap_area_list\n");
+ vi->retval = 0;
+ return;
+ }
+ list_ptr = ld->list_ptr;
}
+ vmap_area_buf = GETBUF(SIZE(vmap_area));
+
for (i = 0; i < cnt; i++) {
if (!(pc->curcmd_flags & HEADER_PRINTED) && (i == 0) &&
!(vi->flags & (GET_HIGHEST|GET_PHYS_TO_VMALLOC|
@@ -9116,7 +9205,7 @@ dump_vmap_area(struct meminfo *vi)
pc->curcmd_flags |= HEADER_PRINTED;
}
- readmem(ld->list_ptr[i], KVADDR, vmap_area_buf,
+ readmem(list_ptr[i], KVADDR, vmap_area_buf,
SIZE(vmap_area), "vmap_area struct", FAULT_ON_ERROR);
if (VALID_MEMBER(vmap_area_flags) &&
@@ -9158,7 +9247,7 @@ dump_vmap_area(struct meminfo *vi)
}
fprintf(fp, "%s%s %s%s %s - %s %7ld\n",
mkstring(buf1,VADDR_PRLEN, LONG_HEX|CENTER|LJUST,
- MKSTR(ld->list_ptr[i])), space(MINSPACE-1),
+ MKSTR(list_ptr[i])), space(MINSPACE-1),
mkstring(buf2,VADDR_PRLEN, LONG_HEX|CENTER|LJUST,
MKSTR(vm_struct)), space(MINSPACE-1),
mkstring(buf3, VADDR_PRLEN, LONG_HEX|RJUST,
@@ -9179,14 +9268,14 @@ dump_vmap_area(struct meminfo *vi)
if (vi->flags & GET_PHYS_TO_VMALLOC) {
vi->retval = pcheck +
PAGEOFFSET(vi->spec_addr);
- FREEBUF(ld->list_ptr);
+ FREEBUF(list_ptr);
return;
} else
fprintf(fp,
"%s%s %s%s %s - %s %7ld\n",
mkstring(buf1,VADDR_PRLEN,
LONG_HEX|CENTER|LJUST,
- MKSTR(ld->list_ptr[i])),
+ MKSTR(list_ptr[i])),
space(MINSPACE-1),
mkstring(buf2, VADDR_PRLEN,
LONG_HEX|CENTER|LJUST,
@@ -9204,7 +9293,7 @@ dump_vmap_area(struct meminfo *vi)
}
FREEBUF(vmap_area_buf);
- FREEBUF(ld->list_ptr);
+ FREEBUF(list_ptr);
if (vi->flags & GET_HIGHEST)
vi->retval = start+size;
@@ -14001,6 +14090,8 @@ dump_vm_table(int verbose)
fprintf(fp, "%sSLAB_ROOT_CACHES", others++ ? "|" : "");\
if (vt->flags & USE_VMAP_AREA)
fprintf(fp, "%sUSE_VMAP_AREA", others++ ? "|" : "");\
+ if (vt->flags & USE_VMAP_NODES)
+ fprintf(fp, "%sUSE_VMAP_NODES", others++ ? "|" : "");\
if (vt->flags & CONFIG_NUMA)
fprintf(fp, "%sCONFIG_NUMA", others++ ? "|" : "");\
if (vt->flags & VM_EVENT)
diff --git a/symbols.c b/symbols.c
index b7627a83587a..ded34412ff41 100644
--- a/symbols.c
+++ b/symbols.c
@@ -10167,6 +10167,8 @@ dump_offset_table(char *spec, ulong makestruct)
fprintf(fp, " vmap_area_flags: %ld\n",
OFFSET(vmap_area_flags));
fprintf(fp, " vmap_area_purge_list: %ld\n", OFFSET(vmap_area_purge_list));
+ fprintf(fp, " vmap_node_busy: %ld\n", OFFSET(vmap_node_busy));
+ fprintf(fp, " rb_list_head: %ld\n", OFFSET(rb_list_head));
fprintf(fp, " module_size_of_struct: %ld\n",
OFFSET(module_size_of_struct));
@@ -12040,6 +12042,7 @@ dump_offset_table(char *spec, ulong makestruct)
SIZE(task_group));
fprintf(fp, " vmap_area: %ld\n",
SIZE(vmap_area));
+ fprintf(fp, " vmap_node: %ld\n", SIZE(vmap_node));
fprintf(fp, " hrtimer_clock_base: %ld\n",
SIZE(hrtimer_clock_base));
fprintf(fp, " hrtimer_base: %ld\n",
--
2.31.1
6 months, 2 weeks