 
                                        
                                
                         
                        
                                
                                
                                        
                                                
                                        
                                        
                                        Re: [PATCH] x86_64: Add top_of_kernel_stack_padding for kernel stack
                                
                                
                                
                                    
                                        by lijiang
                                    
                                
                                
                                        On Fri, May 31, 2024 at 5:21 PM <devel-request(a)lists.crash-utility.osci.io>
wrote:
> Date: Fri, 31 May 2024 13:14:02 +0800
> From: Tao Liu <ltao(a)redhat.com>
> Subject: [Crash-utility] Re: [PATCH] x86_64: Add
>         top_of_kernel_stack_padding for kernel stack
> To: Lianbo Jiang <lijiang(a)redhat.com>
> Cc: devel(a)lists.crash-utility.osci.io
> Message-ID:
>         <
> CAO7dBbWZgeiJBN2sv-3e00TSU2uO_kMbSZ_NGVCft6xzdj3EMA(a)mail.gmail.com>
> Content-Type: text/plain; charset="UTF-8"
>
> Hi Lianbo,
>
> On Mon, May 27, 2024 at 11:30 AM Lianbo Jiang <lijiang(a)redhat.com> wrote:
> >
> > Hi, Tao
> >
> > Thank you for the fix.
> >
> > On 5/23/24 12:06 PM, devel-request(a)lists.crash-utility.osci.io wrote:
> > > Date: Thu, 23 May 2024 12:06:03 +0800
> > > From: Tao Liu<ltao(a)redhat.com>
> > > Subject: [Crash-utility] [PATCH] x86_64: Add
> > >       top_of_kernel_stack_padding for kernel stack
> > > To:devel@lists.crash-utility.osci.io
> > > Cc: Tao Liu<ltao(a)redhat.com>
> > > Message-ID:<20240523040603.10304-1-ltao(a)redhat.com>
> > > Content-Type: text/plain; charset="US-ASCII"; x-default=true
> > >
> > > With kernel patch [1], x86_64 will add extra padding for kernel stack,
> > > > as a result, the pt_regs will be shifted down by the offset of padding.
> > > Without the patch, the values of registers read from pt_regs will be
> > > incorrect.
> > >
> > > Though currently the TOP_OF_KERNEL_STACK_PADDING is configured by
> > > Kconfig, according to kernel code comment [2], the value may be made
> > > > dynamically later. In addition there might be systems compiled without
> > > > Kconfig available. So in this patch, we will calculate the value of
> > > TOP_OF_KERNEL_STACK_PADDING.
> > >
> > > The calculation is as follows:
> > >
> > > 1) in startup_64(), there is a lea instruction as:
> > >     leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING -
> PTREGS_SIZE)(%rip), %rsp
> > >
> > > 2) in rewind_stack_and_make_dead(), there is a lea instruction as:
> > >     leaq      -PTREGS_SIZE(%rax), %rsp
> > >
> > > The disassembled 2 instructions will be like:
> > >
> > > 1) 0xffffffff93a0007d <startup_64+3>:      lea
> 0x1e03ec4(%rip),%rsp        # 0xffffffff95803f48
> > >
>         ^^^^^^^^^^^^^^^^^^^^
> > > 2) 0xffffffff93a0465a <rewind_stack_and_make_dead+10>:     lea
> -0xa8(%rax),%rsp
> > >
>  ^^^^
> > > 0xffffffff95803f48 is the value of (__end_init_task -
> > > TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE), and 0xa8 is the value of
> > > > PTREGS_SIZE, __end_init_task can be obtained by symbol reading.
> >
> > Calculating the value of TOP_OF_KERNEL_STACK_PADDING looks good,
> but it heavily relies on the compiler.
> > Normally we would not use this approach unless there is no other choice.
> >
> > How about the following changes? Although it doesn't handle the case
> that the value is dynamic, let's see
> > how to change in the kernel in future, and then consider how to reflect
> it in crash-utility.
> >
> Sure, looks good to me, so let's go with this, and update it later
> when kernel changes.
>
Ok. Thanks, Tao.
Applied with minor changes:
https://github.com/crash-utility/crash/commit/48764a14bc5856f0b0bb3068533...
Lianbo
>
> Thanks,
> Tao Liu
>
> >
> > diff --git a/defs.h b/defs.h
> > index 01f316e67dde..42d875965256 100644
> > --- a/defs.h
> > +++ b/defs.h
> > @@ -2414,6 +2414,7 @@ struct size_table {         /* stash of
> commonly-used sizes */
> >         long maple_tree;
> >         long maple_node;
> >         long module_memory;
> > +       long fred_frame;
> >   };
> >
> >   struct array_table {
> > diff --git a/kernel.c b/kernel.c
> > index 1728b70c1b5c..cd3d6044cc9a 100644
> > --- a/kernel.c
> > +++ b/kernel.c
> > @@ -668,6 +668,7 @@ kernel_init()
> >         STRUCT_SIZE_INIT(softirq_state, "softirq_state");
> >         STRUCT_SIZE_INIT(softirq_action, "softirq_action");
> >         STRUCT_SIZE_INIT(desc_struct, "desc_struct");
> > +       STRUCT_SIZE_INIT(fred_frame, "fred_frame");
> >
> >         STRUCT_SIZE_INIT(char_device_struct, "char_device_struct");
> >         if (VALID_STRUCT(char_device_struct)) {
> > diff --git a/x86_64.c b/x86_64.c
> > index 0c21eb827e4a..6777c93e6b47 100644
> > --- a/x86_64.c
> > +++ b/x86_64.c
> > @@ -4086,10 +4086,11 @@ in_exception_stack:
> >
> >           if (!irq_eframe && !is_kernel_thread(bt->tc->task) &&
> >               (GET_STACKBASE(bt->tc->task) == bt->stackbase)) {
> > +               long stack_padding_size = SIZE(fred_frame) > 0 ? (2*8) :
> 0;
> >                 user_mode_eframe = bt->stacktop - SIZE(pt_regs);
> >                 if (last_process_stack_eframe < user_mode_eframe)
> >                         x86_64_exception_frame(EFRAME_PRINT, 0,
> bt->stackbuf +
> > -                               (bt->stacktop - bt->stackbase) -
> SIZE(pt_regs),
> > +                               (bt->stacktop - stack_padding_size -
> bt->stackbase) - SIZE(pt_regs),
> >                                 bt, ofp);
> >         }
> >
> > @@ -4407,10 +4408,11 @@ in_exception_stack:
> >
> >           if (!irq_eframe && !is_kernel_thread(bt->tc->task) &&
> >               (GET_STACKBASE(bt->tc->task) == bt->stackbase)) {
> > +               long stack_padding_size = SIZE(fred_frame) > 0 ? (2*8) :
> 0;
> >                 user_mode_eframe = bt->stacktop - SIZE(pt_regs);
> >                 if (last_process_stack_eframe < user_mode_eframe)
> >                         x86_64_exception_frame(EFRAME_PRINT, 0,
> bt->stackbuf +
> > -                               (bt->stacktop - bt->stackbase) -
> SIZE(pt_regs),
> > +                               (bt->stacktop - stack_padding_size -
> bt->stackbase) - SIZE(pt_regs),
> >                                 bt, ofp);
> >         }
> >
> > Thanks
> > Lianbo
> >
> > > [1]:
> https://lore.kernel.org/all/170668568261.398.10403890006820046961.tip-bot...
> > > [2]:
> https://elixir.bootlin.com/linux/v6.9.1/source/arch/x86/include/asm/threa...
> > >
> > > Signed-off-by: Tao Liu<ltao(a)redhat.com>
> > > ---
> > >   x86_64.c | 84
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
> > >   1 file changed, 82 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/x86_64.c b/x86_64.c
> > > index 0c21eb8..43a31c2 100644
> > > --- a/x86_64.c
> > > +++ b/x86_64.c
> > > @@ -137,6 +137,7 @@ static orc_entry *orc_find(ulong);
> > >   static orc_entry *orc_module_find(ulong);
> > >   static ulong ip_table_to_vaddr(ulong);
> > >   static void orc_dump(ulong);
> > > +static long top_of_kernel_stack_padding(void);
> > >
> > >   struct machine_specific x86_64_machine_specific = { 0 };
> > >
> > > @@ -4089,7 +4090,8 @@ in_exception_stack:
> > >               user_mode_eframe = bt->stacktop - SIZE(pt_regs);
> > >               if (last_process_stack_eframe < user_mode_eframe)
> > >                       x86_64_exception_frame(EFRAME_PRINT, 0,
> bt->stackbuf +
> > > -                             (bt->stacktop - bt->stackbase) -
> SIZE(pt_regs),
> > > +                             (bt->stacktop - bt->stackbase) -
> SIZE(pt_regs) -
> > > +                             top_of_kernel_stack_padding(),
> > >                               bt, ofp);
> > >       }
> > >
> > > @@ -4410,7 +4412,8 @@ in_exception_stack:
> > >               user_mode_eframe = bt->stacktop - SIZE(pt_regs);
> > >               if (last_process_stack_eframe < user_mode_eframe)
> > >                       x86_64_exception_frame(EFRAME_PRINT, 0,
> bt->stackbuf +
> > > -                             (bt->stacktop - bt->stackbase) -
> SIZE(pt_regs),
> > > +                             (bt->stacktop - bt->stackbase) -
> SIZE(pt_regs) -
> > > +                             top_of_kernel_stack_padding(),
> > >                               bt, ofp);
> > >       }
> > >
> > > @@ -9541,4 +9544,81 @@ x86_64_swp_offset(ulong entry)
> > >       return SWP_OFFSET(entry);
> > >   }
> > >
> > > +static long
> > > +top_of_kernel_stack_padding(void)
> > > +{
> > > +     char buf1[BUFSIZE];
> > > +     char *cursor;
> > > +     long final_value, ptregs_size_value;
> > > +     char *arglist[MAXARGS];
> > > +     bool found = FALSE;
> > > +
> > > +     static long kernel_stack_padding = -1;
> > > +
> > > +     if (kernel_stack_padding >= 0)
> > > +             return kernel_stack_padding;
> > > +
> > > +     /*
> > > +     * startup_64:
> > > +     * ...
> > > +     * mov %rsi,%r15
> > > +     * leaq  (__end_init_task - TOP_OF_KERNEL_STACK_PADDING -
> PTREGS_SIZE)(%rip), %rsp
> > > +     */
> > > +     sprintf(buf1, "disass /r startup_64");
> > > +     open_tmpfile2();
> > > +     if (!gdb_pass_through(buf1, pc->tmpfile2, GNU_RETURN_ON_ERROR)) {
> > > +             kernel_stack_padding = 0;
> > > +             goto out;
> > > +     }
> > > +
> > > +     rewind(pc->tmpfile2);
> > > +     while (fgets(buf1, BUFSIZE, pc->tmpfile2) && !found) {
> > > +             // machine code of "mov %rsi,%r15"
> > > +             if (strstr(buf1, "49 89 f7"))
> > > +                     found = TRUE;
> > > +     }
> > > +     if (!found || !(cursor = strstr(buf1, "# 0x"))) {
> > > +             kernel_stack_padding = 0;
> > > +             goto out;
> > > +     }
> > > +
> > > +     parse_line(cursor, arglist);
> > > +     final_value = stol(arglist[1], FAULT_ON_ERROR, NULL);
> > > +
> > > +     /*
> > > +     * rewind_stack_and_make_dead:
> > > +     * ...
> > > +     * leaq  -PTREGS_SIZE(%rax), %rsp
> > > +     */
> > > +     found = FALSE;
> > > +     rewind(pc->tmpfile2);
> > > +     sprintf(buf1, "disass rewind_stack_and_make_dead");
> > > +     if (!gdb_pass_through(buf1, pc->tmpfile2, GNU_RETURN_ON_ERROR)) {
> > > +             kernel_stack_padding = 0;
> > > +             goto out;
> > > +     }
> > > +     rewind(pc->tmpfile2);
> > > +     while (fgets(buf1, BUFSIZE, pc->tmpfile2)) {
> > > +             // find leaq -PTREGS_SIZE(%rax), %rsp
> > > +             if (strstr(buf1, "lea") && (cursor = strstr(buf1,
> "-0x"))) {
> > > +                     parse_line(cursor, arglist);
> > > +                     char *p = strchr(arglist[0], '(');
> > > +                     *p = '\0';
> > > +                     ptregs_size_value = stol(arglist[0] + 1,
> FAULT_ON_ERROR, NULL);
> > > +                     found = TRUE;
> > > +                     break;
> > > +             }
> > > +     }
> > > +     if (!found) {
> > > +             kernel_stack_padding = 0;
> > > +             goto out;
> > > +     }
> > > +
> > > +     struct syment *s = symbol_search("__end_init_task");
> > > +     kernel_stack_padding = s->value - final_value -
> ptregs_size_value;
> > > +out:
> > > +     close_tmpfile2();
> > > +     return kernel_stack_padding;
> > > +}
> > > +
> > >   #endif  /* X86_64 */
> > > -- 2.40.1
> >
>
                                
                         
                        
                                
                                1 year, 4 months
                        
                        
                 
         
 
        
            
        
        
        
                
                        
                                
                                 
                                        
                                
                         
                        
                                
                                
                                        
                                                
                                        
                                        
                                        [PATCH 1/2] X86 64: fix for crash session loading failure
                                
                                
                                
                                    
                                        by Lianbo Jiang
                                    
                                
                                
                                        Kernel commit 223b5e57d0d5 ("mm/execmem, arch: convert remaining
overrides of module_alloc to execmem") causes crash session loading to
fail as below:
  # ./crash -s
  crash: seek error: kernel virtual address: ffffffff826bb418  type: "page_offset_base"
For X86 64 architecture, currently crash will search for symbol
"module_load_offset" to determine if the KASLR is enabled, and go
into the relevant code block. But the symbol "module_load_offset"
has been removed since Linux v6.10-rc1, which caused the current
failure.
And this issue can occur with live debugging and core dump file
debugging.
Let's check the symbol "kaslr_regions" instead of "module_load_offset"
to fix it.
Signed-off-by: Lianbo Jiang <lijiang(a)redhat.com>
---
 symbols.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/symbols.c b/symbols.c
index b7627a83587a..1cdf21d86d69 100644
--- a/symbols.c
+++ b/symbols.c
@@ -619,9 +619,9 @@ strip_symbol_end(const char *name, char *buf)
  *  or in /proc/kallsyms on a live system.
  *
  *  Setting KASLR_CHECK will trigger a search for "module_load_offset"
- *  during the initial symbol sort operation, and if found, will
- *  set (RELOC_AUTO|KASLR).  On live systems, the search is done
- *  here by checking /proc/kallsyms.
+ *  or "kaslr_regions" during the initial symbol sort operation, and
+ *  if found, will set (RELOC_AUTO|KASLR).  On live systems, the search
+ *  is done here by checking /proc/kallsyms.
  */
 static void
 kaslr_init(void)
@@ -646,7 +646,8 @@ kaslr_init(void)
 		st->_stext_vmlinux = UNINITIALIZED;
 
 	if (ACTIVE() &&   /* Linux 3.15 */
-	    (symbol_value_from_proc_kallsyms("module_load_offset") != BADVAL)) {
+	    ((symbol_value_from_proc_kallsyms("kaslr_regions") != BADVAL) ||
+	    (symbol_value_from_proc_kallsyms("module_load_offset") != BADVAL))) {
 		kt->flags2 |= (RELOC_AUTO|KASLR);
 		st->_stext_vmlinux = UNINITIALIZED;
 	}
@@ -14247,7 +14248,9 @@ numeric_forward(const void *P_x, const void *P_y)
 			st->_stext_vmlinux = valueof(y);
 	}
 	if (kt->flags2 & KASLR_CHECK) {
-		if (STREQ(x->name, "module_load_offset") || 
+		if (STREQ(x->name, "kaslr_regions") ||
+		    STREQ(y->name, "kaslr_regions") ||
+		    STREQ(x->name, "module_load_offset") ||
 		    STREQ(y->name, "module_load_offset")) {
 			kt->flags2 &= ~KASLR_CHECK;
 			kt->flags2 |= (RELOC_AUTO|KASLR);
-- 
2.45.1
                                
                         
                        
                                
                                1 year, 4 months
                        
                        
                 
         
 
        
            
        
        
        
                
                        
                                
                                 
                                        
                                
                         
                        
                                
                                
                                        
                                                
                                        
                                        
                                        Re: [PATCH] Fix "kmem -v" option on Linux 6.9 and later kernels
                                
                                
                                
                                    
                                        by Lianbo Jiang
                                    
                                
                                
                                        Hi, Kazu
Thank you for the fix.
On 6/5/24 6:05 PM, devel-request(a)lists.crash-utility.osci.io wrote:
> Date: Wed, 5 Jun 2024 07:30:03 +0000
> From: HAGIO KAZUHITO(萩尾 一仁)<k-hagio-ab(a)nec.com>
> Subject: [Crash-utility] [PATCH] Fix "kmem -v" option on Linux 6.9 and
> 	later kernels
> To:"devel(a)lists.crash-utility.osci.io"
> 	<devel(a)lists.crash-utility.osci.io>
> Message-ID:<1717572599-30426-1-git-send-email-k-hagio-ab(a)nec.com>
> Content-Type: text/plain; charset="iso-2022-jp"
>
> The following kernel commits removed vmap_area_list and vmap_area_root
> rb-tree, and introduced vmap_nodes.
>
>    55c49fee57af mm/vmalloc: remove vmap_area_list
>    d093602919ad mm: vmalloc: remove global vmap_area_root rb-tree
>
> Without the patch, the "kmem -v" option and functions that use
> dump_vmlist() fail with or without an error:
>
>    crash> kmem -v
>       VM_STRUCT                 ADDRESS RANGE               SIZE
>    kmem: invalid kernel virtual address: ccccccccccccccd4  type: "vmlist addr"
>
>    crash> kmem -v
>    crash>
>
> Signed-off-by: Kazuhito Hagio<k-hagio-ab(a)nec.com>
> ---
>   defs.h    |   4 ++
>   memory.c  | 135 +++++++++++++++++++++++++++++++++++++++++++++---------
>   symbols.c |   3 ++
>   3 files changed, 120 insertions(+), 22 deletions(-)
>
> diff --git a/defs.h b/defs.h
> index 01f316e67dde..95de33188070 100644
> --- a/defs.h
> +++ b/defs.h
> @@ -2240,6 +2240,8 @@ struct offset_table {                    /* stash of commonly-used offsets */
>   	long mnt_namespace_nr_mounts;
>   	long mount_mnt_node;
>   	long log_caller_id;
> +	long vmap_node_busy;
> +	long rb_list_head;
>   };
>   
>   struct size_table {         /* stash of commonly-used sizes */
> @@ -2414,6 +2416,7 @@ struct size_table {         /* stash of commonly-used sizes */
>   	long maple_tree;
>   	long maple_node;
>   	long module_memory;
> +	long vmap_node;
>   };
>   
>   struct array_table {
> @@ -2678,6 +2681,7 @@ struct vm_table {                /* kernel VM-related data */
>   #define SLAB_OVERLOAD_PAGE    (0x8000000)
>   #define SLAB_CPU_CACHE       (0x10000000)
>   #define SLAB_ROOT_CACHES     (0x20000000)
> +#define USE_VMAP_NODES       (0x40000000)
>   
>   #define IS_FLATMEM()		(vt->flags & FLATMEM)
>   #define IS_DISCONTIGMEM()	(vt->flags & DISCONTIGMEM)
> diff --git a/memory.c b/memory.c
> index 34ed646b5d1e..acb8507cfb75 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -235,6 +235,7 @@ static void dump_slab_objects(struct meminfo *);
>   static void dump_slab_objects_percpu(struct meminfo *);
>   static void dump_vmlist(struct meminfo *);
>   static void dump_vmap_area(struct meminfo *);
> +static int get_vmap_area_list_from_nodes(ulong **);
>   static int dump_page_lists(struct meminfo *);
>   static void dump_kmeminfo(void);
>   static int page_to_phys(ulong, physaddr_t *);
> @@ -433,9 +434,15 @@ vm_init(void)
>   	if (VALID_MEMBER(vmap_area_va_start) &&
>   	    VALID_MEMBER(vmap_area_va_end) &&
>   	    VALID_MEMBER(vmap_area_list) &&
> -	    VALID_MEMBER(vmap_area_vm) &&
> -	    kernel_symbol_exists("vmap_area_list"))
> -		vt->flags |= USE_VMAP_AREA;
> +	    VALID_MEMBER(vmap_area_vm)) {
> +		if (kernel_symbol_exists("vmap_nodes")) {
> +			STRUCT_SIZE_INIT(vmap_node, "vmap_node");
> +			MEMBER_OFFSET_INIT(vmap_node_busy, "vmap_node", "busy");
> +			MEMBER_OFFSET_INIT(rb_list_head, "rb_list", "head");
> +			vt->flags |= USE_VMAP_NODES;
> +		} else if (kernel_symbol_exists("vmap_area_list"))
> +			vt->flags |= USE_VMAP_AREA;
> +	}
>   
>   	if (kernel_symbol_exists("hstates")) {
>   		STRUCT_SIZE_INIT(hstate, "hstate");
> @@ -8957,7 +8964,7 @@ dump_vmlist(struct meminfo *vi)
>   	physaddr_t paddr;
>   	int mod_vmlist;
>   
> -	if (vt->flags & USE_VMAP_AREA) {
> +	if (vt->flags & (USE_VMAP_AREA|USE_VMAP_NODES)) {
>   		dump_vmap_area(vi);
>   		return;
>   	}
> @@ -9067,6 +9074,77 @@ next_entry:
>   		vi->retval = verified;
>   }
>   
> +static int
> +sort_by_va_start(const void *arg1, const void *arg2)
> +{
> +	ulong va_start1, va_start2;
> +
> +	readmem(*(ulong *)arg1 + OFFSET(vmap_area_va_start), KVADDR, &va_start1,
> +		sizeof(void *), "vmap_area.va_start", FAULT_ON_ERROR);
> +	readmem(*(ulong *)arg2 + OFFSET(vmap_area_va_start), KVADDR, &va_start2,
> +		sizeof(void *), "vmap_area.va_start", FAULT_ON_ERROR);
> +
> +	return va_start1 < va_start2 ? -1 : (va_start1 == va_start2 ? 0 : 1);
> +}
> +
> +/* Linux 6.9 and later kernels use "vmap_nodes". */
> +static int
> +get_vmap_area_list_from_nodes(ulong **list_ptr)
> +{
> +	int i, cnt, c;
> +	struct list_data list_data, *ld = &list_data;
> +	uint nr_vmap_nodes;
> +	ulong vmap_nodes, list_head;
> +	ulong *list, *ptr;
> +
> +	get_symbol_data("nr_vmap_nodes", sizeof(uint), &nr_vmap_nodes);
> +	get_symbol_data("vmap_nodes", sizeof(ulong), &vmap_nodes);
> +
> +	/* count up all vmap_areas. */
> +	cnt = 0;
> +	for (i = 0; i < nr_vmap_nodes; i++) {
> +		BZERO(ld, sizeof(struct list_data));
> +		list_head = vmap_nodes + SIZE(vmap_node) * i +
> +				OFFSET(vmap_node_busy) + OFFSET(rb_list_head);
> +		readmem(list_head, KVADDR, &ld->start, sizeof(void *),
> +				"rb_list.head", FAULT_ON_ERROR);
> +		ld->list_head_offset = OFFSET(vmap_area_list);
> +		ld->end = list_head;
> +		c = do_list(ld);
> +		if (c < 0)
> +			return -1;
> +
> +		cnt += c;
> +	}
> +
> +	list = ptr = (ulong *)GETBUF(sizeof(void *) * cnt);
> +
> +	/* gather all vmap_areas into a list. */
> +	for (i = 0; i < nr_vmap_nodes; i++) {
> +		BZERO(ld, sizeof(struct list_data));
> +		ld->flags = LIST_ALLOCATE;
> +		list_head = vmap_nodes + SIZE(vmap_node) * i +
> +				OFFSET(vmap_node_busy) + OFFSET(rb_list_head);
> +		readmem(list_head, KVADDR, &ld->start, sizeof(void *),
> +				"rb_list.head", FAULT_ON_ERROR);
> +		ld->list_head_offset = OFFSET(vmap_area_list);
> +		ld->end = list_head;
> +		c = do_list(ld);
> +		if (c < 0)
> +			return -1;
> +
> +		memcpy(ptr, ld->list_ptr, sizeof(void *) * c);
> +		ptr += c;
> +
> +		FREEBUF(ld->list_ptr);
> +	}
The above two for-loop code blocks seem a little duplicated, but I
have no better way either.
Let's go with this, so for the patch: Ack.
Thanks
Lianbo
> +
> +	qsort(list, cnt, sizeof(void *), sort_by_va_start);
> +
> +	*list_ptr = list;
> +	return cnt;
> +}
> +
>   static void
>   dump_vmap_area(struct meminfo *vi)
>   {
> @@ -9080,26 +9158,37 @@ dump_vmap_area(struct meminfo *vi)
>   	char buf2[BUFSIZE];
>   	char buf3[BUFSIZE];
>   	char buf4[BUFSIZE];
> +	ulong *list_ptr;
>   
>   #define VM_VM_AREA 0x4   /* mm/vmalloc.c */
>   
> -	vmap_area_buf = GETBUF(SIZE(vmap_area));
>   	start = count = verified = size = 0;
>   
> -	ld = &list_data;
> -	BZERO(ld, sizeof(struct list_data));
> -	ld->flags = LIST_HEAD_FORMAT|LIST_HEAD_POINTER|LIST_ALLOCATE;
> -	get_symbol_data("vmap_area_list", sizeof(void *), &ld->start);
> -	ld->list_head_offset = OFFSET(vmap_area_list);
> -	ld->end = symbol_value("vmap_area_list");
> -	cnt = do_list(ld);
> -	if (cnt < 0) {
> -		FREEBUF(vmap_area_buf);
> -		error(WARNING, "invalid/corrupt vmap_area_list\n");
> -		vi->retval = 0;
> -		return;
> +	if (vt->flags & USE_VMAP_NODES) {
> +		cnt = get_vmap_area_list_from_nodes(&list_ptr);
> +		if (cnt < 0) {
> +			error(WARNING, "invalid/corrupt vmap_nodes.busy list\n");
> +			vi->retval = 0;
> +			return;
> +		}
> +	} else {
> +		ld = &list_data;
> +		BZERO(ld, sizeof(struct list_data));
> +		ld->flags = LIST_HEAD_FORMAT|LIST_HEAD_POINTER|LIST_ALLOCATE;
> +		get_symbol_data("vmap_area_list", sizeof(void *), &ld->start);
> +		ld->list_head_offset = OFFSET(vmap_area_list);
> +		ld->end = symbol_value("vmap_area_list");
> +		cnt = do_list(ld);
> +		if (cnt < 0) {
> +			error(WARNING, "invalid/corrupt vmap_area_list\n");
> +			vi->retval = 0;
> +			return;
> +		}
> +		list_ptr = ld->list_ptr;
>   	}
>   
> +	vmap_area_buf = GETBUF(SIZE(vmap_area));
> +
>   	for (i = 0; i < cnt; i++) {
>   		if (!(pc->curcmd_flags & HEADER_PRINTED) && (i == 0) &&
>   		    !(vi->flags & (GET_HIGHEST|GET_PHYS_TO_VMALLOC|
> @@ -9116,7 +9205,7 @@ dump_vmap_area(struct meminfo *vi)
>   			pc->curcmd_flags |= HEADER_PRINTED;
>   		}
>   
> -		readmem(ld->list_ptr[i], KVADDR, vmap_area_buf,
> +		readmem(list_ptr[i], KVADDR, vmap_area_buf,
>                           SIZE(vmap_area), "vmap_area struct", FAULT_ON_ERROR);
>   
>   		if (VALID_MEMBER(vmap_area_flags) &&
> @@ -9158,7 +9247,7 @@ dump_vmap_area(struct meminfo *vi)
>   			} 	
>   			fprintf(fp, "%s%s  %s%s  %s - %s  %7ld\n",
>   				mkstring(buf1,VADDR_PRLEN, LONG_HEX|CENTER|LJUST,
> -				MKSTR(ld->list_ptr[i])), space(MINSPACE-1),
> +				MKSTR(list_ptr[i])), space(MINSPACE-1),
>   				mkstring(buf2,VADDR_PRLEN, LONG_HEX|CENTER|LJUST,
>   				MKSTR(vm_struct)), space(MINSPACE-1),
>   				mkstring(buf3, VADDR_PRLEN, LONG_HEX|RJUST,
> @@ -9179,14 +9268,14 @@ dump_vmap_area(struct meminfo *vi)
>   					if (vi->flags & GET_PHYS_TO_VMALLOC) {
>   						vi->retval = pcheck +
>   						    PAGEOFFSET(vi->spec_addr);
> -						FREEBUF(ld->list_ptr);
> +						FREEBUF(list_ptr);
>   						return;
>   				        } else
>   						fprintf(fp,
>   						"%s%s  %s%s  %s - %s  %7ld\n",
>   						mkstring(buf1,VADDR_PRLEN,
>   						LONG_HEX|CENTER|LJUST,
> -						MKSTR(ld->list_ptr[i])),
> +						MKSTR(list_ptr[i])),
>   						space(MINSPACE-1),
>   						mkstring(buf2, VADDR_PRLEN,
>   						LONG_HEX|CENTER|LJUST,
> @@ -9204,7 +9293,7 @@ dump_vmap_area(struct meminfo *vi)
>   	}
>   
>   	FREEBUF(vmap_area_buf);
> -	FREEBUF(ld->list_ptr);
> +	FREEBUF(list_ptr);
>   
>   	if (vi->flags & GET_HIGHEST)
>   		vi->retval = start+size;
> @@ -14001,6 +14090,8 @@ dump_vm_table(int verbose)
>   		fprintf(fp, "%sSLAB_ROOT_CACHES", others++ ? "|" : "");\
>   	if (vt->flags & USE_VMAP_AREA)
>   		fprintf(fp, "%sUSE_VMAP_AREA", others++ ? "|" : "");\
> +	if (vt->flags & USE_VMAP_NODES)
> +		fprintf(fp, "%sUSE_VMAP_NODES", others++ ? "|" : "");\
>   	if (vt->flags & CONFIG_NUMA)
>   		fprintf(fp, "%sCONFIG_NUMA", others++ ? "|" : "");\
>   	if (vt->flags & VM_EVENT)
> diff --git a/symbols.c b/symbols.c
> index b7627a83587a..ded34412ff41 100644
> --- a/symbols.c
> +++ b/symbols.c
> @@ -10167,6 +10167,8 @@ dump_offset_table(char *spec, ulong makestruct)
>   	fprintf(fp, "               vmap_area_flags: %ld\n",
>   		OFFSET(vmap_area_flags));
>   	fprintf(fp, "          vmap_area_purge_list: %ld\n", OFFSET(vmap_area_purge_list));
> +	fprintf(fp, "                vmap_node_busy: %ld\n", OFFSET(vmap_node_busy));
> +	fprintf(fp, "                  rb_list_head: %ld\n", OFFSET(rb_list_head));
>   
>   	fprintf(fp, "         module_size_of_struct: %ld\n",
>   		OFFSET(module_size_of_struct));
> @@ -12040,6 +12042,7 @@ dump_offset_table(char *spec, ulong makestruct)
>   		SIZE(task_group));
>   	fprintf(fp, "                     vmap_area: %ld\n",
>   		SIZE(vmap_area));
> +	fprintf(fp, "                     vmap_node: %ld\n", SIZE(vmap_node));
>   	fprintf(fp, "            hrtimer_clock_base: %ld\n",
>   		SIZE(hrtimer_clock_base));
>   	fprintf(fp, "                  hrtimer_base: %ld\n",
> -- 2.31.1
                                
                         
                        
                                
                                1 year, 4 months
                        
                        
                 
         
 
        
            
        
        
        
                
                        
                                
                                 
                                        
                                
                         
                        
                                
                                
                                        
                                                
                                        
                                        
                                        [PATCH] Fix "kmem -v" option on Linux 6.9 and later kernels
                                
                                
                                
                                    
                                        by HAGIO KAZUHITO(萩尾 一仁)
                                    
                                
                                
                                        The following kernel commits removed vmap_area_list and vmap_area_root
rb-tree, and introduced vmap_nodes.
  55c49fee57af mm/vmalloc: remove vmap_area_list
  d093602919ad mm: vmalloc: remove global vmap_area_root rb-tree
Without the patch, the "kmem -v" option and functions that use
dump_vmlist() fail with or without an error:
  crash> kmem -v
     VM_STRUCT                 ADDRESS RANGE               SIZE
  kmem: invalid kernel virtual address: ccccccccccccccd4  type: "vmlist addr"
  crash> kmem -v
  crash>
Signed-off-by: Kazuhito Hagio <k-hagio-ab(a)nec.com>
---
 defs.h    |   4 ++
 memory.c  | 135 +++++++++++++++++++++++++++++++++++++++++++++---------
 symbols.c |   3 ++
 3 files changed, 120 insertions(+), 22 deletions(-)
diff --git a/defs.h b/defs.h
index 01f316e67dde..95de33188070 100644
--- a/defs.h
+++ b/defs.h
@@ -2240,6 +2240,8 @@ struct offset_table {                    /* stash of commonly-used offsets */
 	long mnt_namespace_nr_mounts;
 	long mount_mnt_node;
 	long log_caller_id;
+	long vmap_node_busy;
+	long rb_list_head;
 };
 
 struct size_table {         /* stash of commonly-used sizes */
@@ -2414,6 +2416,7 @@ struct size_table {         /* stash of commonly-used sizes */
 	long maple_tree;
 	long maple_node;
 	long module_memory;
+	long vmap_node;
 };
 
 struct array_table {
@@ -2678,6 +2681,7 @@ struct vm_table {                /* kernel VM-related data */
 #define SLAB_OVERLOAD_PAGE    (0x8000000)
 #define SLAB_CPU_CACHE       (0x10000000)
 #define SLAB_ROOT_CACHES     (0x20000000)
+#define USE_VMAP_NODES       (0x40000000)
 
 #define IS_FLATMEM()		(vt->flags & FLATMEM)
 #define IS_DISCONTIGMEM()	(vt->flags & DISCONTIGMEM)
diff --git a/memory.c b/memory.c
index 34ed646b5d1e..acb8507cfb75 100644
--- a/memory.c
+++ b/memory.c
@@ -235,6 +235,7 @@ static void dump_slab_objects(struct meminfo *);
 static void dump_slab_objects_percpu(struct meminfo *);
 static void dump_vmlist(struct meminfo *);
 static void dump_vmap_area(struct meminfo *);
+static int get_vmap_area_list_from_nodes(ulong **);
 static int dump_page_lists(struct meminfo *);
 static void dump_kmeminfo(void);
 static int page_to_phys(ulong, physaddr_t *); 
@@ -433,9 +434,15 @@ vm_init(void)
 	if (VALID_MEMBER(vmap_area_va_start) &&
 	    VALID_MEMBER(vmap_area_va_end) &&
 	    VALID_MEMBER(vmap_area_list) &&
-	    VALID_MEMBER(vmap_area_vm) &&
-	    kernel_symbol_exists("vmap_area_list"))
-		vt->flags |= USE_VMAP_AREA;
+	    VALID_MEMBER(vmap_area_vm)) {
+		if (kernel_symbol_exists("vmap_nodes")) {
+			STRUCT_SIZE_INIT(vmap_node, "vmap_node");
+			MEMBER_OFFSET_INIT(vmap_node_busy, "vmap_node", "busy");
+			MEMBER_OFFSET_INIT(rb_list_head, "rb_list", "head");
+			vt->flags |= USE_VMAP_NODES;
+		} else if (kernel_symbol_exists("vmap_area_list"))
+			vt->flags |= USE_VMAP_AREA;
+	}
 
 	if (kernel_symbol_exists("hstates")) {
 		STRUCT_SIZE_INIT(hstate, "hstate");
@@ -8957,7 +8964,7 @@ dump_vmlist(struct meminfo *vi)
 	physaddr_t paddr;
 	int mod_vmlist;
 
-	if (vt->flags & USE_VMAP_AREA) {
+	if (vt->flags & (USE_VMAP_AREA|USE_VMAP_NODES)) {
 		dump_vmap_area(vi);
 		return;
 	}
@@ -9067,6 +9074,77 @@ next_entry:
 		vi->retval = verified;
 }
 
+static int /* qsort() comparator: ascending vmap_area.va_start */
+sort_by_va_start(const void *arg1, const void *arg2)
+{
+	ulong va_start1, va_start2; /* va_start values read via readmem() */
+
+	readmem(*(ulong *)arg1 + OFFSET(vmap_area_va_start), KVADDR, &va_start1,
+		sizeof(void *), "vmap_area.va_start", FAULT_ON_ERROR); /* *arg1 is used as a vmap_area address */
+	readmem(*(ulong *)arg2 + OFFSET(vmap_area_va_start), KVADDR, &va_start2,
+		sizeof(void *), "vmap_area.va_start", FAULT_ON_ERROR); /* NOTE(review): readmem() runs on every comparison */
+
+	return va_start1 < va_start2 ? -1 : (va_start1 == va_start2 ? 0 : 1); /* -1, 0 or 1 */
+}
+
+/* Linux 6.9 and later kernels use "vmap_nodes". */
+static int /* returns the vmap_area count, or -1 if do_list() fails */
+get_vmap_area_list_from_nodes(ulong **list_ptr) /* out: GETBUF'd address array sorted by va_start */
+{
+	int i, cnt, c; /* c: per-node count, cnt: running total */
+	struct list_data list_data, *ld = &list_data;
+	uint nr_vmap_nodes;
+	ulong vmap_nodes, list_head;
+	ulong *list, *ptr; /* list: start of result array, ptr: next free slot */
+
+	get_symbol_data("nr_vmap_nodes", sizeof(uint), &nr_vmap_nodes); /* loop bound for both passes */
+	get_symbol_data("vmap_nodes", sizeof(ulong), &vmap_nodes); /* base address indexed by SIZE(vmap_node) below */
+
+	/* count up all vmap_areas. */
+	cnt = 0;
+	for (i = 0; i < nr_vmap_nodes; i++) {
+		BZERO(ld, sizeof(struct list_data));
+		list_head = vmap_nodes + SIZE(vmap_node) * i +
+				OFFSET(vmap_node_busy) + OFFSET(rb_list_head); /* address of node i's busy.head */
+		readmem(list_head, KVADDR, &ld->start, sizeof(void *),
+				"rb_list.head", FAULT_ON_ERROR); /* first word of the head; presumably list_head.next */
+		ld->list_head_offset = OFFSET(vmap_area_list);
+		ld->end = list_head; /* presumably ends the walk back at the head -- confirm in do_list() */
+		c = do_list(ld); /* no LIST_ALLOCATE here: only the count is used */
+		if (c < 0)
+			return -1;
+
+		cnt += c;
+	}
+
+	list = ptr = (ulong *)GETBUF(sizeof(void *) * cnt); /* NOTE(review): size is 0 when cnt == 0 */
+
+	/* gather all vmap_areas into a list. */
+	for (i = 0; i < nr_vmap_nodes; i++) {
+		BZERO(ld, sizeof(struct list_data));
+		ld->flags = LIST_ALLOCATE; /* entries come back in ld->list_ptr */
+		list_head = vmap_nodes + SIZE(vmap_node) * i +
+				OFFSET(vmap_node_busy) + OFFSET(rb_list_head);
+		readmem(list_head, KVADDR, &ld->start, sizeof(void *),
+				"rb_list.head", FAULT_ON_ERROR);
+		ld->list_head_offset = OFFSET(vmap_area_list);
+		ld->end = list_head;
+		c = do_list(ld);
+		if (c < 0)
+			return -1; /* NOTE(review): list is not FREEBUF'd on this path */
+
+		memcpy(ptr, ld->list_ptr, sizeof(void *) * c); /* append this node's entries */
+		ptr += c;
+
+		FREEBUF(ld->list_ptr);
+	}
+
+	qsort(list, cnt, sizeof(void *), sort_by_va_start); /* merge per-node lists into one va_start order */
+
+	*list_ptr = list;
+	return cnt;
+}
+
 static void
 dump_vmap_area(struct meminfo *vi)
 {
@@ -9080,26 +9158,37 @@ dump_vmap_area(struct meminfo *vi)
 	char buf2[BUFSIZE];
 	char buf3[BUFSIZE];
 	char buf4[BUFSIZE];
+	ulong *list_ptr;
 
 #define VM_VM_AREA 0x4   /* mm/vmalloc.c */
 
-	vmap_area_buf = GETBUF(SIZE(vmap_area));
 	start = count = verified = size = 0;
 
-	ld = &list_data;
-	BZERO(ld, sizeof(struct list_data));
-	ld->flags = LIST_HEAD_FORMAT|LIST_HEAD_POINTER|LIST_ALLOCATE;
-	get_symbol_data("vmap_area_list", sizeof(void *), &ld->start);
-	ld->list_head_offset = OFFSET(vmap_area_list);
-	ld->end = symbol_value("vmap_area_list");
-	cnt = do_list(ld);
-	if (cnt < 0) {
-		FREEBUF(vmap_area_buf);
-		error(WARNING, "invalid/corrupt vmap_area_list\n"); 
-		vi->retval = 0;
-		return;
+	if (vt->flags & USE_VMAP_NODES) {
+		cnt = get_vmap_area_list_from_nodes(&list_ptr);
+		if (cnt < 0) {
+			error(WARNING, "invalid/corrupt vmap_nodes.busy list\n");
+			vi->retval = 0;
+			return;
+		}
+	} else {
+		ld = &list_data;
+		BZERO(ld, sizeof(struct list_data));
+		ld->flags = LIST_HEAD_FORMAT|LIST_HEAD_POINTER|LIST_ALLOCATE;
+		get_symbol_data("vmap_area_list", sizeof(void *), &ld->start);
+		ld->list_head_offset = OFFSET(vmap_area_list);
+		ld->end = symbol_value("vmap_area_list");
+		cnt = do_list(ld);
+		if (cnt < 0) {
+			error(WARNING, "invalid/corrupt vmap_area_list\n");
+			vi->retval = 0;
+			return;
+		}
+		list_ptr = ld->list_ptr;
 	}
 
+	vmap_area_buf = GETBUF(SIZE(vmap_area));
+
 	for (i = 0; i < cnt; i++) {
 		if (!(pc->curcmd_flags & HEADER_PRINTED) && (i == 0) && 
 		    !(vi->flags & (GET_HIGHEST|GET_PHYS_TO_VMALLOC|
@@ -9116,7 +9205,7 @@ dump_vmap_area(struct meminfo *vi)
 			pc->curcmd_flags |= HEADER_PRINTED;
 		}
 
-		readmem(ld->list_ptr[i], KVADDR, vmap_area_buf,
+		readmem(list_ptr[i], KVADDR, vmap_area_buf,
                         SIZE(vmap_area), "vmap_area struct", FAULT_ON_ERROR); 
 
 		if (VALID_MEMBER(vmap_area_flags) &&
@@ -9158,7 +9247,7 @@ dump_vmap_area(struct meminfo *vi)
 			} 	
 			fprintf(fp, "%s%s  %s%s  %s - %s  %7ld\n",
 				mkstring(buf1,VADDR_PRLEN, LONG_HEX|CENTER|LJUST,
-				MKSTR(ld->list_ptr[i])), space(MINSPACE-1),
+				MKSTR(list_ptr[i])), space(MINSPACE-1),
 				mkstring(buf2,VADDR_PRLEN, LONG_HEX|CENTER|LJUST,
 				MKSTR(vm_struct)), space(MINSPACE-1),
 				mkstring(buf3, VADDR_PRLEN, LONG_HEX|RJUST,
@@ -9179,14 +9268,14 @@ dump_vmap_area(struct meminfo *vi)
 					if (vi->flags & GET_PHYS_TO_VMALLOC) {
 						vi->retval = pcheck +
 						    PAGEOFFSET(vi->spec_addr);
-						FREEBUF(ld->list_ptr);
+						FREEBUF(list_ptr);
 						return;
 				        } else
 						fprintf(fp,
 						"%s%s  %s%s  %s - %s  %7ld\n",
 						mkstring(buf1,VADDR_PRLEN, 
 						LONG_HEX|CENTER|LJUST,
-						MKSTR(ld->list_ptr[i])), 
+						MKSTR(list_ptr[i])),
 						space(MINSPACE-1),
 						mkstring(buf2, VADDR_PRLEN,
 						LONG_HEX|CENTER|LJUST,
@@ -9204,7 +9293,7 @@ dump_vmap_area(struct meminfo *vi)
 	}
 
 	FREEBUF(vmap_area_buf);
-	FREEBUF(ld->list_ptr);
+	FREEBUF(list_ptr);
 
 	if (vi->flags & GET_HIGHEST)
 		vi->retval = start+size;
@@ -14001,6 +14090,8 @@ dump_vm_table(int verbose)
 		fprintf(fp, "%sSLAB_ROOT_CACHES", others++ ? "|" : "");\
 	if (vt->flags & USE_VMAP_AREA)
 		fprintf(fp, "%sUSE_VMAP_AREA", others++ ? "|" : "");\
+	if (vt->flags & USE_VMAP_NODES)
+		fprintf(fp, "%sUSE_VMAP_NODES", others++ ? "|" : "");\
 	if (vt->flags & CONFIG_NUMA)
 		fprintf(fp, "%sCONFIG_NUMA", others++ ? "|" : "");\
 	if (vt->flags & VM_EVENT)
diff --git a/symbols.c b/symbols.c
index b7627a83587a..ded34412ff41 100644
--- a/symbols.c
+++ b/symbols.c
@@ -10167,6 +10167,8 @@ dump_offset_table(char *spec, ulong makestruct)
 	fprintf(fp, "               vmap_area_flags: %ld\n", 
 		OFFSET(vmap_area_flags));
 	fprintf(fp, "          vmap_area_purge_list: %ld\n", OFFSET(vmap_area_purge_list));
+	fprintf(fp, "                vmap_node_busy: %ld\n", OFFSET(vmap_node_busy));
+	fprintf(fp, "                  rb_list_head: %ld\n", OFFSET(rb_list_head));
 
 	fprintf(fp, "         module_size_of_struct: %ld\n", 
 		OFFSET(module_size_of_struct));
@@ -12040,6 +12042,7 @@ dump_offset_table(char *spec, ulong makestruct)
 		SIZE(task_group));
 	fprintf(fp, "                     vmap_area: %ld\n",
 		SIZE(vmap_area));
+	fprintf(fp, "                     vmap_node: %ld\n", SIZE(vmap_node));
 	fprintf(fp, "            hrtimer_clock_base: %ld\n",
 		SIZE(hrtimer_clock_base));
 	fprintf(fp, "                  hrtimer_base: %ld\n",
-- 
2.31.1
                                
                         
                        
                                
                                1 year, 4 months