Hi Hatayama-san,
thank you for the patches.
On 2023/05/26 19:45, HATAYAMA Daisuke wrote:
 There's no NMI on ARM. Hence, stopping the non-panicking CPUs
from the
 panicking CPU via IPI can fail easily if interrupts are being masked
 in those moment. Moreover, crash_notes are not initialized for such
 unstopped CPUs and the corresponding NT_PRSTATUS notes are not
 attached to vmcore. However, crash utility never takes it
 consideration such uninitialized crash_notes and then ends with
 mapping different NT_PRSTATUS to actually unstopped CPUs. This corrupt
 mapping can result crash utility into segmentation fault in the
 operations where register values in NT_PRSTATUS notes are used.
 
 For example:
 
      crash> bt 1408
 	PID: 1408     TASK: ffff000003e22200  CPU: 2    COMMAND: "repro"
 	Segmentation fault (core dumped)
 
 	crash> help -D
 	diskdump_data:
 			  filename: 127.0.0.1-2023-05-26-02:21:27/vmcore-ld1
 				 flags: 46 (KDUMP_CMPRS_LOCAL|ERROR_EXCLUDED|LZO_SUPPORTED)
 	...snip...
 			   notes_buf: 1815df0
 	  num_vmcoredd_notes: 0
 	  num_prstatus_notes: 5
 				notes[0]: 1815df0 (NT_PRSTATUS)
 						  si.signo: 0  si.code: 0  si.errno: 0
 	...snip...
 						  PSTATE: 80400005   FPVALID: 00000000
 				notes[4]: 1808f10 (NT_PRSTATUS)
 	Segmentation fault (core dumped)
 
 To fix this issue, let's map NT_PRSTATUS to some CPU only if the
 corresponding crash_notes is checked to be initialized.
 
 Signed-off-by: HATAYAMA Daisuke <d.hatayama(a)fujitsu.com>
 ---
   defs.h     |  1 +
   diskdump.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
   netdump.c  |  5 ++++-
   3 files changed, 55 insertions(+), 2 deletions(-)
 
 diff --git a/defs.h b/defs.h
 index 12ad6aa..72129a1 100644
 --- a/defs.h
 +++ b/defs.h
 @@ -7111,6 +7111,7 @@ int dumpfile_is_split(void);
   void show_split_dumpfiles(void);
   void x86_process_elf_notes(void *, unsigned long);
   void *diskdump_get_prstatus_percpu(int);
 +int have_crash_notes(int cpu);
   void map_cpus_to_prstatus_kdump_cmprs(void);
   void diskdump_display_regs(int, FILE *);
   void process_elf32_notes(void *, ulong);
 diff --git a/diskdump.c b/diskdump.c
 index cf5f5d9..11d29d3 100644
 --- a/diskdump.c
 +++ b/diskdump.c
 @@ -101,6 +101,55 @@ int dumpfile_is_split(void)
   	return KDUMP_SPLIT();
   }
   
 +int have_crash_notes(int cpu)
 +{
 +	ulong crash_notes, notes_ptr;
 +	char *buf, *p;
 +	Elf64_Nhdr *note = NULL;
 +
 +	if (!readmem(symbol_value("crash_notes"),
 +		     KVADDR,
 +		     &crash_notes,
 +		     sizeof(crash_notes),
 +		     "crash_notes",
 +		     RETURN_ON_ERROR)) {
 +		error(WARNING, "cannot read \"crash_notes\"\n");
 +		return FALSE;
 +	}
 +
 +	if (symbol_exists("__per_cpu_offset"))
 
According to the crash sources, I think this can be replaced with:
     if ((kt->flags & SMP) && (kt->flags & PER_CPU_OFF))
 +		notes_ptr = crash_notes + kt->__per_cpu_offset[cpu];
 +	else
 +		notes_ptr = crash_notes;
 +
 +	buf = GETBUF(SIZE(note_buf));
 +
 +	if (!readmem(notes_ptr,
 +		     KVADDR,
 +		     buf,
 +		     SIZE(note_buf),
 +		     "note_buf_t",
 +		     RETURN_ON_ERROR)) {
 +		error(WARNING, "cpu %d: cannot read NT_PRSTATUS note\n", cpu);
 +		return FALSE;
 +	}
 +
 +	note = (Elf64_Nhdr *)buf;
 +	p = buf + sizeof(Elf64_Nhdr);
 +
 +	if (note->n_type != NT_PRSTATUS) {
 +		error(WARNING, "cpu %d: invalid NT_PRSTATUS note (n_type != NT_PRSTATUS)\n",
cpu);
 +		return FALSE;
 +	}
 +
 +	if (!STRNEQ(p, "CORE")) {
 +		error(WARNING, "cpu %d: invalid NT_PRSTATUS note (name !=
\"CORE\")\n", cpu);
 +		return FALSE;
 +	}
 +
 +	return TRUE;
 +}
 +
   void
   map_cpus_to_prstatus_kdump_cmprs(void)
   {
 @@ -131,7 +180,7 @@ map_cpus_to_prstatus_kdump_cmprs(void)
   	nrcpus = (kt->kernel_NR_CPUS ? kt->kernel_NR_CPUS : NR_CPUS);
   
   	for (i = 0, j = 0; i < nrcpus; i++) {
 -		if (in_cpu_map(ONLINE_MAP, i)) {
 +		if (in_cpu_map(ONLINE_MAP, i) && have_crash_notes(i)) {
 
I've seen a user using a kernel without the crash_notes symbol before 
e.g. [1].  It might be better to call have_crash_notes() only if it exists?
[1] 
https://github.com/crash-utility/crash/commit/4b34197508578bb43639e6d169f...
Thanks,
Kazu
>   			dd->nt_prstatus_percpu[i] = nt_ptr[j++];
>   			dd->num_prstatus_notes =
>   				MAX(dd->num_prstatus_notes, i+1);
> diff --git a/netdump.c b/netdump.c
> index 01af145..b272984 100644
> --- a/netdump.c
> +++ b/netdump.c
> @@ -99,8 +99,11 @@ map_cpus_to_prstatus(void)
>   	nrcpus = (kt->kernel_NR_CPUS ? kt->kernel_NR_CPUS : NR_CPUS);
>   
>   	for (i = 0, j = 0; i < nrcpus; i++) {
> -		if (in_cpu_map(ONLINE_MAP, i))
> +		if (in_cpu_map(ONLINE_MAP, i) && have_crash_notes(i)) {
>   			nd->nt_prstatus_percpu[i] = nt_ptr[j++];
> +			nd->num_prstatus_notes =
> +				MAX(nd->num_prstatus_notes, i+1);
> +		}
>   	}
>   
>   	FREEBUF(nt_ptr);