On Thu, Feb 16, 2023 at 8:48 AM HAGIO KAZUHITO(萩尾 一仁) <k-hagio-ab@nec.com> wrote:
On 2023/02/15 17:24, Lianbo Jiang wrote:
> Kernel commit 7d65f4a65532 ("irq: Consolidate do_softirq() arch overriden
> implementations") renamed the call_softirq to do_softirq_own_stack.
> As a result, crash may incorrectly display a bogus exception frame on the
> IRQ stack, together with a warning, as shown below:
>
>    crash> foreach bt
>    ...
>    PID: 0        TASK: ffff914f820a8000  CPU: 25   COMMAND: "swapper/25"
>     #0 [fffffe0000504e48] crash_nmi_callback at ffffffffa665d763
>     #1 [fffffe0000504e50] nmi_handle at ffffffffa662a423
>     #2 [fffffe0000504ea8] default_do_nmi at ffffffffa6fe7dc9
>     #3 [fffffe0000504ec8] do_nmi at ffffffffa662a97f
>     #4 [fffffe0000504ef0] end_repeat_nmi at ffffffffa70015e8
>        [exception RIP: clone_endio+172]
>        RIP: ffffffffc005c1ec  RSP: ffffa1d403d08e98  RFLAGS: 00000246
>        RAX: 0000000000000000  RBX: ffff915326fba230  RCX: 0000000000000018
>        RDX: ffffffffc0075400  RSI: 0000000000000000  RDI: ffff915326fba230
>        RBP: ffff915326fba1c0   R8: 0000000000001000   R9: ffff915308d6d2a0
>        R10: 000000a97dfe5e10  R11: ffffa1d40038fe98  R12: ffff915302babc40
>        R13: ffff914f94360000  R14: 0000000000000000  R15: 0000000000000000
>        ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
>    --- <NMI exception stack> ---
>     #5 [ffffa1d403d08e98] clone_endio at ffffffffc005c1ec [dm_mod]
>     #6 [ffffa1d403d08ed0] blk_update_request at ffffffffa6a96954
>     #7 [ffffa1d403d08f10] scsi_end_request at ffffffffa6c9b968
>     #8 [ffffa1d403d08f48] scsi_io_completion at ffffffffa6c9bb3e
>     #9 [ffffa1d403d08f90] blk_complete_reqs at ffffffffa6aa0e95
>     #10 [ffffa1d403d08fa0] __softirqentry_text_start at ffffffffa72000dc
>     #11 [ffffa1d403d08ff0] do_softirq_own_stack at ffffffffa7000f9a
>    --- <IRQ stack> ---
>     #12 [ffffa1d40038fe70] do_softirq_own_stack at ffffffffa7000f9a
>        [exception RIP: unknown or invalid address]
>        RIP: 0000000000000000  RSP: 0000000000000000  RFLAGS: 00000000
>        RAX: ffffffffa672eae5  RBX: ffffffffa83b34e0  RCX: ffffffffa672eb12
>        RDX: 0000000000000010  RSI: 8b7d6c8869010c00  RDI: 0000000000000085
>        RBP: 0000000000000286   R8: ffff914f820a8000   R9: ffffffffa67a94e0
>        R10: 0000000000000286  R11: ffffffffa66fb4c5  R12: ffffffffa67a898b
>        R13: 0000000000000000  R14: fffffffffffffff8  R15: ffffffffa67a1e68
>        ORIG_RAX: 0000000000000000  CS: 0000  SS: ffffffffa672edff
>     bt: WARNING: possibly bogus exception frame
>     #13 [ffffa1d40038ff30] start_secondary at ffffffffa665fa2c
>     #14 [ffffa1d40038ff50] secondary_startup_64_no_verify at ffffffffa6600116
>     ...

Thank you for the patch.  Could I have the "bt -c 25" output with the patch
applied, and the "bt -r -c 25 | tail -n 40" output, so that I can check it?

 
Sure.

crash> bt -c 25
PID: 0        TASK: ffff914f820a8000  CPU: 25   COMMAND: "swapper/25"
 #0 [fffffe0000504e48] crash_nmi_callback at ffffffffa665d763
 #1 [fffffe0000504e50] nmi_handle at ffffffffa662a423
 #2 [fffffe0000504ea8] default_do_nmi at ffffffffa6fe7dc9
 #3 [fffffe0000504ec8] do_nmi at ffffffffa662a97f
 #4 [fffffe0000504ef0] end_repeat_nmi at ffffffffa70015e8
    [exception RIP: clone_endio+172]
    RIP: ffffffffc005c1ec  RSP: ffffa1d403d08e98  RFLAGS: 00000246
    RAX: 0000000000000000  RBX: ffff915326fba230  RCX: 0000000000000018
    RDX: ffffffffc0075400  RSI: 0000000000000000  RDI: ffff915326fba230
    RBP: ffff915326fba1c0   R8: 0000000000001000   R9: ffff915308d6d2a0
    R10: 000000a97dfe5e10  R11: ffffa1d40038fe98  R12: ffff915302babc40
    R13: ffff914f94360000  R14: 0000000000000000  R15: 0000000000000000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- <NMI exception stack> ---
 #5 [ffffa1d403d08e98] clone_endio at ffffffffc005c1ec [dm_mod]
 #6 [ffffa1d403d08ed0] blk_update_request at ffffffffa6a96954
 #7 [ffffa1d403d08f10] scsi_end_request at ffffffffa6c9b968
 #8 [ffffa1d403d08f48] scsi_io_completion at ffffffffa6c9bb3e
 #9 [ffffa1d403d08f90] blk_complete_reqs at ffffffffa6aa0e95
#10 [ffffa1d403d08fa0] __softirqentry_text_start at ffffffffa72000dc
#11 [ffffa1d403d08ff0] do_softirq_own_stack at ffffffffa7000f9a
--- <IRQ stack> ---
#12 [ffffa1d40038fe70] update_ts_time_stats at ffffffffa67a1e68
#13 [ffffa1d40038fea0] do_softirq at ffffffffa66fb4c5
#14 [ffffa1d40038feb0] flush_smp_call_function_queue at ffffffffa67a94e0
#15 [ffffa1d40038fec0] do_idle at ffffffffa672eae5
#16 [ffffa1d40038ff10] cpu_startup_entry at ffffffffa672edff
#17 [ffffa1d40038ff30] start_secondary at ffffffffa665fa2c
#18 [ffffa1d40038ff50] secondary_startup_64_no_verify at ffffffffa6600116
crash>
crash> bt -r -c 25 | tail -n 40
ffffa1d40038fd80:  ffff9156ff26afc0 0000002027c3e7a1
ffffa1d40038fd90:  __next_timer_interrupt+166 ffff9156ff25aa68
ffffa1d40038fda0:  00000000fffd85da 0000000000000082
ffffa1d40038fdb0:  0000000000000082 hrtimer_get_next_event+78
ffffa1d40038fdc0:  ffff9156ff25aa40 000000200cd15980
ffffa1d40038fdd0:  __hrtimer_next_event_base+192 ffff9156ff25cf40
ffffa1d40038fde0:  7fffffffffffffff ffff9156ff25d580
ffffa1d40038fdf0:  0000000000000082 ffffffffffffffff
ffffa1d40038fe00:  0000000000000082 hrtimer_next_event_without+96
ffffa1d40038fe10:  ffff9156ff25d580 000000200cff2040
ffffa1d40038fe20:  000000200cdb406e ffff9153052f8c00
ffffa1d40038fe30:  sched_clock+5    sched_clock_cpu+12
ffffa1d40038fe40:  acpi_idle_driver+136 ffff9153052f8c00
ffffa1d40038fe50:  cpuidle_enter_state+181 000000203501afdb
ffffa1d40038fe60:  00ffffffa82c44c0 read_tsc        
ffffa1d40038fe70:  update_ts_time_stats+88 fffffffffffffff8
ffffa1d40038fe80:  0000000000000000 __flush_smp_call_function_queue+219
ffffa1d40038fe90:  0000000000000286 __cpu_online_mask
ffffa1d40038fea0:  do_softirq+69    0000000000000286
ffffa1d40038feb0:  flush_smp_call_function_queue+96 ffff914f820a8000
ffffa1d40038fec0:  do_idle+405      do_idle+450      
ffffa1d40038fed0:  0000000000000010 8b7d6c8869010c00
ffffa1d40038fee0:  0000000000000085 0000000000000000
ffffa1d40038fef0:  0000000000000000 0000000000000000
ffffa1d40038ff00:  0000000000000000 0000000000000000
ffffa1d40038ff10:  cpu_startup_entry+111 8b7d6c8869010c00
ffffa1d40038ff20:  534e004fc6e00600 ffffa1d40038ff38
ffffa1d40038ff30:  start_secondary+396 534e004fc6e00600
ffffa1d40038ff40:  0000000000000000 0000000040000000
ffffa1d40038ff50:  secondary_startup_64_no_verify+209 0000000000000000
ffffa1d40038ff60:  0000000000000000 0000000000000000
ffffa1d40038ff70:  0000000000000000 0000000000000000
ffffa1d40038ff80:  0000000000000000 0000000000000000
ffffa1d40038ff90:  0000000000000000 0000000000000000
ffffa1d40038ffa0:  0000000000000000 0000000000000000
ffffa1d40038ffb0:  0000000000000000 0000000000000000
ffffa1d40038ffc0:  0000000000000000 0000000000000000
ffffa1d40038ffd0:  0000000000000000 0000000000000000
ffffa1d40038ffe0:  0000000000000000 0000000000000000
ffffa1d40038fff0:  0000000000000000 0000000000000000
crash>
 
Thanks.
Lianbo

Thanks,
Kazu

>
> There is also no exception frame when coming from do_softirq_own_stack, so
> treat it the same way as call_softirq.
>
> Reported-by: Marco Patalano <mpatalan@redhat.com>
> Signed-off-by: Lianbo Jiang <lijiang@redhat.com>
> ---
>   x86_64.c | 5 +++--
>   1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/x86_64.c b/x86_64.c
> index 5b671bd97775..3428bed417df 100644
> --- a/x86_64.c
> +++ b/x86_64.c
> @@ -3825,10 +3825,11 @@ in_exception_stack:
>               up -= 1;
>                   bt->instptr = *up;
>               /*
> -              *  No exception frame when coming from call_softirq.
> +              *  No exception frame when coming from call_softirq
> +              *  or do_softirq_own_stack.
>                */
>               if ((sp = value_search(bt->instptr, &offset)) &&
> -                 STREQ(sp->name, "call_softirq"))
> +                 (STREQ(sp->name, "call_softirq") || STREQ(sp->name, "do_softirq_own_stack")))
>                       irq_eframe = 0;
>                   bt->frameptr = 0;
>                   done = FALSE;