Hi Rui,
On Mon, May 25, 2026 at 9:55 PM Rui Qi <qirui.001(a)bytedance.com> wrote:
The numeric_forward() function serves as the comparator for qsort() when
sorting kernel symbols. For a modern Linux kernel containing hundreds of
thousands of symbols (N), qsort() performs O(N log N) comparisons,
meaning this function is invoked millions of times during the startup
phase of the crash utility.
During these millions of comparisons, it frequently checks for specific
symbol names (e.g., "_stext", "kaslr_get_random_long") using the
STREQ()
macro. STREQ() internally expands to string_exists() checks followed by a
full strcmp(), incurring significant function call overhead that cannot be
optimized out by the compiler at runtime.
By explicitly checking the first character of the symbol name
(e.g., x->name[0] == '_') before invoking STREQ(), we introduce a
lightweight "early reject" mechanism. Since the distribution of kernel
symbol starting characters is relatively sparse, this short-circuits the
evaluation for the vast majority of symbols, completely avoiding the
overhead of strcmp() macro expansion.
Additionally, since x->name could potentially be NULL, we must safely
guard the character access with an explicit non-null check (x->name &&)
to prevent segmentation faults.
This O(N log N) cumulative optimization yields a measurable performance
improvement in symbol sorting speed, which scales directly with the size
of the kernel symbol table.
Thanks for attach your performance measurement. However on my x86_64
machine, the performance improvement is not as significant as on your
RISC-V machine:
$ echo 'q' | /usr/bin/time -v ./crash.orig /proc/kcore ~/vmlinux
crash.orig> q
...
User time (seconds): 5.84
System time (seconds): 0.36
Percent of CPU this job got: 154%
Elapsed (wall clock) time (h:mm:ss or m:ss): 0:04.03
$ echo 'q' | /usr/bin/time -v ./crash.aft /proc/kcore ~/vmlinux
crash.aft> q
...
User time (seconds): 5.83
System time (seconds): 0.37
Percent of CPU this job got: 153%
Elapsed (wall clock) time (h:mm:ss or m:ss): 0:04.03
It does have improvements, but honestly the improvements are tiny...
Could you please try the "echo q | /usr/bin/time -v ./crash xx xx"
command on your machine and repost the result so I can have a
comparison?
Performance measurement (./crash /proc/kcore startup, n=6 alternating
runs on RISC-V 32-core, kernel 6.12.13.bsk.1-rc17-riscv64):
before after
mean (s) 60.367 48.688
median (s) 58.831 47.588
min (s) 54.980 45.951
max (s) 69.889 52.988
Speedup (mean): 1.24x (-11.68 s, -19.3%)
perf stat:
instructions: 3.104e11 -> 2.680e11 (-13.7%)
cycles: 1.612e11 -> 1.406e11 (-12.8%)
perf record (top hotspots):
strcmp: 12.59% -> 3.64%
string_exists: dropped out of top-10
Interesting, have you enabled the -O2 flags(make warn)? Because the
string_exists() is an inline function, which shouldn't exist after O2
compilation:
$ nm crash.aft | grep string_exists
$ <<<---- no outputs
The decrease of strcmp() in your result is expected. However my point
is, I believe strcmp() is fast enough, that there isn't much
difference whether you call strcmp() multiple times or bail out a bit
early to skip the strcmp() call:
#include <stdio.h>
#include <string.h>
#include <stdint.h>
volatile char s1[] = "hello_world_123";
volatile char s2[] = "hello_world_456";
volatile int n = 0;
int main(void)
{
for (int i = 0; i < 1000000; ++i) {
n += strcmp((const char *)s1, (const char *)s2);
}
printf("n = %d\n", n);
return 0;
}
$ gcc -O2 -fno-builtin-strcmp test.c -o test
$ /usr/bin/time -v ./test
n = -3000000
Command being timed: "./test"
User time (seconds): 0.01
System time (seconds): 0.00
Percent of CPU this job got: 90%
Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.01
$ objdump -d test
0000000000401050 <main>:
401050: 53 push %rbx
401051: bb 40 42 0f 00 mov $0xf4240,%ebx
401056: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1)
40105d: 00 00 00
401060: be 20 40 40 00 mov $0x404020,%esi
401065: bf 30 40 40 00 mov $0x404030,%edi
40106a: e8 d1 ff ff ff call 401040 <strcmp@plt>
<<---------
40106f: 8b 15 cf 2f 00 00 mov 0x2fcf(%rip),%edx
# 404044 <n>
401075: 01 d0 add %edx,%eax
401077: 89 05 c7 2f 00 00 mov %eax,0x2fc7(%rip)
# 404044 <n>
40107d: 83 eb 01 sub $0x1,%ebx
The 0.01s of strcmp() calling is acceptable to me.
Signed-off-by: Rui Qi <qirui.001(a)bytedance.com>
---
Changes in V2:
- Add performance measurement data (time, perf stat, perf record)
symbols.c | 40 ++++++++++++++++++++--------------------
1 file changed, 20 insertions(+), 20 deletions(-)
diff --git a/symbols.c b/symbols.c
index c0285058c5cb..c0353d626477 100644
--- a/symbols.c
+++ b/symbols.c
@@ -14412,16 +14412,16 @@ numeric_forward(const void *P_x, const void *P_y)
error(FATAL, "bfd_minisymbol_to_symbol failed\n");
if (st->_stext_vmlinux == UNINITIALIZED) {
- if (STREQ(x->name, "_stext"))
+ if (x->name && x->name[0] == '_' &&
STREQ(x->name, "_stext"))
st->_stext_vmlinux = valueof(x);
- else if (STREQ(y->name, "_stext"))
+ else if (y->name && y->name[0] == '_' &&
STREQ(y->name, "_stext"))
st->_stext_vmlinux = valueof(y);
}
if (kt->flags2 & KASLR_CHECK) {
- if (STREQ(x->name, "kaslr_get_random_long") ||
- STREQ(y->name, "kaslr_get_random_long") ||
- STREQ(x->name, "module_load_offset") ||
- STREQ(y->name, "module_load_offset")) {
+ if ((x->name && x->name[0] == 'k' &&
STREQ(x->name, "kaslr_get_random_long")) ||
+ (y->name && y->name[0] == 'k' &&
STREQ(y->name, "kaslr_get_random_long")) ||
+ (x->name && x->name[0] == 'm' &&
STREQ(x->name, "module_load_offset")) ||
+ (y->name && y->name[0] == 'm' &&
STREQ(y->name, "module_load_offset"))) {
kt->flags2 &= ~KASLR_CHECK;
kt->flags2 |= (RELOC_AUTO|KASLR);
}
@@ -14429,36 +14429,36 @@ numeric_forward(const void *P_x, const void *P_y)
if (SADUMP_DUMPFILE() || QEMU_MEM_DUMP_NO_VMCOREINFO() || VMSS_DUMPFILE()) {
/* Need for kaslr_offset and phys_base */
- if (STREQ(x->name, "divide_error") ||
- STREQ(x->name, "asm_exc_divide_error"))
+ if ((x->name && x->name[0] == 'd' &&
STREQ(x->name, "divide_error")) ||
+ (x->name && x->name[0] == 'a' &&
STREQ(x->name, "asm_exc_divide_error")))
st->divide_error_vmlinux = valueof(x);
- else if (STREQ(y->name, "divide_error") ||
- STREQ(y->name, "asm_exc_divide_error"))
+ else if ((y->name && y->name[0] == 'd' &&
STREQ(y->name, "divide_error")) ||
+ (y->name && y->name[0] == 'a' &&
STREQ(y->name, "asm_exc_divide_error")))
st->divide_error_vmlinux = valueof(y);
- if (STREQ(x->name, "idt_table"))
+ if (x->name && x->name[0] == 'i' &&
STREQ(x->name, "idt_table"))
st->idt_table_vmlinux = valueof(x);
- else if (STREQ(y->name, "idt_table"))
+ else if (y->name && y->name[0] == 'i' &&
STREQ(y->name, "idt_table"))
st->idt_table_vmlinux = valueof(y);
- if (STREQ(x->name, "kaiser_init"))
+ if (x->name && x->name[0] == 'k' &&
STREQ(x->name, "kaiser_init"))
st->kaiser_init_vmlinux = valueof(x);
- else if (STREQ(y->name, "kaiser_init"))
+ else if (y->name && y->name[0] == 'k' &&
STREQ(y->name, "kaiser_init"))
st->kaiser_init_vmlinux = valueof(y);
- if (STREQ(x->name, "linux_banner"))
+ if (x->name && x->name[0] == 'l' &&
STREQ(x->name, "linux_banner"))
st->linux_banner_vmlinux = valueof(x);
- else if (STREQ(y->name, "linux_banner"))
+ else if (y->name && y->name[0] == 'l' &&
STREQ(y->name, "linux_banner"))
st->linux_banner_vmlinux = valueof(y);
- if (STREQ(x->name, "pti_init"))
+ if (x->name && x->name[0] == 'p' &&
STREQ(x->name, "pti_init"))
st->pti_init_vmlinux = valueof(x);
- else if (STREQ(y->name, "pti_init"))
+ else if (y->name && y->name[0] == 'p' &&
STREQ(y->name, "pti_init"))
st->pti_init_vmlinux = valueof(y);
- if (STREQ(x->name, "saved_command_line"))
+ if (x->name && x->name[0] == 's' &&
STREQ(x->name, "saved_command_line"))
st->saved_command_line_vmlinux = valueof(x);
- else if (STREQ(y->name, "saved_command_line"))
+ else if (y->name && y->name[0] == 's' &&
STREQ(y->name, "saved_command_line"))
st->saved_command_line_vmlinux = valueof(y);
}
--
2.47.3