Last week I ran into an incomplete vmcore, which was generated because
the system running in the kdump kernel was somehow rebooted in the
middle of copying the vmcore.
Unfortunately, in the incomplete vmcore, most of the tasks could not
be detected via the PID hash table because the objects relevant to the
PID hash, including the PTEs needed to refer to those objects, were lost.
Although I successfully found many task_struct objects through other
data structures, such as the circular list of task_struct::tasks and
the run queues, crash sub-commands fail with the following message
if a given object is not contained in the task table:
crash> bt 0xffffffff00000000
bt: invalid task or pid value: 0xffffffff00000000
To address this issue, I made a patch that adds a command-line option
(-T) which takes a file listing the addresses of task_struct objects,
so that crash tries to add them to its task table.
I wrote this in a very short time, and there may be a better interface
than a command-line option.
Tested on top of crash-7.2.5.
---
defs.h | 1 +
main.c | 7 +++-
task.c | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 145 insertions(+), 1 deletion(-)
diff --git a/defs.h b/defs.h
index 9ebdde6..150dcdb 100644
--- a/defs.h
+++ b/defs.h
@@ -855,6 +855,7 @@ struct task_table { /* kernel/local task table
data */
int callbacks;
struct task_context **context_by_task; /* task_context sorted by task addr */
ulong pid_xarray;
+ char *task_table_file;
};
#define TASK_INIT_DONE (0x1)
diff --git a/main.c b/main.c
index 7248810..4f6c0a6 100644
--- a/main.c
+++ b/main.c
@@ -89,7 +89,7 @@ main(int argc, char **argv)
*/
opterr = 0;
optind = 0;
- while((c = getopt_long(argc, argv, "Lkgh::e:i:sSvc:d:tfp:m:xo:",
+ while((c = getopt_long(argc, argv, "Lkgh::e:i:sSvc:d:tT:fp:m:xo:",
long_options, &option_index)) != -1) {
switch (c)
{
@@ -408,6 +408,11 @@ main(int argc, char **argv)
ramdump_elf_output_file(optarg);
break;
+ case 'T':
+ if (!tt->task_table_file)
+ tt->task_table_file = optarg;
+ break;
+
default:
error(INFO, "invalid option: %s\n",
argv[optind-1]);
diff --git a/task.c b/task.c
index cd118e5..c57fa37 100644
--- a/task.c
+++ b/task.c
@@ -30,6 +30,7 @@ static void refresh_hlist_task_table(void);
static void refresh_hlist_task_table_v2(void);
static void refresh_hlist_task_table_v3(void);
static void refresh_active_task_table(void);
+static void refresh_task_table_from_file(void);
static int radix_tree_task_callback(ulong);
static void refresh_radix_tree_task_table(void);
static void refresh_xarray_task_table(void);
@@ -598,6 +599,9 @@ task_init(void)
if (tt->flags & ACTIVE_ONLY)
tt->refresh_task_table = refresh_active_task_table;
+ if (tt->task_table_file)
+ tt->refresh_task_table = refresh_task_table_from_file;
+
tt->refresh_task_table();
if (tt->flags & TASK_REFRESH_OFF)
@@ -2798,6 +2802,140 @@ retry_active:
tt->retries = MAX(tt->retries, retries);
}
+/*
+ *  Build the task table from the file named by tt->task_table_file
+ *  instead of walking kernel data structures.
+ *
+ *  The file is read line by line; each line holds one task_struct
+ *  address, converted with htol().  Empty lines and lines whose first
+ *  character is '#' are skipped.  The collected addresses are passed
+ *  through the hash queue (which rejects duplicates), copied into
+ *  tt->task_local and then turned into task contexts with
+ *  fill_task_struct()/add_context().
+ *
+ *  On a dumpfile, invalid entries are reported and skipped; on a live
+ *  system they trigger a retry from the "retry_active" label.
+ *
+ *  The file is opened and closed within a single pass, so a retry never
+ *  leaks the previous FILE handle, and the hash queue is closed on
+ *  every exit path taken after hq_open() succeeded.
+ */
+static void
+refresh_task_table_from_file(void)
+{
+	int i;
+	char *tp;
+	int cnt;
+	ulong curtask;
+	ulong curpid;
+	ulong retries;
+	ulong *tlp;
+	FILE *ttffp;
+	char line[128];
+
+	if (DUMPFILE() && (tt->flags & TASK_INIT_DONE))   /* impossible */
+		return;
+
+	if (DUMPFILE()) {
+		please_wait("gathering task table data");
+		if (!symbol_exists("panic_threads"))
+			tt->flags |= POPULATE_PANIC;
+	}
+
+	if (ACTIVE() && !(tt->flags & TASK_REFRESH))
+		return;
+
+	curtask = NO_TASK;
+	curpid = NO_PID;
+	retries = 0;
+
+	get_active_set();
+
+	/*
+	 *  The current task's task_context entry may change,
+	 *  or the task may not even exist anymore.
+	 */
+	if (ACTIVE() && (tt->flags & TASK_INIT_DONE)) {
+		curtask = CURRENT_TASK();
+		curpid = CURRENT_PID();
+	}
+
+retry_active:
+
+	/*
+	 *  Open the file before the hash queue so that an unreadable
+	 *  file does not leave the hash queue open behind us.
+	 */
+	ttffp = fopen(tt->task_table_file, "r");
+	if (!ttffp) {
+		error(INFO, "failed to open %s\n", tt->task_table_file);
+		return;
+	}
+
+	if (!hq_open()) {
+		fclose(ttffp);
+		error(INFO, "cannot hash task_struct entries\n");
+		if (!(tt->flags & TASK_INIT_DONE))
+			clean_exit(1);
+		error(INFO, "using stale task_structs\n");
+		return;
+	}
+
+	/*
+	 *  Get tasks from the file.  Looping on the fgets() return value
+	 *  ensures that a final line lacking a trailing newline is still
+	 *  processed.
+	 */
+	cnt = 0;
+	while (fgets(line, sizeof(line), ttffp)) {
+		/* strip the newline first so blank lines really are empty */
+		line[strcspn(line, "\n")] = '\0';
+		if (line[0] == '\0' || line[0] == '#')
+			continue;
+
+		/*
+		 *  tt->task_local is cleared and scanned below as an array
+		 *  of tt->max_tasks slots, so never collect more than that.
+		 *  NOTE(review): growing the table would be preferable if
+		 *  task.c offers a helper for it -- confirm.
+		 */
+		if (cnt >= tt->max_tasks) {
+			error(WARNING,
+			    "%stoo many tasks in %s: ignoring the rest\n",
+				DUMPFILE() ? "\n" : "", tt->task_table_file);
+			break;
+		}
+
+		if (hq_enter(htol(line, RETURN_ON_ERROR, NULL)))
+			cnt++;
+		else
+			error(WARNING, "%sduplicate tasks?\n",
+				DUMPFILE() ? "\n" : "");
+	}
+
+	if (ferror(ttffp)) {
+		/* report before fclose() so errno is still the read error */
+		error(INFO, "failed to read task table file: %s\n",
+			strerror(errno));
+		fclose(ttffp);
+		hq_close();
+		return;
+	}
+
+	/* done with the file: close it now so a retry can reopen it */
+	fclose(ttffp);
+
+	BZERO(tt->task_local, tt->max_tasks * sizeof(void *));
+	cnt = retrieve_list((ulong *)tt->task_local, cnt);
+
+	hq_close();
+
+	clear_task_cache();
+
+	for (i = 0, tlp = (ulong *)tt->task_local, tt->running_tasks = 0;
+	     i < tt->max_tasks; i++, tlp++) {
+		if (!(*tlp))
+			continue;
+
+		if (!IS_TASK_ADDR(*tlp)) {
+			error(WARNING,
+			    "%sinvalid task address found in "
+			    "task list: %lx\n",
+				DUMPFILE() ? "\n" : "", *tlp);
+			if (DUMPFILE())
+				continue;
+			retries++;
+			goto retry_active;
+		}
+
+		if (!(tp = fill_task_struct(*tlp))) {
+			if (DUMPFILE())
+				continue;
+			retries++;
+			goto retry_active;
+		}
+
+		if (!add_context(*tlp, tp) && DUMPFILE())
+			error(WARNING, "corrupt/invalid task from file: %lx\n",
+				*tlp);
+	}
+
+	if (!tt->running_tasks) {
+		if (DUMPFILE())
+			error(FATAL, "cannot determine any tasks "
+				"from file!\n");
+		retries++;
+		goto retry_active;
+	}
+
+	please_wait_done();
+
+	if (ACTIVE() && (tt->flags & TASK_INIT_DONE))
+		refresh_context(curtask, curpid);
+
+	tt->retries = MAX(tt->retries, retries);
+}
+
/*
* Initialize and return a new task_context structure with data from a task.
* NULL is returned on error.
--
1.8.3.1