Last week I ran into an incomplete vmcore, which was generated because
the system running the kdump kernel was somehow rebooted in the middle of
copying the vmcore.
Unfortunately, in the incomplete vmcore, most of the tasks could not
be detected via the PID hash table, because the objects relevant to the
PID hash, including the ptes needed to reference those objects, were lost.
Although I successfully found many task_struct objects through other
data structures, such as the circular task_struct::tasks list and the
run queues, crash sub-commands fail with the following message if the
given object is not contained in the task table:
crash> bt 0xffffffff00000000
bt: invalid task or pid value: 0xffffffff00000000
To address this issue, I made a patch that adds a command-line option
(-T) for passing a file containing a list of task_struct addresses, so
that crash builds its task table from those addresses.
I wrote this in a very short time, and there may be a better interface
than a command-line option.
Tested on top of crash-7.2.5.
Yeah, what bothers me about this patch is that even though it worked for your
particular half-baked vmcore, it may never be of any help to anybody else
in the future.
It's similar in nature to patches that have been posted to address a
particular, unique kernel bug that was seen in one vmcore, where it would
be highly unlikely that the circumstances would ever be seen again.
But in this case, it's even more unlikely given that it's dealing with
an incomplete vmcore. You were lucky that you were able to even
bring up a crash session at all -- and then were able to generate
a task list after that.
Following the task_struct.tasks list doesn't gather all of the
tasks in a task group, so it doesn't create a fully populated task
list, correct?
Plus it doesn't make sense to add it unless it's documented *how* to
create the task list to begin with.
I don't know, let me think about this...
Thanks,
Dave
---
defs.h | 1 +
main.c | 7 +++-
task.c | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 145 insertions(+), 1 deletion(-)
diff --git a/defs.h b/defs.h
index 9ebdde6..150dcdb 100644
--- a/defs.h
+++ b/defs.h
@@ -855,6 +855,7 @@ struct task_table { /* kernel/local task table data */
int callbacks;
struct task_context **context_by_task; /* task_context sorted by task addr */
ulong pid_xarray;
+ char *task_table_file;
};
#define TASK_INIT_DONE (0x1)
diff --git a/main.c b/main.c
index 7248810..4f6c0a6 100644
--- a/main.c
+++ b/main.c
@@ -89,7 +89,7 @@ main(int argc, char **argv)
*/
opterr = 0;
optind = 0;
- while((c = getopt_long(argc, argv, "Lkgh::e:i:sSvc:d:tfp:m:xo:",
+ while((c = getopt_long(argc, argv, "Lkgh::e:i:sSvc:d:tT:fp:m:xo:",
long_options, &option_index)) != -1) {
switch (c)
{
@@ -408,6 +408,11 @@ main(int argc, char **argv)
ramdump_elf_output_file(optarg);
break;
+ case 'T':
+ if (!tt->task_table_file)
+ tt->task_table_file = optarg;
+ break;
+
default:
error(INFO, "invalid option: %s\n",
argv[optind-1]);
diff --git a/task.c b/task.c
index cd118e5..c57fa37 100644
--- a/task.c
+++ b/task.c
@@ -30,6 +30,7 @@ static void refresh_hlist_task_table(void);
static void refresh_hlist_task_table_v2(void);
static void refresh_hlist_task_table_v3(void);
static void refresh_active_task_table(void);
+static void refresh_task_table_from_file(void);
static int radix_tree_task_callback(ulong);
static void refresh_radix_tree_task_table(void);
static void refresh_xarray_task_table(void);
@@ -598,6 +599,9 @@ task_init(void)
if (tt->flags & ACTIVE_ONLY)
tt->refresh_task_table = refresh_active_task_table;
+ if (tt->task_table_file)
+ tt->refresh_task_table = refresh_task_table_from_file;
+
tt->refresh_task_table();
if (tt->flags & TASK_REFRESH_OFF)
@@ -2798,6 +2802,140 @@ retry_active:
tt->retries = MAX(tt->retries, retries);
}
+/*
+ *  Build the task table from the file named by tt->task_table_file
+ *  instead of walking kernel data structures.
+ *
+ *  File format: one task_struct address per line, parsed by htol()
+ *  (presumably hexadecimal -- confirm against htol()).  Blank lines and
+ *  lines whose first character is '#' are ignored.
+ *
+ *  The file is opened, read and closed on every pass, so neither the
+ *  retry path nor the fatal-error path leaves a FILE stream open.
+ */
+static void
+refresh_task_table_from_file(void)
+{
+	int i;
+	char *tp;
+	int cnt;
+	int err;
+	int read_failed;
+	ulong task;
+	ulong curtask;
+	ulong curpid;
+	ulong retries;
+	ulong *tlp;
+	FILE *ttffp;
+	char line[128];
+
+	if (DUMPFILE() && (tt->flags & TASK_INIT_DONE)) /* impossible */
+		return;
+
+	if (DUMPFILE()) {
+		please_wait("gathering task table data");
+		if (!symbol_exists("panic_threads"))
+			tt->flags |= POPULATE_PANIC;
+	}
+
+	if (ACTIVE() && !(tt->flags & TASK_REFRESH))
+		return;
+
+	curtask = NO_TASK;
+	curpid = NO_PID;
+	retries = 0;
+
+	get_active_set();
+
+	/*
+	 *  The current task's task_context entry may change,
+	 *  or the task may not even exist anymore.
+	 */
+	if (ACTIVE() && (tt->flags & TASK_INIT_DONE)) {
+		curtask = CURRENT_TASK();
+		curpid = CURRENT_PID();
+	}
+
+retry_active:
+
+	if (!hq_open()) {
+		error(INFO, "cannot hash task_struct entries\n");
+		if (!(tt->flags & TASK_INIT_DONE))
+			clean_exit(1);
+		error(INFO, "using stale task_structs\n");
+		return;
+	}
+
+	ttffp = fopen(tt->task_table_file, "r");
+	if (!ttffp) {
+		error(INFO, "failed to open %s\n", tt->task_table_file);
+		/* Do not leave the hash queue open on the way out. */
+		hq_close();
+		return;
+	}
+
+	/*
+	 *  Get tasks from a file.  Looping on the fgets() return value
+	 *  ensures that a final line without a trailing newline is still
+	 *  processed.
+	 */
+	cnt = 0;
+	while (fgets(line, sizeof(line), ttffp)) {
+		/* Strip the line terminator; an empty result is a blank line. */
+		line[strcspn(line, "\r\n")] = '\0';
+		if (!strlen(line) || line[0] == '#')
+			continue;
+
+		/*
+		 *  tt->task_local is zeroed below as tt->max_tasks entries,
+		 *  so never queue more than that for retrieve_list().
+		 */
+		if (cnt >= tt->max_tasks) {
+			error(WARNING,
+			    "%stoo many tasks in file: only the first %d are used\n",
+			    DUMPFILE() ? "\n" : "", cnt);
+			break;
+		}
+
+		/*
+		 *  htol() is expected to report a parse failure through its
+		 *  third argument when RETURN_ON_ERROR is given; skip such
+		 *  lines rather than hashing its error return value.
+		 */
+		err = 0;
+		task = htol(line, RETURN_ON_ERROR, &err);
+		if (err) {
+			error(WARNING, "%sinvalid task address in file: %s\n",
+			    DUMPFILE() ? "\n" : "", line);
+			continue;
+		}
+
+		if (hq_enter(task))
+			cnt++;
+		else
+			error(WARNING, "%sduplicate tasks?\n",
+			    DUMPFILE() ? "\n" : "");
+	}
+
+	/* Report a read error while errno is still fresh, then close. */
+	read_failed = ferror(ttffp);
+	if (read_failed)
+		error(INFO, "failed to read task table file: %s\n",
+		    strerror(errno));
+	fclose(ttffp);
+
+	if (read_failed) {
+		hq_close();
+		return;
+	}
+
+	BZERO(tt->task_local, tt->max_tasks * sizeof(void *));
+	cnt = retrieve_list((ulong *)tt->task_local, cnt);
+
+	hq_close();
+
+	clear_task_cache();
+
+	/*
+	 *  NOTE(review): on a live system the branches below retry, but the
+	 *  addresses come from a file that presumably has not changed, so a
+	 *  bad entry may cause repeated retries -- confirm this is intended.
+	 */
+	for (i = 0, tlp = (ulong *)tt->task_local, tt->running_tasks = 0;
+	     i < tt->max_tasks; i++, tlp++) {
+		if (!(*tlp))
+			continue;
+
+		if (!IS_TASK_ADDR(*tlp)) {
+			error(WARNING,
+			    "%sinvalid task address found in "
+			    "task list: %lx\n",
+			    DUMPFILE() ? "\n" : "", *tlp);
+			if (DUMPFILE())
+				continue;
+			retries++;
+			goto retry_active;
+		}
+
+		if (!(tp = fill_task_struct(*tlp))) {
+			if (DUMPFILE())
+				continue;
+			retries++;
+			goto retry_active;
+		}
+
+		if (!add_context(*tlp, tp) && DUMPFILE())
+			error(WARNING, "corrupt/invalid task from file: %lx\n",
+			    *tlp);
+	}
+
+	if (!tt->running_tasks) {
+		if (DUMPFILE())
+			error(FATAL, "cannot determine any tasks "
+			    "from file!\n");
+		retries++;
+		goto retry_active;
+	}
+
+	please_wait_done();
+
+	if (ACTIVE() && (tt->flags & TASK_INIT_DONE))
+		refresh_context(curtask, curpid);
+
+	tt->retries = MAX(tt->retries, retries);
+}
+
/*
* Initialize and return a new task_context structure with data from a task.
* NULL is returned on error.
--
1.8.3.1
--
Crash-utility mailing list
Crash-utility(a)redhat.com
https://www.redhat.com/mailman/listinfo/crash-utility