Hi Lianbo,
Thank you for the update, much better structure :)
Let's go into detail...
Thank you for helping the review, it seems that we are getting closer to this goal.
On 2022/05/30 18:15, Lianbo Jiang wrote:
> Currently, crash doesn't support to display disk I/O statistics
> for blk-mq devices. For more details, please refer to the following
> commit: <98b417fc6346> ("Handle blk_mq_ctx member changes for kernels
> 5.16-rc1 and later").
>
> Lets parse the bitmap in blk-mq layer to achieve it.
>
> Signed-off-by: Lianbo Jiang <lijiang@redhat.com>
> ---
> defs.h | 11 +++
> dev.c | 261 ++++++++++++++++++++++++++++++++++++++++++++++--------
> symbols.c | 22 +++++
> 3 files changed, 255 insertions(+), 39 deletions(-)
>
> diff --git a/defs.h b/defs.h
> index c8444b4e54eb..2681586a33dc 100644
> --- a/defs.h
> +++ b/defs.h
> @@ -2170,6 +2170,16 @@ struct offset_table { /* stash of commonly-used offsets */
> long sbq_wait_state_wait;
> long sbitmap_alloc_hint;
> long sbitmap_round_robin;
> + long request_cmd_flags;
> + long request_q;
> + long request_state;
> + long request_queue_queue_hw_ctx;
> + long request_queue_nr_hw_queues;
> + long blk_mq_hw_ctx_tags;
> + long blk_mq_tags_bitmap_tags;
> + long blk_mq_tags_breserved_tags;
> + long blk_mq_tags_nr_reserved_tags;
> + long blk_mq_tags_rqs;
> };
>
> struct size_table { /* stash of commonly-used sizes */
> @@ -2339,6 +2349,7 @@ struct size_table { /* stash of commonly-used sizes */
> long sbitmap;
> long sbitmap_queue;
> long sbq_wait_state;
> + long blk_mq_tags;
> };
>
> struct array_table {
> diff --git a/dev.c b/dev.c
> index a493e51ac95c..c0240f7b889b 100644
> --- a/dev.c
> +++ b/dev.c
> @@ -4238,19 +4238,193 @@ get_one_mctx_diskio(unsigned long mctx, struct diskio *io)
> io->write = (dispatch[1] - comp[1]);
> }
>
> +typedef bool (busy_tag_iter_fn)(ulong rq, void *data);
> +
> +struct bt_iter_data {
I think that some names are confusing and not synced with the kernel.
How about renaming to "mq_inflight"?
> + ulong q;
> + struct diskio *dio;
> +};
> +
> +struct bt_tags_iter_data {
And please rename this to "bt_iter_data".
> + ulong tags;
> + uint flags;
rename to "reserved".
OK, I will rename the above three variables.
> + uint nr_reserved_tags;
> + busy_tag_iter_fn *fn;
> + void *data;
> +};
> +
> +/*
> + * See the include/linux/blk_types.h and include/linux/blk-mq.h
> + */
> +#define MQ_RQ_IN_FLIGHT 1
> +#define REQ_OP_BITS 8
> +#define REQ_OP_MASK ((1 << REQ_OP_BITS) - 1)
> +
> +static uint op_is_write(uint op)
> +{
> + return (op & REQ_OP_MASK) & 1;
> +}
> +
> +static bool mq_check_inflight(ulong rq, void *data)
> +{
> + uint cmd_flags = 0, state = 0;
> + ulong addr = 0, queue = 0;
> + struct bt_iter_data *iter = data;
> +
> + if (!IS_KVADDR(rq))
> + return TRUE;
Why return TRUE?
Does this mean that rq can be invalid with bit on?
The intent is to filter out the zero address of rq, and crash cannot completely believe that the address
of rq is always valid. To be on the safe side, it should be good to do this check.
In addition, the kernel has the similar logic of code, for example:
static bool bt_tags_iter(...)
{
...
if (!rq)
return true;
^^^^
...
}
> +
> + addr = rq + OFFSET(request_q);
> + if (!readmem(addr, KVADDR, &queue, sizeof(ulong), "request.q", RETURN_ON_ERROR))
> + return FALSE;
> +
> + addr = rq + OFFSET(request_cmd_flags);
> + if (!readmem(addr, KVADDR, &cmd_flags, sizeof(uint), "request.cmd_flags", RETURN_ON_ERROR))
> + return FALSE;
> +
> + addr = rq + OFFSET(request_state);
> + if (!readmem(addr, KVADDR, &state, sizeof(uint), "request.state", RETURN_ON_ERROR))
> + return FALSE;
Rethinking these, RETURN_ON_ERROR leads to stop counting (i.e. wrong results)
with error messages. I think it would be better to use FAULT_ON_ERROR unless
we can ignore or handle an error correctly. What do you think?
(And for the RETURN_ON_ERRORs below, same as above.)
At first, I had the same opinion as yours. But eventually, called the readmem() with the "RETURN_ON_ERROR",
the reason is that crash can help to output the statistics as much as possible, which is helpful to user, and once the
readmem() failed, it will print an appropriate error, and it can explicitly tell users that the statistics may be unreliable.
If this is really confusing to the user, let's replace it with the "FAULT_ON_ERROR".
> +
> + if (queue == iter->q && state == MQ_RQ_IN_FLIGHT) {
> + if (op_is_write(cmd_flags))
> + iter->dio->write++;
> + else
> + iter->dio->read++;
> + }
> +
> + return TRUE;
> +}
> +
> +static bool bt_tags_iter(uint bitnr, void *data)
> +{
> + ulong addr = 0, rqs_addr = 0, rq = 0;
> + struct bt_tags_iter_data *tag_iter = data;
> + ulong tag = tag_iter->tags;
> +
> + if (!tag_iter->flags)
> + bitnr += tag_iter->nr_reserved_tags;
> +
> + /* rqs */
> + addr = tag + OFFSET(blk_mq_tags_rqs);
> + if (!readmem(addr, KVADDR, &rqs_addr, sizeof(void *), "blk_mq_tags.rqs", RETURN_ON_ERROR))
> + return FALSE;
> +
> + addr = rqs_addr + bitnr * sizeof(ulong); /* rqs[bitnr] */
> + if (!readmem(addr, KVADDR, &rq, sizeof(ulong), "blk_mq_tags.rqs[]", RETURN_ON_ERROR))
> + return FALSE;
> +
> + return tag_iter->fn(rq, tag_iter->data);
> +}
> +
> +static void bt_tags_for_each(ulong q, ulong tags, ulong sbq, uint flags, uint nr_resvd_tags, struct diskio *dio)
Please rename this to bt_for_each.
No problem.
> +{
> + struct sbitmap_context sc = {0};
> + struct diskio tmp = {0};
This tmp can be removed?
(The dio->{read,write} are incremented in mq_check_inflight(), so
we don't need the local tmps for each function, I think)
Let me double check.
> + struct bt_iter_data iter_data = {
> + .q = q,
> + .dio = &tmp,
> + };
> + struct bt_tags_iter_data tags_iter_data = {
> + .tags = tags,
> + .flags = flags,
> + .nr_reserved_tags = nr_resvd_tags,
> + .fn = mq_check_inflight,
> + .data = &iter_data,
> + };
> +
> + sbitmap_context_load(sbq + OFFSET(sbitmap_queue_sb), &sc);
> + sbitmap_for_each_set(&sc, bt_tags_iter, &tags_iter_data);
> +
> + dio->read = tmp.read;
> + dio->write = tmp.write;
> +}
> +
> +static void queue_for_each_hw_ctx(ulong q, ulong *hctx, uint cnt, struct diskio *dio)
> +{
> + uint i;
> + struct diskio tmp = {0};
This also can be removed.
> +
> + if (!hctx || !dio)
> + error(FATAL, "%s: Invalid parameter.\n", __func__);
This can be used?
Will remove it.
> +
> + for (i = 0; i < cnt; i++) {
> + ulong addr = 0, tags = 0;
> + uint nr_reserved_tags = 0;
> +
> + if (!IS_KVADDR(hctx[i]))
> + continue;
Is this the remain of the v1 patch?
Yes, this can be removed from the 'for loop'.
> +
> + /* Tags owned by the block driver */
> + addr = hctx[i] + OFFSET(blk_mq_hw_ctx_tags);
> + if (!readmem(addr, KVADDR, &tags, sizeof(ulong),
> + "blk_mq_hw_ctx.tags", RETURN_ON_ERROR))
> + break;
> +
> + addr = tags + OFFSET(blk_mq_tags_nr_reserved_tags);
> + if (!readmem(addr, KVADDR, &nr_reserved_tags, sizeof(uint),
> + "blk_mq_tags_nr_reserved_tags", RETURN_ON_ERROR))
> + break;
> +
> + if (nr_reserved_tags) {
> + addr = tags + OFFSET(blk_mq_tags_breserved_tags);
> + bt_tags_for_each(q, tags, addr, 1, nr_reserved_tags, &tmp);
> + }
> + addr = tags + OFFSET(blk_mq_tags_bitmap_tags);
> + bt_tags_for_each(q, tags, addr, 0, nr_reserved_tags, &tmp);
(the local tmp set by the nr_reserved_tags case is overwritten here.)
Thanks for pointing out this issue. It should be accumulated in the bt_tags_for_each() as below:
+ dio->read += tmp.read;
+ dio->write += tmp.write;
But anyway, I will rethink the variable 'tmp' and see what is the good way.
Thanks.
Lianbo
> + }
> +
> + dio->read = tmp.read;
> + dio->write = tmp.write;
> +}
> +
> +static void get_mq_diskio_from_hw_queues(ulong q, struct diskio *dio)
> +{
> + uint cnt = 0;
> + ulong addr = 0, hctx_addr = 0;
> + ulong *hctx_array = NULL;
> + struct diskio tmp = {0};
This also can be removed.
Ditto.
> +
> + addr = q + OFFSET(request_queue_nr_hw_queues);
> + readmem(addr, KVADDR, &cnt, sizeof(uint),
> + "request_queue.nr_hw_queues", FAULT_ON_ERROR);
> +
> + addr = q + OFFSET(request_queue_queue_hw_ctx);
> + readmem(addr, KVADDR, &hctx_addr, sizeof(void *),
> + "request_queue.queue_hw_ctx", FAULT_ON_ERROR);
> +
> + hctx_array = (ulong *)GETBUF(sizeof(void *) * cnt);
> + if (!hctx_array)
> + error(FATAL, "fail to get memory for the hctx_array\n");
> +
> + if (!readmem(hctx_addr, KVADDR, hctx_array, sizeof(void *) * cnt,
> + "request_queue.queue_hw_ctx[]", RETURN_ON_ERROR)) {
> + FREEBUF(hctx_array);
> + return;
> + }
> +
> + queue_for_each_hw_ctx(q, hctx_array, cnt, &tmp);
> + dio->read = tmp.read;
> + dio->write = tmp.write;
> +
> + FREEBUF(hctx_array);
> +}
> +
> static void
> get_mq_diskio(unsigned long q, unsigned long *mq_count)
> {
> int cpu;
> unsigned long queue_ctx;
> unsigned long mctx_addr;
> - struct diskio tmp;
> + struct diskio tmp = {0};
>
> if (INVALID_MEMBER(blk_mq_ctx_rq_dispatched) ||
> - INVALID_MEMBER(blk_mq_ctx_rq_completed))
> + INVALID_MEMBER(blk_mq_ctx_rq_completed)) {
> + get_mq_diskio_from_hw_queues(q, &tmp);
> + mq_count[0] = tmp.read;
> + mq_count[1] = tmp.write;
> return;
> -
> - memset(&tmp, 0x00, sizeof(struct diskio));
> + }
>
> readmem(q + OFFSET(request_queue_queue_ctx), KVADDR, &queue_ctx,
> sizeof(ulong), "request_queue.queue_ctx",
> @@ -4479,41 +4653,24 @@ display_one_diskio(struct iter *i, unsigned long gendisk, ulong flags)
> && (io.read + io.write == 0))
> return;
>
> - if (use_mq_interface(queue_addr) &&
> - (INVALID_MEMBER(blk_mq_ctx_rq_dispatched) ||
> - INVALID_MEMBER(blk_mq_ctx_rq_completed)))
> - fprintf(fp, "%s%s%s %s%s%s%s %s%s%s",
> - mkstring(buf0, 5, RJUST|INT_DEC, (char *)(unsigned long)major),
> - space(MINSPACE),
> - mkstring(buf1, VADDR_PRLEN, LJUST|LONG_HEX, (char *)gendisk),
> - space(MINSPACE),
> - mkstring(buf2, 10, LJUST, disk_name),
> - space(MINSPACE),
> - mkstring(buf3, VADDR_PRLEN <= 11 ? 11 : VADDR_PRLEN,
> - LJUST|LONG_HEX, (char *)queue_addr),
> - space(MINSPACE),
> - mkstring(buf4, 17, RJUST, "(not supported)"),
> - space(MINSPACE));
> -
> - else
> - fprintf(fp, "%s%s%s %s%s%s%s %s%5d%s%s%s%s%s",
> - mkstring(buf0, 5, RJUST|INT_DEC, (char *)(unsigned long)major),
> - space(MINSPACE),
> - mkstring(buf1, VADDR_PRLEN, LJUST|LONG_HEX, (char *)gendisk),
> - space(MINSPACE),
> - mkstring(buf2, 10, LJUST, disk_name),
> - space(MINSPACE),
> - mkstring(buf3, VADDR_PRLEN <= 11 ? 11 : VADDR_PRLEN,
> - LJUST|LONG_HEX, (char *)queue_addr),
> - space(MINSPACE),
> - io.read + io.write,
> - space(MINSPACE),
> - mkstring(buf4, 5, RJUST|INT_DEC,
> - (char *)(unsigned long)io.read),
> - space(MINSPACE),
> - mkstring(buf5, 5, RJUST|INT_DEC,
> - (char *)(unsigned long)io.write),
> - space(MINSPACE));
> + fprintf(fp, "%s%s%s %s%s%s%s %s%5d%s%s%s%s%s",
> + mkstring(buf0, 5, RJUST|INT_DEC, (char *)(unsigned long)major),
> + space(MINSPACE),
> + mkstring(buf1, VADDR_PRLEN, LJUST|LONG_HEX, (char *)gendisk),
> + space(MINSPACE),
> + mkstring(buf2, 10, LJUST, disk_name),
> + space(MINSPACE),
> + mkstring(buf3, VADDR_PRLEN <= 11 ? 11 : VADDR_PRLEN,
> + LJUST|LONG_HEX, (char *)queue_addr),
> + space(MINSPACE),
> + io.read + io.write,
> + space(MINSPACE),
> + mkstring(buf4, 5, RJUST|INT_DEC,
> + (char *)(unsigned long)io.read),
> + space(MINSPACE),
> + mkstring(buf5, 5, RJUST|INT_DEC,
> + (char *)(unsigned long)io.write),
> + space(MINSPACE));
>
> if (VALID_MEMBER(request_queue_in_flight)) {
> if (!use_mq_interface(queue_addr)) {
> @@ -4597,6 +4754,9 @@ void diskio_init(void)
> MEMBER_OFFSET_INIT(kobject_entry, "kobject", "entry");
> MEMBER_OFFSET_INIT(kset_list, "kset", "list");
> MEMBER_OFFSET_INIT(request_list_count, "request_list", "count");
> + MEMBER_OFFSET_INIT(request_cmd_flags, "request", "cmd_flags");
> + MEMBER_OFFSET_INIT(request_q, "request", "q");
> + MEMBER_OFFSET_INIT(request_state, "request", "state");
> MEMBER_OFFSET_INIT(request_queue_in_flight, "request_queue",
> "in_flight");
> if (MEMBER_EXISTS("request_queue", "rq"))
> @@ -4608,10 +4768,33 @@ void diskio_init(void)
> "mq_ops");
> ANON_MEMBER_OFFSET_INIT(request_queue_queue_ctx,
> "request_queue", "queue_ctx");
> + MEMBER_OFFSET_INIT(request_queue_queue_hw_ctx,
> + "request_queue", "queue_hw_ctx");
> + MEMBER_OFFSET_INIT(request_queue_nr_hw_queues,
> + "request_queue", "nr_hw_queues");
> MEMBER_OFFSET_INIT(blk_mq_ctx_rq_dispatched, "blk_mq_ctx",
> "rq_dispatched");
> MEMBER_OFFSET_INIT(blk_mq_ctx_rq_completed, "blk_mq_ctx",
> "rq_completed");
> + MEMBER_OFFSET_INIT(blk_mq_hw_ctx_tags, "blk_mq_hw_ctx", "tags");
> + MEMBER_OFFSET_INIT(blk_mq_tags_bitmap_tags, "blk_mq_tags",
> + "bitmap_tags");
> + MEMBER_OFFSET_INIT(blk_mq_tags_breserved_tags, "blk_mq_tags",
> + "breserved_tags");
> + MEMBER_OFFSET_INIT(blk_mq_tags_nr_reserved_tags, "blk_mq_tags",
> + "nr_reserved_tags");
> + MEMBER_OFFSET_INIT(blk_mq_tags_rqs, "blk_mq_tags", "rqs");
> + STRUCT_SIZE_INIT(blk_mq_tags, "blk_mq_tags");
> + STRUCT_SIZE_INIT(sbitmap, "sbitmap");
> + STRUCT_SIZE_INIT(sbitmap_word, "sbitmap_word");
> + MEMBER_OFFSET_INIT(sbitmap_word_word, "sbitmap_word", "word");
> + MEMBER_OFFSET_INIT(sbitmap_word_cleared, "sbitmap_word", "cleared");
> + MEMBER_OFFSET_INIT(sbitmap_depth, "sbitmap", "depth");
> + MEMBER_OFFSET_INIT(sbitmap_shift, "sbitmap", "shift");
> + MEMBER_OFFSET_INIT(sbitmap_map_nr, "sbitmap", "map_nr");
> + MEMBER_OFFSET_INIT(sbitmap_map, "sbitmap", "map");
> + MEMBER_OFFSET_INIT(sbitmap_queue_sb, "sbitmap_queue", "sb");
> +
> }
> MEMBER_OFFSET_INIT(subsys_private_klist_devices, "subsys_private",
> "klist_devices");
> diff --git a/symbols.c b/symbols.c
> index 5d12a021c769..c1f09556d710 100644
> --- a/symbols.c
> +++ b/symbols.c
> @@ -10385,6 +10385,12 @@ dump_offset_table(char *spec, ulong makestruct)
> OFFSET(kset_list));
> fprintf(fp, " request_list_count: %ld\n",
> OFFSET(request_list_count));
> + fprintf(fp, " request_cmd_flags: %ld\n",
> + OFFSET(request_cmd_flags));
> + fprintf(fp, " request_q: %ld\n",
> + OFFSET(request_q));
> + fprintf(fp, " request_state: %ld\n",
> + OFFSET(request_state));
> fprintf(fp, " request_queue_in_flight: %ld\n",
> OFFSET(request_queue_in_flight));
> fprintf(fp, " request_queue_rq: %ld\n",
> @@ -10393,10 +10399,25 @@ dump_offset_table(char *spec, ulong makestruct)
> OFFSET(request_queue_mq_ops));
> fprintf(fp, " request_queue_queue_ctx: %ld\n",
> OFFSET(request_queue_queue_ctx));
> + fprintf(fp, " request_queue_queue_hw_ctx: %ld\n",
> + OFFSET(request_queue_queue_hw_ctx));
> + fprintf(fp, " request_queue_nr_hw_queues: %ld\n",
> + OFFSET(request_queue_nr_hw_queues));
> fprintf(fp, " blk_mq_ctx_rq_dispatched: %ld\n",
> OFFSET(blk_mq_ctx_rq_dispatched));
> fprintf(fp, " blk_mq_ctx_rq_completed: %ld\n",
> OFFSET(blk_mq_ctx_rq_completed));
> + fprintf(fp, " blk_mq_hw_ctx_tags: %ld\n",
> + OFFSET(blk_mq_hw_ctx_tags));
> + fprintf(fp, " blk_mq_tags_bitmap_tags: %ld\n",
> + OFFSET(blk_mq_tags_bitmap_tags));
> + fprintf(fp, " blk_mq_tags_breserved_tags: %ld\n",
> + OFFSET(blk_mq_tags_breserved_tags));
> + fprintf(fp, " blk_mq_tags_nr_reserved_tags: %ld\n",
> + OFFSET(blk_mq_tags_nr_reserved_tags));
> + fprintf(fp, " blk_mq_tags_rqs: %ld\n",
> + OFFSET(blk_mq_tags_rqs));
Good, thanks for adjusting these indents.
Thanks,
Kazu
> +
> fprintf(fp, " subsys_private_klist_devices: %ld\n",
> OFFSET(subsys_private_klist_devices));
> fprintf(fp, " subsystem_kset: %ld\n",
> @@ -11003,6 +11024,7 @@ dump_offset_table(char *spec, ulong makestruct)
> fprintf(fp, " sbitmap: %ld\n", SIZE(sbitmap));
> fprintf(fp, " sbitmap_queue: %ld\n", SIZE(sbitmap_queue));
> fprintf(fp, " sbq_wait_state: %ld\n", SIZE(sbq_wait_state));
> + fprintf(fp, " blk_mq_tags: %ld\n", SIZE(blk_mq_tags));
>
> fprintf(fp, "\n array_table:\n");
> /*