// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
// Copyright (c) 2020 Wenbo Zhang
//
// Based on biosnoop(8) from BCC by Brendan Gregg.
// 29-Jun-2020   Wenbo Zhang   Created this.
#include <argp.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <time.h>
#include <bpf/libbpf.h>
#include <sys/resource.h>
#include <bpf/bpf.h>
#include "blk_types.h"
#include "biosnoop.h"
#include "biosnoop.skel.h"
#include "trace_helpers.h"

#define PERF_BUFFER_PAGES	16
#define PERF_POLL_TIMEOUT_MS	100

static volatile sig_atomic_t exiting = 0;

static struct env {
	char *disk;
	int duration;
	bool timestamp;
	bool queued;
	bool verbose;
} env = {};

static volatile __u64 start_ts;

const char *argp_program_version = "biosnoop 0.1";
const char *argp_program_bug_address =
	"https://github.com/iovisor/bcc/tree/master/libbpf-tools";
const char argp_program_doc[] =
"Trace block I/O.\n"
"\n"
"USAGE: biosnoop [--help] [-d DISK] [-Q]\n"
"\n"
"EXAMPLES:\n"
"    biosnoop              # trace all block I/O\n"
"    biosnoop -Q           # include OS queued time in I/O time\n"
"    biosnoop 10           # trace for 10 seconds only\n"
"    biosnoop -d sdc       # trace sdc only\n";

static const struct argp_option opts[] = {
	{ "queued", 'Q', NULL, 0, "Include OS queued time in I/O time" },
	{ "disk",  'd', "DISK",  0, "Trace this disk only" },
	{ "verbose", 'v', NULL, 0, "Verbose debug output" },
	{ NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" },
	{},
};

static error_t parse_arg(int key, char *arg, struct argp_state *state)
{
	static int pos_args;

	switch (key) {
	case 'h':
		argp_state_help(state, stderr, ARGP_HELP_STD_HELP);
		break;
	case 'v':
		env.verbose = true;
		break;
	case 'Q':
		env.queued = true;
		break;
	case 'd':
		env.disk = arg;
		if (strlen(arg) + 1 > DISK_NAME_LEN) {
			fprintf(stderr, "invaild disk name: too long\n");
			argp_usage(state);
		}
		break;
	case ARGP_KEY_ARG:
		if (pos_args++) {
			fprintf(stderr,
				"unrecognized positional argument: %s\n", arg);
			argp_usage(state);
		}
		errno = 0;
		env.duration = strtoll(arg, NULL, 10);
		if (errno || env.duration <= 0) {
			fprintf(stderr, "invalid delay (in us): %s\n", arg);
			argp_usage(state);
		}
		break;
	default:
		return ARGP_ERR_UNKNOWN;
	}
	return 0;
}

int libbpf_print_fn(enum libbpf_print_level level,
		const char *format, va_list args)
{
	if (level == LIBBPF_DEBUG && !env.verbose)
		return 0;
	return vfprintf(stderr, format, args);
}

static void sig_int(int signo)
{
	exiting = 1;
}

static void blk_fill_rwbs(char *rwbs, unsigned int op)
{
	int i = 0;

	if (op & REQ_PREFLUSH)
		rwbs[i++] = 'F';

	switch (op & REQ_OP_MASK) {
	case REQ_OP_WRITE:
	case REQ_OP_WRITE_SAME:
		rwbs[i++] = 'W';
		break;
	case REQ_OP_DISCARD:
		rwbs[i++] = 'D';
		break;
	case REQ_OP_SECURE_ERASE:
		rwbs[i++] = 'D';
		rwbs[i++] = 'E';
		break;
	case REQ_OP_FLUSH:
		rwbs[i++] = 'F';
		break;
	case REQ_OP_READ:
		rwbs[i++] = 'R';
		break;
	default:
		rwbs[i++] = 'N';
	}

	if (op & REQ_FUA)
		rwbs[i++] = 'F';
	if (op & REQ_RAHEAD)
		rwbs[i++] = 'A';
	if (op & REQ_SYNC)
		rwbs[i++] = 'S';
	if (op & REQ_META)
		rwbs[i++] = 'M';

	rwbs[i] = '\0';
}

static struct partitions *partitions;

void handle_event(void *ctx, int cpu, void *data, __u32 data_sz)
{
	const struct partition *partition;
	const struct event *e = data;
	char rwbs[RWBS_LEN];

	if (!start_ts)
		start_ts = e->ts;
	blk_fill_rwbs(rwbs, e->cmd_flags);
	partition = partitions__get_by_dev(partitions, e->dev);
	printf("%-11.6f %-14.14s %-6d %-7s %-4s %-10lld %-7d ",
		(e->ts - start_ts) / 1000000000.0,
		e->comm, e->pid, partition ? partition->name : "Unknown", rwbs,
		e->sector, e->len);
	if (env.queued)
		printf("%7.3f ", e->qdelta != -1 ?
			e->qdelta / 1000000.0 : -1);
	printf("%7.3f\n", e->delta / 1000000.0);
}

void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt)
{
	fprintf(stderr, "lost %llu events on CPU #%d\n", lost_cnt, cpu);
}

int main(int argc, char **argv)
{
	const struct partition *partition;
	static const struct argp argp = {
		.options = opts,
		.parser = parse_arg,
		.doc = argp_program_doc,
	};
	struct perf_buffer_opts pb_opts;
	struct perf_buffer *pb = NULL;
	struct ksyms *ksyms = NULL;
	struct biosnoop_bpf *obj;
	__u64 time_end = 0;
	int err;

	err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
	if (err)
		return err;

	libbpf_set_print(libbpf_print_fn);

	err = bump_memlock_rlimit();
	if (err) {
		fprintf(stderr, "failed to increase rlimit: %d\n", err);
		return 1;
	}

	obj = biosnoop_bpf__open();
	if (!obj) {
		fprintf(stderr, "failed to open BPF object\n");
		return 1;
	}

	partitions = partitions__load();
	if (!partitions) {
		fprintf(stderr, "failed to load partitions info\n");
		goto cleanup;
	}

	/* initialize global data (filtering options) */
	if (env.disk) {
		partition = partitions__get_by_name(partitions, env.disk);
		if (!partition) {
			fprintf(stderr, "invaild partition name: not exist\n");
			goto cleanup;
		}
	}
	obj->rodata->targ_queued = env.queued;

	err = biosnoop_bpf__load(obj);
	if (err) {
		fprintf(stderr, "failed to load BPF object: %d\n", err);
		goto cleanup;
	}

	obj->links.blk_account_io_start =
		bpf_program__attach(obj->progs.blk_account_io_start);
	err = libbpf_get_error(obj->links.blk_account_io_start);
	if (err) {
		fprintf(stderr, "failed to attach blk_account_io_start: %s\n",
			strerror(err));
		goto cleanup;
	}
	ksyms = ksyms__load();
	if (!ksyms) {
		fprintf(stderr, "failed to load kallsyms\n");
		goto cleanup;
	}
	if (ksyms__get_symbol(ksyms, "blk_account_io_merge_bio")) {
		obj->links.blk_account_io_merge_bio =
			bpf_program__attach(obj->progs.blk_account_io_merge_bio);
		err = libbpf_get_error(obj->links.blk_account_io_merge_bio);
		if (err) {
			fprintf(stderr, "failed to attach "
				"blk_account_io_merge_bio: %s\n",
				strerror(err));
			goto cleanup;
		}
	}
	if (env.queued) {
		obj->links.block_rq_insert =
			bpf_program__attach(obj->progs.block_rq_insert);
		err = libbpf_get_error(obj->links.block_rq_insert);
		if (err) {
			fprintf(stderr, "failed to attach block_rq_insert: %s\n",
				strerror(err));
			goto cleanup;
		}
	}
	obj->links.block_rq_issue =
		bpf_program__attach(obj->progs.block_rq_issue);
	err = libbpf_get_error(obj->links.block_rq_issue);
	if (err) {
		fprintf(stderr, "failed to attach block_rq_issue: %s\n",
			strerror(err));
		goto cleanup;
	}
	obj->links.block_rq_complete =
		bpf_program__attach(obj->progs.block_rq_complete);
	err = libbpf_get_error(obj->links.block_rq_complete);
	if (err) {
		fprintf(stderr, "failed to attach block_rq_complete: %s\n",
			strerror(err));
		goto cleanup;
	}

	pb_opts.sample_cb = handle_event;
	pb_opts.lost_cb = handle_lost_events;
	pb = perf_buffer__new(bpf_map__fd(obj->maps.events), PERF_BUFFER_PAGES,
			&pb_opts);
	err = libbpf_get_error(pb);
	if (err) {
		pb = NULL;
		fprintf(stderr, "failed to open perf buffer: %d\n", err);
		goto cleanup;
	}

	printf("%-11s %-14s %-6s %-7s %-4s %-10s %-7s ",
		"TIME(s)", "COMM", "PID", "DISK", "T", "SECTOR", "BYTES");
	if (env.queued)
		printf("%7s ", "QUE(ms)");
	printf("%7s\n", "LAT(ms)");

	/* setup duration */
	if (env.duration)
		time_end = get_ktime_ns() + env.duration * NSEC_PER_SEC;

	if (signal(SIGINT, sig_int) == SIG_ERR) {
		fprintf(stderr, "can't set signal handler: %s\n", strerror(errno));
		err = 1;
		goto cleanup;
	}

	/* main: poll */
	while (!exiting) {
		err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS);
		if (err < 0 && errno != EINTR) {
			fprintf(stderr, "error polling perf buffer: %s\n", strerror(errno));
			goto cleanup;
		}
		if (env.duration && get_ktime_ns() > time_end)
			goto cleanup;
		/* reset err to return 0 if exiting */
		err = 0;
	}

cleanup:
	perf_buffer__free(pb);
	biosnoop_bpf__destroy(obj);
	ksyms__free(ksyms);
	partitions__free(partitions);

	return err != 0;
}