blkcg: implement BPF_PROG_TYPE_IO_COST
Currently, blkcg implements one builtin IO cost model - linear. To
allow customization and experimentation, allow a bpf program to
override the IO cost model.
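
The bpf program sees struct bpf_io_cost (added to the uapi header below)
and may only write its ->cost field; everything else is read-only input.
A minimal sketch of such a program - the function name and the cost
values are arbitrary placeholders, not part of this patch:

  // SPDX-License-Identifier: GPL-2.0
  #include <linux/bpf.h>
  #include "bpf_helpers.h"

  SEC("io_cost")
  int flat_cost(struct bpf_io_cost *ctx)
  {
  	/* charge a fixed amount per IO plus a per-page amount;
  	 * merges skip the per-page part, as in the linear sample
  	 * program included in this patch */
  	ctx->cost += 1000000;
  	if (!ctx->is_merge)
  		ctx->cost += 10000 * (ctx->nr_sectors >> 3);
  	return 0;
  }

Attaching is done by loading the object and passing the prog fd to the
new BLKBPFIOCOST ioctl on the block device; the iocost_ctrl selftest
below does exactly that, and detaches (prog_fd == -1) when invoked
without a program argument.
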
v2: bash completion and documentation updates suggested by Quentin.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Quentin Monnet <quentin.monnet@netronome.com>
diff --git a/block/Kconfig b/block/Kconfig
index 15b3de2..2882fdd 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -204,4 +204,7 @@
config BLK_PM
def_bool BLOCK && PM
+config BLK_BPF_IO_COST
+ def_bool BLK_CGROUP_IOWEIGHT && BPF_SYSCALL
+
source "block/Kconfig.iosched"
diff --git a/block/blk-ioweight.c b/block/blk-ioweight.c
index 79e93a5..32a9362 100644
--- a/block/blk-ioweight.c
+++ b/block/blk-ioweight.c
@@ -43,6 +43,10 @@
* parameters can be configured from userspace via
* /sys/fs/cgroup/io.weight.cost_model.
*
+ * For experimentation and refinement, the cost model can also be replaced
+ * by an IO_COST bpf program. See progs/iocost_linear_prog.c and
+ * iocost_ctrl.c under tools/testing/selftests/bpf for an example.
+ *
* 2. Control Strategy
*
* The device virtual time (vtime) is used as the primary control metric.
@@ -176,6 +180,7 @@
#include <linux/parser.h>
#include <linux/sched/signal.h>
#include <linux/blk-cgroup.h>
+#include <linux/filter.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk-wbt.h"
@@ -389,6 +394,10 @@ struct iow {
bool enabled;
struct iow_params params;
+#ifdef CONFIG_BLK_BPF_IO_COST
+ /* if non-NULL, bpf cost model is being used */
+ struct bpf_prog __rcu *cost_prog;
+#endif
u32 period_us;
u32 margin_us;
u64 vrate_min;
@@ -1571,6 +1580,45 @@ static void iow_timer_fn(struct timer_list *timer)
spin_unlock_irq(&iow->lock);
}
+#ifdef CONFIG_BLK_BPF_IO_COST
+static bool calc_vtime_cost_bpf(struct bio *bio, struct iow_gq *iowg,
+ bool is_merge, u64 *costp)
+{
+ struct iow *iow = iowg->iow;
+ struct bpf_prog *prog;
+ bool ret = false;
+
+ if (!iow->cost_prog)
+ return ret;
+
+ rcu_read_lock();
+ prog = rcu_dereference(iow->cost_prog);
+ if (prog) {
+ struct bpf_io_cost ctx = {
+ .cost = 0,
+ .opf = bio->bi_opf,
+ .nr_sectors = bio_sectors(bio),
+ .sector = bio->bi_iter.bi_sector,
+ .last_sector = iowg->cursor,
+ .is_merge = is_merge,
+ };
+
+ BPF_PROG_RUN(prog, &ctx);
+ *costp = ctx.cost;
+ ret = true;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+#else
+static bool calc_vtime_cost_bpf(struct bio *bio, struct iow_gq *iowg,
+ bool is_merge, u64 *costp)
+{
+ return false;
+}
+#endif
+
static void calc_vtime_cost_builtin(struct bio *bio, struct iow_gq *iowg,
bool is_merge, u64 *costp)
{
@@ -1616,6 +1664,9 @@ static u64 calc_vtime_cost(struct bio *bio, struct iow_gq *iowg, bool is_merge)
{
u64 cost;
+ if (calc_vtime_cost_bpf(bio, iowg, is_merge, &cost))
+ return cost;
+
calc_vtime_cost_builtin(bio, iowg, is_merge, &cost);
return cost;
}
@@ -2222,6 +2273,12 @@ static u64 iow_cost_model_prfill(struct seq_file *sf,
if (!dname)
return 0;
+#ifdef CONFIG_BLK_BPF_IO_COST
+ if (iow->cost_prog) {
+ seq_printf(sf, "%s ctrl=bpf\n", dname);
+ return 0;
+ }
+#endif
seq_printf(sf, "%s ctrl=%s model=linear "
"rbps=%llu rseqiops=%llu rrandiops=%llu "
"wbps=%llu wseqiops=%llu wrandiops=%llu\n",
@@ -2369,6 +2426,84 @@ static struct blkcg_policy blkcg_policy_iow = {
.pd_free_fn = iow_pd_free,
};
+#ifdef CONFIG_BLK_BPF_IO_COST
+static bool io_cost_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= sizeof(struct bpf_io_cost) || off % size)
+ return false;
+
+ if (off != offsetof(struct bpf_io_cost, cost) && type != BPF_READ)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_io_cost, opf):
+ bpf_ctx_record_field_size(info, sizeof(__u32));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
+ case offsetof(struct bpf_io_cost, nr_sectors):
+ return size == sizeof(__u32);
+ case offsetof(struct bpf_io_cost, cost):
+ case offsetof(struct bpf_io_cost, sector):
+ case offsetof(struct bpf_io_cost, last_sector):
+ return size == sizeof(__u64);
+ case offsetof(struct bpf_io_cost, is_merge):
+ return size == sizeof(__u8);
+ }
+
+ return false;
+}
+
+const struct bpf_prog_ops io_cost_prog_ops = {
+};
+
+const struct bpf_verifier_ops io_cost_verifier_ops = {
+ .is_valid_access = io_cost_is_valid_access,
+};
+
+int blk_bpf_io_cost_ioctl(struct block_device *bdev, unsigned cmd,
+ char __user *arg)
+{
+ int prog_fd = (int)(long)arg;
+ struct bpf_prog *prog = NULL;
+ struct request_queue *q;
+ struct iow *iow;
+ int ret = 0;
+
+ q = bdev_get_queue(bdev);
+ if (!q)
+ return -ENXIO;
+ iow = q_to_iow(q);
+
+ if (prog_fd >= 0) {
+ prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_IO_COST);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ spin_lock_irq(&iow->lock);
+ if (!iow->cost_prog) {
+ rcu_assign_pointer(iow->cost_prog, prog);
+ prog = NULL;
+ } else {
+ ret = -EEXIST;
+ }
+ spin_unlock_irq(&iow->lock);
+ } else {
+ spin_lock_irq(&iow->lock);
+ if (iow->cost_prog) {
+ prog = iow->cost_prog;
+ rcu_assign_pointer(iow->cost_prog, NULL);
+ }
+ spin_unlock_irq(&iow->lock);
+ }
+
+ if (prog)
+ bpf_prog_put(prog);
+ return ret;
+}
+#endif /* CONFIG_BLK_BPF_IO_COST */
+
static int __init iow_init(void)
{
return blkcg_policy_register(&blkcg_policy_iow);
diff --git a/block/blk.h b/block/blk.h
index 7814aa20..98fa228 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -317,6 +317,14 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
}
#endif /* CONFIG_BOUNCE */
+#ifdef CONFIG_BLK_BPF_IO_COST
+int blk_bpf_io_cost_ioctl(struct block_device *bdev, unsigned cmd,
+ char __user *arg);
+#else
+static inline int blk_bpf_io_cost_ioctl(struct block_device *bdev, unsigned cmd,
+ char __user *arg) { return -ENOTTY; }
+#endif
+
#ifdef CONFIG_BLK_CGROUP_IOLATENCY
extern int blk_iolatency_init(struct request_queue *q);
#else
diff --git a/block/ioctl.c b/block/ioctl.c
index 15a0eb8..89d48d7d 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -11,6 +11,8 @@
#include <linux/pr.h>
#include <linux/uaccess.h>
+#include "blk.h"
+
static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
{
struct block_device *bdevp;
@@ -590,6 +592,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKTRACESETUP:
case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, argp);
+ case BLKBPFIOCOST:
+ return blk_bpf_io_cost_ioctl(bdev, cmd, argp);
case IOC_PR_REGISTER:
return blkdev_pr_register(bdev, argp);
case IOC_PR_RESERVE:
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5a99756..fb0a91c 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -37,6 +37,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
#ifdef CONFIG_INET
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
#endif
+#ifdef CONFIG_BLK_BPF_IO_COST
+BPF_PROG_TYPE(BPF_PROG_TYPE_IO_COST, io_cost)
+#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 63e0cf6..1664ef4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -170,6 +170,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_FLOW_DISSECTOR,
BPF_PROG_TYPE_CGROUP_SYSCTL,
BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+ BPF_PROG_TYPE_IO_COST,
};
enum bpf_attach_type {
@@ -3472,6 +3473,16 @@ struct bpf_flow_keys {
};
};
+struct bpf_io_cost {
+ __u64 cost; /* output */
+
+ __u32 opf;
+ __u32 nr_sectors;
+ __u64 sector;
+ __u64 last_sector;
+ __u8 is_merge;
+};
+
struct bpf_func_info {
__u32 insn_off;
__u32 type_id;
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 59c71fa..ddf3c80 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -181,6 +181,8 @@ struct fsxattr {
#define BLKSECDISCARD _IO(0x12,125)
#define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127)
+#define BLKBPFIOCOST _IO(0x12, 128)
+
/*
* A jump here: 130-131 are reserved for zoned block devices
* (see uapi/linux/blkzoned.h)
diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index d672d90..beeac8a 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -383,6 +383,9 @@ static void probe_kernel_image_config(void)
/* bpftilter module with "user mode helper" */
"CONFIG_BPFILTER_UMH",
+ /* Block */
+ "CONFIG_BLK_IO_COST",
+
/* test_bpf module for BPF tests */
"CONFIG_TEST_BPF",
};
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 3d63feb..298e53f35 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -74,6 +74,7 @@ static const char * const prog_type_name[] = {
[BPF_PROG_TYPE_SK_REUSEPORT] = "sk_reuseport",
[BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector",
[BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl",
+ [BPF_PROG_TYPE_IO_COST] = "io_cost",
};
extern const char * const map_type_name[];
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 63e0cf6..1664ef4 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -170,6 +170,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_FLOW_DISSECTOR,
BPF_PROG_TYPE_CGROUP_SYSCTL,
BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+ BPF_PROG_TYPE_IO_COST,
};
enum bpf_attach_type {
@@ -3472,6 +3473,16 @@ struct bpf_flow_keys {
};
};
+struct bpf_io_cost {
+ __u64 cost; /* output */
+
+ __u32 opf;
+ __u32 nr_sectors;
+ __u64 sector;
+ __u64 last_sector;
+ __u8 is_merge;
+};
+
struct bpf_func_info {
__u32 insn_off;
__u32 type_id;
diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h
index 59c71fa..ddf3c80 100644
--- a/tools/include/uapi/linux/fs.h
+++ b/tools/include/uapi/linux/fs.h
@@ -181,6 +181,8 @@ struct fsxattr {
#define BLKSECDISCARD _IO(0x12,125)
#define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127)
+#define BLKBPFIOCOST _IO(0x12, 128)
+
/*
* A jump here: 130-131 are reserved for zoned block devices
* (see uapi/linux/blkzoned.h)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 197b574..6dbee40 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2266,6 +2266,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_IO_COST:
return false;
case BPF_PROG_TYPE_KPROBE:
default:
@@ -3168,6 +3169,7 @@ static const struct {
BPF_PROG_SEC("lwt_out", BPF_PROG_TYPE_LWT_OUT),
BPF_PROG_SEC("lwt_xmit", BPF_PROG_TYPE_LWT_XMIT),
BPF_PROG_SEC("lwt_seg6local", BPF_PROG_TYPE_LWT_SEG6LOCAL),
+ BPF_PROG_SEC("io_cost", BPF_PROG_TYPE_IO_COST),
BPF_APROG_SEC("cgroup_skb/ingress", BPF_PROG_TYPE_CGROUP_SKB,
BPF_CGROUP_INET_INGRESS),
BPF_APROG_SEC("cgroup_skb/egress", BPF_PROG_TYPE_CGROUP_SKB,
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 5e2aa83..0248317 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -101,6 +101,7 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
case BPF_PROG_TYPE_SK_REUSEPORT:
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_IO_COST:
default:
break;
}
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 66f2dca..c28f308 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -23,7 +23,7 @@
test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \
test_socket_cookie test_cgroup_storage test_select_reuseport test_section_names \
- test_netcnt test_tcpnotify_user test_sock_fields test_sysctl
+ test_netcnt test_tcpnotify_user test_sock_fields test_sysctl iocost_ctrl
BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
TEST_GEN_FILES = $(BPF_OBJ_FILES)
diff --git a/tools/testing/selftests/bpf/iocost_ctrl.c b/tools/testing/selftests/bpf/iocost_ctrl.c
new file mode 100644
index 0000000..d9d3eb7
--- /dev/null
+++ b/tools/testing/selftests/bpf/iocost_ctrl.c
@@ -0,0 +1,43 @@
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <linux/fs.h>
+
+int main(int argc, char **argv)
+{
+ struct bpf_object *obj;
+ int dev_fd, prog_fd = -1;
+
+ if (argc < 2) {
+		fprintf(stderr, "Usage: %s BLKDEV [BPF_PROG]\n", argv[0]);
+ return 1;
+ }
+
+ dev_fd = open(argv[1], O_RDONLY);
+ if (dev_fd < 0) {
+ perror("open(BLKDEV)");
+ return 1;
+ }
+
+ if (argc > 2) {
+ if (bpf_prog_load(argv[2], BPF_PROG_TYPE_IO_COST,
+ &obj, &prog_fd)) {
+ perror("bpf_prog_load(BPF_PROG)");
+ return 1;
+ }
+ }
+
+ if (ioctl(dev_fd, BLKBPFIOCOST, (long)prog_fd)) {
+ perror("ioctl(BLKBPFIOCOST)");
+ return 1;
+ }
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/iocost_linear_prog.c b/tools/testing/selftests/bpf/progs/iocost_linear_prog.c
new file mode 100644
index 0000000..4e202c5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/iocost_linear_prog.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/version.h>
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define REQ_OP_READ 0
+#define REQ_OP_WRITE 1
+#define REQ_OP_BITS 8
+#define REQ_OP_MASK ((1 << REQ_OP_BITS) - 1)
+
+#define LCOEF_RSEQIO 14663889
+#define LCOEF_RRANDIO 248752010
+#define LCOEF_RPAGE 28151808
+#define LCOEF_WSEQIO 32671670
+#define LCOEF_WRANDIO 63150006
+#define LCOEF_WPAGE 7323648
+
+#define RAND_IO_CUTOFF 10
+
+SEC("io_cost")
+int func(struct bpf_io_cost *ctx)
+{
+ int op;
+ __u64 seqio, randio, page;
+ __s64 delta;
+
+ switch (ctx->opf & REQ_OP_MASK) {
+ case REQ_OP_READ:
+ seqio = LCOEF_RSEQIO;
+ randio = LCOEF_RRANDIO;
+ page = LCOEF_RPAGE;
+ break;
+ case REQ_OP_WRITE:
+ seqio = LCOEF_WSEQIO;
+ randio = LCOEF_WRANDIO;
+ page = LCOEF_WPAGE;
+ break;
+ default:
+ return 0;
+ }
+
+ delta = ctx->sector - ctx->last_sector;
+ if (delta >= -RAND_IO_CUTOFF && delta <= RAND_IO_CUTOFF)
+ ctx->cost += seqio;
+ else
+ ctx->cost += randio;
+ if (!ctx->is_merge)
+ ctx->cost += page * (ctx->nr_sectors >> 3);
+
+ return 0;
+}