|  | // SPDX-License-Identifier: GPL-2.0 | 
|  | /* Copyright (c) 2019 Facebook | 
|  | * | 
|  | * This program is free software; you can redistribute it and/or | 
|  | * modify it under the terms of version 2 of the GNU General Public | 
|  | * License as published by the Free Software Foundation. | 
|  | * | 
|  | * Example program for Host Bandwidth Management | 
|  | * | 
|  | * This program loads a cgroup skb BPF program to enforce cgroup output | 
|  | * (egress) or input (ingress) bandwidth limits. | 
|  | * | 
|  | * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog] | 
|  | *   Where: | 
|  | *    -d	Print BPF trace debug buffer | 
|  | *    -l	Also limit flows doing loopback | 
|  | *    -n <#>	To create cgroup \"/hbm#\" and attach prog | 
|  | *		Default is /hbm1 | 
|  | *    --no_cn   Do not return cn notifications | 
|  | *    -r <rate>	Rate limit in Mbps | 
|  | *    -s	Get HBM stats (marked, dropped, etc.) | 
|  | *    -t <time>	Exit after specified seconds (default is 1) | 
|  | *    -w	Work conserving flag. cgroup can increase its bandwidth | 
|  | *		beyond the rate limit specified while there is available | 
|  | *		bandwidth. Current implementation assumes there is only | 
|  | *		one NIC (eth0), but can be extended to support multiple NICs. | 
|  | *		Currently only supported for egress. | 
|  | *    -h	Print this info | 
|  | *    prog	BPF program file name. Name defaults to hbm_out_kern.o | 
|  | */ | 
|  |  | 
|  | #define _GNU_SOURCE | 
|  |  | 
|  | #include <stdio.h> | 
|  | #include <stdlib.h> | 
|  | #include <assert.h> | 
|  | #include <sys/resource.h> | 
|  | #include <sys/time.h> | 
|  | #include <unistd.h> | 
|  | #include <errno.h> | 
|  | #include <fcntl.h> | 
|  | #include <linux/unistd.h> | 
|  | #include <linux/compiler.h> | 
|  |  | 
|  | #include <linux/bpf.h> | 
|  | #include <bpf/bpf.h> | 
|  | #include <getopt.h> | 
|  |  | 
|  | #include "bpf_rlimit.h" | 
|  | #include "cgroup_helpers.h" | 
|  | #include "hbm.h" | 
|  | #include "bpf_util.h" | 
|  | #include <bpf/libbpf.h> | 
|  |  | 
/* Directory that holds the kernel trace files (trace_pipe is read from here). */
#define DEBUGFS "/sys/kernel/debug/tracing/"

/* Helpers defined further down in this file. */
static void Usage(void);
static void read_trace_pipe2(void);
static void do_error(char *msg, bool errno_flag);

/*
 * Direction selector. When false the program is attached with the ingress
 * attach type; nothing in this file clears it, so egress is always used.
 */
bool outFlag = true;

/* Rate limits, both in Mbps. */
int minRate = 1000;		/* cgroup rate limit in Mbps */
int rate = 1000;		/* can grow if rate conserving is enabled */

/* Run time in seconds (-t). */
int dur = 1;

/* Option switches set by main() from the command line. */
bool stats_flag;		/* -s */
bool loopback_flag;		/* -l */
bool debugFlag;			/* -d */
bool work_conserving_flag;	/* -w */
bool no_cn_flag;		/* --no_cn */
bool edt_flag;			/* --edt */

/* libbpf handles and the queue_stats map fd, filled in by prog_load(). */
static struct bpf_program *bpf_prog;
static struct bpf_object *obj;
static int queue_stats_fd;
|  |  | 
|  | static void read_trace_pipe2(void) | 
|  | { | 
|  | int trace_fd; | 
|  | FILE *outf; | 
|  | char *outFname = "hbm_out.log"; | 
|  |  | 
|  | trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); | 
|  | if (trace_fd < 0) { | 
|  | printf("Error opening trace_pipe\n"); | 
|  | return; | 
|  | } | 
|  |  | 
|  | //	Future support of ingress | 
|  | //	if (!outFlag) | 
|  | //		outFname = "hbm_in.log"; | 
|  | outf = fopen(outFname, "w"); | 
|  |  | 
|  | if (outf == NULL) | 
|  | printf("Error creating %s\n", outFname); | 
|  |  | 
|  | while (1) { | 
|  | static char buf[4097]; | 
|  | ssize_t sz; | 
|  |  | 
|  | sz = read(trace_fd, buf, sizeof(buf) - 1); | 
|  | if (sz > 0) { | 
|  | buf[sz] = 0; | 
|  | puts(buf); | 
|  | if (outf != NULL) { | 
|  | fprintf(outf, "%s\n", buf); | 
|  | fflush(outf); | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
/*
 * Print a fatal error on stdout and terminate the process with status 1.
 *
 * msg:        text printed after the "ERROR: " prefix.
 * errno_flag: when true, the current errno value is appended to the line.
 *
 * This function does not return.
 */
static void do_error(char *msg, bool errno_flag)
{
	if (!errno_flag) {
		printf("ERROR: %s\n", msg);
		exit(1);
	}

	printf("ERROR: %s, errno: %d\n", msg, errno);
	exit(1);
}
|  |  | 
|  | static int prog_load(char *prog) | 
|  | { | 
|  | obj = bpf_object__open_file(prog, NULL); | 
|  | if (libbpf_get_error(obj)) { | 
|  | printf("ERROR: opening BPF object file failed\n"); | 
|  | return 1; | 
|  | } | 
|  |  | 
|  | /* load BPF program */ | 
|  | if (bpf_object__load(obj)) { | 
|  | printf("ERROR: loading BPF object file failed\n"); | 
|  | goto err; | 
|  | } | 
|  |  | 
|  | bpf_prog = bpf_object__find_program_by_title(obj, "cgroup_skb/egress"); | 
|  | if (!bpf_prog) { | 
|  | printf("ERROR: finding a prog in obj file failed\n"); | 
|  | goto err; | 
|  | } | 
|  |  | 
|  | queue_stats_fd = bpf_object__find_map_fd_by_name(obj, "queue_stats"); | 
|  | if (queue_stats_fd < 0) { | 
|  | printf("ERROR: finding a map in obj file failed\n"); | 
|  | goto err; | 
|  | } | 
|  |  | 
|  | return 0; | 
|  |  | 
|  | err: | 
|  | bpf_object__close(obj); | 
|  | return 1; | 
|  | } | 
|  |  | 
|  | static int run_bpf_prog(char *prog, int cg_id) | 
|  | { | 
|  | struct hbm_queue_stats qstats = {0}; | 
|  | char cg_dir[100], cg_pin_path[100]; | 
|  | struct bpf_link *link = NULL; | 
|  | int key = 0; | 
|  | int cg1 = 0; | 
|  | int rc = 0; | 
|  |  | 
|  | sprintf(cg_dir, "/hbm%d", cg_id); | 
|  | rc = prog_load(prog); | 
|  | if (rc != 0) | 
|  | return rc; | 
|  |  | 
|  | if (setup_cgroup_environment()) { | 
|  | printf("ERROR: setting cgroup environment\n"); | 
|  | goto err; | 
|  | } | 
|  | cg1 = create_and_get_cgroup(cg_dir); | 
|  | if (!cg1) { | 
|  | printf("ERROR: create_and_get_cgroup\n"); | 
|  | goto err; | 
|  | } | 
|  | if (join_cgroup(cg_dir)) { | 
|  | printf("ERROR: join_cgroup\n"); | 
|  | goto err; | 
|  | } | 
|  |  | 
|  | qstats.rate = rate; | 
|  | qstats.stats = stats_flag ? 1 : 0; | 
|  | qstats.loopback = loopback_flag ? 1 : 0; | 
|  | qstats.no_cn = no_cn_flag ? 1 : 0; | 
|  | if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) { | 
|  | printf("ERROR: Could not update map element\n"); | 
|  | goto err; | 
|  | } | 
|  |  | 
|  | if (!outFlag) | 
|  | bpf_program__set_expected_attach_type(bpf_prog, BPF_CGROUP_INET_INGRESS); | 
|  |  | 
|  | link = bpf_program__attach_cgroup(bpf_prog, cg1); | 
|  | if (libbpf_get_error(link)) { | 
|  | fprintf(stderr, "ERROR: bpf_program__attach_cgroup failed\n"); | 
|  | goto err; | 
|  | } | 
|  |  | 
|  | sprintf(cg_pin_path, "/sys/fs/bpf/hbm%d", cg_id); | 
|  | rc = bpf_link__pin(link, cg_pin_path); | 
|  | if (rc < 0) { | 
|  | printf("ERROR: bpf_link__pin failed: %d\n", rc); | 
|  | goto err; | 
|  | } | 
|  |  | 
|  | if (work_conserving_flag) { | 
|  | struct timeval t0, t_last, t_new; | 
|  | FILE *fin; | 
|  | unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; | 
|  | signed long long last_cg_tx_bytes, new_cg_tx_bytes; | 
|  | signed long long delta_time, delta_bytes, delta_rate; | 
|  | int delta_ms; | 
|  | #define DELTA_RATE_CHECK 10000		/* in us */ | 
|  | #define RATE_THRESHOLD 9500000000	/* 9.5 Gbps */ | 
|  |  | 
|  | bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); | 
|  | if (gettimeofday(&t0, NULL) < 0) | 
|  | do_error("gettimeofday failed", true); | 
|  | t_last = t0; | 
|  | fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); | 
|  | if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1) | 
|  | do_error("fscanf fails", false); | 
|  | fclose(fin); | 
|  | last_cg_tx_bytes = qstats.bytes_total; | 
|  | while (true) { | 
|  | usleep(DELTA_RATE_CHECK); | 
|  | if (gettimeofday(&t_new, NULL) < 0) | 
|  | do_error("gettimeofday failed", true); | 
|  | delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + | 
|  | (t_new.tv_usec - t0.tv_usec)/1000; | 
|  | if (delta_ms > dur * 1000) | 
|  | break; | 
|  | delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + | 
|  | (t_new.tv_usec - t_last.tv_usec); | 
|  | if (delta_time == 0) | 
|  | continue; | 
|  | t_last = t_new; | 
|  | fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", | 
|  | "r"); | 
|  | if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1) | 
|  | do_error("fscanf fails", false); | 
|  | fclose(fin); | 
|  | printf("  new_eth_tx_bytes:%llu\n", | 
|  | new_eth_tx_bytes); | 
|  | bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); | 
|  | new_cg_tx_bytes = qstats.bytes_total; | 
|  | delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; | 
|  | last_eth_tx_bytes = new_eth_tx_bytes; | 
|  | delta_rate = (delta_bytes * 8000000) / delta_time; | 
|  | printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps", | 
|  | delta_ms, delta_rate/1000000000.0, | 
|  | rate/1000.0); | 
|  | if (delta_rate < RATE_THRESHOLD) { | 
|  | /* can increase cgroup rate limit, but first | 
|  | * check if we are using the current limit. | 
|  | * Currently increasing by 6.25%, unknown | 
|  | * if that is the optimal rate. | 
|  | */ | 
|  | int rate_diff100; | 
|  |  | 
|  | delta_bytes = new_cg_tx_bytes - | 
|  | last_cg_tx_bytes; | 
|  | last_cg_tx_bytes = new_cg_tx_bytes; | 
|  | delta_rate = (delta_bytes * 8000000) / | 
|  | delta_time; | 
|  | printf(" rate:%.3fGbps", | 
|  | delta_rate/1000000000.0); | 
|  | rate_diff100 = (((long long)rate)*1000000 - | 
|  | delta_rate) * 100 / | 
|  | (((long long) rate) * 1000000); | 
|  | printf("  rdiff:%d", rate_diff100); | 
|  | if (rate_diff100  <= 3) { | 
|  | rate += (rate >> 4); | 
|  | if (rate > RATE_THRESHOLD / 1000000) | 
|  | rate = RATE_THRESHOLD / 1000000; | 
|  | qstats.rate = rate; | 
|  | printf(" INC\n"); | 
|  | } else { | 
|  | printf("\n"); | 
|  | } | 
|  | } else { | 
|  | /* Need to decrease cgroup rate limit. | 
|  | * Currently decreasing by 12.5%, unknown | 
|  | * if that is optimal | 
|  | */ | 
|  | printf(" DEC\n"); | 
|  | rate -= (rate >> 3); | 
|  | if (rate < minRate) | 
|  | rate = minRate; | 
|  | qstats.rate = rate; | 
|  | } | 
|  | if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) | 
|  | do_error("update map element fails", false); | 
|  | } | 
|  | } else { | 
|  | sleep(dur); | 
|  | } | 
|  | // Get stats! | 
|  | if (stats_flag && bpf_map_lookup_elem(queue_stats_fd, &key, &qstats)) { | 
|  | char fname[100]; | 
|  | FILE *fout; | 
|  |  | 
|  | if (!outFlag) | 
|  | sprintf(fname, "hbm.%d.in", cg_id); | 
|  | else | 
|  | sprintf(fname, "hbm.%d.out", cg_id); | 
|  | fout = fopen(fname, "w"); | 
|  | fprintf(fout, "id:%d\n", cg_id); | 
|  | fprintf(fout, "ERROR: Could not lookup queue_stats\n"); | 
|  | } else if (stats_flag && qstats.lastPacketTime > | 
|  | qstats.firstPacketTime) { | 
|  | long long delta_us = (qstats.lastPacketTime - | 
|  | qstats.firstPacketTime)/1000; | 
|  | unsigned int rate_mbps = ((qstats.bytes_total - | 
|  | qstats.bytes_dropped) * 8 / | 
|  | delta_us); | 
|  | double percent_pkts, percent_bytes; | 
|  | char fname[100]; | 
|  | FILE *fout; | 
|  | int k; | 
|  | static const char *returnValNames[] = { | 
|  | "DROP_PKT", | 
|  | "ALLOW_PKT", | 
|  | "DROP_PKT_CWR", | 
|  | "ALLOW_PKT_CWR" | 
|  | }; | 
|  | #define RET_VAL_COUNT 4 | 
|  |  | 
|  | // Future support of ingress | 
|  | //		if (!outFlag) | 
|  | //			sprintf(fname, "hbm.%d.in", cg_id); | 
|  | //		else | 
|  | sprintf(fname, "hbm.%d.out", cg_id); | 
|  | fout = fopen(fname, "w"); | 
|  | fprintf(fout, "id:%d\n", cg_id); | 
|  | fprintf(fout, "rate_mbps:%d\n", rate_mbps); | 
|  | fprintf(fout, "duration:%.1f secs\n", | 
|  | (qstats.lastPacketTime - qstats.firstPacketTime) / | 
|  | 1000000000.0); | 
|  | fprintf(fout, "packets:%d\n", (int)qstats.pkts_total); | 
|  | fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total / | 
|  | 1000000)); | 
|  | fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped); | 
|  | fprintf(fout, "bytes_dropped_MB:%d\n", | 
|  | (int)(qstats.bytes_dropped / | 
|  | 1000000)); | 
|  | // Marked Pkts and Bytes | 
|  | percent_pkts = (qstats.pkts_marked * 100.0) / | 
|  | (qstats.pkts_total + 1); | 
|  | percent_bytes = (qstats.bytes_marked * 100.0) / | 
|  | (qstats.bytes_total + 1); | 
|  | fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts); | 
|  | fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes); | 
|  |  | 
|  | // Dropped Pkts and Bytes | 
|  | percent_pkts = (qstats.pkts_dropped * 100.0) / | 
|  | (qstats.pkts_total + 1); | 
|  | percent_bytes = (qstats.bytes_dropped * 100.0) / | 
|  | (qstats.bytes_total + 1); | 
|  | fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); | 
|  | fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); | 
|  |  | 
|  | // ECN CE markings | 
|  | percent_pkts = (qstats.pkts_ecn_ce * 100.0) / | 
|  | (qstats.pkts_total + 1); | 
|  | fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts, | 
|  | (int)qstats.pkts_ecn_ce); | 
|  |  | 
|  | // Average cwnd | 
|  | fprintf(fout, "avg cwnd:%d\n", | 
|  | (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1))); | 
|  | // Average rtt | 
|  | fprintf(fout, "avg rtt:%d\n", | 
|  | (int)(qstats.sum_rtt / (qstats.pkts_total + 1))); | 
|  | // Average credit | 
|  | if (edt_flag) | 
|  | fprintf(fout, "avg credit_ms:%.03f\n", | 
|  | (qstats.sum_credit / | 
|  | (qstats.pkts_total + 1.0)) / 1000000.0); | 
|  | else | 
|  | fprintf(fout, "avg credit:%d\n", | 
|  | (int)(qstats.sum_credit / | 
|  | (1500 * ((int)qstats.pkts_total ) + 1))); | 
|  |  | 
|  | // Return values stats | 
|  | for (k = 0; k < RET_VAL_COUNT; k++) { | 
|  | percent_pkts = (qstats.returnValCount[k] * 100.0) / | 
|  | (qstats.pkts_total + 1); | 
|  | fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k], | 
|  | percent_pkts, (int)qstats.returnValCount[k]); | 
|  | } | 
|  | fclose(fout); | 
|  | } | 
|  |  | 
|  | if (debugFlag) | 
|  | read_trace_pipe2(); | 
|  | goto cleanup; | 
|  |  | 
|  | err: | 
|  | rc = 1; | 
|  |  | 
|  | cleanup: | 
|  | bpf_link__destroy(link); | 
|  | bpf_object__close(obj); | 
|  |  | 
|  | if (cg1 != -1) | 
|  | close(cg1); | 
|  |  | 
|  | if (rc != 0) | 
|  | cleanup_cgroup_environment(); | 
|  | return rc; | 
|  | } | 
|  |  | 
/*
 * Print the command-line help text on stdout.
 *
 * The text lists every option handled by main(), including --edt, and
 * states the 1-second default run time that "dur" is initialised to.
 */
static void Usage(void)
{
	printf("This program loads a cgroup skb BPF program to enforce\n"
	       "cgroup output (egress) bandwidth limits.\n\n"
	       "USAGE: hbm [-o] [-d] [--edt] [-l] [-n <id>] [--no_cn] [-r <rate>]\n"
	       "           [-s] [-t <secs>] [-w] [-h] [prog]\n"
	       "  Where:\n"
	       "    -o         indicates egress direction (default)\n"
	       "    -d         print BPF trace debug buffer\n"
	       "    --edt      use fq's Earliest Departure Time\n"
	       "    -l         also limit flows using loopback\n"
	       "    -n <#>     to create cgroup \"/hbm#\" and attach prog\n"
	       "               Default is /hbm1\n"
	       "    --no_cn    disable CN notifications\n"
	       "    -r <rate>  Rate in Mbps\n"
	       "    -s         Update HBM stats\n"
	       "    -t <time>  Exit after specified seconds (default is 1)\n"
	       "    -w         Work conserving flag. cgroup can increase\n"
	       "               bandwidth beyond the rate limit specified\n"
	       "               while there is available bandwidth. Current\n"
	       "               implementation assumes there is only eth0\n"
	       "               but can be extended to support multiple NICs\n"
	       "    -h         print this info\n"
	       "    prog       BPF program file name. Name defaults to\n"
	       "                 hbm_out_kern.o (hbm_edt_kern.o with --edt)\n");
}
|  |  | 
|  | int main(int argc, char **argv) | 
|  | { | 
|  | char *prog = "hbm_out_kern.o"; | 
|  | int  k; | 
|  | int cg_id = 1; | 
|  | char *optstring = "iodln:r:st:wh"; | 
|  | struct option loptions[] = { | 
|  | {"no_cn", 0, NULL, 1}, | 
|  | {"edt", 0, NULL, 2}, | 
|  | {NULL, 0, NULL, 0} | 
|  | }; | 
|  |  | 
|  | while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) { | 
|  | switch (k) { | 
|  | case 1: | 
|  | no_cn_flag = true; | 
|  | break; | 
|  | case 2: | 
|  | prog = "hbm_edt_kern.o"; | 
|  | edt_flag = true; | 
|  | break; | 
|  | case'o': | 
|  | break; | 
|  | case 'd': | 
|  | debugFlag = true; | 
|  | break; | 
|  | case 'l': | 
|  | loopback_flag = true; | 
|  | break; | 
|  | case 'n': | 
|  | cg_id = atoi(optarg); | 
|  | break; | 
|  | case 'r': | 
|  | minRate = atoi(optarg) * 1.024; | 
|  | rate = minRate; | 
|  | break; | 
|  | case 's': | 
|  | stats_flag = true; | 
|  | break; | 
|  | case 't': | 
|  | dur = atoi(optarg); | 
|  | break; | 
|  | case 'w': | 
|  | work_conserving_flag = true; | 
|  | break; | 
|  | case '?': | 
|  | if (optopt == 'n' || optopt == 'r' || optopt == 't') | 
|  | fprintf(stderr, | 
|  | "Option -%c requires an argument.\n\n", | 
|  | optopt); | 
|  | case 'h': | 
|  | __fallthrough; | 
|  | default: | 
|  | Usage(); | 
|  | return 0; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (optind < argc) | 
|  | prog = argv[optind]; | 
|  | printf("HBM prog: %s\n", prog != NULL ? prog : "NULL"); | 
|  |  | 
|  | return run_bpf_prog(prog, cg_id); | 
|  | } |