From 2c25fc9a503adef4279951382fc9d47b59977f59 Mon Sep 17 00:00:00 2001
From: Leo Yan
Date: Fri, 27 Apr 2018 18:02:54 +0800
Subject: [PATCH 01/66] bpf, doc: Update bpf_jit_enable limitation for
 CONFIG_BPF_JIT_ALWAYS_ON

When CONFIG_BPF_JIT_ALWAYS_ON is enabled, the kernel restricts
bpf_jit_enable to the fixed value 1, so it cannot be set to 2 for JIT
opcode dumping; update the documentation to reflect this.

Suggested-by: Daniel Borkmann
Signed-off-by: Leo Yan
Signed-off-by: Daniel Borkmann
---
 Documentation/networking/filter.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index fd55c7de9991..5032e1263bc9 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -483,6 +483,12 @@ Example output from dmesg:
 [ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00
 [ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3

+When CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1 and
+setting any other value than that will result in failure. This is even the case for
+setting bpf_jit_enable to 2, since dumping the final JIT image into the kernel log
+is discouraged and introspection through bpftool (under tools/bpf/bpftool/) is the
+generally recommended approach instead.
+
 In the kernel source tree under tools/bpf/, there's bpf_jit_disasm for
 generating disassembly out of the kernel log's hexdump:

From 5f4126327494c189767ca64b222abadb07c55e3d Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Sat, 28 Apr 2018 22:28:07 -0700
Subject: [PATCH 02/66] bpf: change prototype for stack_map_get_build_id_offset

This patch does not change any functionality. The function prototype is
changed so that the same function can be reused later.

Signed-off-by: Yonghong Song
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/stackmap.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 57eeb1234b67..04f6ec1679f0 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -262,16 +262,11 @@ out:
 	return ret;
 }

-static void stack_map_get_build_id_offset(struct bpf_map *map,
-					  struct stack_map_bucket *bucket,
+static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 					  u64 *ips, u32 trace_nr, bool user)
 {
 	int i;
 	struct vm_area_struct *vma;
-	struct bpf_stack_build_id *id_offs;
-
-	bucket->nr = trace_nr;
-	id_offs = (struct bpf_stack_build_id *)bucket->data;

 	/*
 	 * We cannot do up_read() in nmi context, so build_id lookup is
@@ -361,8 +356,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 			pcpu_freelist_pop(&smap->freelist);
 		if (unlikely(!new_bucket))
 			return -ENOMEM;
-		stack_map_get_build_id_offset(map, new_bucket, ips,
-					      trace_nr, user);
+		new_bucket->nr = trace_nr;
+		stack_map_get_build_id_offset(
+			(struct bpf_stack_build_id *)new_bucket->data,
+			ips, trace_nr, user);
 		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
 		if (hash_matches && bucket->nr == trace_nr &&
 		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
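The reshaping above is put to use by the next patch in the series: the
new bpf_get_stack helper calls the function on its caller-supplied
output buffer rather than on a stackmap bucket. The relevant call site,
excerpted from patch 03's kernel/bpf/stackmap.c hunk:

	/* Excerpt from patch 03: with the map/bucket arguments gone,
	 * the same translation routine can fill any flat destination,
	 * here the helper's output buffer 'buf'.
	 */
	if (user && user_build_id)
		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
	else
		memcpy(buf, ips, copy_len);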
From c195651e565ae7f41a68acb7d4aa7390ad215de1 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Sat, 28 Apr 2018 22:28:08 -0700
Subject: [PATCH 03/66] bpf: add bpf_get_stack helper

Currently, stackmap and the bpf_get_stackid helper are provided for bpf
programs to get a stack trace. This approach has a limitation, though:
if two stack traces have the same hash, only one will get stored in the
stackmap table, so some stack traces are missing from the user's
perspective.

This patch implements a new helper, bpf_get_stack, which sends stack
traces directly to the bpf program. The bpf program is then able to see
all stack traces, and can do in-kernel processing or send the stack
traces to user space through a shared map or bpf_perf_event_output.

Acked-by: Alexei Starovoitov
Signed-off-by: Yonghong Song
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h      |  1 +
 include/linux/filter.h   |  3 +-
 include/uapi/linux/bpf.h | 42 +++++++++++++++++++++++--
 kernel/bpf/core.c        |  5 +++
 kernel/bpf/stackmap.c    | 67 ++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c    | 19 ++++++++++++
 kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++-
 7 files changed, 183 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 38ebbc61ed99..c553f6f9c6b0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -692,6 +692,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
+extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;

 /* Shared helpers among cBPF and eBPF. */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4da8b2308174..64899c04c1a6 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -468,7 +468,8 @@ struct bpf_prog {
 				dst_needed:1,	/* Do we need dst entry? */
 				blinded:1,	/* Was blinded */
 				is_func:1,	/* program is a bpf function */
-				kprobe_override:1; /* Do we override a kprobe? */
+				kprobe_override:1, /* Do we override a kprobe? */
+				has_callchain_buf:1; /* callchain buffer allocated? */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da77a9388947..1afb606a18b9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1767,6 +1767,40 @@ union bpf_attr {
 *		**CONFIG_XFRM** configuration option.
 *	Return
 *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
+ *	Description
+ *		Return a user or a kernel stack in bpf program provided buffer.
+ *		To achieve this, the helper needs *ctx*, which is a pointer
+ *		to the context on which the tracing program is executed.
+ *		To store the stacktrace, the bpf program provides *buf* with
+ *		a nonnegative *size*.
+ *
+ *		The last argument, *flags*, holds the number of stack frames to
+ *		skip (from 0 to 255), masked with
+ *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *		the following flags:
+ *
+ *		**BPF_F_USER_STACK**
+ *			Collect a user space stack instead of a kernel stack.
+ *		**BPF_F_USER_BUILD_ID**
+ *			Collect buildid+offset instead of ips for user stack,
+ *			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ *		**bpf_get_stack**\ () can collect up to
+ *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ *		to sufficient large buffer size. Note that
+ *		this limit can be controlled with the **sysctl** program, and
+ *		that it should be manually increased in order to profile long
+ *		user stacks (such as stacks for Java programs).
To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Return + * a non-negative value equal to or less than size on success, or + * a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1835,7 +1869,8 @@ union bpf_attr { FN(msg_pull_data), \ FN(bind), \ FN(xdp_adjust_tail), \ - FN(skb_get_xfrm_state), + FN(skb_get_xfrm_state), \ + FN(get_stack), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -1869,11 +1904,14 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ #define BPF_F_TUNINFO_IPV6 (1ULL << 0) -/* BPF_FUNC_get_stackid flags. */ +/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ #define BPF_F_SKIP_FIELD_MASK 0xffULL #define BPF_F_USER_STACK (1ULL << 8) +/* flags used by BPF_FUNC_get_stackid only. */ #define BPF_F_FAST_STACK_CMP (1ULL << 9) #define BPF_F_REUSE_STACKID (1ULL << 10) +/* flags used by BPF_FUNC_get_stack only. */ +#define BPF_F_USER_BUILD_ID (1ULL << 11) /* BPF_FUNC_skb_set_tunnel_key flags. */ #define BPF_F_ZERO_CSUM_TX (1ULL << 1) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ba03ec39efb3..9349a5db3cf2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -1722,6 +1723,10 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); +#ifdef CONFIG_PERF_EVENTS + if (aux->prog->has_callchain_buf) + put_callchain_buffers(); +#endif for (i = 0; i < aux->func_cnt; i++) bpf_jit_free(aux->func[i]); if (aux->func_cnt) { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 04f6ec1679f0..3ba102b41512 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -402,6 +402,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + u32 init_nr, trace_nr, copy_len, elem_size, num_elem; + bool user_build_id = flags & BPF_F_USER_BUILD_ID; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + bool user = flags & BPF_F_USER_STACK; + struct perf_callchain_entry *trace; + bool kernel = !user; + int err = -EINVAL; + u64 *ips; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_USER_BUILD_ID))) + goto clear; + if (kernel && user_build_id) + goto clear; + + elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id) + : sizeof(u64); + if (unlikely(size % elem_size)) + goto clear; + + num_elem = size / elem_size; + if (sysctl_perf_event_max_stack < num_elem) + init_nr = 0; + else + init_nr = sysctl_perf_event_max_stack - num_elem; + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); + if (unlikely(!trace)) + goto err_fault; + + trace_nr = trace->nr - init_nr; + if (trace_nr < skip) + goto err_fault; + + trace_nr -= skip; + trace_nr = (trace_nr <= num_elem) ? 
trace_nr : num_elem; + copy_len = trace_nr * elem_size; + ips = trace->ip + skip + init_nr; + if (user && user_build_id) + stack_map_get_build_id_offset(buf, ips, trace_nr, user); + else + memcpy(buf, ips, copy_len); + + if (size > copy_len) + memset(buf + copy_len, 0, size - copy_len); + return copy_len; + +err_fault: + err = -EFAULT; +clear: + memset(buf, 0, size); + return err; +} + +const struct bpf_func_proto bpf_get_stack_proto = { + .func = bpf_get_stack, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb1a596aebd3..253f6bdb9117 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "disasm.h" @@ -2450,6 +2451,24 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; + if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) { + const char *err_str; + +#ifdef CONFIG_PERF_EVENTS + err = get_callchain_buffers(sysctl_perf_event_max_stack); + err_str = "cannot get callchain buffer for func %s#%d\n"; +#else + err = -ENOTSUPP; + err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n"; +#endif + if (err) { + verbose(env, err_str, func_id_name(func_id), func_id); + return err; + } + + env->prog->has_callchain_buf = true; + } + if (changes_data) clear_all_pkt_pointers(env); return 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 56ba0f2a01db..46d866e9937c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -20,6 +20,7 @@ #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); /** * trace_call_bpf - invoke BPF program @@ -577,6 +578,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; #ifdef CONFIG_BPF_KPROBE_OVERRIDE @@ -664,6 +667,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size, + u64, flags) +{ + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + + return bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); +} + +static const struct bpf_func_proto bpf_get_stack_proto_tp = { + .func = bpf_get_stack_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -672,6 +694,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_tp; default: return tracing_func_proto(func_id, prog); } @@ -734,6 +758,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return 
&bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_tp; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; default: @@ -744,7 +770,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) /* * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp * to avoid potential recursive reuse issue when/if tracepoints are added - * inside bpf_*_event_output and/or bpf_get_stack_id + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack */ static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, @@ -787,6 +813,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, + void *, buf, u32, size, u64, flags) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + return bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); +} + +static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { + .func = bpf_get_stack_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -795,6 +841,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto_raw_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_raw_tp; default: return tracing_func_proto(func_id, prog); } From 849fa50662fbc8b476d652f8a4e6bdda17b37859 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:09 -0700 Subject: [PATCH 04/66] bpf/verifier: refine retval R0 state for bpf_get_stack helper The special property of return values for helpers bpf_get_stack and bpf_probe_read_str are captured in verifier. Both helpers return a negative error code or a length, which is equal to or smaller than the buffer size argument. This additional information in the verifier can avoid the condition such as "retval > bufsize" in the bpf program. 
For example, for the code below,

    usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
    if (usize < 0 || usize > max_len)
        return 0;

the verifier may report the following error:

  52: (85) call bpf_get_stack#65
   R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
   R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
   R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
   R9_w=inv800 R10=fp0,call_-1
  53: (bf) r8 = r0
  54: (bf) r1 = r8
  55: (67) r1 <<= 32
  56: (bf) r2 = r1
  57: (77) r2 >>= 32
  58: (25) if r2 > 0x31f goto pc+33
   R0=inv(id=0) R1=inv(id=0,smax_value=9223372032559808512,
   umax_value=18446744069414584320, var_off=(0x0; 0xffffffff00000000))
   R2=inv(id=0,umax_value=799,var_off=(0x0; 0x3ff))
   R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
   R8=inv(id=0) R9=inv800 R10=fp0,call_-1
  59: (1f) r9 -= r8
  60: (c7) r1 s>>= 32
  61: (bf) r2 = r7
  62: (0f) r2 += r1
  math between map_value pointer and register with unbounded min value is not allowed

The failure is due to an llvm compiler optimization where register "r2",
which is a copy of "r1", is tested for the condition while "r1" is later
used for the map_ptr operation. The verifier is not able to track such
an instruction sequence effectively.

Without the "usize > max_len" condition, there is no llvm optimization
and the generated code below passes the verifier:

  52: (85) call bpf_get_stack#65
   R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
   R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
   R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
   R9_w=inv800 R10=fp0,call_-1
  53: (b7) r1 = 0
  54: (bf) r8 = r0
  55: (67) r8 <<= 32
  56: (c7) r8 s>>= 32
  57: (6d) if r1 s> r8 goto pc+24
   R0=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff)) R1=inv0
   R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
   R8=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff)) R9=inv800
   R10=fp0,call_-1
  58: (bf) r2 = r7
  59: (0f) r2 += r8
  60: (1f) r9 -= r8
  61: (bf) r1 = r6

Acked-by: Alexei Starovoitov
Signed-off-by: Yonghong Song
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 253f6bdb9117..988400e283b7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -165,6 +165,8 @@ struct bpf_call_arg_meta {
 	bool pkt_access;
 	int regno;
 	int access_size;
+	s64 msize_smax_value;
+	u64 msize_umax_value;
 };

 static DEFINE_MUTEX(bpf_verifier_lock);
@@ -1985,6 +1987,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);

+		/* remember the mem_size which may be used later
+		 * to refine return values.
+		 */
+		meta->msize_smax_value = reg->smax_value;
+		meta->msize_umax_value = reg->umax_value;
+
 		/* The register is SCALAR_VALUE; the access check
 		 * happens using its boundaries.
*/ @@ -2324,6 +2332,23 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return 0; } +static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, + int func_id, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *ret_reg = ®s[BPF_REG_0]; + + if (ret_type != RET_INTEGER || + (func_id != BPF_FUNC_get_stack && + func_id != BPF_FUNC_probe_read_str)) + return; + + ret_reg->smax_value = meta->msize_smax_value; + ret_reg->umax_value = meta->msize_umax_value; + __reg_deduce_bounds(ret_reg); + __reg_bound_offset(ret_reg); +} + static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; @@ -2447,6 +2472,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + do_refine_retval_range(regs, fn->ret_type, func_id, &meta); + err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) return err; From afbe1a5b79cd18f0ace9107e6b4cd7aa193e5e52 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:10 -0700 Subject: [PATCH 05/66] bpf: remove never-hit branches in verifier adjust_scalar_min_max_vals In verifier function adjust_scalar_min_max_vals, when src_known is false and the opcode is BPF_LSH/BPF_RSH, early return will happen in the function. So remove the branch in handling BPF_LSH/BPF_RSH when src_known is false. Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 988400e283b7..6e3f859b3abf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2940,10 +2940,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, dst_reg->umin_value <<= umin_val; dst_reg->umax_value <<= umax_val; } - if (src_known) - dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); - else - dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val); + dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); break; @@ -2971,11 +2968,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, */ dst_reg->smin_value = S64_MIN; dst_reg->smax_value = S64_MAX; - if (src_known) - dst_reg->var_off = tnum_rshift(dst_reg->var_off, - umin_val); - else - dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val); + dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); dst_reg->umin_value >>= umax_val; dst_reg->umax_value >>= umin_val; /* We may learn something more from the var_off */ From 9cbe1f5a32dcd6d0508326f7d9098e5bc380a4fe Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:11 -0700 Subject: [PATCH 06/66] bpf/verifier: improve register value range tracking with ARSH When helpers like bpf_get_stack returns an int value and later on used for arithmetic computation, the LSH and ARSH operations are often required to get proper sign extension into 64-bit. 
For example, without this patch: 54: R0=inv(id=0,umax_value=800) 54: (bf) r8 = r0 55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800) 55: (67) r8 <<= 32 56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000)) 56: (c7) r8 s>>= 32 57: R8=inv(id=0) With this patch: 54: R0=inv(id=0,umax_value=800) 54: (bf) r8 = r0 55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800) 55: (67) r8 <<= 32 56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000)) 56: (c7) r8 s>>= 32 57: R8=inv(id=0, umax_value=800,var_off=(0x0; 0x3ff)) With better range of "R8", later on when "R8" is added to other register, e.g., a map pointer or scalar-value register, the better register range can be derived and verifier failure may be avoided. In our later example, ...... usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK); if (usize < 0) return 0; ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0); ...... Without improving ARSH value range tracking, the register representing "max_len - usize" will have smin_value equal to S64_MIN and will be rejected by verifier. Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 4 +++- kernel/bpf/tnum.c | 10 ++++++++++ kernel/bpf/verifier.c | 23 +++++++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 0d2d3da46139..c7dc2b5902c0 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -23,8 +23,10 @@ struct tnum tnum_range(u64 min, u64 max); /* Arithmetic and logical ops */ /* Shift a tnum left (by a fixed shift) */ struct tnum tnum_lshift(struct tnum a, u8 shift); -/* Shift a tnum right (by a fixed shift) */ +/* Shift (rsh) a tnum right (by a fixed shift) */ struct tnum tnum_rshift(struct tnum a, u8 shift); +/* Shift (arsh) a tnum right (by a fixed min_shift) */ +struct tnum tnum_arshift(struct tnum a, u8 min_shift); /* Add two tnums, return @a + @b */ struct tnum tnum_add(struct tnum a, struct tnum b); /* Subtract two tnums, return @a - @b */ diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 1f4bf68c12db..938d41211be7 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -43,6 +43,16 @@ struct tnum tnum_rshift(struct tnum a, u8 shift) return TNUM(a.value >> shift, a.mask >> shift); } +struct tnum tnum_arshift(struct tnum a, u8 min_shift) +{ + /* if a.value is negative, arithmetic shifting by minimum shift + * will have larger negative offset compared to more shifting. + * If a.value is nonnegative, arithmetic shifting by minimum shift + * will have larger positive offset compare to more shifting. + */ + return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift); +} + struct tnum tnum_add(struct tnum a, struct tnum b) { u64 sm, sv, sigma, chi, mu; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6e3f859b3abf..712d8655e916 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2974,6 +2974,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); break; + case BPF_ARSH: + if (umax_val >= insn_bitness) { + /* Shifts greater than 31 or 63 are undefined. + * This includes shifts by a negative number. + */ + mark_reg_unknown(env, regs, insn->dst_reg); + break; + } + + /* Upon reaching here, src_known is true and + * umax_val is equal to umin_val. 
+ */ + dst_reg->smin_value >>= umin_val; + dst_reg->smax_value >>= umin_val; + dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val); + + /* blow away the dst_reg umin_value/umax_value and rely on + * dst_reg var_off to refine the result. + */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + __update_reg_bounds(dst_reg); + break; default: mark_reg_unknown(env, regs, insn->dst_reg); break; From de2ff05f48afcde816ff4edb217417f62f624ab5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:12 -0700 Subject: [PATCH 07/66] tools/bpf: add bpf_get_stack helper to tools headers The tools header file bpf.h is synced with kernel uapi bpf.h. The new helper is also added to bpf_helpers.h. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 42 +++++++++++++++++++++-- tools/testing/selftests/bpf/bpf_helpers.h | 2 ++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index da77a9388947..1afb606a18b9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1767,6 +1767,40 @@ union bpf_attr { * **CONFIG_XFRM** configuration option. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Return + * a non-negative value equal to or less than size on success, or + * a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1835,7 +1869,8 @@ union bpf_attr { FN(msg_pull_data), \ FN(bind), \ FN(xdp_adjust_tail), \ - FN(skb_get_xfrm_state), + FN(skb_get_xfrm_state), \ + FN(get_stack), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -1869,11 +1904,14 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ #define BPF_F_TUNINFO_IPV6 (1ULL << 0) -/* BPF_FUNC_get_stackid flags. */ +/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ #define BPF_F_SKIP_FIELD_MASK 0xffULL #define BPF_F_USER_STACK (1ULL << 8) +/* flags used by BPF_FUNC_get_stackid only. */ #define BPF_F_FAST_STACK_CMP (1ULL << 9) #define BPF_F_REUSE_STACKID (1ULL << 10) +/* flags used by BPF_FUNC_get_stack only. 
*/ +#define BPF_F_USER_BUILD_ID (1ULL << 11) /* BPF_FUNC_skb_set_tunnel_key flags. */ #define BPF_F_ZERO_CSUM_TX (1ULL << 1) diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 69d7b918e66a..265f8e0e8ada 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -101,6 +101,8 @@ static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) = static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state, int size, int flags) = (void *) BPF_FUNC_skb_get_xfrm_state; +static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) = + (void *) BPF_FUNC_get_stack; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions From 28dbf861deacb0321604bf1c5e1ccc34dd215669 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:13 -0700 Subject: [PATCH 08/66] samples/bpf: move common-purpose trace functions to selftests There is no functionality change in this patch. The common-purpose trace functions, including perf_event polling and ksym lookup, are moved from trace_output_user.c and bpf_load.c to selftests/bpf/trace_helpers.c so that these function can be reused later in selftests. Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- samples/bpf/Makefile | 11 +- samples/bpf/bpf_load.c | 63 ------- samples/bpf/bpf_load.h | 7 - samples/bpf/offwaketime_user.c | 1 + samples/bpf/sampleip_user.c | 1 + samples/bpf/spintest_user.c | 1 + samples/bpf/trace_event_user.c | 1 + samples/bpf/trace_output_user.c | 110 ++---------- tools/testing/selftests/bpf/trace_helpers.c | 180 ++++++++++++++++++++ tools/testing/selftests/bpf/trace_helpers.h | 23 +++ 10 files changed, 223 insertions(+), 175 deletions(-) create mode 100644 tools/testing/selftests/bpf/trace_helpers.c create mode 100644 tools/testing/selftests/bpf/trace_helpers.h diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index b853581592fd..5e31770ac087 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -49,6 +49,7 @@ hostprogs-y += xdp_adjust_tail # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o +TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o test_lru_dist-objs := test_lru_dist.o $(LIBBPF) sock_example-objs := sock_example.o $(LIBBPF) @@ -65,10 +66,10 @@ tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o -trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o +trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o $(TRACE_HELPERS) lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o -offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o -spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o +offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o $(TRACE_HELPERS) +spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o $(TRACE_HELPERS) map_perf_test-objs := bpf_load.o $(LIBBPF) map_perf_test_user.o test_overhead-objs := bpf_load.o $(LIBBPF) test_overhead_user.o test_cgrp2_array_pin-objs := $(LIBBPF) test_cgrp2_array_pin.o @@ -82,8 +83,8 @@ xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o test_current_task_under_cgroup-objs := 
bpf_load.o $(LIBBPF) $(CGROUP_HELPERS) \ test_current_task_under_cgroup_user.o -trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o -sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o +trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o $(TRACE_HELPERS) +sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o $(TRACE_HELPERS) tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index feca497d6afd..a27ef3c42e4e 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -648,66 +648,3 @@ void read_trace_pipe(void) } } } - -#define MAX_SYMS 300000 -static struct ksym syms[MAX_SYMS]; -static int sym_cnt; - -static int ksym_cmp(const void *p1, const void *p2) -{ - return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; -} - -int load_kallsyms(void) -{ - FILE *f = fopen("/proc/kallsyms", "r"); - char func[256], buf[256]; - char symbol; - void *addr; - int i = 0; - - if (!f) - return -ENOENT; - - while (!feof(f)) { - if (!fgets(buf, sizeof(buf), f)) - break; - if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3) - break; - if (!addr) - continue; - syms[i].addr = (long) addr; - syms[i].name = strdup(func); - i++; - } - sym_cnt = i; - qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp); - return 0; -} - -struct ksym *ksym_search(long key) -{ - int start = 0, end = sym_cnt; - int result; - - while (start < end) { - size_t mid = start + (end - start) / 2; - - result = key - syms[mid].addr; - if (result < 0) - end = mid; - else if (result > 0) - start = mid + 1; - else - return &syms[mid]; - } - - if (start >= 1 && syms[start - 1].addr < key && - key < syms[start].addr) - /* valid ksym */ - return &syms[start - 1]; - - /* out of range. 
return _stext */ - return &syms[0]; -} - diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h index 453c200b389b..2c3d0b448632 100644 --- a/samples/bpf/bpf_load.h +++ b/samples/bpf/bpf_load.h @@ -54,12 +54,5 @@ int load_bpf_file(char *path); int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map); void read_trace_pipe(void); -struct ksym { - long addr; - char *name; -}; - -int load_kallsyms(void); -struct ksym *ksym_search(long key); int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); #endif diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c index 512f87a5fd20..f06063af9fcb 100644 --- a/samples/bpf/offwaketime_user.c +++ b/samples/bpf/offwaketime_user.c @@ -17,6 +17,7 @@ #include #include "libbpf.h" #include "bpf_load.h" +#include "trace_helpers.h" #define PRINT_RAW_ADDR 0 diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c index 4ed690b907ff..60c2b73d1b4d 100644 --- a/samples/bpf/sampleip_user.c +++ b/samples/bpf/sampleip_user.c @@ -22,6 +22,7 @@ #include "libbpf.h" #include "bpf_load.h" #include "perf-sys.h" +#include "trace_helpers.h" #define DEFAULT_FREQ 99 #define DEFAULT_SECS 5 diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c index 3d736219a31c..8d3e9cfa1909 100644 --- a/samples/bpf/spintest_user.c +++ b/samples/bpf/spintest_user.c @@ -7,6 +7,7 @@ #include #include "libbpf.h" #include "bpf_load.h" +#include "trace_helpers.h" int main(int ac, char **argv) { diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c index 56f7a259a7c9..1fa1becfa641 100644 --- a/samples/bpf/trace_event_user.c +++ b/samples/bpf/trace_event_user.c @@ -21,6 +21,7 @@ #include "libbpf.h" #include "bpf_load.h" #include "perf-sys.h" +#include "trace_helpers.h" #define SAMPLE_FREQ 50 diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c index ccca1e348017..5e78c2ecd08d 100644 --- a/samples/bpf/trace_output_user.c +++ b/samples/bpf/trace_output_user.c @@ -21,100 +21,10 @@ #include "libbpf.h" #include "bpf_load.h" #include "perf-sys.h" +#include "trace_helpers.h" static int pmu_fd; -int page_size; -int page_cnt = 8; -volatile struct perf_event_mmap_page *header; - -typedef void (*print_fn)(void *data, int size); - -static int perf_event_mmap(int fd) -{ - void *base; - int mmap_size; - - page_size = getpagesize(); - mmap_size = page_size * (page_cnt + 1); - - base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (base == MAP_FAILED) { - printf("mmap err\n"); - return -1; - } - - header = base; - return 0; -} - -static int perf_event_poll(int fd) -{ - struct pollfd pfd = { .fd = fd, .events = POLLIN }; - - return poll(&pfd, 1, 1000); -} - -struct perf_event_sample { - struct perf_event_header header; - __u32 size; - char data[]; -}; - -static void perf_event_read(print_fn fn) -{ - __u64 data_tail = header->data_tail; - __u64 data_head = header->data_head; - __u64 buffer_size = page_cnt * page_size; - void *base, *begin, *end; - char buf[256]; - - asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */ - if (data_head == data_tail) - return; - - base = ((char *)header) + page_size; - - begin = base + data_tail % buffer_size; - end = base + data_head % buffer_size; - - while (begin != end) { - struct perf_event_sample *e; - - e = begin; - if (begin + e->header.size > base + buffer_size) { - long len = base + buffer_size - begin; - - assert(len < e->header.size); - memcpy(buf, begin, len); - memcpy(buf + len, base, e->header.size - len); - e 
= (void *) buf; - begin = base + e->header.size - len; - } else if (begin + e->header.size == base + buffer_size) { - begin = base; - } else { - begin += e->header.size; - } - - if (e->header.type == PERF_RECORD_SAMPLE) { - fn(e->data, e->size); - } else if (e->header.type == PERF_RECORD_LOST) { - struct { - struct perf_event_header header; - __u64 id; - __u64 lost; - } *lost = (void *) e; - printf("lost %lld events\n", lost->lost); - } else { - printf("unknown event type=%d size=%d\n", - e->header.type, e->header.size); - } - } - - __sync_synchronize(); /* smp_mb() */ - header->data_tail = data_head; -} - static __u64 time_get_ns(void) { struct timespec ts; @@ -127,7 +37,7 @@ static __u64 start_time; #define MAX_CNT 100000ll -static void print_bpf_output(void *data, int size) +static int print_bpf_output(void *data, int size) { static __u64 cnt; struct { @@ -138,7 +48,7 @@ static void print_bpf_output(void *data, int size) if (e->cookie != 0x12345678) { printf("BUG pid %llx cookie %llx sized %d\n", e->pid, e->cookie, size); - kill(0, SIGINT); + return PERF_EVENT_ERROR; } cnt++; @@ -146,8 +56,10 @@ static void print_bpf_output(void *data, int size) if (cnt == MAX_CNT) { printf("recv %lld events per sec\n", MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); - kill(0, SIGINT); + return PERF_EVENT_DONE; } + + return PERF_EVENT_CONT; } static void test_bpf_perf_event(void) @@ -170,6 +82,7 @@ int main(int argc, char **argv) { char filename[256]; FILE *f; + int ret; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); @@ -187,10 +100,7 @@ int main(int argc, char **argv) (void) f; start_time = time_get_ns(); - for (;;) { - perf_event_poll(pmu_fd); - perf_event_read(print_bpf_output); - } - - return 0; + ret = perf_event_poller(pmu_fd, print_bpf_output); + kill(0, SIGINT); + return ret; } diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c new file mode 100644 index 000000000000..ad025bd75f1c --- /dev/null +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace_helpers.h" + +#define MAX_SYMS 300000 +static struct ksym syms[MAX_SYMS]; +static int sym_cnt; + +static int ksym_cmp(const void *p1, const void *p2) +{ + return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; +} + +int load_kallsyms(void) +{ + FILE *f = fopen("/proc/kallsyms", "r"); + char func[256], buf[256]; + char symbol; + void *addr; + int i = 0; + + if (!f) + return -ENOENT; + + while (!feof(f)) { + if (!fgets(buf, sizeof(buf), f)) + break; + if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3) + break; + if (!addr) + continue; + syms[i].addr = (long) addr; + syms[i].name = strdup(func); + i++; + } + sym_cnt = i; + qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp); + return 0; +} + +struct ksym *ksym_search(long key) +{ + int start = 0, end = sym_cnt; + int result; + + while (start < end) { + size_t mid = start + (end - start) / 2; + + result = key - syms[mid].addr; + if (result < 0) + end = mid; + else if (result > 0) + start = mid + 1; + else + return &syms[mid]; + } + + if (start >= 1 && syms[start - 1].addr < key && + key < syms[start].addr) + /* valid ksym */ + return &syms[start - 1]; + + /* out of range. 
return _stext */ + return &syms[0]; +} + +static int page_size; +static int page_cnt = 8; +static volatile struct perf_event_mmap_page *header; + +int perf_event_mmap(int fd) +{ + void *base; + int mmap_size; + + page_size = getpagesize(); + mmap_size = page_size * (page_cnt + 1); + + base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (base == MAP_FAILED) { + printf("mmap err\n"); + return -1; + } + + header = base; + return 0; +} + +static int perf_event_poll(int fd) +{ + struct pollfd pfd = { .fd = fd, .events = POLLIN }; + + return poll(&pfd, 1, 1000); +} + +struct perf_event_sample { + struct perf_event_header header; + __u32 size; + char data[]; +}; + +static int perf_event_read(perf_event_print_fn fn) +{ + __u64 data_tail = header->data_tail; + __u64 data_head = header->data_head; + __u64 buffer_size = page_cnt * page_size; + void *base, *begin, *end; + char buf[256]; + int ret; + + asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */ + if (data_head == data_tail) + return PERF_EVENT_CONT; + + base = ((char *)header) + page_size; + + begin = base + data_tail % buffer_size; + end = base + data_head % buffer_size; + + while (begin != end) { + struct perf_event_sample *e; + + e = begin; + if (begin + e->header.size > base + buffer_size) { + long len = base + buffer_size - begin; + + assert(len < e->header.size); + memcpy(buf, begin, len); + memcpy(buf + len, base, e->header.size - len); + e = (void *) buf; + begin = base + e->header.size - len; + } else if (begin + e->header.size == base + buffer_size) { + begin = base; + } else { + begin += e->header.size; + } + + if (e->header.type == PERF_RECORD_SAMPLE) { + ret = fn(e->data, e->size); + if (ret != PERF_EVENT_CONT) + return ret; + } else if (e->header.type == PERF_RECORD_LOST) { + struct { + struct perf_event_header header; + __u64 id; + __u64 lost; + } *lost = (void *) e; + printf("lost %lld events\n", lost->lost); + } else { + printf("unknown event type=%d size=%d\n", + e->header.type, e->header.size); + } + } + + __sync_synchronize(); /* smp_mb() */ + header->data_tail = data_head; + return PERF_EVENT_CONT; +} + +int perf_event_poller(int fd, perf_event_print_fn output_fn) +{ + int ret; + + for (;;) { + perf_event_poll(fd); + ret = perf_event_read(output_fn); + if (ret != PERF_EVENT_CONT) + return ret; + } + + return PERF_EVENT_DONE; +} diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h new file mode 100644 index 000000000000..fe3eefd21e86 --- /dev/null +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TRACE_HELPER_H +#define __TRACE_HELPER_H + +struct ksym { + long addr; + char *name; +}; + +int load_kallsyms(void); +struct ksym *ksym_search(long key); + +typedef int (*perf_event_print_fn)(void *data, int size); + +/* return code for perf_event_print_fn */ +#define PERF_EVENT_DONE 0 +#define PERF_EVENT_ERROR -1 +#define PERF_EVENT_CONT -2 + +int perf_event_mmap(int fd); +/* return PERF_EVENT_DONE or PERF_EVENT_ERROR */ +int perf_event_poller(int fd, perf_event_print_fn output_fn); +#endif From 2abe611c5f031f06ab783102e62c0ae9dfd93e80 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:14 -0700 Subject: [PATCH 09/66] tools/bpf: add a verifier test case for bpf_get_stack helper and ARSH The test_verifier already has a few ARSH test cases. 
This patch adds a new test case which takes advantage of newly improved verifier behavior for bpf_get_stack and ARSH. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 45 +++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 165e9ddfa446..1acafe26498b 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -11680,6 +11680,51 @@ static struct bpf_test tests[] = { .errstr = "BPF_XADD stores into R2 packet", .prog_type = BPF_PROG_TYPE_XDP, }, + { + "bpf_get_stack return R0 within range", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 28), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_9, sizeof(struct test_val)), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), + BPF_MOV64_IMM(BPF_REG_3, sizeof(struct test_val)), + BPF_MOV64_IMM(BPF_REG_4, 256), + BPF_EMIT_CALL(BPF_FUNC_get_stack), + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_8, 32), + BPF_JMP_REG(BPF_JSLT, BPF_REG_1, BPF_REG_8, 16), + BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), + BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_9), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 32), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_IMM(BPF_REG_5, sizeof(struct test_val)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_5), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_9), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_EMIT_CALL(BPF_FUNC_get_stack), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 4 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) From 173965fbfba596c02fa128966c2a33cb88afcd7f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:15 -0700 Subject: [PATCH 10/66] tools/bpf: add a test for bpf_get_stack with raw tracepoint prog The test attached a raw_tracepoint program to raw_syscalls/sys_enter. It tested to get stack for user space, kernel space and user space with build_id request. It also tested to get user and kernel stack into the same buffer with back-to-back bpf_get_stack helper calls. If jit is not enabled, the user space application will check to ensure that the kernel function for raw_tracepoint ___bpf_prog_run is part of the stack. If jit is enabled, we did not have a reliable way to verify the kernel stack, so just assume the kernel stack is good when the kernel stack size is greater than 0. 
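The buffer handling at the heart of the test is the back-to-back capture
pattern below, excerpted from the test_get_stack_rawtp.c program added
by this patch; the second bpf_get_stack call sizes its destination by
whatever the first call consumed:

	/* Excerpt from test_get_stack_rawtp.c (added below): the user
	 * stack and the kernel stack share one buffer; the helper's
	 * return value (bytes written) sizes the second call.
	 */
	usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
	if (usize < 0)
		return 0;

	ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
	if (ksize < 0)
		return 0;

	total_size = usize + ksize;
	if (total_size > 0 && total_size <= max_len)
		bpf_perf_event_output(ctx, &perfmap, 0, raw_data, total_size);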
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 4 +- .../selftests/bpf/test_get_stack_rawtp.c | 102 +++++++++++ tools/testing/selftests/bpf/test_progs.c | 172 ++++++++++++++++-- 3 files changed, 266 insertions(+), 12 deletions(-) create mode 100644 tools/testing/selftests/bpf/test_get_stack_rawtp.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index b64a7a39cbc8..9d762184b805 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -32,7 +32,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \ sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \ sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \ - test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o + test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \ + test_get_stack_rawtp.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ @@ -58,6 +59,7 @@ $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c $(OUTPUT)/test_sock: cgroup_helpers.c $(OUTPUT)/test_sock_addr: cgroup_helpers.c $(OUTPUT)/test_sockmap: cgroup_helpers.c +$(OUTPUT)/test_progs: trace_helpers.c .PHONY: force diff --git a/tools/testing/selftests/bpf/test_get_stack_rawtp.c b/tools/testing/selftests/bpf/test_get_stack_rawtp.c new file mode 100644 index 000000000000..f6d9f238e00a --- /dev/null +++ b/tools/testing/selftests/bpf/test_get_stack_rawtp.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "bpf_helpers.h" + +/* Permit pretty deep stack traces */ +#define MAX_STACK_RAWTP 100 +struct stack_trace_t { + int pid; + int kern_stack_size; + int user_stack_size; + int user_stack_buildid_size; + __u64 kern_stack[MAX_STACK_RAWTP]; + __u64 user_stack[MAX_STACK_RAWTP]; + struct bpf_stack_build_id user_stack_buildid[MAX_STACK_RAWTP]; +}; + +struct bpf_map_def SEC("maps") perfmap = { + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(__u32), + .max_entries = 2, +}; + +struct bpf_map_def SEC("maps") stackdata_map = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct stack_trace_t), + .max_entries = 1, +}; + +/* Allocate per-cpu space twice the needed. For the code below + * usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK); + * if (usize < 0) + * return 0; + * ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0); + * + * If we have value_size = MAX_STACK_RAWTP * sizeof(__u64), + * verifier will complain that access "raw_data + usize" + * with size "max_len - usize" may be out of bound. + * The maximum "raw_data + usize" is "raw_data + max_len" + * and the maximum "max_len - usize" is "max_len", verifier + * concludes that the maximum buffer access range is + * "raw_data[0...max_len * 2 - 1]" and hence reject the program. + * + * Doubling the to-be-used max buffer size can fix this verifier + * issue and avoid complicated C programming massaging. + * This is an acceptable workaround since there is one entry here. 
+ */ +struct bpf_map_def SEC("maps") rawdata_map = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(__u32), + .value_size = MAX_STACK_RAWTP * sizeof(__u64) * 2, + .max_entries = 1, +}; + +SEC("tracepoint/raw_syscalls/sys_enter") +int bpf_prog1(void *ctx) +{ + int max_len, max_buildid_len, usize, ksize, total_size; + struct stack_trace_t *data; + void *raw_data; + __u32 key = 0; + + data = bpf_map_lookup_elem(&stackdata_map, &key); + if (!data) + return 0; + + max_len = MAX_STACK_RAWTP * sizeof(__u64); + max_buildid_len = MAX_STACK_RAWTP * sizeof(struct bpf_stack_build_id); + data->pid = bpf_get_current_pid_tgid(); + data->kern_stack_size = bpf_get_stack(ctx, data->kern_stack, + max_len, 0); + data->user_stack_size = bpf_get_stack(ctx, data->user_stack, max_len, + BPF_F_USER_STACK); + data->user_stack_buildid_size = bpf_get_stack( + ctx, data->user_stack_buildid, max_buildid_len, + BPF_F_USER_STACK | BPF_F_USER_BUILD_ID); + bpf_perf_event_output(ctx, &perfmap, 0, data, sizeof(*data)); + + /* write both kernel and user stacks to the same buffer */ + raw_data = bpf_map_lookup_elem(&rawdata_map, &key); + if (!raw_data) + return 0; + + usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK); + if (usize < 0) + return 0; + + ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0); + if (ksize < 0) + return 0; + + total_size = usize + ksize; + if (total_size > 0 && total_size <= max_len) + bpf_perf_event_output(ctx, &perfmap, 0, raw_data, total_size); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */ diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index eedda98d7bb1..0ddbf34b2438 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -38,8 +38,10 @@ typedef __u16 __sum16; #include "bpf_util.h" #include "bpf_endian.h" #include "bpf_rlimit.h" +#include "trace_helpers.h" static int error_cnt, pass_cnt; +static bool jit_enabled; #define MAGIC_BYTES 123 @@ -391,13 +393,30 @@ static inline __u64 ptr_to_u64(const void *ptr) return (__u64) (unsigned long) ptr; } +static bool is_jit_enabled(void) +{ + const char *jit_sysctl = "/proc/sys/net/core/bpf_jit_enable"; + bool enabled = false; + int sysctl_fd; + + sysctl_fd = open(jit_sysctl, 0, O_RDONLY); + if (sysctl_fd != -1) { + char tmpc; + + if (read(sysctl_fd, &tmpc, sizeof(tmpc)) == 1) + enabled = (tmpc != '0'); + close(sysctl_fd); + } + + return enabled; +} + static void test_bpf_obj_id(void) { const __u64 array_magic_value = 0xfaceb00c; const __u32 array_key = 0; const int nr_iters = 2; const char *file = "./test_obj_id.o"; - const char *jit_sysctl = "/proc/sys/net/core/bpf_jit_enable"; const char *expected_prog_name = "test_obj_id"; const char *expected_map_name = "test_map_id"; const __u64 nsec_per_sec = 1000000000; @@ -414,20 +433,11 @@ static void test_bpf_obj_id(void) char jited_insns[128], xlated_insns[128], zeros[128]; __u32 i, next_id, info_len, nr_id_found, duration = 0; struct timespec real_time_ts, boot_time_ts; - int sysctl_fd, jit_enabled = 0, err = 0; + int err = 0; __u64 array_value; uid_t my_uid = getuid(); time_t now, load_time; - sysctl_fd = open(jit_sysctl, 0, O_RDONLY); - if (sysctl_fd != -1) { - char tmpc; - - if (read(sysctl_fd, &tmpc, sizeof(tmpc)) == 1) - jit_enabled = (tmpc != '0'); - close(sysctl_fd); - } - err = bpf_prog_get_fd_by_id(0); CHECK(err >= 0 || errno != ENOENT, "get-fd-by-notexist-prog-id", "err 
%d errno %d\n", err, errno); @@ -1204,8 +1214,147 @@ out: return; } +#define MAX_CNT_RAWTP 10ull +#define MAX_STACK_RAWTP 100 +struct get_stack_trace_t { + int pid; + int kern_stack_size; + int user_stack_size; + int user_stack_buildid_size; + __u64 kern_stack[MAX_STACK_RAWTP]; + __u64 user_stack[MAX_STACK_RAWTP]; + struct bpf_stack_build_id user_stack_buildid[MAX_STACK_RAWTP]; +}; + +static int get_stack_print_output(void *data, int size) +{ + bool good_kern_stack = false, good_user_stack = false; + const char *nonjit_func = "___bpf_prog_run"; + struct get_stack_trace_t *e = data; + int i, num_stack; + static __u64 cnt; + struct ksym *ks; + + cnt++; + + if (size < sizeof(struct get_stack_trace_t)) { + __u64 *raw_data = data; + bool found = false; + + num_stack = size / sizeof(__u64); + /* If jit is enabled, we do not have a good way to + * verify the sanity of the kernel stack. So we + * just assume it is good if the stack is not empty. + * This could be improved in the future. + */ + if (jit_enabled) { + found = num_stack > 0; + } else { + for (i = 0; i < num_stack; i++) { + ks = ksym_search(raw_data[i]); + if (strcmp(ks->name, nonjit_func) == 0) { + found = true; + break; + } + } + } + if (found) { + good_kern_stack = true; + good_user_stack = true; + } + } else { + num_stack = e->kern_stack_size / sizeof(__u64); + if (jit_enabled) { + good_kern_stack = num_stack > 0; + } else { + for (i = 0; i < num_stack; i++) { + ks = ksym_search(e->kern_stack[i]); + if (strcmp(ks->name, nonjit_func) == 0) { + good_kern_stack = true; + break; + } + } + } + if (e->user_stack_size > 0 && e->user_stack_buildid_size > 0) + good_user_stack = true; + } + if (!good_kern_stack || !good_user_stack) + return PERF_EVENT_ERROR; + + if (cnt == MAX_CNT_RAWTP) + return PERF_EVENT_DONE; + + return PERF_EVENT_CONT; +} + +static void test_get_stack_raw_tp(void) +{ + const char *file = "./test_get_stack_rawtp.o"; + int i, efd, err, prog_fd, pmu_fd, perfmap_fd; + struct perf_event_attr attr = {}; + struct timespec tv = {0, 10}; + __u32 key = 0, duration = 0; + struct bpf_object *obj; + + err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd); + if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno)) + return; + + efd = bpf_raw_tracepoint_open("sys_enter", prog_fd); + if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno)) + goto close_prog; + + perfmap_fd = bpf_find_map(__func__, obj, "perfmap"); + if (CHECK(perfmap_fd < 0, "bpf_find_map", "err %d errno %d\n", + perfmap_fd, errno)) + goto close_prog; + + err = load_kallsyms(); + if (CHECK(err < 0, "load_kallsyms", "err %d errno %d\n", err, errno)) + goto close_prog; + + attr.sample_type = PERF_SAMPLE_RAW; + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_BPF_OUTPUT; + pmu_fd = syscall(__NR_perf_event_open, &attr, getpid()/*pid*/, -1/*cpu*/, + -1/*group_fd*/, 0); + if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd, + errno)) + goto close_prog; + + err = bpf_map_update_elem(perfmap_fd, &key, &pmu_fd, BPF_ANY); + if (CHECK(err < 0, "bpf_map_update_elem", "err %d errno %d\n", err, + errno)) + goto close_prog; + + err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0); + if (CHECK(err < 0, "ioctl PERF_EVENT_IOC_ENABLE", "err %d errno %d\n", + err, errno)) + goto close_prog; + + err = perf_event_mmap(pmu_fd); + if (CHECK(err < 0, "perf_event_mmap", "err %d errno %d\n", err, errno)) + goto close_prog; + + /* trigger some syscall action */ + for (i = 0; i < MAX_CNT_RAWTP; i++) + nanosleep(&tv, NULL); + + err = 
perf_event_poller(pmu_fd, get_stack_print_output); + if (CHECK(err < 0, "perf_event_poller", "err %d errno %d\n", err, errno)) + goto close_prog; + + goto close_prog_noerr; +close_prog: + error_cnt++; +close_prog_noerr: + bpf_object__close(obj); +} + int main(void) { + jit_enabled = is_jit_enabled(); + test_pkt_access(); test_xdp(); test_xdp_adjust_tail(); @@ -1219,6 +1368,7 @@ int main(void) test_stacktrace_map(); test_stacktrace_build_id(); test_stacktrace_map_raw_tp(); + test_get_stack_raw_tp(); printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS; From 79b45350131057250236e162ce7b3c0b291dc0a4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:16 -0700 Subject: [PATCH 11/66] tools/bpf: add a test for bpf_get_stack with tracepoint prog The test_stacktrace_map and test_stacktrace_build_id are enhanced to call bpf_get_stack in the helper to get the stack trace as well. The stack traces from bpf_get_stack and bpf_get_stackid are compared to ensure that for the same stack as represented as the same hash, their ip addresses or build id's must be the same. Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_progs.c | 70 +++++++++++++++++-- .../selftests/bpf/test_stacktrace_build_id.c | 20 +++++- .../selftests/bpf/test_stacktrace_map.c | 19 ++++- 3 files changed, 98 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 0ddbf34b2438..aa336f0abebc 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -906,11 +906,47 @@ static int compare_map_keys(int map1_fd, int map2_fd) return 0; } +static int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len) +{ + __u32 key, next_key, *cur_key_p, *next_key_p; + char *val_buf1, *val_buf2; + int i, err = 0; + + val_buf1 = malloc(stack_trace_len); + val_buf2 = malloc(stack_trace_len); + cur_key_p = NULL; + next_key_p = &key; + while (bpf_map_get_next_key(smap_fd, cur_key_p, next_key_p) == 0) { + err = bpf_map_lookup_elem(smap_fd, next_key_p, val_buf1); + if (err) + goto out; + err = bpf_map_lookup_elem(amap_fd, next_key_p, val_buf2); + if (err) + goto out; + for (i = 0; i < stack_trace_len; i++) { + if (val_buf1[i] != val_buf2[i]) { + err = -1; + goto out; + } + } + key = *next_key_p; + cur_key_p = &key; + next_key_p = &next_key; + } + if (errno != ENOENT) + err = -1; + +out: + free(val_buf1); + free(val_buf2); + return err; +} + static void test_stacktrace_map() { - int control_map_fd, stackid_hmap_fd, stackmap_fd; + int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd; const char *file = "./test_stacktrace_map.o"; - int bytes, efd, err, pmu_fd, prog_fd; + int bytes, efd, err, pmu_fd, prog_fd, stack_trace_len; struct perf_event_attr attr = {}; __u32 key, val, duration = 0; struct bpf_object *obj; @@ -966,6 +1002,10 @@ static void test_stacktrace_map() if (stackmap_fd < 0) goto disable_pmu; + stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap"); + if (stack_amap_fd < 0) + goto disable_pmu; + /* give some time for bpf program run */ sleep(1); @@ -987,6 +1027,12 @@ static void test_stacktrace_map() "err %d errno %d\n", err, errno)) goto disable_pmu_noerr; + stack_trace_len = PERF_MAX_STACK_DEPTH * sizeof(__u64); + err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len); + if (CHECK(err, "compare_stack_ips stackmap vs. 
stack_amap", + "err %d errno %d\n", err, errno)) + goto disable_pmu_noerr; + goto disable_pmu_noerr; disable_pmu: error_cnt++; @@ -1080,9 +1126,9 @@ err: static void test_stacktrace_build_id(void) { - int control_map_fd, stackid_hmap_fd, stackmap_fd; + int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd; const char *file = "./test_stacktrace_build_id.o"; - int bytes, efd, err, pmu_fd, prog_fd; + int bytes, efd, err, pmu_fd, prog_fd, stack_trace_len; struct perf_event_attr attr = {}; __u32 key, previous_key, val, duration = 0; struct bpf_object *obj; @@ -1147,6 +1193,11 @@ static void test_stacktrace_build_id(void) err, errno)) goto disable_pmu; + stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap"); + if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap", + "err %d errno %d\n", err, errno)) + goto disable_pmu; + assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null") == 0); assert(system("./urandom_read if=/dev/urandom of=/dev/zero count=4 2> /dev/null") == 0); @@ -1198,8 +1249,15 @@ static void test_stacktrace_build_id(void) previous_key = key; } while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0); - CHECK(build_id_matches < 1, "build id match", - "Didn't find expected build ID from the map"); + if (CHECK(build_id_matches < 1, "build id match", + "Didn't find expected build ID from the map")) + goto disable_pmu; + + stack_trace_len = PERF_MAX_STACK_DEPTH + * sizeof(struct bpf_stack_build_id); + err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len); + CHECK(err, "compare_stack_ips stackmap vs. stack_amap", + "err %d errno %d\n", err, errno); disable_pmu: ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE); diff --git a/tools/testing/selftests/bpf/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/test_stacktrace_build_id.c index b755bd783ce5..d86c281e957f 100644 --- a/tools/testing/selftests/bpf/test_stacktrace_build_id.c +++ b/tools/testing/selftests/bpf/test_stacktrace_build_id.c @@ -19,7 +19,7 @@ struct bpf_map_def SEC("maps") stackid_hmap = { .type = BPF_MAP_TYPE_HASH, .key_size = sizeof(__u32), .value_size = sizeof(__u32), - .max_entries = 10000, + .max_entries = 16384, }; struct bpf_map_def SEC("maps") stackmap = { @@ -31,6 +31,14 @@ struct bpf_map_def SEC("maps") stackmap = { .map_flags = BPF_F_STACK_BUILD_ID, }; +struct bpf_map_def SEC("maps") stack_amap = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct bpf_stack_build_id) + * PERF_MAX_STACK_DEPTH, + .max_entries = 128, +}; + /* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */ struct random_urandom_args { unsigned long long pad; @@ -42,7 +50,10 @@ struct random_urandom_args { SEC("tracepoint/random/urandom_read") int oncpu(struct random_urandom_args *args) { + __u32 max_len = sizeof(struct bpf_stack_build_id) + * PERF_MAX_STACK_DEPTH; __u32 key = 0, val = 0, *value_p; + void *stack_p; value_p = bpf_map_lookup_elem(&control_map, &key); if (value_p && *value_p) @@ -50,8 +61,13 @@ int oncpu(struct random_urandom_args *args) /* The size of stackmap and stackid_hmap should be the same */ key = bpf_get_stackid(args, &stackmap, BPF_F_USER_STACK); - if ((int)key >= 0) + if ((int)key >= 0) { bpf_map_update_elem(&stackid_hmap, &key, &val, 0); + stack_p = bpf_map_lookup_elem(&stack_amap, &key); + if (stack_p) + bpf_get_stack(args, stack_p, max_len, + BPF_F_USER_STACK | BPF_F_USER_BUILD_ID); + } return 0; } diff --git a/tools/testing/selftests/bpf/test_stacktrace_map.c b/tools/testing/selftests/bpf/test_stacktrace_map.c 
index 76d85c5d08bd..af111af7ca1a 100644 --- a/tools/testing/selftests/bpf/test_stacktrace_map.c +++ b/tools/testing/selftests/bpf/test_stacktrace_map.c @@ -19,14 +19,21 @@ struct bpf_map_def SEC("maps") stackid_hmap = { .type = BPF_MAP_TYPE_HASH, .key_size = sizeof(__u32), .value_size = sizeof(__u32), - .max_entries = 10000, + .max_entries = 16384, }; struct bpf_map_def SEC("maps") stackmap = { .type = BPF_MAP_TYPE_STACK_TRACE, .key_size = sizeof(__u32), .value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH, - .max_entries = 10000, + .max_entries = 16384, +}; + +struct bpf_map_def SEC("maps") stack_amap = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH, + .max_entries = 16384, }; /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ @@ -44,7 +51,9 @@ struct sched_switch_args { SEC("tracepoint/sched/sched_switch") int oncpu(struct sched_switch_args *ctx) { + __u32 max_len = PERF_MAX_STACK_DEPTH * sizeof(__u64); __u32 key = 0, val = 0, *value_p; + void *stack_p; value_p = bpf_map_lookup_elem(&control_map, &key); if (value_p && *value_p) @@ -52,8 +61,12 @@ int oncpu(struct sched_switch_args *ctx) /* The size of stackmap and stackid_hmap should be the same */ key = bpf_get_stackid(ctx, &stackmap, 0); - if ((int)key >= 0) + if ((int)key >= 0) { bpf_map_update_elem(&stackid_hmap, &key, &val, 0); + stack_p = bpf_map_lookup_elem(&stack_amap, &key); + if (stack_p) + bpf_get_stack(ctx, stack_p, max_len, 0); + } return 0; } From a3ef8e9a4d7cc26d7528d50d10c5720b523b07c9 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Sat, 28 Apr 2018 16:06:19 -0700 Subject: [PATCH 12/66] bpf: Fix helpers ctx struct types in uapi doc Helpers may operate on two types of ctx structures: user visible ones (e.g. `struct bpf_sock_ops`) when used in user programs, and kernel ones (e.g. `struct bpf_sock_ops_kern`) in kernel implementation. UAPI documentation must refer to only user visible structures. The patch replaces references to `_kern` structures in BPF helpers description by corresponding user visible structures. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1afb606a18b9..23b334bba1a6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1361,7 +1361,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1435,7 +1435,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a *map* referencing sockets. The * *skops* is used as a new value for the entry associated to @@ -1533,7 +1533,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) * Description * For en eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in @@ -1544,7 +1544,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1588,7 +1588,7 @@ union bpf_attr { * Return * 0 * - * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval) + * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) * Description * Attempt to set the value of the **bpf_sock_ops_cb_flags** field * for the full TCP socket associated to *bpf_sock_ops* to @@ -1721,7 +1721,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len) + * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by * *addr*, of length *addr_len*. This allows for making outgoing From 96871b9f6fc28847becf60e07477e44f6f74794f Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Sat, 28 Apr 2018 16:06:20 -0700 Subject: [PATCH 13/66] bpf: Sync bpf.h to tools/ The patch syncs bpf.h to tools/. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1afb606a18b9..23b334bba1a6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1361,7 +1361,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1435,7 +1435,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a *map* referencing sockets. The * *skops* is used as a new value for the entry associated to @@ -1533,7 +1533,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) * Description * For en eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in @@ -1544,7 +1544,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1588,7 +1588,7 @@ union bpf_attr { * Return * 0 * - * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval) + * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) * Description * Attempt to set the value of the **bpf_sock_ops_cb_flags** field * for the full TCP socket associated to *bpf_sock_ops* to @@ -1721,7 +1721,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len) + * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by * *addr*, of length *addr_len*. This allows for making outgoing From 7ef3771205302a71a8bb4a2286ef98e5cade5d1a Mon Sep 17 00:00:00 2001 From: Teng Qin Date: Sat, 28 Apr 2018 23:39:29 -0700 Subject: [PATCH 14/66] bpf: Allow bpf_current_task_under_cgroup in interrupt Currently, the bpf_current_task_under_cgroup helper has a check where if the BPF program is running in_interrupt(), it will return -EINVAL. This prevents the helper to be used in many useful scenarios, particularly BPF programs attached to Perf Events. This commit removes the check. Tested a few NMI (Perf Event) and some softirq context, the helper returns the correct result. Signed-off-by: Teng Qin Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 46d866e9937c..ce2cbbff27e4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -475,8 +475,6 @@ BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; - if (unlikely(in_interrupt())) - return -EINVAL; if (unlikely(idx >= array->map.max_entries)) return -E2BIG; From 34745aed515c1d6040110ff82378056533518eb6 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 29 Apr 2018 19:27:48 -0700 Subject: [PATCH 15/66] samples/bpf: fix kprobe attachment issue on x64 Commit d5a00528b58c ("syscalls/core, syscalls/x86: Rename struct pt_regs-based sys_*() to __x64_sys_*()") renamed a lot of syscall function sys_*() to __x64_sys_*(). This caused several kprobe based samples/bpf tests failing. This patch fixed the problem in bpf_load.c. For x86_64 architecture, function name __x64_sys_*() will be first used for kprobe event creation. If the creation is successful, it will be used. Otherwise, function name sys_*() will be used for kprobe event creation. 
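As an illustration (an editor's sketch, not code from bpf_load.c; the helper
names try_kprobe_event() and create_sys_kprobe() are invented for this
example), the two-step fallback strategy amounts to:

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  /* Sketch of the kprobe creation described above: on x86_64, try the
   * __x64_sys_*() symbol first and fall back to the legacy sys_*()
   * name only if that attempt fails.
   */
  static int try_kprobe_event(const char *fn)
  {
          char buf[256];

          snprintf(buf, sizeof(buf),
                   "echo 'p:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
                   fn, fn);
          /* system() returns -1 only if the shell could not be run */
          return system(buf);
  }

  static int create_sys_kprobe(const char *event)
  {
  #ifdef __x86_64__
          if (strncmp(event, "sys_", 4) == 0) {
                  char prefixed[128];

                  snprintf(prefixed, sizeof(prefixed), "__x64_%s", event);
                  /* mirror the patch's 'err >= 0' success check */
                  if (try_kprobe_event(prefixed) >= 0)
                          return 0;       /* __x64_sys_*() event created */
          }
  #endif
          return try_kprobe_event(event) < 0 ? -1 : 0;    /* plain sys_*() */
  }
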
Fixes: d5a00528b58c ("syscalls/core, syscalls/x86: Rename struct pt_regs-based sys_*() to __x64_sys_*()") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- samples/bpf/bpf_load.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index a27ef3c42e4e..da9bccfaf391 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -145,6 +145,9 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) } if (is_kprobe || is_kretprobe) { + bool need_normal_check = true; + const char *event_prefix = ""; + if (is_kprobe) event += 7; else @@ -158,18 +161,33 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) if (isdigit(*event)) return populate_prog_array(event, fd); - snprintf(buf, sizeof(buf), - "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", - is_kprobe ? 'p' : 'r', event, event); - err = system(buf); - if (err < 0) { - printf("failed to create kprobe '%s' error '%s'\n", - event, strerror(errno)); - return -1; +#ifdef __x86_64__ + if (strncmp(event, "sys_", 4) == 0) { + snprintf(buf, sizeof(buf), + "echo '%c:__x64_%s __x64_%s' >> /sys/kernel/debug/tracing/kprobe_events", + is_kprobe ? 'p' : 'r', event, event); + err = system(buf); + if (err >= 0) { + need_normal_check = false; + event_prefix = "__x64_"; + } + } +#endif + if (need_normal_check) { + snprintf(buf, sizeof(buf), + "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", + is_kprobe ? 'p' : 'r', event, event); + err = system(buf); + if (err < 0) { + printf("failed to create kprobe '%s' error '%s'\n", + event, strerror(errno)); + return -1; + } } strcpy(buf, DEBUGFS); strcat(buf, "events/kprobes/"); + strcat(buf, event_prefix); strcat(buf, event); strcat(buf, "/id"); } else if (is_tracepoint) { From 4d220ed0f8140c478ab7b0a14d96821da639b646 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 28 Apr 2018 19:56:37 -0700 Subject: [PATCH 16/66] bpf: remove tracepoints from bpf core tracepoints to bpf core were added as a way to provide introspection to bpf programs and maps, but after some time it became clear that this approach is inadequate, so prog_id, map_id and corresponding get_next_id, get_fd_by_id, get_info_by_fd, prog_query APIs were introduced and fully adopted by bpftool and other applications. The tracepoints in bpf core started to rot and causing syzbot warnings: WARNING: CPU: 0 PID: 3008 at kernel/trace/trace_event_perf.c:274 Kernel panic - not syncing: panic_on_warn set ... perf_trace_bpf_map_keyval+0x260/0xbd0 include/trace/events/bpf.h:228 trace_bpf_map_update_elem include/trace/events/bpf.h:274 [inline] map_update_elem kernel/bpf/syscall.c:597 [inline] SYSC_bpf kernel/bpf/syscall.c:1478 [inline] Hence this patch deletes tracepoints in bpf core. Reported-by: Eric Biggers Reported-by: syzbot Signed-off-by: Alexei Starovoitov Acked-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- MAINTAINERS | 1 - include/linux/bpf_trace.h | 1 - include/trace/events/bpf.h | 355 ------------------------------------- kernel/bpf/core.c | 6 - kernel/bpf/inode.c | 16 +- kernel/bpf/syscall.c | 15 +- 6 files changed, 2 insertions(+), 392 deletions(-) delete mode 100644 include/trace/events/bpf.h diff --git a/MAINTAINERS b/MAINTAINERS index a52800867850..537fd17a211b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2727,7 +2727,6 @@ F: Documentation/networking/filter.txt F: Documentation/bpf/ F: include/linux/bpf* F: include/linux/filter.h -F: include/trace/events/bpf.h F: include/trace/events/xdp.h F: include/uapi/linux/bpf* F: include/uapi/linux/filter.h diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h index e6fe98ae3794..ddf896abcfb6 100644 --- a/include/linux/bpf_trace.h +++ b/include/linux/bpf_trace.h @@ -2,7 +2,6 @@ #ifndef __LINUX_BPF_TRACE_H__ #define __LINUX_BPF_TRACE_H__ -#include #include #endif /* __LINUX_BPF_TRACE_H__ */ diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h deleted file mode 100644 index 150185647e6b..000000000000 --- a/include/trace/events/bpf.h +++ /dev/null @@ -1,355 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM bpf - -#if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_BPF_H - -/* These are only used within the BPF_SYSCALL code */ -#ifdef CONFIG_BPF_SYSCALL - -#include -#include -#include -#include - -#define __PROG_TYPE_MAP(FN) \ - FN(SOCKET_FILTER) \ - FN(KPROBE) \ - FN(SCHED_CLS) \ - FN(SCHED_ACT) \ - FN(TRACEPOINT) \ - FN(XDP) \ - FN(PERF_EVENT) \ - FN(CGROUP_SKB) \ - FN(CGROUP_SOCK) \ - FN(LWT_IN) \ - FN(LWT_OUT) \ - FN(LWT_XMIT) - -#define __MAP_TYPE_MAP(FN) \ - FN(HASH) \ - FN(ARRAY) \ - FN(PROG_ARRAY) \ - FN(PERF_EVENT_ARRAY) \ - FN(PERCPU_HASH) \ - FN(PERCPU_ARRAY) \ - FN(STACK_TRACE) \ - FN(CGROUP_ARRAY) \ - FN(LRU_HASH) \ - FN(LRU_PERCPU_HASH) \ - FN(LPM_TRIE) - -#define __PROG_TYPE_TP_FN(x) \ - TRACE_DEFINE_ENUM(BPF_PROG_TYPE_##x); -#define __PROG_TYPE_SYM_FN(x) \ - { BPF_PROG_TYPE_##x, #x }, -#define __PROG_TYPE_SYM_TAB \ - __PROG_TYPE_MAP(__PROG_TYPE_SYM_FN) { -1, 0 } -__PROG_TYPE_MAP(__PROG_TYPE_TP_FN) - -#define __MAP_TYPE_TP_FN(x) \ - TRACE_DEFINE_ENUM(BPF_MAP_TYPE_##x); -#define __MAP_TYPE_SYM_FN(x) \ - { BPF_MAP_TYPE_##x, #x }, -#define __MAP_TYPE_SYM_TAB \ - __MAP_TYPE_MAP(__MAP_TYPE_SYM_FN) { -1, 0 } -__MAP_TYPE_MAP(__MAP_TYPE_TP_FN) - -DECLARE_EVENT_CLASS(bpf_prog_event, - - TP_PROTO(const struct bpf_prog *prg), - - TP_ARGS(prg), - - TP_STRUCT__entry( - __array(u8, prog_tag, 8) - __field(u32, type) - ), - - TP_fast_assign( - BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); - memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); - __entry->type = prg->type; - ), - - TP_printk("prog=%s type=%s", - __print_hex_str(__entry->prog_tag, 8), - __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB)) -); - -DEFINE_EVENT(bpf_prog_event, bpf_prog_get_type, - - TP_PROTO(const struct bpf_prog *prg), - - TP_ARGS(prg) -); - -DEFINE_EVENT(bpf_prog_event, bpf_prog_put_rcu, - - TP_PROTO(const struct bpf_prog *prg), - - TP_ARGS(prg) -); - -TRACE_EVENT(bpf_prog_load, - - TP_PROTO(const struct bpf_prog *prg, int ufd), - - TP_ARGS(prg, ufd), - - TP_STRUCT__entry( - __array(u8, prog_tag, 8) - __field(u32, type) - __field(int, ufd) - ), - - TP_fast_assign( - BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); - memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); - __entry->type = prg->type; - 
__entry->ufd = ufd; - ), - - TP_printk("prog=%s type=%s ufd=%d", - __print_hex_str(__entry->prog_tag, 8), - __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB), - __entry->ufd) -); - -TRACE_EVENT(bpf_map_create, - - TP_PROTO(const struct bpf_map *map, int ufd), - - TP_ARGS(map, ufd), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, size_key) - __field(u32, size_value) - __field(u32, max_entries) - __field(u32, flags) - __field(int, ufd) - ), - - TP_fast_assign( - __entry->type = map->map_type; - __entry->size_key = map->key_size; - __entry->size_value = map->value_size; - __entry->max_entries = map->max_entries; - __entry->flags = map->map_flags; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=%u val=%u max=%u flags=%x", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, __entry->size_key, __entry->size_value, - __entry->max_entries, __entry->flags) -); - -DECLARE_EVENT_CLASS(bpf_obj_prog, - - TP_PROTO(const struct bpf_prog *prg, int ufd, - const struct filename *pname), - - TP_ARGS(prg, ufd, pname), - - TP_STRUCT__entry( - __array(u8, prog_tag, 8) - __field(int, ufd) - __string(path, pname->name) - ), - - TP_fast_assign( - BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); - memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); - __assign_str(path, pname->name); - __entry->ufd = ufd; - ), - - TP_printk("prog=%s path=%s ufd=%d", - __print_hex_str(__entry->prog_tag, 8), - __get_str(path), __entry->ufd) -); - -DEFINE_EVENT(bpf_obj_prog, bpf_obj_pin_prog, - - TP_PROTO(const struct bpf_prog *prg, int ufd, - const struct filename *pname), - - TP_ARGS(prg, ufd, pname) -); - -DEFINE_EVENT(bpf_obj_prog, bpf_obj_get_prog, - - TP_PROTO(const struct bpf_prog *prg, int ufd, - const struct filename *pname), - - TP_ARGS(prg, ufd, pname) -); - -DECLARE_EVENT_CLASS(bpf_obj_map, - - TP_PROTO(const struct bpf_map *map, int ufd, - const struct filename *pname), - - TP_ARGS(map, ufd, pname), - - TP_STRUCT__entry( - __field(u32, type) - __field(int, ufd) - __string(path, pname->name) - ), - - TP_fast_assign( - __assign_str(path, pname->name); - __entry->type = map->map_type; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d path=%s", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, __get_str(path)) -); - -DEFINE_EVENT(bpf_obj_map, bpf_obj_pin_map, - - TP_PROTO(const struct bpf_map *map, int ufd, - const struct filename *pname), - - TP_ARGS(map, ufd, pname) -); - -DEFINE_EVENT(bpf_obj_map, bpf_obj_get_map, - - TP_PROTO(const struct bpf_map *map, int ufd, - const struct filename *pname), - - TP_ARGS(map, ufd, pname) -); - -DECLARE_EVENT_CLASS(bpf_map_keyval, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *val), - - TP_ARGS(map, ufd, key, val), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, key_len) - __dynamic_array(u8, key, map->key_size) - __field(bool, key_trunc) - __field(u32, val_len) - __dynamic_array(u8, val, map->value_size) - __field(bool, val_trunc) - __field(int, ufd) - ), - - TP_fast_assign( - memcpy(__get_dynamic_array(key), key, map->key_size); - memcpy(__get_dynamic_array(val), val, map->value_size); - __entry->type = map->map_type; - __entry->key_len = min(map->key_size, 16U); - __entry->key_trunc = map->key_size != __entry->key_len; - __entry->val_len = min(map->value_size, 16U); - __entry->val_trunc = map->value_size != __entry->val_len; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=[%s%s] val=[%s%s]", - __print_symbolic(__entry->type, 
__MAP_TYPE_SYM_TAB), - __entry->ufd, - __print_hex(__get_dynamic_array(key), __entry->key_len), - __entry->key_trunc ? " ..." : "", - __print_hex(__get_dynamic_array(val), __entry->val_len), - __entry->val_trunc ? " ..." : "") -); - -DEFINE_EVENT(bpf_map_keyval, bpf_map_lookup_elem, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *val), - - TP_ARGS(map, ufd, key, val) -); - -DEFINE_EVENT(bpf_map_keyval, bpf_map_update_elem, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *val), - - TP_ARGS(map, ufd, key, val) -); - -TRACE_EVENT(bpf_map_delete_elem, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key), - - TP_ARGS(map, ufd, key), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, key_len) - __dynamic_array(u8, key, map->key_size) - __field(bool, key_trunc) - __field(int, ufd) - ), - - TP_fast_assign( - memcpy(__get_dynamic_array(key), key, map->key_size); - __entry->type = map->map_type; - __entry->key_len = min(map->key_size, 16U); - __entry->key_trunc = map->key_size != __entry->key_len; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=[%s%s]", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, - __print_hex(__get_dynamic_array(key), __entry->key_len), - __entry->key_trunc ? " ..." : "") -); - -TRACE_EVENT(bpf_map_next_key, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *key_next), - - TP_ARGS(map, ufd, key, key_next), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, key_len) - __dynamic_array(u8, key, map->key_size) - __dynamic_array(u8, nxt, map->key_size) - __field(bool, key_trunc) - __field(bool, key_null) - __field(int, ufd) - ), - - TP_fast_assign( - if (key) - memcpy(__get_dynamic_array(key), key, map->key_size); - __entry->key_null = !key; - memcpy(__get_dynamic_array(nxt), key_next, map->key_size); - __entry->type = map->map_type; - __entry->key_len = min(map->key_size, 16U); - __entry->key_trunc = map->key_size != __entry->key_len; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=[%s%s] next=[%s%s]", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, - __entry->key_null ? "NULL" : __print_hex(__get_dynamic_array(key), - __entry->key_len), - __entry->key_trunc && !__entry->key_null ? " ..." : "", - __print_hex(__get_dynamic_array(nxt), __entry->key_len), - __entry->key_trunc ? " ..." 
: "") -); -#endif /* CONFIG_BPF_SYSCALL */ -#endif /* _TRACE_BPF_H */ - -#include diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9349a5db3cf2..90feeba3a1a1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1845,9 +1845,3 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, #include EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); - -/* These are only used within the BPF_SYSCALL code */ -#ifdef CONFIG_BPF_SYSCALL -EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); -EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); -#endif diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index a41343009ccc..ed13645bd80c 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -429,13 +429,6 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) ret = bpf_obj_do_pin(pname, raw, type); if (ret != 0) bpf_any_put(raw, type); - if ((trace_bpf_obj_pin_prog_enabled() || - trace_bpf_obj_pin_map_enabled()) && !ret) { - if (type == BPF_TYPE_PROG) - trace_bpf_obj_pin_prog(raw, ufd, pname); - if (type == BPF_TYPE_MAP) - trace_bpf_obj_pin_map(raw, ufd, pname); - } out: putname(pname); return ret; @@ -502,15 +495,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags) else goto out; - if (ret < 0) { + if (ret < 0) bpf_any_put(raw, type); - } else if (trace_bpf_obj_get_prog_enabled() || - trace_bpf_obj_get_map_enabled()) { - if (type == BPF_TYPE_PROG) - trace_bpf_obj_get_prog(raw, ret, pname); - if (type == BPF_TYPE_MAP) - trace_bpf_obj_get_map(raw, ret, pname); - } out: putname(pname); return ret; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0bd2944eafb9..263e13ede029 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -503,7 +503,6 @@ static int map_create(union bpf_attr *attr) return err; } - trace_bpf_map_create(map, err); return err; free_map: @@ -663,7 +662,6 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; - trace_bpf_map_lookup_elem(map, ufd, key, value); err = 0; free_value: @@ -760,8 +758,6 @@ static int map_update_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); out: - if (!err) - trace_bpf_map_update_elem(map, ufd, key, value); free_value: kfree(value); free_key: @@ -814,8 +810,6 @@ static int map_delete_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); out: - if (!err) - trace_bpf_map_delete_elem(map, ufd, key); kfree(key); err_put: fdput(f); @@ -879,7 +873,6 @@ out: if (copy_to_user(unext_key, next_key, map->key_size) != 0) goto free_next_key; - trace_bpf_map_next_key(map, ufd, key, next_key); err = 0; free_next_key: @@ -1027,7 +1020,6 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) if (atomic_dec_and_test(&prog->aux->refcnt)) { int i; - trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); @@ -1194,11 +1186,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv); - - if (!IS_ERR(prog)) - trace_bpf_prog_get_type(prog); - return prog; + return __bpf_prog_get(ufd, &type, attach_drv); } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); @@ -1373,7 +1361,6 @@ static int bpf_prog_load(union bpf_attr *attr) } bpf_prog_kallsyms_add(prog); - trace_bpf_prog_load(prog, err); return err; free_used_maps: From 3bd5a09b529c03bac354c9d48e688ed2aca934fd Mon Sep 17 00:00:00 2001 From: Quentin Monnet 
Date: Mon, 30 Apr 2018 11:39:03 +0100 Subject: [PATCH 17/66] bpf: fix formatting for bpf_perf_event_read() helper doc Some edits brought to the last iteration of BPF helper functions documentation introduced an error with RST formatting. As a result, most of one paragraph is rendered in bold text when only the name of a helper should be. Fix it, and fix formatting of another function name in the same paragraph. Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)") Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 23b334bba1a6..530ff6588d8f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -828,12 +828,12 @@ union bpf_attr { * * Also, be aware that the newer helper * **bpf_perf_event_read_value**\ () is recommended over - * **bpf_perf_event_read*\ () in general. The latter has some ABI + * **bpf_perf_event_read**\ () in general. The latter has some ABI * quirks where error and counter value are used as a return code * (which is wrong to do since ranges may overlap). This issue is - * fixed with bpf_perf_event_read_value(), which at the same time - * provides more features over the **bpf_perf_event_read**\ () - * interface. Please refer to the description of + * fixed with **bpf_perf_event_read_value**\ (), which at the same + * time provides more features over the **bpf_perf_event_read**\ + * () interface. Please refer to the description of * **bpf_perf_event_read_value**\ () for details. * Return * The value of the perf event counter read from the map, or a From 79552fbc0f9dc20dc022be7cc48eb3761623fa56 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 30 Apr 2018 11:39:04 +0100 Subject: [PATCH 18/66] bpf: fix formatting for bpf_get_stack() helper doc Fix formatting (indent) for bpf_get_stack() helper documentation, so that the doc is rendered correctly with the Python script. Fixes: c195651e565a ("bpf: add bpf_get_stack helper") Cc: Yonghong Song Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 44 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 530ff6588d8f..8daef7326bb7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1770,33 +1770,33 @@ union bpf_attr { * * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) * Description - * Return a user or a kernel stack in bpf program provided buffer. - * To achieve this, the helper needs *ctx*, which is a pointer - * to the context on which the tracing program is executed. - * To store the stacktrace, the bpf program provides *buf* with - * a nonnegative *size*. + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. * - * The last argument, *flags*, holds the number of stack frames to - * skip (from 0 to 255), masked with - * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set - * the following flags: + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. 
The next bits can be used to set + * the following flags: * - * **BPF_F_USER_STACK** - * Collect a user space stack instead of a kernel stack. - * **BPF_F_USER_BUILD_ID** - * Collect buildid+offset instead of ips for user stack, - * only valid if **BPF_F_USER_STACK** is also specified. + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. * - * **bpf_get_stack**\ () can collect up to - * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject - * to sufficient large buffer size. Note that - * this limit can be controlled with the **sysctl** program, and - * that it should be manually increased in order to profile long - * user stacks (such as stacks for Java programs). To do so, use: + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: * - * :: + * :: * - * # sysctl kernel.perf_event_max_stack= + * # sysctl kernel.perf_event_max_stack= * * Return * a non-negative value equal to or less than size on success, or From a56497d3d5cf081e00fde944a1de129442e7b065 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 30 Apr 2018 11:39:05 +0100 Subject: [PATCH 19/66] bpf: update bpf.h uapi header for tools Bring fixes for eBPF helper documentation formatting to bpf.h under tools/ as well. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf.h | 52 +++++++++++++++++----------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 23b334bba1a6..8daef7326bb7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -828,12 +828,12 @@ union bpf_attr { * * Also, be aware that the newer helper * **bpf_perf_event_read_value**\ () is recommended over - * **bpf_perf_event_read*\ () in general. The latter has some ABI + * **bpf_perf_event_read**\ () in general. The latter has some ABI * quirks where error and counter value are used as a return code * (which is wrong to do since ranges may overlap). This issue is - * fixed with bpf_perf_event_read_value(), which at the same time - * provides more features over the **bpf_perf_event_read**\ () - * interface. Please refer to the description of + * fixed with **bpf_perf_event_read_value**\ (), which at the same + * time provides more features over the **bpf_perf_event_read**\ + * () interface. Please refer to the description of * **bpf_perf_event_read_value**\ () for details. * Return * The value of the perf event counter read from the map, or a @@ -1770,33 +1770,33 @@ union bpf_attr { * * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) * Description - * Return a user or a kernel stack in bpf program provided buffer. - * To achieve this, the helper needs *ctx*, which is a pointer - * to the context on which the tracing program is executed. - * To store the stacktrace, the bpf program provides *buf* with - * a nonnegative *size*. + * Return a user or a kernel stack in bpf program provided buffer. 
+ * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. * - * The last argument, *flags*, holds the number of stack frames to - * skip (from 0 to 255), masked with - * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set - * the following flags: + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: * - * **BPF_F_USER_STACK** - * Collect a user space stack instead of a kernel stack. - * **BPF_F_USER_BUILD_ID** - * Collect buildid+offset instead of ips for user stack, - * only valid if **BPF_F_USER_STACK** is also specified. + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. * - * **bpf_get_stack**\ () can collect up to - * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject - * to sufficient large buffer size. Note that - * this limit can be controlled with the **sysctl** program, and - * that it should be manually increased in order to profile long - * user stacks (such as stacks for Java programs). To do so, use: + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: * - * :: + * :: * - * # sysctl kernel.perf_event_max_stack= + * # sysctl kernel.perf_event_max_stack= * * Return * a non-negative value equal to or less than size on success, or From c0dd967818a2b8106771a7c9fb7ef48d7be8a9d4 Mon Sep 17 00:00:00 2001 From: William Tu Date: Mon, 30 Apr 2018 07:26:23 -0700 Subject: [PATCH 20/66] tools, include: Grab a copy of linux/erspan.h Bring the erspan uapi header file so BPF tunnel helpers can use it. Fixes: 933a741e3b82 ("selftests/bpf: bpf tunnel test.") Reported-by: Yonghong Song Signed-off-by: William Tu Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/erspan.h | 52 +++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 tools/include/uapi/linux/erspan.h diff --git a/tools/include/uapi/linux/erspan.h b/tools/include/uapi/linux/erspan.h new file mode 100644 index 000000000000..841573019ae1 --- /dev/null +++ b/tools/include/uapi/linux/erspan.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * ERSPAN Tunnel Metadata + * + * Copyright (c) 2018 VMware + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ * + * Userspace API for metadata mode ERSPAN tunnel + */ +#ifndef _UAPI_ERSPAN_H +#define _UAPI_ERSPAN_H + +#include /* For __beXX in userspace */ +#include + +/* ERSPAN version 2 metadata header */ +struct erspan_md2 { + __be32 timestamp; + __be16 sgt; /* security group tag */ +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 hwid_upper:2, + ft:5, + p:1; + __u8 o:1, + gra:2, + dir:1, + hwid:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 p:1, + ft:5, + hwid_upper:2; + __u8 hwid:4, + dir:1, + gra:2, + o:1; +#else +#error "Please fix " +#endif +}; + +struct erspan_metadata { + int version; + union { + __be32 index; /* Version 1 (type II)*/ + struct erspan_md2 md2; /* Version 2 (type III) */ + } u; +}; + +#endif /* _UAPI_ERSPAN_H */ From a3fe1f6f2adaa01c620793666d489a584211ecba Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 1 May 2018 21:18:38 +0100 Subject: [PATCH 21/66] tools: bpftool: change time format for program 'loaded at:' information To make eBPF program load time easier to parse from "bpftool prog" output for machines, change the time format used by the program. The format now differs for plain and JSON version: - Plain version uses a string formatted according to ISO 8601. - JSON uses the number of seconds since the Epoch, wich is less friendly for humans but even easier to process. Example output: # ./bpftool prog 41298: xdp tag a04f5eef06a7f555 dev foo loaded_at 2018-04-18T17:19:47+0100 uid 0 xlated 16B not jited memlock 4096B # ./bpftool prog -p [{ "id": 41298, "type": "xdp", "tag": "a04f5eef06a7f555", "gpl_compatible": false, "dev": { "ifindex": 14, "ns_dev": 3, "ns_inode": 4026531993, "ifname": "foo" }, "loaded_at": 1524068387, "uid": 0, "bytes_xlated": 16, "jited": false, "bytes_memlock": 4096 } ] Previously, "Apr 18/17:19" would be used at both places. Suggested-by: Alexei Starovoitov Signed-off-by: Quentin Monnet Acked-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/prog.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index e71a0a11afde..9bdfdf2d3fbe 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -96,7 +96,10 @@ static void print_boot_time(__u64 nsecs, char *buf, unsigned int size) return; } - strftime(buf, size, "%b %d/%H:%M", &load_tm); + if (json_output) + strftime(buf, size, "%s", &load_tm); + else + strftime(buf, size, "%FT%T%z", &load_tm); } static int prog_fd_by_tag(unsigned char *tag) @@ -245,7 +248,8 @@ static void print_prog_json(struct bpf_prog_info *info, int fd) print_boot_time(info->load_time, buf, sizeof(buf)); /* Piggy back on load_time, since 0 uid is a valid one */ - jsonw_string_field(json_wtr, "loaded_at", buf); + jsonw_name(json_wtr, "loaded_at"); + jsonw_printf(json_wtr, "%s", buf); jsonw_uint_field(json_wtr, "uid", info->created_by_uid); } From a2c7a98301d9f9bcfc4244de04252a04c0b68a79 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 27 Apr 2018 11:54:40 +0200 Subject: [PATCH 22/66] x86/bpf: Clean up non-standard comments, to make the code more readable So by chance I looked into x86 assembly in arch/x86/net/bpf_jit_comp.c and noticed the weird and inconsistent comment style it mistakenly learned from the networking code: /* Multi-line comment ... * ... looks like this. */ Fix this to use the standard comment style specified in Documentation/CodingStyle and used in arch/x86/ as well: /* * Multi-line comment ... * ... looks like this. */ Also, to quote Linus's ... 
more explicit views about this: http://article.gmane.org/gmane.linux.kernel.cryptoapi/21066 > But no, the networking code picked *none* of the above sane formats. > Instead, it picked these two models that are just half-arsed > shit-for-brains: > > (no) > /* This is disgusting drug-induced > * crap, and should die > */ > > (no-no-no) > /* This is also very nasty > * and visually unbalanced */ > > Please. The networking code actually has the *worst* possible comment > style. You can literally find that (no-no-no) style, which is just > really horribly disgusting and worse than the otherwise fairly similar > (d) in pretty much every way. Also improve the comments and some other details while at it: - Don't mix same-line and previous-line comment style on otherwise identical code patterns within the same function, - capitalize 'BPF' and x86 register names consistently, - capitalize sentences consistently, - instead of 'x64' use 'x86-64': x64 is a Microsoft specific term, - use more consistent punctuation, - use standard coding style in macros as well, - fix typos and a few other minor details. Consistent coding style is not optional, at least in arch/x86/. No change in functionality. ( In case this commit causes conflicts with pending development code I'll be glad to help resolve any conflicts! ) Acked-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: David S. Miller Cc: Eric Dumazet Cc: Daniel Borkmann Cc: Alexei Starovoitov Cc: Hideaki YOSHIFUJI Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Daniel Borkmann --- arch/x86/net/bpf_jit_comp.c | 233 ++++++++++++++++++++---------------- 1 file changed, 133 insertions(+), 100 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index abce27ceb411..1c3c81ddeb87 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1,4 +1,5 @@ -/* bpf_jit_comp.c : BPF JIT compiler +/* + * bpf_jit_comp.c: BPF JIT compiler * * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com) * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com @@ -17,7 +18,7 @@ #include /* - * assembly code in arch/x86/net/bpf_jit.S + * Assembly code in arch/x86/net/bpf_jit.S */ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[]; extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[]; @@ -45,14 +46,15 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) #define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3) #define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4) + #define EMIT1_off32(b1, off) \ - do {EMIT1(b1); EMIT(off, 4); } while (0) + do { EMIT1(b1); EMIT(off, 4); } while (0) #define EMIT2_off32(b1, b2, off) \ - do {EMIT2(b1, b2); EMIT(off, 4); } while (0) + do { EMIT2(b1, b2); EMIT(off, 4); } while (0) #define EMIT3_off32(b1, b2, b3, off) \ - do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0) + do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0) #define EMIT4_off32(b1, b2, b3, b4, off) \ - do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0) + do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0) static bool is_imm8(int value) { @@ -70,9 +72,10 @@ static bool is_uimm32(u64 value) } /* mov dst, src */ -#define EMIT_mov(DST, SRC) \ - do {if (DST != SRC) \ - EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \ +#define EMIT_mov(DST, SRC) \ + do { \ + if (DST != SRC) \ + 
EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \ } while (0) static int bpf_size_to_x86_bytes(int bpf_size) @@ -89,7 +92,8 @@ static int bpf_size_to_x86_bytes(int bpf_size) return 0; } -/* list of x86 cond jumps opcodes (. + s8) +/* + * List of x86 cond jumps opcodes (. + s8) * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32) */ #define X86_JB 0x72 @@ -106,35 +110,37 @@ static int bpf_size_to_x86_bytes(int bpf_size) #define CHOOSE_LOAD_FUNC(K, func) \ ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) -/* pick a register outside of BPF range for JIT internal work */ +/* Pick a register outside of BPF range for JIT internal work */ #define AUX_REG (MAX_BPF_JIT_REG + 1) -/* The following table maps BPF registers to x64 registers. +/* + * The following table maps BPF registers to x86-64 registers. * - * x64 register r12 is unused, since if used as base address + * x86-64 register R12 is unused, since if used as base address * register in load/store instructions, it always needs an * extra byte of encoding and is callee saved. * - * r9 caches skb->len - skb->data_len - * r10 caches skb->data, and used for blinding (if enabled) + * R9 caches skb->len - skb->data_len + * R10 caches skb->data, and used for blinding (if enabled) */ static const int reg2hex[] = { - [BPF_REG_0] = 0, /* rax */ - [BPF_REG_1] = 7, /* rdi */ - [BPF_REG_2] = 6, /* rsi */ - [BPF_REG_3] = 2, /* rdx */ - [BPF_REG_4] = 1, /* rcx */ - [BPF_REG_5] = 0, /* r8 */ - [BPF_REG_6] = 3, /* rbx callee saved */ - [BPF_REG_7] = 5, /* r13 callee saved */ - [BPF_REG_8] = 6, /* r14 callee saved */ - [BPF_REG_9] = 7, /* r15 callee saved */ - [BPF_REG_FP] = 5, /* rbp readonly */ - [BPF_REG_AX] = 2, /* r10 temp register */ - [AUX_REG] = 3, /* r11 temp register */ + [BPF_REG_0] = 0, /* RAX */ + [BPF_REG_1] = 7, /* RDI */ + [BPF_REG_2] = 6, /* RSI */ + [BPF_REG_3] = 2, /* RDX */ + [BPF_REG_4] = 1, /* RCX */ + [BPF_REG_5] = 0, /* R8 */ + [BPF_REG_6] = 3, /* RBX callee saved */ + [BPF_REG_7] = 5, /* R13 callee saved */ + [BPF_REG_8] = 6, /* R14 callee saved */ + [BPF_REG_9] = 7, /* R15 callee saved */ + [BPF_REG_FP] = 5, /* RBP readonly */ + [BPF_REG_AX] = 2, /* R10 temp register */ + [AUX_REG] = 3, /* R11 temp register */ }; -/* is_ereg() == true if BPF register 'reg' maps to x64 r8..r15 +/* + * is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15 * which need extra byte of encoding. 
* rax,rcx,...,rbp have simpler encoding */ @@ -153,7 +159,7 @@ static bool is_axreg(u32 reg) return reg == BPF_REG_0; } -/* add modifiers if 'reg' maps to x64 registers r8..r15 */ +/* Add modifiers if 'reg' maps to x86-64 registers R8..R15 */ static u8 add_1mod(u8 byte, u32 reg) { if (is_ereg(reg)) @@ -170,13 +176,13 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2) return byte; } -/* encode 'dst_reg' register into x64 opcode 'byte' */ +/* Encode 'dst_reg' register into x86-64 opcode 'byte' */ static u8 add_1reg(u8 byte, u32 dst_reg) { return byte + reg2hex[dst_reg]; } -/* encode 'dst_reg' and 'src_reg' registers into x64 opcode 'byte' */ +/* Encode 'dst_reg' and 'src_reg' registers into x86-64 opcode 'byte' */ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) { return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); @@ -184,27 +190,28 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) static void jit_fill_hole(void *area, unsigned int size) { - /* fill whole space with int3 instructions */ + /* Fill whole space with INT3 instructions */ memset(area, 0xcc, size); } struct jit_context { - int cleanup_addr; /* epilogue code offset */ + int cleanup_addr; /* Epilogue code offset */ bool seen_ld_abs; bool seen_ax_reg; }; -/* maximum number of bytes emitted while JITing one eBPF insn */ +/* Maximum number of bytes emitted while JITing one eBPF insn */ #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 #define AUX_STACK_SPACE \ - (32 /* space for rbx, r13, r14, r15 */ + \ - 8 /* space for skb_copy_bits() buffer */) + (32 /* Space for RBX, R13, R14, R15 */ + \ + 8 /* Space for skb_copy_bits() buffer */) #define PROLOGUE_SIZE 37 -/* emit x64 prologue code for BPF program and check it's size. +/* + * Emit x86-64 prologue code for BPF program and check its size. * bpf_tail_call helper will skip it while jumping into another program */ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) @@ -212,8 +219,11 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) u8 *prog = *pprog; int cnt = 0; - EMIT1(0x55); /* push rbp */ - EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */ + /* push rbp */ + EMIT1(0x55); + + /* mov rbp,rsp */ + EMIT3(0x48, 0x89, 0xE5); /* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */ EMIT3_off32(0x48, 0x81, 0xEC, @@ -222,14 +232,15 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) /* sub rbp, AUX_STACK_SPACE */ EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE); - /* all classic BPF filters use R6(rbx) save it */ + /* All classic BPF filters use R6(rbx) save it */ /* mov qword ptr [rbp+0],rbx */ EMIT4(0x48, 0x89, 0x5D, 0); - /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8 - * as temporary, so all tcpdump filters need to spill/fill R7(r13) and - * R8(r14). R9(r15) spill could be made conditional, but there is only + /* + * bpf_convert_filter() maps classic BPF register X to R7 and uses R8 + * as temporary, so all tcpdump filters need to spill/fill R7(R13) and + * R8(R14). R9(R15) spill could be made conditional, but there is only * one 'bpf_error' return path out of helper functions inside bpf_jit.S * The overhead of extra spill is negligible for any filter other * than synthetic ones. Therefore not worth adding complexity. 
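Editorial aside (not part of this patch): the reg2hex[] table and the
add_2reg()/add_2mod() helpers above combine into raw x86-64 instruction
bytes. A minimal standalone sketch of the 3-byte 'mov dst, src' encoding
that the EMIT_mov() macro produces, with the two relevant table entries
hard-coded for the demo:

  #include <stdio.h>

  /* Demo copy of two reg2hex[] entries: BPF_REG_0 -> RAX (0),
   * BPF_REG_6 -> RBX (3). Neither is an extended register, so the
   * REX.W prefix stays at plain 0x48, as add_2mod() would leave it.
   */
  static const int reg2hex_demo[] = { [0] = 0, [6] = 3 };

  /* Same arithmetic as the kernel's add_2reg(): fold dst into the
   * ModRM r/m field and src into the reg field.
   */
  static unsigned char add_2reg_demo(unsigned char byte, int dst, int src)
  {
          return byte + reg2hex_demo[dst] + (reg2hex_demo[src] << 3);
  }

  int main(void)
  {
          /* mov rbx, rax: REX.W prefix 0x48, opcode 0x89, then ModRM */
          printf("%02x %02x %02x\n", 0x48, 0x89, add_2reg_demo(0xC0, 6, 0));
          return 0;       /* prints: 48 89 c3 */
  }

These are exactly the bytes EMIT_mov(BPF_REG_6, BPF_REG_0) emits. The
patch resumes below.
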
@@ -243,9 +254,10 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) EMIT4(0x4C, 0x89, 0x7D, 24); if (!ebpf_from_cbpf) { - /* Clear the tail call counter (tail_call_cnt): for eBPF tail + /* + * Clear the tail call counter (tail_call_cnt): for eBPF tail * calls we need to reset the counter to 0. It's done in two - * instructions, resetting rax register to 0, and moving it + * instructions, resetting RAX register to 0, and moving it * to the counter location. */ @@ -260,7 +272,9 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) *pprog = prog; } -/* generate the following code: +/* + * Generate the following code: + * * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... * if (index >= array->map.max_entries) * goto out; @@ -278,23 +292,26 @@ static void emit_bpf_tail_call(u8 **pprog) int label1, label2, label3; int cnt = 0; - /* rdi - pointer to ctx + /* + * rdi - pointer to ctx * rsi - pointer to bpf_array * rdx - index in bpf_array */ - /* if (index >= array->map.max_entries) - * goto out; + /* + * if (index >= array->map.max_entries) + * goto out; */ EMIT2(0x89, 0xD2); /* mov edx, edx */ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); -#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* number of bytes to jump */ +#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* Number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; - /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) - * goto out; + /* + * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; */ EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ @@ -308,8 +325,9 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ offsetof(struct bpf_array, ptrs)); - /* if (prog == NULL) - * goto out; + /* + * if (prog == NULL) + * goto out; */ EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ #define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE) @@ -321,7 +339,8 @@ static void emit_bpf_tail_call(u8 **pprog) offsetof(struct bpf_prog, bpf_func)); EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */ - /* now we're ready to jump into next BPF program + /* + * Wow we're ready to jump into next BPF program * rdi == ctx (1st arg) * rax == prog->bpf_func + prologue_size */ @@ -340,7 +359,8 @@ static void emit_load_skb_data_hlen(u8 **pprog) u8 *prog = *pprog; int cnt = 0; - /* r9d = skb->len - skb->data_len (headlen) + /* + * r9d = skb->len - skb->data_len (headlen) * r10 = skb->data */ /* mov %r9d, off32(%rdi) */ @@ -361,7 +381,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate, u8 b1, b2, b3; int cnt = 0; - /* optimization: if imm32 is positive, use 'mov %eax, imm32' + /* + * Optimization: if imm32 is positive, use 'mov %eax, imm32' * (which zero-extends imm32) to save 2 bytes. */ if (sign_propagate && (s32)imm32 < 0) { @@ -373,7 +394,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate, goto done; } - /* optimization: if imm32 is zero, use 'xor %eax, %eax' + /* + * Optimization: if imm32 is zero, use 'xor %eax, %eax' * to save 3 bytes. 
*/ if (imm32 == 0) { @@ -400,7 +422,8 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, int cnt = 0; if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { - /* For emitting plain u32, where sign bit must not be + /* + * For emitting plain u32, where sign bit must not be * propagated LLVM tends to load imm64 over mov32 * directly, so save couple of bytes by just doing * 'mov %eax, imm32' instead. @@ -525,7 +548,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, else if (is_ereg(dst_reg)) EMIT1(add_1mod(0x40, dst_reg)); - /* b3 holds 'normal' opcode, b2 short form only valid + /* + * b3 holds 'normal' opcode, b2 short form only valid * in case dst is eax/rax. */ switch (BPF_OP(insn->code)) { @@ -593,7 +617,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, /* mov rax, dst_reg */ EMIT_mov(BPF_REG_0, dst_reg); - /* xor edx, edx + /* + * xor edx, edx * equivalent to 'xor rdx, rdx', but one byte less */ EMIT2(0x31, 0xd2); @@ -655,7 +680,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, } break; } - /* shifts */ + /* Shifts */ case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_K: case BPF_ALU | BPF_ARSH | BPF_K: @@ -686,7 +711,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU64 | BPF_RSH | BPF_X: case BPF_ALU64 | BPF_ARSH | BPF_X: - /* check for bad case when dst_reg == rcx */ + /* Check for bad case when dst_reg == rcx */ if (dst_reg == BPF_REG_4) { /* mov r11, dst_reg */ EMIT_mov(AUX_REG, dst_reg); @@ -724,13 +749,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU | BPF_END | BPF_FROM_BE: switch (imm32) { case 16: - /* emit 'ror %ax, 8' to swap lower 2 bytes */ + /* Emit 'ror %ax, 8' to swap lower 2 bytes */ EMIT1(0x66); if (is_ereg(dst_reg)) EMIT1(0x41); EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8); - /* emit 'movzwl eax, ax' */ + /* Emit 'movzwl eax, ax' */ if (is_ereg(dst_reg)) EMIT3(0x45, 0x0F, 0xB7); else @@ -738,7 +763,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT1(add_2reg(0xC0, dst_reg, dst_reg)); break; case 32: - /* emit 'bswap eax' to swap lower 4 bytes */ + /* Emit 'bswap eax' to swap lower 4 bytes */ if (is_ereg(dst_reg)) EMIT2(0x41, 0x0F); else @@ -746,7 +771,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT1(add_1reg(0xC8, dst_reg)); break; case 64: - /* emit 'bswap rax' to swap 8 bytes */ + /* Emit 'bswap rax' to swap 8 bytes */ EMIT3(add_1mod(0x48, dst_reg), 0x0F, add_1reg(0xC8, dst_reg)); break; @@ -756,7 +781,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU | BPF_END | BPF_FROM_LE: switch (imm32) { case 16: - /* emit 'movzwl eax, ax' to zero extend 16-bit + /* + * Emit 'movzwl eax, ax' to zero extend 16-bit * into 64 bit */ if (is_ereg(dst_reg)) @@ -766,7 +792,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT1(add_2reg(0xC0, dst_reg, dst_reg)); break; case 32: - /* emit 'mov eax, eax' to clear upper 32-bits */ + /* Emit 'mov eax, eax' to clear upper 32-bits */ if (is_ereg(dst_reg)) EMIT1(0x45); EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg)); @@ -809,9 +835,9 @@ st: if (is_imm8(insn->off)) /* STX: *(u8*)(dst_reg + off) = src_reg */ case BPF_STX | BPF_MEM | BPF_B: - /* emit 'mov byte ptr [rax + off], al' */ + /* Emit 'mov byte ptr [rax + off], al' */ if (is_ereg(dst_reg) || is_ereg(src_reg) || - /* have to add extra byte for x86 SIL, DIL regs */ + /* We have to add extra byte for x86 SIL, DIL regs */ src_reg 
== BPF_REG_1 || src_reg == BPF_REG_2) EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88); else @@ -840,25 +866,26 @@ stx: if (is_imm8(insn->off)) /* LDX: dst_reg = *(u8*)(src_reg + off) */ case BPF_LDX | BPF_MEM | BPF_B: - /* emit 'movzx rax, byte ptr [rax + off]' */ + /* Emit 'movzx rax, byte ptr [rax + off]' */ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6); goto ldx; case BPF_LDX | BPF_MEM | BPF_H: - /* emit 'movzx rax, word ptr [rax + off]' */ + /* Emit 'movzx rax, word ptr [rax + off]' */ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7); goto ldx; case BPF_LDX | BPF_MEM | BPF_W: - /* emit 'mov eax, dword ptr [rax+0x14]' */ + /* Emit 'mov eax, dword ptr [rax+0x14]' */ if (is_ereg(dst_reg) || is_ereg(src_reg)) EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B); else EMIT1(0x8B); goto ldx; case BPF_LDX | BPF_MEM | BPF_DW: - /* emit 'mov rax, qword ptr [rax+0x14]' */ + /* Emit 'mov rax, qword ptr [rax+0x14]' */ EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); -ldx: /* if insn->off == 0 we can save one extra byte, but - * special case of x86 r13 which always needs an offset +ldx: /* + * If insn->off == 0 we can save one extra byte, but + * special case of x86 R13 which always needs an offset * is not worth the hassle */ if (is_imm8(insn->off)) @@ -870,7 +897,7 @@ ldx: /* if insn->off == 0 we can save one extra byte, but /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ case BPF_STX | BPF_XADD | BPF_W: - /* emit 'lock add dword ptr [rax + off], eax' */ + /* Emit 'lock add dword ptr [rax + off], eax' */ if (is_ereg(dst_reg) || is_ereg(src_reg)) EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01); else @@ -897,14 +924,15 @@ xadd: if (is_imm8(insn->off)) } else { EMIT2(0x41, 0x52); /* push %r10 */ EMIT2(0x41, 0x51); /* push %r9 */ - /* need to adjust jmp offset, since + /* + * We need to adjust jmp offset, since * pop %r9, pop %r10 take 4 bytes after call insn */ jmp_offset += 4; } } if (!imm32 || !is_simm32(jmp_offset)) { - pr_err("unsupported bpf func %d addr %p image %p\n", + pr_err("unsupported BPF func %d addr %p image %p\n", imm32, func, image); return -EINVAL; } @@ -970,7 +998,7 @@ xadd: if (is_imm8(insn->off)) else EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32); -emit_cond_jmp: /* convert BPF opcode to x86 */ +emit_cond_jmp: /* Convert BPF opcode to x86 */ switch (BPF_OP(insn->code)) { case BPF_JEQ: jmp_cond = X86_JE; @@ -996,22 +1024,22 @@ emit_cond_jmp: /* convert BPF opcode to x86 */ jmp_cond = X86_JBE; break; case BPF_JSGT: - /* signed '>', GT in x86 */ + /* Signed '>', GT in x86 */ jmp_cond = X86_JG; break; case BPF_JSLT: - /* signed '<', LT in x86 */ + /* Signed '<', LT in x86 */ jmp_cond = X86_JL; break; case BPF_JSGE: - /* signed '>=', GE in x86 */ + /* Signed '>=', GE in x86 */ jmp_cond = X86_JGE; break; case BPF_JSLE: - /* signed '<=', LE in x86 */ + /* Signed '<=', LE in x86 */ jmp_cond = X86_JLE; break; - default: /* to silence gcc warning */ + default: /* to silence GCC warning */ return -EFAULT; } jmp_offset = addrs[i + insn->off] - addrs[i]; @@ -1039,7 +1067,7 @@ emit_cond_jmp: /* convert BPF opcode to x86 */ jmp_offset = addrs[i + insn->off] - addrs[i]; if (!jmp_offset) - /* optimize out nop jumps */ + /* Optimize out nop jumps */ break; emit_jmp: if (is_imm8(jmp_offset)) { @@ -1061,7 +1089,7 @@ common_load: ctx->seen_ld_abs = seen_ld_abs = true; jmp_offset = func - (image + addrs[i]); if (!func || !is_simm32(jmp_offset)) { - pr_err("unsupported bpf func %d addr %p image %p\n", + pr_err("unsupported BPF func %d addr %p image %p\n", imm32, func, image); return 
-EINVAL; } @@ -1080,7 +1108,8 @@ common_load: EMIT2_off32(0x81, 0xC6, imm32); } } - /* skb pointer is in R6 (%rbx), it will be copied into + /* + * skb pointer is in R6 (%rbx), it will be copied into * %rdi if skb_copy_bits() call is necessary. * sk_load_* helpers also use %r10 and %r9d. * See bpf_jit.S @@ -1111,7 +1140,7 @@ common_load: goto emit_jmp; } seen_exit = true; - /* update cleanup_addr */ + /* Update cleanup_addr */ ctx->cleanup_addr = proglen; /* mov rbx, qword ptr [rbp+0] */ EMIT4(0x48, 0x8B, 0x5D, 0); @@ -1129,10 +1158,11 @@ common_load: break; default: - /* By design x64 JIT should support all BPF instructions + /* + * By design x86-64 JIT should support all BPF instructions. * This error will be seen if new instruction was added - * to interpreter, but not to JIT - * or if there is junk in bpf_prog + * to the interpreter, but not to the JIT, or if there is + * junk in bpf_prog. */ pr_err("bpf_jit: unknown opcode %02x\n", insn->code); return -EINVAL; @@ -1184,7 +1214,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) return orig_prog; tmp = bpf_jit_blind_constants(prog); - /* If blinding was requested and we failed during blinding, + /* + * If blinding was requested and we failed during blinding, * we must fall back to the interpreter. */ if (IS_ERR(tmp)) @@ -1218,8 +1249,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) goto out_addrs; } - /* Before first pass, make a rough estimation of addrs[] - * each bpf instruction is translated to less than 64 bytes + /* + * Before first pass, make a rough estimation of addrs[] + * each BPF instruction is translated to less than 64 bytes */ for (proglen = 0, i = 0; i < prog->len; i++) { proglen += 64; @@ -1228,10 +1260,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) ctx.cleanup_addr = proglen; skip_init_addrs: - /* JITed image shrinks with every pass and the loop iterates - * until the image stops shrinking. Very large bpf programs + /* + * JITed image shrinks with every pass and the loop iterates + * until the image stops shrinking. Very large BPF programs * may converge on the last pass. In such case do one more - * pass to emit the final image + * pass to emit the final image. */ for (pass = 0; pass < 20 || image; pass++) { proglen = do_jit(prog, addrs, image, oldproglen, &ctx); From 6f96674dbd8ca659769f1c65ca15638e50b69341 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 2 May 2018 14:20:24 +0100 Subject: [PATCH 23/66] bpf: relax constraints on formatting for eBPF helper documentation The Python script used to parse and extract eBPF helpers documentation from include/uapi/linux/bpf.h expects a very specific formatting for the descriptions (single dot represents a space, '>' stands for a tab): /* ... *.int bpf_helper(list of arguments) *.> Description *.> > Start of description *.> > Another line of description *.> > And yet another line of description *.> Return *.> > 0 on success, or a negative error in case of failure ... */ This is too strict, and painful for developers who want to add documentation for new helpers. Worse, it is extremely difficult to check that the formatting is correct during reviews. Change the format expected by the script and make it more flexible. The script now works whether or not the initial space (right after the star) is present, and accepts both tabs and white spaces (or a combination of both) for indenting description sections and contents. Concretely, something like the following would now be supported: /* ...
*int bpf_helper(list of arguments) *......Description *.> > Start of description... *> > Another line of description *..............And yet another line of description *> Return *.> ........0 on success, or a negative error in case of failure ... */ While at it, remove unnecessary carets from each regex used with match() in the script. They are redundant, as match() tries to match from the beginning of the string by default. v2: Remove unnecessary caret when a regex is used with match(). Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- scripts/bpf_helpers_doc.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 30ba0fee36e4..8f59897fbda1 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -39,9 +39,9 @@ class Helper(object): Break down helper function protocol into smaller chunks: return type, name, distinct arguments. """ - arg_re = re.compile('^((const )?(struct )?(\w+|...))( (\**)(\w+))?$') + arg_re = re.compile('((const )?(struct )?(\w+|...))( (\**)(\w+))?$') res = {} - proto_re = re.compile('^(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$') + proto_re = re.compile('(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$') capture = proto_re.match(self.proto) res['ret_type'] = capture.group(1) @@ -87,7 +87,7 @@ class HeaderParser(object): # - Same as above, with "const" and/or "struct" in front of type # - "..." (undefined number of arguments, for bpf_trace_printk()) # There is at least one term ("void"), and at most five arguments. - p = re.compile('^ \* ((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$') + p = re.compile(' \* ?((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$') capture = p.match(self.line) if not capture: raise NoHelperFound @@ -95,7 +95,7 @@ class HeaderParser(object): return capture.group(1) def parse_desc(self): - p = re.compile('^ \* \tDescription$') + p = re.compile(' \* ?(?:\t| {6,8})Description$') capture = p.match(self.line) if not capture: # Helper can have empty description and we might be parsing another @@ -109,7 +109,7 @@ class HeaderParser(object): if self.line == ' *\n': desc += '\n' else: - p = re.compile('^ \* \t\t(.*)') + p = re.compile(' \* ?(?:\t| {6,8})(?:\t| {8})(.*)') capture = p.match(self.line) if capture: desc += capture.group(1) + '\n' @@ -118,7 +118,7 @@ class HeaderParser(object): return desc def parse_ret(self): - p = re.compile('^ \* \tReturn$') + p = re.compile(' \* ?(?:\t| {6,8})Return$') capture = p.match(self.line) if not capture: # Helper can have empty retval and we might be parsing another @@ -132,7 +132,7 @@ class HeaderParser(object): if self.line == ' *\n': ret += '\n' else: - p = re.compile('^ \* \t\t(.*)') + p = re.compile(' \* ?(?:\t| {6,8})(?:\t| {8})(.*)') capture = p.match(self.line) if capture: ret += capture.group(1) + '\n' From 03f5781be2c7b7e728d724ac70ba10799cc710d7 Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Thu, 3 May 2018 14:10:43 +0800 Subject: [PATCH 24/66] bpf, x86_32: add eBPF JIT compiler for ia32 The JIT compiler emits ia32 (32-bit) instructions. Currently, it supports eBPF only. Classic BPF is supported because of the conversion by BPF core. Almost all instructions from the eBPF ISA are supported, except for the following:

BPF_ALU64 | BPF_DIV | BPF_K
BPF_ALU64 | BPF_DIV | BPF_X
BPF_ALU64 | BPF_MOD | BPF_K
BPF_ALU64 | BPF_MOD | BPF_X
BPF_STX | BPF_XADD | BPF_W
BPF_STX | BPF_XADD | BPF_DW

It doesn't support BPF_JMP|BPF_CALL with BPF_PSEUDO_CALL at the moment.
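[Illustrative aside, not part of the patch: one way to see why the 64-bit divide and modulo opcodes are awkward here is that the ia32 'div' instruction only performs a 64/32 -> 32 divide, so a full 64-by-64 divide has no short inline sequence; compilers lower it to a runtime helper instead. A user-space sketch:

	/* Building this with gcc -m32 -S shows a call to the libgcc
	 * helper __udivdi3 rather than inline 'div' instructions.
	 */
	unsigned long long udiv64(unsigned long long a, unsigned long long b)
	{
		return a / b;
	}

Emitting such helper calls from the JIT would require extra call-clobber handling, which suggests why these opcodes are left unimplemented here.]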
IA32 has only a few general purpose registers: EAX|EDX|ECX|EBX|ESI|EDI. I use EAX|EDX|ECX|EBX as temporary registers to simulate instructions in the eBPF ISA, and allocate ESI|EDI to BPF_REG_AX for constant blinding; all other eBPF registers, R0-R10, are simulated through scratch space on the stack.

The reasons behind the hardware register allocation policy are:

1:MUL needs EAX:EDX and shift operations need ECX, so they aren't
  suitable for general eBPF 64bit register simulation.
2:We need at least 4 registers to simulate most eBPF ISA operations
  on register operands instead of on register&memory operands.
3:We need to keep BPF_REG_AX in hardware registers, or constant
  blinding will degrade JIT performance heavily.

Tested on PC (Intel(R) Core(TM) i5-5200U CPU).

Testing results on i5-5200U:

1) test_bpf: Summary: 349 PASSED, 0 FAILED, [319/341 JIT'ed]
2) test_progs: Summary: 83 PASSED, 0 FAILED.
3) test_lpm: OK
4) test_lru_map: OK
5) test_verifier: Summary: 828 PASSED, 0 FAILED.

The above tests are all done under the following two conditions separately:

1:bpf_jit_enable=1 and bpf_jit_harden=0
2:bpf_jit_enable=1 and bpf_jit_harden=2

Below are some numbers for this JIT implementation:

Note: I ran test_progs in kselftest 100 times continuously for every condition; the numbers are in the format total/times=avg. The numbers that test_bpf reports show almost the same relation.

a:jit_enable=0 and jit_harden=0
b:jit_enable=1 and jit_harden=0

test_pkt_access:PASS:ipv4:15622/100=156
test_pkt_access:PASS:ipv4:10674/100=106
test_pkt_access:PASS:ipv6:9130/100=91
test_pkt_access:PASS:ipv6:4855/100=48
test_xdp:PASS:ipv4:240198/100=2401
test_xdp:PASS:ipv4:138912/100=1389
test_xdp:PASS:ipv6:137326/100=1373
test_xdp:PASS:ipv6:68542/100=685
test_l4lb:PASS:ipv4:61100/100=611
test_l4lb:PASS:ipv4:37302/100=373
test_l4lb:PASS:ipv6:101000/100=1010
test_l4lb:PASS:ipv6:55030/100=550

c:jit_enable=1 and jit_harden=2

test_pkt_access:PASS:ipv4:10558/100=105
test_pkt_access:PASS:ipv6:5092/100=50
test_xdp:PASS:ipv4:131902/100=1319
test_xdp:PASS:ipv6:77932/100=779
test_l4lb:PASS:ipv4:38924/100=389
test_l4lb:PASS:ipv6:57520/100=575

The numbers show we get 30%~50% improvement.

See Documentation/networking/filter.txt for more information.

Changelog:

Changes v5-v6:
1:Add do {} while (0) to RETPOLINE_RAX_BPF_JIT for consistency reasons.
2:Clean up non-standard comments, reported by Daniel Borkmann.
3:Fix a memory leak issue, reported by Daniel Borkmann.

Changes v4-v5:
1:Delete is_on_stack; BPF_REG_AX is the only one kept in real hardware
  registers, so just check against it.
2:Apply commit 1612a981b766 ("bpf, x64: fix JIT emission for dead code"),
  suggested by Daniel Borkmann.

Changes v3-v4:
1:Fix changelog in commit. After installing llvm-6.0, test_progs no
  longer reports errors. I submitted another patch: "bpf: fix misaligned
  access for BPF_PROG_TYPE_PERF_EVENT program type on x86_32 platform"
  to fix another problem; after that patch, test_verifier no longer
  reports errors either.
2:Fix clearing r0[1] twice unnecessarily in *BPF_IND|BPF_ABS* simulation.

Changes v2-v3:
1:Move BPF_REG_AX to real hardware registers for performance reasons.
3:Use bpf_load_pointer instead of bpf_jit32.S, suggested by Daniel Borkmann.
4:Delete partial code from 1c2a088a6626, suggested by Daniel Borkmann.
5:Some bug fixes and comment improvements.

Changes v1-v2:
1:Fix bug in emit_ia32_neg64.
2:Fix bug in emit_ia32_arsh_r64.
3:Delete filename in top level comment, suggested by Thomas Gleixner.
4:Delete unnecessary boiler plate text, suggested by Thomas Gleixner.
5:Rewrite some words in changelog.
6:CodingStyle improvements and a few more comments. Signed-off-by: Wang YanQing Signed-off-by: Daniel Borkmann --- Documentation/sysctl/net.txt | 1 + arch/x86/Kconfig | 2 +- arch/x86/include/asm/nospec-branch.h | 30 +- arch/x86/net/Makefile | 8 +- arch/x86/net/bpf_jit_comp32.c | 2553 ++++++++++++++++++++++++++ 5 files changed, 2588 insertions(+), 6 deletions(-) create mode 100644 arch/x86/net/bpf_jit_comp32.c diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 5992602469d8..9ecde517728c 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -45,6 +45,7 @@ through bpf(2) and passing a verifier in the kernel, a JIT will then translate these BPF proglets into native CPU instructions. There are two flavors of JITs, the newer eBPF JIT currently supported on: - x86_64 + - x86_32 - arm64 - arm32 - ppc64 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 00fcf81f2c56..1f5fa2f2c168 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -137,7 +137,7 @@ config X86 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS - select HAVE_EBPF_JIT if X86_64 + select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EXIT_THREAD select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index f928ad9b143f..2cd344d1a6e5 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -291,16 +291,20 @@ do { \ * lfence * jmp spec_trap * do_rop: - * mov %rax,(%rsp) + * mov %rax,(%rsp) for x86_64 + * mov %edx,(%esp) for x86_32 * retq * * Without retpolines configured: * - * jmp *%rax + * jmp *%rax for x86_64 + * jmp *%edx for x86_32 */ #ifdef CONFIG_RETPOLINE +#ifdef CONFIG_X86_64 # define RETPOLINE_RAX_BPF_JIT_SIZE 17 # define RETPOLINE_RAX_BPF_JIT() \ +do { \ EMIT1_off32(0xE8, 7); /* callq do_rop */ \ /* spec_trap: */ \ EMIT2(0xF3, 0x90); /* pause */ \ @@ -308,11 +312,31 @@ do { \ EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ /* do_rop: */ \ EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \ - EMIT1(0xC3); /* retq */ + EMIT1(0xC3); /* retq */ \ +} while (0) #else +# define RETPOLINE_EDX_BPF_JIT() \ +do { \ + EMIT1_off32(0xE8, 7); /* call do_rop */ \ + /* spec_trap: */ \ + EMIT2(0xF3, 0x90); /* pause */ \ + EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ + EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ + /* do_rop: */ \ + EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */ \ + EMIT1(0xC3); /* ret */ \ +} while (0) +#endif +#else /* !CONFIG_RETPOLINE */ + +#ifdef CONFIG_X86_64 # define RETPOLINE_RAX_BPF_JIT_SIZE 2 # define RETPOLINE_RAX_BPF_JIT() \ EMIT2(0xFF, 0xE0); /* jmp *%rax */ +#else +# define RETPOLINE_EDX_BPF_JIT() \ + EMIT2(0xFF, 0xE2) /* jmp *%edx */ +#endif #endif #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile index fefb4b619598..c6b464a261bb 100644 --- a/arch/x86/net/Makefile +++ b/arch/x86/net/Makefile @@ -1,6 +1,10 @@ # # Arch-specific network modules # -OBJECT_FILES_NON_STANDARD_bpf_jit.o += y -obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o +ifeq ($(CONFIG_X86_32),y) + obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o +else + OBJECT_FILES_NON_STANDARD_bpf_jit.o += y + obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o +endif diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c new file mode 100644 index 000000000000..61e61341b777 --- /dev/null +++ b/arch/x86/net/bpf_jit_comp32.c @@ -0,0 +1,2553 @@ +// SPDX-License-Identifier: GPL-2.0 +/*
+ * Just-In-Time compiler for eBPF filters on IA32 (32bit x86)
+ *
+ * Author: Wang YanQing (udknight@gmail.com)
+ * The code is based on code and ideas from:
+ * Eric Dumazet (eric.dumazet@gmail.com)
+ * and from:
+ * Shubham Bansal
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * eBPF prog stack layout:
+ *
+ *                      high
+ * original ESP =>     +-----+
+ *                     |     | callee saved registers
+ *                     +-----+
+ *                     | ... | eBPF JIT scratch space
+ * BPF_FP,IA32_EBP =>  +-----+
+ *                     | ... | eBPF prog stack
+ *                     +-----+
+ *                     |RSVD | JIT scratchpad
+ * current ESP =>      +-----+
+ *                     |     |
+ *                     | ... | Function call stack
+ *                     |     |
+ *                     +-----+
+ *                      low
+ *
+ * The callee saved registers:
+ *
+ *                      high
+ * original ESP =>     +------------------+ \
+ *                     |       ebp        | |
+ * current EBP =>      +------------------+ } callee saved registers
+ *                     |   ebx,esi,edi    | |
+ *                     +------------------+ /
+ *                      low
+ */
+
+static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+{
+	if (len == 1)
+		*ptr = bytes;
+	else if (len == 2)
+		*(u16 *)ptr = bytes;
+	else {
+		*(u32 *)ptr = bytes;
+		barrier();
+	}
+	return ptr + len;
+}
+
+#define EMIT(bytes, len) \
+	do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
+
+#define EMIT1(b1)		EMIT(b1, 1)
+#define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
+#define EMIT3(b1, b2, b3)	EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
+#define EMIT4(b1, b2, b3, b4)	\
+	EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+
+#define EMIT1_off32(b1, off) \
+	do { EMIT1(b1); EMIT(off, 4); } while (0)
+#define EMIT2_off32(b1, b2, off) \
+	do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
+#define EMIT3_off32(b1, b2, b3, off) \
+	do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+#define EMIT4_off32(b1, b2, b3, b4, off) \
+	do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+
+#define jmp_label(label, jmp_insn_len) (label - cnt - jmp_insn_len)
+
+static bool is_imm8(int value)
+{
+	return value <= 127 && value >= -128;
+}
+
+static bool is_simm32(s64 value)
+{
+	return value == (s64) (s32) value;
+}
+
+#define STACK_OFFSET(k)	(k)
+#define TCALL_CNT	(MAX_BPF_JIT_REG + 0)	/* Tail Call Count */
+
+#define IA32_EAX	(0x0)
+#define IA32_EBX	(0x3)
+#define IA32_ECX	(0x1)
+#define IA32_EDX	(0x2)
+#define IA32_ESI	(0x6)
+#define IA32_EDI	(0x7)
+#define IA32_EBP	(0x5)
+#define IA32_ESP	(0x4)
+
+/*
+ * List of x86 cond jumps opcodes (. + s8)
+ * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
+ */
+#define IA32_JB		0x72
+#define IA32_JAE	0x73
+#define IA32_JE		0x74
+#define IA32_JNE	0x75
+#define IA32_JBE	0x76
+#define IA32_JA		0x77
+#define IA32_JL		0x7C
+#define IA32_JGE	0x7D
+#define IA32_JLE	0x7E
+#define IA32_JG		0x7F
+
+/*
+ * Map eBPF registers to IA32 32bit registers or stack scratch space.
+ *
+ * 1. All the registers, R0-R10, are mapped to scratch space on stack.
+ * 2. We need two 64 bit temp registers to do complex operations on eBPF
+ *    registers.
+ * 3. For performance reasons, BPF_REG_AX, used for constant blinding,
+ *    is mapped to the real hardware register pair, IA32_ESI and IA32_EDI.
+ *
+ * As the eBPF registers are all 64 bit registers and IA32 has only 32 bit
+ * registers, we have to map each eBPF register to two IA32 32 bit regs
+ * or scratch memory space, and we have to build each eBPF 64 bit register
+ * from those.
+ *
+ * We use IA32_EAX, IA32_EDX, IA32_ECX, IA32_EBX as temporary registers.
+ */
+static const u8 bpf2ia32[][2] = {
+	/* Return value from in-kernel function, and exit value from eBPF */
+	[BPF_REG_0] = {STACK_OFFSET(0), STACK_OFFSET(4)},
+
+	/* The arguments from eBPF program to in-kernel function */
+	/* Stored on stack scratch space */
+	[BPF_REG_1] = {STACK_OFFSET(8), STACK_OFFSET(12)},
+	[BPF_REG_2] = {STACK_OFFSET(16), STACK_OFFSET(20)},
+	[BPF_REG_3] = {STACK_OFFSET(24), STACK_OFFSET(28)},
+	[BPF_REG_4] = {STACK_OFFSET(32), STACK_OFFSET(36)},
+	[BPF_REG_5] = {STACK_OFFSET(40), STACK_OFFSET(44)},
+
+	/* Callee saved registers that in-kernel function will preserve */
+	/* Stored on stack scratch space */
+	[BPF_REG_6] = {STACK_OFFSET(48), STACK_OFFSET(52)},
+	[BPF_REG_7] = {STACK_OFFSET(56), STACK_OFFSET(60)},
+	[BPF_REG_8] = {STACK_OFFSET(64), STACK_OFFSET(68)},
+	[BPF_REG_9] = {STACK_OFFSET(72), STACK_OFFSET(76)},
+
+	/* Read only Frame Pointer to access Stack */
+	[BPF_REG_FP] = {STACK_OFFSET(80), STACK_OFFSET(84)},
+
+	/* Temporary register for blinding constants. */
+	[BPF_REG_AX] = {IA32_ESI, IA32_EDI},
+
+	/* Tail call count. Stored on stack scratch space. */
+	[TCALL_CNT] = {STACK_OFFSET(88), STACK_OFFSET(92)},
+};
+
+#define dst_lo	dst[0]
+#define dst_hi	dst[1]
+#define src_lo	src[0]
+#define src_hi	src[1]
+
+#define STACK_ALIGNMENT	8
+/*
+ * Stack space for BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4,
+ * BPF_REG_5, BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9,
+ * BPF_REG_FP, BPF_REG_AX and Tail call counts.
+ */
+#define SCRATCH_SIZE 96
+
+/* Total stack size used in JITed code */
+#define _STACK_SIZE \
+	(stack_depth + \
+	+ SCRATCH_SIZE + \
+	+ 4 /* Extra space for skb_copy_bits buffer */)
+
+#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
+
+/* Get the offset of eBPF REGISTERs stored on scratch space. */
+#define STACK_VAR(off) (off)
+
+/* Offset of skb_copy_bits buffer */
+#define SKB_BUFFER STACK_VAR(SCRATCH_SIZE)
+
+/* Encode 'dst_reg' register into IA32 opcode 'byte' */
+static u8 add_1reg(u8 byte, u32 dst_reg)
+{
+	return byte + dst_reg;
+}
+
+/* Encode 'dst_reg' and 'src_reg' registers into IA32 opcode 'byte' */
+static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
+{
+	return byte + dst_reg + (src_reg << 3);
+}
+
+static void jit_fill_hole(void *area, unsigned int size)
+{
+	/* Fill whole space with int3 instructions */
+	memset(area, 0xcc, size);
+}
+
+static inline void emit_ia32_mov_i(const u8 dst, const u32 val, bool dstk,
+				   u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	if (dstk) {
+		if (val == 0) {
+			/* xor eax,eax */
+			EMIT2(0x33, add_2reg(0xC0, IA32_EAX, IA32_EAX));
+			/* mov dword ptr [ebp+off],eax */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(dst));
+		} else {
+			EMIT3_off32(0xC7, add_1reg(0x40, IA32_EBP),
+				    STACK_VAR(dst), val);
+		}
+	} else {
+		if (val == 0)
+			EMIT2(0x33, add_2reg(0xC0, dst, dst));
+		else
+			EMIT2_off32(0xC7, add_1reg(0xC0, dst),
+				    val);
+	}
+	*pprog = prog;
+}
+
+/* dst = src (4 bytes) */
+static inline void emit_ia32_mov_r(const u8 dst, const u8 src, bool dstk,
+				   bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 sreg = sstk ?
IA32_EAX : src; + + if (sstk) + /* mov eax,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src)); + if (dstk) + /* mov dword ptr [ebp+off],eax */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, sreg), STACK_VAR(dst)); + else + /* mov dst,sreg */ + EMIT2(0x89, add_2reg(0xC0, dst, sreg)); + + *pprog = prog; +} + +/* dst = src */ +static inline void emit_ia32_mov_r64(const bool is64, const u8 dst[], + const u8 src[], bool dstk, + bool sstk, u8 **pprog) +{ + emit_ia32_mov_r(dst_lo, src_lo, dstk, sstk, pprog); + if (is64) + /* complete 8 byte move */ + emit_ia32_mov_r(dst_hi, src_hi, dstk, sstk, pprog); + else + /* zero out high 4 bytes */ + emit_ia32_mov_i(dst_hi, 0, dstk, pprog); +} + +/* Sign extended move */ +static inline void emit_ia32_mov_i64(const bool is64, const u8 dst[], + const u32 val, bool dstk, u8 **pprog) +{ + u32 hi = 0; + + if (is64 && (val & (1<<31))) + hi = (u32)~0; + emit_ia32_mov_i(dst_lo, val, dstk, pprog); + emit_ia32_mov_i(dst_hi, hi, dstk, pprog); +} + +/* + * ALU operation (32 bit) + * dst = dst * src + */ +static inline void emit_ia32_mul_r(const u8 dst, const u8 src, bool dstk, + bool sstk, u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + u8 sreg = sstk ? IA32_ECX : src; + + if (sstk) + /* mov ecx,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src)); + + if (dstk) + /* mov eax,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst)); + else + /* mov eax,dst */ + EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX)); + + + EMIT2(0xF7, add_1reg(0xE0, sreg)); + + if (dstk) + /* mov dword ptr [ebp+off],eax */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst)); + else + /* mov dst,eax */ + EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX)); + + *pprog = prog; +} + +static inline void emit_ia32_to_le_r64(const u8 dst[], s32 val, + bool dstk, u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + u8 dreg_lo = dstk ? IA32_EAX : dst_lo; + u8 dreg_hi = dstk ? IA32_EDX : dst_hi; + + if (dstk && val != 64) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(dst_hi)); + } + switch (val) { + case 16: + /* + * Emit 'movzwl eax,ax' to zero extend 16-bit + * into 64 bit + */ + EMIT2(0x0F, 0xB7); + EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo)); + /* xor dreg_hi,dreg_hi */ + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); + break; + case 32: + /* xor dreg_hi,dreg_hi */ + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); + break; + case 64: + /* nop */ + break; + } + + if (dstk && val != 64) { + /* mov dword ptr [ebp+off],dreg_lo */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo), + STACK_VAR(dst_lo)); + /* mov dword ptr [ebp+off],dreg_hi */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi), + STACK_VAR(dst_hi)); + } + *pprog = prog; +} + +static inline void emit_ia32_to_be_r64(const u8 dst[], s32 val, + bool dstk, u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + u8 dreg_lo = dstk ? IA32_EAX : dst_lo; + u8 dreg_hi = dstk ? 
IA32_EDX : dst_hi; + + if (dstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(dst_hi)); + } + switch (val) { + case 16: + /* Emit 'ror %ax, 8' to swap lower 2 bytes */ + EMIT1(0x66); + EMIT3(0xC1, add_1reg(0xC8, dreg_lo), 8); + + EMIT2(0x0F, 0xB7); + EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo)); + + /* xor dreg_hi,dreg_hi */ + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); + break; + case 32: + /* Emit 'bswap eax' to swap lower 4 bytes */ + EMIT1(0x0F); + EMIT1(add_1reg(0xC8, dreg_lo)); + + /* xor dreg_hi,dreg_hi */ + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); + break; + case 64: + /* Emit 'bswap eax' to swap lower 4 bytes */ + EMIT1(0x0F); + EMIT1(add_1reg(0xC8, dreg_lo)); + + /* Emit 'bswap edx' to swap lower 4 bytes */ + EMIT1(0x0F); + EMIT1(add_1reg(0xC8, dreg_hi)); + + /* mov ecx,dreg_hi */ + EMIT2(0x89, add_2reg(0xC0, IA32_ECX, dreg_hi)); + /* mov dreg_hi,dreg_lo */ + EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo)); + /* mov dreg_lo,ecx */ + EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX)); + + break; + } + if (dstk) { + /* mov dword ptr [ebp+off],dreg_lo */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo), + STACK_VAR(dst_lo)); + /* mov dword ptr [ebp+off],dreg_hi */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi), + STACK_VAR(dst_hi)); + } + *pprog = prog; +} + +/* + * ALU operation (32 bit) + * dst = dst (div|mod) src + */ +static inline void emit_ia32_div_mod_r(const u8 op, const u8 dst, const u8 src, + bool dstk, bool sstk, u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (sstk) + /* mov ecx,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), + STACK_VAR(src)); + else if (src != IA32_ECX) + /* mov ecx,src */ + EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX)); + + if (dstk) + /* mov eax,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst)); + else + /* mov eax,dst */ + EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX)); + + /* xor edx,edx */ + EMIT2(0x31, add_2reg(0xC0, IA32_EDX, IA32_EDX)); + /* div ecx */ + EMIT2(0xF7, add_1reg(0xF0, IA32_ECX)); + + if (op == BPF_MOD) { + if (dstk) + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(dst)); + else + EMIT2(0x89, add_2reg(0xC0, dst, IA32_EDX)); + } else { + if (dstk) + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst)); + else + EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX)); + } + *pprog = prog; +} + +/* + * ALU operation (32 bit) + * dst = dst (shift) src + */ +static inline void emit_ia32_shift_r(const u8 op, const u8 dst, const u8 src, + bool dstk, bool sstk, u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + u8 dreg = dstk ? 
IA32_EAX : dst; + u8 b2; + + if (dstk) + /* mov eax,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst)); + + if (sstk) + /* mov ecx,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src)); + else if (src != IA32_ECX) + /* mov ecx,src */ + EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX)); + + switch (op) { + case BPF_LSH: + b2 = 0xE0; break; + case BPF_RSH: + b2 = 0xE8; break; + case BPF_ARSH: + b2 = 0xF8; break; + default: + return; + } + EMIT2(0xD3, add_1reg(b2, dreg)); + + if (dstk) + /* mov dword ptr [ebp+off],dreg */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg), STACK_VAR(dst)); + *pprog = prog; +} + +/* + * ALU operation (32 bit) + * dst = dst (op) src + */ +static inline void emit_ia32_alu_r(const bool is64, const bool hi, const u8 op, + const u8 dst, const u8 src, bool dstk, + bool sstk, u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + u8 sreg = sstk ? IA32_EAX : src; + u8 dreg = dstk ? IA32_EDX : dst; + + if (sstk) + /* mov eax,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src)); + + if (dstk) + /* mov eax,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(dst)); + + switch (BPF_OP(op)) { + /* dst = dst + src */ + case BPF_ADD: + if (hi && is64) + EMIT2(0x11, add_2reg(0xC0, dreg, sreg)); + else + EMIT2(0x01, add_2reg(0xC0, dreg, sreg)); + break; + /* dst = dst - src */ + case BPF_SUB: + if (hi && is64) + EMIT2(0x19, add_2reg(0xC0, dreg, sreg)); + else + EMIT2(0x29, add_2reg(0xC0, dreg, sreg)); + break; + /* dst = dst | src */ + case BPF_OR: + EMIT2(0x09, add_2reg(0xC0, dreg, sreg)); + break; + /* dst = dst & src */ + case BPF_AND: + EMIT2(0x21, add_2reg(0xC0, dreg, sreg)); + break; + /* dst = dst ^ src */ + case BPF_XOR: + EMIT2(0x31, add_2reg(0xC0, dreg, sreg)); + break; + } + + if (dstk) + /* mov dword ptr [ebp+off],dreg */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg), + STACK_VAR(dst)); + *pprog = prog; +} + +/* ALU operation (64 bit) */ +static inline void emit_ia32_alu_r64(const bool is64, const u8 op, + const u8 dst[], const u8 src[], + bool dstk, bool sstk, + u8 **pprog) +{ + u8 *prog = *pprog; + + emit_ia32_alu_r(is64, false, op, dst_lo, src_lo, dstk, sstk, &prog); + if (is64) + emit_ia32_alu_r(is64, true, op, dst_hi, src_hi, dstk, sstk, + &prog); + else + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); + *pprog = prog; +} + +/* + * ALU operation (32 bit) + * dst = dst (op) val + */ +static inline void emit_ia32_alu_i(const bool is64, const bool hi, const u8 op, + const u8 dst, const s32 val, bool dstk, + u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + u8 dreg = dstk ? 
IA32_EAX : dst; + u8 sreg = IA32_EDX; + + if (dstk) + /* mov eax,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst)); + + if (!is_imm8(val)) + /* mov edx,imm32*/ + EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EDX), val); + + switch (op) { + /* dst = dst + val */ + case BPF_ADD: + if (hi && is64) { + if (is_imm8(val)) + EMIT3(0x83, add_1reg(0xD0, dreg), val); + else + EMIT2(0x11, add_2reg(0xC0, dreg, sreg)); + } else { + if (is_imm8(val)) + EMIT3(0x83, add_1reg(0xC0, dreg), val); + else + EMIT2(0x01, add_2reg(0xC0, dreg, sreg)); + } + break; + /* dst = dst - val */ + case BPF_SUB: + if (hi && is64) { + if (is_imm8(val)) + EMIT3(0x83, add_1reg(0xD8, dreg), val); + else + EMIT2(0x19, add_2reg(0xC0, dreg, sreg)); + } else { + if (is_imm8(val)) + EMIT3(0x83, add_1reg(0xE8, dreg), val); + else + EMIT2(0x29, add_2reg(0xC0, dreg, sreg)); + } + break; + /* dst = dst | val */ + case BPF_OR: + if (is_imm8(val)) + EMIT3(0x83, add_1reg(0xC8, dreg), val); + else + EMIT2(0x09, add_2reg(0xC0, dreg, sreg)); + break; + /* dst = dst & val */ + case BPF_AND: + if (is_imm8(val)) + EMIT3(0x83, add_1reg(0xE0, dreg), val); + else + EMIT2(0x21, add_2reg(0xC0, dreg, sreg)); + break; + /* dst = dst ^ val */ + case BPF_XOR: + if (is_imm8(val)) + EMIT3(0x83, add_1reg(0xF0, dreg), val); + else + EMIT2(0x31, add_2reg(0xC0, dreg, sreg)); + break; + case BPF_NEG: + EMIT2(0xF7, add_1reg(0xD8, dreg)); + break; + } + + if (dstk) + /* mov dword ptr [ebp+off],dreg */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg), + STACK_VAR(dst)); + *pprog = prog; +} + +/* ALU operation (64 bit) */ +static inline void emit_ia32_alu_i64(const bool is64, const u8 op, + const u8 dst[], const u32 val, + bool dstk, u8 **pprog) +{ + u8 *prog = *pprog; + u32 hi = 0; + + if (is64 && (val & (1<<31))) + hi = (u32)~0; + + emit_ia32_alu_i(is64, false, op, dst_lo, val, dstk, &prog); + if (is64) + emit_ia32_alu_i(is64, true, op, dst_hi, hi, dstk, &prog); + else + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); + + *pprog = prog; +} + +/* dst = ~dst (64 bit) */ +static inline void emit_ia32_neg64(const u8 dst[], bool dstk, u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + u8 dreg_lo = dstk ? IA32_EAX : dst_lo; + u8 dreg_hi = dstk ? IA32_EDX : dst_hi; + + if (dstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(dst_hi)); + } + + /* xor ecx,ecx */ + EMIT2(0x31, add_2reg(0xC0, IA32_ECX, IA32_ECX)); + /* sub dreg_lo,ecx */ + EMIT2(0x2B, add_2reg(0xC0, dreg_lo, IA32_ECX)); + /* mov dreg_lo,ecx */ + EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX)); + + /* xor ecx,ecx */ + EMIT2(0x31, add_2reg(0xC0, IA32_ECX, IA32_ECX)); + /* sbb dreg_hi,ecx */ + EMIT2(0x19, add_2reg(0xC0, dreg_hi, IA32_ECX)); + /* mov dreg_hi,ecx */ + EMIT2(0x89, add_2reg(0xC0, dreg_hi, IA32_ECX)); + + if (dstk) { + /* mov dword ptr [ebp+off],dreg_lo */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo), + STACK_VAR(dst_lo)); + /* mov dword ptr [ebp+off],dreg_hi */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi), + STACK_VAR(dst_hi)); + } + *pprog = prog; +} + +/* dst = dst << src */ +static inline void emit_ia32_lsh_r64(const u8 dst[], const u8 src[], + bool dstk, bool sstk, u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + static int jmp_label1 = -1; + static int jmp_label2 = -1; + static int jmp_label3 = -1; + u8 dreg_lo = dstk ? IA32_EAX : dst_lo; + u8 dreg_hi = dstk ? 
IA32_EDX : dst_hi; + + if (dstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(dst_hi)); + } + + if (sstk) + /* mov ecx,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), + STACK_VAR(src_lo)); + else + /* mov ecx,src_lo */ + EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX)); + + /* cmp ecx,32 */ + EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); + /* Jumps when >= 32 */ + if (is_imm8(jmp_label(jmp_label1, 2))) + EMIT2(IA32_JAE, jmp_label(jmp_label1, 2)); + else + EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6)); + + /* < 32 */ + /* shl dreg_hi,cl */ + EMIT2(0xD3, add_1reg(0xE0, dreg_hi)); + /* mov ebx,dreg_lo */ + EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX)); + /* shl dreg_lo,cl */ + EMIT2(0xD3, add_1reg(0xE0, dreg_lo)); + + /* IA32_ECX = -IA32_ECX + 32 */ + /* neg ecx */ + EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); + /* add ecx,32 */ + EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); + + /* shr ebx,cl */ + EMIT2(0xD3, add_1reg(0xE8, IA32_EBX)); + /* or dreg_hi,ebx */ + EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX)); + + /* goto out; */ + if (is_imm8(jmp_label(jmp_label3, 2))) + EMIT2(0xEB, jmp_label(jmp_label3, 2)); + else + EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); + + /* >= 32 */ + if (jmp_label1 == -1) + jmp_label1 = cnt; + + /* cmp ecx,64 */ + EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64); + /* Jumps when >= 64 */ + if (is_imm8(jmp_label(jmp_label2, 2))) + EMIT2(IA32_JAE, jmp_label(jmp_label2, 2)); + else + EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6)); + + /* >= 32 && < 64 */ + /* sub ecx,32 */ + EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32); + /* shl dreg_lo,cl */ + EMIT2(0xD3, add_1reg(0xE0, dreg_lo)); + /* mov dreg_hi,dreg_lo */ + EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo)); + + /* xor dreg_lo,dreg_lo */ + EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo)); + + /* goto out; */ + if (is_imm8(jmp_label(jmp_label3, 2))) + EMIT2(0xEB, jmp_label(jmp_label3, 2)); + else + EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); + + /* >= 64 */ + if (jmp_label2 == -1) + jmp_label2 = cnt; + /* xor dreg_lo,dreg_lo */ + EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo)); + /* xor dreg_hi,dreg_hi */ + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); + + if (jmp_label3 == -1) + jmp_label3 = cnt; + + if (dstk) { + /* mov dword ptr [ebp+off],dreg_lo */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo), + STACK_VAR(dst_lo)); + /* mov dword ptr [ebp+off],dreg_hi */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi), + STACK_VAR(dst_hi)); + } + /* out: */ + *pprog = prog; +} + +/* dst = dst >> src (signed)*/ +static inline void emit_ia32_arsh_r64(const u8 dst[], const u8 src[], + bool dstk, bool sstk, u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + static int jmp_label1 = -1; + static int jmp_label2 = -1; + static int jmp_label3 = -1; + u8 dreg_lo = dstk ? IA32_EAX : dst_lo; + u8 dreg_hi = dstk ? 
IA32_EDX : dst_hi; + + if (dstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(dst_hi)); + } + + if (sstk) + /* mov ecx,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), + STACK_VAR(src_lo)); + else + /* mov ecx,src_lo */ + EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX)); + + /* cmp ecx,32 */ + EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); + /* Jumps when >= 32 */ + if (is_imm8(jmp_label(jmp_label1, 2))) + EMIT2(IA32_JAE, jmp_label(jmp_label1, 2)); + else + EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6)); + + /* < 32 */ + /* lshr dreg_lo,cl */ + EMIT2(0xD3, add_1reg(0xE8, dreg_lo)); + /* mov ebx,dreg_hi */ + EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX)); + /* ashr dreg_hi,cl */ + EMIT2(0xD3, add_1reg(0xF8, dreg_hi)); + + /* IA32_ECX = -IA32_ECX + 32 */ + /* neg ecx */ + EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); + /* add ecx,32 */ + EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); + + /* shl ebx,cl */ + EMIT2(0xD3, add_1reg(0xE0, IA32_EBX)); + /* or dreg_lo,ebx */ + EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX)); + + /* goto out; */ + if (is_imm8(jmp_label(jmp_label3, 2))) + EMIT2(0xEB, jmp_label(jmp_label3, 2)); + else + EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); + + /* >= 32 */ + if (jmp_label1 == -1) + jmp_label1 = cnt; + + /* cmp ecx,64 */ + EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64); + /* Jumps when >= 64 */ + if (is_imm8(jmp_label(jmp_label2, 2))) + EMIT2(IA32_JAE, jmp_label(jmp_label2, 2)); + else + EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6)); + + /* >= 32 && < 64 */ + /* sub ecx,32 */ + EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32); + /* ashr dreg_hi,cl */ + EMIT2(0xD3, add_1reg(0xF8, dreg_hi)); + /* mov dreg_lo,dreg_hi */ + EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi)); + + /* ashr dreg_hi,imm8 */ + EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31); + + /* goto out; */ + if (is_imm8(jmp_label(jmp_label3, 2))) + EMIT2(0xEB, jmp_label(jmp_label3, 2)); + else + EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); + + /* >= 64 */ + if (jmp_label2 == -1) + jmp_label2 = cnt; + /* ashr dreg_hi,imm8 */ + EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31); + /* mov dreg_lo,dreg_hi */ + EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi)); + + if (jmp_label3 == -1) + jmp_label3 = cnt; + + if (dstk) { + /* mov dword ptr [ebp+off],dreg_lo */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo), + STACK_VAR(dst_lo)); + /* mov dword ptr [ebp+off],dreg_hi */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi), + STACK_VAR(dst_hi)); + } + /* out: */ + *pprog = prog; +} + +/* dst = dst >> src */ +static inline void emit_ia32_rsh_r64(const u8 dst[], const u8 src[], bool dstk, + bool sstk, u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + static int jmp_label1 = -1; + static int jmp_label2 = -1; + static int jmp_label3 = -1; + u8 dreg_lo = dstk ? IA32_EAX : dst_lo; + u8 dreg_hi = dstk ? 
IA32_EDX : dst_hi; + + if (dstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(dst_hi)); + } + + if (sstk) + /* mov ecx,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), + STACK_VAR(src_lo)); + else + /* mov ecx,src_lo */ + EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX)); + + /* cmp ecx,32 */ + EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32); + /* Jumps when >= 32 */ + if (is_imm8(jmp_label(jmp_label1, 2))) + EMIT2(IA32_JAE, jmp_label(jmp_label1, 2)); + else + EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6)); + + /* < 32 */ + /* lshr dreg_lo,cl */ + EMIT2(0xD3, add_1reg(0xE8, dreg_lo)); + /* mov ebx,dreg_hi */ + EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX)); + /* shr dreg_hi,cl */ + EMIT2(0xD3, add_1reg(0xE8, dreg_hi)); + + /* IA32_ECX = -IA32_ECX + 32 */ + /* neg ecx */ + EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); + /* add ecx,32 */ + EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); + + /* shl ebx,cl */ + EMIT2(0xD3, add_1reg(0xE0, IA32_EBX)); + /* or dreg_lo,ebx */ + EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX)); + + /* goto out; */ + if (is_imm8(jmp_label(jmp_label3, 2))) + EMIT2(0xEB, jmp_label(jmp_label3, 2)); + else + EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); + + /* >= 32 */ + if (jmp_label1 == -1) + jmp_label1 = cnt; + /* cmp ecx,64 */ + EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64); + /* Jumps when >= 64 */ + if (is_imm8(jmp_label(jmp_label2, 2))) + EMIT2(IA32_JAE, jmp_label(jmp_label2, 2)); + else + EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6)); + + /* >= 32 && < 64 */ + /* sub ecx,32 */ + EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32); + /* shr dreg_hi,cl */ + EMIT2(0xD3, add_1reg(0xE8, dreg_hi)); + /* mov dreg_lo,dreg_hi */ + EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi)); + /* xor dreg_hi,dreg_hi */ + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); + + /* goto out; */ + if (is_imm8(jmp_label(jmp_label3, 2))) + EMIT2(0xEB, jmp_label(jmp_label3, 2)); + else + EMIT1_off32(0xE9, jmp_label(jmp_label3, 5)); + + /* >= 64 */ + if (jmp_label2 == -1) + jmp_label2 = cnt; + /* xor dreg_lo,dreg_lo */ + EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo)); + /* xor dreg_hi,dreg_hi */ + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); + + if (jmp_label3 == -1) + jmp_label3 = cnt; + + if (dstk) { + /* mov dword ptr [ebp+off],dreg_lo */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo), + STACK_VAR(dst_lo)); + /* mov dword ptr [ebp+off],dreg_hi */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi), + STACK_VAR(dst_hi)); + } + /* out: */ + *pprog = prog; +} + +/* dst = dst << val */ +static inline void emit_ia32_lsh_i64(const u8 dst[], const u32 val, + bool dstk, u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + u8 dreg_lo = dstk ? IA32_EAX : dst_lo; + u8 dreg_hi = dstk ? 
IA32_EDX : dst_hi; + + if (dstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(dst_hi)); + } + /* Do LSH operation */ + if (val < 32) { + /* shl dreg_hi,imm8 */ + EMIT3(0xC1, add_1reg(0xE0, dreg_hi), val); + /* mov ebx,dreg_lo */ + EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX)); + /* shl dreg_lo,imm8 */ + EMIT3(0xC1, add_1reg(0xE0, dreg_lo), val); + + /* IA32_ECX = 32 - val */ + /* mov ecx,val */ + EMIT2(0xB1, val); + /* movzx ecx,ecx */ + EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX)); + /* neg ecx */ + EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); + /* add ecx,32 */ + EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); + + /* shr ebx,cl */ + EMIT2(0xD3, add_1reg(0xE8, IA32_EBX)); + /* or dreg_hi,ebx */ + EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX)); + } else if (val >= 32 && val < 64) { + u32 value = val - 32; + + /* shl dreg_lo,imm8 */ + EMIT3(0xC1, add_1reg(0xE0, dreg_lo), value); + /* mov dreg_hi,dreg_lo */ + EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo)); + /* xor dreg_lo,dreg_lo */ + EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo)); + } else { + /* xor dreg_lo,dreg_lo */ + EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo)); + /* xor dreg_hi,dreg_hi */ + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); + } + + if (dstk) { + /* mov dword ptr [ebp+off],dreg_lo */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo), + STACK_VAR(dst_lo)); + /* mov dword ptr [ebp+off],dreg_hi */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi), + STACK_VAR(dst_hi)); + } + *pprog = prog; +} + +/* dst = dst >> val */ +static inline void emit_ia32_rsh_i64(const u8 dst[], const u32 val, + bool dstk, u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + u8 dreg_lo = dstk ? IA32_EAX : dst_lo; + u8 dreg_hi = dstk ? IA32_EDX : dst_hi; + + if (dstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(dst_hi)); + } + + /* Do RSH operation */ + if (val < 32) { + /* shr dreg_lo,imm8 */ + EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val); + /* mov ebx,dreg_hi */ + EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX)); + /* shr dreg_hi,imm8 */ + EMIT3(0xC1, add_1reg(0xE8, dreg_hi), val); + + /* IA32_ECX = 32 - val */ + /* mov ecx,val */ + EMIT2(0xB1, val); + /* movzx ecx,ecx */ + EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX)); + /* neg ecx */ + EMIT2(0xF7, add_1reg(0xD8, IA32_ECX)); + /* add ecx,32 */ + EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32); + + /* shl ebx,cl */ + EMIT2(0xD3, add_1reg(0xE0, IA32_EBX)); + /* or dreg_lo,ebx */ + EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX)); + } else if (val >= 32 && val < 64) { + u32 value = val - 32; + + /* shr dreg_hi,imm8 */ + EMIT3(0xC1, add_1reg(0xE8, dreg_hi), value); + /* mov dreg_lo,dreg_hi */ + EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi)); + /* xor dreg_hi,dreg_hi */ + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); + } else { + /* xor dreg_lo,dreg_lo */ + EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo)); + /* xor dreg_hi,dreg_hi */ + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); + } + + if (dstk) { + /* mov dword ptr [ebp+off],dreg_lo */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo), + STACK_VAR(dst_lo)); + /* mov dword ptr [ebp+off],dreg_hi */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi), + STACK_VAR(dst_hi)); + } + *pprog = prog; +} + +/* dst = dst >> val (signed) */ +static inline void emit_ia32_arsh_i64(const u8 dst[], const u32 val, + bool dstk, u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + u8 dreg_lo = dstk ? 
IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+	/* Do RSH operation */
+	if (val < 32) {
+		/* shr dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val);
+		/* mov ebx,dreg_hi */
+		EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), val);
+
+		/* IA32_ECX = 32 - val */
+		/* mov ecx,val */
+		EMIT2(0xB1, val);
+		/* movzx ecx,ecx */
+		EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+		/* neg ecx */
+		EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+		/* add ecx,32 */
+		EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+		/* shl ebx,cl */
+		EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+		/* or dreg_lo,ebx */
+		EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+	} else if (val >= 32 && val < 64) {
+		u32 value = val - 32;
+
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), value);
+		/* mov dreg_lo,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+	} else {
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+		/* mov dreg_lo,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+	}
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+static inline void emit_ia32_mul_r64(const u8 dst[], const u8 src[], bool dstk,
+				     bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_hi));
+	else
+		/* mov eax,dst_hi */
+		EMIT2(0x8B, add_2reg(0xC0, dst_hi, IA32_EAX));
+
+	if (sstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+	else
+		/* mul src_lo */
+		EMIT2(0xF7, add_1reg(0xE0, src_lo));
+
+	/* mov ecx,eax */
+	EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+	else
+		/* mov eax,dst_lo */
+		EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+	if (sstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_hi));
+	else
+		/* mul src_hi */
+		EMIT2(0xF7, add_1reg(0xE0, src_hi));
+
+	/* add ecx,eax */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+	else
+		/* mov eax,dst_lo */
+		EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+	if (sstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+	else
+		/* mul src_lo */
+		EMIT2(0xF7, add_1reg(0xE0, src_lo));
+
+	/* add ecx,edx */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],ecx */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(dst_hi));
+	} else {
+		/* mov dst_lo,eax */
+		EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
+		/* mov dst_hi,ecx */
+		EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
+	}
+
+	*pprog = prog;
+}
+
+static inline void emit_ia32_mul_i64(const u8 dst[], const
u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u32 hi;
+
+	hi = val & (1<<31) ? (u32)~0 : 0;
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
+	if (dstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_hi));
+	else
+		/* mul dst_hi */
+		EMIT2(0xF7, add_1reg(0xE0, dst_hi));
+
+	/* mov ecx,eax */
+	EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), hi);
+	if (dstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+	else
+		/* mul dst_lo */
+		EMIT2(0xF7, add_1reg(0xE0, dst_lo));
+	/* add ecx,eax */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
+	if (dstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+	else
+		/* mul dst_lo */
+		EMIT2(0xF7, add_1reg(0xE0, dst_lo));
+
+	/* add ecx,edx */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],ecx */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(dst_hi));
+	} else {
+		/* mov dst_lo,eax */
+		EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
+		/* mov dst_hi,ecx */
+		EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
+	}
+
+	*pprog = prog;
+}
+
+static int bpf_size_to_x86_bytes(int bpf_size)
+{
+	if (bpf_size == BPF_W)
+		return 4;
+	else if (bpf_size == BPF_H)
+		return 2;
+	else if (bpf_size == BPF_B)
+		return 1;
+	else if (bpf_size == BPF_DW)
+		return 4; /* imm32 */
+	else
+		return 0;
+}
+
+struct jit_context {
+	int cleanup_addr; /* Epilogue code offset */
+};
+
+/* Maximum number of bytes emitted while JITing one eBPF insn */
+#define BPF_MAX_INSN_SIZE	128
+#define BPF_INSN_SAFETY		64
+
+#define PROLOGUE_SIZE 35
+
+/*
+ * Emit prologue code for BPF program and check its size.
+ * bpf_tail_call helper will skip it while jumping into another program.
+ */ +static void emit_prologue(u8 **pprog, u32 stack_depth) +{ + u8 *prog = *pprog; + int cnt = 0; + const u8 *r1 = bpf2ia32[BPF_REG_1]; + const u8 fplo = bpf2ia32[BPF_REG_FP][0]; + const u8 fphi = bpf2ia32[BPF_REG_FP][1]; + const u8 *tcc = bpf2ia32[TCALL_CNT]; + + /* push ebp */ + EMIT1(0x55); + /* mov ebp,esp */ + EMIT2(0x89, 0xE5); + /* push edi */ + EMIT1(0x57); + /* push esi */ + EMIT1(0x56); + /* push ebx */ + EMIT1(0x53); + + /* sub esp,STACK_SIZE */ + EMIT2_off32(0x81, 0xEC, STACK_SIZE); + /* sub ebp,SCRATCH_SIZE+4+12*/ + EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 16); + /* xor ebx,ebx */ + EMIT2(0x31, add_2reg(0xC0, IA32_EBX, IA32_EBX)); + + /* Set up BPF prog stack base register */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBP), STACK_VAR(fplo)); + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(fphi)); + + /* Move BPF_CTX (EAX) to BPF_REG_R1 */ + /* mov dword ptr [ebp+off],eax */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0])); + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(r1[1])); + + /* Initialize Tail Count */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[0])); + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1])); + + BUILD_BUG_ON(cnt != PROLOGUE_SIZE); + *pprog = prog; +} + +/* Emit epilogue code for BPF program */ +static void emit_epilogue(u8 **pprog, u32 stack_depth) +{ + u8 *prog = *pprog; + const u8 *r0 = bpf2ia32[BPF_REG_0]; + int cnt = 0; + + /* mov eax,dword ptr [ebp+off]*/ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r0[0])); + /* mov edx,dword ptr [ebp+off]*/ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r0[1])); + + /* add ebp,SCRATCH_SIZE+4+12*/ + EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 16); + + /* mov ebx,dword ptr [ebp-12]*/ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), -12); + /* mov esi,dword ptr [ebp-8]*/ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ESI), -8); + /* mov edi,dword ptr [ebp-4]*/ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDI), -4); + + EMIT1(0xC9); /* leave */ + EMIT1(0xC3); /* ret */ + *pprog = prog; +} + +/* + * Generate the following code: + * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... 
+ * if (index >= array->map.max_entries) + * goto out; + * if (++tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + * prog = array->ptrs[index]; + * if (prog == NULL) + * goto out; + * goto *(prog->bpf_func + prologue_size); + * out: + */ +static void emit_bpf_tail_call(u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + const u8 *r1 = bpf2ia32[BPF_REG_1]; + const u8 *r2 = bpf2ia32[BPF_REG_2]; + const u8 *r3 = bpf2ia32[BPF_REG_3]; + const u8 *tcc = bpf2ia32[TCALL_CNT]; + u32 lo, hi; + static int jmp_label1 = -1; + + /* + * if (index >= array->map.max_entries) + * goto out; + */ + /* mov eax,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r2[0])); + /* mov edx,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r3[0])); + + /* cmp dword ptr [eax+off],edx */ + EMIT3(0x39, add_2reg(0x40, IA32_EAX, IA32_EDX), + offsetof(struct bpf_array, map.max_entries)); + /* jbe out */ + EMIT2(IA32_JBE, jmp_label(jmp_label1, 2)); + + /* + * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + */ + lo = (u32)MAX_TAIL_CALL_CNT; + hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0])); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1])); + + /* cmp edx,hi */ + EMIT3(0x83, add_1reg(0xF8, IA32_EBX), hi); + EMIT2(IA32_JNE, 3); + /* cmp ecx,lo */ + EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo); + + /* ja out */ + EMIT2(IA32_JAE, jmp_label(jmp_label1, 2)); + + /* add eax,0x1 */ + EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 0x01); + /* adc ebx,0x0 */ + EMIT3(0x83, add_1reg(0xD0, IA32_EBX), 0x00); + + /* mov dword ptr [ebp+off],eax */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0])); + /* mov dword ptr [ebp+off],edx */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1])); + + /* prog = array->ptrs[index]; */ + /* mov edx, [eax + edx * 4 + offsetof(...)] */ + EMIT3_off32(0x8B, 0x94, 0x90, offsetof(struct bpf_array, ptrs)); + + /* + * if (prog == NULL) + * goto out; + */ + /* test edx,edx */ + EMIT2(0x85, add_2reg(0xC0, IA32_EDX, IA32_EDX)); + /* je out */ + EMIT2(IA32_JE, jmp_label(jmp_label1, 2)); + + /* goto *(prog->bpf_func + prologue_size); */ + /* mov edx, dword ptr [edx + 32] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EDX, IA32_EDX), + offsetof(struct bpf_prog, bpf_func)); + /* add edx,prologue_size */ + EMIT3(0x83, add_1reg(0xC0, IA32_EDX), PROLOGUE_SIZE); + + /* mov eax,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0])); + + /* + * Now we're ready to jump into next BPF program: + * eax == ctx (1st arg) + * edx == prog->bpf_func + prologue_size + */ + RETPOLINE_EDX_BPF_JIT(); + + if (jmp_label1 == -1) + jmp_label1 = cnt; + + /* out: */ + *pprog = prog; +} + +/* Push the scratch stack register on top of the stack. 
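+ * The 64-bit BPF register is read from its two scratch-stack slots and
+ * pushed high half first, then low half, so the two 32-bit words form
+ * one little-endian u64 argument on the native stack for the callee.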
*/ +static inline void emit_push_r64(const u8 src[], u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + + /* mov ecx,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_hi)); + /* push ecx */ + EMIT1(0x51); + + /* mov ecx,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo)); + /* push ecx */ + EMIT1(0x51); + + *pprog = prog; +} + +static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, + int oldproglen, struct jit_context *ctx) +{ + struct bpf_insn *insn = bpf_prog->insnsi; + int insn_cnt = bpf_prog->len; + bool seen_exit = false; + u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; + int i, cnt = 0; + int proglen = 0; + u8 *prog = temp; + + emit_prologue(&prog, bpf_prog->aux->stack_depth); + + for (i = 0; i < insn_cnt; i++, insn++) { + const s32 imm32 = insn->imm; + const bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + const bool dstk = insn->dst_reg == BPF_REG_AX ? false : true; + const bool sstk = insn->src_reg == BPF_REG_AX ? false : true; + const u8 code = insn->code; + const u8 *dst = bpf2ia32[insn->dst_reg]; + const u8 *src = bpf2ia32[insn->src_reg]; + const u8 *r0 = bpf2ia32[BPF_REG_0]; + s64 jmp_offset; + u8 jmp_cond; + int ilen; + u8 *func; + + switch (code) { + /* ALU operations */ + /* dst = src */ + case BPF_ALU | BPF_MOV | BPF_K: + case BPF_ALU | BPF_MOV | BPF_X: + case BPF_ALU64 | BPF_MOV | BPF_K: + case BPF_ALU64 | BPF_MOV | BPF_X: + switch (BPF_SRC(code)) { + case BPF_X: + emit_ia32_mov_r64(is64, dst, src, dstk, + sstk, &prog); + break; + case BPF_K: + /* Sign-extend immediate value to dst reg */ + emit_ia32_mov_i64(is64, dst, imm32, + dstk, &prog); + break; + } + break; + /* dst = dst + src/imm */ + /* dst = dst - src/imm */ + /* dst = dst | src/imm */ + /* dst = dst & src/imm */ + /* dst = dst ^ src/imm */ + /* dst = dst * src/imm */ + /* dst = dst << src */ + /* dst = dst >> src */ + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU64 | BPF_ADD | BPF_K: + case BPF_ALU64 | BPF_ADD | BPF_X: + case BPF_ALU64 | BPF_SUB | BPF_K: + case BPF_ALU64 | BPF_SUB | BPF_X: + case BPF_ALU64 | BPF_OR | BPF_K: + case BPF_ALU64 | BPF_OR | BPF_X: + case BPF_ALU64 | BPF_AND | BPF_K: + case BPF_ALU64 | BPF_AND | BPF_X: + case BPF_ALU64 | BPF_XOR | BPF_K: + case BPF_ALU64 | BPF_XOR | BPF_X: + switch (BPF_SRC(code)) { + case BPF_X: + emit_ia32_alu_r64(is64, BPF_OP(code), dst, + src, dstk, sstk, &prog); + break; + case BPF_K: + emit_ia32_alu_i64(is64, BPF_OP(code), dst, + imm32, dstk, &prog); + break; + } + break; + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + switch (BPF_SRC(code)) { + case BPF_X: + emit_ia32_mul_r(dst_lo, src_lo, dstk, + sstk, &prog); + break; + case BPF_K: + /* mov ecx,imm32*/ + EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), + imm32); + emit_ia32_mul_r(dst_lo, IA32_ECX, dstk, + false, &prog); + break; + } + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); + break; + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_ARSH | BPF_K: + case BPF_ALU | BPF_ARSH | BPF_X: + switch (BPF_SRC(code)) { + case BPF_X: + emit_ia32_shift_r(BPF_OP(code), dst_lo, src_lo, + dstk, sstk, &prog); + break; + case BPF_K: + /* mov ecx,imm32*/ + EMIT2_off32(0xC7, 
add_1reg(0xC0, IA32_ECX), + imm32); + emit_ia32_shift_r(BPF_OP(code), dst_lo, + IA32_ECX, dstk, false, + &prog); + break; + } + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); + break; + /* dst = dst / src(imm) */ + /* dst = dst % src(imm) */ + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_MOD | BPF_K: + case BPF_ALU | BPF_MOD | BPF_X: + switch (BPF_SRC(code)) { + case BPF_X: + emit_ia32_div_mod_r(BPF_OP(code), dst_lo, + src_lo, dstk, sstk, &prog); + break; + case BPF_K: + /* mov ecx,imm32*/ + EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), + imm32); + emit_ia32_div_mod_r(BPF_OP(code), dst_lo, + IA32_ECX, dstk, false, + &prog); + break; + } + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); + break; + case BPF_ALU64 | BPF_DIV | BPF_K: + case BPF_ALU64 | BPF_DIV | BPF_X: + case BPF_ALU64 | BPF_MOD | BPF_K: + case BPF_ALU64 | BPF_MOD | BPF_X: + goto notyet; + /* dst = dst >> imm */ + /* dst = dst << imm */ + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_LSH | BPF_K: + if (unlikely(imm32 > 31)) + return -EINVAL; + /* mov ecx,imm32*/ + EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32); + emit_ia32_shift_r(BPF_OP(code), dst_lo, IA32_ECX, dstk, + false, &prog); + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); + break; + /* dst = dst << imm */ + case BPF_ALU64 | BPF_LSH | BPF_K: + if (unlikely(imm32 > 63)) + return -EINVAL; + emit_ia32_lsh_i64(dst, imm32, dstk, &prog); + break; + /* dst = dst >> imm */ + case BPF_ALU64 | BPF_RSH | BPF_K: + if (unlikely(imm32 > 63)) + return -EINVAL; + emit_ia32_rsh_i64(dst, imm32, dstk, &prog); + break; + /* dst = dst << src */ + case BPF_ALU64 | BPF_LSH | BPF_X: + emit_ia32_lsh_r64(dst, src, dstk, sstk, &prog); + break; + /* dst = dst >> src */ + case BPF_ALU64 | BPF_RSH | BPF_X: + emit_ia32_rsh_r64(dst, src, dstk, sstk, &prog); + break; + /* dst = dst >> src (signed) */ + case BPF_ALU64 | BPF_ARSH | BPF_X: + emit_ia32_arsh_r64(dst, src, dstk, sstk, &prog); + break; + /* dst = dst >> imm (signed) */ + case BPF_ALU64 | BPF_ARSH | BPF_K: + if (unlikely(imm32 > 63)) + return -EINVAL; + emit_ia32_arsh_i64(dst, imm32, dstk, &prog); + break; + /* dst = ~dst */ + case BPF_ALU | BPF_NEG: + emit_ia32_alu_i(is64, false, BPF_OP(code), + dst_lo, 0, dstk, &prog); + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); + break; + /* dst = ~dst (64 bit) */ + case BPF_ALU64 | BPF_NEG: + emit_ia32_neg64(dst, dstk, &prog); + break; + /* dst = dst * src/imm */ + case BPF_ALU64 | BPF_MUL | BPF_X: + case BPF_ALU64 | BPF_MUL | BPF_K: + switch (BPF_SRC(code)) { + case BPF_X: + emit_ia32_mul_r64(dst, src, dstk, sstk, &prog); + break; + case BPF_K: + emit_ia32_mul_i64(dst, imm32, dstk, &prog); + break; + } + break; + /* dst = htole(dst) */ + case BPF_ALU | BPF_END | BPF_FROM_LE: + emit_ia32_to_le_r64(dst, imm32, dstk, &prog); + break; + /* dst = htobe(dst) */ + case BPF_ALU | BPF_END | BPF_FROM_BE: + emit_ia32_to_be_r64(dst, imm32, dstk, &prog); + break; + /* dst = imm64 */ + case BPF_LD | BPF_IMM | BPF_DW: { + s32 hi, lo = imm32; + + hi = insn[1].imm; + emit_ia32_mov_i(dst_lo, lo, dstk, &prog); + emit_ia32_mov_i(dst_hi, hi, dstk, &prog); + insn++; + i++; + break; + } + /* ST: *(u8*)(dst_reg + off) = imm */ + case BPF_ST | BPF_MEM | BPF_H: + case BPF_ST | BPF_MEM | BPF_B: + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_DW: + if (dstk) + /* mov eax,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + else + /* mov eax,dst_lo */ + EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX)); + + switch (BPF_SIZE(code)) { + case BPF_B: 
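+			/* C6 /0 = mov byte ptr [r/m8],imm8 */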
+ EMIT(0xC6, 1); break; + case BPF_H: + EMIT2(0x66, 0xC7); break; + case BPF_W: + case BPF_DW: + EMIT(0xC7, 1); break; + } + + if (is_imm8(insn->off)) + EMIT2(add_1reg(0x40, IA32_EAX), insn->off); + else + EMIT1_off32(add_1reg(0x80, IA32_EAX), + insn->off); + EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(code))); + + if (BPF_SIZE(code) == BPF_DW) { + u32 hi; + + hi = imm32 & (1<<31) ? (u32)~0 : 0; + EMIT2_off32(0xC7, add_1reg(0x80, IA32_EAX), + insn->off + 4); + EMIT(hi, 4); + } + break; + + /* STX: *(u8*)(dst_reg + off) = src_reg */ + case BPF_STX | BPF_MEM | BPF_B: + case BPF_STX | BPF_MEM | BPF_H: + case BPF_STX | BPF_MEM | BPF_W: + case BPF_STX | BPF_MEM | BPF_DW: + if (dstk) + /* mov eax,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + else + /* mov eax,dst_lo */ + EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX)); + + if (sstk) + /* mov edx,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(src_lo)); + else + /* mov edx,src_lo */ + EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EDX)); + + switch (BPF_SIZE(code)) { + case BPF_B: + EMIT(0x88, 1); break; + case BPF_H: + EMIT2(0x66, 0x89); break; + case BPF_W: + case BPF_DW: + EMIT(0x89, 1); break; + } + + if (is_imm8(insn->off)) + EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX), + insn->off); + else + EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX), + insn->off); + + if (BPF_SIZE(code) == BPF_DW) { + if (sstk) + /* mov edi,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, + IA32_EDX), + STACK_VAR(src_hi)); + else + /* mov edi,src_hi */ + EMIT2(0x8B, add_2reg(0xC0, src_hi, + IA32_EDX)); + EMIT1(0x89); + if (is_imm8(insn->off + 4)) { + EMIT2(add_2reg(0x40, IA32_EAX, + IA32_EDX), + insn->off + 4); + } else { + EMIT1(add_2reg(0x80, IA32_EAX, + IA32_EDX)); + EMIT(insn->off + 4, 4); + } + } + break; + + /* LDX: dst_reg = *(u8*)(src_reg + off) */ + case BPF_LDX | BPF_MEM | BPF_B: + case BPF_LDX | BPF_MEM | BPF_H: + case BPF_LDX | BPF_MEM | BPF_W: + case BPF_LDX | BPF_MEM | BPF_DW: + if (sstk) + /* mov eax,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(src_lo)); + else + /* mov eax,dword ptr [ebp+off] */ + EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EAX)); + + switch (BPF_SIZE(code)) { + case BPF_B: + EMIT2(0x0F, 0xB6); break; + case BPF_H: + EMIT2(0x0F, 0xB7); break; + case BPF_W: + case BPF_DW: + EMIT(0x8B, 1); break; + } + + if (is_imm8(insn->off)) + EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX), + insn->off); + else + EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX), + insn->off); + + if (dstk) + /* mov dword ptr [ebp+off],edx */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(dst_lo)); + else + /* mov dst_lo,edx */ + EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EDX)); + switch (BPF_SIZE(code)) { + case BPF_B: + case BPF_H: + case BPF_W: + if (dstk) { + EMIT3(0xC7, add_1reg(0x40, IA32_EBP), + STACK_VAR(dst_hi)); + EMIT(0x0, 4); + } else { + EMIT3(0xC7, add_1reg(0xC0, dst_hi), 0); + } + break; + case BPF_DW: + EMIT2_off32(0x8B, + add_2reg(0x80, IA32_EAX, IA32_EDX), + insn->off + 4); + if (dstk) + EMIT3(0x89, + add_2reg(0x40, IA32_EBP, + IA32_EDX), + STACK_VAR(dst_hi)); + else + EMIT2(0x89, + add_2reg(0xC0, dst_hi, IA32_EDX)); + break; + default: + break; + } + break; + /* call */ + case BPF_JMP | BPF_CALL: + { + const u8 *r1 = bpf2ia32[BPF_REG_1]; + const u8 *r2 = bpf2ia32[BPF_REG_2]; + const u8 *r3 = bpf2ia32[BPF_REG_3]; + const u8 *r4 = bpf2ia32[BPF_REG_4]; + const u8 *r5 = bpf2ia32[BPF_REG_5]; + + if (insn->src_reg == 
BPF_PSEUDO_CALL) + goto notyet; + + func = (u8 *) __bpf_call_base + imm32; + jmp_offset = func - (image + addrs[i]); + + if (!imm32 || !is_simm32(jmp_offset)) { + pr_err("unsupported BPF func %d addr %p image %p\n", + imm32, func, image); + return -EINVAL; + } + + /* mov eax,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(r1[0])); + /* mov edx,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(r1[1])); + + emit_push_r64(r5, &prog); + emit_push_r64(r4, &prog); + emit_push_r64(r3, &prog); + emit_push_r64(r2, &prog); + + EMIT1_off32(0xE8, jmp_offset + 9); + + /* mov dword ptr [ebp+off],eax */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(r0[0])); + /* mov dword ptr [ebp+off],edx */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(r0[1])); + + /* add esp,32 */ + EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 32); + break; + } + case BPF_JMP | BPF_TAIL_CALL: + emit_bpf_tail_call(&prog); + break; + + /* cond jump */ + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JNE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: + case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: + case BPF_JMP | BPF_JSGE | BPF_X: { + u8 dreg_lo = dstk ? IA32_EAX : dst_lo; + u8 dreg_hi = dstk ? IA32_EDX : dst_hi; + u8 sreg_lo = sstk ? IA32_ECX : src_lo; + u8 sreg_hi = sstk ? IA32_EBX : src_hi; + + if (dstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(dst_hi)); + } + + if (sstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), + STACK_VAR(src_lo)); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), + STACK_VAR(src_hi)); + } + + /* cmp dreg_hi,sreg_hi */ + EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi)); + EMIT2(IA32_JNE, 2); + /* cmp dreg_lo,sreg_lo */ + EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo)); + goto emit_cond_jmp; + } + case BPF_JMP | BPF_JSET | BPF_X: { + u8 dreg_lo = dstk ? IA32_EAX : dst_lo; + u8 dreg_hi = dstk ? IA32_EDX : dst_hi; + u8 sreg_lo = sstk ? IA32_ECX : src_lo; + u8 sreg_hi = sstk ? IA32_EBX : src_hi; + + if (dstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(dst_hi)); + } + + if (sstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), + STACK_VAR(src_lo)); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), + STACK_VAR(src_hi)); + } + /* and dreg_lo,sreg_lo */ + EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo)); + /* and dreg_hi,sreg_hi */ + EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi)); + /* or dreg_lo,dreg_hi */ + EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi)); + goto emit_cond_jmp; + } + case BPF_JMP | BPF_JSET | BPF_K: { + u32 hi; + u8 dreg_lo = dstk ? IA32_EAX : dst_lo; + u8 dreg_hi = dstk ? IA32_EDX : dst_hi; + u8 sreg_lo = IA32_ECX; + u8 sreg_hi = IA32_EBX; + + if (dstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(dst_hi)); + } + hi = imm32 & (1<<31) ? 
(u32)~0 : 0; + + /* mov ecx,imm32 */ + EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32); + /* mov ebx,imm32 */ + EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi); + + /* and dreg_lo,sreg_lo */ + EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo)); + /* and dreg_hi,sreg_hi */ + EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi)); + /* or dreg_lo,dreg_hi */ + EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi)); + goto emit_cond_jmp; + } + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_K: { + u32 hi; + u8 dreg_lo = dstk ? IA32_EAX : dst_lo; + u8 dreg_hi = dstk ? IA32_EDX : dst_hi; + u8 sreg_lo = IA32_ECX; + u8 sreg_hi = IA32_EBX; + + if (dstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(dst_hi)); + } + + hi = imm32 & (1<<31) ? (u32)~0 : 0; + /* mov ecx,imm32 */ + EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32); + /* mov ebx,imm32 */ + EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi); + + /* cmp dreg_hi,sreg_hi */ + EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi)); + EMIT2(IA32_JNE, 2); + /* cmp dreg_lo,sreg_lo */ + EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo)); + +emit_cond_jmp: /* Convert BPF opcode to x86 */ + switch (BPF_OP(code)) { + case BPF_JEQ: + jmp_cond = IA32_JE; + break; + case BPF_JSET: + case BPF_JNE: + jmp_cond = IA32_JNE; + break; + case BPF_JGT: + /* GT is unsigned '>', JA in x86 */ + jmp_cond = IA32_JA; + break; + case BPF_JLT: + /* LT is unsigned '<', JB in x86 */ + jmp_cond = IA32_JB; + break; + case BPF_JGE: + /* GE is unsigned '>=', JAE in x86 */ + jmp_cond = IA32_JAE; + break; + case BPF_JLE: + /* LE is unsigned '<=', JBE in x86 */ + jmp_cond = IA32_JBE; + break; + case BPF_JSGT: + /* Signed '>', GT in x86 */ + jmp_cond = IA32_JG; + break; + case BPF_JSLT: + /* Signed '<', LT in x86 */ + jmp_cond = IA32_JL; + break; + case BPF_JSGE: + /* Signed '>=', GE in x86 */ + jmp_cond = IA32_JGE; + break; + case BPF_JSLE: + /* Signed '<=', LE in x86 */ + jmp_cond = IA32_JLE; + break; + default: /* to silence GCC warning */ + return -EFAULT; + } + jmp_offset = addrs[i + insn->off] - addrs[i]; + if (is_imm8(jmp_offset)) { + EMIT2(jmp_cond, jmp_offset); + } else if (is_simm32(jmp_offset)) { + EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset); + } else { + pr_err("cond_jmp gen bug %llx\n", jmp_offset); + return -EFAULT; + } + + break; + } + case BPF_JMP | BPF_JA: + if (insn->off == -1) + /* -1 jmp instructions will always jump + * backwards two bytes. Explicitly handling + * this case avoids wasting too many passes + * when there are long sequences of replaced + * dead code. 
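+			 * (With the two-byte 0xEB jump, an offset of -2
+			 * lands back on the jump's own first byte, i.e.
+			 * it is a self-jump.)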
+ */ + jmp_offset = -2; + else + jmp_offset = addrs[i + insn->off] - addrs[i]; + + if (!jmp_offset) + /* Optimize out nop jumps */ + break; +emit_jmp: + if (is_imm8(jmp_offset)) { + EMIT2(0xEB, jmp_offset); + } else if (is_simm32(jmp_offset)) { + EMIT1_off32(0xE9, jmp_offset); + } else { + pr_err("jmp gen bug %llx\n", jmp_offset); + return -EFAULT; + } + break; + + case BPF_LD | BPF_ABS | BPF_W: + case BPF_LD | BPF_ABS | BPF_H: + case BPF_LD | BPF_ABS | BPF_B: + case BPF_LD | BPF_IND | BPF_W: + case BPF_LD | BPF_IND | BPF_H: + case BPF_LD | BPF_IND | BPF_B: + { + int size; + const u8 *r6 = bpf2ia32[BPF_REG_6]; + + /* Setting up first argument */ + /* mov eax,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(r6[0])); + + /* Setting up second argument */ + if (BPF_MODE(code) == BPF_ABS) { + /* mov %edx, imm32 */ + EMIT1_off32(0xBA, imm32); + } else { + if (sstk) + /* mov edx,dword ptr [ebp+off] */ + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, + IA32_EDX), + STACK_VAR(src_lo)); + else + /* mov edx,src_lo */ + EMIT2(0x8B, add_2reg(0xC0, src_lo, + IA32_EDX)); + if (imm32) { + if (is_imm8(imm32)) + /* add %edx,imm8 */ + EMIT3(0x83, 0xC2, imm32); + else + /* add %edx,imm32 */ + EMIT2_off32(0x81, 0xC2, imm32); + } + } + + /* Setting up third argument */ + switch (BPF_SIZE(code)) { + case BPF_W: + size = 4; + break; + case BPF_H: + size = 2; + break; + case BPF_B: + size = 1; + break; + default: + return -EINVAL; + } + /* mov ecx,val */ + EMIT2(0xB1, size); + /* movzx ecx,ecx */ + EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX)); + + /* mov ebx,ebp */ + EMIT2(0x8B, add_2reg(0xC0, IA32_EBP, IA32_EBX)); + /* add %ebx,imm8 */ + EMIT3(0x83, add_1reg(0xC0, IA32_EBX), SKB_BUFFER); + /* push ebx */ + EMIT1(0x53); + + /* Setting up function pointer to call */ + /* mov ebx,imm32*/ + EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), + (unsigned int)bpf_load_pointer); + + EMIT2(0xFF, add_1reg(0xD0, IA32_EBX)); + /* add %esp,4 */ + EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 4); + /* xor edx,edx */ + EMIT2(0x33, add_2reg(0xC0, IA32_EDX, IA32_EDX)); + + /* mov dword ptr [ebp+off],eax */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(r0[0])); + /* mov dword ptr [ebp+off],edx */ + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX), + STACK_VAR(r0[1])); + + /* + * Check if return address is NULL or not. 
+			 * If NULL then jump to the epilogue, else continue
+			 * to load the value from the return address.
+			 */
+			EMIT3(0x83, add_1reg(0xF8, IA32_EAX), 0);
+			jmp_offset = ctx->cleanup_addr - addrs[i];
+
+			switch (BPF_SIZE(code)) {
+			case BPF_W:
+				jmp_offset += 7;
+				break;
+			case BPF_H:
+				jmp_offset += 10;
+				break;
+			case BPF_B:
+				jmp_offset += 6;
+				break;
+			}
+
+			EMIT2_off32(0x0F, IA32_JE + 0x10, jmp_offset);
+			/* Load value from the address */
+			switch (BPF_SIZE(code)) {
+			case BPF_W:
+				/* mov eax,[eax] */
+				EMIT2(0x8B, 0x0);
+				/* Emit 'bswap eax' */
+				EMIT2(0x0F, add_1reg(0xC8, IA32_EAX));
+				break;
+			case BPF_H:
+				EMIT3(0x0F, 0xB7, 0x0);
+				EMIT1(0x66);
+				EMIT3(0xC1, add_1reg(0xC8, IA32_EAX), 8);
+				break;
+			case BPF_B:
+				EMIT3(0x0F, 0xB6, 0x0);
+				break;
+			}
+
+			/* mov dword ptr [ebp+off],eax */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(r0[0]));
+			break;
+		}
+		/* STX XADD: lock *(u32 *)(dst + off) += src */
+		case BPF_STX | BPF_XADD | BPF_W:
+		/* STX XADD: lock *(u64 *)(dst + off) += src */
+		case BPF_STX | BPF_XADD | BPF_DW:
+			goto notyet;
+		case BPF_JMP | BPF_EXIT:
+			if (seen_exit) {
+				jmp_offset = ctx->cleanup_addr - addrs[i];
+				goto emit_jmp;
+			}
+			seen_exit = true;
+			/* Update cleanup_addr */
+			ctx->cleanup_addr = proglen;
+			emit_epilogue(&prog, bpf_prog->aux->stack_depth);
+			break;
+notyet:
+			pr_info_once("*** NOT YET: opcode %02x ***\n", code);
+			return -EFAULT;
+		default:
+			/*
+			 * This error will be seen if a new instruction was
+			 * added to the interpreter but not to the JIT, or
+			 * if there is junk in bpf_prog.
+			 */
+			pr_err("bpf_jit: unknown opcode %02x\n", code);
+			return -EINVAL;
+		}
+
+		ilen = prog - temp;
+		if (ilen > BPF_MAX_INSN_SIZE) {
+			pr_err("bpf_jit: fatal insn size error\n");
+			return -EFAULT;
+		}
+
+		if (image) {
+			if (unlikely(proglen + ilen > oldproglen)) {
+				pr_err("bpf_jit: fatal error\n");
+				return -EFAULT;
+			}
+			memcpy(image + proglen, temp, ilen);
+		}
+		proglen += ilen;
+		addrs[i] = proglen;
+		prog = temp;
+	}
+	return proglen;
+}
+
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
+{
+	struct bpf_binary_header *header = NULL;
+	struct bpf_prog *tmp, *orig_prog = prog;
+	int proglen, oldproglen = 0;
+	struct jit_context ctx = {};
+	bool tmp_blinded = false;
+	u8 *image = NULL;
+	int *addrs;
+	int pass;
+	int i;
+
+	if (!prog->jit_requested)
+		return orig_prog;
+
+	tmp = bpf_jit_blind_constants(prog);
+	/*
+	 * If blinding was requested and we failed during blinding,
+	 * we must fall back to the interpreter.
+	 */
+	if (IS_ERR(tmp))
+		return orig_prog;
+	if (tmp != prog) {
+		tmp_blinded = true;
+		prog = tmp;
+	}
+
+	addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
+	if (!addrs) {
+		prog = orig_prog;
+		goto out;
+	}
+
+	/*
+	 * Before the first pass, make a rough estimate of addrs[]:
+	 * each BPF instruction is translated to at most 64 bytes.
+	 */
+	for (proglen = 0, i = 0; i < prog->len; i++) {
+		proglen += 64;
+		addrs[i] = proglen;
+	}
+	ctx.cleanup_addr = proglen;
+
+	/*
+	 * JITed image shrinks with every pass and the loop iterates
+	 * until the image stops shrinking. Very large BPF programs
+	 * may converge on the last pass. In such a case, do one more
+	 * pass to emit the final image.
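+	 * The first passes run with image == NULL and only compute
+	 * instruction offsets; once proglen stops changing, the binary
+	 * image is allocated and one more pass writes the final bytes.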
+ */ + for (pass = 0; pass < 20 || image; pass++) { + proglen = do_jit(prog, addrs, image, oldproglen, &ctx); + if (proglen <= 0) { +out_image: + image = NULL; + if (header) + bpf_jit_binary_free(header); + prog = orig_prog; + goto out_addrs; + } + if (image) { + if (proglen != oldproglen) { + pr_err("bpf_jit: proglen=%d != oldproglen=%d\n", + proglen, oldproglen); + goto out_image; + } + break; + } + if (proglen == oldproglen) { + header = bpf_jit_binary_alloc(proglen, &image, + 1, jit_fill_hole); + if (!header) { + prog = orig_prog; + goto out_addrs; + } + } + oldproglen = proglen; + cond_resched(); + } + + if (bpf_jit_enable > 1) + bpf_jit_dump(prog->len, proglen, pass + 1, image); + + if (image) { + bpf_jit_binary_lock_ro(header); + prog->bpf_func = (void *)image; + prog->jited = 1; + prog->jited_len = proglen; + } else { + prog = orig_prog; + } + +out_addrs: + kfree(addrs); +out: + if (tmp_blinded) + bpf_jit_prog_release_other(prog, prog == orig_prog ? + tmp : orig_prog); + return prog; +} From 68e8b849b221b37a78a110a0307717d45e3593a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:22 +0200 Subject: [PATCH 25/66] net: initial AF_XDP skeleton MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Buildable skeleton of AF_XDP without any functionality. Just what it takes to register a new address family. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 8 ++++++++ include/linux/socket.h | 5 ++++- net/Kconfig | 1 + net/core/sock.c | 12 ++++++++---- net/xdp/Kconfig | 7 +++++++ security/selinux/hooks.c | 4 +++- security/selinux/include/classmap.h | 4 +++- 7 files changed, 34 insertions(+), 7 deletions(-) create mode 100644 net/xdp/Kconfig diff --git a/MAINTAINERS b/MAINTAINERS index 537fd17a211b..52d246fd29c9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15424,6 +15424,14 @@ T: git git://linuxtv.org/media_tree.git S: Maintained F: drivers/media/tuners/tuner-xc2028.* +XDP SOCKETS (AF_XDP) +M: Björn Töpel +M: Magnus Karlsson +L: netdev@vger.kernel.org +S: Maintained +F: kernel/bpf/xskmap.c +F: net/xdp/ + XEN BLOCK SUBSYSTEM M: Konrad Rzeszutek Wilk M: Roger Pau Monné diff --git a/include/linux/socket.h b/include/linux/socket.h index ea50f4a65816..7ed4713d5337 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -207,8 +207,9 @@ struct ucred { * PF_SMC protocol family that * reuses AF_INET address family */ +#define AF_XDP 44 /* XDP sockets */ -#define AF_MAX 44 /* For now.. */ +#define AF_MAX 45 /* For now.. */ /* Protocol families, same as address families. */ #define PF_UNSPEC AF_UNSPEC @@ -257,6 +258,7 @@ struct ucred { #define PF_KCM AF_KCM #define PF_QIPCRTR AF_QIPCRTR #define PF_SMC AF_SMC +#define PF_XDP AF_XDP #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. 
*/ @@ -338,6 +340,7 @@ struct ucred { #define SOL_NFC 280 #define SOL_KCM 281 #define SOL_TLS 282 +#define SOL_XDP 283 /* IPX options */ #define IPX_TYPE 1 diff --git a/net/Kconfig b/net/Kconfig index 6fa1a4493b8c..86471a1c1ed4 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -59,6 +59,7 @@ source "net/tls/Kconfig" source "net/xfrm/Kconfig" source "net/iucv/Kconfig" source "net/smc/Kconfig" +source "net/xdp/Kconfig" config INET bool "TCP/IP networking" diff --git a/net/core/sock.c b/net/core/sock.c index b2c3db169ca1..e7d8b6c955c6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -226,7 +226,8 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ - x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX" + x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ + x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { _sock_locks("sk_lock-") @@ -262,7 +263,8 @@ static const char *const af_family_rlock_key_strings[AF_MAX+1] = { "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" , "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , - "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX" + "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" , + "rlock-AF_MAX" }; static const char *const af_family_wlock_key_strings[AF_MAX+1] = { "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , @@ -279,7 +281,8 @@ static const char *const af_family_wlock_key_strings[AF_MAX+1] = { "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , - "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX" + "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" , + "wlock-AF_MAX" }; static const char *const af_family_elock_key_strings[AF_MAX+1] = { "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , @@ -296,7 +299,8 @@ static const char *const af_family_elock_key_strings[AF_MAX+1] = { "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , - "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX" + "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" , + "elock-AF_MAX" }; /* diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig new file mode 100644 index 000000000000..90e4a7152854 --- /dev/null +++ b/net/xdp/Kconfig @@ -0,0 +1,7 @@ +config XDP_SOCKETS + bool "XDP sockets" + depends on BPF_SYSCALL + default n + help + XDP sockets allows a channel between XDP programs and + userspace applications. diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 4cafe6a19167..5c508d26b367 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1471,7 +1471,9 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc return SECCLASS_QIPCRTR_SOCKET; case PF_SMC: return SECCLASS_SMC_SOCKET; -#if PF_MAX > 44 + case PF_XDP: + return SECCLASS_XDP_SOCKET; +#if PF_MAX > 45 #error New address family defined, please update this function. 
 #endif
 	}
 
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 7f0372426494..bd5fe0d3204a 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -240,9 +240,11 @@ struct security_class_mapping secclass_map[] = {
 	  { "manage_subnet", NULL } },
 	{ "bpf",
 	  {"map_create", "map_read", "map_write", "prog_load", "prog_run"} },
+	{ "xdp_socket",
+	  { COMMON_SOCK_PERMS, NULL } },
 	{ NULL }
   };
 
-#if PF_MAX > 44
+#if PF_MAX > 45
 #error New address family defined, please update secclass_map.
 #endif

From c0c77d8fb787cfe0c3fca689c2a30d1dad4eaba7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?=
Date: Wed, 2 May 2018 13:01:23 +0200
Subject: [PATCH 26/66] xsk: add user memory registration support sockopt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In this commit the base structure of the AF_XDP address family is set
up. Further, we introduce the ability to register a window of user
memory to the kernel via the XDP_UMEM_REG setsockopt syscall. The
memory window is viewed by an AF_XDP socket as a set of equally large
frames. After a user memory registration all frames are "owned" by the
user application, and not the kernel.

v2: More robust checks on umem creation and unaccount on error.
    Call set_page_dirty_lock on cleanup.
    Simplified xdp_umem_reg.

Co-authored-by: Magnus Karlsson
Signed-off-by: Magnus Karlsson
Signed-off-by: Björn Töpel
Signed-off-by: Alexei Starovoitov
---
 include/net/xdp_sock.h      |  31 +++++
 include/uapi/linux/if_xdp.h |  34 +++++
 net/Makefile                |   1 +
 net/xdp/Makefile            |   2 +
 net/xdp/xdp_umem.c          | 245 ++++++++++++++++++++++++++++++++++++
 net/xdp/xdp_umem.h          |  45 +++++++
 net/xdp/xdp_umem_props.h    |  23 ++++
 net/xdp/xsk.c               | 215 +++++++++++++++++++++++++++++++
 8 files changed, 596 insertions(+)
 create mode 100644 include/net/xdp_sock.h
 create mode 100644 include/uapi/linux/if_xdp.h
 create mode 100644 net/xdp/Makefile
 create mode 100644 net/xdp/xdp_umem.c
 create mode 100644 net/xdp/xdp_umem.h
 create mode 100644 net/xdp/xdp_umem_props.h
 create mode 100644 net/xdp/xsk.c

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
new file mode 100644
index 000000000000..94785f5db13e
--- /dev/null
+++ b/include/net/xdp_sock.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * AF_XDP internal functions
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_XDP_SOCK_H
+#define _LINUX_XDP_SOCK_H
+
+#include <linux/mutex.h>
+#include <net/sock.h>
+
+struct xdp_umem;
+
+struct xdp_sock {
+	/* struct sock must be the first member of struct xdp_sock */
+	struct sock sk;
+	struct xdp_umem *umem;
+	/* Protects multiple processes in the control path */
+	struct mutex mutex;
+};
+
+#endif /* _LINUX_XDP_SOCK_H */
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
new file mode 100644
index 000000000000..41252135a0fe
--- /dev/null
+++ b/include/uapi/linux/if_xdp.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+ *
+ * if_xdp: XDP socket user-space interface
+ * Copyright(c) 2018 Intel Corporation.
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Author(s): Björn Töpel + * Magnus Karlsson + */ + +#ifndef _LINUX_IF_XDP_H +#define _LINUX_IF_XDP_H + +#include + +/* XDP socket options */ +#define XDP_UMEM_REG 3 + +struct xdp_umem_reg { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 frame_size; /* Frame size */ + __u32 frame_headroom; /* Frame head room */ +}; + +#endif /* _LINUX_IF_XDP_H */ diff --git a/net/Makefile b/net/Makefile index a6147c61b174..77aaddedbd29 100644 --- a/net/Makefile +++ b/net/Makefile @@ -85,3 +85,4 @@ obj-y += l3mdev/ endif obj-$(CONFIG_QRTR) += qrtr/ obj-$(CONFIG_NET_NCSI) += ncsi/ +obj-$(CONFIG_XDP_SOCKETS) += xdp/ diff --git a/net/xdp/Makefile b/net/xdp/Makefile new file mode 100644 index 000000000000..a5d736640a0f --- /dev/null +++ b/net/xdp/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o + diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c new file mode 100644 index 000000000000..ec8b3552be44 --- /dev/null +++ b/net/xdp/xdp_umem.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xdp_umem.h" + +#define XDP_UMEM_MIN_FRAME_SIZE 2048 + +int xdp_umem_create(struct xdp_umem **umem) +{ + *umem = kzalloc(sizeof(**umem), GFP_KERNEL); + + if (!(*umem)) + return -ENOMEM; + + return 0; +} + +static void xdp_umem_unpin_pages(struct xdp_umem *umem) +{ + unsigned int i; + + if (umem->pgs) { + for (i = 0; i < umem->npgs; i++) { + struct page *page = umem->pgs[i]; + + set_page_dirty_lock(page); + put_page(page); + } + + kfree(umem->pgs); + umem->pgs = NULL; + } +} + +static void xdp_umem_unaccount_pages(struct xdp_umem *umem) +{ + if (umem->user) { + atomic_long_sub(umem->npgs, &umem->user->locked_vm); + free_uid(umem->user); + } +} + +static void xdp_umem_release(struct xdp_umem *umem) +{ + struct task_struct *task; + struct mm_struct *mm; + + if (umem->pgs) { + xdp_umem_unpin_pages(umem); + + task = get_pid_task(umem->pid, PIDTYPE_PID); + put_pid(umem->pid); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + mmput(mm); + umem->pgs = NULL; + } + + xdp_umem_unaccount_pages(umem); +out: + kfree(umem); +} + +static void xdp_umem_release_deferred(struct work_struct *work) +{ + struct xdp_umem *umem = container_of(work, struct xdp_umem, work); + + xdp_umem_release(umem); +} + +void xdp_get_umem(struct xdp_umem *umem) +{ + atomic_inc(&umem->users); +} + +void xdp_put_umem(struct xdp_umem *umem) +{ + if (!umem) + return; + + if (atomic_dec_and_test(&umem->users)) { + INIT_WORK(&umem->work, xdp_umem_release_deferred); + schedule_work(&umem->work); + } +} + +static int xdp_umem_pin_pages(struct xdp_umem *umem) +{ + unsigned int gup_flags = FOLL_WRITE; + long npgs; + int err; + + umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL); + if (!umem->pgs) + return -ENOMEM; + + down_write(¤t->mm->mmap_sem); + npgs = get_user_pages(umem->address, umem->npgs, + gup_flags, &umem->pgs[0], NULL); + up_write(¤t->mm->mmap_sem); + + if (npgs != umem->npgs) { + if (npgs >= 0) { + umem->npgs = npgs; + err = -ENOMEM; + goto out_pin; + } + err = npgs; + goto out_pgs; + } + return 0; + +out_pin: + xdp_umem_unpin_pages(umem); +out_pgs: + kfree(umem->pgs); + umem->pgs = NULL; + return err; +} + +static int xdp_umem_account_pages(struct xdp_umem *umem) +{ + unsigned long lock_limit, new_npgs, old_npgs; + + if (capable(CAP_IPC_LOCK)) + return 0; + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + umem->user = get_uid(current_user()); + + do { + old_npgs = atomic_long_read(&umem->user->locked_vm); + new_npgs = old_npgs + umem->npgs; + if (new_npgs > lock_limit) { + free_uid(umem->user); + umem->user = NULL; + return -ENOBUFS; + } + } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs, + new_npgs) != old_npgs); + return 0; +} + +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) +{ + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; + u64 addr = mr->addr, size = mr->len; + unsigned int nframes, nfpp; + int size_chk, err; + + if (!umem) + return -EINVAL; + + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { + /* Strictly speaking we could support this, if: + * - huge pages, or* + * - using an IOMMU, or + * - making sure the memory area is consecutive + * but for now, we simply say "computer says no". + */ + return -EINVAL; + } + + if (!is_power_of_2(frame_size)) + return -EINVAL; + + if (!PAGE_ALIGNED(addr)) { + /* Memory area has to be page size aligned. 
For + * simplicity, this might change. + */ + return -EINVAL; + } + + if ((addr + size) < addr) + return -EINVAL; + + nframes = size / frame_size; + if (nframes == 0 || nframes > UINT_MAX) + return -EINVAL; + + nfpp = PAGE_SIZE / frame_size; + if (nframes < nfpp || nframes % nfpp) + return -EINVAL; + + frame_headroom = ALIGN(frame_headroom, 64); + + size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM; + if (size_chk < 0) + return -EINVAL; + + umem->pid = get_task_pid(current, PIDTYPE_PID); + umem->size = (size_t)size; + umem->address = (unsigned long)addr; + umem->props.frame_size = frame_size; + umem->props.nframes = nframes; + umem->frame_headroom = frame_headroom; + umem->npgs = size / PAGE_SIZE; + umem->pgs = NULL; + umem->user = NULL; + + umem->frame_size_log2 = ilog2(frame_size); + umem->nfpp_mask = nfpp - 1; + umem->nfpplog2 = ilog2(nfpp); + atomic_set(&umem->users, 1); + + err = xdp_umem_account_pages(umem); + if (err) + goto out; + + err = xdp_umem_pin_pages(umem); + if (err) + goto out_account; + return 0; + +out_account: + xdp_umem_unaccount_pages(umem); +out: + put_pid(umem->pid); + return err; +} diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h new file mode 100644 index 000000000000..4597ae81a221 --- /dev/null +++ b/net/xdp/xdp_umem.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 + * XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef XDP_UMEM_H_ +#define XDP_UMEM_H_ + +#include +#include +#include + +#include "xdp_umem_props.h" + +struct xdp_umem { + struct page **pgs; + struct xdp_umem_props props; + u32 npgs; + u32 frame_headroom; + u32 nfpp_mask; + u32 nfpplog2; + u32 frame_size_log2; + struct user_struct *user; + struct pid *pid; + unsigned long address; + size_t size; + atomic_t users; + struct work_struct work; +}; + +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); +void xdp_get_umem(struct xdp_umem *umem); +void xdp_put_umem(struct xdp_umem *umem); +int xdp_umem_create(struct xdp_umem **umem); + +#endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h new file mode 100644 index 000000000000..77fb5daf29f3 --- /dev/null +++ b/net/xdp/xdp_umem_props.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 + * XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef XDP_UMEM_PROPS_H_ +#define XDP_UMEM_PROPS_H_ + +struct xdp_umem_props { + u32 frame_size; + u32 nframes; +}; + +#endif /* XDP_UMEM_PROPS_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c new file mode 100644 index 000000000000..84e0e867febb --- /dev/null +++ b/net/xdp/xsk.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP sockets + * + * AF_XDP sockets allows a channel between XDP programs and userspace + * applications. + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Author(s): Björn Töpel + * Magnus Karlsson + */ + +#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xdp_umem.h" + +static struct xdp_sock *xdp_sk(struct sock *sk) +{ + return (struct xdp_sock *)sk; +} + +static int xsk_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct net *net; + + if (!sk) + return 0; + + net = sock_net(sk); + + local_bh_disable(); + sock_prot_inuse_add(net, sk->sk_prot, -1); + local_bh_enable(); + + sock_orphan(sk); + sock->sk = NULL; + + sk_refcnt_debug_release(sk); + sock_put(sk); + + return 0; +} + +static int xsk_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + int err; + + if (level != SOL_XDP) + return -ENOPROTOOPT; + + switch (optname) { + case XDP_UMEM_REG: + { + struct xdp_umem_reg mr; + struct xdp_umem *umem; + + if (xs->umem) + return -EBUSY; + + if (copy_from_user(&mr, optval, sizeof(mr))) + return -EFAULT; + + mutex_lock(&xs->mutex); + err = xdp_umem_create(&umem); + + err = xdp_umem_reg(umem, &mr); + if (err) { + kfree(umem); + mutex_unlock(&xs->mutex); + return err; + } + + /* Make sure umem is ready before it can be seen by others */ + smp_wmb(); + + xs->umem = umem; + mutex_unlock(&xs->mutex); + return 0; + } + default: + break; + } + + return -ENOPROTOOPT; +} + +static struct proto xsk_proto = { + .name = "XDP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct xdp_sock), +}; + +static const struct proto_ops xsk_proto_ops = { + .family = PF_XDP, + .owner = THIS_MODULE, + .release = xsk_release, + .bind = sock_no_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = sock_no_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = xsk_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = sock_no_sendmsg, + .recvmsg = sock_no_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static void xsk_destruct(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + + if (!sock_flag(sk, SOCK_DEAD)) + return; + + xdp_put_umem(xs->umem); + + sk_refcnt_debug_dec(sk); +} + +static int xsk_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + struct xdp_sock *xs; + + if (!ns_capable(net->user_ns, CAP_NET_RAW)) + return -EPERM; + if (sock->type != SOCK_RAW) + 
return -ESOCKTNOSUPPORT; + + if (protocol) + return -EPROTONOSUPPORT; + + sock->state = SS_UNCONNECTED; + + sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); + if (!sk) + return -ENOBUFS; + + sock->ops = &xsk_proto_ops; + + sock_init_data(sock, sk); + + sk->sk_family = PF_XDP; + + sk->sk_destruct = xsk_destruct; + sk_refcnt_debug_inc(sk); + + xs = xdp_sk(sk); + mutex_init(&xs->mutex); + + local_bh_disable(); + sock_prot_inuse_add(net, &xsk_proto, 1); + local_bh_enable(); + + return 0; +} + +static const struct net_proto_family xsk_family_ops = { + .family = PF_XDP, + .create = xsk_create, + .owner = THIS_MODULE, +}; + +static int __init xsk_init(void) +{ + int err; + + err = proto_register(&xsk_proto, 0 /* no slab */); + if (err) + goto out; + + err = sock_register(&xsk_family_ops); + if (err) + goto out_proto; + + return 0; + +out_proto: + proto_unregister(&xsk_proto); +out: + return err; +} + +fs_initcall(xsk_init); From 423f38329d267969130fb6f2c685f73d72687558 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:24 +0200 Subject: [PATCH 27/66] xsk: add umem fill queue support and mmap Here, we add another setsockopt for registered user memory (umem) called XDP_UMEM_FILL_QUEUE. Using this socket option, the process can ask the kernel to allocate a queue (ring buffer) and also mmap it (XDP_UMEM_PGOFF_FILL_QUEUE) into the process. The queue is used to explicitly pass ownership of umem frames from the user process to the kernel. These frames will in a later patch be filled in with Rx packet data by the kernel. v2: Fixed potential crash in xsk_mmap. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/uapi/linux/if_xdp.h | 15 +++++++++ net/xdp/Makefile | 2 +- net/xdp/xdp_umem.c | 5 +++ net/xdp/xdp_umem.h | 2 ++ net/xdp/xsk.c | 65 ++++++++++++++++++++++++++++++++++++- net/xdp/xsk_queue.c | 58 +++++++++++++++++++++++++++++++++ net/xdp/xsk_queue.h | 38 ++++++++++++++++++++++ 7 files changed, 183 insertions(+), 2 deletions(-) create mode 100644 net/xdp/xsk_queue.c create mode 100644 net/xdp/xsk_queue.h diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 41252135a0fe..975661e1baca 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -23,6 +23,7 @@ /* XDP socket options */ #define XDP_UMEM_REG 3 +#define XDP_UMEM_FILL_RING 4 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -31,4 +32,18 @@ struct xdp_umem_reg { __u32 frame_headroom; /* Frame head room */ }; +/* Pgoff for mmaping the rings */ +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000 + +struct xdp_ring { + __u32 producer __attribute__((aligned(64))); + __u32 consumer __attribute__((aligned(64))); +}; + +/* Used for the fill and completion queues for buffers */ +struct xdp_umem_ring { + struct xdp_ring ptrs; + __u32 desc[0] __attribute__((aligned(64))); +}; + #endif /* _LINUX_IF_XDP_H */ diff --git a/net/xdp/Makefile b/net/xdp/Makefile index a5d736640a0f..074fb2b2d51c 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,2 +1,2 @@ -obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index ec8b3552be44..e1f627d0cc1c 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -65,6 +65,11 @@ static void xdp_umem_release(struct xdp_umem *umem) struct task_struct *task; struct mm_struct *mm; + if (umem->fq) { + xskq_destroy(umem->fq); + umem->fq = NULL; + } + if (umem->pgs) { xdp_umem_unpin_pages(umem); diff --git 
a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 4597ae81a221..25634b8a5c6f 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -19,9 +19,11 @@ #include #include +#include "xsk_queue.h" #include "xdp_umem_props.h" struct xdp_umem { + struct xsk_queue *fq; struct page **pgs; struct xdp_umem_props props; u32 npgs; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 84e0e867febb..da67a3c5c1c9 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -32,6 +32,7 @@ #include #include +#include "xsk_queue.h" #include "xdp_umem.h" static struct xdp_sock *xdp_sk(struct sock *sk) @@ -39,6 +40,21 @@ static struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } +static int xsk_init_queue(u32 entries, struct xsk_queue **queue) +{ + struct xsk_queue *q; + + if (entries == 0 || *queue || !is_power_of_2(entries)) + return -EINVAL; + + q = xskq_create(entries); + if (!q) + return -ENOMEM; + + *queue = q; + return 0; +} + static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -101,6 +117,23 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, mutex_unlock(&xs->mutex); return 0; } + case XDP_UMEM_FILL_RING: + { + struct xsk_queue **q; + int entries; + + if (!xs->umem) + return -EINVAL; + + if (copy_from_user(&entries, optval, sizeof(entries))) + return -EFAULT; + + mutex_lock(&xs->mutex); + q = &xs->umem->fq; + err = xsk_init_queue(entries, q); + mutex_unlock(&xs->mutex); + return err; + } default: break; } @@ -108,6 +141,36 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -ENOPROTOOPT; } +static int xsk_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct xdp_sock *xs = xdp_sk(sock->sk); + struct xsk_queue *q = NULL; + unsigned long pfn; + struct page *qpg; + + if (!xs->umem) + return -EINVAL; + + if (offset == XDP_UMEM_PGOFF_FILL_RING) + q = xs->umem->fq; + else + return -EINVAL; + + if (!q) + return -EINVAL; + + qpg = virt_to_head_page(q->ring); + if (size > (PAGE_SIZE << compound_order(qpg))) + return -EINVAL; + + pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; + return remap_pfn_range(vma, vma->vm_start, pfn, + size, vma->vm_page_prot); +} + static struct proto xsk_proto = { .name = "XDP", .owner = THIS_MODULE, @@ -131,7 +194,7 @@ static const struct proto_ops xsk_proto_ops = { .getsockopt = sock_no_getsockopt, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, - .mmap = sock_no_mmap, + .mmap = xsk_mmap, .sendpage = sock_no_sendpage, }; diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c new file mode 100644 index 000000000000..23da4f29d3fb --- /dev/null +++ b/net/xdp/xsk_queue.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP user-space ring structure + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */
+
+#include <linux/slab.h>
+
+#include "xsk_queue.h"
+
+static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
+{
+	return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32);
+}
+
+struct xsk_queue *xskq_create(u32 nentries)
+{
+	struct xsk_queue *q;
+	gfp_t gfp_flags;
+	size_t size;
+
+	q = kzalloc(sizeof(*q), GFP_KERNEL);
+	if (!q)
+		return NULL;
+
+	q->nentries = nentries;
+	q->ring_mask = nentries - 1;
+
+	gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
+		    __GFP_COMP | __GFP_NORETRY;
+	size = xskq_umem_get_ring_size(q);
+
+	q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
+						      get_order(size));
+	if (!q->ring) {
+		kfree(q);
+		return NULL;
+	}
+
+	return q;
+}
+
+void xskq_destroy(struct xsk_queue *q)
+{
+	if (!q)
+		return;
+
+	page_frag_free(q->ring);
+	kfree(q);
+}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
new file mode 100644
index 000000000000..7eb556bf73be
--- /dev/null
+++ b/net/xdp/xsk_queue.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_XSK_QUEUE_H
+#define _LINUX_XSK_QUEUE_H
+
+#include <linux/types.h>
+#include <linux/if_xdp.h>
+
+#include "xdp_umem_props.h"
+
+struct xsk_queue {
+	struct xdp_umem_props umem_props;
+	u32 ring_mask;
+	u32 nentries;
+	u32 prod_head;
+	u32 prod_tail;
+	u32 cons_head;
+	u32 cons_tail;
+	struct xdp_ring *ring;
+	u64 invalid_descs;
+};
+
+struct xsk_queue *xskq_create(u32 nentries);
+void xskq_destroy(struct xsk_queue *q);
+
+#endif /* _LINUX_XSK_QUEUE_H */

From b9b6b68e8abd101be6eb5330e4999218c696d1e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?=
Date: Wed, 2 May 2018 13:01:25 +0200
Subject: [PATCH 28/66] xsk: add Rx queue setup and mmap support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Another setsockopt (XDP_RX_RING) is added to let the process allocate
a queue, where the kernel can pass completed Rx frames from the kernel
to the user process. The mmapping of the queue is done using the
XDP_PGOFF_RX_RING offset.
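
To illustrate the intended usage, a minimal user-space sketch for
creating an Rx ring and mapping it into the process could look as
follows. This is a hypothetical example, not part of this patch: the
fallback defines mirror the values added earlier in this series, the
ring size is an arbitrary power of two, and error handling is elided:

    #include <sys/socket.h>
    #include <sys/mman.h>
    #include <linux/if_xdp.h>

    #ifndef AF_XDP
    #define AF_XDP 44      /* not yet known to libc headers */
    #endif
    #ifndef SOL_XDP
    #define SOL_XDP 283
    #endif

    void rx_ring_setup_example(void)
    {
            int fd = socket(AF_XDP, SOCK_RAW, 0);
            int entries = 1024;        /* must be a power of two */

            setsockopt(fd, SOL_XDP, XDP_RX_RING, &entries,
                       sizeof(entries));

            /* The ring is the producer/consumer pair followed by the
             * descriptor array; later patches make the kernel advance
             * rx->ptrs.producer as frames complete.
             */
            struct xdp_rxtx_ring *rx =
                    mmap(NULL, sizeof(struct xdp_rxtx_ring) +
                         entries * sizeof(struct xdp_desc),
                         PROT_READ | PROT_WRITE,
                         MAP_SHARED | MAP_POPULATE,
                         fd, XDP_PGOFF_RX_RING);
    }
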
Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 4 ++++ include/uapi/linux/if_xdp.h | 16 +++++++++++++++ net/xdp/xsk.c | 41 +++++++++++++++++++++++++++++-------- net/xdp/xsk_queue.c | 11 ++++++++-- net/xdp/xsk_queue.h | 2 +- 5 files changed, 62 insertions(+), 12 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 94785f5db13e..db9a321de087 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -18,11 +18,15 @@ #include #include +struct net_device; +struct xsk_queue; struct xdp_umem; struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; + struct xsk_queue *rx; + struct net_device *dev; struct xdp_umem *umem; /* Protects multiple processes in the control path */ struct mutex mutex; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 975661e1baca..65324558829d 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -22,6 +22,7 @@ #include /* XDP socket options */ +#define XDP_RX_RING 1 #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 @@ -33,13 +34,28 @@ struct xdp_umem_reg { }; /* Pgoff for mmaping the rings */ +#define XDP_PGOFF_RX_RING 0 #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 +struct xdp_desc { + __u32 idx; + __u32 len; + __u16 offset; + __u8 flags; + __u8 padding[5]; +}; + struct xdp_ring { __u32 producer __attribute__((aligned(64))); __u32 consumer __attribute__((aligned(64))); }; +/* Used for the RX and TX queues for packets */ +struct xdp_rxtx_ring { + struct xdp_ring ptrs; + struct xdp_desc desc[0] __attribute__((aligned(64))); +}; + /* Used for the fill and completion queues for buffers */ struct xdp_umem_ring { struct xdp_ring ptrs; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index da67a3c5c1c9..92bd9b7e548f 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "xsk_queue.h" #include "xdp_umem.h" @@ -40,14 +41,15 @@ static struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } -static int xsk_init_queue(u32 entries, struct xsk_queue **queue) +static int xsk_init_queue(u32 entries, struct xsk_queue **queue, + bool umem_queue) { struct xsk_queue *q; if (entries == 0 || *queue || !is_power_of_2(entries)) return -EINVAL; - q = xskq_create(entries); + q = xskq_create(entries, umem_queue); if (!q) return -ENOMEM; @@ -89,6 +91,22 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -ENOPROTOOPT; switch (optname) { + case XDP_RX_RING: + { + struct xsk_queue **q; + int entries; + + if (optlen < sizeof(entries)) + return -EINVAL; + if (copy_from_user(&entries, optval, sizeof(entries))) + return -EFAULT; + + mutex_lock(&xs->mutex); + q = &xs->rx; + err = xsk_init_queue(entries, q, false); + mutex_unlock(&xs->mutex); + return err; + } case XDP_UMEM_REG: { struct xdp_umem_reg mr; @@ -130,7 +148,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, mutex_lock(&xs->mutex); q = &xs->umem->fq; - err = xsk_init_queue(entries, q); + err = xsk_init_queue(entries, q, true); mutex_unlock(&xs->mutex); return err; } @@ -151,13 +169,17 @@ static int xsk_mmap(struct file *file, struct socket *sock, unsigned long pfn; struct page *qpg; - if (!xs->umem) - return -EINVAL; + if (offset == XDP_PGOFF_RX_RING) { + q = xs->rx; + } else { + if (!xs->umem) + return -EINVAL; - if (offset == XDP_UMEM_PGOFF_FILL_RING) - q = xs->umem->fq; - else - return -EINVAL; + if (offset == XDP_UMEM_PGOFF_FILL_RING) + q = xs->umem->fq; 
+ else + return -EINVAL; + } if (!q) return -EINVAL; @@ -205,6 +227,7 @@ static void xsk_destruct(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) return; + xskq_destroy(xs->rx); xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 23da4f29d3fb..894f9f89afc7 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -21,7 +21,13 @@ static u32 xskq_umem_get_ring_size(struct xsk_queue *q) return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32); } -struct xsk_queue *xskq_create(u32 nentries) +static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) +{ + return (sizeof(struct xdp_ring) + + q->nentries * sizeof(struct xdp_desc)); +} + +struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) { struct xsk_queue *q; gfp_t gfp_flags; @@ -36,7 +42,8 @@ struct xsk_queue *xskq_create(u32 nentries) gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | __GFP_NORETRY; - size = xskq_umem_get_ring_size(q); + size = umem_queue ? xskq_umem_get_ring_size(q) : + xskq_rxtx_get_ring_size(q); q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags, get_order(size)); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 7eb556bf73be..5439fa381763 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -32,7 +32,7 @@ struct xsk_queue { u64 invalid_descs; }; -struct xsk_queue *xskq_create(u32 nentries); +struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q); #endif /* _LINUX_XSK_QUEUE_H */ From 965a990984432cd01a9eb3514c64d86f56704295 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:26 +0200 Subject: [PATCH 29/66] xsk: add support for bind for Rx Here, the bind syscall is added. Binding an AF_XDP socket means associating the socket with an umem, a netdev and a queue index. This can be done in two ways. The first way is creating a "socket from scratch": create the umem using the XDP_UMEM_REG setsockopt and an associated fill queue with XDP_UMEM_FILL_RING, create the Rx queue using the XDP_RX_RING setsockopt, and call bind passing the ifindex and queue index ("channel" in ethtool speak). The second way to bind a socket is to simply skip the umem/netdev/queue index and pass an already set up AF_XDP socket instead. The new socket will then have the same umem/netdev/queue index as the parent, so it will share the same umem. You must also set the flags field in the socket address to XDP_SHARED_UMEM. v2: Use PTR_ERR instead of passing error variable explicitly.
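To illustrate the two bind flavours described above, an assumed user-space sketch (not part of the patch, and taking AF_XDP as visible in the system headers):

#include <linux/if_xdp.h>
#include <string.h>
#include <sys/socket.h>

/* Bind sfd to a netdev/queue; when shared_fd >= 0 the umem, netdev and
 * queue index are instead inherited from that already set up socket.
 */
static int bind_xsk(int sfd, __u32 ifindex, __u32 queue_id, int shared_fd)
{
	struct sockaddr_xdp sxdp;

	memset(&sxdp, 0, sizeof(sxdp));
	sxdp.sxdp_family = AF_XDP;
	sxdp.sxdp_ifindex = ifindex;   /* e.g. from if_nametoindex() */
	sxdp.sxdp_queue_id = queue_id; /* "channel" in ethtool speak */

	if (shared_fd >= 0) {
		sxdp.sxdp_flags = XDP_SHARED_UMEM;
		sxdp.sxdp_shared_umem_fd = shared_fd;
	}

	return bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp));
}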
Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 1 + include/uapi/linux/if_xdp.h | 11 ++++ net/xdp/xdp_umem.c | 5 ++ net/xdp/xdp_umem.h | 1 + net/xdp/xsk.c | 124 +++++++++++++++++++++++++++++++++++- net/xdp/xsk_queue.c | 8 +++ net/xdp/xsk_queue.h | 1 + 7 files changed, 150 insertions(+), 1 deletion(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index db9a321de087..85d02512f59b 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -28,6 +28,7 @@ struct xdp_sock { struct xsk_queue *rx; struct net_device *dev; struct xdp_umem *umem; + u16 queue_id; /* Protects multiple processes in the control path */ struct mutex mutex; }; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 65324558829d..e5091881f776 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -21,6 +21,17 @@ #include +/* Options for the sxdp_flags field */ +#define XDP_SHARED_UMEM 1 + +struct sockaddr_xdp { + __u16 sxdp_family; + __u32 sxdp_ifindex; + __u32 sxdp_queue_id; + __u32 sxdp_shared_umem_fd; + __u16 sxdp_flags; +}; + /* XDP socket options */ #define XDP_RX_RING 1 #define XDP_UMEM_REG 3 diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index e1f627d0cc1c..9bac1ad570fa 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -248,3 +248,8 @@ out: put_pid(umem->pid); return err; } + +bool xdp_umem_validate_queues(struct xdp_umem *umem) +{ + return umem->fq; +} diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 25634b8a5c6f..b13133e9c501 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -39,6 +39,7 @@ struct xdp_umem { struct work_struct work; }; +bool xdp_umem_validate_queues(struct xdp_umem *umem); int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 92bd9b7e548f..bf2c97b87992 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -57,9 +57,18 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, return 0; } +static void __xsk_release(struct xdp_sock *xs) +{ + /* Wait for driver to stop using the xdp socket. 
*/ + synchronize_net(); + + dev_put(xs->dev); +} + static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); struct net *net; if (!sk) @@ -71,6 +80,11 @@ static int xsk_release(struct socket *sock) sock_prot_inuse_add(net, sk->sk_prot, -1); local_bh_enable(); + if (xs->dev) { + __xsk_release(xs); + xs->dev = NULL; + } + sock_orphan(sk); sock->sk = NULL; @@ -80,6 +94,114 @@ static int xsk_release(struct socket *sock) return 0; } +static struct socket *xsk_lookup_xsk_from_fd(int fd) +{ + struct socket *sock; + int err; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return ERR_PTR(-ENOTSOCK); + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return ERR_PTR(-ENOPROTOOPT); + } + + return sock; +} + +static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; + struct sock *sk = sock->sk; + struct net_device *dev, *dev_curr; + struct xdp_sock *xs = xdp_sk(sk); + struct xdp_umem *old_umem = NULL; + int err = 0; + + if (addr_len < sizeof(struct sockaddr_xdp)) + return -EINVAL; + if (sxdp->sxdp_family != AF_XDP) + return -EINVAL; + + mutex_lock(&xs->mutex); + dev_curr = xs->dev; + dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); + if (!dev) { + err = -ENODEV; + goto out_release; + } + + if (!xs->rx) { + err = -EINVAL; + goto out_unlock; + } + + if (sxdp->sxdp_queue_id >= dev->num_rx_queues) { + err = -EINVAL; + goto out_unlock; + } + + if (sxdp->sxdp_flags & XDP_SHARED_UMEM) { + struct xdp_sock *umem_xs; + struct socket *sock; + + if (xs->umem) { + /* We have already our own. */ + err = -EINVAL; + goto out_unlock; + } + + sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); + if (IS_ERR(sock)) { + err = PTR_ERR(sock); + goto out_unlock; + } + + umem_xs = xdp_sk(sock->sk); + if (!umem_xs->umem) { + /* No umem to inherit. */ + err = -EBADF; + sockfd_put(sock); + goto out_unlock; + } else if (umem_xs->dev != dev || + umem_xs->queue_id != sxdp->sxdp_queue_id) { + err = -EINVAL; + sockfd_put(sock); + goto out_unlock; + } + + xdp_get_umem(umem_xs->umem); + old_umem = xs->umem; + xs->umem = umem_xs->umem; + sockfd_put(sock); + } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { + err = -EINVAL; + goto out_unlock; + } + + /* Rebind? 
*/ + if (dev_curr && (dev_curr != dev || + xs->queue_id != sxdp->sxdp_queue_id)) { + __xsk_release(xs); + if (old_umem) + xdp_put_umem(old_umem); + } + + xs->dev = dev; + xs->queue_id = sxdp->sxdp_queue_id; + + xskq_set_umem(xs->rx, &xs->umem->props); + +out_unlock: + if (err) + dev_put(dev); +out_release: + mutex_unlock(&xs->mutex); + return err; +} + static int xsk_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -203,7 +325,7 @@ static const struct proto_ops xsk_proto_ops = { .family = PF_XDP, .owner = THIS_MODULE, .release = xsk_release, - .bind = sock_no_bind, + .bind = xsk_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 894f9f89afc7..d012e5e23591 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -16,6 +16,14 @@ #include "xsk_queue.h" +void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props) +{ + if (!q) + return; + + q->umem_props = *umem_props; +} + static u32 xskq_umem_get_ring_size(struct xsk_queue *q) { return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 5439fa381763..9ddd2ee07a84 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -32,6 +32,7 @@ struct xsk_queue { u64 invalid_descs; }; +void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q); From c497176cb2e478f0a5713b0e05f242276e3194b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:27 +0200 Subject: [PATCH 30/66] xsk: add Rx receive functions and poll support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Here the actual receive functions of AF_XDP are implemented, that in a later commit, will be called from the XDP layers. There's one set of functions for the XDP_DRV side and another for XDP_SKB (generic). A new XDP API, xdp_return_buff, is also introduced. Adding xdp_return_buff, which is analogous to xdp_return_frame, but acts upon an struct xdp_buff. The API will be used by AF_XDP in future commits. Support for the poll syscall is also implemented. v2: xskq_validate_id did not update cons_tail. The entries variable was calculated twice in xskq_nb_avail. Squashed xdp_return_buff commit. 
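The user-space counterpart of the kernel-side producer above could look roughly like this sketch (illustrative only: RX_RING_SIZE is an assumed ring size, and the barrier discipline the kernel uses is reduced to a comment):

#include <poll.h>
#include <linux/if_xdp.h>

#define RX_RING_SIZE 1024 /* assumption: whatever was passed to XDP_RX_RING */

/* Wait for frames and drain the mmapped Rx ring; returns bytes seen.
 * cached_cons is the application's local consumer index.
 */
static unsigned long rx_drain(int sfd, struct xdp_rxtx_ring *rx,
			      __u32 *cached_cons)
{
	struct pollfd pfd = { .fd = sfd, .events = POLLIN };
	unsigned long bytes = 0;

	if (poll(&pfd, 1, -1) <= 0)
		return 0;

	/* A real consumer must order the producer read against the
	 * descriptor reads (an smp_rmb() equivalent), as the kernel does.
	 */
	while (*cached_cons != rx->ptrs.producer) {
		struct xdp_desc *d =
			&rx->desc[(*cached_cons)++ & (RX_RING_SIZE - 1)];

		bytes += d->len; /* d->idx names the umem frame */
	}

	rx->ptrs.consumer = *cached_cons; /* hand entries back to the kernel */
	return bytes;
}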
Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 1 + include/net/xdp_sock.h | 22 ++++++++ net/core/xdp.c | 15 ++++-- net/xdp/xdp_umem.h | 18 +++++++ net/xdp/xsk.c | 73 +++++++++++++++++++++++++- net/xdp/xsk_queue.h | 114 ++++++++++++++++++++++++++++++++++++++++- 6 files changed, 238 insertions(+), 5 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 137ad5f9f40f..0b689cf561c7 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) } void xdp_return_frame(struct xdp_frame *xdpf); +void xdp_return_buff(struct xdp_buff *xdp); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 85d02512f59b..a0342dff6a4d 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -31,6 +31,28 @@ struct xdp_sock { u16 queue_id; /* Protects multiple processes in the control path */ struct mutex mutex; + u64 rx_dropped; }; +struct xdp_buff; +#ifdef CONFIG_XDP_SOCKETS +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); +int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); +void xsk_flush(struct xdp_sock *xs); +#else +static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + return -ENOTSUPP; +} + +static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + return -ENOTSUPP; +} + +static inline void xsk_flush(struct xdp_sock *xs) +{ +} +#endif /* CONFIG_XDP_SOCKETS */ + #endif /* _LINUX_XDP_SOCK_H */ diff --git a/net/core/xdp.c b/net/core/xdp.c index 0c86b53a3a63..bf6758f74339 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -308,11 +308,9 @@ err: } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); -void xdp_return_frame(struct xdp_frame *xdpf) +static void xdp_return(void *data, struct xdp_mem_info *mem) { - struct xdp_mem_info *mem = &xdpf->mem; struct xdp_mem_allocator *xa; - void *data = xdpf->data; struct page *page; switch (mem->type) { @@ -339,4 +337,15 @@ void xdp_return_frame(struct xdp_frame *xdpf) break; } } + +void xdp_return_frame(struct xdp_frame *xdpf) +{ + xdp_return(xdpf->data, &xdpf->mem); +} EXPORT_SYMBOL_GPL(xdp_return_frame); + +void xdp_return_buff(struct xdp_buff *xdp) +{ + xdp_return(xdp->data, &xdp->rxq->mem); +} +EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index b13133e9c501..c7378a11721f 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -39,6 +39,24 @@ struct xdp_umem { struct work_struct work; }; +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u32 idx) +{ + u64 pg, off; + char *data; + + pg = idx >> umem->nfpplog2; + off = (idx & umem->nfpp_mask) << umem->frame_size_log2; + + data = page_address(umem->pgs[pg]); + return data + off; +} + +static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem, + u32 idx) +{ + return xdp_umem_get_data(umem, idx) + umem->frame_headroom; +} + bool xdp_umem_validate_queues(struct xdp_umem *umem); int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); void xdp_get_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index bf2c97b87992..4e1e6c581e1d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -41,6 +41,74 @@ static struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + u32 *id, len = xdp->data_end - xdp->data; + void *buffer; + int err = 0; + + if 
(xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + return -EINVAL; + + id = xskq_peek_id(xs->umem->fq); + if (!id) + return -ENOSPC; + + buffer = xdp_umem_get_data_with_headroom(xs->umem, *id); + memcpy(buffer, xdp->data, len); + err = xskq_produce_batch_desc(xs->rx, *id, len, + xs->umem->frame_headroom); + if (!err) + xskq_discard_id(xs->umem->fq); + + return err; +} + +int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + int err; + + err = __xsk_rcv(xs, xdp); + if (likely(!err)) + xdp_return_buff(xdp); + else + xs->rx_dropped++; + + return err; +} + +void xsk_flush(struct xdp_sock *xs) +{ + xskq_produce_flush_desc(xs->rx); + xs->sk.sk_data_ready(&xs->sk); +} + +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + int err; + + err = __xsk_rcv(xs, xdp); + if (!err) + xsk_flush(xs); + else + xs->rx_dropped++; + + return err; +} + +static unsigned int xsk_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + unsigned int mask = datagram_poll(file, sock, wait); + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + + if (xs->rx && !xskq_empty_desc(xs->rx)) + mask |= POLLIN | POLLRDNORM; + + return mask; +} + static int xsk_init_queue(u32 entries, struct xsk_queue **queue, bool umem_queue) { @@ -179,6 +247,9 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { err = -EINVAL; goto out_unlock; + } else { + /* This xsk has its own umem. */ + xskq_set_umem(xs->umem->fq, &xs->umem->props); } /* Rebind? */ @@ -330,7 +401,7 @@ static const struct proto_ops xsk_proto_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll = sock_no_poll, + .poll = xsk_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 9ddd2ee07a84..0a9b92b4f93a 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -20,6 +20,8 @@ #include "xdp_umem_props.h" +#define RX_BATCH_SIZE 16 + struct xsk_queue { struct xdp_umem_props umem_props; u32 ring_mask; @@ -32,8 +34,118 @@ struct xsk_queue { u64 invalid_descs; }; +/* Common functions operating for both RXTX and umem queues */ + +static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) +{ + u32 entries = q->prod_tail - q->cons_tail; + + if (entries == 0) { + /* Refresh the local pointer */ + q->prod_tail = READ_ONCE(q->ring->producer); + entries = q->prod_tail - q->cons_tail; + } + + return (entries > dcnt) ? 
dcnt : entries; +} + +static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) +{ + u32 free_entries = q->nentries - (producer - q->cons_tail); + + if (free_entries >= dcnt) + return free_entries; + + /* Refresh the local tail pointer */ + q->cons_tail = READ_ONCE(q->ring->consumer); + return q->nentries - (producer - q->cons_tail); +} + +/* UMEM queue */ + +static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx) +{ + if (unlikely(idx >= q->umem_props.nframes)) { + q->invalid_descs++; + return false; + } + return true; +} + +static inline u32 *xskq_validate_id(struct xsk_queue *q) +{ + while (q->cons_tail != q->cons_head) { + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + unsigned int idx = q->cons_tail & q->ring_mask; + + if (xskq_is_valid_id(q, ring->desc[idx])) + return &ring->desc[idx]; + + q->cons_tail++; + } + + return NULL; +} + +static inline u32 *xskq_peek_id(struct xsk_queue *q) +{ + struct xdp_umem_ring *ring; + + if (q->cons_tail == q->cons_head) { + WRITE_ONCE(q->ring->consumer, q->cons_tail); + q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + + /* Order consumer and data */ + smp_rmb(); + + return xskq_validate_id(q); + } + + ring = (struct xdp_umem_ring *)q->ring; + return &ring->desc[q->cons_tail & q->ring_mask]; +} + +static inline void xskq_discard_id(struct xsk_queue *q) +{ + q->cons_tail++; + (void)xskq_validate_id(q); +} + +/* Rx queue */ + +static inline int xskq_produce_batch_desc(struct xsk_queue *q, + u32 id, u32 len, u16 offset) +{ + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + unsigned int idx; + + if (xskq_nb_free(q, q->prod_head, 1) == 0) + return -ENOSPC; + + idx = (q->prod_head++) & q->ring_mask; + ring->desc[idx].idx = id; + ring->desc[idx].len = len; + ring->desc[idx].offset = offset; + + return 0; +} + +static inline void xskq_produce_flush_desc(struct xsk_queue *q) +{ + /* Order producer and data */ + smp_wmb(); + + q->prod_tail = q->prod_head, + WRITE_ONCE(q->ring->producer, q->prod_tail); +} + +static inline bool xskq_empty_desc(struct xsk_queue *q) +{ + return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries); +} + void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); -void xskq_destroy(struct xsk_queue *q); +void xskq_destroy(struct xsk_queue *q_ops); #endif /* _LINUX_XSK_QUEUE_H */ From fbfc504a24f53f7ebe128ab55cb5dba634f4ece8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:28 +0200 Subject: [PATCH 31/66] bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xskmap is yet another BPF map, very much inspired by dev/cpu/sockmap, and is a holder of AF_XDP sockets. A user application adds AF_XDP sockets into the map, and by using the bpf_redirect_map helper, an XDP program can redirect XDP frames to an AF_XDP socket. Note that a socket that is bound to certain ifindex/queue index will *only* accept XDP frames from that netdev/queue index. If an XDP program tries to redirect from a netdev/queue index other than what the socket is bound to, the frame will not be received on the socket. A socket can reside in multiple maps. v3: Fixed race and simplified code. v2: Removed one indirection in map lookup. 
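A minimal XDP program using the new map type could look like the sketch below (samples/bpf style; essentially what the sample added later in this series does):

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

/* User space installs AF_XDP socket fds here, keyed by Rx queue index. */
struct bpf_map_def SEC("maps") xsks_map = {
	.type = BPF_MAP_TYPE_XSKMAP,
	.key_size = sizeof(int),
	.value_size = sizeof(int),
	.max_entries = 4,
};

SEC("xdp_sock")
int xdp_sock_prog(struct xdp_md *ctx)
{
	/* Redirect the frame to the socket bound to this queue, if any. */
	return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, 0);
}

char _license[] SEC("license") = "GPL";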
Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 25 ++++ include/linux/bpf_types.h | 3 + include/net/xdp_sock.h | 7 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 3 + kernel/bpf/verifier.c | 8 +- kernel/bpf/xskmap.c | 239 ++++++++++++++++++++++++++++++++++++++ net/xdp/xsk.c | 5 + 8 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 kernel/bpf/xskmap.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c553f6f9c6b0..68ecdb4eea09 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -676,6 +676,31 @@ static inline int sock_map_prog(struct bpf_map *map, } #endif +#if defined(CONFIG_XDP_SOCKETS) +struct xdp_sock; +struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key); +int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs); +void __xsk_map_flush(struct bpf_map *map); +#else +struct xdp_sock; +static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, + u32 key) +{ + return NULL; +} + +static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs) +{ + return -EOPNOTSUPP; +} + +static inline void __xsk_map_flush(struct bpf_map *map) +{ +} +#endif + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2b28fcf6f6ae..d7df1b323082 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -49,4 +49,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) +#if defined(CONFIG_XDP_SOCKETS) +BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) +#endif #endif diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index a0342dff6a4d..ce3a2ab16b8f 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -28,6 +28,7 @@ struct xdp_sock { struct xsk_queue *rx; struct net_device *dev; struct xdp_umem *umem; + struct list_head flush_node; u16 queue_id; /* Protects multiple processes in the control path */ struct mutex mutex; @@ -39,6 +40,7 @@ struct xdp_buff; int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { @@ -53,6 +55,11 @@ static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static inline void xsk_flush(struct xdp_sock *xs) { } + +static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) +{ + return false; +} #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8daef7326bb7..a3a495052511 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -116,6 +116,7 @@ enum bpf_map_type { BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, + BPF_MAP_TYPE_XSKMAP, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 35c485fa9ea3..f27f5496d6fe 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -8,6 +8,9 @@ obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +ifeq ($(CONFIG_XDP_SOCKETS),y) 
+obj-$(CONFIG_BPF_SYSCALL) += xskmap.o +endif obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) ifeq ($(CONFIG_INET),y) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 712d8655e916..0d91f18b2eb5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2070,8 +2070,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_redirect_map) goto error; break; - /* Restrict bpf side of cpumap, open when use-cases appear */ + /* Restrict bpf side of cpumap and xskmap, open when use-cases + * appear. + */ case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_XSKMAP: if (func_id != BPF_FUNC_redirect_map) goto error; break; @@ -2118,7 +2121,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_redirect_map: if (map->map_type != BPF_MAP_TYPE_DEVMAP && - map->map_type != BPF_MAP_TYPE_CPUMAP) + map->map_type != BPF_MAP_TYPE_CPUMAP && + map->map_type != BPF_MAP_TYPE_XSKMAP) goto error; break; case BPF_FUNC_sk_redirect_map: diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c new file mode 100644 index 000000000000..869dbb11b612 --- /dev/null +++ b/kernel/bpf/xskmap.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XSKMAP used for AF_XDP sockets + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include +#include +#include +#include +#include + +struct xsk_map { + struct bpf_map map; + struct xdp_sock **xsk_map; + struct list_head __percpu *flush_list; +}; + +static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) +{ + int cpu, err = -EINVAL; + struct xsk_map *m; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || + attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) + return ERR_PTR(-EINVAL); + + m = kzalloc(sizeof(*m), GFP_USER); + if (!m) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&m->map, attr); + + cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); + cost += sizeof(struct list_head) * num_possible_cpus(); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_m; + + m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* Notice returns -EPERM on if map size is larger than memlock limit */ + err = bpf_map_precharge_memlock(m->map.pages); + if (err) + goto free_m; + + m->flush_list = alloc_percpu(struct list_head); + if (!m->flush_list) + goto free_m; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); + + m->xsk_map = bpf_map_area_alloc(m->map.max_entries * + sizeof(struct xdp_sock *), + m->map.numa_node); + if (!m->xsk_map) + goto free_percpu; + return &m->map; + +free_percpu: + free_percpu(m->flush_list); +free_m: + kfree(m); + return ERR_PTR(err); +} + +static void xsk_map_free(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + int i; + + synchronize_net(); + + for (i = 0; i < map->max_entries; i++) { + struct xdp_sock *xs; + + xs = m->xsk_map[i]; + if (!xs) + continue; + + sock_put((struct sock *)xs); + } + + free_percpu(m->flush_list); + bpf_map_area_free(m->xsk_map); + kfree(m); +} + +static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 index = key ? 
*(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= m->map.max_entries) { + *next = 0; + return 0; + } + + if (index == m->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *xs; + + if (key >= map->max_entries) + return NULL; + + xs = READ_ONCE(m->xsk_map[key]); + return xs; +} + +int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + int err; + + err = xsk_rcv(xs, xdp); + if (err) + return err; + + if (!xs->flush_node.prev) + list_add(&xs->flush_node, flush_list); + + return 0; +} + +void __xsk_map_flush(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct xdp_sock *xs, *tmp; + + list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { + xsk_flush(xs); + __list_del(xs->flush_node.prev, xs->flush_node.next); + xs->flush_node.prev = NULL; + } +} + +static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 i = *(u32 *)key, fd = *(u32 *)value; + struct xdp_sock *xs, *old_xs; + struct socket *sock; + int err; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(i >= m->map.max_entries)) + return -E2BIG; + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return err; + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + xs = (struct xdp_sock *)sock->sk; + + if (!xsk_is_setup_for_bpf_map(xs)) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + sock_hold(sock->sk); + + old_xs = xchg(&m->xsk_map[i], xs); + if (old_xs) { + /* Make sure we've flushed everything. */ + synchronize_net(); + sock_put((struct sock *)old_xs); + } + + sockfd_put(sock); + return 0; +} + +static int xsk_map_delete_elem(struct bpf_map *map, void *key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *old_xs; + int k = *(u32 *)key; + + if (k >= map->max_entries) + return -EINVAL; + + old_xs = xchg(&m->xsk_map[k], NULL); + if (old_xs) { + /* Make sure we've flushed everything. 
*/ + synchronize_net(); + sock_put((struct sock *)old_xs); + } + + return 0; +} + +const struct bpf_map_ops xsk_map_ops = { + .map_alloc = xsk_map_alloc, + .map_free = xsk_map_free, + .map_get_next_key = xsk_map_get_next_key, + .map_lookup_elem = xsk_map_lookup_elem, + .map_update_elem = xsk_map_update_elem, + .map_delete_elem = xsk_map_delete_elem, +}; + + diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4e1e6c581e1d..b931a0db5588 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -41,6 +41,11 @@ static struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) +{ + return !!xs->rx; +} + static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 *id, len = xdp->data_end - xdp->data; From 1b1a251c83d29ec18b4a2191cce5c5b7117529ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:29 +0200 Subject: [PATCH 32/66] xsk: wire up XDP_DRV side of AF_XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit wires up the xskmap to XDP_DRV layer. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index d3781daa26ab..40d4bbb4508d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2801,7 +2801,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, { int err; - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: { struct net_device *dev = fwd; struct xdp_frame *xdpf; @@ -2819,14 +2820,25 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, if (err) return err; __dev_map_insert_ctx(map, index); - - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + break; + } + case BPF_MAP_TYPE_CPUMAP: { struct bpf_cpu_map_entry *rcpu = fwd; err = cpu_map_enqueue(rcpu, xdp, dev_rx); if (err) return err; __cpu_map_insert_ctx(map, index); + break; + } + case BPF_MAP_TYPE_XSKMAP: { + struct xdp_sock *xs = fwd; + + err = __xsk_map_redirect(map, xdp, xs); + return err; + } + default: + break; } return 0; } @@ -2845,6 +2857,9 @@ void xdp_do_flush_map(void) case BPF_MAP_TYPE_CPUMAP: __cpu_map_flush(map); break; + case BPF_MAP_TYPE_XSKMAP: + __xsk_map_flush(map); + break; default: break; } @@ -2859,6 +2874,8 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) return __dev_map_lookup_elem(map, index); case BPF_MAP_TYPE_CPUMAP: return __cpu_map_lookup_elem(map, index); + case BPF_MAP_TYPE_XSKMAP: + return __xsk_map_lookup_elem(map, index); default: return NULL; } From 02671e23e7b383763fe1ae4f20b56d8029f9dfc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:30 +0200 Subject: [PATCH 33/66] xsk: wire up XDP_SKB side of AF_XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit wires up the xskmap to XDP_SKB layer. 
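An illustrative loader fragment requesting this generic path explicitly (assuming libbpf's bpf_set_link_xdp_fd() as the attach helper):

#include <linux/if_link.h> /* XDP_FLAGS_SKB_MODE */
#include <bpf/bpf.h>

/* Attach prog_fd to ifindex, forcing the XDP_SKB (generic) path even if
 * the driver has native XDP support.
 */
static int attach_skb_mode(int ifindex, int prog_fd)
{
	return bpf_set_link_xdp_fd(ifindex, prog_fd, XDP_FLAGS_SKB_MODE);
}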
Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- net/core/dev.c | 35 +++++++++++++++++++---------------- net/core/filter.c | 17 ++++++++++++++--- 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 64899c04c1a6..b7f81e3a70cb 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -760,7 +760,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * This does not appear to be a real limitation for existing software. */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *prog); + struct xdp_buff *xdp, struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); diff --git a/net/core/dev.c b/net/core/dev.c index 8f8931b93140..aea36b5a2fed 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3994,12 +3994,12 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) } static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct netdev_rx_queue *rxqueue; void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; - struct xdp_buff xdp; int hlen, off; u32 mac_len; @@ -4034,19 +4034,19 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, */ mac_len = skb->data - skb_mac_header(skb); hlen = skb_headlen(skb) + mac_len; - xdp.data = skb->data - mac_len; - xdp.data_meta = xdp.data; - xdp.data_end = xdp.data + hlen; - xdp.data_hard_start = skb->data - skb_headroom(skb); - orig_data_end = xdp.data_end; - orig_data = xdp.data; + xdp->data = skb->data - mac_len; + xdp->data_meta = xdp->data; + xdp->data_end = xdp->data + hlen; + xdp->data_hard_start = skb->data - skb_headroom(skb); + orig_data_end = xdp->data_end; + orig_data = xdp->data; rxqueue = netif_get_rxqueue(skb); - xdp.rxq = &rxqueue->xdp_rxq; + xdp->rxq = &rxqueue->xdp_rxq; - act = bpf_prog_run_xdp(xdp_prog, &xdp); + act = bpf_prog_run_xdp(xdp_prog, xdp); - off = xdp.data - orig_data; + off = xdp->data - orig_data; if (off > 0) __skb_pull(skb, off); else if (off < 0) @@ -4056,10 +4056,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* check if bpf_xdp_adjust_tail was used. it can only "shrink" * pckt. 
*/ - off = orig_data_end - xdp.data_end; + off = orig_data_end - xdp->data_end; if (off != 0) { - skb_set_tail_pointer(skb, xdp.data_end - xdp.data); + skb_set_tail_pointer(skb, xdp->data_end - xdp->data); skb->len -= off; + } switch (act) { @@ -4068,7 +4069,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_push(skb, mac_len); break; case XDP_PASS: - metalen = xdp.data - xdp.data_meta; + metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); break; @@ -4118,17 +4119,19 @@ static struct static_key generic_xdp_needed __read_mostly; int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) { if (xdp_prog) { - u32 act = netif_receive_generic_xdp(skb, xdp_prog); + struct xdp_buff xdp; + u32 act; int err; + act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: err = xdp_do_generic_redirect(skb->dev, skb, - xdp_prog); + &xdp, xdp_prog); if (err) goto out_redir; - /* fallthru to submit skb */ + break; case XDP_TX: generic_xdp_tx(skb, xdp_prog); break; diff --git a/net/core/filter.c b/net/core/filter.c index 40d4bbb4508d..120bc8a202d9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -59,6 +59,7 @@ #include #include #include +#include <net/xdp_sock.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -2973,13 +2974,14 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; - struct net_device *fwd = NULL; u32 index = ri->ifindex; + void *fwd = NULL; int err = 0; ri->ifindex = 0; @@ -3001,6 +3003,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) goto err; skb->dev = fwd; + generic_xdp_tx(skb, xdp_prog); + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { + struct xdp_sock *xs = fwd; + + err = xsk_generic_rcv(xs, xdp); + if (err) + goto err; + consume_skb(skb); } else { /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; @@ -3015,7 +3025,7 @@ err: } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); u32 index = ri->ifindex; @@ -3023,7 +3033,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, int err = 0; if (ri->map) - return xdp_do_generic_redirect_map(dev, skb, xdp_prog); + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog); ri->ifindex = 0; fwd = dev_get_by_index_rcu(dev_net(dev), index); @@ -3037,6 +3047,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, skb->dev = fwd; _trace_xdp_redirect(dev, xdp_prog, index); + generic_xdp_tx(skb, xdp_prog); return 0; err: _trace_xdp_redirect_err(dev, xdp_prog, index, err); From fe2308328cd2f26ebc986f543796e7d13ae00bc4 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:31 +0200 Subject: [PATCH 34/66] xsk: add umem completion queue support and mmap Here, we add another setsockopt for registered user memory (umem) called XDP_UMEM_COMPLETION_RING. Using this socket option, the process can ask the kernel to allocate a queue (ring buffer) and also mmap it (XDP_UMEM_PGOFF_COMPLETION_RING) into the process.
The queue is used to explicitly pass ownership of umem frames from the kernel to the user process. This will be used by the TX path to tell user space that a certain frame has been transmitted and user space can use it for something else, if it wishes. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/uapi/linux/if_xdp.h | 2 ++ net/xdp/xdp_umem.c | 7 ++++++- net/xdp/xdp_umem.h | 1 + net/xdp/xsk.c | 7 ++++++- 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index e5091881f776..71581a139f26 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -36,6 +36,7 @@ struct sockaddr_xdp { #define XDP_RX_RING 1 #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 +#define XDP_UMEM_COMPLETION_RING 5 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -47,6 +48,7 @@ struct xdp_umem_reg { /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 +#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 struct xdp_desc { __u32 idx; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 9bac1ad570fa..881dfdefe235 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -70,6 +70,11 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->fq = NULL; } + if (umem->cq) { + xskq_destroy(umem->cq); + umem->cq = NULL; + } + if (umem->pgs) { xdp_umem_unpin_pages(umem); @@ -251,5 +256,5 @@ out: bool xdp_umem_validate_queues(struct xdp_umem *umem) { - return umem->fq; + return (umem->fq && umem->cq); } diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index c7378a11721f..7e0b2fab8522 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -24,6 +24,7 @@ struct xdp_umem { struct xsk_queue *fq; + struct xsk_queue *cq; struct page **pgs; struct xdp_umem_props props; u32 npgs; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b931a0db5588..f4a2c5bc6da9 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -255,6 +255,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } else { /* This xsk has its own umem. */ xskq_set_umem(xs->umem->fq, &xs->umem->props); + xskq_set_umem(xs->umem->cq, &xs->umem->props); } /* Rebind? */ @@ -334,6 +335,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return 0; } case XDP_UMEM_FILL_RING: + case XDP_UMEM_COMPLETION_RING: { struct xsk_queue **q; int entries; @@ -345,7 +347,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); - q = &xs->umem->fq; + q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : + &xs->umem->cq; err = xsk_init_queue(entries, q, true); mutex_unlock(&xs->mutex); return err; @@ -375,6 +378,8 @@ static int xsk_mmap(struct file *file, struct socket *sock, if (offset == XDP_UMEM_PGOFF_FILL_RING) q = xs->umem->fq; + else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) + q = xs->umem->cq; else return -EINVAL; } From f61459030ec7fffdaa3c462cc0f728eef11b4d05 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:32 +0200 Subject: [PATCH 35/66] xsk: add Tx queue setup and mmap support Another setsockopt (XDP_TX_RING) is added to let the process allocate a queue through which the user process can pass frames to be transmitted by the kernel. The mmapping of the queue is done using the XDP_PGOFF_TX_RING offset.
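Illustratively, user space would size and map these two rings much like the Rx and fill rings (a sketch with an assumed NDESCS; the sendmsg() kick that drains the Tx ring arrives in a later patch):

#include <linux/if_xdp.h>
#include <sys/mman.h>
#include <sys/socket.h>

#define NDESCS 1024 /* assumption: must be a power of two */

static int setup_tx_rings(int sfd, struct xdp_rxtx_ring **tx,
			  struct xdp_umem_ring **cq)
{
	int ndescs = NDESCS;

	if (setsockopt(sfd, SOL_XDP, XDP_TX_RING, &ndescs,
		       sizeof(ndescs)) ||
	    setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &ndescs,
		       sizeof(ndescs)))
		return -1;

	/* Tx descriptors filled in by the application... */
	*tx = mmap(NULL, sizeof(struct xdp_rxtx_ring) +
		   NDESCS * sizeof(struct xdp_desc),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		   sfd, XDP_PGOFF_TX_RING);
	/* ...and completed frame ids handed back by the kernel. */
	*cq = mmap(NULL, sizeof(struct xdp_umem_ring) +
		   NDESCS * sizeof(__u32),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		   sfd, XDP_UMEM_PGOFF_COMPLETION_RING);

	return (*tx == MAP_FAILED || *cq == MAP_FAILED) ? -1 : 0;
}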
Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 1 + include/uapi/linux/if_xdp.h | 2 ++ net/xdp/xsk.c | 8 ++++++-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index ce3a2ab16b8f..185f4928fbda 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -30,6 +30,7 @@ struct xdp_sock { struct xdp_umem *umem; struct list_head flush_node; u16 queue_id; + struct xsk_queue *tx ____cacheline_aligned_in_smp; /* Protects multiple processes in the control path */ struct mutex mutex; u64 rx_dropped; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 71581a139f26..e2ea878d025c 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -34,6 +34,7 @@ struct sockaddr_xdp { /* XDP socket options */ #define XDP_RX_RING 1 +#define XDP_TX_RING 2 #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 #define XDP_UMEM_COMPLETION_RING 5 @@ -47,6 +48,7 @@ struct xdp_umem_reg { /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 +#define XDP_PGOFF_TX_RING 0x80000000 #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index f4a2c5bc6da9..2d7b0c90d996 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -206,7 +206,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_release; } - if (!xs->rx) { + if (!xs->rx && !xs->tx) { err = -EINVAL; goto out_unlock; } @@ -291,6 +291,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case XDP_RX_RING: + case XDP_TX_RING: { struct xsk_queue **q; int entries; @@ -301,7 +302,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); - q = &xs->rx; + q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; err = xsk_init_queue(entries, q, false); mutex_unlock(&xs->mutex); return err; @@ -372,6 +373,8 @@ static int xsk_mmap(struct file *file, struct socket *sock, if (offset == XDP_PGOFF_RX_RING) { q = xs->rx; + } else if (offset == XDP_PGOFF_TX_RING) { + q = xs->tx; } else { if (!xs->umem) return -EINVAL; @@ -431,6 +434,7 @@ static void xsk_destruct(struct sock *sk) return; xskq_destroy(xs->rx); + xskq_destroy(xs->tx); xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); From 865b03f21162e4edfda51fc08693c864b1d4fdaf Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:33 +0200 Subject: [PATCH 36/66] dev: packet: make packet_direct_xmit a common function The new dev_direct_xmit will be used by AF_XDP in later commits. 
Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 1 + net/core/dev.c | 38 +++++++++++++++++++++++++++++++++++ net/packet/af_packet.c | 42 +++++---------------------------------- 3 files changed, 44 insertions(+), 37 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 366c32891158..a30435118530 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2486,6 +2486,7 @@ void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); int dev_queue_xmit(struct sk_buff *skb); int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv); +int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); diff --git a/net/core/dev.c b/net/core/dev.c index aea36b5a2fed..d3fdc86516e8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3625,6 +3625,44 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) } EXPORT_SYMBOL(dev_queue_xmit_accel); +int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +{ + struct net_device *dev = skb->dev; + struct sk_buff *orig_skb = skb; + struct netdev_queue *txq; + int ret = NETDEV_TX_BUSY; + bool again = false; + + if (unlikely(!netif_running(dev) || + !netif_carrier_ok(dev))) + goto drop; + + skb = validate_xmit_skb_list(skb, dev, &again); + if (skb != orig_skb) + goto drop; + + skb_set_queue_mapping(skb, queue_id); + txq = skb_get_tx_queue(dev, skb); + + local_bh_disable(); + + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_drv_stopped(txq)) + ret = netdev_start_xmit(skb, dev, txq, false); + HARD_TX_UNLOCK(dev, txq); + + local_bh_enable(); + + if (!dev_xmit_complete(ret)) + kfree_skb(skb); + + return ret; +drop: + atomic_long_inc(&dev->tx_dropped); + kfree_skb_list(skb); + return NET_XMIT_DROP; +} +EXPORT_SYMBOL(dev_direct_xmit); /************************************************************************* * Receiver routines diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 01f3515cada0..611a26d5235c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -209,7 +209,7 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *, static void prb_fill_vlan_info(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void packet_flush_mclist(struct sock *sk); -static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb); +static u16 packet_pick_tx_queue(struct sk_buff *skb); struct packet_skb_cb { union { @@ -243,40 +243,7 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po); static int packet_direct_xmit(struct sk_buff *skb) { - struct net_device *dev = skb->dev; - struct sk_buff *orig_skb = skb; - struct netdev_queue *txq; - int ret = NETDEV_TX_BUSY; - bool again = false; - - if (unlikely(!netif_running(dev) || - !netif_carrier_ok(dev))) - goto drop; - - skb = validate_xmit_skb_list(skb, dev, &again); - if (skb != orig_skb) - goto drop; - - packet_pick_tx_queue(dev, skb); - txq = skb_get_tx_queue(dev, skb); - - local_bh_disable(); - - HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_drv_stopped(txq)) - ret = netdev_start_xmit(skb, dev, txq, false); - HARD_TX_UNLOCK(dev, txq); - - local_bh_enable(); - - if (!dev_xmit_complete(ret)) - kfree_skb(skb); - - return ret; -drop: - atomic_long_inc(&dev->tx_dropped); - 
kfree_skb_list(skb); - return NET_XMIT_DROP; + return dev_direct_xmit(skb, packet_pick_tx_queue(skb)); } static struct net_device *packet_cached_dev_get(struct packet_sock *po) @@ -313,8 +280,9 @@ static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; } -static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) +static u16 packet_pick_tx_queue(struct sk_buff *skb) { + struct net_device *dev = skb->dev; const struct net_device_ops *ops = dev->netdev_ops; u16 queue_index; @@ -326,7 +294,7 @@ static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) queue_index = __packet_pick_tx_queue(dev, skb); } - skb_set_queue_mapping(skb, queue_index); + return queue_index; } /* __register_prot_hook must be invoked through register_prot_hook From 35fcde7f8deb51b707b161bf19cbd22363aef2df Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:34 +0200 Subject: [PATCH 37/66] xsk: support for Tx Here, Tx support is added. The user fills the Tx queue with frames to be sent by the kernel, and lets the kernel know using the sendmsg syscall. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 111 ++++++++++++++++++++++++++++++++++++++++++-- net/xdp/xsk_queue.h | 93 ++++++++++++++++++++++++++++++++++++- 2 files changed, 200 insertions(+), 4 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 2d7b0c90d996..b33c535c7996 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -36,6 +36,8 @@ #include "xsk_queue.h" #include "xdp_umem.h" +#define TX_BATCH_SIZE 16 + static struct xdp_sock *xdp_sk(struct sock *sk) { return (struct xdp_sock *)sk; @@ -101,6 +103,108 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return err; } +static void xsk_destruct_skb(struct sk_buff *skb) +{ + u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg; + struct xdp_sock *xs = xdp_sk(skb->sk); + + WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id)); + + sock_wfree(skb); +} + +static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, + size_t total_len) +{ + bool need_wait = !(m->msg_flags & MSG_DONTWAIT); + u32 max_batch = TX_BATCH_SIZE; + struct xdp_sock *xs = xdp_sk(sk); + bool sent_frame = false; + struct xdp_desc desc; + struct sk_buff *skb; + int err = 0; + + if (unlikely(!xs->tx)) + return -ENOBUFS; + if (need_wait) + return -EOPNOTSUPP; + + mutex_lock(&xs->mutex); + + while (xskq_peek_desc(xs->tx, &desc)) { + char *buffer; + u32 id, len; + + if (max_batch-- == 0) { + err = -EAGAIN; + goto out; + } + + if (xskq_reserve_id(xs->umem->cq)) { + err = -EAGAIN; + goto out; + } + + len = desc.len; + if (unlikely(len > xs->dev->mtu)) { + err = -EMSGSIZE; + goto out; + } + + skb = sock_alloc_send_skb(sk, len, !need_wait, &err); + if (unlikely(!skb)) { + err = -EAGAIN; + goto out; + } + + skb_put(skb, len); + id = desc.idx; + buffer = xdp_umem_get_data(xs->umem, id) + desc.offset; + err = skb_store_bits(skb, 0, buffer, len); + if (unlikely(err)) { + kfree_skb(skb); + goto out; + } + + skb->dev = xs->dev; + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + skb_shinfo(skb)->destructor_arg = (void *)(long)id; + skb->destructor = xsk_destruct_skb; + + err = dev_direct_xmit(skb, xs->queue_id); + /* Ignore NET_XMIT_CN as packet might have been sent */ + if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { + err = -EAGAIN; + /* SKB consumed by dev_direct_xmit() */ + goto out; + } + + sent_frame = true; + xskq_discard_desc(xs->tx); + } + 
+out: + if (sent_frame) + sk->sk_write_space(sk); + + mutex_unlock(&xs->mutex); + return err; +} + +static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + + if (unlikely(!xs->dev)) + return -ENXIO; + if (unlikely(!(xs->dev->flags & IFF_UP))) + return -ENETDOWN; + + return xsk_generic_xmit(sk, m, total_len); +} + static unsigned int xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { @@ -110,6 +214,8 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock, if (xs->rx && !xskq_empty_desc(xs->rx)) mask |= POLLIN | POLLRDNORM; + if (xs->tx && !xskq_full_desc(xs->tx)) + mask |= POLLOUT | POLLWRNORM; return mask; } @@ -270,6 +376,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->queue_id = sxdp->sxdp_queue_id; xskq_set_umem(xs->rx, &xs->umem->props); + xskq_set_umem(xs->tx, &xs->umem->props); out_unlock: if (err) @@ -383,8 +490,6 @@ static int xsk_mmap(struct file *file, struct socket *sock, q = xs->umem->fq; else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) q = xs->umem->cq; - else - return -EINVAL; } if (!q) @@ -420,7 +525,7 @@ static const struct proto_ops xsk_proto_ops = { .shutdown = sock_no_shutdown, .setsockopt = xsk_setsockopt, .getsockopt = sock_no_getsockopt, - .sendmsg = sock_no_sendmsg, + .sendmsg = xsk_sendmsg, .recvmsg = sock_no_recvmsg, .mmap = xsk_mmap, .sendpage = sock_no_sendpage, diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 0a9b92b4f93a..3497e8808608 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -111,7 +111,93 @@ static inline void xskq_discard_id(struct xsk_queue *q) (void)xskq_validate_id(q); } -/* Rx queue */ +static inline int xskq_produce_id(struct xsk_queue *q, u32 id) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + ring->desc[q->prod_tail++ & q->ring_mask] = id; + + /* Order producer and data */ + smp_wmb(); + + WRITE_ONCE(q->ring->producer, q->prod_tail); + return 0; +} + +static inline int xskq_reserve_id(struct xsk_queue *q) +{ + if (xskq_nb_free(q, q->prod_head, 1) == 0) + return -ENOSPC; + + q->prod_head++; + return 0; +} + +/* Rx/Tx queue */ + +static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d) +{ + u32 buff_len; + + if (unlikely(d->idx >= q->umem_props.nframes)) { + q->invalid_descs++; + return false; + } + + buff_len = q->umem_props.frame_size; + if (unlikely(d->len > buff_len || d->len == 0 || + d->offset > buff_len || d->offset + d->len > buff_len)) { + q->invalid_descs++; + return false; + } + + return true; +} + +static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, + struct xdp_desc *desc) +{ + while (q->cons_tail != q->cons_head) { + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + unsigned int idx = q->cons_tail & q->ring_mask; + + if (xskq_is_valid_desc(q, &ring->desc[idx])) { + if (desc) + *desc = ring->desc[idx]; + return desc; + } + + q->cons_tail++; + } + + return NULL; +} + +static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, + struct xdp_desc *desc) +{ + struct xdp_rxtx_ring *ring; + + if (q->cons_tail == q->cons_head) { + WRITE_ONCE(q->ring->consumer, q->cons_tail); + q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + + /* Order consumer and data */ + smp_rmb(); + + return xskq_validate_desc(q, desc); + } + + ring = (struct xdp_rxtx_ring *)q->ring; + *desc = ring->desc[q->cons_tail & q->ring_mask]; + return desc; +} + +static 
inline void xskq_discard_desc(struct xsk_queue *q) +{ + q->cons_tail++; + (void)xskq_validate_desc(q, NULL); +} static inline int xskq_produce_batch_desc(struct xsk_queue *q, u32 id, u32 len, u16 offset) @@ -139,6 +225,11 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) WRITE_ONCE(q->ring->producer, q->prod_tail); } +static inline bool xskq_full_desc(struct xsk_queue *q) +{ + return (xskq_nb_avail(q, q->nentries) == q->nentries); +} + static inline bool xskq_empty_desc(struct xsk_queue *q) { return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries); From af75d9e02d08dc55ce6a1e42e485465c630d7349 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:35 +0200 Subject: [PATCH 38/66] xsk: statistics support In this commit, a new getsockopt is added: XDP_STATISTICS. This is used to obtain stats from the sockets. v2: getsockopt now returns size of stats structure. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/uapi/linux/if_xdp.h | 7 ++++++ net/xdp/xsk.c | 45 ++++++++++++++++++++++++++++++++++++- net/xdp/xsk_queue.h | 5 +++++ 3 files changed, 56 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index e2ea878d025c..77b88c4efe98 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -38,6 +38,7 @@ struct sockaddr_xdp { #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 #define XDP_UMEM_COMPLETION_RING 5 +#define XDP_STATISTICS 6 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -46,6 +47,12 @@ struct xdp_umem_reg { __u32 frame_headroom; /* Frame head room */ }; +struct xdp_statistics { + __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ +}; + /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_PGOFF_TX_RING 0x80000000 diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b33c535c7996..009c5af5bba5 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -468,6 +468,49 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -ENOPROTOOPT; } +static int xsk_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + int len; + + if (level != SOL_XDP) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + case XDP_STATISTICS: + { + struct xdp_statistics stats; + + if (len < sizeof(stats)) + return -EINVAL; + + mutex_lock(&xs->mutex); + stats.rx_dropped = xs->rx_dropped; + stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); + stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); + mutex_unlock(&xs->mutex); + + if (copy_to_user(optval, &stats, sizeof(stats))) + return -EFAULT; + if (put_user(sizeof(stats), optlen)) + return -EFAULT; + + return 0; + } + default: + break; + } + + return -EOPNOTSUPP; +} + static int xsk_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { @@ -524,7 +567,7 @@ static const struct proto_ops xsk_proto_ops = { .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = xsk_setsockopt, - .getsockopt = sock_no_getsockopt, + .getsockopt = xsk_getsockopt, .sendmsg = xsk_sendmsg, .recvmsg = sock_no_recvmsg, .mmap = xsk_mmap, diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 3497e8808608..7aa9a535db0e 100644 --- 
a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -36,6 +36,11 @@ struct xsk_queue {
 
 /* Common functions operating for both RXTX and umem queues */
 
+static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
+{
+	return q ? q->invalid_descs : 0;
+}
+
 static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
 {
 	u32 entries = q->prod_tail - q->cons_tail;

From b4b8faa1ded7a3bb34db374c692a51cea29f9080 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson
Date: Wed, 2 May 2018 13:01:36 +0200
Subject: [PATCH 39/66] samples/bpf: sample application and documentation for
 AF_XDP sockets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a sample application for AF_XDP sockets. The application
supports three different modes of operation: rxdrop, txonly and l2fwd.
To showcase a simple round-robin load-balancing between a set of
sockets in an xskmap, set the RR_LB compile time define option to 1 in
"xdpsock.h".

v2: The entries variable was calculated twice in {umem,xq}_nb_avail.

Co-authored-by: Björn Töpel
Signed-off-by: Björn Töpel
Signed-off-by: Magnus Karlsson
Signed-off-by: Alexei Starovoitov
---
 Documentation/networking/af_xdp.rst | 297 +++++++++
 Documentation/networking/index.rst  |   1 +
 samples/bpf/Makefile                |   4 +
 samples/bpf/xdpsock.h               |  11 +
 samples/bpf/xdpsock_kern.c          |  56 ++
 samples/bpf/xdpsock_user.c          | 948 ++++++++++++++++++++++++++++
 6 files changed, 1317 insertions(+)
 create mode 100644 Documentation/networking/af_xdp.rst
 create mode 100644 samples/bpf/xdpsock.h
 create mode 100644 samples/bpf/xdpsock_kern.c
 create mode 100644 samples/bpf/xdpsock_user.c

diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
new file mode 100644
index 000000000000..91928d9ee4bf
--- /dev/null
+++ b/Documentation/networking/af_xdp.rst
@@ -0,0 +1,297 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======
+AF_XDP
+======
+
+Overview
+========
+
+AF_XDP is an address family that is optimized for high performance
+packet processing.
+
+This document assumes that the reader is familiar with BPF and XDP. If
+not, the Cilium project has an excellent reference guide at
+http://cilium.readthedocs.io/en/doc-1.0/bpf/.
+
+Using the XDP_REDIRECT action from an XDP program, the program can
+redirect ingress frames to other XDP-enabled netdevs, using the
+bpf_redirect_map() function. AF_XDP sockets make it possible for XDP
+programs to redirect frames to a memory buffer in a user-space
+application.
+
+An AF_XDP socket (XSK) is created with the normal socket()
+syscall. Associated with each XSK are two rings: the RX ring and the
+TX ring. A socket can receive packets on the RX ring and it can send
+packets on the TX ring. These rings are registered and sized with the
+setsockopts XDP_RX_RING and XDP_TX_RING, respectively. It is mandatory
+to have at least one of these rings for each socket. An RX or TX
+descriptor ring points to a data buffer in a memory area called a
+UMEM. RX and TX can share the same UMEM so that a packet does not have
+to be copied between RX and TX. Moreover, if a packet needs to be kept
+for a while due to a possible retransmit, the descriptor that points
+to that packet can be changed to point to another and reused right
+away. This again avoids copying data.
+
+The UMEM consists of a number of equally sized frames and each frame
+has a unique frame id. A descriptor in one of the rings references a
+frame by referencing its frame id.
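+
+As a rough editor's sketch (not part of the ABI description),
+resolving a descriptor to a packet pointer is plain arithmetic on the
+frame id and offset; "umem_area" and "frame_size" stand for the
+application's own UMEM base pointer and configured frame size::
+
+    /* Sketch only: umem_area and frame_size are application-defined,
+     * idx and offset come from the descriptor.
+     */
+    static inline void *frame_address(void *umem_area, __u64 frame_size,
+                                      __u32 idx, __u32 offset)
+    {
+            return (char *)umem_area + idx * frame_size + offset;
+    }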
+
+The user space allocates memory for this UMEM using whatever means it
+feels is most appropriate (malloc, mmap, huge pages, etc). This memory
+area is then registered with the kernel using the new setsockopt
+XDP_UMEM_REG. The UMEM also has two rings: the FILL ring and the
+COMPLETION ring. The fill ring is used by the application to send down
+frame ids for the kernel to fill in with RX packet data. References to
+these frames will then appear in the RX ring once each packet has been
+received. The completion ring, on the other hand, contains frame ids
+that the kernel has transmitted completely and that can now be used
+again by user space, for either TX or RX. Thus, the frame ids appearing
+in the completion ring are ids that were previously transmitted using
+the TX ring. In summary, the RX and FILL rings are used for the RX path
+and the TX and COMPLETION rings are used for the TX path.
+
+The socket is then finally bound with a bind() call to a device and a
+specific queue id on that device, and it is not until bind is
+completed that traffic starts to flow.
+
+The UMEM can be shared between processes, if desired. If a process
+wants to do this, it simply skips the registration of the UMEM and its
+corresponding two rings, sets the XDP_SHARED_UMEM flag in the bind
+call and submits the XSK of the process it would like to share UMEM
+with as well as its own newly created XSK socket. The new process will
+then receive frame id references in its own RX ring that point to this
+shared UMEM. Note that since the ring structures are single-consumer /
+single-producer (for performance reasons), the new process has to
+create its own socket with associated RX and TX rings, since it cannot
+share this with the other process. This is also the reason that there
+is only one set of FILL and COMPLETION rings per UMEM. It is the
+responsibility of a single process to handle the UMEM.
+
+How are packets then distributed from an XDP program to the XSKs?
+There is a BPF map called XSKMAP (or BPF_MAP_TYPE_XSKMAP in full). The
+user-space application can place an XSK at an arbitrary place in this
+map. The XDP program can then redirect a packet to a specific index in
+this map and at this point XDP validates that the XSK in that map was
+indeed bound to that device and ring number. If not, the packet is
+dropped. If the map is empty at that index, the packet is also
+dropped. This also means that it is currently mandatory to have an XDP
+program loaded (and one XSK in the XSKMAP) to be able to get any
+traffic to user space through the XSK.
+
+AF_XDP can operate in two different modes: XDP_SKB and XDP_DRV. If the
+driver does not have support for XDP, or if XDP_SKB is explicitly
+chosen when loading the XDP program, XDP_SKB mode is employed; it uses
+SKBs together with the generic XDP support and copies the data out to
+user space. This is a fallback mode that works for any network device.
+On the other hand, if the driver has support for XDP, it will be used
+by the AF_XDP code to provide better performance, but there is still a
+copy of the data into user space.
+
+Concepts
+========
+
+In order to use an AF_XDP socket, a number of associated objects need
+to be set up.
+
+Jonathan Corbet has also written an excellent article on LWN,
+"Accelerating networking with AF_XDP". It can be found at
+https://lwn.net/Articles/750845/.
+
+UMEM
+----
+
+UMEM is a region of virtually contiguous memory, divided into
+equal-sized frames. A UMEM is associated with a netdev and a specific
+queue id of that netdev. It is created and configured (frame size,
+frame headroom, start address and size) by using the XDP_UMEM_REG
+setsockopt system call. A UMEM is bound to a netdev and queue id via
+the bind() system call.
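+
+A minimal registration sketch follows, mirroring what the sample
+application does; NUM_FRAMES, FRAME_SIZE and FRAME_HEADROOM are
+application-chosen constants, "sfd" is the AF_XDP socket descriptor,
+and error handling is elided::
+
+    struct xdp_umem_reg mr;
+    void *bufs;
+
+    /* Application-owned, page-aligned packet memory. */
+    posix_memalign(&bufs, getpagesize(), NUM_FRAMES * FRAME_SIZE);
+
+    mr.addr = (__u64)bufs;
+    mr.len = NUM_FRAMES * FRAME_SIZE;
+    mr.frame_size = FRAME_SIZE;
+    mr.frame_headroom = FRAME_HEADROOM;
+    setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));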
+
+An AF_XDP socket is linked to a single UMEM, but one UMEM can have
+multiple AF_XDP sockets. To share a UMEM created via one socket A,
+the next socket B can do this by setting the XDP_SHARED_UMEM flag in
+struct sockaddr_xdp member sxdp_flags, and passing the file descriptor
+of A to struct sockaddr_xdp member sxdp_shared_umem_fd.
+
+The UMEM has two single-producer/single-consumer rings that are used
+to transfer ownership of UMEM frames between the kernel and the
+user-space application.
+
+Rings
+-----
+
+There are four different kinds of rings: Fill, Completion, RX and
+TX. All rings are single-producer/single-consumer, so the user-space
+application needs explicit synchronization if multiple
+processes/threads are reading/writing to them.
+
+The UMEM uses two rings: Fill and Completion. Each socket associated
+with the UMEM must have an RX queue, TX queue or both. Say that there
+is a setup with four sockets (all doing TX and RX). Then there will be
+one Fill ring, one Completion ring, four TX rings and four RX rings.
+
+The rings are head(producer)/tail(consumer) based rings. A producer
+writes the data ring at the index pointed out by the struct xdp_ring
+producer member, and then increases the producer index. A consumer
+reads the data ring at the index pointed out by the struct xdp_ring
+consumer member, and then increases the consumer index.
+
+The rings are configured and created via the _RING setsockopt system
+calls and mmapped to user-space using the appropriate offset to mmap()
+(XDP_PGOFF_RX_RING, XDP_PGOFF_TX_RING, XDP_UMEM_PGOFF_FILL_RING and
+XDP_UMEM_PGOFF_COMPLETION_RING).
+
+The size of the rings needs to be a power of two.
+
+UMEM Fill Ring
+~~~~~~~~~~~~~~
+
+The Fill ring is used to transfer ownership of UMEM frames from
+user-space to kernel-space. The UMEM indices are passed in the
+ring. As an example, if the UMEM is 64k and each frame is 4k, then the
+UMEM has 16 frames and can pass indices between 0 and 15.
+
+Frames passed to the kernel are used for the ingress path (RX rings).
+
+The user application produces UMEM indices to this ring.
+
+UMEM Completion Ring
+~~~~~~~~~~~~~~~~~~~~
+
+The Completion Ring is used to transfer ownership of UMEM frames from
+kernel-space to user-space. Just like the Fill ring, UMEM indices are
+used.
+
+Frames passed from the kernel to user-space are frames that have been
+sent (TX ring) and can be used by user-space again.
+
+The user application consumes UMEM indices from this ring.
+
+
+RX Ring
+~~~~~~~
+
+The RX ring is the receiving side of a socket. Each entry in the ring
+is a struct xdp_desc descriptor. The descriptor contains the UMEM
+index (idx), the length of the data (len) and the offset into the
+frame (offset).
+
+If no frames have been passed to the kernel via the Fill ring, no
+descriptors will (or can) appear on the RX ring.
+
+The user application consumes struct xdp_desc descriptors from this
+ring.
+
+TX Ring
+~~~~~~~
+
+The TX ring is used to send frames. The struct xdp_desc descriptor is
+filled (index, length and offset) and passed into the ring.
+
+To start the transfer a sendmsg() system call is required. This might
+be relaxed in the future.
+
+The user application produces struct xdp_desc descriptors to this
+ring.
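+
+As a sketch, a transmit then boils down to producing a descriptor to
+the TX ring (see the naive enqueue example in the Usage section below)
+and kicking the kernel with a zero-length send, just like the sample
+application's kick_tx() does; "frame_id", "pkt_len" and "xsk_fd" are
+placeholder names::
+
+    struct xdp_desc d = { .idx = frame_id, .len = pkt_len, .offset = 0 };
+
+    /* enqueue_one(tx_ring, &d); -- naive producer step, then kick: */
+    sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);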
+
+XSKMAP / BPF_MAP_TYPE_XSKMAP
+----------------------------
+
+On the XDP side there is a BPF map type BPF_MAP_TYPE_XSKMAP (XSKMAP)
+that is used in conjunction with bpf_redirect_map() to pass the
+ingress frame to a socket.
+
+The user application inserts the socket into the map, via the bpf()
+system call.
+
+Note that if an XDP program tries to redirect to a socket that does
+not match the queue configuration and netdev, the frame will be
+dropped. E.g. an AF_XDP socket is bound to netdev eth0 and
+queue 17. Only the XDP program executing for eth0 and queue 17 will
+successfully pass data to the socket. Please refer to the sample
+application (samples/bpf/) for an example.
+
+Usage
+=====
+
+In order to use AF_XDP sockets, two parts are needed: the user-space
+application and the XDP program. For a complete setup and usage
+example, please refer to the sample application. The user-space side
+is xdpsock_user.c and the XDP side is xdpsock_kern.c.
+
+Naive ring dequeue and enqueue could look like this::
+
+    // typedef struct xdp_rxtx_ring RING;
+    // typedef struct xdp_umem_ring RING;
+
+    // typedef struct xdp_desc RING_TYPE;
+    // typedef __u32 RING_TYPE;
+
+    int dequeue_one(RING *ring, RING_TYPE *item)
+    {
+        __u32 entries = ring->ptrs.producer - ring->ptrs.consumer;
+
+        if (entries == 0)
+            return -1;
+
+        // read-barrier!
+
+        *item = ring->desc[ring->ptrs.consumer & (RING_SIZE - 1)];
+        ring->ptrs.consumer++;
+        return 0;
+    }
+
+    int enqueue_one(RING *ring, const RING_TYPE *item)
+    {
+        __u32 free_entries = RING_SIZE - (ring->ptrs.producer - ring->ptrs.consumer);
+
+        if (free_entries == 0)
+            return -1;
+
+        ring->desc[ring->ptrs.producer & (RING_SIZE - 1)] = *item;
+
+        // write-barrier!
+
+        ring->ptrs.producer++;
+        return 0;
+    }
+
+
+For a more optimized version, please refer to the sample application.
+
+Sample application
+==================
+
+There is an xdpsock benchmarking/test application included that
+demonstrates how to use AF_XDP sockets with both private and shared
+UMEMs. Say that you would like your UDP traffic from port 4242 to end
+up in queue 16, on which we will enable AF_XDP. Here, we use ethtool
+for this::
+
+      ethtool -N p3p2 rx-flow-hash udp4 fn
+      ethtool -N p3p2 flow-type udp4 src-port 4242 dst-port 4242 \
+          action 16
+
+Running the rxdrop benchmark in XDP_DRV mode can then be done
+using::
+
+      samples/bpf/xdpsock -i p3p2 -q 16 -r -N
+
+For XDP_SKB mode, use the switch "-S" instead of "-N" and all options
+can be displayed with "-h", as usual.
+
+Credits
+=======
+
+- Björn Töpel (AF_XDP core)
+- Magnus Karlsson (AF_XDP core)
+- Alexander Duyck
+- Alexei Starovoitov
+- Daniel Borkmann
+- Jesper Dangaard Brouer
+- John Fastabend
+- Jonathan Corbet (LWN coverage)
+- Michael S. Tsirkin
+- Qi Z Zhang
+- Willem de Bruijn
+
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index f204eaff657d..cbd9bdd4a79e 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -6,6 +6,7 @@ Contents:
 
 ..
toctree:: :maxdepth: 2 + af_xdp batman-adv can dpaa2/index diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 5e31770ac087..8e0c7fb6d7cc 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -45,6 +45,7 @@ hostprogs-y += xdp_rxq_info hostprogs-y += syscall_tp hostprogs-y += cpustat hostprogs-y += xdp_adjust_tail +hostprogs-y += xdpsock # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o @@ -98,6 +99,7 @@ xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o xdp_adjust_tail-objs := bpf_load.o $(LIBBPF) xdp_adjust_tail_user.o +xdpsock-objs := bpf_load.o $(LIBBPF) xdpsock_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -151,6 +153,7 @@ always += xdp2skb_meta_kern.o always += syscall_tp_kern.o always += cpustat_kern.o always += xdp_adjust_tail_kern.o +always += xdpsock_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -197,6 +200,7 @@ HOSTLOADLIBES_xdp_rxq_info += -lelf HOSTLOADLIBES_syscall_tp += -lelf HOSTLOADLIBES_cpustat += -lelf HOSTLOADLIBES_xdp_adjust_tail += -lelf +HOSTLOADLIBES_xdpsock += -lelf -pthread # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h new file mode 100644 index 000000000000..533ab81adfa1 --- /dev/null +++ b/samples/bpf/xdpsock.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef XDPSOCK_H_ +#define XDPSOCK_H_ + +/* Power-of-2 number of sockets */ +#define MAX_SOCKS 4 + +/* Round-robin receive */ +#define RR_LB 0 + +#endif /* XDPSOCK_H_ */ diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c new file mode 100644 index 000000000000..d8806c41362e --- /dev/null +++ b/samples/bpf/xdpsock_kern.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +#define KBUILD_MODNAME "foo" +#include +#include "bpf_helpers.h" + +#include "xdpsock.h" + +struct bpf_map_def SEC("maps") qidconf_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1, +}; + +struct bpf_map_def SEC("maps") xsks_map = { + .type = BPF_MAP_TYPE_XSKMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 4, +}; + +struct bpf_map_def SEC("maps") rr_map = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(unsigned int), + .max_entries = 1, +}; + +SEC("xdp_sock") +int xdp_sock_prog(struct xdp_md *ctx) +{ + int *qidconf, key = 0, idx; + unsigned int *rr; + + qidconf = bpf_map_lookup_elem(&qidconf_map, &key); + if (!qidconf) + return XDP_ABORTED; + + if (*qidconf != ctx->rx_queue_index) + return XDP_PASS; + +#if RR_LB /* NB! RR_LB is configured in xdpsock.h */ + rr = bpf_map_lookup_elem(&rr_map, &key); + if (!rr) + return XDP_ABORTED; + + *rr = (*rr + 1) & (MAX_SOCKS - 1); + idx = *rr; +#else + idx = 0; +#endif + + return bpf_redirect_map(&xsks_map, idx, 0); +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c new file mode 100644 index 000000000000..4b8a7cf3e63b --- /dev/null +++ b/samples/bpf/xdpsock_user.c @@ -0,0 +1,948 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2017 - 2018 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf_load.h" +#include "bpf_util.h" +#include "libbpf.h" + +#include "xdpsock.h" + +#ifndef SOL_XDP +#define SOL_XDP 283 +#endif + +#ifndef AF_XDP +#define AF_XDP 44 +#endif + +#ifndef PF_XDP +#define PF_XDP AF_XDP +#endif + +#define NUM_FRAMES 131072 +#define FRAME_HEADROOM 0 +#define FRAME_SIZE 2048 +#define NUM_DESCS 1024 +#define BATCH_SIZE 16 + +#define FQ_NUM_DESCS 1024 +#define CQ_NUM_DESCS 1024 + +#define DEBUG_HEXDUMP 0 + +typedef __u32 u32; + +static unsigned long prev_time; + +enum benchmark_type { + BENCH_RXDROP = 0, + BENCH_TXONLY = 1, + BENCH_L2FWD = 2, +}; + +static enum benchmark_type opt_bench = BENCH_RXDROP; +static u32 opt_xdp_flags; +static const char *opt_if = ""; +static int opt_ifindex; +static int opt_queue; +static int opt_poll; +static int opt_shared_packet_buffer; +static int opt_interval = 1; + +struct xdp_umem_uqueue { + u32 cached_prod; + u32 cached_cons; + u32 mask; + u32 size; + struct xdp_umem_ring *ring; +}; + +struct xdp_umem { + char (*frames)[FRAME_SIZE]; + struct xdp_umem_uqueue fq; + struct xdp_umem_uqueue cq; + int fd; +}; + +struct xdp_uqueue { + u32 cached_prod; + u32 cached_cons; + u32 mask; + u32 size; + struct xdp_rxtx_ring *ring; +}; + +struct xdpsock { + struct xdp_uqueue rx; + struct xdp_uqueue tx; + int sfd; + struct xdp_umem *umem; + u32 outstanding_tx; + unsigned long rx_npkts; + unsigned long tx_npkts; + unsigned long prev_rx_npkts; + unsigned long prev_tx_npkts; +}; + +#define MAX_SOCKS 4 +static int num_socks; +struct xdpsock *xsks[MAX_SOCKS]; + +static unsigned long get_nsecs(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000UL + ts.tv_nsec; +} + +static void dump_stats(void); + +#define lassert(expr) \ + do { \ + if (!(expr)) { \ + fprintf(stderr, "%s:%s:%i: Assertion failed: " \ + #expr ": errno: %d/\"%s\"\n", \ + __FILE__, __func__, __LINE__, \ + errno, strerror(errno)); \ + dump_stats(); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define barrier() __asm__ __volatile__("": : :"memory") +#define u_smp_rmb() barrier() +#define u_smp_wmb() barrier() +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +static const char pkt_data[] = + "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00" + "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14" + "\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b" + "\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa"; + +static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb) +{ + u32 free_entries = q->size - (q->cached_prod - q->cached_cons); + + if (free_entries >= nb) + return free_entries; + + /* Refresh the local tail pointer */ + q->cached_cons = q->ring->ptrs.consumer; + + return q->size - (q->cached_prod - q->cached_cons); +} + +static inline 
u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs) +{ + u32 free_entries = q->cached_cons - q->cached_prod; + + if (free_entries >= ndescs) + return free_entries; + + /* Refresh the local tail pointer */ + q->cached_cons = q->ring->ptrs.consumer + q->size; + return q->cached_cons - q->cached_prod; +} + +static inline u32 umem_nb_avail(struct xdp_umem_uqueue *q, u32 nb) +{ + u32 entries = q->cached_prod - q->cached_cons; + + if (entries == 0) { + q->cached_prod = q->ring->ptrs.producer; + entries = q->cached_prod - q->cached_cons; + } + + return (entries > nb) ? nb : entries; +} + +static inline u32 xq_nb_avail(struct xdp_uqueue *q, u32 ndescs) +{ + u32 entries = q->cached_prod - q->cached_cons; + + if (entries == 0) { + q->cached_prod = q->ring->ptrs.producer; + entries = q->cached_prod - q->cached_cons; + } + + return (entries > ndescs) ? ndescs : entries; +} + +static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq, + struct xdp_desc *d, + size_t nb) +{ + u32 i; + + if (umem_nb_free(fq, nb) < nb) + return -ENOSPC; + + for (i = 0; i < nb; i++) { + u32 idx = fq->cached_prod++ & fq->mask; + + fq->ring->desc[idx] = d[i].idx; + } + + u_smp_wmb(); + + fq->ring->ptrs.producer = fq->cached_prod; + + return 0; +} + +static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u32 *d, + size_t nb) +{ + u32 i; + + if (umem_nb_free(fq, nb) < nb) + return -ENOSPC; + + for (i = 0; i < nb; i++) { + u32 idx = fq->cached_prod++ & fq->mask; + + fq->ring->desc[idx] = d[i]; + } + + u_smp_wmb(); + + fq->ring->ptrs.producer = fq->cached_prod; + + return 0; +} + +static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq, + u32 *d, size_t nb) +{ + u32 idx, i, entries = umem_nb_avail(cq, nb); + + u_smp_rmb(); + + for (i = 0; i < entries; i++) { + idx = cq->cached_cons++ & cq->mask; + d[i] = cq->ring->desc[idx]; + } + + if (entries > 0) { + u_smp_wmb(); + + cq->ring->ptrs.consumer = cq->cached_cons; + } + + return entries; +} + +static inline void *xq_get_data(struct xdpsock *xsk, __u32 idx, __u32 off) +{ + lassert(idx < NUM_FRAMES); + return &xsk->umem->frames[idx][off]; +} + +static inline int xq_enq(struct xdp_uqueue *uq, + const struct xdp_desc *descs, + unsigned int ndescs) +{ + struct xdp_rxtx_ring *r = uq->ring; + unsigned int i; + + if (xq_nb_free(uq, ndescs) < ndescs) + return -ENOSPC; + + for (i = 0; i < ndescs; i++) { + u32 idx = uq->cached_prod++ & uq->mask; + + r->desc[idx].idx = descs[i].idx; + r->desc[idx].len = descs[i].len; + r->desc[idx].offset = descs[i].offset; + } + + u_smp_wmb(); + + r->ptrs.producer = uq->cached_prod; + return 0; +} + +static inline int xq_enq_tx_only(struct xdp_uqueue *uq, + __u32 idx, unsigned int ndescs) +{ + struct xdp_rxtx_ring *q = uq->ring; + unsigned int i; + + if (xq_nb_free(uq, ndescs) < ndescs) + return -ENOSPC; + + for (i = 0; i < ndescs; i++) { + u32 idx = uq->cached_prod++ & uq->mask; + + q->desc[idx].idx = idx + i; + q->desc[idx].len = sizeof(pkt_data) - 1; + q->desc[idx].offset = 0; + } + + u_smp_wmb(); + + q->ptrs.producer = uq->cached_prod; + return 0; +} + +static inline int xq_deq(struct xdp_uqueue *uq, + struct xdp_desc *descs, + int ndescs) +{ + struct xdp_rxtx_ring *r = uq->ring; + unsigned int idx; + int i, entries; + + entries = xq_nb_avail(uq, ndescs); + + u_smp_rmb(); + + for (i = 0; i < entries; i++) { + idx = uq->cached_cons++ & uq->mask; + descs[i] = r->desc[idx]; + } + + if (entries > 0) { + u_smp_wmb(); + + r->ptrs.consumer = uq->cached_cons; + } + + return entries; +} + +static void swap_mac_addresses(void 
*data) +{ + struct ether_header *eth = (struct ether_header *)data; + struct ether_addr *src_addr = (struct ether_addr *)ð->ether_shost; + struct ether_addr *dst_addr = (struct ether_addr *)ð->ether_dhost; + struct ether_addr tmp; + + tmp = *src_addr; + *src_addr = *dst_addr; + *dst_addr = tmp; +} + +#if DEBUG_HEXDUMP +static void hex_dump(void *pkt, size_t length, const char *prefix) +{ + int i = 0; + const unsigned char *address = (unsigned char *)pkt; + const unsigned char *line = address; + size_t line_size = 32; + unsigned char c; + + printf("length = %zu\n", length); + printf("%s | ", prefix); + while (length-- > 0) { + printf("%02X ", *address++); + if (!(++i % line_size) || (length == 0 && i % line_size)) { + if (length == 0) { + while (i++ % line_size) + printf("__ "); + } + printf(" | "); /* right close */ + while (line < address) { + c = *line++; + printf("%c", (c < 33 || c == 255) ? 0x2E : c); + } + printf("\n"); + if (length > 0) + printf("%s | ", prefix); + } + } + printf("\n"); +} +#endif + +static size_t gen_eth_frame(char *frame) +{ + memcpy(frame, pkt_data, sizeof(pkt_data) - 1); + return sizeof(pkt_data) - 1; +} + +static struct xdp_umem *xdp_umem_configure(int sfd) +{ + int fq_size = FQ_NUM_DESCS, cq_size = CQ_NUM_DESCS; + struct xdp_umem_reg mr; + struct xdp_umem *umem; + void *bufs; + + umem = calloc(1, sizeof(*umem)); + lassert(umem); + + lassert(posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */ + NUM_FRAMES * FRAME_SIZE) == 0); + + mr.addr = (__u64)bufs; + mr.len = NUM_FRAMES * FRAME_SIZE; + mr.frame_size = FRAME_SIZE; + mr.frame_headroom = FRAME_HEADROOM; + + lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) == 0); + lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size, + sizeof(int)) == 0); + lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size, + sizeof(int)) == 0); + + umem->fq.ring = mmap(0, sizeof(struct xdp_umem_ring) + + FQ_NUM_DESCS * sizeof(u32), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_UMEM_PGOFF_FILL_RING); + lassert(umem->fq.ring != MAP_FAILED); + + umem->fq.mask = FQ_NUM_DESCS - 1; + umem->fq.size = FQ_NUM_DESCS; + + umem->cq.ring = mmap(0, sizeof(struct xdp_umem_ring) + + CQ_NUM_DESCS * sizeof(u32), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_UMEM_PGOFF_COMPLETION_RING); + lassert(umem->cq.ring != MAP_FAILED); + + umem->cq.mask = CQ_NUM_DESCS - 1; + umem->cq.size = CQ_NUM_DESCS; + + umem->frames = (char (*)[FRAME_SIZE])bufs; + umem->fd = sfd; + + if (opt_bench == BENCH_TXONLY) { + int i; + + for (i = 0; i < NUM_FRAMES; i++) + (void)gen_eth_frame(&umem->frames[i][0]); + } + + return umem; +} + +static struct xdpsock *xsk_configure(struct xdp_umem *umem) +{ + struct sockaddr_xdp sxdp = {}; + int sfd, ndescs = NUM_DESCS; + struct xdpsock *xsk; + bool shared = true; + u32 i; + + sfd = socket(PF_XDP, SOCK_RAW, 0); + lassert(sfd >= 0); + + xsk = calloc(1, sizeof(*xsk)); + lassert(xsk); + + xsk->sfd = sfd; + xsk->outstanding_tx = 0; + + if (!umem) { + shared = false; + xsk->umem = xdp_umem_configure(sfd); + } else { + xsk->umem = umem; + } + + lassert(setsockopt(sfd, SOL_XDP, XDP_RX_RING, + &ndescs, sizeof(int)) == 0); + lassert(setsockopt(sfd, SOL_XDP, XDP_TX_RING, + &ndescs, sizeof(int)) == 0); + + /* Rx */ + xsk->rx.ring = mmap(NULL, + sizeof(struct xdp_ring) + + NUM_DESCS * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_PGOFF_RX_RING); + lassert(xsk->rx.ring != MAP_FAILED); + + if (!shared) { + for (i = 0; i < 
NUM_DESCS / 2; i++) + lassert(umem_fill_to_kernel(&xsk->umem->fq, &i, 1) + == 0); + } + + /* Tx */ + xsk->tx.ring = mmap(NULL, + sizeof(struct xdp_ring) + + NUM_DESCS * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_PGOFF_TX_RING); + lassert(xsk->tx.ring != MAP_FAILED); + + xsk->rx.mask = NUM_DESCS - 1; + xsk->rx.size = NUM_DESCS; + + xsk->tx.mask = NUM_DESCS - 1; + xsk->tx.size = NUM_DESCS; + + sxdp.sxdp_family = PF_XDP; + sxdp.sxdp_ifindex = opt_ifindex; + sxdp.sxdp_queue_id = opt_queue; + if (shared) { + sxdp.sxdp_flags = XDP_SHARED_UMEM; + sxdp.sxdp_shared_umem_fd = umem->fd; + } + + lassert(bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)) == 0); + + return xsk; +} + +static void print_benchmark(bool running) +{ + const char *bench_str = "INVALID"; + + if (opt_bench == BENCH_RXDROP) + bench_str = "rxdrop"; + else if (opt_bench == BENCH_TXONLY) + bench_str = "txonly"; + else if (opt_bench == BENCH_L2FWD) + bench_str = "l2fwd"; + + printf("%s:%d %s ", opt_if, opt_queue, bench_str); + if (opt_xdp_flags & XDP_FLAGS_SKB_MODE) + printf("xdp-skb "); + else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE) + printf("xdp-drv "); + else + printf(" "); + + if (opt_poll) + printf("poll() "); + + if (running) { + printf("running..."); + fflush(stdout); + } +} + +static void dump_stats(void) +{ + unsigned long now = get_nsecs(); + long dt = now - prev_time; + int i; + + prev_time = now; + + for (i = 0; i < num_socks; i++) { + char *fmt = "%-15s %'-11.0f %'-11lu\n"; + double rx_pps, tx_pps; + + rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) * + 1000000000. / dt; + tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) * + 1000000000. / dt; + + printf("\n sock%d@", i); + print_benchmark(false); + printf("\n"); + + printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts", + dt / 1000000000.); + printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts); + printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts); + + xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts; + xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts; + } +} + +static void *poller(void *arg) +{ + (void)arg; + for (;;) { + sleep(opt_interval); + dump_stats(); + } + + return NULL; +} + +static void int_exit(int sig) +{ + (void)sig; + dump_stats(); + bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags); + exit(EXIT_SUCCESS); +} + +static struct option long_options[] = { + {"rxdrop", no_argument, 0, 'r'}, + {"txonly", no_argument, 0, 't'}, + {"l2fwd", no_argument, 0, 'l'}, + {"interface", required_argument, 0, 'i'}, + {"queue", required_argument, 0, 'q'}, + {"poll", no_argument, 0, 'p'}, + {"shared-buffer", no_argument, 0, 's'}, + {"xdp-skb", no_argument, 0, 'S'}, + {"xdp-native", no_argument, 0, 'N'}, + {"interval", required_argument, 0, 'n'}, + {0, 0, 0, 0} +}; + +static void usage(const char *prog) +{ + const char *str = + " Usage: %s [OPTIONS]\n" + " Options:\n" + " -r, --rxdrop Discard all incoming packets (default)\n" + " -t, --txonly Only send packets\n" + " -l, --l2fwd MAC swap L2 forwarding\n" + " -i, --interface=n Run on interface n\n" + " -q, --queue=n Use queue n (default 0)\n" + " -p, --poll Use poll syscall\n" + " -s, --shared-buffer Use shared packet buffer\n" + " -S, --xdp-skb=n Use XDP skb-mod\n" + " -N, --xdp-native=n Enfore XDP native mode\n" + " -n, --interval=n Specify statistics update interval (default 1 sec).\n" + "\n"; + fprintf(stderr, str, prog); + exit(EXIT_FAILURE); +} + +static void parse_command_line(int argc, char **argv) +{ + int option_index, c; + + opterr = 0; + + for (;;) { + c = getopt_long(argc, argv, 
"rtli:q:psSNn:", long_options, + &option_index); + if (c == -1) + break; + + switch (c) { + case 'r': + opt_bench = BENCH_RXDROP; + break; + case 't': + opt_bench = BENCH_TXONLY; + break; + case 'l': + opt_bench = BENCH_L2FWD; + break; + case 'i': + opt_if = optarg; + break; + case 'q': + opt_queue = atoi(optarg); + break; + case 's': + opt_shared_packet_buffer = 1; + break; + case 'p': + opt_poll = 1; + break; + case 'S': + opt_xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'N': + opt_xdp_flags |= XDP_FLAGS_DRV_MODE; + break; + case 'n': + opt_interval = atoi(optarg); + break; + default: + usage(basename(argv[0])); + } + } + + opt_ifindex = if_nametoindex(opt_if); + if (!opt_ifindex) { + fprintf(stderr, "ERROR: interface \"%s\" does not exist\n", + opt_if); + usage(basename(argv[0])); + } +} + +static void kick_tx(int fd) +{ + int ret; + + ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0); + if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN) + return; + lassert(0); +} + +static inline void complete_tx_l2fwd(struct xdpsock *xsk) +{ + u32 descs[BATCH_SIZE]; + unsigned int rcvd; + size_t ndescs; + + if (!xsk->outstanding_tx) + return; + + kick_tx(xsk->sfd); + ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE : + xsk->outstanding_tx; + + /* re-add completed Tx buffers */ + rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, ndescs); + if (rcvd > 0) { + umem_fill_to_kernel(&xsk->umem->fq, descs, rcvd); + xsk->outstanding_tx -= rcvd; + xsk->tx_npkts += rcvd; + } +} + +static inline void complete_tx_only(struct xdpsock *xsk) +{ + u32 descs[BATCH_SIZE]; + unsigned int rcvd; + + if (!xsk->outstanding_tx) + return; + + kick_tx(xsk->sfd); + + rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, BATCH_SIZE); + if (rcvd > 0) { + xsk->outstanding_tx -= rcvd; + xsk->tx_npkts += rcvd; + } +} + +static void rx_drop(struct xdpsock *xsk) +{ + struct xdp_desc descs[BATCH_SIZE]; + unsigned int rcvd, i; + + rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE); + if (!rcvd) + return; + + for (i = 0; i < rcvd; i++) { + u32 idx = descs[i].idx; + + lassert(idx < NUM_FRAMES); +#if DEBUG_HEXDUMP + char *pkt; + char buf[32]; + + pkt = xq_get_data(xsk, idx, descs[i].offset); + sprintf(buf, "idx=%d", idx); + hex_dump(pkt, descs[i].len, buf); +#endif + } + + xsk->rx_npkts += rcvd; + + umem_fill_to_kernel_ex(&xsk->umem->fq, descs, rcvd); +} + +static void rx_drop_all(void) +{ + struct pollfd fds[MAX_SOCKS + 1]; + int i, ret, timeout, nfds = 1; + + memset(fds, 0, sizeof(fds)); + + for (i = 0; i < num_socks; i++) { + fds[i].fd = xsks[i]->sfd; + fds[i].events = POLLIN; + timeout = 1000; /* 1sn */ + } + + for (;;) { + if (opt_poll) { + ret = poll(fds, nfds, timeout); + if (ret <= 0) + continue; + } + + for (i = 0; i < num_socks; i++) + rx_drop(xsks[i]); + } +} + +static void tx_only(struct xdpsock *xsk) +{ + int timeout, ret, nfds = 1; + struct pollfd fds[nfds + 1]; + unsigned int idx = 0; + + memset(fds, 0, sizeof(fds)); + fds[0].fd = xsk->sfd; + fds[0].events = POLLOUT; + timeout = 1000; /* 1sn */ + + for (;;) { + if (opt_poll) { + ret = poll(fds, nfds, timeout); + if (ret <= 0) + continue; + + if (fds[0].fd != xsk->sfd || + !(fds[0].revents & POLLOUT)) + continue; + } + + if (xq_nb_free(&xsk->tx, BATCH_SIZE) >= BATCH_SIZE) { + lassert(xq_enq_tx_only(&xsk->tx, idx, BATCH_SIZE) == 0); + + xsk->outstanding_tx += BATCH_SIZE; + idx += BATCH_SIZE; + idx %= NUM_FRAMES; + } + + complete_tx_only(xsk); + } +} + +static void l2fwd(struct xdpsock *xsk) +{ + for (;;) { + struct xdp_desc descs[BATCH_SIZE]; + unsigned int rcvd, 
i; + int ret; + + for (;;) { + complete_tx_l2fwd(xsk); + + rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE); + if (rcvd > 0) + break; + } + + for (i = 0; i < rcvd; i++) { + char *pkt = xq_get_data(xsk, descs[i].idx, + descs[i].offset); + + swap_mac_addresses(pkt); +#if DEBUG_HEXDUMP + char buf[32]; + u32 idx = descs[i].idx; + + sprintf(buf, "idx=%d", idx); + hex_dump(pkt, descs[i].len, buf); +#endif + } + + xsk->rx_npkts += rcvd; + + ret = xq_enq(&xsk->tx, descs, rcvd); + lassert(ret == 0); + xsk->outstanding_tx += rcvd; + } +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char xdp_filename[256]; + int i, ret, key = 0; + pthread_t pt; + + parse_command_line(argc, argv); + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(xdp_filename)) { + fprintf(stderr, "ERROR: load_bpf_file %s\n", bpf_log_buf); + exit(EXIT_FAILURE); + } + + if (!prog_fd[0]) { + fprintf(stderr, "ERROR: load_bpf_file: \"%s\"\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd[0], opt_xdp_flags) < 0) { + fprintf(stderr, "ERROR: link set xdp fd failed\n"); + exit(EXIT_FAILURE); + } + + ret = bpf_map_update_elem(map_fd[0], &key, &opt_queue, 0); + if (ret) { + fprintf(stderr, "ERROR: bpf_map_update_elem qidconf\n"); + exit(EXIT_FAILURE); + } + + /* Create sockets... */ + xsks[num_socks++] = xsk_configure(NULL); + +#if RR_LB + for (i = 0; i < MAX_SOCKS - 1; i++) + xsks[num_socks++] = xsk_configure(xsks[0]->umem); +#endif + + /* ...and insert them into the map. */ + for (i = 0; i < num_socks; i++) { + key = i; + ret = bpf_map_update_elem(map_fd[1], &key, &xsks[i]->sfd, 0); + if (ret) { + fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i); + exit(EXIT_FAILURE); + } + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + signal(SIGABRT, int_exit); + + setlocale(LC_ALL, ""); + + ret = pthread_create(&pt, NULL, poller, NULL); + lassert(ret == 0); + + prev_time = get_nsecs(); + + if (opt_bench == BENCH_RXDROP) + rx_drop_all(); + else if (opt_bench == BENCH_TXONLY) + tx_only(xsks[0]); + else + l2fwd(xsks[0]); + + return 0; +} From b390134c2423c557cac844ee3b7cb52dc1cfab02 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:12 +0200 Subject: [PATCH 40/66] bpf: prefix cbpf internal helpers with bpf_ No change in functionality, just remove the '__' prefix and replace it with a 'bpf_' prefix instead. We later on add a couple of more helpers for cBPF and keeping the scheme with '__' is suboptimal there. 
Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: Alexei Starovoitov
---
 net/core/filter.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 120bc8a202d9..c33595a8d604 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -113,12 +113,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
 }
 EXPORT_SYMBOL(sk_filter_trim_cap);
 
-BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
+BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
 {
 	return skb_get_poff(skb);
 }
 
-BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
+BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
 {
 	struct nlattr *nla;
 
@@ -138,7 +138,7 @@ BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
 	return 0;
 }
 
-BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
+BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
 {
 	struct nlattr *nla;
 
@@ -162,13 +162,13 @@ BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
 	return 0;
 }
 
-BPF_CALL_0(__get_raw_cpu_id)
+BPF_CALL_0(bpf_get_raw_cpu_id)
 {
 	return raw_smp_processor_id();
 }
 
 static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
-	.func		= __get_raw_cpu_id,
+	.func		= bpf_get_raw_cpu_id,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 };
@@ -318,16 +318,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 		/* Emit call(arg1=CTX, arg2=A, arg3=X) */
 		switch (fp->k) {
 		case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
-			*insn = BPF_EMIT_CALL(__skb_get_pay_offset);
+			*insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
 			break;
 		case SKF_AD_OFF + SKF_AD_NLATTR:
-			*insn = BPF_EMIT_CALL(__skb_get_nlattr);
+			*insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
 			break;
 		case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
-			*insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
+			*insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
 			break;
 		case SKF_AD_OFF + SKF_AD_CPU:
-			*insn = BPF_EMIT_CALL(__get_raw_cpu_id);
+			*insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
 			break;
 		case SKF_AD_OFF + SKF_AD_RANDOM:
 			*insn = BPF_EMIT_CALL(bpf_user_rnd_u32);

From 93731ef086cee90af594e62874bb98ae6d6eee91 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Fri, 4 May 2018 01:08:13 +0200
Subject: [PATCH 41/66] bpf: migrate ebpf ld_abs/ld_ind tests to test_verifier

Remove all eBPF tests involving LD_ABS/LD_IND from test_bpf.ko. The
reason is that the eBPF tests from the test_bpf module do not go via
the BPF verifier, and therefore any instruction rewrites from the
verifier cannot take place. Therefore, move them into test_verifier,
which runs out of user space, so that the verifier can rewrite
LD_ABS/LD_IND internally in upcoming patches. It will have the same
effect since runtime tests are also performed from there. This also
allows us to finally unexport bpf_skb_vlan_{push,pop}_proto and keep
them internal to the core kernel.

Additionally, also add further cBPF LD_ABS/LD_IND test coverage to
the test_bpf.ko suite.
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 - lib/test_bpf.c | 570 ++++++++++++-------- net/core/filter.c | 6 +- tools/testing/selftests/bpf/test_verifier.c | 266 ++++++++- 4 files changed, 619 insertions(+), 225 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 68ecdb4eea09..d0e3d7ef36a8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -714,8 +714,6 @@ extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; -extern const struct bpf_func_proto bpf_skb_vlan_push_proto; -extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 8e157806df7a..317f231462d4 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -386,116 +386,6 @@ static int bpf_fill_ld_abs_get_processor_id(struct bpf_test *self) return 0; } -#define PUSH_CNT 68 -/* test: {skb->data[0], vlan_push} x 68 + {skb->data[0], vlan_pop} x 68 */ -static int bpf_fill_ld_abs_vlan_push_pop(struct bpf_test *self) -{ - unsigned int len = BPF_MAXINSNS; - struct bpf_insn *insn; - int i = 0, j, k = 0; - - insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL); - if (!insn) - return -ENOMEM; - - insn[i++] = BPF_MOV64_REG(R6, R1); -loop: - for (j = 0; j < PUSH_CNT; j++) { - insn[i++] = BPF_LD_ABS(BPF_B, 0); - insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0x34, len - i - 2); - i++; - insn[i++] = BPF_MOV64_REG(R1, R6); - insn[i++] = BPF_MOV64_IMM(R2, 1); - insn[i++] = BPF_MOV64_IMM(R3, 2); - insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, - bpf_skb_vlan_push_proto.func - __bpf_call_base); - insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0, len - i - 2); - i++; - } - - for (j = 0; j < PUSH_CNT; j++) { - insn[i++] = BPF_LD_ABS(BPF_B, 0); - insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0x34, len - i - 2); - i++; - insn[i++] = BPF_MOV64_REG(R1, R6); - insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, - bpf_skb_vlan_pop_proto.func - __bpf_call_base); - insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0, len - i - 2); - i++; - } - if (++k < 5) - goto loop; - - for (; i < len - 1; i++) - insn[i] = BPF_ALU32_IMM(BPF_MOV, R0, 0xbef); - - insn[len - 1] = BPF_EXIT_INSN(); - - self->u.ptr.insns = insn; - self->u.ptr.len = len; - - return 0; -} - -static int bpf_fill_ld_abs_vlan_push_pop2(struct bpf_test *self) -{ - struct bpf_insn *insn; - - insn = kmalloc_array(16, sizeof(*insn), GFP_KERNEL); - if (!insn) - return -ENOMEM; - - /* Due to func address being non-const, we need to - * assemble this here. 
- */ - insn[0] = BPF_MOV64_REG(R6, R1); - insn[1] = BPF_LD_ABS(BPF_B, 0); - insn[2] = BPF_LD_ABS(BPF_H, 0); - insn[3] = BPF_LD_ABS(BPF_W, 0); - insn[4] = BPF_MOV64_REG(R7, R6); - insn[5] = BPF_MOV64_IMM(R6, 0); - insn[6] = BPF_MOV64_REG(R1, R7); - insn[7] = BPF_MOV64_IMM(R2, 1); - insn[8] = BPF_MOV64_IMM(R3, 2); - insn[9] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, - bpf_skb_vlan_push_proto.func - __bpf_call_base); - insn[10] = BPF_MOV64_REG(R6, R7); - insn[11] = BPF_LD_ABS(BPF_B, 0); - insn[12] = BPF_LD_ABS(BPF_H, 0); - insn[13] = BPF_LD_ABS(BPF_W, 0); - insn[14] = BPF_MOV64_IMM(R0, 42); - insn[15] = BPF_EXIT_INSN(); - - self->u.ptr.insns = insn; - self->u.ptr.len = 16; - - return 0; -} - -static int bpf_fill_jump_around_ld_abs(struct bpf_test *self) -{ - unsigned int len = BPF_MAXINSNS; - struct bpf_insn *insn; - int i = 0; - - insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL); - if (!insn) - return -ENOMEM; - - insn[i++] = BPF_MOV64_REG(R6, R1); - insn[i++] = BPF_LD_ABS(BPF_B, 0); - insn[i] = BPF_JMP_IMM(BPF_JEQ, R0, 10, len - i - 2); - i++; - while (i < len - 1) - insn[i++] = BPF_LD_ABS(BPF_B, 1); - insn[i] = BPF_EXIT_INSN(); - - self->u.ptr.insns = insn; - self->u.ptr.len = len; - - return 0; -} - static int __bpf_fill_stxdw(struct bpf_test *self, int size) { unsigned int len = BPF_MAXINSNS; @@ -1987,40 +1877,6 @@ static struct bpf_test tests[] = { { }, { { 0, -1 } } }, - { - "INT: DIV + ABS", - .u.insns_int = { - BPF_ALU64_REG(BPF_MOV, R6, R1), - BPF_LD_ABS(BPF_B, 3), - BPF_ALU64_IMM(BPF_MOV, R2, 2), - BPF_ALU32_REG(BPF_DIV, R0, R2), - BPF_ALU64_REG(BPF_MOV, R8, R0), - BPF_LD_ABS(BPF_B, 4), - BPF_ALU64_REG(BPF_ADD, R8, R0), - BPF_LD_IND(BPF_B, R8, -70), - BPF_EXIT_INSN(), - }, - INTERNAL, - { 10, 20, 30, 40, 50 }, - { { 4, 0 }, { 5, 10 } } - }, - { - /* This one doesn't go through verifier, but is just raw insn - * as opposed to cBPF tests from here. Thus div by 0 tests are - * done in test_verifier in BPF kselftests. 
- */ - "INT: DIV by -1", - .u.insns_int = { - BPF_ALU64_REG(BPF_MOV, R6, R1), - BPF_ALU64_IMM(BPF_MOV, R7, -1), - BPF_LD_ABS(BPF_B, 3), - BPF_ALU32_REG(BPF_DIV, R0, R7), - BPF_EXIT_INSN(), - }, - INTERNAL, - { 10, 20, 30, 40, 50 }, - { { 3, 0 }, { 4, 0 } } - }, { "check: missing ret", .u.insns = { @@ -2383,50 +2239,6 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } } }, - { - "nmap reduced", - .u.insns_int = { - BPF_MOV64_REG(R6, R1), - BPF_LD_ABS(BPF_H, 12), - BPF_JMP_IMM(BPF_JNE, R0, 0x806, 28), - BPF_LD_ABS(BPF_H, 12), - BPF_JMP_IMM(BPF_JNE, R0, 0x806, 26), - BPF_MOV32_IMM(R0, 18), - BPF_STX_MEM(BPF_W, R10, R0, -64), - BPF_LDX_MEM(BPF_W, R7, R10, -64), - BPF_LD_IND(BPF_W, R7, 14), - BPF_STX_MEM(BPF_W, R10, R0, -60), - BPF_MOV32_IMM(R0, 280971478), - BPF_STX_MEM(BPF_W, R10, R0, -56), - BPF_LDX_MEM(BPF_W, R7, R10, -56), - BPF_LDX_MEM(BPF_W, R0, R10, -60), - BPF_ALU32_REG(BPF_SUB, R0, R7), - BPF_JMP_IMM(BPF_JNE, R0, 0, 15), - BPF_LD_ABS(BPF_H, 12), - BPF_JMP_IMM(BPF_JNE, R0, 0x806, 13), - BPF_MOV32_IMM(R0, 22), - BPF_STX_MEM(BPF_W, R10, R0, -56), - BPF_LDX_MEM(BPF_W, R7, R10, -56), - BPF_LD_IND(BPF_H, R7, 14), - BPF_STX_MEM(BPF_W, R10, R0, -52), - BPF_MOV32_IMM(R0, 17366), - BPF_STX_MEM(BPF_W, R10, R0, -48), - BPF_LDX_MEM(BPF_W, R7, R10, -48), - BPF_LDX_MEM(BPF_W, R0, R10, -52), - BPF_ALU32_REG(BPF_SUB, R0, R7), - BPF_JMP_IMM(BPF_JNE, R0, 0, 2), - BPF_MOV32_IMM(R0, 256), - BPF_EXIT_INSN(), - BPF_MOV32_IMM(R0, 0), - BPF_EXIT_INSN(), - }, - INTERNAL, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x06, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0x10, 0xbf, 0x48, 0xd6, 0x43, 0xd6}, - { { 38, 256 } }, - .stack_depth = 64, - }, /* BPF_ALU | BPF_MOV | BPF_X */ { "ALU_MOV_X: dst = 2", @@ -5485,22 +5297,6 @@ static struct bpf_test tests[] = { { { 1, 0xbee } }, .fill_helper = bpf_fill_ld_abs_get_processor_id, }, - { - "BPF_MAXINSNS: ld_abs+vlan_push/pop", - { }, - INTERNAL, - { 0x34 }, - { { ETH_HLEN, 0xbef } }, - .fill_helper = bpf_fill_ld_abs_vlan_push_pop, - }, - { - "BPF_MAXINSNS: jump around ld_abs", - { }, - INTERNAL, - { 10, 11 }, - { { 2, 10 } }, - .fill_helper = bpf_fill_jump_around_ld_abs, - }, /* * LD_IND / LD_ABS on fragmented SKBs */ @@ -5682,6 +5478,53 @@ static struct bpf_test tests[] = { { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, { {0x40, 0x05 } }, }, + { + "LD_IND byte positive offset, all ff", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3e), + BPF_STMT(BPF_LD | BPF_IND | BPF_B, 0x1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff }, + { {0x40, 0xff } }, + }, + { + "LD_IND byte positive offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3e), + BPF_STMT(BPF_LD | BPF_IND | BPF_B, 0x1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LD_IND byte negative offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3e), + BPF_STMT(BPF_LD | BPF_IND | BPF_B, -0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 } }, + }, + { + "LD_IND byte negative offset, multiple calls", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3b), + BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 1), + BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 2), + BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 3), + BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 4), + BPF_STMT(BPF_RET | BPF_A, 0x0), + 
}, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x82 }, }, + }, { "LD_IND halfword positive offset", .u.insns = { @@ -5730,6 +5573,39 @@ static struct bpf_test tests[] = { }, { {0x40, 0x66cc } }, }, + { + "LD_IND halfword positive offset, all ff", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3d), + BPF_STMT(BPF_LD | BPF_IND | BPF_H, 0x1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff }, + { {0x40, 0xffff } }, + }, + { + "LD_IND halfword positive offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3e), + BPF_STMT(BPF_LD | BPF_IND | BPF_H, 0x1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LD_IND halfword negative offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3e), + BPF_STMT(BPF_LD | BPF_IND | BPF_H, -0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 } }, + }, { "LD_IND word positive offset", .u.insns = { @@ -5820,6 +5696,39 @@ static struct bpf_test tests[] = { }, { {0x40, 0x66cc77dd } }, }, + { + "LD_IND word positive offset, all ff", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3b), + BPF_STMT(BPF_LD | BPF_IND | BPF_W, 0x1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff }, + { {0x40, 0xffffffff } }, + }, + { + "LD_IND word positive offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3e), + BPF_STMT(BPF_LD | BPF_IND | BPF_W, 0x1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LD_IND word negative offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3e), + BPF_STMT(BPF_LD | BPF_IND | BPF_W, -0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 } }, + }, { "LD_ABS byte", .u.insns = { @@ -5837,6 +5746,68 @@ static struct bpf_test tests[] = { }, { {0x40, 0xcc } }, }, + { + "LD_ABS byte positive offset, all ff", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff }, + { {0x40, 0xff } }, + }, + { + "LD_ABS byte positive offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LD_ABS byte negative offset, out of bounds load", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, -1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC | FLAG_EXPECTED_FAIL, + .expected_errcode = -EINVAL, + }, + { + "LD_ABS byte negative offset, in bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x82 }, }, + }, + { + "LD_ABS byte negative offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LD_ABS byte negative offset, multiple calls", + .u.insns = { + BPF_STMT(BPF_LD 
| BPF_ABS | BPF_B, SKF_LL_OFF + 0x3c), + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3d), + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3e), + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x82 }, }, + }, { "LD_ABS halfword", .u.insns = { @@ -5871,6 +5842,55 @@ static struct bpf_test tests[] = { }, { {0x40, 0x99ff } }, }, + { + "LD_ABS halfword positive offset, all ff", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_H, 0x3e), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff }, + { {0x40, 0xffff } }, + }, + { + "LD_ABS halfword positive offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_H, 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LD_ABS halfword negative offset, out of bounds load", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_H, -1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC | FLAG_EXPECTED_FAIL, + .expected_errcode = -EINVAL, + }, + { + "LD_ABS halfword negative offset, in bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_H, SKF_LL_OFF + 0x3e), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x1982 }, }, + }, + { + "LD_ABS halfword negative offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_H, SKF_LL_OFF + 0x3e), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, { "LD_ABS word", .u.insns = { @@ -5939,6 +5959,140 @@ static struct bpf_test tests[] = { }, { {0x40, 0x88ee99ff } }, }, + { + "LD_ABS word positive offset, all ff", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0x3c), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff }, + { {0x40, 0xffffffff } }, + }, + { + "LD_ABS word positive offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LD_ABS word negative offset, out of bounds load", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_W, -1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC | FLAG_EXPECTED_FAIL, + .expected_errcode = -EINVAL, + }, + { + "LD_ABS word negative offset, in bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_W, SKF_LL_OFF + 0x3c), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x25051982 }, }, + }, + { + "LD_ABS word negative offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_W, SKF_LL_OFF + 0x3c), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LDX_MSH standalone, preserved A", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3c), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0xffeebbaa }, }, + }, + { + "LDX_MSH standalone, preserved A 2", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0x175e9d63), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3c), 
+ BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3d), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3e), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x175e9d63 }, }, + }, + { + "LDX_MSH standalone, test result 1", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3c), + BPF_STMT(BPF_MISC | BPF_TXA, 0), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x14 }, }, + }, + { + "LDX_MSH standalone, test result 2", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3e), + BPF_STMT(BPF_MISC | BPF_TXA, 0), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x24 }, }, + }, + { + "LDX_MSH standalone, negative offset", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, -1), + BPF_STMT(BPF_MISC | BPF_TXA, 0), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0 }, }, + }, + { + "LDX_MSH standalone, negative offset 2", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, SKF_LL_OFF + 0x3e), + BPF_STMT(BPF_MISC | BPF_TXA, 0), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x24 }, }, + }, + { + "LDX_MSH standalone, out of bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x40), + BPF_STMT(BPF_MISC | BPF_TXA, 0), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0 }, }, + }, /* * verify that the interpreter or JIT correctly sets A and X * to 0. @@ -6127,14 +6281,6 @@ static struct bpf_test tests[] = { {}, { {0x1, 0x42 } }, }, - { - "LD_ABS with helper changing skb data", - { }, - INTERNAL, - { 0x34 }, - { { ETH_HLEN, 42 } }, - .fill_helper = bpf_fill_ld_abs_vlan_push_pop2, - }, /* Checking interpreter vs JIT wrt signed extended imms. 
*/ { "JNE signed compare, test 1", diff --git a/net/core/filter.c b/net/core/filter.c index c33595a8d604..865500f6180d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2181,7 +2181,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, return ret; } -const struct bpf_func_proto bpf_skb_vlan_push_proto = { +static const struct bpf_func_proto bpf_skb_vlan_push_proto = { .func = bpf_skb_vlan_push, .gpl_only = false, .ret_type = RET_INTEGER, @@ -2189,7 +2189,6 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; -EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { @@ -2203,13 +2202,12 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) return ret; } -const struct bpf_func_proto bpf_skb_vlan_pop_proto = { +static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .func = bpf_skb_vlan_pop, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; -EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 1acafe26498b..275b4570b5b8 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -47,7 +47,7 @@ # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif -#define MAX_INSNS 512 +#define MAX_INSNS BPF_MAXINSNS #define MAX_FIXUPS 8 #define MAX_NR_MAPS 4 #define POINTER_VALUE 0xcafe4all @@ -77,6 +77,8 @@ struct bpf_test { } result, result_unpriv; enum bpf_prog_type prog_type; uint8_t flags; + __u8 data[TEST_DATA_LEN]; + void (*fill_helper)(struct bpf_test *self); }; /* Note we want this to be 64 bit aligned so that the end of our array is @@ -94,6 +96,62 @@ struct other_val { long long bar; }; +static void bpf_fill_ld_abs_vlan_push_pop(struct bpf_test *self) +{ + /* test: {skb->data[0], vlan_push} x 68 + {skb->data[0], vlan_pop} x 68 */ +#define PUSH_CNT 51 + unsigned int len = BPF_MAXINSNS; + struct bpf_insn *insn = self->insns; + int i = 0, j, k = 0; + + insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); +loop: + for (j = 0; j < PUSH_CNT; j++) { + insn[i++] = BPF_LD_ABS(BPF_B, 0); + insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x34, len - i - 2); + i++; + insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + insn[i++] = BPF_MOV64_IMM(BPF_REG_2, 1); + insn[i++] = BPF_MOV64_IMM(BPF_REG_3, 2); + insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_vlan_push), + insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, len - i - 2); + i++; + } + + for (j = 0; j < PUSH_CNT; j++) { + insn[i++] = BPF_LD_ABS(BPF_B, 0); + insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x34, len - i - 2); + i++; + insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_vlan_pop), + insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, len - i - 2); + i++; + } + if (++k < 5) + goto loop; + + for (; i < len - 1; i++) + insn[i] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 0xbef); + insn[len - 1] = BPF_EXIT_INSN(); +} + +static void bpf_fill_jump_around_ld_abs(struct bpf_test *self) +{ + struct bpf_insn *insn = self->insns; + unsigned int len = BPF_MAXINSNS; + int i = 0; + + insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + insn[i++] = BPF_LD_ABS(BPF_B, 0); + insn[i] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 10, len - i - 2); + i++; + while (i < len - 1) + insn[i++] = BPF_LD_ABS(BPF_B, 1); + insn[i] = 
BPF_EXIT_INSN(); +} + static struct bpf_test tests[] = { { "add+sub+mul", @@ -11725,6 +11783,197 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, + { + "ld_abs: invalid op 1", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_DW, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .errstr = "unknown opcode", + }, + { + "ld_abs: invalid op 2", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 256), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LD_IND(BPF_DW, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .errstr = "unknown opcode", + }, + { + "ld_abs: nmap reduced", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_H, 12), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 28), + BPF_LD_ABS(BPF_H, 12), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 26), + BPF_MOV32_IMM(BPF_REG_0, 18), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -64), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -64), + BPF_LD_IND(BPF_W, BPF_REG_7, 14), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -60), + BPF_MOV32_IMM(BPF_REG_0, 280971478), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -56), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -56), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -60), + BPF_ALU32_REG(BPF_SUB, BPF_REG_0, BPF_REG_7), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 15), + BPF_LD_ABS(BPF_H, 12), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 13), + BPF_MOV32_IMM(BPF_REG_0, 22), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -56), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -56), + BPF_LD_IND(BPF_H, BPF_REG_7, 14), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -52), + BPF_MOV32_IMM(BPF_REG_0, 17366), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -48), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -48), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -52), + BPF_ALU32_REG(BPF_SUB, BPF_REG_0, BPF_REG_7), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV32_IMM(BPF_REG_0, 256), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .data = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x06, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0x10, 0xbf, 0x48, 0xd6, 0x43, 0xd6, + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 256, + }, + { + "ld_abs: div + abs, test 1", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_B, 3), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_2, 2), + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_2), + BPF_ALU64_REG(BPF_MOV, BPF_REG_8, BPF_REG_0), + BPF_LD_ABS(BPF_B, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_0), + BPF_LD_IND(BPF_B, BPF_REG_8, -70), + BPF_EXIT_INSN(), + }, + .data = { + 10, 20, 30, 40, 50, + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 10, + }, + { + "ld_abs: div + abs, test 2", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_B, 3), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_2, 2), + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_2), + BPF_ALU64_REG(BPF_MOV, BPF_REG_8, BPF_REG_0), + BPF_LD_ABS(BPF_B, 128), + BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_0), + BPF_LD_IND(BPF_B, BPF_REG_8, -70), + BPF_EXIT_INSN(), + }, + .data = { + 10, 20, 30, 40, 50, + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "ld_abs: div + abs, test 3", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0), + BPF_LD_ABS(BPF_B, 3), + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_7), + 
BPF_EXIT_INSN(), + }, + .data = { + 10, 20, 30, 40, 50, + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "ld_abs: div + abs, test 4", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0), + BPF_LD_ABS(BPF_B, 256), + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_7), + BPF_EXIT_INSN(), + }, + .data = { + 10, 20, 30, 40, 50, + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "ld_abs: vlan + abs, test 1", + .insns = { }, + .data = { + 0x34, + }, + .fill_helper = bpf_fill_ld_abs_vlan_push_pop, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0xbef, + }, + { + "ld_abs: vlan + abs, test 2", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_B, 0), + BPF_LD_ABS(BPF_H, 0), + BPF_LD_ABS(BPF_W, 0), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_6, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_IMM(BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_3, 2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_vlan_push), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_7), + BPF_LD_ABS(BPF_B, 0), + BPF_LD_ABS(BPF_H, 0), + BPF_LD_ABS(BPF_W, 0), + BPF_MOV64_IMM(BPF_REG_0, 42), + BPF_EXIT_INSN(), + }, + .data = { + 0x34, + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 42, + }, + { + "ld_abs: jump around ld_abs", + .insns = { }, + .data = { + 10, 11, + }, + .fill_helper = bpf_fill_jump_around_ld_abs, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 10, + }, }; static int probe_filter_length(const struct bpf_insn *fp) @@ -11828,7 +12077,7 @@ static int create_map_in_map(void) return outer_map_fd; } -static char bpf_vlog[32768]; +static char bpf_vlog[UINT_MAX >> 8]; static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog, int *map_fds) @@ -11839,6 +12088,9 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog, int *fixup_prog = test->fixup_prog; int *fixup_map_in_map = test->fixup_map_in_map; + if (test->fill_helper) + test->fill_helper(test); + /* Allocating HTs with 1 elem is fine here, since we only test * for verifier and not do a runtime lookup, so the only thing * that really matters is value size in this case. @@ -11888,10 +12140,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv, int *passes, int *errors) { int fd_prog, expected_ret, reject_from_alignment; + int prog_len, prog_type = test->prog_type; struct bpf_insn *prog = test->insns; - int prog_len = probe_filter_length(prog); - char data_in[TEST_DATA_LEN] = {}; - int prog_type = test->prog_type; int map_fds[MAX_NR_MAPS]; const char *expected_err; uint32_t retval; @@ -11901,6 +12151,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, map_fds[i] = -1; do_test_fixup(test, prog, map_fds); + prog_len = probe_filter_length(prog); fd_prog = bpf_verify_program(prog_type ? 
: BPF_PROG_TYPE_SOCKET_FILTER,
				 prog, prog_len,
				 test->flags & F_LOAD_WITH_STRICT_ALIGNMENT,
@@ -11940,8 +12191,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 	}
 
 	if (fd_prog >= 0) {
-		err = bpf_prog_test_run(fd_prog, 1, data_in, sizeof(data_in),
-					NULL, NULL, &retval, NULL);
+		err = bpf_prog_test_run(fd_prog, 1, test->data,
+					sizeof(test->data), NULL, NULL,
+					&retval, NULL);
 		if (err && errno != 524/*ENOTSUPP*/ && errno != EPERM) {
 			printf("Unexpected bpf_prog_test_run error\n");
 			goto fail_log;

From e0cea7ce988cf48cc4052235d2ad2550b3bc4fa0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Fri, 4 May 2018 01:08:14 +0200
Subject: [PATCH 42/66] bpf: implement ld_abs/ld_ind in native bpf

The main part of this work is to finally allow removal of LD_ABS and
LD_IND from the BPF core by reimplementing them through native eBPF
instead. Both LD_ABS/LD_IND were carried over from cBPF, and keeping
them around in native eBPF caused way more trouble than they were
actually worth. To list just some of the security issues in the past:

 * fdfaf64e7539 ("x86: bpf_jit: support negative offsets")
 * 35607b02dbef ("sparc: bpf_jit: fix loads from negative offsets")
 * e0ee9c12157d ("x86: bpf_jit: fix two bugs in eBPF JIT compiler")
 * 07aee9439454 ("bpf, sparc: fix usage of wrong reg for load_skb_regs after call")
 * 6d59b7dbf72e ("bpf, s390x: do not reload skb pointers in non-skb context")
 * 87338c8e2cbb ("bpf, ppc64: do not reload skb pointers in non-skb context")

For programs in native eBPF, LD_ABS/LD_IND are pretty much legacy these
days due to their limitations and the more efficient and flexible
alternatives that have been developed over time, such as direct packet
access. LD_ABS/LD_IND only cover 1/2/4 byte loads into a register; the
load happens in host endianness, and their exception handling can yield
unexpected behavior. The latter is explained in depth in f6b1b3bf0d5f
("bpf: fix subprog verifier bypass by div/mod by 0 exception") along
with the similar exception cases we had. In native eBPF, more recent
program types disable LD_ABS/LD_IND altogether through may_access_skb()
in the verifier, and given the limitations in terms of exception
handling, they are also disabled in programs that use BPF to BPF calls.

In terms of cBPF, LD_ABS/LD_IND is used by networking programs to
access packet data. It is not used in seccomp-BPF, but by programs that
do socket filtering or reuseport demuxing with cBPF. This is mostly
relevant for applications that have not yet migrated to native eBPF.

The main complexity and source of bugs in LD_ABS/LD_IND comes from
their implementation in the various JITs. Most of them keep the model
around from cBPF times by implementing a fastpath written in asm. They
typically use two CPU registers, hidden from the BPF program, for
caching the skb's headlen (skb->len - skb->data_len) and skb->data.
Throughout the JIT phase this requires keeping track of whether
LD_ABS/LD_IND is used and, if so, in the native eBPF case the two
registers need to be recached each time a BPF helper changes the
underlying packet data. At least in the eBPF case, available CPU
registers are scarce, and the additional exit path out of the
asm-written JIT helper also makes the JIT inflexible, since not all of
its parts are under control from plain C.
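To recap the contract every implementation has to emulate, a word-sized
LD_ABS behaves roughly as in the following C sketch (illustration only;
it mirrors the bpf_load_pointer()-based interpreter code removed further
below, with its skb_copy_bits() fallback for non-linear data):

  /* LD_ABS_W contract: R0 = ntohl(*(u32 *)(skb->data + imm)).
   * R1-R5 are scratched as in a helper call, R6-R9 are preserved.
   * A load that neither the linear data nor skb_copy_bits() can
   * satisfy aborts the whole program with return value 0.
   */
  static u64 ld_abs_w(const struct sk_buff *skb, s32 imm, bool *abort)
  {
  	u32 tmp;
  	void *ptr = bpf_load_pointer(skb, imm, sizeof(tmp), &tmp);

  	if (likely(ptr))
  		return get_unaligned_be32(ptr);	/* to host endianness */

  	*abort = true;	/* caller must exit the program with R0 = 0 */
  	return 0;
  }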
An LD_ABS/LD_IND implementation in eBPF therefore allows us to
significantly reduce the complexity in the JITs, with comparable
performance results for them, e.g.:

  test_bpf             tcpdump port 22             tcpdump complex
  x64      - before    15 21 10                    14 19 18
           - after      7 10 10                     7 10 15
  arm64    - before    40 91 92                    40 91 151
           - after     51 64 73                    51 62 113

For cBPF we now track any usage of LD_ABS/LD_IND in bpf_convert_filter()
and cache the skb's headlen and data in the cBPF prologue. BPF_REG_TMP
gets remapped from R8 to R2, since it's mainly just used as a local
temporary variable. This also allows to slightly shrink the image on
x86_64 for seccomp programs, since mapping to %rsi does not need an
ereg. In callee-saved R8 and R9 we now track skb data and headlen,
respectively. For normal prologue emission in the JITs this does not
add any extra instructions, since R8 and R9 are pushed to the stack in
any case on the eBPF side. cBPF uses the convert_bpf_ld_abs() emitter,
which already probes the fast path inline and falls back to the
bpf_skb_load_helper_{8,16,32}() helpers relying on the cached skb data
and headlen as well. R8 and R9 never need to be reloaded on account of
bpf_helper_changes_pkt_data(), since all skb access in cBPF is
read-only. Then, for the case of native eBPF, we use the
bpf_gen_ld_abs() emitter, which calls the
bpf_skb_load_helper_{8,16,32}_no_cache() helpers unconditionally and
neither caches skb data and headlen nor has an inlined fast path. The
reason for the latter is that native eBPF does not have any extra
registers available anyway; but even if there were, it avoids any
reload of skb data and headlen in the first place. Additionally, for
negative offsets, we provide an alternative bpf_skb_load_bytes_relative()
helper in eBPF, which operates similarly to bpf_skb_load_bytes() and
allows for more flexibility.

Tested myself on x64, arm64, s390x; tested by Sandipan on ppc64.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h    |   2 +
 include/linux/filter.h |   4 +-
 kernel/bpf/core.c      |  96 ++---------------
 kernel/bpf/verifier.c  |  24 +++++
 net/core/filter.c      | 236 +++++++++++++++++++++++++++++++++++++++--
 5 files changed, 262 insertions(+), 100 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d0e3d7ef36a8..0e00a13ff01b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -235,6 +235,8 @@ struct bpf_verifier_ops {
 				struct bpf_insn_access_aux *info);
 	int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
 			    const struct bpf_prog *prog);
+	int (*gen_ld_abs)(const struct bpf_insn *orig,
+			  struct bpf_insn *insn_buf);
 	u32 (*convert_ctx_access)(enum bpf_access_type type,
 				  const struct bpf_insn *src,
 				  struct bpf_insn *dst,
diff --git a/include/linux/filter.h b/include/linux/filter.h
index b7f81e3a70cb..da7e16523128 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -47,7 +47,9 @@ struct xdp_buff;
 /* Additional register mappings for converted user programs. */
 #define BPF_REG_A	BPF_REG_0
 #define BPF_REG_X	BPF_REG_7
-#define BPF_REG_TMP	BPF_REG_8
+#define BPF_REG_TMP	BPF_REG_2	/* scratch reg */
+#define BPF_REG_D	BPF_REG_8	/* data, callee-saved */
+#define BPF_REG_H	BPF_REG_9	/* hlen, callee-saved */
 
 /* Kernel hidden auxiliary/helper register for hardening step.
  * Only used by eBPF JITs.
It's nothing more than a temporary diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 90feeba3a1a1..1127552c8033 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -634,23 +634,6 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; - case BPF_LD | BPF_ABS | BPF_W: - case BPF_LD | BPF_ABS | BPF_H: - case BPF_LD | BPF_ABS | BPF_B: - *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); - *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); - break; - - case BPF_LD | BPF_IND | BPF_W: - case BPF_LD | BPF_IND | BPF_H: - case BPF_LD | BPF_IND | BPF_B: - *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); - *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg); - *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); - break; - case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); @@ -891,14 +874,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(LDX, MEM, W), \ INSN_3(LDX, MEM, DW), \ /* Immediate based. */ \ - INSN_3(LD, IMM, DW), \ - /* Misc (old cBPF carry-over). */ \ - INSN_3(LD, ABS, B), \ - INSN_3(LD, ABS, H), \ - INSN_3(LD, ABS, W), \ - INSN_3(LD, IND, B), \ - INSN_3(LD, IND, H), \ - INSN_3(LD, IND, W) + INSN_3(LD, IMM, DW) bool bpf_opcode_in_insntable(u8 code) { @@ -908,6 +884,13 @@ bool bpf_opcode_in_insntable(u8 code) [0 ... 255] = false, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), + /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */ + [BPF_LD | BPF_ABS | BPF_B] = true, + [BPF_LD | BPF_ABS | BPF_H] = true, + [BPF_LD | BPF_ABS | BPF_W] = true, + [BPF_LD | BPF_IND | BPF_B] = true, + [BPF_LD | BPF_IND | BPF_H] = true, + [BPF_LD | BPF_IND | BPF_W] = true, }; #undef BPF_INSN_3_TBL #undef BPF_INSN_2_TBL @@ -938,8 +921,6 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; - void *ptr; - int off; #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) @@ -1266,67 +1247,6 @@ out: atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) (DST + insn->off)); CONT; - LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ - off = IMM; -load_word: - /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only - * appearing in the programs where ctx == skb - * (see may_access_skb() in the verifier). All programs - * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6, - * bpf_convert_filter() saves it in BPF_R6, internal BPF - * verifier will check that BPF_R6 == ctx. - * - * BPF_ABS and BPF_IND are wrappers of function calls, - * so they scratch BPF_R1-BPF_R5 registers, preserve - * BPF_R6-BPF_R9, and store return value into BPF_R0. 
- * - * Implicit input: - * ctx == skb == BPF_R6 == CTX - * - * Explicit input: - * SRC == any register - * IMM == 32-bit immediate - * - * Output: - * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness - */ - - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be32(ptr); - CONT; - } - - return 0; - LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ - off = IMM; -load_half: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be16(ptr); - CONT; - } - - return 0; - LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ - off = IMM; -load_byte: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = *(u8 *)ptr; - CONT; - } - - return 0; - LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_word; - LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_half; - LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ - off = IMM + SRC; - goto load_byte; default_label: /* If we ever reach this, we have a bug somewhere. Die hard here diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0d91f18b2eb5..6ba10a83909d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3884,6 +3884,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (!env->ops->gen_ld_abs) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + if (env->subprog_cnt) { /* when program has LD_ABS insn JITs and interpreter assume * that r1 == ctx == skb which is not the case for callees @@ -5519,6 +5524,25 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (BPF_CLASS(insn->code) == BPF_LD && + (BPF_MODE(insn->code) == BPF_ABS || + BPF_MODE(insn->code) == BPF_IND)) { + cnt = env->ops->gen_ld_abs(insn, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; if (insn->src_reg == BPF_PSEUDO_CALL) diff --git a/net/core/filter.c b/net/core/filter.c index 865500f6180d..a49729842b3d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -162,6 +162,87 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } +BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u8 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return *(u8 *)ptr; + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u16 tmp, *ptr; + const int len = sizeof(tmp); + + if 
(offset >= 0) { + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be16(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u32 tmp, *ptr; + const int len = sizeof(tmp); + + if (likely(offset >= 0)) { + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be32(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, + offset); +} + BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); @@ -354,26 +435,87 @@ static bool convert_bpf_extensions(struct sock_filter *fp, return true; } +static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) +{ + const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); + int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); + bool endian = BPF_SIZE(fp->code) == BPF_H || + BPF_SIZE(fp->code) == BPF_W; + bool indirect = BPF_MODE(fp->code) == BPF_IND; + const int ip_align = NET_IP_ALIGN; + struct bpf_insn *insn = *insnp; + int offset = fp->k; + + if (!indirect && + ((unaligned_ok && offset >= 0) || + (!unaligned_ok && offset >= 0 && + offset + ip_align >= 0 && + offset + ip_align % size == 0))) { + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian); + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, + offset); + if (endian) + *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); + *insn++ = BPF_JMP_A(8); + } + + *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); + if (fp->k) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); + } + + switch (BPF_SIZE(fp->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); + break; + default: + return false; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn = BPF_EXIT_INSN(); + + *insnp = insn; + return true; +} + /** * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program + * @seen_ld_abs: bool whether we've seen ld_abs/ind * * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' * style extended BPF 
(eBPF). * Conversion workflow: * * 1) First pass for calculating the new program length: - * bpf_convert_filter(old_prog, old_len, NULL, &new_len) + * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: - * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); + * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) */ static int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_prog *new_prog, int *new_len) + struct bpf_prog *new_prog, int *new_len, + bool *seen_ld_abs) { int new_flen = 0, pass = 0, target, i, stack_off; struct bpf_insn *new_insn, *first_insn = NULL; @@ -412,12 +554,27 @@ do_pass: * do this ourself. Initial CTX is present in BPF_REG_ARG1. */ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); + if (*seen_ld_abs) { + /* For packet access in classic BPF, cache skb->data + * in callee-saved BPF R8 and skb->len - skb->data_len + * (headlen) in BPF R9. Since classic BPF is read-only + * on CTX, we only need to cache it once. + */ + *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + BPF_REG_D, BPF_REG_CTX, + offsetof(struct sk_buff, data)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, + offsetof(struct sk_buff, len)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, data_len)); + *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); + } } else { new_insn += 3; } for (i = 0; i < len; fp++, i++) { - struct bpf_insn tmp_insns[6] = { }; + struct bpf_insn tmp_insns[32] = { }; struct bpf_insn *insn = tmp_insns; if (addrs) @@ -460,6 +617,11 @@ do_pass: BPF_MODE(fp->code) == BPF_ABS && convert_bpf_extensions(fp, &insn)) break; + if (BPF_CLASS(fp->code) == BPF_LD && + convert_bpf_ld_abs(fp, &insn)) { + *seen_ld_abs = true; + break; + } if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { @@ -562,21 +724,31 @@ jmp_rest: break; /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ - case BPF_LDX | BPF_MSH | BPF_B: - /* tmp = A */ - *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); + case BPF_LDX | BPF_MSH | BPF_B: { + struct sock_filter tmp = { + .code = BPF_LD | BPF_ABS | BPF_B, + .k = fp->k, + }; + + *seen_ld_abs = true; + + /* X = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ - *insn++ = BPF_LD_ABS(BPF_B, fp->k); + convert_bpf_ld_abs(&tmp, &insn); + insn++; /* A &= 0xf */ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); + /* tmp = X */ + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; - + } /* RET_K is remaped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ @@ -658,6 +830,8 @@ jmp_rest: if (!new_prog) { /* Only calculating new length. */ *new_len = new_insn - first_insn; + if (*seen_ld_abs) + *new_len += 4; /* Prologue bits. 
*/
 		return 0;
 	}
@@ -1019,6 +1193,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
 	struct sock_filter *old_prog;
 	struct bpf_prog *old_fp;
 	int err, new_len, old_len = fp->len;
+	bool seen_ld_abs = false;
 
 	/* We are free to overwrite insns et al right here as it
 	 * won't be used at this point in time anymore internally
@@ -1040,7 +1215,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
 	}
 
 	/* 1st pass: calculate the new program length. */
-	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
+	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
+				 &seen_ld_abs);
 	if (err)
 		goto out_err_free;
 
@@ -1059,7 +1235,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
 	fp->len = new_len;
 
 	/* 2nd pass: remap sock_filter insns into bpf_insn insns. */
-	err = bpf_convert_filter(old_prog, old_len, fp, &new_len);
+	err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
+				 &seen_ld_abs);
 	if (err)
 		/* 2nd bpf_convert_filter() can fail only if it fails
 		 * to allocate memory, remapping must succeed. Note,
@@ -4330,6 +4507,41 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
 	return insn - insn_buf;
 }
 
+static int bpf_gen_ld_abs(const struct bpf_insn *orig,
+			  struct bpf_insn *insn_buf)
+{
+	bool indirect = BPF_MODE(orig->code) == BPF_IND;
+	struct bpf_insn *insn = insn_buf;
+
+	/* We're guaranteed here that CTX is in R6. */
+	*insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
+	if (!indirect) {
+		*insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
+	} else {
+		*insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
+		if (orig->imm)
+			*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
+	}
+
+	switch (BPF_SIZE(orig->code)) {
+	case BPF_B:
+		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
+		break;
+	case BPF_H:
+		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
+		break;
+	case BPF_W:
+		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
+		break;
+	}
+
+	*insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
+	*insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
+	*insn++ = BPF_EXIT_INSN();
+
+	return insn - insn_buf;
+}
+
 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 			       const struct bpf_prog *prog)
 {
@@ -5599,6 +5811,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = {
 	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
+	.gen_ld_abs		= bpf_gen_ld_abs,
 };
 
 const struct bpf_prog_ops sk_filter_prog_ops = {
@@ -5610,6 +5823,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.is_valid_access	= tc_cls_act_is_valid_access,
 	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
 	.gen_prologue		= tc_cls_act_prologue,
+	.gen_ld_abs		= bpf_gen_ld_abs,
 };
 
 const struct bpf_prog_ops tc_cls_act_prog_ops = {

From 4e1ec56cdc59746943b2acfab3c171b930187bbe Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Fri, 4 May 2018 01:08:15 +0200
Subject: [PATCH 43/66] bpf: add skb_load_bytes_relative helper

This adds a small BPF helper similar to bpf_skb_load_bytes() that is
able to load data relative to the skb's mac/net header offset from its
linear data. Compared to bpf_skb_load_bytes(), it takes a fifth
argument, namely start_header, which is either BPF_HDR_START_MAC or
BPF_HDR_START_NET. This provides a more flexible alternative to
LD_ABS/LD_IND with negative offsets.
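For example, reading the IPv4 protocol field relative to the network
header from a socket filter could look like the following sketch
(hypothetical program, assuming the usual libbpf-style declaration of
the helper and the SEC() macro):

  #include <stddef.h>
  #include <linux/bpf.h>
  #include <linux/ip.h>
  #include <linux/in.h>

  SEC("socket")
  int ipproto_filter(struct __sk_buff *skb)
  {
  	__u8 proto;

  	/* Read relative to the network header, independent of where
  	 * skb->data currently points.
  	 */
  	if (bpf_skb_load_bytes_relative(skb, offsetof(struct iphdr, protocol),
  					&proto, sizeof(proto),
  					BPF_HDR_START_NET) < 0)
  		return 0;	/* drop on failed load */

  	return proto == IPPROTO_TCP ? skb->len : 0;
  }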
It's enabled for tc BPF programs as well as sock filter program types,
where it's mainly useful in reuseport programs to ease access to lower
header data.

Reference: https://lists.iovisor.org/pipermail/iovisor-dev/2017-March/000698.html

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: Alexei Starovoitov
---
 include/uapi/linux/bpf.h | 33 ++++++++++++++++++++++++++++-
 net/core/filter.c        | 45 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a3a495052511..93d5a4eeec2a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1802,6 +1802,30 @@ union bpf_attr {
 *	Return
 *		a non-negative value equal to or less than size on success,
 *		or a negative error in case of failure.
+ *
+ * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
+ *	Description
+ *		This helper is similar to **bpf_skb_load_bytes**\ () in that
+ *		it provides an easy way to load *len* bytes from *offset*
+ *		from the packet associated with *skb*, into the buffer
+ *		pointed to by *to*. The difference to
+ *		**bpf_skb_load_bytes**\ () is that a fifth argument
+ *		*start_header* exists in order to select a base offset to
+ *		start from. *start_header* can be one of:
+ *
+ *		**BPF_HDR_START_MAC**
+ *			Base offset to load data from is *skb*'s mac header.
+ *		**BPF_HDR_START_NET**
+ *			Base offset to load data from is *skb*'s network header.
+ *
+ *		In general, "direct packet access" is the preferred method to
+ *		access packet data; however, this helper is particularly
+ *		useful in socket filters where *skb*\ **->data** does not
+ *		always point to the start of the mac header and where
+ *		"direct packet access" is not available.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
 */

#define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -1871,7 +1895,8 @@ union bpf_attr {
	FN(bind),			\
	FN(xdp_adjust_tail),		\
	FN(skb_get_xfrm_state),		\
-	FN(get_stack),
+	FN(get_stack),			\
+	FN(skb_load_bytes_relative),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
 * function eBPF program intends to call
@@ -1932,6 +1957,12 @@ enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
 };
 
+/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
+enum bpf_hdr_start_off {
+	BPF_HDR_START_MAC,
+	BPF_HDR_START_NET,
+};
+
 /* user accessible mirror of in-kernel sk_buff.
* new fields can only be added to the end of this structure */ diff --git a/net/core/filter.c b/net/core/filter.c index a49729842b3d..6877426c23a6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1684,6 +1684,47 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, + u32, offset, void *, to, u32, len, u32, start_header) +{ + u8 *ptr; + + if (unlikely(offset > 0xffff || len > skb_headlen(skb))) + goto err_clear; + + switch (start_header) { + case BPF_HDR_START_MAC: + ptr = skb_mac_header(skb) + offset; + break; + case BPF_HDR_START_NET: + ptr = skb_network_header(skb) + offset; + break; + default: + goto err_clear; + } + + if (likely(ptr >= skb_mac_header(skb) && + ptr + len <= skb_tail_pointer(skb))) { + memcpy(to, ptr, len); + return 0; + } + +err_clear: + memset(to, 0, len); + return -EFAULT; +} + +static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { + .func = bpf_skb_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write @@ -4061,6 +4102,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: @@ -4078,6 +4121,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: From e782bdcf58c5ace7b7d58b2436177de9785a18e8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:16 +0200 Subject: [PATCH 44/66] bpf, x64: remove ld_abs/ld_ind Since LD_ABS/LD_IND instructions are now removed from the core and reimplemented through a combination of inlined BPF instructions and a slow-path helper, we can get rid of the complexity from x64 JIT. 
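For context, the negative-offset handling that goes away here
(CHOOSE_LOAD_FUNC() with SKF_LL_OFF) served classic filters that
address data relative to the link-layer header. A hypothetical cBPF
sketch accepting only IPv4 frames:

  struct sock_filter ipv4_only[] = {
  	/* A = EtherType, addressed relative to the link-layer header */
  	BPF_STMT(BPF_LD | BPF_H | BPF_ABS, SKF_LL_OFF + 12),
  	/* keep up to 64k bytes if IPv4, otherwise drop */
  	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_IP, 0, 1),
  	BPF_STMT(BPF_RET | BPF_K, 0xffff),
  	BPF_STMT(BPF_RET | BPF_K, 0),
  };

Such loads now always take the slow path through
bpf_internal_load_pointer_neg_helper() inside the
bpf_skb_load_helper_{8,16,32}() helpers instead of the removed asm.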
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- arch/x86/net/Makefile | 3 +- arch/x86/net/bpf_jit.S | 154 ------------------------------------ arch/x86/net/bpf_jit_comp.c | 144 +-------------------------------- 3 files changed, 5 insertions(+), 296 deletions(-) delete mode 100644 arch/x86/net/bpf_jit.S diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile index c6b464a261bb..59e123da580c 100644 --- a/arch/x86/net/Makefile +++ b/arch/x86/net/Makefile @@ -5,6 +5,5 @@ ifeq ($(CONFIG_X86_32),y) obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o else - OBJECT_FILES_NON_STANDARD_bpf_jit.o += y - obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o + obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o endif diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S deleted file mode 100644 index b33093f84528..000000000000 --- a/arch/x86/net/bpf_jit.S +++ /dev/null @@ -1,154 +0,0 @@ -/* bpf_jit.S : BPF JIT helper functions - * - * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#include -#include - -/* - * Calling convention : - * rbx : skb pointer (callee saved) - * esi : offset of byte(s) to fetch in skb (can be scratched) - * r10 : copy of skb->data - * r9d : hlen = skb->len - skb->data_len - */ -#define SKBDATA %r10 -#define SKF_MAX_NEG_OFF $(-0x200000) /* SKF_LL_OFF from filter.h */ - -#define FUNC(name) \ - .globl name; \ - .type name, @function; \ - name: - -FUNC(sk_load_word) - test %esi,%esi - js bpf_slow_path_word_neg - -FUNC(sk_load_word_positive_offset) - mov %r9d,%eax # hlen - sub %esi,%eax # hlen - offset - cmp $3,%eax - jle bpf_slow_path_word - mov (SKBDATA,%rsi),%eax - bswap %eax /* ntohl() */ - ret - -FUNC(sk_load_half) - test %esi,%esi - js bpf_slow_path_half_neg - -FUNC(sk_load_half_positive_offset) - mov %r9d,%eax - sub %esi,%eax # hlen - offset - cmp $1,%eax - jle bpf_slow_path_half - movzwl (SKBDATA,%rsi),%eax - rol $8,%ax # ntohs() - ret - -FUNC(sk_load_byte) - test %esi,%esi - js bpf_slow_path_byte_neg - -FUNC(sk_load_byte_positive_offset) - cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */ - jle bpf_slow_path_byte - movzbl (SKBDATA,%rsi),%eax - ret - -/* rsi contains offset and can be scratched */ -#define bpf_slow_path_common(LEN) \ - lea 32(%rbp), %rdx;\ - FRAME_BEGIN; \ - mov %rbx, %rdi; /* arg1 == skb */ \ - push %r9; \ - push SKBDATA; \ -/* rsi already has offset */ \ - mov $LEN,%ecx; /* len */ \ - call skb_copy_bits; \ - test %eax,%eax; \ - pop SKBDATA; \ - pop %r9; \ - FRAME_END - - -bpf_slow_path_word: - bpf_slow_path_common(4) - js bpf_error - mov 32(%rbp),%eax - bswap %eax - ret - -bpf_slow_path_half: - bpf_slow_path_common(2) - js bpf_error - mov 32(%rbp),%ax - rol $8,%ax - movzwl %ax,%eax - ret - -bpf_slow_path_byte: - bpf_slow_path_common(1) - js bpf_error - movzbl 32(%rbp),%eax - ret - -#define sk_negative_common(SIZE) \ - FRAME_BEGIN; \ - mov %rbx, %rdi; /* arg1 == skb */ \ - push %r9; \ - push SKBDATA; \ -/* rsi already has offset */ \ - mov $SIZE,%edx; /* size */ \ - call bpf_internal_load_pointer_neg_helper; \ - test %rax,%rax; \ - pop SKBDATA; \ - pop %r9; \ - FRAME_END; \ - jz bpf_error - -bpf_slow_path_word_neg: - cmp SKF_MAX_NEG_OFF, %esi /* test range */ - jl bpf_error /* offset lower -> error */ - -FUNC(sk_load_word_negative_offset) - sk_negative_common(4) - mov (%rax), %eax - 
bswap %eax - ret - -bpf_slow_path_half_neg: - cmp SKF_MAX_NEG_OFF, %esi - jl bpf_error - -FUNC(sk_load_half_negative_offset) - sk_negative_common(2) - mov (%rax),%ax - rol $8,%ax - movzwl %ax,%eax - ret - -bpf_slow_path_byte_neg: - cmp SKF_MAX_NEG_OFF, %esi - jl bpf_error - -FUNC(sk_load_byte_negative_offset) - sk_negative_common(1) - movzbl (%rax), %eax - ret - -bpf_error: -# force a return 0 from jit handler - xor %eax,%eax - mov (%rbp),%rbx - mov 8(%rbp),%r13 - mov 16(%rbp),%r14 - mov 24(%rbp),%r15 - add $40, %rbp - leaveq - ret diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 1c3c81ddeb87..ce08b7bb0304 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -17,15 +17,6 @@ #include #include -/* - * Assembly code in arch/x86/net/bpf_jit.S - */ -extern u8 sk_load_word[], sk_load_half[], sk_load_byte[]; -extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[]; -extern u8 sk_load_byte_positive_offset[]; -extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[]; -extern u8 sk_load_byte_negative_offset[]; - static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { if (len == 1) @@ -107,9 +98,6 @@ static int bpf_size_to_x86_bytes(int bpf_size) #define X86_JLE 0x7E #define X86_JG 0x7F -#define CHOOSE_LOAD_FUNC(K, func) \ - ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) - /* Pick a register outside of BPF range for JIT internal work */ #define AUX_REG (MAX_BPF_JIT_REG + 1) @@ -120,8 +108,8 @@ static int bpf_size_to_x86_bytes(int bpf_size) * register in load/store instructions, it always needs an * extra byte of encoding and is callee saved. * - * R9 caches skb->len - skb->data_len - * R10 caches skb->data, and used for blinding (if enabled) + * Also x86-64 register R9 is unused. x86-64 register R10 is + * used for blinding (if enabled). */ static const int reg2hex[] = { [BPF_REG_0] = 0, /* RAX */ @@ -196,19 +184,15 @@ static void jit_fill_hole(void *area, unsigned int size) struct jit_context { int cleanup_addr; /* Epilogue code offset */ - bool seen_ld_abs; - bool seen_ax_reg; }; /* Maximum number of bytes emitted while JITing one eBPF insn */ #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 -#define AUX_STACK_SPACE \ - (32 /* Space for RBX, R13, R14, R15 */ + \ - 8 /* Space for skb_copy_bits() buffer */) +#define AUX_STACK_SPACE 40 /* Space for RBX, R13, R14, R15, tailcnt */ -#define PROLOGUE_SIZE 37 +#define PROLOGUE_SIZE 37 /* * Emit x86-64 prologue code for BPF program and check its size. @@ -232,20 +216,8 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) /* sub rbp, AUX_STACK_SPACE */ EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE); - /* All classic BPF filters use R6(rbx) save it */ - /* mov qword ptr [rbp+0],rbx */ EMIT4(0x48, 0x89, 0x5D, 0); - - /* - * bpf_convert_filter() maps classic BPF register X to R7 and uses R8 - * as temporary, so all tcpdump filters need to spill/fill R7(R13) and - * R8(R14). R9(R15) spill could be made conditional, but there is only - * one 'bpf_error' return path out of helper functions inside bpf_jit.S - * The overhead of extra spill is negligible for any filter other - * than synthetic ones. Therefore not worth adding complexity. 
- */ - /* mov qword ptr [rbp+8],r13 */ EMIT4(0x4C, 0x89, 0x6D, 8); /* mov qword ptr [rbp+16],r14 */ @@ -353,27 +325,6 @@ static void emit_bpf_tail_call(u8 **pprog) *pprog = prog; } - -static void emit_load_skb_data_hlen(u8 **pprog) -{ - u8 *prog = *pprog; - int cnt = 0; - - /* - * r9d = skb->len - skb->data_len (headlen) - * r10 = skb->data - */ - /* mov %r9d, off32(%rdi) */ - EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len)); - - /* sub %r9d, off32(%rdi) */ - EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len)); - - /* mov %r10, off32(%rdi) */ - EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data)); - *pprog = prog; -} - static void emit_mov_imm32(u8 **pprog, bool sign_propagate, u32 dst_reg, const u32 imm32) { @@ -462,8 +413,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, { struct bpf_insn *insn = bpf_prog->insnsi; int insn_cnt = bpf_prog->len; - bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0); - bool seen_ax_reg = ctx->seen_ax_reg | (oldproglen == 0); bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; int i, cnt = 0; @@ -473,9 +422,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, emit_prologue(&prog, bpf_prog->aux->stack_depth, bpf_prog_was_classic(bpf_prog)); - if (seen_ld_abs) - emit_load_skb_data_hlen(&prog); - for (i = 0; i < insn_cnt; i++, insn++) { const s32 imm32 = insn->imm; u32 dst_reg = insn->dst_reg; @@ -483,13 +429,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 b2 = 0, b3 = 0; s64 jmp_offset; u8 jmp_cond; - bool reload_skb_data; int ilen; u8 *func; - if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX) - ctx->seen_ax_reg = seen_ax_reg = true; - switch (insn->code) { /* ALU */ case BPF_ALU | BPF_ADD | BPF_X: @@ -916,36 +858,12 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP | BPF_CALL: func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); - if (seen_ld_abs) { - reload_skb_data = bpf_helper_changes_pkt_data(func); - if (reload_skb_data) { - EMIT1(0x57); /* push %rdi */ - jmp_offset += 22; /* pop, mov, sub, mov */ - } else { - EMIT2(0x41, 0x52); /* push %r10 */ - EMIT2(0x41, 0x51); /* push %r9 */ - /* - * We need to adjust jmp offset, since - * pop %r9, pop %r10 take 4 bytes after call insn - */ - jmp_offset += 4; - } - } if (!imm32 || !is_simm32(jmp_offset)) { pr_err("unsupported BPF func %d addr %p image %p\n", imm32, func, image); return -EINVAL; } EMIT1_off32(0xE8, jmp_offset); - if (seen_ld_abs) { - if (reload_skb_data) { - EMIT1(0x5F); /* pop %rdi */ - emit_load_skb_data_hlen(&prog); - } else { - EMIT2(0x41, 0x59); /* pop %r9 */ - EMIT2(0x41, 0x5A); /* pop %r10 */ - } - } break; case BPF_JMP | BPF_TAIL_CALL: @@ -1080,60 +998,6 @@ emit_jmp: } break; - case BPF_LD | BPF_IND | BPF_W: - func = sk_load_word; - goto common_load; - case BPF_LD | BPF_ABS | BPF_W: - func = CHOOSE_LOAD_FUNC(imm32, sk_load_word); -common_load: - ctx->seen_ld_abs = seen_ld_abs = true; - jmp_offset = func - (image + addrs[i]); - if (!func || !is_simm32(jmp_offset)) { - pr_err("unsupported BPF func %d addr %p image %p\n", - imm32, func, image); - return -EINVAL; - } - if (BPF_MODE(insn->code) == BPF_ABS) { - /* mov %esi, imm32 */ - EMIT1_off32(0xBE, imm32); - } else { - /* mov %rsi, src_reg */ - EMIT_mov(BPF_REG_2, src_reg); - if (imm32) { - if (is_imm8(imm32)) - /* add %esi, imm8 */ - EMIT3(0x83, 0xC6, imm32); - else - /* add %esi, imm32 */ - EMIT2_off32(0x81, 0xC6, imm32); - } - } - /* - * skb pointer is in R6 (%rbx), it will be copied into - * 
%rdi if skb_copy_bits() call is necessary. - * sk_load_* helpers also use %r10 and %r9d. - * See bpf_jit.S - */ - if (seen_ax_reg) - /* r10 = skb->data, mov %r10, off32(%rbx) */ - EMIT3_off32(0x4c, 0x8b, 0x93, - offsetof(struct sk_buff, data)); - EMIT1_off32(0xE8, jmp_offset); /* call */ - break; - - case BPF_LD | BPF_IND | BPF_H: - func = sk_load_half; - goto common_load; - case BPF_LD | BPF_ABS | BPF_H: - func = CHOOSE_LOAD_FUNC(imm32, sk_load_half); - goto common_load; - case BPF_LD | BPF_IND | BPF_B: - func = sk_load_byte; - goto common_load; - case BPF_LD | BPF_ABS | BPF_B: - func = CHOOSE_LOAD_FUNC(imm32, sk_load_byte); - goto common_load; - case BPF_JMP | BPF_EXIT: if (seen_exit) { jmp_offset = ctx->cleanup_addr - addrs[i]; From 816d9ef32a8b29174cd5bb8c9f76c5b326c58a9b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:17 +0200 Subject: [PATCH 45/66] bpf, arm64: remove ld_abs/ld_ind Since LD_ABS/LD_IND instructions are now removed from the core and reimplemented through a combination of inlined BPF instructions and a slow-path helper, we can get rid of the complexity from arm64 JIT. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 65 ----------------------------------- 1 file changed, 65 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index a93350451e8e..0b40c8fb0706 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -723,71 +723,6 @@ emit_cond_jmp: emit(A64_CBNZ(0, tmp3, jmp_offset), ctx); break; - /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */ - case BPF_LD | BPF_ABS | BPF_W: - case BPF_LD | BPF_ABS | BPF_H: - case BPF_LD | BPF_ABS | BPF_B: - /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + src + imm)) */ - case BPF_LD | BPF_IND | BPF_W: - case BPF_LD | BPF_IND | BPF_H: - case BPF_LD | BPF_IND | BPF_B: - { - const u8 r0 = bpf2a64[BPF_REG_0]; /* r0 = return value */ - const u8 r6 = bpf2a64[BPF_REG_6]; /* r6 = pointer to sk_buff */ - const u8 fp = bpf2a64[BPF_REG_FP]; - const u8 r1 = bpf2a64[BPF_REG_1]; /* r1: struct sk_buff *skb */ - const u8 r2 = bpf2a64[BPF_REG_2]; /* r2: int k */ - const u8 r3 = bpf2a64[BPF_REG_3]; /* r3: unsigned int size */ - const u8 r4 = bpf2a64[BPF_REG_4]; /* r4: void *buffer */ - const u8 r5 = bpf2a64[BPF_REG_5]; /* r5: void *(*func)(...) 
*/ - int size; - - emit(A64_MOV(1, r1, r6), ctx); - emit_a64_mov_i(0, r2, imm, ctx); - if (BPF_MODE(code) == BPF_IND) - emit(A64_ADD(0, r2, r2, src), ctx); - switch (BPF_SIZE(code)) { - case BPF_W: - size = 4; - break; - case BPF_H: - size = 2; - break; - case BPF_B: - size = 1; - break; - default: - return -EINVAL; - } - emit_a64_mov_i64(r3, size, ctx); - emit(A64_SUB_I(1, r4, fp, ctx->stack_size), ctx); - emit_a64_mov_i64(r5, (unsigned long)bpf_load_pointer, ctx); - emit(A64_BLR(r5), ctx); - emit(A64_MOV(1, r0, A64_R(0)), ctx); - - jmp_offset = epilogue_offset(ctx); - check_imm19(jmp_offset); - emit(A64_CBZ(1, r0, jmp_offset), ctx); - emit(A64_MOV(1, r5, r0), ctx); - switch (BPF_SIZE(code)) { - case BPF_W: - emit(A64_LDR32(r0, r5, A64_ZR), ctx); -#ifndef CONFIG_CPU_BIG_ENDIAN - emit(A64_REV32(0, r0, r0), ctx); -#endif - break; - case BPF_H: - emit(A64_LDRH(r0, r5, A64_ZR), ctx); -#ifndef CONFIG_CPU_BIG_ENDIAN - emit(A64_REV16(0, r0, r0), ctx); -#endif - break; - case BPF_B: - emit(A64_LDRB(r0, r5, A64_ZR), ctx); - break; - } - break; - } default: pr_err_once("unknown opcode %02x\n", code); return -EINVAL; From fe83963b7c38a2b30a26a90880c75a31c283d957 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:18 +0200 Subject: [PATCH 46/66] bpf, sparc64: remove ld_abs/ld_ind Since LD_ABS/LD_IND instructions are now removed from the core and reimplemented through a combination of inlined BPF instructions and a slow-path helper, we can get rid of the complexity from sparc64 JIT. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: David S. Miller Signed-off-by: Alexei Starovoitov --- arch/sparc/net/Makefile | 5 +- arch/sparc/net/bpf_jit_64.h | 29 ------ arch/sparc/net/bpf_jit_asm_64.S | 162 ------------------------------- arch/sparc/net/bpf_jit_comp_64.c | 79 +-------------- 4 files changed, 6 insertions(+), 269 deletions(-) delete mode 100644 arch/sparc/net/bpf_jit_asm_64.S diff --git a/arch/sparc/net/Makefile b/arch/sparc/net/Makefile index 76fa8e95b721..d32aac3a25b8 100644 --- a/arch/sparc/net/Makefile +++ b/arch/sparc/net/Makefile @@ -1,4 +1,7 @@ # # Arch-specific network modules # -obj-$(CONFIG_BPF_JIT) += bpf_jit_asm_$(BITS).o bpf_jit_comp_$(BITS).o +obj-$(CONFIG_BPF_JIT) += bpf_jit_comp_$(BITS).o +ifeq ($(BITS),32) +obj-$(CONFIG_BPF_JIT) += bpf_jit_asm_32.o +endif diff --git a/arch/sparc/net/bpf_jit_64.h b/arch/sparc/net/bpf_jit_64.h index 428f7fd19175..fbc836f1c51c 100644 --- a/arch/sparc/net/bpf_jit_64.h +++ b/arch/sparc/net/bpf_jit_64.h @@ -33,35 +33,6 @@ #define I5 0x1d #define FP 0x1e #define I7 0x1f - -#define r_SKB L0 -#define r_HEADLEN L4 -#define r_SKB_DATA L5 -#define r_TMP G1 -#define r_TMP2 G3 - -/* assembly code in arch/sparc/net/bpf_jit_asm_64.S */ -extern u32 bpf_jit_load_word[]; -extern u32 bpf_jit_load_half[]; -extern u32 bpf_jit_load_byte[]; -extern u32 bpf_jit_load_byte_msh[]; -extern u32 bpf_jit_load_word_positive_offset[]; -extern u32 bpf_jit_load_half_positive_offset[]; -extern u32 bpf_jit_load_byte_positive_offset[]; -extern u32 bpf_jit_load_byte_msh_positive_offset[]; -extern u32 bpf_jit_load_word_negative_offset[]; -extern u32 bpf_jit_load_half_negative_offset[]; -extern u32 bpf_jit_load_byte_negative_offset[]; -extern u32 bpf_jit_load_byte_msh_negative_offset[]; - -#else -#define r_RESULT %o0 -#define r_SKB %o0 -#define r_OFF %o1 -#define r_HEADLEN %l4 -#define r_SKB_DATA %l5 -#define r_TMP %g1 -#define r_TMP2 %g3 #endif #endif /* _BPF_JIT_H */ diff --git a/arch/sparc/net/bpf_jit_asm_64.S b/arch/sparc/net/bpf_jit_asm_64.S 
deleted file mode 100644 index 7177867052a1..000000000000 --- a/arch/sparc/net/bpf_jit_asm_64.S +++ /dev/null @@ -1,162 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include - -#include "bpf_jit_64.h" - -#define SAVE_SZ 176 -#define SCRATCH_OFF STACK_BIAS + 128 -#define BE_PTR(label) be,pn %xcc, label -#define SIGN_EXTEND(reg) sra reg, 0, reg - -#define SKF_MAX_NEG_OFF (-0x200000) /* SKF_LL_OFF from filter.h */ - - .text - .globl bpf_jit_load_word -bpf_jit_load_word: - cmp r_OFF, 0 - bl bpf_slow_path_word_neg - nop - .globl bpf_jit_load_word_positive_offset -bpf_jit_load_word_positive_offset: - sub r_HEADLEN, r_OFF, r_TMP - cmp r_TMP, 3 - ble bpf_slow_path_word - add r_SKB_DATA, r_OFF, r_TMP - andcc r_TMP, 3, %g0 - bne load_word_unaligned - nop - retl - ld [r_TMP], r_RESULT -load_word_unaligned: - ldub [r_TMP + 0x0], r_OFF - ldub [r_TMP + 0x1], r_TMP2 - sll r_OFF, 8, r_OFF - or r_OFF, r_TMP2, r_OFF - ldub [r_TMP + 0x2], r_TMP2 - sll r_OFF, 8, r_OFF - or r_OFF, r_TMP2, r_OFF - ldub [r_TMP + 0x3], r_TMP2 - sll r_OFF, 8, r_OFF - retl - or r_OFF, r_TMP2, r_RESULT - - .globl bpf_jit_load_half -bpf_jit_load_half: - cmp r_OFF, 0 - bl bpf_slow_path_half_neg - nop - .globl bpf_jit_load_half_positive_offset -bpf_jit_load_half_positive_offset: - sub r_HEADLEN, r_OFF, r_TMP - cmp r_TMP, 1 - ble bpf_slow_path_half - add r_SKB_DATA, r_OFF, r_TMP - andcc r_TMP, 1, %g0 - bne load_half_unaligned - nop - retl - lduh [r_TMP], r_RESULT -load_half_unaligned: - ldub [r_TMP + 0x0], r_OFF - ldub [r_TMP + 0x1], r_TMP2 - sll r_OFF, 8, r_OFF - retl - or r_OFF, r_TMP2, r_RESULT - - .globl bpf_jit_load_byte -bpf_jit_load_byte: - cmp r_OFF, 0 - bl bpf_slow_path_byte_neg - nop - .globl bpf_jit_load_byte_positive_offset -bpf_jit_load_byte_positive_offset: - cmp r_OFF, r_HEADLEN - bge bpf_slow_path_byte - nop - retl - ldub [r_SKB_DATA + r_OFF], r_RESULT - -#define bpf_slow_path_common(LEN) \ - save %sp, -SAVE_SZ, %sp; \ - mov %i0, %o0; \ - mov %i1, %o1; \ - add %fp, SCRATCH_OFF, %o2; \ - call skb_copy_bits; \ - mov (LEN), %o3; \ - cmp %o0, 0; \ - restore; - -bpf_slow_path_word: - bpf_slow_path_common(4) - bl bpf_error - ld [%sp + SCRATCH_OFF], r_RESULT - retl - nop -bpf_slow_path_half: - bpf_slow_path_common(2) - bl bpf_error - lduh [%sp + SCRATCH_OFF], r_RESULT - retl - nop -bpf_slow_path_byte: - bpf_slow_path_common(1) - bl bpf_error - ldub [%sp + SCRATCH_OFF], r_RESULT - retl - nop - -#define bpf_negative_common(LEN) \ - save %sp, -SAVE_SZ, %sp; \ - mov %i0, %o0; \ - mov %i1, %o1; \ - SIGN_EXTEND(%o1); \ - call bpf_internal_load_pointer_neg_helper; \ - mov (LEN), %o2; \ - mov %o0, r_TMP; \ - cmp %o0, 0; \ - BE_PTR(bpf_error); \ - restore; - -bpf_slow_path_word_neg: - sethi %hi(SKF_MAX_NEG_OFF), r_TMP - cmp r_OFF, r_TMP - bl bpf_error - nop - .globl bpf_jit_load_word_negative_offset -bpf_jit_load_word_negative_offset: - bpf_negative_common(4) - andcc r_TMP, 3, %g0 - bne load_word_unaligned - nop - retl - ld [r_TMP], r_RESULT - -bpf_slow_path_half_neg: - sethi %hi(SKF_MAX_NEG_OFF), r_TMP - cmp r_OFF, r_TMP - bl bpf_error - nop - .globl bpf_jit_load_half_negative_offset -bpf_jit_load_half_negative_offset: - bpf_negative_common(2) - andcc r_TMP, 1, %g0 - bne load_half_unaligned - nop - retl - lduh [r_TMP], r_RESULT - -bpf_slow_path_byte_neg: - sethi %hi(SKF_MAX_NEG_OFF), r_TMP - cmp r_OFF, r_TMP - bl bpf_error - nop - .globl bpf_jit_load_byte_negative_offset -bpf_jit_load_byte_negative_offset: - bpf_negative_common(1) - retl - ldub [r_TMP], r_RESULT - -bpf_error: - /* Make the JIT program itself return zero. 
*/ - ret - restore %g0, %g0, %o0 diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 48a25869349b..9f5918e0693a 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -48,10 +48,6 @@ static void bpf_flush_icache(void *start_, void *end_) } } -#define SEEN_DATAREF 1 /* might call external helpers */ -#define SEEN_XREG 2 /* ebx is used */ -#define SEEN_MEM 4 /* use mem[] for temporary storage */ - #define S13(X) ((X) & 0x1fff) #define S5(X) ((X) & 0x1f) #define IMMED 0x00002000 @@ -198,7 +194,6 @@ struct jit_ctx { bool tmp_1_used; bool tmp_2_used; bool tmp_3_used; - bool saw_ld_abs_ind; bool saw_frame_pointer; bool saw_call; bool saw_tail_call; @@ -207,9 +202,7 @@ struct jit_ctx { #define TMP_REG_1 (MAX_BPF_JIT_REG + 0) #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) -#define SKB_HLEN_REG (MAX_BPF_JIT_REG + 2) -#define SKB_DATA_REG (MAX_BPF_JIT_REG + 3) -#define TMP_REG_3 (MAX_BPF_JIT_REG + 4) +#define TMP_REG_3 (MAX_BPF_JIT_REG + 2) /* Map BPF registers to SPARC registers */ static const int bpf2sparc[] = { @@ -238,9 +231,6 @@ static const int bpf2sparc[] = { [TMP_REG_1] = G1, [TMP_REG_2] = G2, [TMP_REG_3] = G3, - - [SKB_HLEN_REG] = L4, - [SKB_DATA_REG] = L5, }; static void emit(const u32 insn, struct jit_ctx *ctx) @@ -800,25 +790,6 @@ static int emit_compare_and_branch(const u8 code, const u8 dst, u8 src, return 0; } -static void load_skb_regs(struct jit_ctx *ctx, u8 r_skb) -{ - const u8 r_headlen = bpf2sparc[SKB_HLEN_REG]; - const u8 r_data = bpf2sparc[SKB_DATA_REG]; - const u8 r_tmp = bpf2sparc[TMP_REG_1]; - unsigned int off; - - off = offsetof(struct sk_buff, len); - emit(LD32I | RS1(r_skb) | S13(off) | RD(r_headlen), ctx); - - off = offsetof(struct sk_buff, data_len); - emit(LD32I | RS1(r_skb) | S13(off) | RD(r_tmp), ctx); - - emit(SUB | RS1(r_headlen) | RS2(r_tmp) | RD(r_headlen), ctx); - - off = offsetof(struct sk_buff, data); - emit(LDPTRI | RS1(r_skb) | S13(off) | RD(r_data), ctx); -} - /* Just skip the save instruction and the ctx register move. */ #define BPF_TAILCALL_PROLOGUE_SKIP 16 #define BPF_TAILCALL_CNT_SP_OFF (STACK_BIAS + 128) @@ -857,9 +828,6 @@ static void build_prologue(struct jit_ctx *ctx) emit_reg_move(I0, O0, ctx); /* If you add anything here, adjust BPF_TAILCALL_PROLOGUE_SKIP above. */ - - if (ctx->saw_ld_abs_ind) - load_skb_regs(ctx, bpf2sparc[BPF_REG_1]); } static void build_epilogue(struct jit_ctx *ctx) @@ -1225,16 +1193,11 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) u8 *func = ((u8 *)__bpf_call_base) + imm; ctx->saw_call = true; - if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func)) - emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx); emit_call((u32 *)func, ctx); emit_nop(ctx); emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx); - - if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func)) - load_skb_regs(ctx, L7); break; } @@ -1412,43 +1375,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) emit_nop(ctx); break; } -#define CHOOSE_LOAD_FUNC(K, func) \ - ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? 
func##_negative_offset : func) : func##_positive_offset) - - /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */ - case BPF_LD | BPF_ABS | BPF_W: - func = CHOOSE_LOAD_FUNC(imm, bpf_jit_load_word); - goto common_load; - case BPF_LD | BPF_ABS | BPF_H: - func = CHOOSE_LOAD_FUNC(imm, bpf_jit_load_half); - goto common_load; - case BPF_LD | BPF_ABS | BPF_B: - func = CHOOSE_LOAD_FUNC(imm, bpf_jit_load_byte); - goto common_load; - /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + src + imm)) */ - case BPF_LD | BPF_IND | BPF_W: - func = bpf_jit_load_word; - goto common_load; - case BPF_LD | BPF_IND | BPF_H: - func = bpf_jit_load_half; - goto common_load; - - case BPF_LD | BPF_IND | BPF_B: - func = bpf_jit_load_byte; - common_load: - ctx->saw_ld_abs_ind = true; - - emit_reg_move(bpf2sparc[BPF_REG_6], O0, ctx); - emit_loadimm(imm, O1, ctx); - - if (BPF_MODE(code) == BPF_IND) - emit_alu(ADD, src, O1, ctx); - - emit_call(func, ctx); - emit_alu_K(SRA, O1, 0, ctx); - - emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx); - break; default: pr_err_once("unknown opcode %02x\n", code); @@ -1583,12 +1509,11 @@ skip_init_ctx: build_epilogue(&ctx); if (bpf_jit_enable > 1) - pr_info("Pass %d: shrink = %d, seen = [%c%c%c%c%c%c%c]\n", pass, + pr_info("Pass %d: shrink = %d, seen = [%c%c%c%c%c%c]\n", pass, image_size - (ctx.idx * 4), ctx.tmp_1_used ? '1' : ' ', ctx.tmp_2_used ? '2' : ' ', ctx.tmp_3_used ? '3' : ' ', - ctx.saw_ld_abs_ind ? 'L' : ' ', ctx.saw_frame_pointer ? 'F' : ' ', ctx.saw_call ? 'C' : ' ', ctx.saw_tail_call ? 'T' : ' '); From 0d2d0cedc0814edaa2b6cb290c78ec333b3eed71 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:19 +0200 Subject: [PATCH 47/66] bpf, arm32: remove ld_abs/ld_ind Since LD_ABS/LD_IND instructions are now removed from the core and reimplemented through a combination of inlined BPF instructions and a slow-path helper, we can get rid of the complexity from arm32 JIT. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- arch/arm/net/bpf_jit_32.c | 77 --------------------------------------- 1 file changed, 77 deletions(-) diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index b5030e1a41d8..82689b999257 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1452,83 +1452,6 @@ exit: emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx); emit_ldx_r(dst, rn, dstk, off, ctx, BPF_SIZE(code)); break; - /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */ - case BPF_LD | BPF_ABS | BPF_W: - case BPF_LD | BPF_ABS | BPF_H: - case BPF_LD | BPF_ABS | BPF_B: - /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + src + imm)) */ - case BPF_LD | BPF_IND | BPF_W: - case BPF_LD | BPF_IND | BPF_H: - case BPF_LD | BPF_IND | BPF_B: - { - const u8 r4 = bpf2a32[BPF_REG_6][1]; /* r4 = ptr to sk_buff */ - const u8 r0 = bpf2a32[BPF_REG_0][1]; /*r0: struct sk_buff *skb*/ - /* rtn value */ - const u8 r1 = bpf2a32[BPF_REG_0][0]; /* r1: int k */ - const u8 r2 = bpf2a32[BPF_REG_1][1]; /* r2: unsigned int size */ - const u8 r3 = bpf2a32[BPF_REG_1][0]; /* r3: void *buffer */ - const u8 r6 = bpf2a32[TMP_REG_1][1]; /* r6: void *(*func)(..) 
*/ - int size; - - /* Setting up first argument */ - emit(ARM_MOV_R(r0, r4), ctx); - - /* Setting up second argument */ - emit_a32_mov_i(r1, imm, false, ctx); - if (BPF_MODE(code) == BPF_IND) - emit_a32_alu_r(r1, src_lo, false, sstk, ctx, - false, false, BPF_ADD); - - /* Setting up third argument */ - switch (BPF_SIZE(code)) { - case BPF_W: - size = 4; - break; - case BPF_H: - size = 2; - break; - case BPF_B: - size = 1; - break; - default: - return -EINVAL; - } - emit_a32_mov_i(r2, size, false, ctx); - - /* Setting up fourth argument */ - emit(ARM_ADD_I(r3, ARM_SP, imm8m(SKB_BUFFER)), ctx); - - /* Setting up function pointer to call */ - emit_a32_mov_i(r6, (unsigned int)bpf_load_pointer, false, ctx); - emit_blx_r(r6, ctx); - - emit(ARM_EOR_R(r1, r1, r1), ctx); - /* Check if return address is NULL or not. - * if NULL then jump to epilogue - * else continue to load the value from retn address - */ - emit(ARM_CMP_I(r0, 0), ctx); - jmp_offset = epilogue_offset(ctx); - check_imm24(jmp_offset); - _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx); - - /* Load value from the address */ - switch (BPF_SIZE(code)) { - case BPF_W: - emit(ARM_LDR_I(r0, r0, 0), ctx); - emit_rev32(r0, r0, ctx); - break; - case BPF_H: - emit(ARM_LDRH_I(r0, r0, 0), ctx); - emit_rev16(r0, r0, ctx); - break; - case BPF_B: - emit(ARM_LDRB_I(r0, r0, 0), ctx); - /* No need to reverse */ - break; - } - break; - } /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: From 4db25cc988518239a2b9fd76716a829e6deca66a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:20 +0200 Subject: [PATCH 48/66] bpf, mips64: remove ld_abs/ld_ind Since LD_ABS/LD_IND instructions are now removed from the core and reimplemented through a combination of inlined BPF instructions and a slow-path helper, we can get rid of the complexity from mips64 JIT. 
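For reference, the per-packet load that each JIT previously had to open-code for these instructions behaves roughly like the following C sketch (illustrative only, not part of this patch; pkt and pkt_len stand in for skb->data and the linear data length, and the failure case corresponds to the slow path, i.e. skb_copy_bits() and friends in the kernel):

	#include <stdint.h>
	#include <string.h>
	#include <arpa/inet.h>

	/* LD_ABS | BPF_W: load a 32-bit word at a fixed packet offset and
	 * convert it from network to host byte order. The fast path reads
	 * the linear area directly; anything else goes through a slow-path
	 * helper or terminates the program.
	 */
	static int ld_abs_w(const uint8_t *pkt, uint32_t pkt_len,
			    uint32_t off, uint32_t *val)
	{
		uint32_t v;

		if (off > pkt_len || pkt_len - off < sizeof(v))
			return -1;	/* slow path / program exit */
		memcpy(&v, pkt + off, sizeof(v));
		*val = ntohl(v);
		return 0;
	}

Since the core now emits this logic as plain BPF instructions plus one helper call, each architecture can drop its hand-written assembly equivalent of the same pattern.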
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- arch/mips/net/ebpf_jit.c | 104 --------------------------------------- 1 file changed, 104 deletions(-) diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 3e2798bfea4f..7ba7df9c28fc 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1267,110 +1267,6 @@ jeq_common: return -EINVAL; break; - case BPF_LD | BPF_B | BPF_ABS: - case BPF_LD | BPF_H | BPF_ABS: - case BPF_LD | BPF_W | BPF_ABS: - case BPF_LD | BPF_DW | BPF_ABS: - ctx->flags |= EBPF_SAVE_RA; - - gen_imm_to_reg(insn, MIPS_R_A1, ctx); - emit_instr(ctx, addiu, MIPS_R_A2, MIPS_R_ZERO, size_to_len(insn)); - - if (insn->imm < 0) { - emit_const_to_reg(ctx, MIPS_R_T9, (u64)bpf_internal_load_pointer_neg_helper); - } else { - emit_const_to_reg(ctx, MIPS_R_T9, (u64)ool_skb_header_pointer); - emit_instr(ctx, daddiu, MIPS_R_A3, MIPS_R_SP, ctx->tmp_offset); - } - goto ld_skb_common; - - case BPF_LD | BPF_B | BPF_IND: - case BPF_LD | BPF_H | BPF_IND: - case BPF_LD | BPF_W | BPF_IND: - case BPF_LD | BPF_DW | BPF_IND: - ctx->flags |= EBPF_SAVE_RA; - src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp); - if (src < 0) - return src; - ts = get_reg_val_type(ctx, this_idx, insn->src_reg); - if (ts == REG_32BIT_ZERO_EX) { - /* sign extend */ - emit_instr(ctx, sll, MIPS_R_A1, src, 0); - src = MIPS_R_A1; - } - if (insn->imm >= S16_MIN && insn->imm <= S16_MAX) { - emit_instr(ctx, daddiu, MIPS_R_A1, src, insn->imm); - } else { - gen_imm_to_reg(insn, MIPS_R_AT, ctx); - emit_instr(ctx, daddu, MIPS_R_A1, MIPS_R_AT, src); - } - /* truncate to 32-bit int */ - emit_instr(ctx, sll, MIPS_R_A1, MIPS_R_A1, 0); - emit_instr(ctx, daddiu, MIPS_R_A3, MIPS_R_SP, ctx->tmp_offset); - emit_instr(ctx, slt, MIPS_R_AT, MIPS_R_A1, MIPS_R_ZERO); - - emit_const_to_reg(ctx, MIPS_R_T8, (u64)bpf_internal_load_pointer_neg_helper); - emit_const_to_reg(ctx, MIPS_R_T9, (u64)ool_skb_header_pointer); - emit_instr(ctx, addiu, MIPS_R_A2, MIPS_R_ZERO, size_to_len(insn)); - emit_instr(ctx, movn, MIPS_R_T9, MIPS_R_T8, MIPS_R_AT); - -ld_skb_common: - emit_instr(ctx, jalr, MIPS_R_RA, MIPS_R_T9); - /* delay slot move */ - emit_instr(ctx, daddu, MIPS_R_A0, MIPS_R_S0, MIPS_R_ZERO); - - /* Check the error value */ - b_off = b_imm(exit_idx, ctx); - if (is_bad_offset(b_off)) { - target = j_target(ctx, exit_idx); - if (target == (unsigned int)-1) - return -E2BIG; - - if (!(ctx->offsets[this_idx] & OFFSETS_B_CONV)) { - ctx->offsets[this_idx] |= OFFSETS_B_CONV; - ctx->long_b_conversion = 1; - } - emit_instr(ctx, bne, MIPS_R_V0, MIPS_R_ZERO, 4 * 3); - emit_instr(ctx, nop); - emit_instr(ctx, j, target); - emit_instr(ctx, nop); - } else { - emit_instr(ctx, beq, MIPS_R_V0, MIPS_R_ZERO, b_off); - emit_instr(ctx, nop); - } - -#ifdef __BIG_ENDIAN - need_swap = false; -#else - need_swap = true; -#endif - dst = MIPS_R_V0; - switch (BPF_SIZE(insn->code)) { - case BPF_B: - emit_instr(ctx, lbu, dst, 0, MIPS_R_V0); - break; - case BPF_H: - emit_instr(ctx, lhu, dst, 0, MIPS_R_V0); - if (need_swap) - emit_instr(ctx, wsbh, dst, dst); - break; - case BPF_W: - emit_instr(ctx, lw, dst, 0, MIPS_R_V0); - if (need_swap) { - emit_instr(ctx, wsbh, dst, dst); - emit_instr(ctx, rotr, dst, dst, 16); - } - break; - case BPF_DW: - emit_instr(ctx, ld, dst, 0, MIPS_R_V0); - if (need_swap) { - emit_instr(ctx, dsbh, dst, dst); - emit_instr(ctx, dshd, dst, dst); - } - break; - } - - break; case BPF_ALU | BPF_END | BPF_FROM_BE: case BPF_ALU | BPF_END | BPF_FROM_LE: dst = ebpf_to_mips_reg(ctx, insn, dst_reg); 
From dbf44daf7c88bb0b378e3cb9dc101ae0c5b33832 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:21 +0200 Subject: [PATCH 49/66] bpf, ppc64: remove ld_abs/ld_ind Since LD_ABS/LD_IND instructions are now removed from the core and reimplemented through a combination of inlined BPF instructions and a slow-path helper, we can get rid of the complexity from ppc64 JIT. Signed-off-by: Daniel Borkmann Acked-by: Naveen N. Rao Acked-by: Alexei Starovoitov Tested-by: Sandipan Das Signed-off-by: Alexei Starovoitov --- arch/powerpc/net/Makefile | 2 +- arch/powerpc/net/bpf_jit64.h | 37 ++---- arch/powerpc/net/bpf_jit_asm64.S | 180 ------------------------------ arch/powerpc/net/bpf_jit_comp64.c | 109 +----------------- 4 files changed, 11 insertions(+), 317 deletions(-) delete mode 100644 arch/powerpc/net/bpf_jit_asm64.S diff --git a/arch/powerpc/net/Makefile b/arch/powerpc/net/Makefile index 02d369ca6a53..809f019d3cba 100644 --- a/arch/powerpc/net/Makefile +++ b/arch/powerpc/net/Makefile @@ -3,7 +3,7 @@ # Arch-specific network modules # ifeq ($(CONFIG_PPC64),y) -obj-$(CONFIG_BPF_JIT) += bpf_jit_asm64.o bpf_jit_comp64.o +obj-$(CONFIG_BPF_JIT) += bpf_jit_comp64.o else obj-$(CONFIG_BPF_JIT) += bpf_jit_asm.o bpf_jit_comp.o endif diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h index 8bdef7ed28a8..3609be4692b3 100644 --- a/arch/powerpc/net/bpf_jit64.h +++ b/arch/powerpc/net/bpf_jit64.h @@ -20,7 +20,7 @@ * with our redzone usage. * * [ prev sp ] <------------- - * [ nv gpr save area ] 8*8 | + * [ nv gpr save area ] 6*8 | * [ tail_call_cnt ] 8 | * [ local_tmp_var ] 8 | * fp (r31) --> [ ebpf stack space ] upto 512 | @@ -28,8 +28,8 @@ * sp (r1) ---> [ stack pointer ] -------------- */ -/* for gpr non volatile registers BPG_REG_6 to 10, plus skb cache registers */ -#define BPF_PPC_STACK_SAVE (8*8) +/* for gpr non volatile registers BPG_REG_6 to 10 */ +#define BPF_PPC_STACK_SAVE (6*8) /* for bpf JIT code internal usage */ #define BPF_PPC_STACK_LOCALS 16 /* stack frame excluding BPF stack, ensure this is quadword aligned */ @@ -39,10 +39,8 @@ #ifndef __ASSEMBLY__ /* BPF register usage */ -#define SKB_HLEN_REG (MAX_BPF_JIT_REG + 0) -#define SKB_DATA_REG (MAX_BPF_JIT_REG + 1) -#define TMP_REG_1 (MAX_BPF_JIT_REG + 2) -#define TMP_REG_2 (MAX_BPF_JIT_REG + 3) +#define TMP_REG_1 (MAX_BPF_JIT_REG + 0) +#define TMP_REG_2 (MAX_BPF_JIT_REG + 1) /* BPF to ppc register mappings */ static const int b2p[] = { @@ -63,40 +61,23 @@ static const int b2p[] = { [BPF_REG_FP] = 31, /* eBPF jit internal registers */ [BPF_REG_AX] = 2, - [SKB_HLEN_REG] = 25, - [SKB_DATA_REG] = 26, [TMP_REG_1] = 9, [TMP_REG_2] = 10 }; -/* PPC NVR range -- update this if we ever use NVRs below r24 */ -#define BPF_PPC_NVR_MIN 24 - -/* Assembly helpers */ -#define DECLARE_LOAD_FUNC(func) u64 func(u64 r3, u64 r4); \ - u64 func##_negative_offset(u64 r3, u64 r4); \ - u64 func##_positive_offset(u64 r3, u64 r4); - -DECLARE_LOAD_FUNC(sk_load_word); -DECLARE_LOAD_FUNC(sk_load_half); -DECLARE_LOAD_FUNC(sk_load_byte); - -#define CHOOSE_LOAD_FUNC(imm, func) \ - (imm < 0 ? \ - (imm >= SKF_LL_OFF ? 
func##_negative_offset : func) : \ - func##_positive_offset) +/* PPC NVR range -- update this if we ever use NVRs below r27 */ +#define BPF_PPC_NVR_MIN 27 #define SEEN_FUNC 0x1000 /* might call external helpers */ #define SEEN_STACK 0x2000 /* uses BPF stack */ -#define SEEN_SKB 0x4000 /* uses sk_buff */ -#define SEEN_TAILCALL 0x8000 /* uses tail calls */ +#define SEEN_TAILCALL 0x4000 /* uses tail calls */ struct codegen_context { /* * This is used to track register usage as well * as calls to external helpers. * - register usage is tracked with corresponding - * bits (r3-r10 and r25-r31) + * bits (r3-r10 and r27-r31) * - rest of the bits can be used to track other * things -- for now, we use bits 16 to 23 * encoded in SEEN_* macros above diff --git a/arch/powerpc/net/bpf_jit_asm64.S b/arch/powerpc/net/bpf_jit_asm64.S deleted file mode 100644 index 7e4c51430b84..000000000000 --- a/arch/powerpc/net/bpf_jit_asm64.S +++ /dev/null @@ -1,180 +0,0 @@ -/* - * bpf_jit_asm64.S: Packet/header access helper functions - * for PPC64 BPF compiler. - * - * Copyright 2016, Naveen N. Rao - * IBM Corporation - * - * Based on bpf_jit_asm.S by Matt Evans - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include -#include -#include "bpf_jit64.h" - -/* - * All of these routines are called directly from generated code, - * with the below register usage: - * r27 skb pointer (ctx) - * r25 skb header length - * r26 skb->data pointer - * r4 offset - * - * Result is passed back in: - * r8 data read in host endian format (accumulator) - * - * r9 is used as a temporary register - */ - -#define r_skb r27 -#define r_hlen r25 -#define r_data r26 -#define r_off r4 -#define r_val r8 -#define r_tmp r9 - -_GLOBAL_TOC(sk_load_word) - cmpdi r_off, 0 - blt bpf_slow_path_word_neg - b sk_load_word_positive_offset - -_GLOBAL_TOC(sk_load_word_positive_offset) - /* Are we accessing past headlen? */ - subi r_tmp, r_hlen, 4 - cmpd r_tmp, r_off - blt bpf_slow_path_word - /* Nope, just hitting the header. cr0 here is eq or gt! */ - LWZX_BE r_val, r_data, r_off - blr /* Return success, cr0 != LT */ - -_GLOBAL_TOC(sk_load_half) - cmpdi r_off, 0 - blt bpf_slow_path_half_neg - b sk_load_half_positive_offset - -_GLOBAL_TOC(sk_load_half_positive_offset) - subi r_tmp, r_hlen, 2 - cmpd r_tmp, r_off - blt bpf_slow_path_half - LHZX_BE r_val, r_data, r_off - blr - -_GLOBAL_TOC(sk_load_byte) - cmpdi r_off, 0 - blt bpf_slow_path_byte_neg - b sk_load_byte_positive_offset - -_GLOBAL_TOC(sk_load_byte_positive_offset) - cmpd r_hlen, r_off - ble bpf_slow_path_byte - lbzx r_val, r_data, r_off - blr - -/* - * Call out to skb_copy_bits: - * Allocate a new stack frame here to remain ABI-compliant in - * stashing LR. 
- */ -#define bpf_slow_path_common(SIZE) \ - mflr r0; \ - std r0, PPC_LR_STKOFF(r1); \ - stdu r1, -(STACK_FRAME_MIN_SIZE + BPF_PPC_STACK_LOCALS)(r1); \ - mr r3, r_skb; \ - /* r4 = r_off as passed */ \ - addi r5, r1, STACK_FRAME_MIN_SIZE; \ - li r6, SIZE; \ - bl skb_copy_bits; \ - nop; \ - /* save r5 */ \ - addi r5, r1, STACK_FRAME_MIN_SIZE; \ - /* r3 = 0 on success */ \ - addi r1, r1, STACK_FRAME_MIN_SIZE + BPF_PPC_STACK_LOCALS; \ - ld r0, PPC_LR_STKOFF(r1); \ - mtlr r0; \ - cmpdi r3, 0; \ - blt bpf_error; /* cr0 = LT */ - -bpf_slow_path_word: - bpf_slow_path_common(4) - /* Data value is on stack, and cr0 != LT */ - LWZX_BE r_val, 0, r5 - blr - -bpf_slow_path_half: - bpf_slow_path_common(2) - LHZX_BE r_val, 0, r5 - blr - -bpf_slow_path_byte: - bpf_slow_path_common(1) - lbzx r_val, 0, r5 - blr - -/* - * Call out to bpf_internal_load_pointer_neg_helper - */ -#define sk_negative_common(SIZE) \ - mflr r0; \ - std r0, PPC_LR_STKOFF(r1); \ - stdu r1, -STACK_FRAME_MIN_SIZE(r1); \ - mr r3, r_skb; \ - /* r4 = r_off, as passed */ \ - li r5, SIZE; \ - bl bpf_internal_load_pointer_neg_helper; \ - nop; \ - addi r1, r1, STACK_FRAME_MIN_SIZE; \ - ld r0, PPC_LR_STKOFF(r1); \ - mtlr r0; \ - /* R3 != 0 on success */ \ - cmpldi r3, 0; \ - beq bpf_error_slow; /* cr0 = EQ */ - -bpf_slow_path_word_neg: - lis r_tmp, -32 /* SKF_LL_OFF */ - cmpd r_off, r_tmp /* addr < SKF_* */ - blt bpf_error /* cr0 = LT */ - b sk_load_word_negative_offset - -_GLOBAL_TOC(sk_load_word_negative_offset) - sk_negative_common(4) - LWZX_BE r_val, 0, r3 - blr - -bpf_slow_path_half_neg: - lis r_tmp, -32 /* SKF_LL_OFF */ - cmpd r_off, r_tmp /* addr < SKF_* */ - blt bpf_error /* cr0 = LT */ - b sk_load_half_negative_offset - -_GLOBAL_TOC(sk_load_half_negative_offset) - sk_negative_common(2) - LHZX_BE r_val, 0, r3 - blr - -bpf_slow_path_byte_neg: - lis r_tmp, -32 /* SKF_LL_OFF */ - cmpd r_off, r_tmp /* addr < SKF_* */ - blt bpf_error /* cr0 = LT */ - b sk_load_byte_negative_offset - -_GLOBAL_TOC(sk_load_byte_negative_offset) - sk_negative_common(1) - lbzx r_val, 0, r3 - blr - -bpf_error_slow: - /* fabricate a cr0 = lt */ - li r_tmp, -1 - cmpdi r_tmp, 0 -bpf_error: - /* - * Entered with cr0 = lt - * Generated code will 'blt epilogue', returning 0. - */ - li r_val, 0 - blr diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 0ef3d9580e98..1bdb1aff0619 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -59,7 +59,7 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx) * [ prev sp ] <------------- * [ ... 
] | * sp (r1) ---> [ stack pointer ] -------------- - * [ nv gpr save area ] 8*8 + * [ nv gpr save area ] 6*8 * [ tail_call_cnt ] 8 * [ local_tmp_var ] 8 * [ unused red zone ] 208 bytes protected @@ -88,21 +88,6 @@ static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg) BUG(); } -static void bpf_jit_emit_skb_loads(u32 *image, struct codegen_context *ctx) -{ - /* - * Load skb->len and skb->data_len - * r3 points to skb - */ - PPC_LWZ(b2p[SKB_HLEN_REG], 3, offsetof(struct sk_buff, len)); - PPC_LWZ(b2p[TMP_REG_1], 3, offsetof(struct sk_buff, data_len)); - /* header_len = len - data_len */ - PPC_SUB(b2p[SKB_HLEN_REG], b2p[SKB_HLEN_REG], b2p[TMP_REG_1]); - - /* skb->data pointer */ - PPC_BPF_LL(b2p[SKB_DATA_REG], 3, offsetof(struct sk_buff, data)); -} - static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) { int i; @@ -145,18 +130,6 @@ static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) if (bpf_is_seen_register(ctx, i)) PPC_BPF_STL(b2p[i], 1, bpf_jit_stack_offsetof(ctx, b2p[i])); - /* - * Save additional non-volatile regs if we cache skb - * Also, setup skb data - */ - if (ctx->seen & SEEN_SKB) { - PPC_BPF_STL(b2p[SKB_HLEN_REG], 1, - bpf_jit_stack_offsetof(ctx, b2p[SKB_HLEN_REG])); - PPC_BPF_STL(b2p[SKB_DATA_REG], 1, - bpf_jit_stack_offsetof(ctx, b2p[SKB_DATA_REG])); - bpf_jit_emit_skb_loads(image, ctx); - } - /* Setup frame pointer to point to the bpf stack area */ if (bpf_is_seen_register(ctx, BPF_REG_FP)) PPC_ADDI(b2p[BPF_REG_FP], 1, @@ -172,14 +145,6 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx if (bpf_is_seen_register(ctx, i)) PPC_BPF_LL(b2p[i], 1, bpf_jit_stack_offsetof(ctx, b2p[i])); - /* Restore non-volatile registers used for skb cache */ - if (ctx->seen & SEEN_SKB) { - PPC_BPF_LL(b2p[SKB_HLEN_REG], 1, - bpf_jit_stack_offsetof(ctx, b2p[SKB_HLEN_REG])); - PPC_BPF_LL(b2p[SKB_DATA_REG], 1, - bpf_jit_stack_offsetof(ctx, b2p[SKB_DATA_REG])); - } - /* Tear down our stack frame */ if (bpf_has_stack_frame(ctx)) { PPC_ADDI(1, 1, BPF_PPC_STACKFRAME + ctx->stack_size); @@ -753,23 +718,10 @@ emit_clear: ctx->seen |= SEEN_FUNC; func = (u8 *) __bpf_call_base + imm; - /* Save skb pointer if we need to re-cache skb data */ - if ((ctx->seen & SEEN_SKB) && - bpf_helper_changes_pkt_data(func)) - PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx)); - bpf_jit_emit_func_call(image, ctx, (u64)func); /* move return value from r3 to BPF_REG_0 */ PPC_MR(b2p[BPF_REG_0], 3); - - /* refresh skb cache */ - if ((ctx->seen & SEEN_SKB) && - bpf_helper_changes_pkt_data(func)) { - /* reload skb pointer to r3 */ - PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx)); - bpf_jit_emit_skb_loads(image, ctx); - } break; /* @@ -886,65 +838,6 @@ cond_branch: PPC_BCC(true_cond, addrs[i + 1 + off]); break; - /* - * Loads from packet header/data - * Assume 32-bit input value in imm and X (src_reg) - */ - - /* Absolute loads */ - case BPF_LD | BPF_W | BPF_ABS: - func = (u8 *)CHOOSE_LOAD_FUNC(imm, sk_load_word); - goto common_load_abs; - case BPF_LD | BPF_H | BPF_ABS: - func = (u8 *)CHOOSE_LOAD_FUNC(imm, sk_load_half); - goto common_load_abs; - case BPF_LD | BPF_B | BPF_ABS: - func = (u8 *)CHOOSE_LOAD_FUNC(imm, sk_load_byte); -common_load_abs: - /* - * Load from [imm] - * Load into r4, which can just be passed onto - * skb load helpers as the second parameter - */ - PPC_LI32(4, imm); - goto common_load; - - /* Indirect loads */ - case BPF_LD | BPF_W | BPF_IND: - func = (u8 *)sk_load_word; - goto common_load_ind; - case BPF_LD | BPF_H | BPF_IND: - func 
= (u8 *)sk_load_half; - goto common_load_ind; - case BPF_LD | BPF_B | BPF_IND: - func = (u8 *)sk_load_byte; -common_load_ind: - /* - * Load from [src_reg + imm] - * Treat src_reg as a 32-bit value - */ - PPC_EXTSW(4, src_reg); - if (imm) { - if (imm >= -32768 && imm < 32768) - PPC_ADDI(4, 4, IMM_L(imm)); - else { - PPC_LI32(b2p[TMP_REG_1], imm); - PPC_ADD(4, 4, b2p[TMP_REG_1]); - } - } - -common_load: - ctx->seen |= SEEN_SKB; - ctx->seen |= SEEN_FUNC; - bpf_jit_emit_func_call(image, ctx, (u64)func); - - /* - * Helper returns 'lt' condition on error, and an - * appropriate return value in BPF_REG_0 - */ - PPC_BCC(COND_LT, exit_addr); - break; - /* * Tail call */ From e1cf4befa297b149149f633eff746593e400c030 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:22 +0200 Subject: [PATCH 50/66] bpf, s390x: remove ld_abs/ld_ind Since LD_ABS/LD_IND instructions are now removed from the core and reimplemented through a combination of inlined BPF instructions and a slow-path helper, we can get rid of the complexity from s390x JIT. Tested on s390x instance on LinuxONE. Signed-off-by: Daniel Borkmann Cc: Michael Holzheu Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- arch/s390/net/Makefile | 2 +- arch/s390/net/bpf_jit.S | 116 -------------------------------- arch/s390/net/bpf_jit.h | 20 +----- arch/s390/net/bpf_jit_comp.c | 127 +++-------------------------------- 4 files changed, 13 insertions(+), 252 deletions(-) delete mode 100644 arch/s390/net/bpf_jit.S diff --git a/arch/s390/net/Makefile b/arch/s390/net/Makefile index e0d5f245e42b..d4663b4bf509 100644 --- a/arch/s390/net/Makefile +++ b/arch/s390/net/Makefile @@ -2,4 +2,4 @@ # # Arch-specific network modules # -obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o +obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o diff --git a/arch/s390/net/bpf_jit.S b/arch/s390/net/bpf_jit.S deleted file mode 100644 index 25bb4643c4f4..000000000000 --- a/arch/s390/net/bpf_jit.S +++ /dev/null @@ -1,116 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * BPF Jit compiler for s390, help functions. - * - * Copyright IBM Corp. 2012,2015 - * - * Author(s): Martin Schwidefsky - * Michael Holzheu - */ - -#include -#include "bpf_jit.h" - -/* - * Calling convention: - * registers %r7-%r10, %r11,%r13, and %r15 are call saved - * - * Input (64 bit): - * %r3 (%b2) = offset into skb data - * %r6 (%b5) = return address - * %r7 (%b6) = skb pointer - * %r12 = skb data pointer - * - * Output: - * %r14= %b0 = return value (read skb value) - * - * Work registers: %r2,%r4,%r5,%r14 - * - * skb_copy_bits takes 4 parameters: - * %r2 = skb pointer - * %r3 = offset into skb data - * %r4 = pointer to temp buffer - * %r5 = length to copy - * Return value in %r2: 0 = ok - * - * bpf_internal_load_pointer_neg_helper takes 3 parameters: - * %r2 = skb pointer - * %r3 = offset into data - * %r4 = length to copy - * Return value in %r2: Pointer to data - */ - -#define SKF_MAX_NEG_OFF -0x200000 /* SKF_LL_OFF from filter.h */ - -/* - * Load SIZE bytes from SKB - */ -#define sk_load_common(NAME, SIZE, LOAD) \ -ENTRY(sk_load_##NAME); \ - ltgr %r3,%r3; /* Is offset negative? */ \ - jl sk_load_##NAME##_slow_neg; \ -ENTRY(sk_load_##NAME##_pos); \ - aghi %r3,SIZE; /* Offset + SIZE */ \ - clg %r3,STK_OFF_HLEN(%r15); /* Offset + SIZE > hlen? 
*/ \ - jh sk_load_##NAME##_slow; \ - LOAD %r14,-SIZE(%r3,%r12); /* Get data from skb */ \ - b OFF_OK(%r6); /* Return */ \ - \ -sk_load_##NAME##_slow:; \ - lgr %r2,%r7; /* Arg1 = skb pointer */ \ - aghi %r3,-SIZE; /* Arg2 = offset */ \ - la %r4,STK_OFF_TMP(%r15); /* Arg3 = temp bufffer */ \ - lghi %r5,SIZE; /* Arg4 = size */ \ - brasl %r14,skb_copy_bits; /* Get data from skb */ \ - LOAD %r14,STK_OFF_TMP(%r15); /* Load from temp bufffer */ \ - ltgr %r2,%r2; /* Set cc to (%r2 != 0) */ \ - br %r6; /* Return */ - -sk_load_common(word, 4, llgf) /* r14 = *(u32 *) (skb->data+offset) */ -sk_load_common(half, 2, llgh) /* r14 = *(u16 *) (skb->data+offset) */ - -/* - * Load 1 byte from SKB (optimized version) - */ - /* r14 = *(u8 *) (skb->data+offset) */ -ENTRY(sk_load_byte) - ltgr %r3,%r3 # Is offset negative? - jl sk_load_byte_slow_neg -ENTRY(sk_load_byte_pos) - clg %r3,STK_OFF_HLEN(%r15) # Offset >= hlen? - jnl sk_load_byte_slow - llgc %r14,0(%r3,%r12) # Get byte from skb - b OFF_OK(%r6) # Return OK - -sk_load_byte_slow: - lgr %r2,%r7 # Arg1 = skb pointer - # Arg2 = offset - la %r4,STK_OFF_TMP(%r15) # Arg3 = pointer to temp buffer - lghi %r5,1 # Arg4 = size (1 byte) - brasl %r14,skb_copy_bits # Get data from skb - llgc %r14,STK_OFF_TMP(%r15) # Load result from temp buffer - ltgr %r2,%r2 # Set cc to (%r2 != 0) - br %r6 # Return cc - -#define sk_negative_common(NAME, SIZE, LOAD) \ -sk_load_##NAME##_slow_neg:; \ - cgfi %r3,SKF_MAX_NEG_OFF; \ - jl bpf_error; \ - lgr %r2,%r7; /* Arg1 = skb pointer */ \ - /* Arg2 = offset */ \ - lghi %r4,SIZE; /* Arg3 = size */ \ - brasl %r14,bpf_internal_load_pointer_neg_helper; \ - ltgr %r2,%r2; \ - jz bpf_error; \ - LOAD %r14,0(%r2); /* Get data from pointer */ \ - xr %r3,%r3; /* Set cc to zero */ \ - br %r6; /* Return cc */ - -sk_negative_common(word, 4, llgf) -sk_negative_common(half, 2, llgh) -sk_negative_common(byte, 1, llgc) - -bpf_error: -# force a return 0 from jit handler - ltgr %r15,%r15 # Set condition code - br %r6 diff --git a/arch/s390/net/bpf_jit.h b/arch/s390/net/bpf_jit.h index 5e1e5133132d..7822ea92e54a 100644 --- a/arch/s390/net/bpf_jit.h +++ b/arch/s390/net/bpf_jit.h @@ -16,9 +16,6 @@ #include #include -extern u8 sk_load_word_pos[], sk_load_half_pos[], sk_load_byte_pos[]; -extern u8 sk_load_word[], sk_load_half[], sk_load_byte[]; - #endif /* __ASSEMBLY__ */ /* @@ -36,15 +33,6 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[]; * | | | * | BPF stack | | * | | | - * +---------------+ | - * | 8 byte skbp | | - * R15+176 -> +---------------+ | - * | 8 byte hlen | | - * R15+168 -> +---------------+ | - * | 4 byte align | | - * +---------------+ | - * | 4 byte temp | | - * | for bpf_jit.S | | * R15+160 -> +---------------+ | * | new backchain | | * R15+152 -> +---------------+ | @@ -57,17 +45,11 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[]; * The stack size used by the BPF program ("BPF stack" above) is passed * via "aux->stack_depth". 
*/ -#define STK_SPACE_ADD (8 + 8 + 4 + 4 + 160) +#define STK_SPACE_ADD (160) #define STK_160_UNUSED (160 - 12 * 8) #define STK_OFF (STK_SPACE_ADD - STK_160_UNUSED) -#define STK_OFF_TMP 160 /* Offset of tmp buffer on stack */ -#define STK_OFF_HLEN 168 /* Offset of SKB header length on stack */ -#define STK_OFF_SKBP 176 /* Offset of SKB pointer on stack */ #define STK_OFF_R6 (160 - 11 * 8) /* Offset of r6 on stack */ #define STK_OFF_TCCNT (160 - 12 * 8) /* Offset of tail_call_cnt on stack */ -/* Offset to skip condition code check */ -#define OFF_OK 4 - #endif /* __ARCH_S390_NET_BPF_JIT_H */ diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 78a19c93b380..b020bea040b7 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -47,23 +47,21 @@ struct bpf_jit { #define BPF_SIZE_MAX 0xffff /* Max size for program (16 bit branches) */ -#define SEEN_SKB 1 /* skb access */ -#define SEEN_MEM 2 /* use mem[] for temporary storage */ -#define SEEN_RET0 4 /* ret0_ip points to a valid return 0 */ -#define SEEN_LITERAL 8 /* code uses literals */ -#define SEEN_FUNC 16 /* calls C functions */ -#define SEEN_TAIL_CALL 32 /* code uses tail calls */ -#define SEEN_REG_AX 64 /* code uses constant blinding */ -#define SEEN_STACK (SEEN_FUNC | SEEN_MEM | SEEN_SKB) +#define SEEN_MEM (1 << 0) /* use mem[] for temporary storage */ +#define SEEN_RET0 (1 << 1) /* ret0_ip points to a valid return 0 */ +#define SEEN_LITERAL (1 << 2) /* code uses literals */ +#define SEEN_FUNC (1 << 3) /* calls C functions */ +#define SEEN_TAIL_CALL (1 << 4) /* code uses tail calls */ +#define SEEN_REG_AX (1 << 5) /* code uses constant blinding */ +#define SEEN_STACK (SEEN_FUNC | SEEN_MEM) /* * s390 registers */ #define REG_W0 (MAX_BPF_JIT_REG + 0) /* Work register 1 (even) */ #define REG_W1 (MAX_BPF_JIT_REG + 1) /* Work register 2 (odd) */ -#define REG_SKB_DATA (MAX_BPF_JIT_REG + 2) /* SKB data register */ -#define REG_L (MAX_BPF_JIT_REG + 3) /* Literal pool register */ -#define REG_15 (MAX_BPF_JIT_REG + 4) /* Register 15 */ +#define REG_L (MAX_BPF_JIT_REG + 2) /* Literal pool register */ +#define REG_15 (MAX_BPF_JIT_REG + 3) /* Register 15 */ #define REG_0 REG_W0 /* Register 0 */ #define REG_1 REG_W1 /* Register 1 */ #define REG_2 BPF_REG_1 /* Register 2 */ @@ -88,10 +86,8 @@ static const int reg2hex[] = { [BPF_REG_9] = 10, /* BPF stack pointer */ [BPF_REG_FP] = 13, - /* Register for blinding (shared with REG_SKB_DATA) */ + /* Register for blinding */ [BPF_REG_AX] = 12, - /* SKB data pointer */ - [REG_SKB_DATA] = 12, /* Work registers for s390x backend */ [REG_W0] = 0, [REG_W1] = 1, @@ -384,27 +380,6 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth) } while (re <= 15); } -/* - * For SKB access %b1 contains the SKB pointer. For "bpf_jit.S" - * we store the SKB header length on the stack and the SKB data - * pointer in REG_SKB_DATA if BPF_REG_AX is not used. 
- */ -static void emit_load_skb_data_hlen(struct bpf_jit *jit) -{ - /* Header length: llgf %w1,(%b1) */ - EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_1, - offsetof(struct sk_buff, len)); - /* s %w1,(%b1) */ - EMIT4_DISP(0x5b000000, REG_W1, BPF_REG_1, - offsetof(struct sk_buff, data_len)); - /* stg %w1,ST_OFF_HLEN(%r0,%r15) */ - EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15, STK_OFF_HLEN); - if (!(jit->seen & SEEN_REG_AX)) - /* lg %skb_data,data_off(%b1) */ - EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0, - BPF_REG_1, offsetof(struct sk_buff, data)); -} - /* * Emit function prologue * @@ -445,12 +420,6 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15, 152); } - if (jit->seen & SEEN_SKB) { - emit_load_skb_data_hlen(jit); - /* stg %b1,ST_OFF_SKBP(%r0,%r15) */ - EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15, - STK_OFF_SKBP); - } } /* @@ -483,12 +452,12 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i { struct bpf_insn *insn = &fp->insnsi[i]; int jmp_off, last, insn_count = 1; - unsigned int func_addr, mask; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; u32 *addrs = jit->addrs; s32 imm = insn->imm; s16 off = insn->off; + unsigned int mask; if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX) jit->seen |= SEEN_REG_AX; @@ -970,13 +939,6 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT2(0x0d00, REG_14, REG_W1); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); - if ((jit->seen & SEEN_SKB) && - bpf_helper_changes_pkt_data((void *)func)) { - /* lg %b1,ST_OFF_SKBP(%r15) */ - EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0, - REG_15, STK_OFF_SKBP); - emit_load_skb_data_hlen(jit); - } break; } case BPF_JMP | BPF_TAIL_CALL: @@ -1176,73 +1138,6 @@ branch_oc: jmp_off = addrs[i + off + 1] - (addrs[i + 1] - 4); EMIT4_PCREL(0xa7040000 | mask << 8, jmp_off); break; - /* - * BPF_LD - */ - case BPF_LD | BPF_ABS | BPF_B: /* b0 = *(u8 *) (skb->data+imm) */ - case BPF_LD | BPF_IND | BPF_B: /* b0 = *(u8 *) (skb->data+imm+src) */ - if ((BPF_MODE(insn->code) == BPF_ABS) && (imm >= 0)) - func_addr = __pa(sk_load_byte_pos); - else - func_addr = __pa(sk_load_byte); - goto call_fn; - case BPF_LD | BPF_ABS | BPF_H: /* b0 = *(u16 *) (skb->data+imm) */ - case BPF_LD | BPF_IND | BPF_H: /* b0 = *(u16 *) (skb->data+imm+src) */ - if ((BPF_MODE(insn->code) == BPF_ABS) && (imm >= 0)) - func_addr = __pa(sk_load_half_pos); - else - func_addr = __pa(sk_load_half); - goto call_fn; - case BPF_LD | BPF_ABS | BPF_W: /* b0 = *(u32 *) (skb->data+imm) */ - case BPF_LD | BPF_IND | BPF_W: /* b0 = *(u32 *) (skb->data+imm+src) */ - if ((BPF_MODE(insn->code) == BPF_ABS) && (imm >= 0)) - func_addr = __pa(sk_load_word_pos); - else - func_addr = __pa(sk_load_word); - goto call_fn; -call_fn: - jit->seen |= SEEN_SKB | SEEN_RET0 | SEEN_FUNC; - REG_SET_SEEN(REG_14); /* Return address of possible func call */ - - /* - * Implicit input: - * BPF_REG_6 (R7) : skb pointer - * REG_SKB_DATA (R12): skb data pointer (if no BPF_REG_AX) - * - * Calculated input: - * BPF_REG_2 (R3) : offset of byte(s) to fetch in skb - * BPF_REG_5 (R6) : return address - * - * Output: - * BPF_REG_0 (R14): data read from skb - * - * Scratch registers (BPF_REG_1-5) - */ - - /* Call function: llilf %w1,func_addr */ - EMIT6_IMM(0xc00f0000, REG_W1, func_addr); - - /* Offset: lgfi %b2,imm */ - EMIT6_IMM(0xc0010000, BPF_REG_2, imm); - if 
(BPF_MODE(insn->code) == BPF_IND) - /* agfr %b2,%src (%src is s32 here) */ - EMIT4(0xb9180000, BPF_REG_2, src_reg); - - /* Reload REG_SKB_DATA if BPF_REG_AX is used */ - if (jit->seen & SEEN_REG_AX) - /* lg %skb_data,data_off(%b6) */ - EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0, - BPF_REG_6, offsetof(struct sk_buff, data)); - /* basr %b5,%w1 (%b5 is call saved) */ - EMIT2(0x0d00, BPF_REG_5, REG_W1); - - /* - * Note: For fast access we jump directly after the - * jnz instruction from bpf_jit.S - */ - /* jnz */ - EMIT4_PCREL(0xa7740000, jit->ret0_ip - jit->prg); - break; default: /* too complex, give up */ pr_err("Unknown opcode %02x\n", insn->code); return -1; From 24dea04767e6e5175f4750770281b0c17ac6a2fb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:23 +0200 Subject: [PATCH 51/66] bpf, x32: remove ld_abs/ld_ind Since LD_ABS/LD_IND instructions are now removed from the core and reimplemented through a combination of inlined BPF instructions and a slow-path helper, we can get rid of the complexity from x32 JIT. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp32.c | 136 +--------------------------------- 1 file changed, 1 insertion(+), 135 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 61e61341b777..0cc04e30adc1 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -175,19 +175,13 @@ static const u8 bpf2ia32[][2] = { #define SCRATCH_SIZE 96 /* Total stack size used in JITed code */ -#define _STACK_SIZE \ - (stack_depth + \ - + SCRATCH_SIZE + \ - + 4 /* Extra space for skb_copy_bits buffer */) +#define _STACK_SIZE (stack_depth + SCRATCH_SIZE) #define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT) /* Get the offset of eBPF REGISTERs stored on scratch space. 
*/ #define STACK_VAR(off) (off) -/* Offset of skb_copy_bits buffer */ -#define SKB_BUFFER STACK_VAR(SCRATCH_SIZE) - /* Encode 'dst_reg' register into IA32 opcode 'byte' */ static u8 add_1reg(u8 byte, u32 dst_reg) { @@ -2276,134 +2270,6 @@ emit_jmp: return -EFAULT; } break; - - case BPF_LD | BPF_ABS | BPF_W: - case BPF_LD | BPF_ABS | BPF_H: - case BPF_LD | BPF_ABS | BPF_B: - case BPF_LD | BPF_IND | BPF_W: - case BPF_LD | BPF_IND | BPF_H: - case BPF_LD | BPF_IND | BPF_B: - { - int size; - const u8 *r6 = bpf2ia32[BPF_REG_6]; - - /* Setting up first argument */ - /* mov eax,dword ptr [ebp+off] */ - EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), - STACK_VAR(r6[0])); - - /* Setting up second argument */ - if (BPF_MODE(code) == BPF_ABS) { - /* mov %edx, imm32 */ - EMIT1_off32(0xBA, imm32); - } else { - if (sstk) - /* mov edx,dword ptr [ebp+off] */ - EMIT3(0x8B, add_2reg(0x40, IA32_EBP, - IA32_EDX), - STACK_VAR(src_lo)); - else - /* mov edx,src_lo */ - EMIT2(0x8B, add_2reg(0xC0, src_lo, - IA32_EDX)); - if (imm32) { - if (is_imm8(imm32)) - /* add %edx,imm8 */ - EMIT3(0x83, 0xC2, imm32); - else - /* add %edx,imm32 */ - EMIT2_off32(0x81, 0xC2, imm32); - } - } - - /* Setting up third argument */ - switch (BPF_SIZE(code)) { - case BPF_W: - size = 4; - break; - case BPF_H: - size = 2; - break; - case BPF_B: - size = 1; - break; - default: - return -EINVAL; - } - /* mov ecx,val */ - EMIT2(0xB1, size); - /* movzx ecx,ecx */ - EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX)); - - /* mov ebx,ebp */ - EMIT2(0x8B, add_2reg(0xC0, IA32_EBP, IA32_EBX)); - /* add %ebx,imm8 */ - EMIT3(0x83, add_1reg(0xC0, IA32_EBX), SKB_BUFFER); - /* push ebx */ - EMIT1(0x53); - - /* Setting up function pointer to call */ - /* mov ebx,imm32*/ - EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), - (unsigned int)bpf_load_pointer); - - EMIT2(0xFF, add_1reg(0xD0, IA32_EBX)); - /* add %esp,4 */ - EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 4); - /* xor edx,edx */ - EMIT2(0x33, add_2reg(0xC0, IA32_EDX, IA32_EDX)); - - /* mov dword ptr [ebp+off],eax */ - EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX), - STACK_VAR(r0[0])); - /* mov dword ptr [ebp+off],edx */ - EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX), - STACK_VAR(r0[1])); - - /* - * Check if return address is NULL or not. - * If NULL then jump to epilogue else continue - * to load the value from retn address - */ - EMIT3(0x83, add_1reg(0xF8, IA32_EAX), 0); - jmp_offset = ctx->cleanup_addr - addrs[i]; - - switch (BPF_SIZE(code)) { - case BPF_W: - jmp_offset += 7; - break; - case BPF_H: - jmp_offset += 10; - break; - case BPF_B: - jmp_offset += 6; - break; - } - - EMIT2_off32(0x0F, IA32_JE + 0x10, jmp_offset); - /* Load value from the address */ - switch (BPF_SIZE(code)) { - case BPF_W: - /* mov eax,[eax] */ - EMIT2(0x8B, 0x0); - /* Emit 'bswap eax' */ - EMIT2(0x0F, add_1reg(0xC8, IA32_EAX)); - break; - case BPF_H: - EMIT3(0x0F, 0xB7, 0x0); - EMIT1(0x66); - EMIT3(0xC1, add_1reg(0xC8, IA32_EAX), 8); - break; - case BPF_B: - EMIT3(0x0F, 0xB6, 0x0); - break; - } - - /* mov dword ptr [ebp+off],eax */ - EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX), - STACK_VAR(r0[0])); - break; - } /* STX XADD: lock *(u32 *)(dst + off) += src */ case BPF_STX | BPF_XADD | BPF_W: /* STX XADD: lock *(u64 *)(dst + off) += src */ From 32b3652c307ef62f624182fac1fd6328ccc8fcbe Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:24 +0200 Subject: [PATCH 52/66] bpf: sync tools bpf.h uapi header Only sync the header from include/uapi/linux/bpf.h. 
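For illustration, a socket filter could use the newly documented helper along these lines (a minimal sketch, not part of the sync itself; the local helper declaration mirrors what a bpf_helpers-style header would normally provide, and the section name, program name and IPv4-only assumption are examples):

	#include <linux/bpf.h>
	#include <linux/ip.h>

	/* Declared locally so the sketch is self-contained; normally this
	 * pointer-to-helper definition comes from a helper header.
	 */
	static long (*bpf_skb_load_bytes_relative)(const void *skb, __u32 offset,
						   void *to, __u32 len,
						   __u32 start_header) =
		(void *) BPF_FUNC_skb_load_bytes_relative;

	__attribute__((section("socket"), used))
	int drop_low_ttl(struct __sk_buff *skb)
	{
		struct iphdr iph;

		/* Read the IPv4 header relative to the network header; this
		 * works even when skb->data does not point at the mac header.
		 * On error, drop the packet by returning 0.
		 */
		if (bpf_skb_load_bytes_relative(skb, 0, &iph, sizeof(iph),
						BPF_HDR_START_NET))
			return 0;

		return iph.ttl > 1 ? skb->len : 0;
	}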
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8daef7326bb7..83a95ae388dd 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1801,6 +1801,30 @@ union bpf_attr { * Return * a non-negative value equal to or less than size on success, or * a negative error in case of failure. + * + * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) + * Description + * This helper is similar to **bpf_skb_load_bytes**\ () in that + * it provides an easy way to load *len* bytes from *offset* + * from the packet associated to *skb*, into the buffer pointed + * by *to*. The difference to **bpf_skb_load_bytes**\ () is that + * a fifth argument *start_header* exists in order to select a + * base offset to start from. *start_header* can be one of: + * + * **BPF_HDR_START_MAC** + * Base offset to load data from is *skb*'s mac header. + * **BPF_HDR_START_NET** + * Base offset to load data from is *skb*'s network header. + * + * In general, "direct packet access" is the preferred method to + * access packet data, however, this helper is in particular useful + * in socket filters where *skb*\ **->data** does not always point + * to the start of the mac header and where "direct packet access" + * is not available. + * + * Return + * 0 on success, or a negative error in case of failure. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1870,7 +1894,8 @@ union bpf_attr { FN(bind), \ FN(xdp_adjust_tail), \ FN(skb_get_xfrm_state), \ - FN(get_stack), + FN(get_stack), \ + FN(skb_load_bytes_relative), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call */ @@ -1931,6 +1956,12 @@ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, }; +/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ +enum bpf_hdr_start_off { + BPF_HDR_START_MAC, + BPF_HDR_START_NET, +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ From f910cefa32b6cdabc96b126bcfc46d8940b1dc45 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Wed, 2 May 2018 16:17:17 -0400 Subject: [PATCH 53/66] bpf: unify main prog and subprog Currently, the verifier treats the main prog and subprogs differently. All subprogs detected are kept in env->subprog_starts while the main prog is not kept there. Instead, the main prog is implicitly defined as the prog starting at 0. There is actually no difference between the main prog and a subprog; it is better to unify them and register all progs detected in env->subprog_starts. This also helps simplify some code logic.
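To illustrate the invariant this establishes (a minimal sketch, not taken verbatim from the verifier; the function name is made up and the flat u32 array stands in for env->subprog_starts): with the main prog registered as subprog 0, the instruction range of every function is derived uniformly, with no special case left for the main prog:

	#include <stdint.h>

	/* The end of subprog i is the start of subprog i + 1, or the total
	 * instruction count for the last one. This holds for the main prog
	 * too, since it now occupies subprog_starts[0] == 0.
	 */
	static uint32_t subprog_end(const uint32_t *subprog_starts,
				    uint32_t subprog_cnt, uint32_t insn_cnt,
				    uint32_t i)
	{
		return i + 1 < subprog_cnt ? subprog_starts[i + 1] : insn_cnt;
	}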
Signed-off-by: Jiong Wang Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 57 ++++++++++++++++++++---------------- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7e61c395fddf..f655b926e432 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -191,7 +191,7 @@ struct bpf_verifier_env { bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ struct bpf_verifier_log log; - u32 subprog_starts[BPF_MAX_SUBPROGS]; + u32 subprog_starts[BPF_MAX_SUBPROGS + 1]; /* computes the stack depth of each bpf function */ u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6ba10a83909d..8e8e582a7c03 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -768,7 +768,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off) ret = find_subprog(env, off); if (ret >= 0) return 0; - if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { + if (env->subprog_cnt > BPF_MAX_SUBPROGS) { verbose(env, "too many subprograms\n"); return -E2BIG; } @@ -784,6 +784,11 @@ static int check_subprogs(struct bpf_verifier_env *env) struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; + /* Add entry function. */ + ret = add_subprog(env, 0); + if (ret < 0) + return ret; + /* determine subprog starts. The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { if (insn[i].code != (BPF_JMP | BPF_CALL)) @@ -809,10 +814,10 @@ static int check_subprogs(struct bpf_verifier_env *env) /* now check that all jumps are within the same subprog */ subprog_start = 0; - if (env->subprog_cnt == cur_subprog) + if (env->subprog_cnt == cur_subprog + 1) subprog_end = insn_cnt; else - subprog_end = env->subprog_starts[cur_subprog++]; + subprog_end = env->subprog_starts[cur_subprog + 1]; for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; @@ -836,11 +841,13 @@ next: verbose(env, "last insn is not an exit or jmp\n"); return -EINVAL; } + cur_subprog++; subprog_start = subprog_end; - if (env->subprog_cnt == cur_subprog) + if (env->subprog_cnt == cur_subprog + 1) subprog_end = insn_cnt; else - subprog_end = env->subprog_starts[cur_subprog++]; + subprog_end = + env->subprog_starts[cur_subprog + 1]; } } return 0; @@ -1508,10 +1515,10 @@ process_func: return -EACCES; } continue_func: - if (env->subprog_cnt == subprog) + if (env->subprog_cnt == subprog + 1) subprog_end = insn_cnt; else - subprog_end = env->subprog_starts[subprog]; + subprog_end = env->subprog_starts[subprog + 1]; for (; i < subprog_end; i++) { if (insn[i].code != (BPF_JMP | BPF_CALL)) continue; @@ -1529,7 +1536,6 @@ continue_func: i); return -EFAULT; } - subprog++; frame++; if (frame >= MAX_CALL_FRAMES) { WARN_ONCE(1, "verifier bug. 
Call stack is too deep\n"); @@ -1561,7 +1567,6 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, start); return -EFAULT; } - subprog++; return env->subprog_stack_depth[subprog]; } #endif @@ -2099,7 +2104,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; - if (env->subprog_cnt) { + if (env->subprog_cnt > 1) { verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); return -EINVAL; } @@ -2272,7 +2277,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* remember the callsite, it will be used by bpf_exit */ *insn_idx /* callsite */, state->curframe + 1 /* frameno within this callchain */, - subprog + 1 /* subprog number within this prog */); + subprog /* subprog number within this prog */); /* copy r1 - r5 args that callee can access */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) @@ -3889,7 +3894,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (env->subprog_cnt) { + if (env->subprog_cnt > 1) { /* when program has LD_ABS insn JITs and interpreter assume * that r1 == ctx == skb which is not the case for callees * that can have arbitrary arguments. It's problematic @@ -4920,11 +4925,11 @@ process_bpf_exit: verbose(env, "processed %d insns (limit %d), stack depth ", insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); - for (i = 0; i < env->subprog_cnt + 1; i++) { + for (i = 0; i < env->subprog_cnt; i++) { u32 depth = env->subprog_stack_depth[i]; verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt + 1) + if (i + 1 < env->subprog_cnt) verbose(env, "+"); } verbose(env, "\n"); @@ -5301,7 +5306,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) void *old_bpf_func; int err = -ENOMEM; - if (env->subprog_cnt == 0) + if (env->subprog_cnt <= 1) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { @@ -5317,7 +5322,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* temporarily remember subprog id inside insn instead of * aux_data, since next loop will split up all insns into funcs */ - insn->off = subprog + 1; + insn->off = subprog; /* remember original imm in case JIT fails and fallback * to interpreter will be needed */ @@ -5326,16 +5331,16 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->imm = 1; } - func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL); + func = kzalloc(sizeof(prog) * env->subprog_cnt, GFP_KERNEL); if (!func) return -ENOMEM; - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { subprog_start = subprog_end; - if (env->subprog_cnt == i) + if (env->subprog_cnt == i + 1) subprog_end = prog->len; else - subprog_end = env->subprog_starts[i]; + subprog_end = env->subprog_starts[i + 1]; len = subprog_end - subprog_start; func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); @@ -5365,7 +5370,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) * now populate all bpf_calls with correct addresses and * run last pass of JIT */ - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || @@ -5378,7 +5383,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) __bpf_call_base; } } - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { old_bpf_func = func[i]->bpf_func; tmp = 
bpf_int_jit_compile(func[i]); if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { @@ -5392,7 +5397,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* finally lock prog and jit images for all functions and * populate kallsysm */ - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { bpf_prog_lock_ro(func[i]); bpf_prog_kallsyms_add(func[i]); } @@ -5409,7 +5414,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); - addr = (unsigned long)func[subprog + 1]->bpf_func; + addr = (unsigned long)func[subprog]->bpf_func; addr &= PAGE_MASK; insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) addr - __bpf_call_base; @@ -5418,10 +5423,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->jited = 1; prog->bpf_func = func[0]->bpf_func; prog->aux->func = func; - prog->aux->func_cnt = env->subprog_cnt + 1; + prog->aux->func_cnt = env->subprog_cnt; return 0; out_free: - for (i = 0; i <= env->subprog_cnt; i++) + for (i = 0; i < env->subprog_cnt; i++) if (func[i]) bpf_jit_free(func[i]); kfree(func); From 9c8105bd4402236b1bb0f8f10709c5cec1440a0c Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Wed, 2 May 2018 16:17:18 -0400 Subject: [PATCH 54/66] bpf: centre subprog information fields It is better to centre all subprog information fields into one structure. This structure could later serve as function node in call graph. Signed-off-by: Jiong Wang Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 9 ++++-- kernel/bpf/verifier.c | 62 +++++++++++++++++++----------------- 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f655b926e432..8f70dc181e23 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -173,6 +173,11 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) #define BPF_MAX_SUBPROGS 256 +struct bpf_subprog_info { + u32 start; /* insn idx of function entry point */ + u16 stack_depth; /* max. 
stack depth used by this function */ +}; + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -191,9 +196,7 @@ struct bpf_verifier_env { bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ struct bpf_verifier_log log; - u32 subprog_starts[BPF_MAX_SUBPROGS + 1]; - /* computes the stack depth of each bpf function */ - u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; + struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8e8e582a7c03..5b293b4abb70 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -741,18 +741,19 @@ enum reg_arg_type { static int cmp_subprogs(const void *a, const void *b) { - return *(int *)a - *(int *)b; + return ((struct bpf_subprog_info *)a)->start - + ((struct bpf_subprog_info *)b)->start; } static int find_subprog(struct bpf_verifier_env *env, int off) { - u32 *p; + struct bpf_subprog_info *p; - p = bsearch(&off, env->subprog_starts, env->subprog_cnt, - sizeof(env->subprog_starts[0]), cmp_subprogs); + p = bsearch(&off, env->subprog_info, env->subprog_cnt, + sizeof(env->subprog_info[0]), cmp_subprogs); if (!p) return -ENOENT; - return p - env->subprog_starts; + return p - env->subprog_info; } @@ -772,15 +773,16 @@ static int add_subprog(struct bpf_verifier_env *env, int off) verbose(env, "too many subprograms\n"); return -E2BIG; } - env->subprog_starts[env->subprog_cnt++] = off; - sort(env->subprog_starts, env->subprog_cnt, - sizeof(env->subprog_starts[0]), cmp_subprogs, NULL); + env->subprog_info[env->subprog_cnt++].start = off; + sort(env->subprog_info, env->subprog_cnt, + sizeof(env->subprog_info[0]), cmp_subprogs, NULL); return 0; } static int check_subprogs(struct bpf_verifier_env *env) { int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -810,14 +812,14 @@ static int check_subprogs(struct bpf_verifier_env *env) if (env->log.level > 1) for (i = 0; i < env->subprog_cnt; i++) - verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]); + verbose(env, "func#%d @%d\n", i, subprog[i].start); /* now check that all jumps are within the same subprog */ subprog_start = 0; if (env->subprog_cnt == cur_subprog + 1) subprog_end = insn_cnt; else - subprog_end = env->subprog_starts[cur_subprog + 1]; + subprog_end = subprog[cur_subprog + 1].start; for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; @@ -846,8 +848,7 @@ next: if (env->subprog_cnt == cur_subprog + 1) subprog_end = insn_cnt; else - subprog_end = - env->subprog_starts[cur_subprog + 1]; + subprog_end = subprog[cur_subprog + 1].start; } } return 0; @@ -1480,13 +1481,13 @@ static int update_stack_depth(struct bpf_verifier_env *env, const struct bpf_func_state *func, int off) { - u16 stack = env->subprog_stack_depth[func->subprogno]; + u16 stack = env->subprog_info[func->subprogno].stack_depth; if (stack >= -off) return 0; /* update known max for given subprogram */ - env->subprog_stack_depth[func->subprogno] = -off; + env->subprog_info[func->subprogno].stack_depth = -off; return 0; } @@ -1498,7 +1499,8 @@ static int update_stack_depth(struct bpf_verifier_env *env, */ static int check_max_stack_depth(struct bpf_verifier_env *env) { - int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end; + int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; + struct bpf_subprog_info *subprog = env->subprog_info; 
struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; int ret_insn[MAX_CALL_FRAMES];
@@ -1508,17 +1510,17 @@ process_func: /* round up to 32-bytes, since this is granularity * of interpreter stack size */ - depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); if (depth > MAX_BPF_STACK) { verbose(env, "combined stack size of %d calls is %d. Too large\n", frame + 1, depth); return -EACCES; } continue_func: - if (env->subprog_cnt == subprog + 1) + if (env->subprog_cnt == idx + 1) subprog_end = insn_cnt; else - subprog_end = env->subprog_starts[subprog + 1]; + subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { if (insn[i].code != (BPF_JMP | BPF_CALL)) continue;
@@ -1526,12 +1528,12 @@ continue_func: continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; - ret_prog[frame] = subprog; + ret_prog[frame] = idx; /* find the callee */ i = i + insn[i].imm + 1; - subprog = find_subprog(env, i); - if (subprog < 0) { + idx = find_subprog(env, i); + if (idx < 0) { WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", i); return -EFAULT;
@@ -1548,10 +1550,10 @@ continue_func: */ if (frame == 0) return 0; - depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); frame--; i = ret_insn[frame]; - subprog = ret_prog[frame]; + idx = ret_prog[frame]; goto continue_func; }
@@ -1567,7 +1569,7 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, start); return -EFAULT; } - return env->subprog_stack_depth[subprog]; + return env->subprog_info[subprog].stack_depth; } #endif
@@ -4926,14 +4928,14 @@ process_bpf_exit: verbose(env, "processed %d insns (limit %d), stack depth ", insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); for (i = 0; i < env->subprog_cnt; i++) { - u32 depth = env->subprog_stack_depth[i]; + u32 depth = env->subprog_info[i].stack_depth; verbose(env, "%d", depth); if (i + 1 < env->subprog_cnt) verbose(env, "+"); } verbose(env, "\n"); - env->prog->aux->stack_depth = env->subprog_stack_depth[0]; + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; }
@@ -5140,9 +5142,9 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len if (len == 1) return; for (i = 0; i < env->subprog_cnt; i++) { - if (env->subprog_starts[i] < off) + if (env->subprog_info[i].start < off) continue; - env->subprog_starts[i] += len - 1; + env->subprog_info[i].start += len - 1; } }
@@ -5340,7 +5342,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (env->subprog_cnt == i + 1) subprog_end = prog->len; else - subprog_end = env->subprog_starts[i + 1]; + subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
@@ -5357,7 +5359,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) * Long term would need debug info to populate names */ func[i]->aux->name[0] = 'F'; - func[i]->aux->stack_depth = env->subprog_stack_depth[i]; + func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) {

From 4cb3d99c84ccbf728ff0e381c7c9815c3fa2bd5e Mon Sep 17 00:00:00 2001
From: Jiong Wang
Date: Wed, 2 May 2018 16:17:19 -0400
Subject: [PATCH 55/66] bpf: add faked "ending" subprog

There are quite a few code snippets like the following in the verifier:

	subprog_start = 0;
	if (env->subprog_cnt == cur_subprog + 1)
		subprog_end = insn_cnt;
	else
		subprog_end = env->subprog_info[cur_subprog + 1].start;

The reason is that there is no marker in the subprog_info array to tell where it ends. We can resolve this issue by introducing a fake "ending" subprog. This special "ending" subprog has "insn_cnt" as its start offset, so it serves as the end marker whenever we iterate over all subprogs.

Signed-off-by: Jiong Wang
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/verifier.c | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5b293b4abb70..37e0affa515e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c
@@ -769,7 +769,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off) ret = find_subprog(env, off); if (ret >= 0) return 0; - if (env->subprog_cnt > BPF_MAX_SUBPROGS) { + if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { verbose(env, "too many subprograms\n"); return -E2BIG; }
@@ -810,16 +810,18 @@ static int check_subprogs(struct bpf_verifier_env *env) return ret; } + /* Add a fake 'exit' subprog which could simplify subprog iteration + * logic. 'subprog_cnt' should not be increased. + */ + subprog[env->subprog_cnt].start = insn_cnt; + if (env->log.level > 1) for (i = 0; i < env->subprog_cnt; i++) verbose(env, "func#%d @%d\n", i, subprog[i].start); /* now check that all jumps are within the same subprog */ - subprog_start = 0; - if (env->subprog_cnt == cur_subprog + 1) - subprog_end = insn_cnt; - else - subprog_end = subprog[cur_subprog + 1].start; + subprog_start = subprog[cur_subprog].start; + subprog_end = subprog[cur_subprog + 1].start; for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code;
@@ -843,11 +845,9 @@ next: verbose(env, "last insn is not an exit or jmp\n"); return -EINVAL; } - cur_subprog++; subprog_start = subprog_end; - if (env->subprog_cnt == cur_subprog + 1) - subprog_end = insn_cnt; - else + cur_subprog++; + if (cur_subprog < env->subprog_cnt) subprog_end = subprog[cur_subprog + 1].start; } }
@@ -1502,7 +1502,6 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES];
@@ -1517,10 +1516,7 @@ process_func: return -EACCES; } continue_func: - if (env->subprog_cnt == idx + 1) - subprog_end = insn_cnt; - else - subprog_end = subprog[idx + 1].start; + subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { if (insn[i].code != (BPF_JMP | BPF_CALL)) continue;
@@ -5141,7 +5137,8 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len if (len == 1) return; - for (i = 0; i < env->subprog_cnt; i++) { + /* NOTE: fake 'exit' subprog should be updated as well.
*/ + for (i = 0; i <= env->subprog_cnt; i++) { if (env->subprog_info[i].start < off) continue; env->subprog_info[i].start += len - 1; @@ -5339,10 +5336,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) { subprog_start = subprog_end; - if (env->subprog_cnt == i + 1) - subprog_end = prog->len; - else - subprog_end = env->subprog_info[i + 1].start; + subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); From 0cd3cbed3caf6eae3bc0fa4afa4f26a9babfe55a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:08 -0700 Subject: [PATCH 56/66] bpf: offload: allow offloaded programs to use perf event arrays BPF_MAP_TYPE_PERF_EVENT_ARRAY is special as far as offload goes. The map only holds glue to perf ring, not actual data. Allow non-offloaded perf event arrays to be used in offloaded programs. Offload driver can extract the events from HW and put them in the map for user space to retrieve. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 5 +++++ kernel/bpf/offload.c | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0e00a13ff01b..321969da67b7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -110,6 +110,11 @@ static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) return container_of(map, struct bpf_offloaded_map, map); } +static inline bool bpf_map_offload_neutral(const struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; +} + static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { return map->ops->map_seq_show_elem && map->ops->map_check_btf; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index c9401075b58c..ac747d5cf7c6 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -474,8 +474,10 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) struct bpf_prog_offload *offload; bool ret; - if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map)) + if (!bpf_prog_is_dev_bound(prog->aux)) return false; + if (!bpf_map_is_dev_bound(map)) + return bpf_map_offload_neutral(map); down_read(&bpf_devs_lock); offload = prog->aux->offload; From 630a4d3874a06aa9f9481cbfc688981aad7a834c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:09 -0700 Subject: [PATCH 57/66] nfp: bpf: record offload neutral maps in the driver For asynchronous events originating from the device, like perf event output, we need to be able to make sure that objects being referred to by the FW message are valid on the host. FW events can get queued and reordered. Even if we had a FW message "barrier" we should still protect ourselves from bogus FW output. Add a reverse-mapping hash table and record in it all raw map pointers FW may refer to. Only record neutral maps, i.e. perf event arrays. These are currently the only objects FW can refer to. Use RCU protection on the read side, update side is under RTNL. 
Since program vs map destruction order is slightly painful for offload simply take an extra reference on all the recorded maps to make sure they don't disappear. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 25 +++- drivers/net/ethernet/netronome/nfp/bpf/main.h | 20 ++- .../net/ethernet/netronome/nfp/bpf/offload.c | 125 +++++++++++++++++- drivers/net/ethernet/netronome/nfp/nfp_app.c | 2 +- kernel/bpf/syscall.c | 2 + 5 files changed, 168 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index 1dc424685f4e..f65df341c557 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -43,6 +43,14 @@ #include "fw.h" #include "main.h" +const struct rhashtable_params nfp_bpf_maps_neutral_params = { + .nelem_hint = 4, + .key_len = FIELD_SIZEOF(struct nfp_bpf_neutral_map, ptr), + .key_offset = offsetof(struct nfp_bpf_neutral_map, ptr), + .head_offset = offsetof(struct nfp_bpf_neutral_map, l), + .automatic_shrinking = true, +}; + static bool nfp_net_ebpf_capable(struct nfp_net *nn) { #ifdef __LITTLE_ENDIAN @@ -401,17 +409,28 @@ static int nfp_bpf_init(struct nfp_app *app) init_waitqueue_head(&bpf->cmsg_wq); INIT_LIST_HEAD(&bpf->map_list); - err = nfp_bpf_parse_capabilities(app); + err = rhashtable_init(&bpf->maps_neutral, &nfp_bpf_maps_neutral_params); if (err) goto err_free_bpf; + err = nfp_bpf_parse_capabilities(app); + if (err) + goto err_free_neutral_maps; + return 0; +err_free_neutral_maps: + rhashtable_destroy(&bpf->maps_neutral); err_free_bpf: kfree(bpf); return err; } +static void nfp_check_rhashtable_empty(void *ptr, void *arg) +{ + WARN_ON_ONCE(1); +} + static void nfp_bpf_clean(struct nfp_app *app) { struct nfp_app_bpf *bpf = app->priv; @@ -419,6 +438,8 @@ static void nfp_bpf_clean(struct nfp_app *app) WARN_ON(!skb_queue_empty(&bpf->cmsg_replies)); WARN_ON(!list_empty(&bpf->map_list)); WARN_ON(bpf->maps_in_use || bpf->map_elems_in_use); + rhashtable_free_and_destroy(&bpf->maps_neutral, + nfp_check_rhashtable_empty, NULL); kfree(bpf); } diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 68b5d326483d..5db6284a44ba 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016-2017 Netronome Systems, Inc. + * Copyright (C) 2016-2018 Netronome Systems, Inc. 
* * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -114,6 +115,8 @@ enum pkt_vec { * @maps_in_use: number of currently offloaded maps * @map_elems_in_use: number of elements allocated to offloaded maps * + * @maps_neutral: hash table of offload-neutral maps (on pointer) + * * @adjust_head: adjust head capability * @adjust_head.flags: extra flags for adjust head * @adjust_head.off_min: minimal packet offset within buffer required @@ -150,6 +153,8 @@ struct nfp_app_bpf { unsigned int maps_in_use; unsigned int map_elems_in_use; + struct rhashtable maps_neutral; + struct nfp_bpf_cap_adjust_head { u32 flags; int off_min; @@ -199,6 +204,14 @@ struct nfp_bpf_map { enum nfp_bpf_map_use use_map[]; }; +struct nfp_bpf_neutral_map { + struct rhash_head l; + struct bpf_map *ptr; + u32 count; +}; + +extern const struct rhashtable_params nfp_bpf_maps_neutral_params; + struct nfp_prog; struct nfp_insn_meta; typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *); @@ -367,6 +380,8 @@ static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta) * @error: error code if something went wrong * @stack_depth: max stack depth from the verifier * @adjust_head_location: if program has single adjust head call - the insn no. + * @map_records_cnt: the number of map pointers recorded for this prog + * @map_records: the map record pointers from bpf->maps_neutral * @insns: list of BPF instruction wrappers (struct nfp_insn_meta) */ struct nfp_prog { @@ -390,6 +405,9 @@ struct nfp_prog { unsigned int stack_depth; unsigned int adjust_head_location; + unsigned int map_records_cnt; + struct nfp_bpf_neutral_map **map_records; + struct list_head insns; }; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 42d98792bd25..e3ead906cc60 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016-2017 Netronome Systems, Inc. + * Copyright (C) 2016-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -56,6 +56,126 @@ #include "../nfp_net_ctrl.h" #include "../nfp_net.h" +static int +nfp_map_ptr_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, + struct bpf_map *map) +{ + struct nfp_bpf_neutral_map *record; + int err; + + /* Map record paths are entered via ndo, update side is protected. */ + ASSERT_RTNL(); + + /* Reuse path - other offloaded program is already tracking this map. */ + record = rhashtable_lookup_fast(&bpf->maps_neutral, &map, + nfp_bpf_maps_neutral_params); + if (record) { + nfp_prog->map_records[nfp_prog->map_records_cnt++] = record; + record->count++; + return 0; + } + + /* Grab a single ref to the map for our record. The prog destroy ndo + * happens after free_used_maps(). 
+ */ + map = bpf_map_inc(map, false); + if (IS_ERR(map)) + return PTR_ERR(map); + + record = kmalloc(sizeof(*record), GFP_KERNEL); + if (!record) { + err = -ENOMEM; + goto err_map_put; + } + + record->ptr = map; + record->count = 1; + + err = rhashtable_insert_fast(&bpf->maps_neutral, &record->l, + nfp_bpf_maps_neutral_params); + if (err) + goto err_free_rec; + + nfp_prog->map_records[nfp_prog->map_records_cnt++] = record; + + return 0; + +err_free_rec: + kfree(record); +err_map_put: + bpf_map_put(map); + return err; +} + +static void +nfp_map_ptrs_forget(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog) +{ + bool freed = false; + int i; + + ASSERT_RTNL(); + + for (i = 0; i < nfp_prog->map_records_cnt; i++) { + if (--nfp_prog->map_records[i]->count) { + nfp_prog->map_records[i] = NULL; + continue; + } + + WARN_ON(rhashtable_remove_fast(&bpf->maps_neutral, + &nfp_prog->map_records[i]->l, + nfp_bpf_maps_neutral_params)); + freed = true; + } + + if (freed) { + synchronize_rcu(); + + for (i = 0; i < nfp_prog->map_records_cnt; i++) + if (nfp_prog->map_records[i]) { + bpf_map_put(nfp_prog->map_records[i]->ptr); + kfree(nfp_prog->map_records[i]); + } + } + + kfree(nfp_prog->map_records); + nfp_prog->map_records = NULL; + nfp_prog->map_records_cnt = 0; +} + +static int +nfp_map_ptrs_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, + struct bpf_prog *prog) +{ + int i, cnt, err; + + /* Quickly count the maps we will have to remember */ + cnt = 0; + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (bpf_map_offload_neutral(prog->aux->used_maps[i])) + cnt++; + if (!cnt) + return 0; + + nfp_prog->map_records = kmalloc_array(cnt, + sizeof(nfp_prog->map_records[0]), + GFP_KERNEL); + if (!nfp_prog->map_records) + return -ENOMEM; + + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (bpf_map_offload_neutral(prog->aux->used_maps[i])) { + err = nfp_map_ptr_record(bpf, nfp_prog, + prog->aux->used_maps[i]); + if (err) { + nfp_map_ptrs_forget(bpf, nfp_prog); + return err; + } + } + WARN_ON(cnt != nfp_prog->map_records_cnt); + + return 0; +} + static int nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog, unsigned int cnt) @@ -151,7 +271,7 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog) prog->aux->offload->jited_len = nfp_prog->prog_len * sizeof(u64); prog->aux->offload->jited_image = nfp_prog->prog; - return 0; + return nfp_map_ptrs_record(nfp_prog->bpf, nfp_prog, prog); } static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog) @@ -159,6 +279,7 @@ static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog) struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv; kvfree(nfp_prog->prog); + nfp_map_ptrs_forget(nfp_prog->bpf, nfp_prog); nfp_prog_free(nfp_prog); return 0; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.c b/drivers/net/ethernet/netronome/nfp/nfp_app.c index 6aedef0ad433..0e0253c7e17b 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. 
* * This software is dual licensed under the GNU General License Version 2, June 1991 as shown in the file COPYING in the top-level directory of this
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 263e13ede029..9b87198deea2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c
@@ -282,6 +282,7 @@ void bpf_map_put(struct bpf_map *map) { __bpf_map_put(map, true); } +EXPORT_SYMBOL_GPL(bpf_map_put); void bpf_map_put_with_uref(struct bpf_map *map) {
@@ -543,6 +544,7 @@ struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) atomic_inc(&map->usercnt); return map; } +EXPORT_SYMBOL_GPL(bpf_map_inc); struct bpf_map *bpf_map_get_with_uref(u32 ufd) {

From 6cb5fb3891db9d3f6feb0f7669946550c9abc420 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Thu, 3 May 2018 18:37:10 -0700
Subject: [PATCH 58/66] bpf: export bpf_event_output()

bpf_event_output() is useful for offloads to add events to BPF event rings, so export it. Note that the export is placed near the stub since tracing is optional and kernel/bpf/core.c is always going to be built.

Signed-off-by: Jakub Kicinski
Reviewed-by: Quentin Monnet
Reviewed-by: Jiong Wang
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1127552c8033..d0d7d9462368 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c
@@ -1719,6 +1719,7 @@ bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, { return -ENOTSUPP; } +EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = {

From f4e3ec0d573e238f383b3da365127002579a07d6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Thu, 3 May 2018 18:37:11 -0700
Subject: [PATCH 59/66] bpf: replace map pointer loads before calling into offloads

Offloads may find host map pointers more useful than map fds. Map pointers can be used to identify the map, while fds are only valid within the context of the loading process.

Jump to skip_full_check on error in case verifier log overflow has to be handled (replace_map_fd_with_map_ptr() prints to the log, driver prep may do that too in the future).

Signed-off-by: Jakub Kicinski
Reviewed-by: Quentin Monnet
Reviewed-by: Jiong Wang
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/verifier.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 37e0affa515e..ccac552ac861 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c
@@ -5741,16 +5741,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - if (bpf_prog_is_dev_bound(env->prog->aux)) { - ret = bpf_prog_offload_verifier_prep(env); - if (ret) - goto err_unlock; - } - ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; + if (bpf_prog_is_dev_bound(env->prog->aux)) { + ret = bpf_prog_offload_verifier_prep(env); + if (ret) + goto skip_full_check; + } + env->explored_states = kcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), GFP_USER);

From 9816dd35ececc095f3e3be29d30d3adc755908d9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Thu, 3 May 2018 18:37:12 -0700
Subject: [PATCH 60/66] nfp: bpf: perf event output helpers support

Add support for the perf_event_output family of helpers.

The implementation on the NFP will not match the host code exactly. The state of the host map and rings is unknown to the device, hence the device can't return errors when rings are not installed. The device simply packs the data into a firmware notification message and sends it over to the host, returning success to the program.

There is no notion of a host CPU on the device when packets are being processed. The device will only offload programs which set BPF_F_CURRENT_CPU. Still, if the map index doesn't match the CPU, no error will be returned (see above).

Dropped/lost firmware notification messages will not cause a "lost events" event on the perf ring; they are only visible via device error counters. Firmware notification messages may also get reordered with respect to the packets which caused their generation.

Signed-off-by: Jakub Kicinski
Reviewed-by: Quentin Monnet
Signed-off-by: Daniel Borkmann
---
 drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 16 ++++-
 drivers/net/ethernet/netronome/nfp/bpf/fw.h | 20 +++++-
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 32 ++++++++-
 drivers/net/ethernet/netronome/nfp/bpf/main.c | 3 +
 drivers/net/ethernet/netronome/nfp/bpf/main.h | 4 ++
 .../net/ethernet/netronome/nfp/bpf/offload.c | 47 +++++++++++++
 .../net/ethernet/netronome/nfp/bpf/verifier.c | 69 ++++++++++++++++++-
 7 files changed, 187 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c index 7e298148ca26..cb87fccb9f6a 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
@@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -102,6 +102,15 @@ nfp_bpf_cmsg_map_req_alloc(struct nfp_app_bpf *bpf, unsigned int n) return nfp_bpf_cmsg_alloc(bpf, size); } +static u8 nfp_bpf_cmsg_get_type(struct sk_buff *skb) +{ + struct cmsg_hdr *hdr; + + hdr = (struct cmsg_hdr *)skb->data; + + return hdr->type; +} + static unsigned int nfp_bpf_cmsg_get_tag(struct sk_buff *skb) { struct cmsg_hdr *hdr;
@@ -431,6 +440,11 @@ void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb) goto err_free; } + if (nfp_bpf_cmsg_get_type(skb) == CMSG_TYPE_BPF_EVENT) { + nfp_bpf_event_output(bpf, skb); + return; + } + nfp_ctrl_lock(bpf->app->ctrl); tag = nfp_bpf_cmsg_get_tag(skb);

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h index 39639ac28b01..3dbc21653ce5 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
@@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -37,6 +37,14 @@ #include #include +/* Kernel's enum bpf_reg_type is not uABI so people may change it breaking + * our FW ABI. In that case we will do translation in the driver.
+ */ +#define NFP_BPF_SCALAR_VALUE 1 +#define NFP_BPF_MAP_VALUE 4 +#define NFP_BPF_STACK 6 +#define NFP_BPF_PACKET_DATA 8 + enum bpf_cap_tlv_type { NFP_BPF_CAP_TYPE_FUNC = 1, NFP_BPF_CAP_TYPE_ADJUST_HEAD = 2, @@ -81,6 +89,7 @@ enum nfp_bpf_cmsg_type { CMSG_TYPE_MAP_DELETE = 5, CMSG_TYPE_MAP_GETNEXT = 6, CMSG_TYPE_MAP_GETFIRST = 7, + CMSG_TYPE_BPF_EVENT = 8, __CMSG_TYPE_MAP_MAX, }; @@ -155,4 +164,13 @@ struct cmsg_reply_map_op { __be32 resv; struct cmsg_key_value_pair elem[0]; }; + +struct cmsg_bpf_event { + struct cmsg_hdr hdr; + __be32 cpu_id; + __be64 map_ptr; + __be32 data_size; + __be32 pkt_size; + u8 data[0]; +}; #endif diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 65f0791cae0c..2127cf1548dd 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016-2017 Netronome Systems, Inc. + * Copyright (C) 2016-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -1456,6 +1456,31 @@ nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) return 0; } +static int +nfp_perf_event_output(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + swreg ptr_type; + u32 ret_tgt; + + ptr_type = ur_load_imm_any(nfp_prog, meta->arg1.type, imm_a(nfp_prog)); + + ret_tgt = nfp_prog_current_offset(nfp_prog) + 3; + + emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id, + 2, RELO_BR_HELPER); + + /* Load ptr type into A1 */ + wrp_mov(nfp_prog, reg_a(1), ptr_type); + + /* Load the return address into B0 */ + wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL); + + if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt)) + return -EINVAL; + + return 0; +} + /* --- Callbacks --- */ static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { @@ -2411,6 +2436,8 @@ static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) return map_call_stack_common(nfp_prog, meta); case BPF_FUNC_get_prandom_u32: return nfp_get_prandom_u32(nfp_prog, meta); + case BPF_FUNC_perf_event_output: + return nfp_perf_event_output(nfp_prog, meta); default: WARN_ONCE(1, "verifier allowed unsupported function\n"); return -EOPNOTSUPP; @@ -3353,6 +3380,9 @@ void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv) case BPF_FUNC_map_delete_elem: val = nfp_prog->bpf->helpers.map_delete; break; + case BPF_FUNC_perf_event_output: + val = nfp_prog->bpf->helpers.perf_event_output; + break; default: pr_err("relocation of unknown helper %d\n", val); diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index f65df341c557..d72f9e7f42da 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -298,6 +298,9 @@ nfp_bpf_parse_cap_func(struct nfp_app_bpf *bpf, void __iomem *value, u32 length) case BPF_FUNC_map_delete_elem: bpf->helpers.map_delete = readl(&cap->func_addr); break; + case BPF_FUNC_perf_event_output: + bpf->helpers.perf_event_output = readl(&cap->func_addr); + break; } return 0; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 5db6284a44ba..82682378d57f 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -136,6 +136,7 @@ enum pkt_vec { * 
@helpers.map_lookup: map lookup helper address * @helpers.map_update: map update helper address * @helpers.map_delete: map delete helper address + * @helpers.perf_event_output: output perf event to a ring buffer * * @pseudo_random: FW initialized the pseudo-random machinery (CSRs) */ @@ -176,6 +177,7 @@ struct nfp_app_bpf { u32 map_lookup; u32 map_update; u32 map_delete; + u32 perf_event_output; } helpers; bool pseudo_random; @@ -458,5 +460,7 @@ int nfp_bpf_ctrl_lookup_entry(struct bpf_offloaded_map *offmap, int nfp_bpf_ctrl_getnext_entry(struct bpf_offloaded_map *offmap, void *key, void *next_key); +int nfp_bpf_event_output(struct nfp_app_bpf *bpf, struct sk_buff *skb); + void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb); #endif diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index e3ead906cc60..4db0ac1e42a8 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -441,6 +441,53 @@ int nfp_ndo_bpf(struct nfp_app *app, struct nfp_net *nn, struct netdev_bpf *bpf) } } +static unsigned long +nfp_bpf_perf_event_copy(void *dst, const void *src, + unsigned long off, unsigned long len) +{ + memcpy(dst, src + off, len); + return 0; +} + +int nfp_bpf_event_output(struct nfp_app_bpf *bpf, struct sk_buff *skb) +{ + struct cmsg_bpf_event *cbe = (void *)skb->data; + u32 pkt_size, data_size; + struct bpf_map *map; + + if (skb->len < sizeof(struct cmsg_bpf_event)) + goto err_drop; + + pkt_size = be32_to_cpu(cbe->pkt_size); + data_size = be32_to_cpu(cbe->data_size); + map = (void *)(unsigned long)be64_to_cpu(cbe->map_ptr); + + if (skb->len < sizeof(struct cmsg_bpf_event) + pkt_size + data_size) + goto err_drop; + if (cbe->hdr.ver != CMSG_MAP_ABI_VERSION) + goto err_drop; + + rcu_read_lock(); + if (!rhashtable_lookup_fast(&bpf->maps_neutral, &map, + nfp_bpf_maps_neutral_params)) { + rcu_read_unlock(); + pr_warn("perf event: dest map pointer %px not recognized, dropping event\n", + map); + goto err_drop; + } + + bpf_event_output(map, be32_to_cpu(cbe->cpu_id), + &cbe->data[round_up(pkt_size, 4)], data_size, + cbe->data, pkt_size, nfp_bpf_perf_event_copy); + rcu_read_unlock(); + + dev_consume_skb_any(skb); + return 0; +err_drop: + dev_kfree_skb_any(skb); + return -EINVAL; +} + static int nfp_net_bpf_load(struct nfp_net *nn, struct bpf_prog *prog, struct netlink_ext_ack *extack) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 06ad53ce4ad9..c76b622743ae 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016-2017 Netronome Systems, Inc. + * Copyright (C) 2016-2018 Netronome Systems, Inc. 
* * This software is dual licensed under the GNU General License Version 2, June 1991 as shown in the file COPYING in the top-level directory of this
@@ -36,6 +36,8 @@ #include #include +#include "../nfp_app.h" +#include "../nfp_main.h" #include "fw.h" #include "main.h"
@@ -216,6 +218,71 @@ nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env, pr_vlog(env, "bpf_get_prandom_u32(): FW doesn't support random number generation\n"); return -EOPNOTSUPP; + case BPF_FUNC_perf_event_output: + BUILD_BUG_ON(NFP_BPF_SCALAR_VALUE != SCALAR_VALUE || + NFP_BPF_MAP_VALUE != PTR_TO_MAP_VALUE || + NFP_BPF_STACK != PTR_TO_STACK || + NFP_BPF_PACKET_DATA != PTR_TO_PACKET); + + if (!bpf->helpers.perf_event_output) { + pr_vlog(env, "event_output: not supported by FW\n"); + return -EOPNOTSUPP; + } + + /* Force current CPU to make sure we can report the event + * wherever we get the control message from FW. + */ + if (reg3->var_off.mask & BPF_F_INDEX_MASK || + (reg3->var_off.value & BPF_F_INDEX_MASK) != + BPF_F_CURRENT_CPU) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg3->var_off); + pr_vlog(env, "event_output: must use BPF_F_CURRENT_CPU, var_off: %s\n", + tn_buf); + return -EOPNOTSUPP; + } + + /* Save space in meta, we don't care about arguments other + * than 4th meta, shove it into arg1. + */ + reg1 = cur_regs(env) + BPF_REG_4; + + if (reg1->type != SCALAR_VALUE /* NULL ptr */ && + reg1->type != PTR_TO_STACK && + reg1->type != PTR_TO_MAP_VALUE && + reg1->type != PTR_TO_PACKET) { + pr_vlog(env, "event_output: unsupported ptr type: %d\n", + reg1->type); + return -EOPNOTSUPP; + } + + if (reg1->type == PTR_TO_STACK && + !nfp_bpf_stack_arg_ok("event_output", env, reg1, NULL)) + return -EOPNOTSUPP; + + /* Warn user that on offload NFP may return success even if map + * is not going to accept the event, since the event output is + * fully async and device won't know the state of the map. + * There is also FW limitation on the event length. + * + * Lost events will not show up on the perf ring, driver + * won't see them at all. Events may also get reordered. + */ + dev_warn_once(&nfp_prog->bpf->app->pf->pdev->dev, + "bpf: note: return codes and behavior of bpf_event_output() helper differs for offloaded programs!\n"); + pr_vlog(env, "warning: return codes and behavior of event_output helper differ for offload!\n"); + + if (!meta->func_id) + break; + + if (reg1->type != meta->arg1.type) { + pr_vlog(env, "event_output: ptr type changed: %d %d\n", + meta->arg1.type, reg1->type); + return -EINVAL; + } + break; + default: pr_vlog(env, "unsupported function id: %d\n", func_id); return -EOPNOTSUPP;

From b4264c96b5cbc00c4c07deb9fbab928d43dffcf9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Thu, 3 May 2018 18:37:13 -0700
Subject: [PATCH 61/66] nfp: bpf: rewrite map pointers with NFP TIDs

The kernel will now replace map fds with actual pointers before calling the offload prepare callback. We can identify those pointers and replace them with NFP table IDs instead of loading the table ID in the code generated for the CALL instruction. This allows us to support having the same CALL being used with different maps.

Since we don't want to change the FW ABI we still need to move the TID from R1 to a portion of R0 before the jump.
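
To illustrate the idea, here is a minimal sketch of such a rewrite (simplified, illustrative types and names, not the driver code itself): it reassembles the 64-bit map pointer carried by the two halves of a BPF_LD_IMM64 instruction pair and substitutes a small device table ID.

	#include <stdint.h>

	struct half_insn {		/* two of these form one BPF_LD_IMM64 */
		uint8_t code;
		uint8_t src_reg;
		int32_t imm;
	};

	#define LD_IMM64	0x18	/* BPF_LD | BPF_IMM | BPF_DW */
	#define PSEUDO_MAP_FD	1	/* src_reg marker left by the verifier */

	static void rewrite_map_ptr(struct half_insn *lo, struct half_insn *hi,
				    uint32_t tid)
	{
		uint64_t ptr;

		if (lo->code != LD_IMM64 || lo->src_reg != PSEUDO_MAP_FD)
			return;

		/* Reassemble the host map pointer the kernel patched in. */
		ptr = (uint32_t)lo->imm | ((uint64_t)(uint32_t)hi->imm << 32);
		(void)ptr;	/* a real pass looks this up to find the TID */

		/* A device table ID fits in 32 bits; zero the upper half. */
		lo->imm = (int32_t)tid;
		hi->imm = 0;
	}

The actual pass, nfp_bpf_replace_map_ptrs() in the jit.c diff below, walks instruction pairs the same way, skips offload-neutral maps, and takes the TID from the offloaded map's private data.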
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 44 ++++++++++++++----- .../net/ethernet/netronome/nfp/bpf/verifier.c | 9 ---- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 2127cf1548dd..326a2085d650 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -1395,15 +1395,9 @@ static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) static int map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - struct bpf_offloaded_map *offmap; - struct nfp_bpf_map *nfp_map; bool load_lm_ptr; u32 ret_tgt; s64 lm_off; - swreg tid; - - offmap = (struct bpf_offloaded_map *)meta->arg1.map_ptr; - nfp_map = offmap->dev_priv; /* We only have to reload LM0 if the key is not at start of stack */ lm_off = nfp_prog->stack_depth; @@ -1416,17 +1410,12 @@ map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) if (meta->func_id == BPF_FUNC_map_update_elem) emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2); - /* Load map ID into a register, it should actually fit as an immediate - * but in case it doesn't deal with it here, not in the delay slots. - */ - tid = ur_load_imm_any(nfp_prog, nfp_map->tid, imm_a(nfp_prog)); - emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id, 2, RELO_BR_HELPER); ret_tgt = nfp_prog_current_offset(nfp_prog) + 2; /* Load map ID into A0 */ - wrp_mov(nfp_prog, reg_a(0), tid); + wrp_mov(nfp_prog, reg_a(0), reg_a(2)); /* Load the return address into B0 */ wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL); @@ -3254,6 +3243,33 @@ static int nfp_bpf_optimize(struct nfp_prog *nfp_prog) return 0; } +static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog) +{ + struct nfp_insn_meta *meta1, *meta2; + struct nfp_bpf_map *nfp_map; + struct bpf_map *map; + + nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) { + if (meta1->skip || meta2->skip) + continue; + + if (meta1->insn.code != (BPF_LD | BPF_IMM | BPF_DW) || + meta1->insn.src_reg != BPF_PSEUDO_MAP_FD) + continue; + + map = (void *)(unsigned long)((u32)meta1->insn.imm | + (u64)meta2->insn.imm << 32); + if (bpf_map_offload_neutral(map)) + continue; + nfp_map = map_to_offmap(map)->dev_priv; + + meta1->insn.imm = nfp_map->tid; + meta2->insn.imm = 0; + } + + return 0; +} + static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len) { __le64 *ustore = (__force __le64 *)prog; @@ -3290,6 +3306,10 @@ int nfp_bpf_jit(struct nfp_prog *nfp_prog) { int ret; + ret = nfp_bpf_replace_map_ptrs(nfp_prog); + if (ret) + return ret; + ret = nfp_bpf_optimize(nfp_prog); if (ret) return ret; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index c76b622743ae..e163f3cfa47d 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -151,15 +151,6 @@ nfp_bpf_map_call_ok(const char *fname, struct bpf_verifier_env *env, return false; } - /* Rest of the checks is only if we re-parse the same insn */ - if (!meta->func_id) - return true; - - if (meta->arg1.map_ptr != reg1->map_ptr) { - pr_vlog(env, "%s: called for different map\n", fname); - return false; - } - return true; } From c642ea265445873901041fa0b892a45ea7c6ff33 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 
2018 18:37:14 -0700
Subject: [PATCH 62/66] tools: bpftool: fold hex keyword in command help

Instead of spelling [hex] BYTES everywhere, use DATA as the keyword for a generalized value. This will help us keep the messages concise when longer commands are added in the future. It will also be useful once BTF support comes; we will only have to change the definition of DATA.

Signed-off-by: Jakub Kicinski
Reviewed-by: Quentin Monnet
Signed-off-by: Daniel Borkmann
---
 .../bpf/bpftool/Documentation/bpftool-map.rst | 19 ++++++++++---------
 tools/bpf/bpftool/map.c | 13 +++++++------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 5f512b14bff9..c3eef8c972cd 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -23,16 +23,17 @@ MAP COMMANDS | **bpftool** **map { show | list }** [*MAP*] | **bpftool** **map dump** *MAP* -| **bpftool** **map update** *MAP* **key** [**hex**] *BYTES* **value** [**hex**] *VALUE* [*UPDATE_FLAGS*] -| **bpftool** **map lookup** *MAP* **key** [**hex**] *BYTES* -| **bpftool** **map getnext** *MAP* [**key** [**hex**] *BYTES*] -| **bpftool** **map delete** *MAP* **key** [**hex**] *BYTES* +| **bpftool** **map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*] +| **bpftool** **map lookup** *MAP* **key** *DATA* +| **bpftool** **map getnext** *MAP* [**key** *DATA*] +| **bpftool** **map delete** *MAP* **key** *DATA* | **bpftool** **map pin** *MAP* *FILE* | **bpftool** **map help** | | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *DATA* := { [**hex**] *BYTES* } | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } -| *VALUE* := { *BYTES* | *MAP* | *PROG* } +| *VALUE* := { *DATA* | *MAP* | *PROG* } | *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } DESCRIPTION
@@ -48,7 +49,7 @@ DESCRIPTION **bpftool map dump** *MAP* Dump all entries in a given *MAP*. - **bpftool map update** *MAP* **key** [**hex**] *BYTES* **value** [**hex**] *VALUE* [*UPDATE_FLAGS*] + **bpftool map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*] Update map entry for a given *KEY*. *UPDATE_FLAGS* can be one of: **any** update existing entry
@@ -61,13 +62,13 @@ DESCRIPTION the bytes are parsed as decimal values, unless a "0x" prefix (for hexadecimal) or a "0" prefix (for octal) is provided. - **bpftool map lookup** *MAP* **key** [**hex**] *BYTES* + **bpftool map lookup** *MAP* **key** *DATA* Lookup **key** in the map. - **bpftool map getnext** *MAP* [**key** [**hex**] *BYTES*] + **bpftool map getnext** *MAP* [**key** *DATA*] Get next key. If *key* is not specified, get first key. - **bpftool map delete** *MAP* **key** [**hex**] *BYTES* + **bpftool map delete** *MAP* **key** *DATA* Remove entry from the map. **bpftool map pin** *MAP* *FILE*

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index a6cdb640a0d7..7da77e4166ec 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c
@@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc.
* * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -874,16 +874,17 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %s %s { show | list } [MAP]\n" " %s %s dump MAP\n" - " %s %s update MAP key [hex] BYTES value [hex] VALUE [UPDATE_FLAGS]\n" - " %s %s lookup MAP key [hex] BYTES\n" - " %s %s getnext MAP [key [hex] BYTES]\n" - " %s %s delete MAP key [hex] BYTES\n" + " %s %s update MAP key DATA value VALUE [UPDATE_FLAGS]\n" + " %s %s lookup MAP key DATA\n" + " %s %s getnext MAP [key DATA]\n" + " %s %s delete MAP key DATA\n" " %s %s pin MAP FILE\n" " %s %s help\n" "\n" " MAP := { id MAP_ID | pinned FILE }\n" + " DATA := { [hex] BYTES }\n" " " HELP_SPEC_PROGRAM "\n" - " VALUE := { BYTES | MAP | PROG }\n" + " VALUE := { DATA | MAP | PROG }\n" " UPDATE_FLAGS := { any | exist | noexist }\n" " " HELP_SPEC_OPTIONS "\n" "", From e64d52569f6e847495091db40ab58d2d379748ef Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:15 -0700 Subject: [PATCH 63/66] tools: bpftool: move get_possible_cpus() to common code Move the get_possible_cpus() function to shared code. No functional changes. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/common.c | 58 +++++++++++++++++++++++++++++++++++++- tools/bpf/bpftool/main.h | 3 +- tools/bpf/bpftool/map.c | 56 ------------------------------------ 3 files changed, 59 insertions(+), 58 deletions(-) diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 465995281dcd..9c620770c6ed 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -33,6 +33,7 @@ /* Author: Jakub Kicinski */ +#include #include #include #include @@ -420,6 +421,61 @@ void delete_pinned_obj_table(struct pinned_obj_table *tab) } } +unsigned int get_possible_cpus(void) +{ + static unsigned int result; + char buf[128]; + long int n; + char *ptr; + int fd; + + if (result) + return result; + + fd = open("/sys/devices/system/cpu/possible", O_RDONLY); + if (fd < 0) { + p_err("can't open sysfs possible cpus"); + exit(-1); + } + + n = read(fd, buf, sizeof(buf)); + if (n < 2) { + p_err("can't read sysfs possible cpus"); + exit(-1); + } + close(fd); + + if (n == sizeof(buf)) { + p_err("read sysfs possible cpus overflow"); + exit(-1); + } + + ptr = buf; + n = 0; + while (*ptr && *ptr != '\n') { + unsigned int a, b; + + if (sscanf(ptr, "%u-%u", &a, &b) == 2) { + n += b - a + 1; + + ptr = strchr(ptr, '-') + 1; + } else if (sscanf(ptr, "%u", &a) == 1) { + n++; + } else { + assert(0); + } + + while (isdigit(*ptr)) + ptr++; + if (*ptr == ',') + ptr++; + } + + result = n; + + return result; +} + static char * ifindex_to_name_ns(__u32 ifindex, __u32 ns_dev, __u32 ns_ino, char *buf) { diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index b8e9584d6246..cbf8985da362 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. 
* * This software is dual licensed under the GNU General License Version 2, June 1991 as shown in the file COPYING in the top-level directory of this
@@ -125,6 +125,7 @@ void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, const char *arch); void print_hex_data_json(uint8_t *data, size_t len); +unsigned int get_possible_cpus(void); const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino); #endif

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 7da77e4166ec..5efefde5f578 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c
@@ -34,7 +34,6 @@ /* Author: Jakub Kicinski */ #include -#include #include #include #include
@@ -69,61 +68,6 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_CPUMAP] = "cpumap", }; -static unsigned int get_possible_cpus(void) -{ - static unsigned int result; - char buf[128]; - long int n; - char *ptr; - int fd; - - if (result) - return result; - - fd = open("/sys/devices/system/cpu/possible", O_RDONLY); - if (fd < 0) { - p_err("can't open sysfs possible cpus"); - exit(-1); - } - - n = read(fd, buf, sizeof(buf)); - if (n < 2) { - p_err("can't read sysfs possible cpus"); - exit(-1); - } - close(fd); - - if (n == sizeof(buf)) { - p_err("read sysfs possible cpus overflow"); - exit(-1); - } - - ptr = buf; - n = 0; - while (*ptr && *ptr != '\n') { - unsigned int a, b; - - if (sscanf(ptr, "%u-%u", &a, &b) == 2) { - n += b - a + 1; - - ptr = strchr(ptr, '-') + 1; - } else if (sscanf(ptr, "%u", &a) == 1) { - n++; - } else { - assert(0); - } - - while (isdigit(*ptr)) - ptr++; - if (*ptr == ',') - ptr++; - } - - result = n; - - return result; -} - static bool map_is_per_cpu(__u32 type) { return type == BPF_MAP_TYPE_PERCPU_HASH ||

From f412eed9dfdeeb6becd7de2ffe8b5d0a8b3f81ca Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Thu, 3 May 2018 18:37:16 -0700
Subject: [PATCH 64/66] tools: bpftool: add simple perf event output reader

Users of BPF sooner or later discover the perf_event_output() helpers and BPF_MAP_TYPE_PERF_EVENT_ARRAY. Dumping this array type is not possible; however, we can add simple reading of perf events. Create a new event_pipe subcommand for maps; this subcommand will only work with BPF_MAP_TYPE_PERF_EVENT_ARRAY maps.

Parts of the code are adapted from samples/bpf/trace_output_user.c.
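
The core of reading such a ring is the data_tail/data_head protocol; the sketch below is a simplified, illustrative reduction of the perf_event_read() added by this patch (handling of records that wrap past the end of the buffer is omitted here but present in the real code):

	#include <linux/perf_event.h>
	#include <stdint.h>

	static void consume_ring(volatile struct perf_event_mmap_page *hdr,
				 char *base, uint64_t size,
				 void (*cb)(struct perf_event_header *))
	{
		uint64_t tail = hdr->data_tail;
		uint64_t head = hdr->data_head;

		asm volatile("" ::: "memory");	/* read barrier after data_head */

		while (tail != head) {
			struct perf_event_header *e;

			e = (struct perf_event_header *)(base + tail % size);
			cb(e);		/* caution: records can wrap past base + size */
			tail += e->size;
		}

		__sync_synchronize();		/* smp_mb() before releasing the space */
		hdr->data_tail = tail;		/* kernel may now reuse consumed bytes */
	}

The version in the patch follows the same protocol, but additionally copies records that straddle the buffer end into a growable scratch buffer before parsing them.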
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- .../bpf/bpftool/Documentation/bpftool-map.rst | 29 +- tools/bpf/bpftool/Documentation/bpftool.rst | 2 +- tools/bpf/bpftool/Makefile | 7 +- tools/bpf/bpftool/bash-completion/bpftool | 36 +- tools/bpf/bpftool/common.c | 19 + tools/bpf/bpftool/main.h | 4 + tools/bpf/bpftool/map.c | 19 +- tools/bpf/bpftool/map_perf_ring.c | 347 ++++++++++++++++++ 8 files changed, 444 insertions(+), 19 deletions(-) create mode 100644 tools/bpf/bpftool/map_perf_ring.c diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index c3eef8c972cd..a6258bc8ec4f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -22,12 +22,13 @@ MAP COMMANDS ============= | **bpftool** **map { show | list }** [*MAP*] -| **bpftool** **map dump** *MAP* -| **bpftool** **map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*] -| **bpftool** **map lookup** *MAP* **key** *DATA* -| **bpftool** **map getnext** *MAP* [**key** *DATA*] -| **bpftool** **map delete** *MAP* **key** *DATA* -| **bpftool** **map pin** *MAP* *FILE* +| **bpftool** **map dump** *MAP* +| **bpftool** **map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*] +| **bpftool** **map lookup** *MAP* **key** *DATA* +| **bpftool** **map getnext** *MAP* [**key** *DATA*] +| **bpftool** **map delete** *MAP* **key** *DATA* +| **bpftool** **map pin** *MAP* *FILE* +| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] | **bpftool** **map help** | | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } @@ -76,6 +77,22 @@ DESCRIPTION Note: *FILE* must be located in *bpffs* mount. + **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] + Read events from a BPF_MAP_TYPE_PERF_EVENT_ARRAY map. + + Install perf rings into a perf event array map and dump + output of any bpf_perf_event_output() call in the kernel. + By default read the number of CPUs on the system and + install perf ring for each CPU in the corresponding index + in the array. + + If **cpu** and **index** are specified, install perf ring + for given **cpu** at **index** in the array (single ring). + + Note that installing a perf ring into an array will silently + replace any existing ring. Any other application will stop + receiving events if it installed its rings earlier. + **bpftool map help** Print short help message. 
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 20689a321ffe..564cb0d9692b 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -23,7 +23,7 @@ SYNOPSIS *MAP-COMMANDS* := { **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete** - | **pin** | **help** } + | **pin** | **event_pipe** | **help** } *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | **load** | **help** } diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 4e69782c4a79..892dbf095bff 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -39,7 +39,12 @@ CC = gcc CFLAGS += -O2 CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -Wno-missing-field-initializers -CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/ +CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \ + -I$(srctree)/kernel/bpf/ \ + -I$(srctree)/tools/include \ + -I$(srctree)/tools/include/uapi \ + -I$(srctree)/tools/lib/bpf \ + -I$(srctree)/tools/perf CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"' LIBS = -lelf -lbfd -lopcodes $(LIBBPF) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 852d84a98acd..b301c9b315f1 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -1,6 +1,6 @@ # bpftool(8) bash completion -*- shell-script -*- # -# Copyright (C) 2017 Netronome Systems, Inc. +# Copyright (C) 2017-2018 Netronome Systems, Inc. # # This software is dual licensed under the GNU General License # Version 2, June 1991 as shown in the file COPYING in the top-level @@ -79,6 +79,14 @@ _bpftool_get_map_ids() command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) } +_bpftool_get_perf_map_ids() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp map 2>&1 | \ + command grep -C2 perf_event_array | \ + command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + + _bpftool_get_prog_ids() { COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \ @@ -359,10 +367,34 @@ _bpftool() fi return 0 ;; + event_pipe) + case $prev in + $command) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + return 0 + ;; + id) + _bpftool_get_perf_map_ids + return 0 + ;; + cpu) + return 0 + ;; + index) + return 0 + ;; + *) + _bpftool_once_attr 'cpu' + _bpftool_once_attr 'index' + return 0 + ;; + esac + ;; *) [[ $prev == $object ]] && \ COMPREPLY=( $( compgen -W 'delete dump getnext help \ - lookup pin show list update' -- "$cur" ) ) + lookup pin event_pipe show list update' -- \ + "$cur" ) ) ;; esac ;; diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 9c620770c6ed..32f9e397a6c0 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -331,6 +331,16 @@ char *get_fdinfo(int fd, const char *key) return NULL; } +void print_data_json(uint8_t *data, size_t len) +{ + unsigned int i; + + jsonw_start_array(json_wtr); + for (i = 0; i < len; i++) + jsonw_printf(json_wtr, "%d", data[i]); + jsonw_end_array(json_wtr); +} + void print_hex_data_json(uint8_t *data, size_t len) { unsigned int i; @@ -421,6 +431,15 @@ void delete_pinned_obj_table(struct pinned_obj_table *tab) } } +unsigned int get_page_size(void) +{ + static int result; + + if (!result) + result = getpagesize(); + return result; +} + unsigned int 
get_possible_cpus(void) { static unsigned int result; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index cbf8985da362..6173cd997e7a 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -117,14 +117,18 @@ int do_pin_fd(int fd, const char *name); int do_prog(int argc, char **arg); int do_map(int argc, char **arg); +int do_event_pipe(int argc, char **argv); int do_cgroup(int argc, char **arg); int prog_parse_fd(int *argc, char ***argv); +int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len); void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, const char *arch); +void print_data_json(uint8_t *data, size_t len); void print_hex_data_json(uint8_t *data, size_t len); +unsigned int get_page_size(void); unsigned int get_possible_cpus(void); const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino); diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 5efefde5f578..af6766e956ba 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -130,8 +130,7 @@ static int map_parse_fd(int *argc, char ***argv) return -1; } -static int -map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len) +int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len) { int err; int fd; @@ -817,12 +816,13 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %s %s { show | list } [MAP]\n" - " %s %s dump MAP\n" - " %s %s update MAP key DATA value VALUE [UPDATE_FLAGS]\n" - " %s %s lookup MAP key DATA\n" - " %s %s getnext MAP [key DATA]\n" - " %s %s delete MAP key DATA\n" - " %s %s pin MAP FILE\n" + " %s %s dump MAP\n" + " %s %s update MAP key DATA value VALUE [UPDATE_FLAGS]\n" + " %s %s lookup MAP key DATA\n" + " %s %s getnext MAP [key DATA]\n" + " %s %s delete MAP key DATA\n" + " %s %s pin MAP FILE\n" + " %s %s event_pipe MAP [cpu N index M]\n" " %s %s help\n" "\n" " MAP := { id MAP_ID | pinned FILE }\n" @@ -834,7 +834,7 @@ static int do_help(int argc, char **argv) "", bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]); return 0; } @@ -849,6 +849,7 @@ static const struct cmd cmds[] = { { "getnext", do_getnext }, { "delete", do_delete }, { "pin", do_pin }, + { "event_pipe", do_event_pipe }, { 0 } }; diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c new file mode 100644 index 000000000000..c5a2ced8552d --- /dev/null +++ b/tools/bpf/bpftool/map_perf_ring.c @@ -0,0 +1,347 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2018 Netronome Systems, Inc. */ +/* This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "main.h" + +#define MMAP_PAGE_CNT 16 + +static bool stop; + +struct event_ring_info { + int fd; + int key; + unsigned int cpu; + void *mem; +}; + +struct perf_event_sample { + struct perf_event_header header; + __u32 size; + unsigned char data[]; +}; + +static void int_exit(int signo) +{ + fprintf(stderr, "Stopping...\n"); + stop = true; +} + +static void +print_bpf_output(struct event_ring_info *ring, struct perf_event_sample *e) +{ + struct { + struct perf_event_header header; + __u64 id; + __u64 lost; + } *lost = (void *)e; + struct timespec ts; + + if (clock_gettime(CLOCK_MONOTONIC, &ts)) { + perror("Can't read clock for timestamp"); + return; + } + + if (json_output) { + jsonw_start_object(json_wtr); + jsonw_name(json_wtr, "timestamp"); + jsonw_uint(json_wtr, ts.tv_sec * 1000000000ull + ts.tv_nsec); + jsonw_name(json_wtr, "type"); + jsonw_uint(json_wtr, e->header.type); + jsonw_name(json_wtr, "cpu"); + jsonw_uint(json_wtr, ring->cpu); + jsonw_name(json_wtr, "index"); + jsonw_uint(json_wtr, ring->key); + if (e->header.type == PERF_RECORD_SAMPLE) { + jsonw_name(json_wtr, "data"); + print_data_json(e->data, e->size); + } else if (e->header.type == PERF_RECORD_LOST) { + jsonw_name(json_wtr, "lost"); + jsonw_start_object(json_wtr); + jsonw_name(json_wtr, "id"); + jsonw_uint(json_wtr, lost->id); + jsonw_name(json_wtr, "count"); + jsonw_uint(json_wtr, lost->lost); + jsonw_end_object(json_wtr); + } + jsonw_end_object(json_wtr); + } else { + if (e->header.type == PERF_RECORD_SAMPLE) { + printf("== @%ld.%ld CPU: %d index: %d =====\n", + (long)ts.tv_sec, ts.tv_nsec, + ring->cpu, ring->key); + fprint_hex(stdout, e->data, e->size, " "); + printf("\n"); + } else if (e->header.type == PERF_RECORD_LOST) { + printf("lost %lld events\n", lost->lost); + } else { + printf("unknown event type=%d size=%d\n", + e->header.type, e->header.size); + } + } +} + +static void +perf_event_read(struct event_ring_info *ring, void **buf, size_t *buf_len) +{ + volatile struct perf_event_mmap_page *header = ring->mem; + __u64 buffer_size = MMAP_PAGE_CNT * get_page_size(); + __u64 data_tail = header->data_tail; + __u64 data_head = header->data_head; + void *base, *begin, *end; + + asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */ + if (data_head == data_tail) + return; + + base = ((char *)header) + get_page_size(); + + begin = base + data_tail % buffer_size; + end = base + data_head % buffer_size; + + while (begin != end) { + struct perf_event_sample *e; + + e = begin; + if (begin + e->header.size > base + buffer_size) { + long len = base + buffer_size - begin; + + if (*buf_len < e->header.size) { + free(*buf); + *buf = malloc(e->header.size); + if (!*buf) { + fprintf(stderr, + "can't allocate memory"); + stop = true; + return; + } + *buf_len = e->header.size; + } + + memcpy(*buf, begin, len); + memcpy(*buf + len, base, e->header.size - len); + e = (void *)*buf; + begin = base + e->header.size - len; + } else if (begin + e->header.size == base + buffer_size) { + begin = base; + } else { + begin += e->header.size; + } + + print_bpf_output(ring, e); + } + + __sync_synchronize(); /* smp_mb() */ + header->data_tail = data_head; +} + +static int perf_mmap_size(void) +{ + return get_page_size() * (MMAP_PAGE_CNT + 1); +} + +static void *perf_event_mmap(int fd) +{ + int mmap_size = perf_mmap_size(); + void 
+
+	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (base == MAP_FAILED) {
+		p_err("event mmap failed: %s\n", strerror(errno));
+		return NULL;
+	}
+
+	return base;
+}
+
+static void perf_event_unmap(void *mem)
+{
+	if (munmap(mem, perf_mmap_size()))
+		fprintf(stderr, "Can't unmap ring memory!\n");
+}
+
+static int bpf_perf_event_open(int map_fd, int key, int cpu)
+{
+	struct perf_event_attr attr = {
+		.sample_type = PERF_SAMPLE_RAW,
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_BPF_OUTPUT,
+	};
+	int pmu_fd;
+
+	pmu_fd = sys_perf_event_open(&attr, -1, cpu, -1, 0);
+	if (pmu_fd < 0) {
+		p_err("failed to open perf event %d for CPU %d", key, cpu);
+		return -1;
+	}
+
+	if (bpf_map_update_elem(map_fd, &key, &pmu_fd, BPF_ANY)) {
+		p_err("failed to update map for event %d for CPU %d", key, cpu);
+		goto err_close;
+	}
+	if (ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) {
+		p_err("failed to enable event %d for CPU %d", key, cpu);
+		goto err_close;
+	}
+
+	return pmu_fd;
+
+err_close:
+	close(pmu_fd);
+	return -1;
+}
+
+int do_event_pipe(int argc, char **argv)
+{
+	int i, nfds, map_fd, index = -1, cpu = -1;
+	struct bpf_map_info map_info = {};
+	struct event_ring_info *rings;
+	size_t tmp_buf_sz = 0;
+	void *tmp_buf = NULL;
+	struct pollfd *pfds;
+	__u32 map_info_len;
+	bool do_all = true;
+
+	map_info_len = sizeof(map_info);
+	map_fd = map_parse_fd_and_info(&argc, &argv, &map_info, &map_info_len);
+	if (map_fd < 0)
+		return -1;
+
+	if (map_info.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
+		p_err("map is not a perf event array");
+		goto err_close_map;
+	}
+
+	while (argc) {
+		if (argc < 2)
+			BAD_ARG();
+
+		if (is_prefix(*argv, "cpu")) {
+			char *endptr;
+
+			NEXT_ARG();
+			cpu = strtoul(*argv, &endptr, 0);
+			if (*endptr) {
+				p_err("can't parse %s as CPU ID", *argv);
+				goto err_close_map;
+			}
+
+			NEXT_ARG();
+		} else if (is_prefix(*argv, "index")) {
+			char *endptr;
+
+			NEXT_ARG();
+			index = strtoul(*argv, &endptr, 0);
+			if (*endptr) {
+				p_err("can't parse %s as index", *argv);
+				goto err_close_map;
+			}
+
+			NEXT_ARG();
+		} else {
+			BAD_ARG();
+		}
+
+		do_all = false;
+	}
+
+	if (!do_all) {
+		if (index == -1 || cpu == -1) {
+			p_err("cpu and index must be specified together");
+			goto err_close_map;
+		}
+
+		nfds = 1;
+	} else {
+		nfds = min(get_possible_cpus(), map_info.max_entries);
+		cpu = 0;
+		index = 0;
+	}
+
+	rings = calloc(nfds, sizeof(rings[0]));
+	if (!rings)
+		goto err_close_map;
+
+	pfds = calloc(nfds, sizeof(pfds[0]));
+	if (!pfds)
+		goto err_free_rings;
+
+	for (i = 0; i < nfds; i++) {
+		rings[i].cpu = cpu + i;
+		rings[i].key = index + i;
+
+		rings[i].fd = bpf_perf_event_open(map_fd, rings[i].key,
+						  rings[i].cpu);
+		if (rings[i].fd < 0)
+			goto err_close_fds_prev;
+
+		rings[i].mem = perf_event_mmap(rings[i].fd);
+		if (!rings[i].mem)
+			goto err_close_fds_current;
+
+		pfds[i].fd = rings[i].fd;
+		pfds[i].events = POLLIN;
+	}
+
+	signal(SIGINT, int_exit);
+	signal(SIGHUP, int_exit);
+	signal(SIGTERM, int_exit);
+
+	if (json_output)
+		jsonw_start_array(json_wtr);
+
+	while (!stop) {
+		poll(pfds, nfds, 200);
+		for (i = 0; i < nfds; i++)
+			perf_event_read(&rings[i], &tmp_buf, &tmp_buf_sz);
+	}
+	free(tmp_buf);
+
+	if (json_output)
+		jsonw_end_array(json_wtr);
+
+	for (i = 0; i < nfds; i++) {
+		perf_event_unmap(rings[i].mem);
+		close(rings[i].fd);
+	}
+	free(pfds);
+	free(rings);
+	close(map_fd);
+
+	return 0;
+
+err_close_fds_prev:
+	while (i--) {
+		perf_event_unmap(rings[i].mem);
+err_close_fds_current:
+		close(rings[i].fd);
+	}
+	free(pfds);
+err_free_rings:
+	free(rings);
+err_close_map:
+	close(map_fd);
+	return -1;
+}

From ab7f5bf0928be2f148d000a6eaa6c0a36e74750e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Thu, 3 May 2018 18:37:17 -0700
Subject: [PATCH 65/66] bpf: fix references to free_bpf_prog_info() in comments

Comments in the verifier refer to free_bpf_prog_info() which seems
to have never existed in tree. Replace it with free_used_maps().

Signed-off-by: Jakub Kicinski
Reviewed-by: Quentin Monnet
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/verifier.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ccac552ac861..d5e1a6c4165d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5055,7 +5055,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 			/* hold the map. If the program is rejected by verifier,
 			 * the map will be released by release_maps() or it
 			 * will be used by the valid program until it's unloaded
-			 * and all maps are released in free_bpf_prog_info()
+			 * and all maps are released in free_used_maps()
 			 */
 			map = bpf_map_inc(map, false);
 			if (IS_ERR(map)) {
@@ -5821,7 +5821,7 @@ skip_full_check:
 err_release_maps:
 	if (!env->prog->aux->used_maps)
 		/* if we didn't copy map pointers into bpf_prog_info, release
-		 * them now. Otherwise free_bpf_prog_info() will release them.
+		 * them now. Otherwise free_used_maps() will release them.
 		 */
 		release_maps(env);
 	*prog = env->prog;

From e94fa1d93117e7f1eb783dc9cae6c70650944449 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Fri, 4 May 2018 16:27:53 +0200
Subject: [PATCH 66/66] bpf, xskmap: fix crash in xsk_map_alloc error path handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If bpf_map_precharge_memlock() did not fail, then we set err to zero.
However, any subsequent failure from either alloc_percpu() or
bpf_map_area_alloc() will then return ERR_PTR(0), which in
find_and_alloc_map() causes a NULL pointer deref. In devmap we have
the convention that we return -EINVAL on page count overflow, so keep
the same logic here and just set err to -ENOMEM after a successful
bpf_map_precharge_memlock().

Fixes: fbfc504a24f5 ("bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP")
Signed-off-by: Daniel Borkmann
Cc: Björn Töpel
Acked-by: David S. Miller
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/xskmap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 869dbb11b612..cb3a12137404 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -56,6 +56,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 	if (err)
 		goto free_m;
 
+	err = -ENOMEM;
+
 	m->flush_list = alloc_percpu(struct list_head);
 	if (!m->flush_list)
 		goto free_m;
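
To see concretely why err must not stay at zero on the error path, consider
the following minimal userspace sketch. ERR_PTR() and IS_ERR() here are
simplified stand-ins for the kernel's include/linux/err.h helpers, and
broken_map_alloc() is a hypothetical reduction of the buggy error path, not
code from the patch:

#include <stdio.h>

#define MAX_ERRNO 4095

/* simplified stand-ins for the kernel's include/linux/err.h helpers */
static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* hypothetical reduction of the buggy error path: precharge succeeded,
 * so err is still 0 when a later allocation fails and we bail out.
 */
static void *broken_map_alloc(void)
{
	int err = 0;		/* bpf_map_precharge_memlock() returned 0 */

	/* imagine alloc_percpu() failing right here ... */
	return ERR_PTR(err);	/* ERR_PTR(0) is NULL, not an error value */
}

int main(void)
{
	void *map = broken_map_alloc();

	/* IS_ERR(NULL) is false, so the caller treats NULL as a valid
	 * map pointer and dereferences it, which is the crash the patch
	 * above prevents.
	 */
	if (!IS_ERR(map))
		printf("caller thinks map=%p is valid and will deref it\n",
		       map);
	return 0;
}

Since IS_ERR() only recognizes pointer values in the top MAX_ERRNO range as
errors, ERR_PTR(0) is a plain NULL that sails through the caller's IS_ERR()
check. Presetting err to -ENOMEM means every later failure returns
ERR_PTR(-ENOMEM) instead, which IS_ERR() correctly flags.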