mirror of
https://github.com/Fishwaldo/Star64_linux.git
synced 2025-04-17 20:04:02 +00:00
== The problem == There is a use-case when all processes inside a cgroup should use one single IP address on a host that has multiple IP configured. Those processes should use the IP for both ingress and egress, for TCP and UDP traffic. So TCP/UDP servers should be bound to that IP to accept incoming connections on it, and TCP/UDP clients should make outgoing connections from that IP. It should not require changing application code since it's often not possible. Currently it's solved by intercepting glibc wrappers around syscalls such as `bind(2)` and `connect(2)`. It's done by a shared library that is preloaded for every process in a cgroup so that whenever TCP/UDP server calls `bind(2)`, the library replaces IP in sockaddr before passing arguments to syscall. When application calls `connect(2)` the library transparently binds the local end of connection to that IP (`bind(2)` with `IP_BIND_ADDRESS_NO_PORT` to avoid performance penalty). Shared library approach is fragile though, e.g.: * some applications clear env vars (incl. `LD_PRELOAD`); * `/etc/ld.so.preload` doesn't help since some applications are linked with option `-z nodefaultlib`; * other applications don't use glibc and there is nothing to intercept. == The solution == The patch provides much more reliable in-kernel solution for the 1st part of the problem: binding TCP/UDP servers on desired IP. It does not depend on application environment and implementation details (whether glibc is used or not). It adds new eBPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` and attach types `BPF_CGROUP_INET4_BIND` and `BPF_CGROUP_INET6_BIND` (similar to already existing `BPF_CGROUP_INET_SOCK_CREATE`). The new program type is intended to be used with sockets (`struct sock`) in a cgroup and provided by user `struct sockaddr`. Pointers to both of them are parts of the context passed to programs of newly added types. The new attach types provides hooks in `bind(2)` system call for both IPv4 and IPv6 so that one can write a program to override IP addresses and ports user program tries to bind to and apply such a program for whole cgroup. == Implementation notes == [1] Separate attach types for `AF_INET` and `AF_INET6` are added intentionally to prevent reading/writing to offsets that don't make sense for corresponding socket family. E.g. if user passes `sockaddr_in` it doesn't make sense to read from / write to `user_ip6[]` context fields. [2] The write access to `struct bpf_sock_addr_kern` is implemented using special field as an additional "register". There are just two registers in `sock_addr_convert_ctx_access`: `src` with value to write and `dst` with pointer to context that can't be changed not to break later instructions. But the fields, allowed to write to, are not available directly and to access them address of corresponding pointer has to be loaded first. To get additional register the 1st not used by `src` and `dst` one is taken, its content is saved to `bpf_sock_addr_kern.tmp_reg`, then the register is used to load address of pointer field, and finally the register's content is restored from the temporary field after writing `src` value. Signed-off-by: Andrey Ignatov <rdna@fb.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
164 lines
5.3 KiB
C
164 lines
5.3 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _BPF_CGROUP_H
|
|
#define _BPF_CGROUP_H
|
|
|
|
#include <linux/jump_label.h>
|
|
#include <uapi/linux/bpf.h>
|
|
|
|
struct sock;
|
|
struct sockaddr;
|
|
struct cgroup;
|
|
struct sk_buff;
|
|
struct bpf_sock_ops_kern;
|
|
|
|
#ifdef CONFIG_CGROUP_BPF
|
|
|
|
extern struct static_key_false cgroup_bpf_enabled_key;
|
|
#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
|
|
|
|
struct bpf_prog_list {
|
|
struct list_head node;
|
|
struct bpf_prog *prog;
|
|
};
|
|
|
|
struct bpf_prog_array;
|
|
|
|
struct cgroup_bpf {
|
|
/* array of effective progs in this cgroup */
|
|
struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE];
|
|
|
|
/* attached progs to this cgroup and attach flags
|
|
* when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will
|
|
* have either zero or one element
|
|
* when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
|
|
*/
|
|
struct list_head progs[MAX_BPF_ATTACH_TYPE];
|
|
u32 flags[MAX_BPF_ATTACH_TYPE];
|
|
|
|
/* temp storage for effective prog array used by prog_attach/detach */
|
|
struct bpf_prog_array __rcu *inactive;
|
|
};
|
|
|
|
void cgroup_bpf_put(struct cgroup *cgrp);
|
|
int cgroup_bpf_inherit(struct cgroup *cgrp);
|
|
|
|
int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
|
|
enum bpf_attach_type type, u32 flags);
|
|
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
|
|
enum bpf_attach_type type, u32 flags);
|
|
int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
|
|
union bpf_attr __user *uattr);
|
|
|
|
/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
|
|
int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
|
|
enum bpf_attach_type type, u32 flags);
|
|
int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
|
|
enum bpf_attach_type type, u32 flags);
|
|
int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
|
|
union bpf_attr __user *uattr);
|
|
|
|
int __cgroup_bpf_run_filter_skb(struct sock *sk,
|
|
struct sk_buff *skb,
|
|
enum bpf_attach_type type);
|
|
|
|
int __cgroup_bpf_run_filter_sk(struct sock *sk,
|
|
enum bpf_attach_type type);
|
|
|
|
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
|
|
struct sockaddr *uaddr,
|
|
enum bpf_attach_type type);
|
|
|
|
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
|
|
struct bpf_sock_ops_kern *sock_ops,
|
|
enum bpf_attach_type type);
|
|
|
|
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
|
|
short access, enum bpf_attach_type type);
|
|
|
|
/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
|
|
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
|
|
({ \
|
|
int __ret = 0; \
|
|
if (cgroup_bpf_enabled) \
|
|
__ret = __cgroup_bpf_run_filter_skb(sk, skb, \
|
|
BPF_CGROUP_INET_INGRESS); \
|
|
\
|
|
__ret; \
|
|
})
|
|
|
|
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \
|
|
({ \
|
|
int __ret = 0; \
|
|
if (cgroup_bpf_enabled && sk && sk == skb->sk) { \
|
|
typeof(sk) __sk = sk_to_full_sk(sk); \
|
|
if (sk_fullsock(__sk)) \
|
|
__ret = __cgroup_bpf_run_filter_skb(__sk, skb, \
|
|
BPF_CGROUP_INET_EGRESS); \
|
|
} \
|
|
__ret; \
|
|
})
|
|
|
|
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \
|
|
({ \
|
|
int __ret = 0; \
|
|
if (cgroup_bpf_enabled) { \
|
|
__ret = __cgroup_bpf_run_filter_sk(sk, \
|
|
BPF_CGROUP_INET_SOCK_CREATE); \
|
|
} \
|
|
__ret; \
|
|
})
|
|
|
|
#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \
|
|
({ \
|
|
int __ret = 0; \
|
|
if (cgroup_bpf_enabled) \
|
|
__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \
|
|
__ret; \
|
|
})
|
|
|
|
#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \
|
|
BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND)
|
|
|
|
#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \
|
|
BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND)
|
|
|
|
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \
|
|
({ \
|
|
int __ret = 0; \
|
|
if (cgroup_bpf_enabled && (sock_ops)->sk) { \
|
|
typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \
|
|
if (__sk && sk_fullsock(__sk)) \
|
|
__ret = __cgroup_bpf_run_filter_sock_ops(__sk, \
|
|
sock_ops, \
|
|
BPF_CGROUP_SOCK_OPS); \
|
|
} \
|
|
__ret; \
|
|
})
|
|
|
|
#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \
|
|
({ \
|
|
int __ret = 0; \
|
|
if (cgroup_bpf_enabled) \
|
|
__ret = __cgroup_bpf_check_dev_permission(type, major, minor, \
|
|
access, \
|
|
BPF_CGROUP_DEVICE); \
|
|
\
|
|
__ret; \
|
|
})
|
|
#else
|
|
|
|
struct cgroup_bpf {};
|
|
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
|
|
static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
|
|
|
|
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
|
|
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
|
|
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
|
|
#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; })
|
|
#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; })
|
|
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
|
|
#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
|
|
|
|
#endif /* CONFIG_CGROUP_BPF */
|
|
|
|
#endif /* _BPF_CGROUP_H */
|