#ifndef HEADER_fd_src_waltz_xdp_fd_xsk_h
#define HEADER_fd_src_waltz_xdp_fd_xsk_h

#if defined(__linux__)

/* fd_xsk manages an XSK file descriptor and provides RX/TX buffers.

   ### Background

   AF_XDP is a Linux API providing kernel-bypass networking in the form
   of shared memory ring buffers accessible from userspace.  The kernel
   redirects packets from/to these buffers with the appropriate XDP
   configuration (XDP_REDIRECT).  AF_XDP is hardware-agnostic and allows
   sharing a NIC with the Linux networking stack (unlike e.g. DPDK).
   This allows for deployment in existing, heterogeneous networks.  An
   AF_XDP socket is called "XSK".  The shared memory region storing the
   packet data flowing through an XSK is called "UMEM".

   XDP (eXpress Data Path) is a framework for installing hooks in the
   form of eBPF programs at an early stage of packet processing (i.e.
   before tc and netfilter).  eBPF is user-deployable JIT-compiled
   bytecode that usually runs inside the kernel.  Some hardware/driver
   combinations optionally allow offloading eBPF processing to NICs.
   This is not to be confused with other BPF-derived ISAs such as sBPF
   (Solana BPF).

     +--- Figure 1: AF_XDP RX Block Diagram -----------------+
     |                                                       |
     |   ┌─────┐  ┌────────┐  ┌─────┐ XDP_PASS ┌─────────┐   |
     |   │ NIC ├──> Driver ├──> XDP ├──────────> sk_buff │   |
     |   └─────┘  └────────┘  └─┬───┘          └─────────┘   |
     |                          │                            |
     |                          │ XDP_REDIRECT               |
     |                          │                            |
     |                       ┌──▼───────┐      ┌─────────┐   |
     |                       │ XSK/UMEM ├──────> fd_aio  │   |
     |                       └──────────┘      └─────────┘   |
     |                                                       |
     +-------------------------------------------------------+

   Figure 1 shows a simplified block diagram of RX packet flow within
   the kernel in `XDP_FLAGS_DRV_MODE` mode.  Notably, the chain of eBPF
   programs installed in the XDP facility gets invoked for every
   incoming packet.  If all programs return the `XDP_PASS` action, the
   packet continues its usual path to the Linux networking stack, where
   it will be allocated in sk_buff, and eventually flow through
   ip_rcv(), tc, and netfilter before reaching downstream sockets.
   If the `XDP_REDIRECT` action is taken however, the packet is copied
   to the UMEM of an XSK, and a RX queue entry is allocated.  An fd_aio
   backend is provided by fd_xdp_aio.

   The more generic `XDP_FLAGS_SKB_MODE` XDP mode falls back to sk_buff-
   based memory mgmt (still skipping the rest of the generic path), but
   is more widely available.

     +--- Figure 2: AF_XDP TX Block Diagram -------------+
     |                                                   |
     |   ┌────────┐  ┌──────────┐  ┌────────┐  ┌─────┐   |
     |   │ fd_aio ├──> XSK/UMEM ├──> Driver ├──> NIC │   |
     |   └────────┘  └──────────┘  └────────┘  └─────┘   |
     |                                                   |
     +---------------------------------------------------+

   Figure 2 shows a simplified block diagram of the TX packet flow.
   Userspace applications deliver packets to the XSK/UMEM buffers.  The
   kernel then forwards these packets to the NIC.  This also means that
   the application is responsible for maintaining a routing table to
   resolve layer-3 dest addrs to NICs and layer-2 addrs.  As in the RX
   flow, netfilter (iptables, nftables) is not available.

   ### Memory Management

   The UMEM area is allocated from userspace.  It is recommended to use
   the fd_util shmem/wksp APIs to obtain large page-backed memory.  UMEM
   is divided into equally sized frames.  At any point in time, each
   frame is either owned by userspace or the kernel.  On initialization,
   all frames are owned by userspace.

   Changes in UMEM frame ownership and packet RX/TX events are
   transmitted via four rings allocated by the kernel (mmap()ed in by
   the user).  This allows for out-of-order processing of packets.

     Data flow:
     (U->K) is userspace-to-kernel communication, and
     (K->U) is kernel-to-userspace.

     FILL         Free frames are provided to the kernel using the FILL
     (U->K)       ring.  The kernel may populate these frames with RX
                  packet data.

     RX           Once the kernel has populated a FILL frame with RX
     (K->U)       packet data, it passes back the frame to userspace via
                  the RX queue.

     TX           TX frames sent by userspace are provided to the
     (U->K)       kernel using the TX ring.

     COMPLETION   Once the kernel has processed a TX frame, it passes
     (K->U)       back the frame to the userspace via the COMPLETION
                  queue.

   Combined, the FILL-RX and TX-COMPLETION rings form two pairs.  The
   kernel will not move frames between the pairs. */

#include <linux/if_link.h>
#include <linux/if_xdp.h>
#include <net/if.h>

#include "../../util/fd_util_base.h"

/* FD_XSK_UMEM_ALIGN: byte alignment of UMEM area within fd_xsk_t.
   This requirement is set by the kernel as of Linux 4.18.

   NOTE(review): struct fd_xsk as declared in this header does not
   contain the UMEM area; the UMEM location is supplied through
   fd_xsk_params_t (umem_addr).  The "within fd_xsk_t" wording looks
   stale -- confirm. */

#define FD_XSK_UMEM_ALIGN (4096UL)

/* fd_xdp_ring_t describes an XSK descriptor ring in the thread group's
   local address space.  All pointers fall into the kernel-managed XSK
   descriptor buffer at [mem;mem+map_sz) and are valid during the
   lifetime of an fd_xsk_t join.

   The ring producer and consumer are synchronized via incrementing
   sequence numbers.  These are accessed through uint pointers (prod,
   cons) and cached in uint fields, so they wrap at the width of uint
   (2^32 for a 32-bit uint), not at 2^64. */

struct __attribute__((aligned(64UL))) fd_xdp_ring {
  /* This point is 64-byte aligned */

  /* mmap() params, only used during join/leave for munmap() */

  void *  mem;       /* Points to start of shared descriptor ring mmap region */
  ulong   map_sz;    /* Size of shared descriptor ring mmap region */
  ulong   _pad_0x10; /* Padding (field name indicates byte offset 0x10) */
  ulong   _pad_0x18; /* Padding (field name indicates byte offset 0x18) */

  /* This point is at byte offset 0x20 (assuming 8-byte pointers and
     ulong, as the pad field names imply).  It is therefore 32-byte
     aligned, not 64-byte aligned. */

  /* Pointers to fields of the opaque XSK ring structure.  This
     indirection is required because the memory layout of the
     kernel-provided descriptor rings is unstable.  The field offsets
     can be queried using getsockopt(SOL_XDP, XDP_MMAP_OFFSETS). */

  union {
    void *            ptr;         /* Opaque pointer */
    struct xdp_desc * packet_ring; /* For RX, TX rings */
    ulong *           frame_ring;  /* For FILL, COMPLETION rings */
  };
  uint *  flags;       /* Points to flags in shared descriptor ring */
  uint *  prod;        /* Points to producer seq in shared descriptor ring */
  uint *  cons;        /* Points to consumer seq in shared descriptor ring */

  /* This point is 64-byte aligned (byte offset 0x40 under the same
     size assumptions as above) */

  /* Managed by fd_xsk_t */

  uint    depth;       /* Capacity of ring in no of entries */
  uint    cached_prod; /* Cached value of *prod */
  uint    cached_cons; /* Cached value of *cons */
};
typedef struct fd_xdp_ring fd_xdp_ring_t;

/* fd_xsk_params_t: Memory layout parameters of XSK.
   Can be retrieved using fd_xsk_get_params()

   NOTE(review): fd_xsk_get_params() is not declared in this header;
   confirm that it still exists.  Within this header the struct is
   consumed by fd_xsk_init. */

struct fd_xsk_params {
  /* {fr,rx,tx,cr}_depth: Number of frames allocated for the Fill, RX,
     TX, Completion XSK rings respectively. */
  ulong fr_depth;
  ulong rx_depth;
  ulong tx_depth;
  ulong cr_depth;

  /* umem_addr: Pointer to UMEM in local address space */
  void * umem_addr;

  /* frame_sz: Controls the frame size used in the UMEM ring buffers. */
  ulong frame_sz;

  /* umem_sz: Total size of XSK ring shared memory area (contiguous).
     Aligned by FD_XSK_ALIGN.

     NOTE(review): FD_XSK_ALIGN is not defined in this header; the only
     alignment constant defined here is FD_XSK_UMEM_ALIGN.  Presumably
     that is what is meant -- confirm. */
  ulong umem_sz;

  /* Linux interface index */
  uint if_idx;

  /* Interface queue index */
  uint if_queue_id;

  /* sockaddr_xdp.sxdp_flags additional params, e.g. XDP_ZEROCOPY */
  uint bind_flags;
};

typedef struct fd_xsk_params fd_xsk_params_t;

/* fd_xsk_t holds the per-socket state of an XSK: the interface/queue
   it refers to, the kernel-reported ring mmap offsets, the socket file
   descriptor, and one fd_xdp_ring_t descriptor for each of the four
   rings. */

struct fd_xsk {
  /* Informational */
  uint if_idx;                /* index of net device */
  uint if_queue_id;           /* net device combined queue index */
  long log_suppress_until_ns; /* suppress log messages until this time */

  /* Kernel descriptor of XSK rings in local address space
     returned by getsockopt(SOL_XDP, XDP_MMAP_OFFSETS) */
  struct xdp_mmap_offsets offsets;

  /* AF_XDP socket file descriptor */
  int xsk_fd;

  /* ring_{rx,tx,fr,cr}: XSK ring descriptors
     (rx=RX, tx=TX, fr=FILL, cr=COMPLETION) */

  fd_xdp_ring_t ring_rx;
  fd_xdp_ring_t ring_tx;
  fd_xdp_ring_t ring_fr;
  fd_xdp_ring_t ring_cr;
};

typedef struct fd_xsk fd_xsk_t;

FD_PROTOTYPES_BEGIN

/* fd_xsk_init creates an XSK, registers UMEM, maps rings, and binds the
   socket to the given interface queue.  This is a potentially
   destructive operation.  As of 2024-Jun, AF_XDP zero copy support is
   still buggy in some device drivers.  Assume that all traffic sent to
   this interface is compromised.  On some devices, the NIC is
   instructed to DMA all incoming packets into UMEM, even ones not
   belonging to Firedancer.  Those are then later on software-copied out
   to skbs again.  This further implies that enabling AF_XDP can slow
   down the regular kernel receive path.

   xsk points to the fd_xsk_t to initialize.  params points to the
   ring depths, UMEM location/size, interface, queue and bind flags to
   use.

   NOTE(review): the return value semantics are not visible in this
   header; presumably xsk on success and NULL on failure -- confirm
   against the implementation.

   Requires CAP_SYS_ADMIN.  May issue the following syscalls:

   - socket( AF_XDP, SOCK_RAW, 0 ) = fd
   - setsockopt( fd, SOL_XDP, ... )
   - getsockopt( fd, SOL_XDP, ... )
   - mmap( ..., fd, ... )
   - bind( fd, ... )
   - munmap ; on fail
   - close  ; on fail */

fd_xsk_t *
fd_xsk_init( fd_xsk_t *              xsk,
             fd_xsk_params_t const * params );

/* fd_xsk_delete takes an untyped pointer shxsk and returns an untyped
   pointer.

   NOTE(review): the implementation is not visible in this header.
   Presumably this is the teardown counterpart of fd_xsk_init (unmapping
   rings, closing the socket) and returns the underlying memory region
   -- confirm against the implementation before relying on this. */

void *
fd_xsk_delete( void * shxsk );

/* fd_xsk_rx_need_wakeup: returns whether a wakeup is required to
   complete a rx operation.  Tests the XDP_RING_NEED_WAKEUP bit in the
   flags word of the FILL ring (ring_fr), i.e. the ring through which
   userspace hands free frames to the kernel for RX.  Returns 1 if the
   bit is set, 0 otherwise.  xsk->ring_fr.flags is dereferenced
   unconditionally. */

static inline int
fd_xsk_rx_need_wakeup( fd_xsk_t * xsk ) {
  /* !! normalizes the masked flag bit to exactly 0 or 1 */
  return !!( *xsk->ring_fr.flags & XDP_RING_NEED_WAKEUP );
}

/* fd_xsk_tx_need_wakeup: returns whether a wakeup is required to
   complete a tx operation.  Tests the XDP_RING_NEED_WAKEUP bit in the
   flags word of the TX ring (ring_tx).  Returns 1 if the bit is set,
   0 otherwise.  xsk->ring_tx.flags is dereferenced unconditionally. */

static inline int
fd_xsk_tx_need_wakeup( fd_xsk_t * xsk ) {
  /* !! normalizes the masked flag bit to exactly 0 or 1 */
  return !!( *xsk->ring_tx.flags & XDP_RING_NEED_WAKEUP );
}

FD_PROTOTYPES_END

#endif /* defined(__linux__) */
#endif /* HEADER_fd_src_waltz_xdp_fd_xsk_h */