// fork: process creation, fork() and move-to-user setup.
// Layouts (TaskStruct, KeRegs, ...) come from src/task_layout.zig.
const layout = #import("task_layout")
const TaskStruct = layout.TaskStruct
const CoreContext = layout.CoreContext
const KeRegs = layout.KeRegs
const TASK_RUNNING = layout.TASK_RUNNING
const KTHREAD = layout.KTHREAD
const MAX_PAGE_COUNT = layout.MAX_PAGE_COUNT
const fdtable = #import("fdtable")

// User VA layout (STACK_TOP, HEAP_BASE) + page-permission flags. The ELF
// loader prepare_move_to_user_elf_argv picks per-region flags from these
// (text = RWX — the default bag is EL0 read/write + executable and no
// read-only (AP[2]) descriptor bit is defined, so W^X is not yet
// enforced; data/heap/stack add TD_USER_XN for RW-NX), and do_data_abort
// (src/mm_user.zig) reuses the same bag when demand-allocating heap/stack
// pages on a fault.
const user_layout = #import("user_layout")

// ELF parser — the named module "elf" (src/elf.flash), also the module
// the host tests cover. It moved from a sibling @import to a named
// module when it was ported to Flash: the generated .zig lives in the
// build cache, so a file-relative import can no longer resolve it.
// build.zig wires the same module into the kernel and host-test builds.
const elf = #import("elf")

// Argv-on-stack block type, encoded by execve.encodeArgvBlock and
// written into the top stack page by the argv-aware loader below. Named
// module (not a sibling @import) because src/execve.zig is the "execve"
// module in the kernel build; the fork host test wires the same module
// in via build.zig.
const execve = #import("execve")

// Task-table capacity and page/stack geometry used throughout this file.
const NR_TASKS usize = 64
const PAGE_SIZE u64 = 1 << 12
const THREAD_SIZE u64 = PAGE_SIZE
const SPSR_EL1_MODE_EL0t u64 = 0
// Interface id passed as the first argument of the main_output* helpers.
const MU i32 = 0

const builtin = #import("builtin")
// Opt-in fork tracing (default off). See build.zig `-Dverbose-fork`.
const build_options = #import("build_options")

// Kernel-thread PCs must run via TTBR1 (high-mem linear map).
// Otherwise, the moment a process does set_pgd() to a user pgd, TTBR0 stops
// mapping the kernel's low-VA copy and the next ret/blr to a kernel function
// faults. ORing instead of adding is idempotent if the address is already
// high.
const LINEAR_MAP_BASE u64 = if (builtin.target.os.tag == .freestanding) 0xFFFF000000000000 else 0

extern fn get_kernel_page() u64
extern fn free_kernel_page(kp u64) void
extern fn release_user_mm(t *mut TaskStruct) void
extern fn allocate_user_page(tsk *mut TaskStruct, uva u64, flags u64) u64
extern fn copy_virt_memory(dst *mut TaskStruct) i32
extern fn memzero(start u64, size u64) void
extern fn memcpy(dst *mut anyopaque, src *anyopaque, bytes u64) *mut anyopaque
extern fn copy_ke_regs(to *mut KeRegs, from *mut KeRegs) void
extern fn set_pgd(pgd u64) void
extern fn preempt_disable() void
extern fn preempt_enable() void
extern fn ret_from_fork() void
extern fn main_output(interface i32, str [*:0]u8) void
extern fn main_output_u64(interface i32, inw u64) void
extern fn main_output_char(interface i32, ch u8) void

extern var current ?*mut TaskStruct
extern var task [NR_TASKS]?*mut TaskStruct
extern var nr_tasks i32
extern var next_pid i32

// Returns the KeRegs frame at the top of `tsk`'s kernel-stack page.
export fn task_ke_regs(tsk *mut TaskStruct) *mut KeRegs {
    // KeRegs sits at the top of the task's kernel-stack page. Tasks made by
    // copy_process carry a dedicated stack page in `kstack`; init_task
    // (kstack == 0) falls back to its own page for the boot context.
    const base u64 = if (tsk.kstack != 0) tsk.kstack else #intFromPtr(tsk)
    return #ptrFromInt(base + THREAD_SIZE - #sizeOf(KeRegs))
}

// Writes `pid` in decimal to `interface`, one character per
// main_output_char call. Pids are handed out from the monotonic next_pid
// counter, so any digit count an i32 can hold has to be handled. A negative
// value is not expected and is rendered as '?'.
fn output_pid_decimal(interface i32, pid i32) void {
    if pid < 0 {
        main_output_char(interface, '?')
        return
    }
    // A non-negative i32 has at most 10 decimal digits.
    var digits [10]u8 = undefined
    var len usize = 0
    var rest i32 = pid
    // Collect digits least-significant first; the loop body runs at least
    // once so that 0 prints as "0".
    while true {
        digits[len] = #intCast('0' + #mod(rest, 10))
        len += 1
        rest = #divTrunc(rest, 10)
        if rest == 0 {
            break
        }
    }
    // Emit most-significant digit first.
    while len > 0 {
        len -= 1
        main_output_char(interface, digits[len])
    }
}

// Creates a child task of `current`.
//
// clone_flags: stored in the child's `flags`. With KTHREAD set the child is
//              a kernel thread: fn_addr (ORed with LINEAR_MAP_BASE) and arg
//              are stashed in the child's saved x19 / x20. Without it the
//              child is a fork: it gets a copy of the parent's KeRegs with
//              x0 forced to 0, a copy of the parent's user memory, the
//              parent's fd table, cwd and credentials.
// fn_addr/arg: only used on the KTHREAD path.
//
// Returns the child's pid, or -1 when no task[] slot is free, a kernel page
// cannot be allocated, or copy_virt_memory fails. Preemption is disabled for
// the whole body and re-enabled on every return path.
export fn copy_process_impl(clone_flags u64, fn_addr u64, arg u64) i32 {
    preempt_disable()

    const parent = current.?

    // First-null-slot scan instead of monotonic nr_tasks bump so that slots
    // freed by do_wait get reused; otherwise long fork-stress runs hit
    // NR_TASKS=64 well before allocator pressure. nr_tasks is kept as a
    // high-water mark only.
    //
    // The scan runs before anything is allocated: failing here needs no
    // cleanup at all, whereas failing after the child is built would have
    // to undo the mm copy and the fd-table references taken by
    // fdtable.dupAll. Preemption stays disabled until the slot is filled
    // below.
    var slot i32 = -1
    var i usize = 0
    while i < NR_TASKS {
        if task[i] == null {
            slot = #intCast(i)
            break
        }
        i += 1
    }
    if slot < 0 {
        preempt_enable()
        return -1
    }

    // OOM: no kernel page for the child TaskStruct. Bail before any
    // dereference of the (null) pointer.
    const kp = get_kernel_page()
    if kp == 0 {
        preempt_enable()
        return -1
    }
    const p *mut TaskStruct = #ptrFromInt(kp)

    // Dedicated kernel-stack page: the child's kernel stack lives
    // in its own page, decoupled from the TaskStruct page, so a deep
    // syscall plus a nested timer-IRQ frame-save can never overflow into
    // the credential tail (the recurring stack-into-creds class). Freed
    // alongside the TaskStruct page on every exit path; task_ke_regs(p)
    // resolves KeRegs against it.
    const ksp = get_kernel_page()
    if ksp == 0 {
        free_kernel_page(kp)
        preempt_enable()
        return -1
    }
    p.kstack = ksp

    const childregs = task_ke_regs(p)
    memzero(#intFromPtr(childregs), #sizeOf(KeRegs))
    memzero(#intFromPtr(&p.core_context), #sizeOf(CoreContext))

    if (clone_flags & KTHREAD) != 0 {
        p.core_context.x19 = fn_addr | LINEAR_MAP_BASE
        p.core_context.x20 = arg
    } else {
        const cur_regs = task_ke_regs(parent)
        // copy_ke_regs avoids gcc emitting a memcpy call
        copy_ke_regs(childregs, cur_regs)
        // child returns 0 from fork
        childregs.regs[0] = 0

        if copy_virt_memory(p) != 0 {
            // copy_virt_memory may have mapped part of the child mm before
            // failing (OOM mid-copy, or the child's page cap). Release those
            // pages so this path is baseline-neutral, then the kernel-stack
            // and TaskStruct pages.
            release_user_mm(p)
            free_kernel_page(ksp)
            free_kernel_page(kp)
            preempt_enable()
            return -1
        }

        // Dup the parent's fd table: each installed slot is a shared
        // reference to the same kernel-resident Pipe, and the refcount
        // bumps once per inherited slot. POSIX-equivalent without
        // CLOEXEC for now (future work wires CLOEXEC + close-on-exec).
        // KTHREAD branch skips this — kernel threads cannot reach the
        // EL0 syscall path that fills fd_table.
        // No failure path follows this call, so the references taken here
        // are never orphaned.
        fdtable.dupAll(parent, p)

        // Inherit the parent's working directory. cwd lives
        // on the child task's kernel page (zeroed by get_kernel_page),
        // so without this copy the child would come up with cwd = ""
        // and the next relative-path open would fall back to root with
        // a stray leading byte. KTHREADs skip the copy along with fds —
        // their default cwd = "/" from the TaskStruct field initialiser
        // is fine for sched-only code paths.
        #memcpy(&p.cwd, &parent.cwd)

        // Inherit process credentials: a forked child runs as
        // the same user as its parent until it (or an image it execs)
        // drops privilege via setuid/setgid. KTHREADs skip this along
        // with fds/cwd — their 0/root default suits sched-only paths.
        p.uid = parent.uid
        p.gid = parent.gid
        p.euid = parent.euid
        p.egid = parent.egid
    }

    p.flags = clone_flags
    p.priority = parent.priority
    p.state = TASK_RUNNING
    // Halved so a freshly forked child doesn't out-budget a parent that has
    // already burned ticks; gives the round-robin path a chance to interleave
    // parent/child during fork-stress instead of running parent in a tight
    // burst.
    p.counter = #divTrunc(p.priority, 2)
    p.preempt_count = 1
    p.parent = current

    p.core_context.lr = #intFromPtr(&ret_from_fork) | LINEAR_MAP_BASE
    p.core_context.sp = #intFromPtr(childregs)

    // Pid is monotonic (next_pid++), independent of the reusable slot index.
    p.pid = next_pid
    next_pid += 1
    task[#intCast(slot)] = p
    if slot + 1 > nr_tasks {
        nr_tasks = slot + 1
    }

    if build_options.verbose_fork {
        main_output(MU, "created pid ")
        output_pid_decimal(MU, p.pid)
        main_output(MU, " at ")
        main_output_u64(MU, #intFromPtr(p))
        main_output(MU, "\n")
    }

    preempt_enable()
    return p.pid
}

// NOTE(review): the line below is carried over unchanged. In this copy of the
// file the ELF loader's signature and the opening of its program-header loop
// are missing — the text between "memsz" and "= ph.p_vaddr" was lost — so the
// loader cannot be restored from what is visible here. Recover it from
// version control before building.
// Loads an ELF image into the current task's address space. Callers // (kernel boot for the PID 1 init image, and sys_execve via the argv // trampoline below) snapshot the ELF bytes into a kernel-owned region at // `blob_addr_kva`, free the old user pages, and zero `current.mm.pgd` // before calling. // Walks PT_LOAD segments via src/elf.zig, allocates fresh user pages // per segment with region-aware flags (text=RWX — writable, no // read-only page bit; data/heap/stack=RW-NX), // memcpys file-backed bytes from the blob, eagerly maps one stack page // at the top of the user VA, then sets ELR=e_entry / SP=STACK_TOP and // installs the new pgd. Returns 0 on success, -1 on parse failure / // alloc failure / non-page-aligned p_vaddr / inconsistent memsz= ph.p_vaddr && ehdr.e_entry < ph.p_vaddr + ph.p_memsz { if (ph.p_flags & elf.PF_X) != 0 { entry_mapped = true } } // Sanity: page-aligned vaddr and memsz >= filesz. Mis-aligned // segments would force partial-page memcpys that break the // page-grain free-page accounting; reject and document.
if (ph.p_vaddr & (PAGE_SIZE - 1)) != 0 { return -1 } if ph.p_memsz < ph.p_filesz { return -1 } if ph.p_memsz == 0 { continue } const flags u64 = if ((ph.p_flags & elf.PF_X) != 0) user_layout.TD_USER_PAGE_FLAGS_DEFAULT else user_layout.TD_USER_PAGE_FLAGS_DEFAULT | user_layout.TD_USER_XN const num_pages u64 = (ph.p_memsz + PAGE_SIZE - 1) / PAGE_SIZE var i u64 = 0 while i < num_pages { const uva = ph.p_vaddr + i * PAGE_SIZE const kva = allocate_user_page(current.?, uva, flags) if kva == 0 { return -1 } const seg_off u64 = i * PAGE_SIZE if seg_off < ph.p_filesz { const remaining u64 = ph.p_filesz - seg_off const copy_bytes u64 = if (remaining > PAGE_SIZE) PAGE_SIZE else remaining _ = memcpy(#ptrFromInt(kva), #ptrFromInt(blob_addr_kva + ph.p_offset + seg_off), copy_bytes) } // Trailing memsz-filesz BSS bytes are implicitly zero // because get_free_page returns zeroed pages. i += 1 } } if !entry_mapped { return -1 } // Eagerly map the top stack page so EL0 entry doesn't fault before // the first instruction. Lazy stack growth + guard-page handling // arrives in 2.5 / 2.6. const stack_uva u64 = user_layout.STACK_TOP - PAGE_SIZE const stack_kva = allocate_user_page( current.?, stack_uva, user_layout.TD_USER_PAGE_FLAGS_DEFAULT | user_layout.TD_USER_XN ) if stack_kva == 0 { return -1 } const regs = task_ke_regs(current.?) memzero(#intFromPtr(regs), #sizeOf(KeRegs)) regs.elr = ehdr.e_entry regs.pstate = SPSR_EL1_MODE_EL0t if argv_block |ab| { // Copy the encoded argv image into the eagerly-mapped top stack // page via its KVA alias (TTBR0 still holds the old pgd until // set_pgd below). encodeArgvBlock laid the block flush against // STACK_TOP, so it lands at PAGE_SIZE - len from the page base. const dst [*]mut u8 = #ptrFromInt(stack_kva + (PAGE_SIZE - ab.bytes.len)) #memcpy(dst[0..ab.bytes.len], ab.bytes) // x1 = argv and sp = &argv[0] survive to the new program: kernel_exit // restores them from this frame and ret_from_syscall (arch/aarch64/entry.S) // does not touch them. 
x0 = argc is the AAPCS64 contract, but for the // sole caller (execve via the SVC path) ret_from_syscall overwrites // the saved-x0 slot with execveKernel's return value — so execveKernel // returns argc to satisfy it. This frame write keeps the register // setup complete for any future direct (non-syscall) caller. regs.regs[0] = ab.argc // x0 = argc (see note above) regs.regs[1] = ab.argv_uva // x1 = argv regs.sp = ab.sp } else { regs.sp = user_layout.STACK_TOP } // Heap starts empty at HEAP_BASE — sys_brk grows / shrinks from // here, do_data_abort demand-allocates pages as the heap is touched. current.?.mm.brk = user_layout.HEAP_BASE set_pgd(current.?.mm.pgd) return 0 } // C-ABI trampoline: src/execve.zig is a leaf module and cannot import the // root kernel_mod where prepare_move_to_user_elf_argv lives, so it reaches // the argv-aware loader through this exported symbol — the same pattern // sys.zig uses to call prepare_move_to_user_elf via `extern fn`. A direct // call between kernel functions in syscall context works (only the // indirect dispatch table needs the | LINEAR_MAP_BASE alias). argv_block_ptr // is a kernel pointer to an execve.ArgvBlock, or 0 for the no-argv path. 
// Exported entry point for the argv-aware ELF loader.
//
// blob_addr_kva / blob_size: forwarded unchanged to
//     prepare_move_to_user_elf_argv.
// argv_block_ptr: 0 selects the no-argv path (null is passed on); any other
//     value is dereferenced as an execve.ArgvBlock and passed on by value.
//
// Returns whatever prepare_move_to_user_elf_argv returns.
export fn move_to_user_elf_argv(blob_addr_kva u64, blob_size u64, argv_block_ptr u64) i32 {
    const ab ?execve.ArgvBlock = if (argv_block_ptr == 0) null else #as(*execve.ArgvBlock, #ptrFromInt(argv_block_ptr)).*
    return prepare_move_to_user_elf_argv(blob_addr_kva, blob_size, ab)
}

// ---- Host Tests ----
const std = #import("std")
const testing = std.testing

extern fn reset_fork_test() void

test "fork: copy_process_impl creates a child" {
    reset_fork_test()
    var p TaskStruct = undefined
    #memset(std.mem.asBytes(&p), 0)
    p.priority = 10
    current = &p

    const child_pid = copy_process_impl(0, 0, 0)
    try testing.expect(child_pid > 0)
    try testing.expectEqual(#as(i32, 1), nr_tasks)
    try testing.expect(task[0] != null)
    try testing.expectEqual(child_pid, task[0].?.pid)
    try testing.expectEqual(p.priority, task[0].?.priority)
    // counter is priority / 2.
    try testing.expectEqual(#as(i64, 5), task[0].?.counter)
}

test "fork: task_ke_regs returns correct pointer" {
    var t TaskStruct = undefined
    #memset(std.mem.asBytes(&t), 0)

    // kstack == 0: KeRegs resolves against the task page itself — the
    // init_task / boot-context fallback.
    const regs = task_ke_regs(&t)
    const offset = #intFromPtr(regs) - #intFromPtr(&t)
    try testing.expectEqual(#as(u64, THREAD_SIZE - #sizeOf(KeRegs)), offset)

    // kstack set: KeRegs resolves against the dedicated kernel-stack page,
    // not the TaskStruct page — the decoupling that keeps a deep syscall +
    // nested IRQ frame off the credential tail.
    var stack_page [THREAD_SIZE]u8 align(16) = undefined
    t.kstack = #intFromPtr(&stack_page)
    const regs2 = task_ke_regs(&t)
    try testing.expectEqual(
        #intFromPtr(&stack_page) + THREAD_SIZE - #sizeOf(KeRegs),
        #intFromPtr(regs2)
    )
}

extern fn set_fail_copy_virt(v bool) void

test "fork: copy_process_impl returns -1 when the kernel page OOMs" {
    reset_fork_test()
    var p TaskStruct = undefined
    #memset(std.mem.asBytes(&p), 0)
    p.priority = 10
    current = &p

    // Drain the stub's page pool so get_kernel_page returns the sentinel.
    // Without the `kp == 0` check in copy_process_impl, it would deref a
    // null TaskStruct pointer and crash this test.
    // NOTE(review): the drained pages are not handed back here; this relies
    // on reset_fork_test() refilling the pool — confirm against the stub.
    var i usize = 0
    while i < 256 {
        _ = get_kernel_page()
        i += 1
    }

    try testing.expectEqual(#as(i32, -1), copy_process_impl(0, 0, 0))
}

test "fork: copy_process_impl returns -1 when copy_virt_memory fails" {
    reset_fork_test()
    var p TaskStruct = undefined
    #memset(std.mem.asBytes(&p), 0)
    p.priority = 10
    current = &p

    set_fail_copy_virt(true)
    const rc = copy_process_impl(0, 0, 0)
    // Disarm the injected failure before any `try` can return early, so a
    // failed expectation here cannot leave the flag set for later tests.
    set_fail_copy_virt(false)

    // The copy_virt_memory-failure path releases the child mm + the
    // TaskStruct page and returns -1 (no slot consumed).
    try testing.expectEqual(#as(i32, -1), rc)
    try testing.expectEqual(#as(?*mut TaskStruct, null), task[0])
}

test "fork: copy_process_impl returns -1 when all task slots are full" {
    reset_fork_test()
    var p TaskStruct = undefined
    #memset(std.mem.asBytes(&p), 0)
    p.priority = 10
    current = &p

    // Occupy every task[] slot so the first-null-slot scan fails.
    var dummy TaskStruct = undefined
    #memset(std.mem.asBytes(&dummy), 0)
    var i usize = 0
    while i < NR_TASKS {
        task[i] = &dummy
        i += 1
    }

    const rc = copy_process_impl(0, 0, 0)

    // `dummy` lives in this test's stack frame: empty the table again before
    // asserting so the global task[] never outlives it with dangling
    // pointers, even when the expectation below fails.
    i = 0
    while i < NR_TASKS {
        task[i] = null
        i += 1
    }

    try testing.expectEqual(#as(i32, -1), rc)
}