// execve: path-resolved ELF loader. Streams PT_LOAD segments from an open // VFS file into a kernel buffer, then hands off to the ELF loader in // src/fork.zig. There is no per-image size cap beyond MAX_EXEC_BYTES and // no double-copy. Argv strings + pointer array live in the eagerly-mapped // top stack page; entry contract is x0 = argc, x1 = argv (AAPCS64). // // Wired into sys.zig via execve_impl + the SYS_EXECVE dispatch slot. // execve_impl resolves the path through the VFS shim, streams the ELF // into a static kernel buffer, encodes argv, and hands off to the // argv-aware loader in src/fork.zig. The kernel body sits behind a // comptime is_kernel guard so the host-test build compiles only the pure // encodeArgvBlock (build.zig wires this file with no kernel imports). const std = #import("std") const builtin = #import("builtin") // The real execve_impl body runs only on the freestanding kernel; the // host-test build compiles encodeArgvBlock alone. A comptime-known guard // keeps the kernel-only branch — and therefore the kernel-only imports // and externs below — out of host analysis: Zig only analyses the taken // branch of a comptime if, so execveKernel and its dependencies are never // referenced (and never resolved) when is_kernel is false. const is_kernel = builtin.target.os.tag == .freestanding // Kernel-only imports. Referenced solely inside execveKernel, so on the // host build they are never analysed and need not resolve. const task_layout = #import("task_layout") const vfs = #import("vfs") const user_layout = #import("user_layout") const path_mod = #import("path") // Permission gate: exec-intent check + the shared EACCES // constant. Same lazy-analysis posture as the imports above. const perm = #import("perm") const defs = #import("syscall_defs") // Kernel-only externs (same lazy-analysis posture as the imports). extern var current ?*mut task_layout.TaskStruct extern fn free_page(p u64) void extern fn copy_from_user(kbuf [*]mut u8, uva u64, len u64) i32 extern fn preempt_disable() void extern fn preempt_enable() void // C-ABI trampoline into the argv-aware ELF loader (src/fork.zig). A leaf // module cannot import the root kernel_mod where prepare_move_to_user_elf_argv // lives, so fork.zig exports this thin shim. argv_block_ptr is a kernel // pointer to an ArgvBlock, or 0 for the no-argv path. extern fn move_to_user_elf_argv(blob_addr_kva u64, blob_size u64, argv_block_ptr u64) i32 // OOM-after-teardown diagnostics. A loader -1 past the point of no return // cannot return to userland (the caller's pgd is gone), so it emits this // marker and zombies the task, mirroring do_data_abort's fault-context OOM. extern fn main_output(interface i32, str [*:0]u8) void extern fn exit_process() void const MU i32 = 0 // Largest ELF the path-resolved loader will stream in. Sits well above // PAGE_SIZE (the retired blob loader's cap) so multi-page programs load; // argv_echo.elf is ~4.5 KiB, fsh will stay under 16 KiB. A larger file // resolves to a clean -1 rather than a silent clamp. Baseline-neutral: // exec_buf lives in kernel .bss (below MALLOC_START), not the page pool. pub const MAX_EXEC_BYTES usize = 0x10000 // One exec at a time (uniprocessor; a future SMP release revisits, same posture as // argv_scratch). exec_buf snapshots the whole ELF contiguously so the // loader's per-PT_LOAD memcpy walks a single blob (get_free_page would // hand back non-contiguous pages); arg_storage holds the copied-in argv // strings before encodeArgvBlock serialises them. var exec_buf [MAX_EXEC_BYTES]u8 = undefined var arg_storage [MAX_ARGV_BYTES]u8 = undefined // execveKernel frame relief. These were execveKernel stack // locals; they moved up here — same one-exec-at-a-time posture as // exec_buf / arg_storage — because the per-task kernel stack shares its // 4 KiB page with TaskStruct (~2.4 KiB usable above KeRegs) and this // ~1.8 KiB of path / join / argv-slice buffers pushed the frame past it. // The overflow lands in the TaskStruct tail: it had been silently // clipping the unused tail of `cwd[]` all along, and the appended // credential fields (added after cwd) made it visible as garbage // euid/gid right after an exec. Container-level analysis is lazy, so the // host-test build (which never analyses execveKernel) never sees these. // exec_join_buf is sized to task_layout.CWD_SIZE; the comptime check in // execveKernel keeps the literal honest without importing task_layout // at container scope (the host build has no task_layout module). var exec_kpath [1024]u8 = undefined var exec_join_buf [256]u8 = undefined var exec_argv_slices [MAX_ARGV][]u8 = undefined // Maximum argv string count surfaced to userland. Bounded by the top // stack page (one PAGE_SIZE for strings + pointer array). pub const MAX_ARGV usize = 32 // Maximum total argv byte budget — strings + pointer array combined. // Picked under PAGE_SIZE so the eagerly-mapped top stack page holds // the whole block with headroom for the initial sp alignment. pub const MAX_ARGV_BYTES usize = 3072 // Encoded argv-on-stack image. encodeArgvBlock fills `bytes` against a // kernel-side scratch buffer; prepare_move_to_user_elf copies it into // the top stack page's KVA alias and writes argc/argv/sp into the // task's saved register frame before eret. pub const ArgvBlock = struct { sp u64, argv_uva u64, argc u64, bytes []mut u8, } export fn execve_impl(path_ptr u64, argv_ptr u64) i32 { // is_kernel is comptime-known, so Zig analyses only the taken branch: // execveKernel (and its kernel-only imports) stay out of the host build. if is_kernel { return execveKernel(path_ptr, argv_ptr) } else { return -1 // host: only encodeArgvBlock is exercised } } // Real path-resolve → copy-argv → stream-PT_LOAD → set-regs flow. Every // user copy and validation happens BEFORE the address-space teardown // ("point of no return"), so a wild path/argv UVA soft-fails to -1 with // the caller intact — the same contract gate-4's [TEST] efault-syscall // proves for sys_openFile. fn execveKernel(path_ptr u64, argv_ptr u64) i32 { const c = current orelse return -1 // Serialise the WHOLE of execveKernel. It fills, then much later consumes, // a pile of shared kernel statics (exec_kpath / exec_join_buf / // exec_argv_slices, arg_storage, argv_scratch, exec_buf — the "one exec at // a time" posture at exec_buf's decl). The final consume is // move_to_user_elf_argv, which memcpys out of BOTH exec_buf and // argv_scratch — long after the fill — so a timer preempt anywhere from the // first static write down to that consume could schedule a second task // through execveKernel, clobber the buffers, and leave this task loading a // corrupted image. preempt_count is per-task and timer_tick honours // preempt_count > 0 (src/sched.zig), so this one disable defers // rescheduling across the entire body; the defer re-balances on every // return. (The OOM branch calls noreturn exit_process without running the // defer, but exit_process zombifies this task and voluntary _schedule // switches away regardless of preempt_count, so the leaked count is inert — // the next `current` carries its own.) The inner open/fill/close guards // below now nest harmlessly under this. NB this supersedes the earlier // fill-only guard, which re-enabled preemption BEFORE the consume and so // left the buffer clobberable in the gap between fill and load. preempt_disable() defer preempt_enable() // The static join buffer must stay in lockstep with the cwd budget // (see the container-scope comment at exec_join_buf). Both operands are // comptime-known, so the untaken arm — and #compileError with it — is // never analysed unless the lengths drift. (Flash has no in-function // comptime block, so this comptime-known discard carries the assert.) _ = if (exec_join_buf.len == task_layout.CWD_SIZE) 0 else #compileError("exec_join_buf must match task_layout.CWD_SIZE") // 1. Copy the path in (byte loop, soft-fail on a wild UVA — mirrors // sys_openFile:195-204). No teardown yet → the child survives a fault. const kpath = &exec_kpath var pi usize = 0 var nul_found bool = false while pi < kpath.len - 1 { var b u8 = 0 if copy_from_user(#ptrCast(&b), path_ptr + pi, 1) < 0 { return -1 } kpath[pi] = b if b == 0 { nul_found = true break } pi += 1 } if !nul_found { return -1 } // not NUL-terminated within the buffer const raw_path = std.mem.span(#as([*:0]u8, #ptrCast(kpath))) // Relative paths (no leading '/') are joined against current.cwd // and `.` / `..` collapsed via the host-tested helper in // src/path.zig; absolute paths pass through. Still pre-teardown // (the VFS open below is the next failable step), so an oversize // join returns -1 with the caller intact. var path []u8 = undefined if raw_path.len > 0 && raw_path[0] == '/' { path = raw_path } else { const cwd_slice = std.mem.sliceTo(#as([*:0]u8, #ptrCast(&c.cwd)), 0) path = path_mod.joinResolve(cwd_slice, raw_path, &exec_join_buf) orelse return -1 } // 2. Copy argv in: walk the NULL-terminated user pointer array, copy // each NUL-terminated string into arg_storage, build kernel slices. // Bounded by MAX_ARGV count and MAX_ARGV_BYTES total; any // fault/overflow → -1 (still pre-teardown). const slices = &exec_argv_slices var argc usize = 0 var store_off usize = 0 if argv_ptr != 0 { while true { if argc >= MAX_ARGV { return -1 } var p u64 = 0 if copy_from_user(#ptrCast(&p), argv_ptr + argc * 8, 8) < 0 { return -1 } if p == 0 { break } const start = store_off while true { var b u8 = 0 if copy_from_user(#ptrCast(&b), p + (store_off - start), 1) < 0 { return -1 } if b == 0 { break } if store_off >= MAX_ARGV_BYTES { return -1 } arg_storage[store_off] = b store_off += 1 } slices[argc] = arg_storage[start..store_off] argc += 1 } } // 3. Serialise the argv block (lands in argv_scratch, a static that // survives the teardown below). Soft-fail → -1. const blk = encodeArgvBlock(user_layout.STACK_TOP, argc, slices) orelse return -1 // 4. Resolve the path through the VFS shim (preempt-guarded like // sys_openFile:208-210). Backend miss → -1. var open_result vfs.OpenResult = .{} preempt_disable() const sb_opt = vfs.vfs_open(path, &open_result) preempt_enable() const sb = sb_opt orelse return -1 // Permission gate: exec-intent check against the caller's // effective ids. Still pre-teardown, so a denied exec soft-fails to // -EACCES with the caller's address space intact — same contract as // the path/argv faults above. (A check after the teardown would // zombie the task instead of returning.) if !perm.checkAccess( open_result.mode, open_result.uid, open_result.gid, c.euid, c.egid, .exec ) { return -defs.EACCES } if open_result.size > MAX_EXEC_BYTES { return -1 } // 5. Stream the whole file into exec_buf via a local stack File (no // file_mod.alloc → no page → baseline-neutral). preempt-guard per // read on the unified read path; EOF (n == 0) ends the loop. var f task_layout.File = .{} f.private = open_result.private f.size = open_result.size f.offset = 0 var off usize = 0 // Hold preemption disabled across the ENTIRE fill, not per chunk: // exec_buf is a shared kernel static, so a timer preempt between // chunks could schedule a second task into execveKernel that // overwrites the same buffer mid-stream → corrupted image. preempt is // a counter, so every exit path below re-balances exactly once. preempt_disable() while off < MAX_EXEC_BYTES { const take u64 = MAX_EXEC_BYTES - off const n = vfs.vfs_read(sb, &f, exec_buf[off..].ptr, take) if n < 0 { preempt_enable() return -1 } if n == 0 { break } off += #intCast(n) } preempt_enable() const file_size u64 = off // ELF magic gate: reject a non-ELF file. Still pre-teardown. const is_elf = file_size >= 4 && exec_buf[0] == 0x7F && exec_buf[1] == 'E' && exec_buf[2] == 'L' && exec_buf[3] == 'F' if !is_elf { return -1 } // vfs_close is inert for initramfs but call it for backend symmetry. preempt_disable() vfs.vfs_close(sb, &f) preempt_enable() // 6. POINT OF NO RETURN — tear down the caller's address space. // Nothing below can soft-fail. // c.fds is deliberately NOT touched: POSIX execve preserves the // fd table so a shell can hand a child its redirected stdio. // c.uid/gid/euid/egid are likewise preserved (the same TaskStruct // survives the image swap), so a privilege drop done in /bin/login // before execve carries into the shell. Only mm pages + pgd go away. var i usize = 0 while i < task_layout.MAX_PAGE_COUNT { const pa = c.mm.user_pages[i].pa if pa != 0 { free_page(pa) } c.mm.user_pages[i] = .{} i += 1 } i = 0 while i < task_layout.MAX_PAGE_COUNT { const kp = c.mm.kernel_pages[i] if kp != 0 { free_page(kp) } c.mm.kernel_pages[i] = 0 i += 1 } c.mm.pgd = 0 // 7. Hand off to the argv-aware loader: PT_LOAD map + eager stack + // argv memcpy + x0/x1/sp + set_pgd. Returns 0 (eret jumps to // e_entry, so the caller's post-svc PC is unreachable) or -1. blk is // a stack local — the trampoline derefs it by value immediately, and // blk.bytes points into argv_scratch (static). const rc = move_to_user_elf_argv(#intFromPtr(&exec_buf), file_size, #intFromPtr(&blk)) if rc < 0 { // Past the point of no return: the address space is already torn // down (pgd == 0), so the caller cannot resume. A loader -1 here is // OOM (allocate_user_page exhausted mid-PT_LOAD / stack). Emit the // marker and zombie the task. exit_process never returns. main_output(MU, "[KERN] OOM\n") exit_process() } // Success: the eret jumps to e_entry, so this "return value" is never // read by the (now-replaced) caller. Instead ret_from_syscall // (arch/aarch64/entry.S) does `str x0, [sp, 0]` AFTER the loader runs, storing // this value into the saved-x0 slot — which becomes the new program's // x0. The AAPCS64 entry contract is x0 = argc, so success MUST return // argc: the loader's `regs.regs[0] = argc` frame write is otherwise // clobbered by that str (x1 = argv survives — ret_from_syscall touches // only x0). argc <= MAX_ARGV (32), so the i32 cast cannot truncate. return #intCast(argc) } // Kernel-side scratch buffer the encoder serialises into. Single- // threaded exec path + sequential host tests, so a module-level buffer // is safe; prepare_move_to_user_elf copies the returned slice into the // top stack page before any reuse. var argv_scratch [MAX_ARGV_BYTES]u8 = undefined // Lay out the argv block (pointer array + NUL-terminated strings) for a // fresh user stack, high → low inside the top stack page: // // top_stack_uva ← exclusive end of the mapped page // NULL guard (8 B) // argv[argc-1] string … argv[0] string (NUL-terminated, packed) // NULL terminator (8 B, == argv[argc]) // argv[argc-1] ptr … argv[0] ptr (8 B each, UVA into strings) // ← sp == argv_uva == &argv[0] // // The returned `bytes` are the serialised image whose lowest byte lands // at top_stack_uva - bytes.len; prepare_move_to_user_elf memcpys it into // the page's KVA alias at offset PAGE_SIZE - bytes.len. Pointers are // computed as user VAs against that final placement, so `top_stack_uva` // must be the user VA of the top of the stack page (STACK_TOP), not the // kernel alias. sp is 16-byte aligned per AAPCS64 (STACK_TOP is page- // aligned, so aligning the total length to 16 suffices). // // Returns null on a soft fault: more than MAX_ARGV strings, or a total // image larger than MAX_ARGV_BYTES (callers turn this into a clean -1 // rather than a half-built stack). pub fn encodeArgvBlock( top_stack_uva u64, argc usize, kargv [*][]u8 ) ?ArgvBlock { if argc > MAX_ARGV { return null } // String bytes = each arg plus its NUL terminator. Bail early if the // strings alone blow the budget (guards against usize overflow on a // pathological length too). var str_bytes usize = 0 var i usize = 0 while i < argc { str_bytes += kargv[i].len + 1 if str_bytes > MAX_ARGV_BYTES { return null } i += 1 } // Region sizes. The pointer array is argc entries; argv[argc] NULL // terminator and the top NULL guard are 8 B each. const ptr_bytes = argc * 8 const core = ptr_bytes + 8 + str_bytes + 8 const total = std.mem.alignForward(usize, core, 16) if total > MAX_ARGV_BYTES { return null } // scratch[0] is the lowest byte → final user VA top_stack_uva - total. const base_uva = top_stack_uva - total #memset(argv_scratch[0..total], 0) // Pointer array at [0, ptr_bytes); argv[argc] NULL at [ptr_bytes, // ptr_bytes+8) is left zero. Strings packed ascending from there, // argv[0] lowest. Each pointer is the user VA of its string. var str_off usize = ptr_bytes + 8 i = 0 while i < argc { const s = kargv[i] std.mem.writeInt(u64, argv_scratch[i * 8 ..][0..8], base_uva + str_off, .little) #memcpy(argv_scratch[str_off..][0..s.len], s) argv_scratch[str_off + s.len] = 0 str_off += s.len + 1 i += 1 } // [str_off, total) is the NULL guard + 16-byte alignment pad, already // zeroed by the memset above. return .{ .sp = base_uva, .argv_uva = base_uva, .argc = argc, .bytes = argv_scratch[0..total] } } // ---- Host Tests ---- const testing = std.testing // Page-aligned top-of-stack user VA for layout assertions (the real // call site passes user_layout.STACK_TOP, itself page-aligned). const TEST_TOP u64 = 0x0000_0FFF_FFFF_F000 const TEST_PAGE u64 = 1 << 12 // Resolve argv[i] back to its string by walking the encoded image: the // pointer is a user VA whose offset from base (== block start) indexes // straight into `bytes`. fn argAt(blk ArgvBlock, i usize) []u8 { const p = std.mem.readInt(u64, blk.bytes[i * 8 ..][0..8], .little) const off usize = #intCast(p - blk.sp) return std.mem.sliceTo(#as([*:0]u8, #ptrCast(&blk.bytes[off])), 0) } test "execve: encodeArgvBlock lays out argc=3" { const kargv = [_][]u8{ "argv_echo", "A", "B" } const blk = encodeArgvBlock(TEST_TOP, kargv.len, &kargv) orelse return error.UnexpectedNull try testing.expectEqual(#as(u64, 3), blk.argc) try testing.expectEqual(blk.sp, blk.argv_uva) try testing.expectEqual(#as(u64, 0), blk.sp % 16) // Block sits entirely inside the top stack page and butts STACK_TOP. try testing.expectEqual(TEST_TOP, blk.sp + blk.bytes.len) try testing.expect(blk.sp >= TEST_TOP - TEST_PAGE) try testing.expectEqualStrings("argv_echo", argAt(blk, 0)) try testing.expectEqualStrings("A", argAt(blk, 1)) try testing.expectEqualStrings("B", argAt(blk, 2)) // argv[argc] is the NULL terminator. try testing.expectEqual(#as(u64, 0), std.mem.readInt(u64, blk.bytes[3 * 8 ..][0..8], .little)) } test "execve: encodeArgvBlock empty argv is a lone NULL" { const kargv = [_][]u8{} const blk = encodeArgvBlock(TEST_TOP, 0, &kargv) orelse return error.UnexpectedNull try testing.expectEqual(#as(u64, 0), blk.argc) try testing.expectEqual(blk.sp, blk.argv_uva) try testing.expectEqual(#as(u64, 0), blk.sp % 16) // argv[0] is immediately NULL: argc=0 + a NULL-terminated empty array. try testing.expectEqual(#as(u64, 0), std.mem.readInt(u64, blk.bytes[0..8], .little)) } test "execve: encodeArgvBlock rejects more than MAX_ARGV strings" { var kargv [MAX_ARGV + 1][]u8 = undefined for *s in &kargv { s.* = "x" } try testing.expectEqual(#as(?ArgvBlock, null), encodeArgvBlock(TEST_TOP, kargv.len, &kargv)) } test "execve: encodeArgvBlock rejects oversize byte budget" { var big [MAX_ARGV_BYTES]u8 = undefined const kargv = [_][]u8{big[0..]} try testing.expectEqual(#as(?ArgvBlock, null), encodeArgvBlock(TEST_TOP, kargv.len, &kargv)) } test "execve: encodeArgvBlock keeps sp 16-aligned for odd lengths" { // Lengths chosen so the unaligned `core` size is not a multiple of 16. const kargv = [_][]u8{ "abc", "de" } const blk = encodeArgvBlock(TEST_TOP, kargv.len, &kargv) orelse return error.UnexpectedNull try testing.expectEqual(#as(u64, 0), blk.sp % 16) try testing.expectEqual(TEST_TOP, blk.sp + blk.bytes.len) try testing.expectEqualStrings("abc", argAt(blk, 0)) try testing.expectEqualStrings("de", argAt(blk, 1)) } test "execve: encodeArgvBlock pointers stay inside the stack page" { const kargv = [_][]u8{ "one", "two", "three" } const blk = encodeArgvBlock(TEST_TOP, kargv.len, &kargv) orelse return error.UnexpectedNull var i usize = 0 while i < blk.argc { const p = std.mem.readInt(u64, blk.bytes[i * 8 ..][0..8], .little) try testing.expect(p >= TEST_TOP - TEST_PAGE) try testing.expect(p < TEST_TOP) i += 1 } }