# SkyPilot task template for autonomous code optimization experiments.
#
# Each experiment:
#   1. Installs dependencies (setup, runs once per VM)
#   2. Builds the project
#   3. Runs the benchmark (autoresearch.sh)
#   4. Runs correctness checks (autoresearch.checks.sh)
#   5. Reports METRIC lines and check results
#
# Usage:
#   sky launch -c vm-01 experiment.yaml \
#     --env EXPERIMENT_ID=exp-01 \
#     --env EXPERIMENT_DESC="add SIMD to hash lookup" \
#     -d -y
#
# The workdir is snapshotted at submission time, so parallel experiments
# with different code edits use --workdir to point at separate copies.
#
# Output protocol (lines printed by the run script):
#   EXPERIMENT_STATUS: crash | done | checks_failed
#   EXPERIMENT_ERROR:  <reason>            (only with status "crash")
#   CHECKS_STATUS:     passed | timeout | failed (exit N) | skipped (...)
#   METRIC <name>=<value>                  (copied from benchmark output)
#
# The run script exits 0 on every failure path it handles and reports the
# outcome through EXPERIMENT_STATUS rather than through its exit code.

resources:
  cpus: 4+
  memory: 8+
  # No GPU needed -- this is for CPU-bound code optimization.
  # Override with --gpus if your target needs GPU benchmarking.

workdir: .

envs:
  EXPERIMENT_ID: baseline
  EXPERIMENT_DESC: "baseline measurement"
  # Override these per-project via --env:
  # BUILD_CMD is executed with `bash -c`, so $(nproc) is meant to be
  # evaluated by that shell on the VM, not by YAML.
  BUILD_CMD: "make -j$(nproc)"
  # Timeouts are in seconds and are passed to `timeout`. Quoted so they are
  # always treated as strings, like any other environment variable.
  BENCH_TIMEOUT: "300"
  CHECK_TIMEOUT: "300"

setup: |
  # Install project-specific dependencies.
  # Runs once when the VM is first provisioned.
  cd ~/sky_workdir
  if [ -f setup_deps.sh ]; then
    echo ">>> Running setup_deps.sh..."
    bash setup_deps.sh
  else
    # Without a dependency script, fall back to running the build command.
    echo ">>> No setup_deps.sh found, running BUILD_CMD: ${BUILD_CMD}"
    bash -c "${BUILD_CMD}"
  fi

run: |
  cd ~/sky_workdir

  echo "========================================="
  echo "EXPERIMENT: ${EXPERIMENT_ID}"
  echo "DESC: ${EXPERIMENT_DESC}"
  echo "========================================="

  # --- Build ---
  echo ">>> Building..."
  # pipefail makes the pipeline below report the build's failure instead of
  # the exit status of `tail`, which only trims the log to its last 30 lines.
  set -o pipefail
  if ! bash -c "${BUILD_CMD}" 2>&1 | tail -30; then
    echo "EXPERIMENT_STATUS: crash"
    echo "EXPERIMENT_ERROR: build failed"
    exit 0  # exit 0 so SkyPilot marks the job as SUCCEEDED for log retrieval
  fi

  # --- Benchmark ---
  if [ ! -f autoresearch.sh ]; then
    echo "EXPERIMENT_STATUS: crash"
    echo "EXPERIMENT_ERROR: autoresearch.sh not found"
    exit 0
  fi

  echo ">>> Running benchmark (timeout: ${BENCH_TIMEOUT}s)..."
  # stdout and stderr are captured together so the METRIC lines can be
  # extracted afterwards; the full output is echoed on both success and
  # failure.
  BENCH_OUTPUT=$(timeout "${BENCH_TIMEOUT}" bash autoresearch.sh 2>&1) || {
    EXIT_CODE=$?
    echo "$BENCH_OUTPUT"
    echo "EXPERIMENT_STATUS: crash"
    # `timeout` exits with 124 when the time limit was reached.
    if [ "$EXIT_CODE" -eq 124 ]; then
      echo "EXPERIMENT_ERROR: benchmark timed out after ${BENCH_TIMEOUT}s"
    else
      echo "EXPERIMENT_ERROR: benchmark failed (exit $EXIT_CODE)"
    fi
    exit 0
  }

  echo "$BENCH_OUTPUT"

  # Extract METRIC lines ("METRIC <name>=<value>" at the start of a line).
  # POSIX bracket classes are used instead of the GNU-only \s and \S.
  # `|| true` keeps a no-match grep from being treated as a failure.
  METRICS=$(echo "$BENCH_OUTPUT" | grep -E '^METRIC[[:space:]]+[^[:space:]]+=[^[:space:]]+' || true)
  if [ -z "$METRICS" ]; then
    echo "EXPERIMENT_STATUS: crash"
    echo "EXPERIMENT_ERROR: no METRIC lines found in benchmark output"
    exit 0
  fi

  # --- Correctness checks ---
  # Checks are optional: a missing autoresearch.checks.sh counts as passed.
  CHECKS_PASSED=true
  if [ -f autoresearch.checks.sh ]; then
    echo ">>> Running correctness checks (timeout: ${CHECK_TIMEOUT}s)..."
    # The check output is only printed when the checks fail or time out.
    CHECK_OUTPUT=$(timeout "${CHECK_TIMEOUT}" bash autoresearch.checks.sh 2>&1) || {
      EXIT_CODE=$?
      echo "$CHECK_OUTPUT"
      if [ "$EXIT_CODE" -eq 124 ]; then
        echo "CHECKS_STATUS: timeout"
      else
        echo "CHECKS_STATUS: failed (exit $EXIT_CODE)"
      fi
      CHECKS_PASSED=false
    }
    if [ "$CHECKS_PASSED" = true ]; then
      echo "CHECKS_STATUS: passed"
    fi
  else
    echo "CHECKS_STATUS: skipped (no autoresearch.checks.sh)"
  fi

  # --- Report ---
  echo "========================================="
  echo "EXPERIMENT_ID: ${EXPERIMENT_ID}"
  echo "EXPERIMENT_DESC: ${EXPERIMENT_DESC}"
  echo "$METRICS"
  if [ "$CHECKS_PASSED" = true ]; then
    echo "EXPERIMENT_STATUS: done"
  else
    echo "EXPERIMENT_STATUS: checks_failed"
  fi
  echo "========================================="