{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ohuujbmsz7",
   "metadata": {},
   "source": [
    "# Autoresearch Experiment Analysis\n",
    "\n",
    "Analysis of autonomous hyperparameter tuning results from `results.tsv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "v3r8c77lxhs",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "# Load the TSV (tab-separated, 5 columns: commit, val_bpb, memory_gb, status, description)\n",
    "df = pd.read_csv(\"results.tsv\", sep=\"\\t\")\n",
    "df[\"val_bpb\"] = pd.to_numeric(df[\"val_bpb\"], errors=\"coerce\")\n",
    "df[\"memory_gb\"] = pd.to_numeric(df[\"memory_gb\"], errors=\"coerce\")\n",
    "df[\"status\"] = df[\"status\"].str.strip().str.upper()\n",
    "\n",
    "print(f\"Total experiments: {len(df)}\")\n",
    "print(f\"Columns: {list(df.columns)}\")\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0v37bji707o",
   "metadata": {},
   "outputs": [],
   "source": [
    "counts = df[\"status\"].value_counts()\n",
    "print(\"Experiment outcomes:\")\n",
    "print(counts.to_string())\n",
    "\n",
    "n_keep = counts.get(\"KEEP\", 0)\n",
    "n_discard = counts.get(\"DISCARD\", 0)\n",
    "n_crash = counts.get(\"CRASH\", 0)\n",
    "n_decided = n_keep + n_discard\n",
    "if n_decided > 0:\n",
    "    print(f\"\\nKeep rate: {n_keep}/{n_decided} = {n_keep / n_decided:.1%}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "j887idiuu5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show all KEPT experiments (the improvements that stuck)\n",
    "kept = df[df[\"status\"] == \"KEEP\"].copy()\n",
    "print(f\"KEPT experiments ({len(kept)} total):\\n\")\n",
    "for i, row in kept.iterrows():\n",
    "    bpb = row[\"val_bpb\"]\n",
    "    desc = row[\"description\"]\n",
    "    print(f\" #{i:3d} bpb={bpb:.6f} mem={row['memory_gb']:.1f}GB {desc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "99l0xlw0lv",
   "metadata": {},
   "source": [
    "## Val BPB Over Time\n",
    "\n",
    "Track how the best (kept) val_bpb evolves as experiments progress. The running minimum shows the \"frontier\" -- the best result achieved so far."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79jh74veqg9",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(16, 8))\n",
    "\n",
    "# Filter out crashes for plotting\n",
    "valid = df[df[\"status\"] != \"CRASH\"].copy()\n",
    "valid = valid.reset_index(drop=True)\n",
    "\n",
    "baseline_bpb = valid.loc[0, \"val_bpb\"]\n",
    "\n",
    "# Only plot points at or below baseline, plus a small tolerance (the interesting region)\n",
    "below = valid[valid[\"val_bpb\"] <= baseline_bpb + 0.0005]\n",
    "\n",
    "# Plot discarded as faint background dots\n",
    "disc = below[below[\"status\"] == \"DISCARD\"]\n",
    "ax.scatter(disc.index, disc[\"val_bpb\"],\n",
    "           c=\"#cccccc\", s=12, alpha=0.5, zorder=2, label=\"Discarded\")\n",
    "\n",
    "# Plot kept experiments as prominent green dots\n",
    "kept_v = below[below[\"status\"] == \"KEEP\"]\n",
    "ax.scatter(kept_v.index, kept_v[\"val_bpb\"],\n",
    "           c=\"#2ecc71\", s=50, zorder=4, label=\"Kept\", edgecolors=\"black\", linewidths=0.5)\n",
    "\n",
    "# Running minimum step line\n",
    "kept_mask = valid[\"status\"] == \"KEEP\"\n",
    "kept_idx = valid.index[kept_mask]\n",
    "kept_bpb = valid.loc[kept_mask, \"val_bpb\"]\n",
    "running_min = kept_bpb.cummin()\n",
    "ax.step(kept_idx, running_min, where=\"post\", color=\"#27ae60\",\n",
    "        linewidth=2, alpha=0.7, zorder=3, label=\"Running best\")\n",
    "\n",
    "# Label each kept experiment with its description\n",
    "for idx, bpb in zip(kept_idx, kept_bpb):\n",
    "    desc = str(valid.loc[idx, \"description\"]).strip()\n",
    "    if len(desc) > 45:\n",
    "        desc = desc[:42] + \"...\"\n",
    "\n",
    "    ax.annotate(desc, (idx, bpb),\n",
    "                textcoords=\"offset points\",\n",
    "                xytext=(6, 6), fontsize=8.0,\n",
    "                color=\"#1a7a3a\", alpha=0.9,\n",
    "                rotation=30, ha=\"left\", va=\"bottom\")\n",
    "\n",
    "n_total = len(df)\n",
    "n_kept = len(df[df[\"status\"] == \"KEEP\"])\n",
    "ax.set_xlabel(\"Experiment #\", fontsize=12)\n",
    "ax.set_ylabel(\"Validation BPB (lower is better)\", fontsize=12)\n",
    "ax.set_title(f\"Autoresearch Progress: {n_total} Experiments, {n_kept} Kept Improvements\", fontsize=14)\n",
    "ax.legend(loc=\"upper right\", fontsize=9)\n",
    "ax.grid(True, alpha=0.2)\n",
    "\n",
    "# Y-axis: from just below best to just above baseline.\n",
    "# `best` is the lowest kept val_bpb, i.e. the end of the running-minimum frontier.\n",
    "best = kept_bpb.min()\n",
    "if pd.notna(best) and best < baseline_bpb:\n",
    "    margin = (baseline_bpb - best) * 0.15\n",
    "    ax.set_ylim(best - margin, baseline_bpb + margin)\n",
    "# Otherwise (no kept runs, or nothing beat the baseline) keep matplotlib's automatic limits.\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(\"progress.png\", dpi=150, bbox_inches=\"tight\")\n",
    "plt.show()\n",
    "print(\"Saved to progress.png\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ce48phivyou",
   "metadata": {},
   "source": [
    "## Summary Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "re1f8za8oj9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary stats\n",
    "kept = df[df[\"status\"] == \"KEEP\"].copy()\n",
    "baseline_bpb = df.iloc[0][\"val_bpb\"]\n",
    "best_bpb = kept[\"val_bpb\"].min()\n",
    "best_row = kept.loc[kept[\"val_bpb\"].idxmin()]\n",
    "\n",
    "print(f\"Baseline val_bpb: {baseline_bpb:.6f}\")\n",
    "print(f\"Best val_bpb: {best_bpb:.6f}\")\n",
    "print(f\"Total improvement: {baseline_bpb - best_bpb:.6f} ({(baseline_bpb - best_bpb) / baseline_bpb * 100:.2f}%)\")\n",
    "print(f\"Best experiment: {best_row['description']}\")\n",
    "print()\n",
    "\n",
    "# How many experiments to find each improvement\n",
    "print(\"Cumulative effort per improvement:\")\n",
    "kept_sorted = kept.reset_index()\n",
    "for i, (_, row) in enumerate(kept_sorted.iterrows()):\n",
    "    desc = str(row[\"description\"]).strip()\n",
    "    print(f\" Experiment #{row['index']:3d}: bpb={row['val_bpb']:.6f} {desc}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "oxri9h5c9gs",
   "metadata": {},
   "source": [
    "## Top Hits (Kept Experiments by Improvement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "q86hxu10djk",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each kept experiment's delta is measured vs the previous kept experiment's bpb\n",
    "# (since experiments are cumulative -- each one builds on the last kept state)\n",
    "kept = df[df[\"status\"] == \"KEEP\"].copy()\n",
    "kept[\"prev_bpb\"] = kept[\"val_bpb\"].shift(1)\n",
    "kept[\"delta\"] = kept[\"prev_bpb\"] - kept[\"val_bpb\"]\n",
    "\n",
    "# Drop baseline (no delta)\n",
    "hits = kept.iloc[1:].copy()\n",
    "\n",
    "# Sort by delta improvement (biggest first)\n",
    "hits = hits.sort_values(\"delta\", ascending=False)\n",
    "\n",
    "print(f\"{'Rank':>4} {'Delta':>8} {'BPB':>10} Description\")\n",
    "print(\"-\" * 80)\n",
    "for rank, (_, row) in enumerate(hits.iterrows(), 1):\n",
    "    print(f\"{rank:4d} {row['delta']:+.6f} {row['val_bpb']:.6f} {row['description']}\")\n",
    "\n",
    "print(f\"\\n{'':>4} {hits['delta'].sum():+.6f} {'':>10} TOTAL improvement over baseline\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9bffe89",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}