#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "nbconvert>=7.0", # For converting .ipynb files
#     "pyperclip>=1.8", # For cross-platform clipboard access
#     "tiktoken>=0.7",  # For tokenization
#     "requests",       # For the self-update check
# ]
# ///

import sys
import subprocess
import platform
from pathlib import Path
import io
import logging
import argparse
import re
import os
import tempfile
from collections import deque
from itertools import islice

import requests

# --- Configuration ---
DEFAULT_UPDATE_URL = (
    "https://raw.githubusercontent.com/wkaisertexas/wkaisertexas.github.io/refs/heads/main/src/blog/personal-development-setup/cpp-ctx.py"
)

MAX_CSV_LINES = 10   # Max number of lines to show from CSV files
MAX_TOKENS = 24_000  # Max amount of context; leaves some headroom for the model's reasoning

EXCLUDED_DIRS = {  # Set of lowercase directory names to exclude
    'build', 'install', 'log', 'logs', '.git', '__pycache__', '.pytest_cache',
    'node_modules', '.venv', 'venv', 'env', '.env', 'dist',
    'bin',      # Often contains compiled outputs or scripts not meant as source
    'obj',      # Common intermediate build folder name
    '.vscode',  # Editor-specific config
    '.idea',    # Editor-specific config
}

# File types and their priorities (lower number = higher priority)
FILE_PRIORITIES = {
    "CMakeLists.txt": 0,
    ".md": -1,
    ".cmake": 1,
    ".sh": 2,
    "Makefile": 3,
    ".py": 10,
    ".ipynb": 10,
    ".cpp": 20, ".hpp": 20,
    ".c": 20, ".h": 20,
    ".cc": 20, ".hh": 20,
    ".cu": 20, ".cuh": 20,
    ".rs": 21,
    ".csv": 30,
    ".action": 40, ".msg": 40, ".srv": 40,
    ".tsx": 50, ".ts": 50,
    ".jsx": 51, ".js": 51,
    # Add other build system files if needed
}

# Whether a file is a header or an implementation
HEADER_EXTS = {'.h', '.hpp', '.hh', '.cuh'}
IMPL_EXTS = {'.c', '.cpp', '.cc', '.cu'}

# Mapping of file extensions to markdown language identifiers
LANG_MAP = {
    ".md": "md",
    ".py": "python",
    ".ipynb": "python",  # After conversion
    ".sh": "sh",
    "Makefile": "make",
    ".cpp": "cpp", ".hpp": "cpp",
    ".c": "c", ".h": "c",
    ".cc": "cpp", ".hh": "cpp",
    ".cu": "cuda", ".cuh": "cuda",
    ".cmake": "cmake",
    ".rs": "rust",
    "CMakeLists.txt": "cmake",
    ".csv": "csv",
    ".action": "console", ".msg": "console", ".srv": "console",
    ".tsx": "tsx", ".ts": "typescript",
    ".jsx": "tsx", ".js": "javascript",
    # Add other mappings as needed
}

MAX_LINES_PARAMETER = 600  # Files longer than this are truncated to HEAD_LINES
HEAD_LINES = 10

# --- Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')


def self_update() -> None:
    """Fetches the latest published version of this script, swaps it in, and re-execs."""
    if os.environ.get("CPP_CTX_NO_UPDATE"):
        logging.debug("Self-update: disabled by environment.")
        return

    me = Path(__file__).resolve()
    if not me.exists() or not me.is_file() or "wkaisertexas.github.io" in str(me):
        logging.debug("Self-update: current path is not a regular file; skipping.")
        return

    try:
        r = requests.get(DEFAULT_UPDATE_URL, timeout=10.0)
        r.raise_for_status()
        new_bytes = r.content
    except requests.RequestException as e:
        logging.warning(f"Self-update fetch failed with error {e}")
        return

    try:
        current_bytes = me.read_bytes()
    except Exception as e:
        logging.debug(f"Self-update: couldn't read current script for comparison: {e}")
        current_bytes = None

    if current_bytes is not None and current_bytes == new_bytes:
        logging.debug("Self-update: already up to date; no replacement needed.")
        return

    # Write the new version next to the current script, then atomically swap it in
    with tempfile.NamedTemporaryFile("wb", delete=False, dir=me.parent) as tf:
        tf.write(new_bytes)
        tf.flush()
        os.fsync(tf.fileno())
        tmp = Path(tf.name)
    os.chmod(tmp, me.stat().st_mode)
    os.replace(tmp, me)

    # Re-exec the updated script with the same arguments, guarding against update loops
    env = os.environ.copy()
    env["CPP_CTX_NO_UPDATE"] = "1"
    os.execve(sys.executable, [sys.executable, str(me), *sys.argv[1:]], env)
    sys.exit(0)  # Unreachable; execve replaces the process image
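
# Usage note (illustrative command, not executed here): the guard variable checked
# above also lets a single run skip the self-update entirely:
#   CPP_CTX_NO_UPDATE=1 uv run cpp-ctx.py .
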
try:
    import tiktoken
    tiktoken_encoder = tiktoken.get_encoding("cl100k_base")
except ModuleNotFoundError:
    tiktoken_encoder = None
    logging.warning("tiktoken not installed; per-file token counts will be skipped.")


# --- Helper Functions ---

def get_file_priority(path: Path) -> int:
    """Determines the sort priority of a file."""
    # Extension-less names (CMakeLists.txt, Makefile) are keyed by full name
    if path.name in FILE_PRIORITIES:
        return FILE_PRIORITIES[path.name]
    return FILE_PRIORITIES.get(path.suffix.lower(), 99)


def get_markdown_language(path: Path) -> str:
    """Gets the markdown language identifier for a file."""
    if path.name in LANG_MAP:
        return LANG_MAP[path.name]
    return LANG_MAP.get(path.suffix.lower(), "")


def convert_notebook_to_script(notebook_path: Path) -> str | None:
    """Converts an .ipynb notebook to a Python script string using nbconvert."""
    logging.info(f"Converting notebook: {notebook_path}")
    try:
        # Use the python executable managed by uv/the script runner
        python_executable = sys.executable
        result = subprocess.run(
            [python_executable, "-m", "nbconvert", "--to", "script", "--stdout", str(notebook_path)],
            capture_output=True,
            text=True,
            check=True,
            encoding='utf-8'
        )
        logging.info(f"Successfully converted {notebook_path}")
        return result.stdout
    except FileNotFoundError:
        logging.error("Error: 'nbconvert' command not found. Is it installed in the uv environment?")
        return None
    except subprocess.CalledProcessError as e:
        logging.error(f"Error converting notebook {notebook_path}:")
        logging.error(f"Stderr: {e.stderr}")
        return None
    except Exception as e:
        logging.error(f"An unexpected error occurred during notebook conversion {notebook_path}: {e}")
        return None


# Match single-line and block comments that mention TODO (case-insensitive)
strip_content_single = re.compile(r"//[^\n]*\bTODO\b[^\n]*\n?", flags=re.IGNORECASE)
strip_content_block = re.compile(r"/\*[^*]*?\bTODO\b[\s\S]*?\*/", flags=re.IGNORECASE | re.DOTALL)


def strip_content(content: str) -> str:
    """Strips TODO comments (single-line and block) using the regexes above."""
    content = strip_content_single.sub("", content)
    content = strip_content_block.sub("", content)
    return content
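
# A minimal sketch of what strip_content removes; the inputs are made-up strings,
# not from any real file:
#   strip_content("int x = 0; // TODO: tune this\n")  ->  "int x = 0; "
#   strip_content("/* TODO refactor */ int y;\n")     ->  " int y;\n"
#   strip_content("int z; // keep this note\n")       ->  unchanged (no TODO)
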
def read_file_content(path: Path) -> tuple[str | None, int]:
    """Reads content based on file type. Returns (content, token_count)."""
    content = None
    try:
        if path.suffix.lower() == ".ipynb":
            content = convert_notebook_to_script(path)
        elif path.suffix.lower() == ".csv":
            # Keep only the first MAX_CSV_LINES lines; islice stops cleanly at EOF
            with path.open('r', encoding='utf-8', errors='ignore') as f:
                content = "".join(islice(f, MAX_CSV_LINES))
                if f.readline():  # More lines remain beyond the cap
                    content += "\n... (truncated)\n"
        else:
            # For text files, attempt to read as UTF-8, ignoring undecodable bytes
            with path.open('r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()
            if len(lines) > MAX_LINES_PARAMETER:
                content = "".join(lines[:HEAD_LINES]) + "\n... (truncated)\n"
            else:
                content = "".join(lines)
    except FileNotFoundError:
        logging.error(f"File not found during read: {path}")
        return None, 0
    except UnicodeDecodeError:
        logging.warning(f"Could not decode file {path} as UTF-8. Skipping binary or incompatible file.")
        return None, 0  # Skip files that cannot be decoded
    except Exception as e:
        logging.error(f"Error reading file {path}: {e}")
        return None, 0

    # Strip TODO comments from C/C++/CUDA sources and headers
    if content is not None and path.suffix.lower() in HEADER_EXTS | IMPL_EXTS:
        content = strip_content(content)

    token_count = len(tiktoken_encoder.encode_ordinary(content)) if (tiktoken_encoder and content) else 0
    logging.info(f"Reading file ({token_count:7,}): {path}")
    return content, token_count


def get_git_submodule_dirs(repo_root: Path) -> set[Path]:
    """
    Return the absolute paths of every Git sub-module declared in .gitmodules.

    Cost: O(lines in .gitmodules); run once.
    """
    gm = repo_root / ".gitmodules"
    if not gm.exists():
        return set()

    sub_dirs: set[Path] = set()
    with gm.open("r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if line.startswith("path"):
                # line: path = some/dir
                _, path_str = line.split("=", 1)
                sub_dirs.add((repo_root / path_str.strip()).resolve())
    return sub_dirs
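
# A sketch of the .gitmodules shape the parser above expects (hypothetical entries):
#   [submodule "extern/fmt"]
#       path = extern/fmt
#       url = https://github.com/fmtlib/fmt.git
# Only the "path = ..." lines matter here; each one becomes a skipped root unless
# --submodules is passed.
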
def copy_to_clipboard(text: str):
    """Copies text to the system clipboard."""
    try:
        import pyperclip
        pyperclip.copy(text)
        logging.info("Content successfully copied to clipboard using pyperclip.")
    except ImportError:
        logging.warning("pyperclip not found. Attempting system-specific commands.")
        system = platform.system()
        try:
            if system == "Darwin":  # macOS
                process = subprocess.Popen('pbcopy', env={'LANG': 'en_US.UTF-8'}, stdin=subprocess.PIPE)
                process.communicate(text.encode('utf-8'))
                logging.info("Content copied to clipboard using pbcopy (macOS).")
            elif system == "Linux":
                # Try Wayland's wl-copy first
                try:
                    process = subprocess.Popen(['wl-copy'], stdin=subprocess.PIPE)
                    process.communicate(text.encode('utf-8'))
                    logging.info("Content copied to clipboard using wl-copy (Wayland).")
                except FileNotFoundError:
                    logging.warning("wl-copy not found. Trying xclip (X11).")
                    # Fall back to X11's xclip
                    try:
                        process = subprocess.Popen(['xclip', '-selection', 'clipboard'], stdin=subprocess.PIPE)
                        process.communicate(text.encode('utf-8'))
                        logging.info("Content copied to clipboard using xclip (X11).")
                    except FileNotFoundError:
                        logging.error("Error: Neither wl-copy nor xclip found on Linux. Cannot copy to clipboard.")
                        print("\n--- SCRIPT OUTPUT (Could not copy to clipboard) ---")
                        print(text)
                        print("--- END SCRIPT OUTPUT ---")
                        return  # Avoid further error messages
            else:
                logging.error(f"Clipboard access not supported on this platform ({system}) via subprocess.")
                print("\n--- SCRIPT OUTPUT (Could not copy to clipboard) ---")
                print(text)
                print("--- END SCRIPT OUTPUT ---")
        except Exception as e:
            logging.error(f"Error copying to clipboard using system command: {e}")
            print("\n--- SCRIPT OUTPUT (Error copying to clipboard) ---")
            print(text)
            print("--- END SCRIPT OUTPUT ---")


# --- Argument Parsing ---
def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Scans a directory recursively for relevant files (excluding specific folders), "
                    "formats their content, and copies it to the clipboard.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "target_dir",
        nargs="?",
        default=".",
        help="The target directory to scan recursively."
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable debug logging."
    )
    parser.add_argument(
        "--submodules",
        action="store_true",
        help="Include files that live inside Git sub-modules."
    )
    # Example for future extension: allow adding exclusions
    # parser.add_argument(
    #     "--exclude-dir",
    #     action="append",
    #     default=[],
    #     help="Add a directory name to exclude (case-insensitive). Can be used multiple times."
    # )
    return parser.parse_args()
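
# Example invocations, assuming the script is saved as cpp-ctx.py:
#   uv run cpp-ctx.py                  # scan the current directory
#   uv run cpp-ctx.py ~/my-project -v  # verbose scan of another directory
#   uv run cpp-ctx.py . --submodules   # also descend into Git sub-modules
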
def main() -> None:
    self_update()
    args = parse_arguments()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    target_dir_path = Path(args.target_dir)
    if not target_dir_path.is_dir():
        logging.error(f"Error: Target directory '{target_dir_path}' not found or is not a directory.")
        sys.exit(1)

    target_dir = target_dir_path.resolve()  # Use absolute path
    submodule_roots = get_git_submodule_dirs(target_dir)

    # --- File Scanning (Recursive BFS with Exclusions) ---
    files_to_process = []
    queue = deque([target_dir])   # BFS queue seeded with the resolved target directory
    visited_dirs = {target_dir}   # Track visited directories to avoid cycles

    logging.info(f"Scanning directory recursively: {target_dir}")
    logging.info(f"Excluding directory names (case-insensitive): {', '.join(sorted(EXCLUDED_DIRS))}")

    while queue:
        current_dir = queue.popleft()
        logging.debug(f"Scanning in: {current_dir}")
        try:
            # Sort items alphabetically within the directory for a consistent processing order
            items_in_dir = sorted(current_dir.iterdir(), key=lambda p: p.name)
            for item in items_in_dir:
                # 1. Skip symbolic links entirely to avoid complexity/cycles
                if item.is_symlink():
                    logging.debug(f"Skipping symbolic link: {item}")
                    continue

                # 2. Process directories
                if item.is_dir():
                    dir_name_lower = item.name.lower()
                    resolved_item = item.resolve()  # Resolve to handle relative paths and check visited

                    if dir_name_lower in EXCLUDED_DIRS:
                        logging.debug(f"Skipping excluded directory: {item}")
                        continue  # Skip this directory and its contents

                    if (not args.submodules) and resolved_item in submodule_roots:
                        logging.debug(f"Skipping sub-module: {item} (use --submodules to include)")
                        continue

                    if resolved_item not in visited_dirs:
                        visited_dirs.add(resolved_item)
                        queue.append(item)  # Add directory to queue for further scanning
                        logging.debug(f"Adding directory to scan queue: {item}")
                    else:
                        logging.debug(f"Already visited directory: {item}")

                # 3. Process files
                elif item.is_file():
                    # Check if it's a target file type (known full name or matching suffix)
                    if item.name in FILE_PRIORITIES or item.suffix.lower() in FILE_PRIORITIES:
                        files_to_process.append(item)  # Collect the file Path object
        except PermissionError:
            logging.warning(f"Permission denied accessing: {current_dir}")
        except FileNotFoundError:
            logging.warning(f"Directory not found during scan (might have been deleted): {current_dir}")
        except Exception as e:
            logging.error(f"Error scanning directory {current_dir}: {e}")

    # Sort collected files: priority first, then alphabetically by full relative path
    def sort_key(path_obj):
        priority = get_file_priority(path_obj)

        suffix = path_obj.suffix.lower()
        header_rank = 3
        if suffix in HEADER_EXTS:
            header_rank = 1
        if suffix in IMPL_EXTS:
            header_rank = 2

        try:
            # Ensure consistent sorting using POSIX-style relative paths
            relative_path_str = str(path_obj.relative_to(target_dir).as_posix())
        except ValueError:
            # Fallback if somehow not relative (shouldn't happen with current logic, but safe)
            relative_path_str = str(path_obj.as_posix())

        base_name = path_obj.stem.lower()
        is_source_like = suffix in HEADER_EXTS or suffix in IMPL_EXTS
        is_test_file = is_source_like and (
            "/test/" in relative_path_str
            or base_name.startswith("test_")
            or base_name.endswith("_test")
        )
        test_rank = 1 if is_test_file else 0

        # Headers sort directly before implementations with the same base name;
        # test files sort after non-test sources of the same priority
        return (priority, test_rank, base_name, header_rank, relative_path_str)

    sorted_files = sorted(files_to_process, key=sort_key)
    logging.info(f"Found {len(sorted_files)} relevant files across all scanned subdirectories.")

    # --- Build the output string ---
    output = io.StringIO()
    output.write(f"# Context for Folder: {target_dir} (Recursive Scan)\n\n")
    total_tokens = 0

    for file_path in sorted_files:
        try:
            # Use POSIX paths for consistency in the output markdown
            relative_path = file_path.relative_to(target_dir).as_posix()
        except ValueError:
            logging.warning(f"Could not determine relative path for {file_path} relative to {target_dir}. Using absolute path.")
            relative_path = file_path.as_posix()

        logging.debug(f"Processing: {relative_path} (Priority: {get_file_priority(file_path)})")
        content, num_tokens = read_file_content(file_path)

        if content is None:
            logging.warning(f"Skipping file due to read/conversion error or incompatible encoding: {relative_path}")
            continue

        if num_tokens + total_tokens > MAX_TOKENS:
            logging.warning(f"Out of tokens, skipping file: {relative_path}")
            continue

        total_tokens += num_tokens
        lang = get_markdown_language(file_path)
        output.write(f"## ./{relative_path}\n")  # Prepend ./ for clarity
        if lang != 'md':
            # Fence everything except markdown, which is embedded directly
            output.write(f"```{lang}\n")
            output.write(content.strip())  # Remove leading/trailing whitespace from content
            output.write("\n```\n\n")
        else:
            output.write(content.strip())
            output.write("\n\n")

    prefix = """
When adding to the codebase, do your best to use the existing codebase for inspiration and avoid commenting every single line. Avoid dependencies, as duplicated code can often be better than non-duplicated code in instances such as these.

---
"""

    final_output = prefix.strip() + "\n\n" + output.getvalue()
    output.close()

    logging.info(f"Total of {total_tokens:7,} tokens")

    # --- Copy to clipboard ---
    if final_output.strip():  # Check if there's non-whitespace content
        copy_to_clipboard(final_output)
    else:
        logging.info("No relevant files found or processed. Nothing to copy.")


if __name__ == "__main__":
    main()
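
# For reference, the clipboard payload assembled above looks roughly like this
# (file names and contents are illustrative):
#
#   When adding to the codebase, ...
#   ---
#   # Context for Folder: /abs/path/to/project (Recursive Scan)
#   ## ./CMakeLists.txt
#   ```cmake
#   ...
#   ```
#   ## ./src/main.cpp
#   ```cpp
#   ...
#   ```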