#!/usr/bin/env python3
"""Split IR/MIR dumps from LLVM debug output into separate files.

This script parses log files generated by LLVM's -print-before-all or
-print-after-all options and splits them into individual IR (.ll) or
MIR (.mir) files.

Supported header formats:
- LLVM IR (legacy): *** IR Dump After/Before {pass} on {target} ***
- LLVM IR (new):    *** IR Dump After/Before {pass} ({short_name}) ***
- MIR:              # *** IR Dump After/Before {pass} ({short_name}) ***:
"""

from __future__ import annotations

import argparse
import logging
import re
import sys
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path
from typing import TextIO

# Named logger for this tool. main() only configures INFO-level output when
# --verbose is passed.
logger = logging.getLogger("split-ir")


class DumpType(Enum):
    """Kind of dump found in the log: LLVM IR or machine IR (MIR)."""

    LLVM_IR = auto()
    MIR = auto()

    @property
    def extension(self) -> str:
        """File suffix used when a dump of this kind is written out."""
        if self is DumpType.LLVM_IR:
            return ".ll"
        # Every other member (currently only MIR) is written as .mir.
        return ".mir"


class Direction(Enum):
    """Direction of the dump (before or after the pass).

    The member values are the lowercase words that DumpHeader.to_filename
    embeds in generated filenames.
    """

    BEFORE = "before"  # dump taken before the pass ran
    AFTER = "after"  # dump taken after the pass ran


@dataclass
class DumpHeader:
    """Structured form of a single dump banner line.

    Attributes:
        dump_type: Whether the dump is LLVM IR or MIR.
        direction: Whether the dump was taken before or after the pass.
        pass_name: Pass name as captured from the banner.
        target: Second captured banner field (the "on ..." target, or the
            parenthesised short name).
    """

    dump_type: DumpType
    direction: Direction
    pass_name: str
    target: str

    def to_filename(self, number: int) -> str:
        """Build the output filename for this dump.

        The name has the shape
        ``{number}-{direction}-{pass}-{target}{extension}``, where the pass
        and target parts are sanitized for use on a filesystem.

        Args:
            number: Sequential index of the dump within the log.

        Returns:
            The filename (no directory component).
        """
        stem = "-".join(
            (
                str(number),
                self.direction.value,
                self._sanitize(self.pass_name),
                self._sanitize(self.target),
            )
        )
        return stem + self.dump_type.extension

    @staticmethod
    def _sanitize(name: str) -> str:
        """Return *name* with filename-hostile characters squashed to ``_``.

        Whitespace and the characters ``< > : " / \\ | ? *`` each become an
        underscore, runs of underscores collapse to one, and underscores at
        either end are dropped.
        """
        underscored = re.sub(r'[<>:"/\\|?*\s]', "_", name)
        return re.sub(r"_+", "_", underscored).strip("_")


# Regex patterns for different header formats.
# Every pattern is anchored to the whole line and captures, in order:
#   group 1: "After" or "Before"
#   group 2: the pass name
#   group 3: the target (legacy format) or the parenthesised short name
#
# LLVM IR legacy format: *** IR Dump After {pass} on {target} ***
# Group 2 is greedy, so a line containing several " on " separators is split
# at the last one.
PATTERN_IR_LEGACY = re.compile(r"^\*\*\* IR Dump (After|Before) (.+) on (.+) \*\*\*$")

# LLVM IR new format: *** IR Dump After {pass} ({short_name}) ***
# The short name may not contain ")".
PATTERN_IR_NEW = re.compile(r"^\*\*\* IR Dump (After|Before) (.+) \(([^)]+)\) \*\*\*$")

# MIR format: # *** IR Dump After {pass} ({short_name}) ***:
# Same shape as the new IR format, but with a leading "# " and a trailing ":".
PATTERN_MIR = re.compile(r"^# \*\*\* IR Dump (After|Before) (.+) \(([^)]+)\) \*\*\*:$")


def parse_header(line: str) -> DumpHeader | None:
    """Recognise a dump banner line and turn it into a DumpHeader.

    Args:
        line: Candidate line with trailing whitespace already removed.

    Returns:
        A DumpHeader describing the banner, or None when the line matches
        none of the known banner formats.
    """
    # Patterns are tried in this order: MIR (distinguished by its "# "
    # prefix), then the legacy IR form with "on", then the new IR form with a
    # parenthesised short name. The first match wins.
    candidates = (
        (PATTERN_MIR, DumpType.MIR),
        (PATTERN_IR_LEGACY, DumpType.LLVM_IR),
        (PATTERN_IR_NEW, DumpType.LLVM_IR),
    )

    for pattern, kind in candidates:
        found = pattern.match(line)
        if found is None:
            continue

        when, pass_name, target = found.groups()

        # Only the legacy format spells the whole-module target as
        # "[module]"; normalize it to plain "module".
        if pattern is PATTERN_IR_LEGACY and target == "[module]":
            target = "module"

        return DumpHeader(
            dump_type=kind,
            direction=Direction.BEFORE if when == "Before" else Direction.AFTER,
            pass_name=pass_name,
            target=target,
        )

    return None


def write_dump(
    output_dir: Path, number: int, header: DumpHeader, contents: list[str]
) -> None:
    """Save one dump's lines to a file inside *output_dir*.

    The filename is derived from the header and the sequence number; an
    existing file with the same name is overwritten.

    Args:
        output_dir: Directory that receives the file.
        number: Sequential index of this dump.
        header: Parsed banner describing the dump.
        contents: The dump's lines, each still carrying its line ending.
    """
    filename = header.to_filename(number)

    logger.info(
        "Writing %s dump #%d: %s (%s) -> %s",
        header.dump_type.name,
        number,
        header.pass_name,
        header.target,
        filename,
    )

    # Lines keep their own terminators, so a plain concatenation restores
    # the original text.
    text = "".join(contents)
    (output_dir / filename).write_text(text)


def process_input(input_file: TextIO, output_dir: Path) -> int:
    """Split a log stream into one file per dump.

    Lines before the first recognised banner are discarded. A banner that is
    followed by no content lines produces no file and does not consume a
    sequence number.

    Args:
        input_file: Open text stream containing the LLVM log.
        output_dir: Directory that receives the split files.

    Returns:
        How many dump files were written.
    """
    written = 0
    active: DumpHeader | None = None
    body: list[str] = []

    for raw in input_file:
        text = raw.rstrip()

        # Cheap prefix test first; only plausible banners reach the regexes.
        candidate = (
            parse_header(text) if text.startswith(("***", "# ***")) else None
        )

        if candidate is None:
            # Ordinary line (or a "***" line that is not a banner): keep it
            # only when we are inside a dump.
            if active is not None:
                body.append(raw)
            continue

        # A new banner ends the dump in progress; flush it if it has content.
        if active is not None and body:
            write_dump(output_dir, written, active, body)
            written += 1
            body = []

        active = candidate

    # The last dump has no following banner to trigger its flush.
    if active is not None and body:
        write_dump(output_dir, written, active, body)
        written += 1

    return written


def main() -> int:
    """Parse command-line arguments and split the input log into dump files.

    Returns:
        Process exit status: 0 on success, 1 when the input file does not
        exist or the output path cannot be used as a directory.
    """
    parser = argparse.ArgumentParser(
        prog="split-ir",
        description=(
            "Split IR/MIR dumps from LLVM debug output into separate files. "
            "Supports both -print-before-all and -print-after-all output formats."
        ),
    )

    parser.add_argument(
        "-o",
        "--output-dir",
        type=Path,
        required=True,
        help="Output directory for split IR/MIR files",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        help="Enable verbose logging",
    )
    parser.add_argument(
        "input",
        type=Path,
        help="Input log file from LLVM -print-before-all or -print-after-all",
    )

    args = parser.parse_args()

    # Configure logging
    if args.verbose:
        logging.basicConfig(
            level=logging.INFO,
            format="%(name)s: %(message)s",
        )

    # Validate input file
    if not args.input.is_file():
        print(f"Error: Input file '{args.input}' does not exist", file=sys.stderr)
        return 1

    # Create output directory if needed. With exist_ok=True, mkdir still
    # raises FileExistsError when the path exists but is not a directory, and
    # NotADirectoryError when a parent component is a regular file. Report
    # both as a normal error instead of crashing with a traceback.
    try:
        args.output_dir.mkdir(parents=True, exist_ok=True)
    except (FileExistsError, NotADirectoryError):
        print(
            f"Error: Output path '{args.output_dir}' is not a directory",
            file=sys.stderr,
        )
        return 1

    # Process the input file
    with args.input.open("r") as f:
        count = process_input(f, args.output_dir)

    print(f"Split {count} dumps into '{args.output_dir}'")
    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())