#!/usr/bin/env python3
"""Split IR/MIR dumps from LLVM debug output into separate files.

This script parses log files generated by LLVM's -print-before-all or
-print-after-all options and splits them into individual IR (.ll) or MIR
(.mir) files.

Supported header formats:
  - LLVM IR (legacy): *** IR Dump After/Before {pass} on {target} ***
  - LLVM IR (new):    *** IR Dump After/Before {pass} ({short_name}) ***
  - MIR:              # *** IR Dump After/Before {pass} ({short_name}) ***:
"""

from __future__ import annotations

import argparse
import logging
import re
import sys
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path
from typing import TextIO

logger = logging.getLogger("split-ir")

# Most filesystems reject a single path component longer than 255 units, and
# dump targets can be very long mangled symbol names.
_MAX_FILENAME_LENGTH = 255

# The log is read and the dumps are written with the same codec and error
# handler. "surrogateescape" never fails on decode and restores the original
# bytes on encode, so odd bytes in a log pass through unchanged instead of
# aborting the run.
_TEXT_ENCODING = "utf-8"
_TEXT_ERRORS = "surrogateescape"

# Cheap prefix test applied to every line before the header regexes are tried.
_HEADER_PREFIXES = ("***", "# ***")


class DumpType(Enum):
    """Type of IR dump."""

    LLVM_IR = auto()
    MIR = auto()

    @property
    def extension(self) -> str:
        """Return the file extension for this dump type."""
        return ".ll" if self == DumpType.LLVM_IR else ".mir"


class Direction(Enum):
    """Direction of the dump (before or after the pass)."""

    BEFORE = "before"
    AFTER = "after"

    @classmethod
    def from_keyword(cls, keyword: str) -> Direction:
        """Map the header keyword ("Before" or "After") to a Direction.

        Anything other than "Before" maps to AFTER; the header regexes only
        ever capture one of those two words.
        """
        return cls.BEFORE if keyword == "Before" else cls.AFTER


@dataclass
class DumpHeader:
    """Parsed information from a dump header line."""

    dump_type: DumpType
    direction: Direction
    pass_name: str
    target: str

    def to_filename(self, number: int) -> str:
        """Generate a sanitized filename for this dump.

        Args:
            number: Sequential number of the dump; used as the name prefix.

        Returns:
            "{number}-{direction}-{pass}-{target}{extension}", with the part
            before the extension truncated so that the whole name is at most
            _MAX_FILENAME_LENGTH characters long.
        """
        safe_pass = self._sanitize(self.pass_name)
        safe_target = self._sanitize(self.target)
        extension = self.dump_type.extension
        stem = f"{number}-{self.direction.value}-{safe_pass}-{safe_target}"
        # Truncate the stem only: the extension must survive, and the numeric
        # prefix at the front keeps truncated names distinct.
        return stem[: _MAX_FILENAME_LENGTH - len(extension)] + extension

    @staticmethod
    def _sanitize(name: str) -> str:
        """Sanitize a string for use in a filename."""
        # Replace problematic characters with underscores
        sanitized = re.sub(r'[<>:"/\\|?*\s]', "_", name)
        # Collapse multiple underscores
        sanitized = re.sub(r"_+", "_", sanitized)
        # Remove leading/trailing underscores
        return sanitized.strip("_")


# Regex patterns for different header formats

# LLVM IR legacy format: *** IR Dump After {pass} on {target} ***
PATTERN_IR_LEGACY = re.compile(r"^\*\*\* IR Dump (After|Before) (.+) on (.+) \*\*\*$")

# LLVM IR new format: *** IR Dump After {pass} ({short_name}) ***
PATTERN_IR_NEW = re.compile(r"^\*\*\* IR Dump (After|Before) (.+) \(([^)]+)\) \*\*\*$")

# MIR format: # *** IR Dump After {pass} ({short_name}) ***:
PATTERN_MIR = re.compile(r"^# \*\*\* IR Dump (After|Before) (.+) \(([^)]+)\) \*\*\*:$")


def parse_header(line: str) -> DumpHeader | None:
    """Parse a dump header line and return structured information.

    Args:
        line: The line to parse (should be stripped of trailing whitespace).

    Returns:
        DumpHeader if the line matches a known pattern, None otherwise.
    """
    # Try MIR pattern first (more specific due to # prefix)
    if match := PATTERN_MIR.match(line):
        return DumpHeader(
            dump_type=DumpType.MIR,
            direction=Direction.from_keyword(match.group(1)),
            pass_name=match.group(2),
            target=match.group(3),
        )

    # Try LLVM IR legacy pattern (has "on" keyword)
    if match := PATTERN_IR_LEGACY.match(line):
        target = match.group(3)
        # Normalize [module] to just "module"
        if target == "[module]":
            target = "module"
        return DumpHeader(
            dump_type=DumpType.LLVM_IR,
            direction=Direction.from_keyword(match.group(1)),
            pass_name=match.group(2),
            target=target,
        )

    # Try LLVM IR new pattern (has parentheses for short name)
    if match := PATTERN_IR_NEW.match(line):
        return DumpHeader(
            dump_type=DumpType.LLVM_IR,
            direction=Direction.from_keyword(match.group(1)),
            pass_name=match.group(2),
            target=match.group(3),
        )

    return None


def write_dump(
    output_dir: Path, number: int, header: DumpHeader, contents: list[str]
) -> None:
    """Write a dump to a file.

    Args:
        output_dir: Directory to write the file to.
        number: Sequential number for the dump.
        header: Parsed header information.
        contents: Lines of content to write (each keeps its own line ending).
    """
    filename = header.to_filename(number)
    output_path = output_dir / filename
    logger.info(
        "Writing %s dump #%d: %s (%s) -> %s",
        header.dump_type.name,
        number,
        header.pass_name,
        header.target,
        filename,
    )
    output_path.write_text(
        "".join(contents), encoding=_TEXT_ENCODING, errors=_TEXT_ERRORS
    )


def process_input(input_file: TextIO, output_dir: Path) -> int:
    """Process the input file and split it into separate dump files.

    Lines before the first recognized header are discarded. Header lines are
    not copied into the output. A header that is followed by no content lines
    produces no file and does not consume a sequence number.

    Args:
        input_file: File object to read from.
        output_dir: Directory to write output files to.

    Returns:
        Number of dumps written.
    """
    number = 0
    current_header: DumpHeader | None = None
    contents: list[str] = []

    for line in input_file:
        stripped = line.rstrip()

        # Only lines with a header-like prefix are worth running the regexes on.
        if stripped.startswith(_HEADER_PREFIXES):
            header = parse_header(stripped)
            if header is not None:
                # Write previous dump if exists
                if current_header is not None and contents:
                    write_dump(output_dir, number, current_header, contents)
                    number += 1
                contents = []
                current_header = header
                continue

        # Accumulate content
        if current_header is not None:
            contents.append(line)

    # Write final dump
    if current_header is not None and contents:
        write_dump(output_dir, number, current_header, contents)
        number += 1

    return number


def main() -> int:
    """Main entry point.

    Returns:
        Process exit code: 0 on success, 1 when the input file is missing or
        the output path cannot be used as a directory.
    """
    parser = argparse.ArgumentParser(
        prog="split-ir",
        description=(
            "Split IR/MIR dumps from LLVM debug output into separate files. "
            "Supports both -print-before-all and -print-after-all output formats."
        ),
    )
    parser.add_argument(
        "-o",
        "--output-dir",
        type=Path,
        required=True,
        help="Output directory for split IR/MIR files",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        help="Enable verbose logging",
    )
    parser.add_argument(
        "input",
        type=Path,
        help="Input log file from LLVM -print-before-all or -print-after-all",
    )

    args = parser.parse_args()

    # Configure logging
    if args.verbose:
        logging.basicConfig(
            level=logging.INFO,
            format="%(name)s: %(message)s",
        )

    # Validate input file
    if not args.input.is_file():
        print(f"Error: Input file '{args.input}' does not exist", file=sys.stderr)
        return 1

    # Create output directory if needed. exist_ok=True only tolerates an
    # existing *directory*: if the path (or one of its parents) is a regular
    # file, mkdir still raises, so report that as a usage error instead of
    # letting the traceback escape.
    try:
        args.output_dir.mkdir(parents=True, exist_ok=True)
    except (FileExistsError, NotADirectoryError):
        output_is_dir = False
    else:
        output_is_dir = args.output_dir.is_dir()

    if not output_is_dir:
        print(
            f"Error: Output path '{args.output_dir}' is not a directory",
            file=sys.stderr,
        )
        return 1

    # Process the input file
    with args.input.open("r", encoding=_TEXT_ENCODING, errors=_TEXT_ERRORS) as f:
        count = process_input(f, args.output_dir)

    print(f"Split {count} dumps into '{args.output_dir}'")
    return 0


if __name__ == "__main__":
    sys.exit(main())