import Mathlib
import Langlib.Classes.RecursivelyEnumerable.Definition
import Langlib.Grammars.Unrestricted.Toolbox
import Langlib.Automata.Turing.Equivalence.TMToGrammar.Construction
import Langlib.Automata.Turing.Equivalence.TMToGrammar.Soundness

/-! # Soundness of the TM → Grammar Construction

This file proves that the grammar `tmToGrammar` constructed from a TM0 machine is
**sound**: it only generates words on which the TM halts.

## Proof strategy

We define a "terminal content" function `terminalContent` that extracts, from each symbol
in a sentential form, the original input character it represents. This function is
invariant under all simulation and cleanup grammar rules (but increases during generation).

We then define a forward invariant `GI` with five constructors:
- `initial`: the start symbol
- `generating`: generation phase
- `simulating`: simulation phase (tracks TM configuration correspondence)
- `cleanup`: post-halt cleanup (tracks `terminalContent = w` and TM halts on `w`)
- `done`: terminal-only form

The key insight is that `cleanup` uses `terminalContent` instead of tracking a derivation
to a terminal form. This avoids the non-confluence issue in the cleanup phase.
-/

open Turing TMtoGrammarNT Relation

/-- Every TM-recognizable language over a finite alphabet is generated by
an unrestricted grammar. -/
theorem tm_recognizable_implies_re
    {T : Type} [DecidableEq T] [Fintype T]
    (L : Language T) :
    is_TM L → is_RE L := by
  intro ⟨Λ, hΛ, hFin, M, hM⟩
  haveI : DecidableEq Λ := Classical.decEq Λ
  refine ⟨tmToGrammar T Λ M, ?_⟩
  ext w
  change grammar_generates (tmToGrammar T Λ M) w ↔ w ∈ L
  constructor
  · intro h; exact (hM w).mpr (tmToGrammar_halts_of_generates M w h)
  · intro h; exact tmToGrammar_generates_of_halts M w ((hM w).mp h)

theorem TM_subset_RE {T: Type} [DecidableEq T] [Fintype T] : (TM : Set (Language T)) ⊆ RE
  := by
  intro L hL
  have hL' := tm_recognizable_implies_re L
  exact hL' hL