module

public import Langlib.Automata.LinearBounded.Equivalence.LBAToCSG
public import Langlib.Automata.LinearBounded.Equivalence.LBAToCSG.Completeness
public import Langlib.Automata.LinearBounded.Equivalence.LBAToCSG.Soundness
public import Langlib.Automata.LinearBounded.Equivalence.CSGToLBA
public import Langlib.Automata.LinearBounded.Positive
public import Langlib.Automata.LinearBounded.Equivalence.EndmarkerTape
public import Langlib.Automata.LinearBounded.Equivalence.EndmarkerToFlag
public import Langlib.Classes.ContextSensitive.Definition
public import Langlib.Classes.ContextSensitive.Inclusion.RecursivelyEnumerable
public import Langlib.Classes.ContextSensitive.Inclusion.Recursive
public import Langlib.Classes.ContextSensitive.Closure.EmptyWord
import Mathlib.Tactic
@[expose]
public section


/-!
# Context-Sensitive Languages = LBA Languages (`CS = LBA`)

This file states the Myhill–Landweber–Kuroda theorem in its set-level form,

  `CS = LBA`  (`CS_eq_LBA`),

for the canonical **endmarker** LBA (`is_LBA`/`LBA`, `Automata/LinearBounded/Definition.lean`).

It is assembled in two layers:

* The genuinely space-bounded core, `is_LBA_pos_iff : is_LBA_pos L ↔ is_CS L ∧ [] ∉ L`: the
  marker-free `|w|`-cell model (`Automata/LinearBounded/Positive.lean`) recognizes exactly the
  **ε-free** context-sensitive languages. This is `myhill_language_eq` (Myhill's bisimulation, `→`,
  `Equivalence/LBAToCSG.lean`) together with the Kuroda construction restricted to the
  non-contracting core (`←`, `Equivalence/CSGToLBA.lean`).
* The empty word, carried by the **endmarker** model on the genuine two-cell tape `⊢⊣` — so the
  machine itself decides `ε`, no external flag. `CS_subset_LBA` runs the Kuroda LBA on `⊢ w ⊣` via
  `simMachine` (`Equivalence/EndmarkerTape.lean`); `LBA_subset_CS` folds an endmarker LBA onto
  `|w|` cells via `flagMachine` (`Equivalence/EndmarkerToFlag.lean`) and adjoins `ε` when accepted.

This makes the two classes *equal* (the endmarker's `⊢⊣` cell is exactly what upgrades "ε-free CSL"
to all of `CS`), rather than merely "equal up to `ε`".

## References

* Kuroda, S.-Y. (1964), "Classes of languages and linear-bounded automata".
* Myhill, J. (1960), "Linear bounded automata".
* Hopcroft, Motwani, Ullman, *Introduction to Automata Theory, Languages, and Computation*,
  Chapter 9.
-/

open MyhillConstruction

variable {T : Type}

/-! ## LBA ⊆ CS (Myhill) -/

/-- **Myhill's construction (core).** The context-sensitive grammar built from an LBA in
`LBAToCSG.lean` generates exactly the non-empty language recognised by the automaton.

This is the genuine content of the LBA → CS direction: the bisimulation between
cell-sentential-forms and LBA configurations (the grammar and its rule membership lemmas are
established in `LBAToCSG.lean`):

* **completeness** (`⊇`): an accepting LBA run on `w` is mirrored by a derivation
  `start ⇒* map terminal w` — the start rules lay down one `cell` per input symbol, the
  simulation rules track each LBA step (interior moves use the 3-step `cellPending`
  protocol), and the cleanup rules recover the terminals once an accepting state appears;
* **soundness** (`⊆`): conversely every terminal string derivable from `start` arises this
  way, by an invariant stating that every reachable sentential form of `cell`s decodes to a
  configuration reachable by `M` from the initial configuration on `w`. -/
theorem myhill_language_eq
    {Γ Λ : Type} [Fintype T] [Fintype Γ] [Fintype Λ]
    [DecidableEq T] [DecidableEq Γ] [DecidableEq Λ]
    (M : LBA.Machine Γ Λ) (embed : T ↪ Γ) :
    CS_language (myhillGrammar M embed) = LBA.LanguageViaEmbed M embed := by
  funext w
  apply propext
  constructor
  · -- soundness (⊆): every derived terminal word is accepted
    exact MyhillConstruction.myhill_sound M embed w
  · -- completeness (⊇): every accepted word is derived
    exact MyhillConstruction.myhill_complete M embed w

variable [Fintype T] [DecidableEq T]

/-! ## `LBA_pos` ⊆ ε-free CS (Myhill) -/

/-- Every bounded-tape LBA language is context-sensitive (Myhill's direction). -/
theorem is_LBA_pos_imp_isCS {L : Language T} (h : is_LBA_pos L) : is_CS L := by
  obtain ⟨Γ, Λ, hΓ, hΛ, hdΓ, hdΛ, M, rfl⟩ := h
  haveI := hΓ; haveI := hΛ; haveI := hdΓ; haveI := hdΛ
  let emb : T ↪ Option (T ⊕ Γ) := ⟨fun t => some (Sum.inl t), fun _ _ h => by simpa using h⟩
  rw [show LBA.LanguageViaEmbed M (fun t => some (Sum.inl t))
        = CS_language (myhillGrammar M emb) from (myhill_language_eq M emb).symm]
  exact is_CS_via_csg_implies_is_CS ⟨_, rfl⟩

/-- A bounded-tape LBA never runs on the empty input, so its language is ε-free. -/
theorem not_nil_mem_of_is_LBA_pos {L : Language T} (h : is_LBA_pos L) : [] ∉ L := by
  obtain ⟨Γ, Λ, hΓ, hΛ, hdΓ, hdΛ, M, rfl⟩ := h
  rintro ⟨hne, -⟩
  simp at hne

/-! ## ε-free CS ⊆ `LBA_pos` (Kuroda)

Given a context-sensitive `L` with `[] ∉ L`, restrict to the non-contracting core
(`Grammars/NonContracting`) with finitely many nonterminals
(`Grammars/ContextSensitive/Basic/FiniteNT.lean`), whose language is `L \ {[]} = L`. On input `w`
of length `n` the Kuroda LBA carries the frozen input and a working sentential form over the finite
grammar-symbol alphabet, nondeterministically runs the grammar backwards, and accepts once the form
reduces to `[S]`. Non-contraction bounds every form of a derivation of `w` by `n`, so the `|w|`-cell
tape suffices. -/
theorem is_LBA_pos_of_isCS_not_nil {L : Language T} (hCS : is_CS L) (hne : [] ∉ L) :
    is_LBA_pos L := by
  classical
  obtain ⟨g, hg, rfl⟩ := hCS
  obtain ⟨g₀, hfin, hnc, hlang⟩ := exists_noncontracting_offEmpty_of_CS g hg
  obtain ⟨Γ, Λ, hΓ, hΛ, hdΓ, hdΛ, M, hM⟩ :=
    KurodaConstruction.noncontracting_finite_to_LBA g₀ hfin hnc
  -- `LanguageViaEmbed M = grammar_language g₀ = grammar_language g \ {[]} = grammar_language g`.
  rw [hlang] at hM
  exact ⟨Γ, Λ, hΓ, hΛ, hdΓ, hdΛ, M, by rw [hM, Set.diff_singleton_eq_self hne]⟩

/-! ## The characterization -/

/-- **The bounded-tape LBA languages are exactly the ε-free context-sensitive languages.** -/
theorem is_LBA_pos_iff {L : Language T} : is_LBA_pos L ↔ (is_CS L ∧ [] ∉ L) :=
  ⟨fun h => ⟨is_LBA_pos_imp_isCS h, not_nil_mem_of_is_LBA_pos h⟩,
   fun h => is_LBA_pos_of_isCS_not_nil h.1 h.2⟩

/-! ## CS = LBA (the canonical endmarker model)

Promoting the ε-free core to the full theorem: the empty word is carried by the endmarker model,
which runs on the genuine two-cell tape `⊢⊣`. The two simulations live in
`Equivalence/EndmarkerTape.lean` (`simMachine`/`language_simMachine_eq`) and
`Equivalence/EndmarkerToFlag.lean` (`flagMachine`/`language_flagMachine_eq`). -/

/-- **CS ⊆ LBA.** Every context-sensitive language is recognized by an endmarker LBA: the Kuroda
LBA for the ε-free core is run on `⊢ w ⊣` via `simMachine`, and `ε` is decided on `⊢⊣` by the bit
`decide (ε ∈ L)`. -/
theorem CS_subset_LBA : (CS : Set (Language T)) ⊆ LBA := by
  classical
  rintro L ⟨g, hg, rfl⟩
  obtain ⟨g₀, hfin, hnc, hlang⟩ := exists_noncontracting_offEmpty_of_CS g hg
  obtain ⟨Γ, Λ, hΓ, hΛ, hdΓ, hdΛ, M, hM⟩ :=
    KurodaConstruction.noncontracting_finite_to_LBA g₀ hfin hnc
  rw [hlang] at hM
  haveI := hΓ; haveI := hΛ; haveI := hdΓ; haveI := hdΛ
  refine ⟨Γ, LBA.SimState Λ, inferInstance, inferInstance, inferInstance, inferInstance,
    LBA.simMachine M (decide ([] ∈ grammar_language g)), ?_⟩
  rw [LBA.language_simMachine_eq]
  funext w
  apply propext
  simp only [LBA.LanguageRecognized]
  rcases eq_or_ne w [] with rfl | hw
  · constructor
    · rintro (⟨hb, _⟩ | hv)
      · exact of_decide_eq_true hb
      · obtain ⟨hne, -⟩ := hv; simp at hne
    · intro hmem; exact Or.inl ⟨by simpa using hmem, rfl⟩
  · constructor
    · rintro (⟨_, h0⟩ | hv)
      · exact absurd h0 hw
      · exact ((congrFun hM w) ▸ hv).1
    · intro hmem
      exact Or.inr (by rw [congrFun hM w]; exact ⟨hmem, by simpa using hw⟩)

/-- **LBA ⊆ CS.** Every endmarker-LBA language is context-sensitive: its non-empty part is the
bounded-tape language `LanguageViaEmbed (flagMachine M')` (context-sensitive via Myhill), and `ε`
(if accepted on `⊢⊣`) is adjoined by `is_CS_insert_empty_of_CS_grammar`. -/
theorem LBA_subset_CS : (LBA : Set (Language T)) ⊆ CS := by
  classical
  rintro L ⟨Γ, Λ, hΓ, hΛ, hdΓ, hdΛ, M', rfl⟩
  haveI := hΓ; haveI := hΛ; haveI := hdΓ; haveI := hdΛ
  let emb : T ↪ Option (T ⊕ LBA.FoldCell T Γ) :=
    ⟨fun t => some (Sum.inl t), fun _ _ h => by simpa using h⟩
  have hcore : LBA.LanguageViaEmbed (LBA.flagMachine M') ⇑emb
      = CS_language (myhillGrammar (LBA.flagMachine M') emb) :=
    (myhill_language_eq (LBA.flagMachine M') emb).symm
  rw [show LBA.LanguageEnd M'
        = LBA.LanguageRecognized (LBA.flagMachine M') ⇑emb
            (decide (LBA.Accepts M' (LBA.initCfgEnd M' [])))
      from (LBA.language_flagMachine_eq M' _ (by simp)).symm]
  by_cases hb : (decide (LBA.Accepts M' (LBA.initCfgEnd M' []))) = true
  · -- `ε` accepted: adjoin `ε` to the Myhill language
    rw [hb]
    have : LBA.LanguageRecognized (LBA.flagMachine M') ⇑emb true
        = (fun w => w = [] ∨ CS_language (myhillGrammar (LBA.flagMachine M') emb) w) := by
      funext w
      simp only [LBA.LanguageRecognized]
      rw [show LBA.LanguageViaEmbed (LBA.flagMachine M') ⇑emb w
            = CS_language (myhillGrammar (LBA.flagMachine M') emb) w from congrFun hcore w]
      apply propext
      constructor
      · rintro (⟨_, h⟩ | h)
        · exact Or.inl h
        · exact Or.inr h
      · rintro (h | h)
        · exact Or.inl ⟨trivial, h⟩
        · exact Or.inr h
    rw [this]
    exact is_CS_insert_empty_of_CS_grammar _
  · -- `ε` not accepted: exactly the Myhill language
    rw [Bool.not_eq_true] at hb
    rw [hb]
    have : LBA.LanguageRecognized (LBA.flagMachine M') ⇑emb false
        = CS_language (myhillGrammar (LBA.flagMachine M') emb) := by
      funext w
      simp only [LBA.LanguageRecognized, Bool.false_eq_true, false_and, false_or, hcore]
    rw [this]
    exact is_CS_via_csg_implies_is_CS ⟨_, rfl⟩

/-! ## The theorem -/

/-- **Context-sensitive languages are exactly the LBA languages: `CS = LBA`.** The headline form of
the Myhill–Landweber–Kuroda theorem, for the canonical endmarker LBA model (`is_LBA`/`LBA`,
`Automata/LinearBounded/Definition.lean`). -/
theorem CS_eq_LBA : (CS : Set (Language T)) = LBA :=
  Set.Subset.antisymm CS_subset_LBA LBA_subset_CS