/-
Copyright (c) 2023 Chenyi Li. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Chenyi Li, Ziyu Wang, Zaiwen Wen
-/
import Mathlib.Analysis.Calculus.FDeriv.Mul
import Mathlib.Analysis.Calculus.FDeriv.Add
import Mathlib.Analysis.Calculus.Deriv.Comp
import Mathlib.Analysis.Calculus.Gradient.Basic
/-!
# Gradient

## Main results

This file contains the following parts of gradient.
* the chain rule for the `g : 𝕜 → 𝕜` composed with `f : F → 𝕜`.
* the gradient for the sum of two functions.
* the gradient for finite sum of functions.
* the gradient for the sum of a constant and a function.
* the gradient for the difference of two functions.
* the gradient for the difference of a constant and a function.
* the gradient for the product of two functions.
* the gradient for the product of a constant and a function.
-/

noncomputable section
set_option checkBinderAnnotations false

open Topology InnerProductSpace Set

variable {𝕜 F : Type*} [RCLike 𝕜]

variable [NormedAddCommGroup F] [InnerProductSpace 𝕜 F] [CompleteSpace F]

variable {f : F → 𝕜} {f' x : F} {L : Filter F} {s : Set F}

local notation "∇" => gradient

section Composition

open Set Filter

variable {g : 𝕜 → 𝕜} {g' : 𝕜}

variable {L' : Filter 𝕜} {t : Set 𝕜}

theorem HasGradientAtFilter.comp
    (hg : HasGradientAtFilter g g' (f x) L') (hf : HasGradientAtFilter f f' x L)
    (hL : Tendsto f L L') : HasGradientAtFilter (g ∘ f) (g' • f') x L := by
  have eq : (starRingEnd 𝕜) g' • (toDual 𝕜 F) f' =
    (toDual 𝕜 F) (g' • f') := by rw [map_smulₛₗ]
  rw [HasGradientAtFilter, ← eq]
  exact hg.hasDerivAtFilter.comp_hasFDerivAtFilter x hf hL

theorem HasGradientWithinAt.comp
    (hg : HasGradientWithinAt g g' t (f x)) (hf : HasGradientWithinAt f f' s x)
    (hst : MapsTo f s t) : HasGradientWithinAt (g ∘ f) (g' • f') s x :=
  HasGradientAtFilter.comp hg hf <| hf.continuousWithinAt.tendsto_nhdsWithin hst

theorem HasGradientAt.comp_hasGradientWithinAt
    (hg : HasGradientAt g g' (f x)) (hf : HasGradientWithinAt f f' s x) :
    HasGradientWithinAt (g ∘ f) (g' • f') s x :=
  hg.comp hf hf.continuousWithinAt

theorem HasGradientWithinAt.comp_of_mem
    (hg : HasGradientWithinAt g g' t (f x)) (hf : HasGradientWithinAt f f' s x)
    (hst : Tendsto f (𝓝[s] x) (𝓝[t] f x)) : HasGradientWithinAt (g ∘ f) (g' • f') s x :=
  HasGradientAtFilter.comp hg hf hst

/-- The chain rule. -/
theorem HasGradientAt.comp (hg : HasGradientAt g g' (f x))
    (hf : HasGradientAt f f' x) : HasGradientAt (g ∘ f) (g' • f') x :=
  HasGradientAtFilter.comp hg hf hf.continuousAt

theorem gradient.comp (hg : DifferentiableAt 𝕜 g (f x)) (hf : DifferentiableAt 𝕜 f x) :
    ∇ (g ∘ f) x = (∇ g (f x)) • (∇ f x) :=
  (hg.hasGradientAt.comp hf.hasGradientAt).gradient

end Composition

section ConstSmul

open Set Filter

theorem HasGradientAtFilter.const_smul (h : HasGradientAtFilter f f' x L) (c : 𝕜) :
    HasGradientAtFilter (fun x => c • f x) ((starRingEnd 𝕜) c • f') x L := by
  have : c • (toDual 𝕜 F) f' = (toDual 𝕜 F) ((starRingEnd 𝕜) c • f') := by
    rw [map_smulₛₗ, RingHomCompTriple.comp_apply, RingHom.id_apply]
  rw [HasGradientAtFilter, ← this]; rw [HasGradientAtFilter] at h
  exact h.const_smul c

theorem HasGradientWithinAt.const_smul (h : HasGradientWithinAt f f' s x) (c : 𝕜) :
    HasGradientWithinAt (fun x => c • f x) ((starRingEnd 𝕜) c • f') s x := by
  exact HasGradientAtFilter.const_smul h c

theorem HasGradientAt.const_smul (h : HasGradientAt f f' x) (c : 𝕜) :
    HasGradientAt (fun x => c • f x) ((starRingEnd 𝕜) c • f') x := by
  exact HasGradientAtFilter.const_smul h c

theorem gradient_const_smul (h : DifferentiableAt 𝕜 f x) (c : 𝕜) :
    ∇ (fun y => c • f y) x = (starRingEnd 𝕜) c • ∇ f x :=
  (h.hasGradientAt.const_smul c).gradient

variable [InnerProductSpace ℝ F] {f : F → ℝ} (c : ℝ)

theorem HasGradientAtFilter.const_smul' (h : HasGradientAtFilter f f' x L) :
    HasGradientAtFilter (fun x => c • f x) (c • f') x L := h.const_smul c

theorem HasGradientWithinAt.const_smul' (h : HasGradientWithinAt f f' s x) :
    HasGradientWithinAt (fun x => c • f x) (c • f') s x := by
  exact HasGradientWithinAt.const_smul h c

theorem HasGradientAt.const_smul' (h : HasGradientAt f f' x) :
    HasGradientAt (fun x => c • f x) (c • f') x := by
  exact HasGradientAt.const_smul h c

theorem gradient_const_smul' (h : DifferentiableAt ℝ f x) (c : ℝ) :
    ∇ (fun y => c • f y) x = c • ∇ f x :=
  (h.hasGradientAt.const_smul c).gradient

end ConstSmul

section Add

variable {f' : F} {g : F → 𝕜} {x : F} {g' : F}
variable {L : Filter F} {f : F → 𝕜} {L' : Filter 𝕜} {t : Set 𝕜}

theorem HasGradientAtFilter.add (hf : HasGradientAtFilter f f' x L)
    (hg : HasGradientAtFilter g g' x L) :
    HasGradientAtFilter (fun y => f y + g y) (f' + g') x L := by
  rw [HasGradientAtFilter]; rw [HasGradientAtFilter] at hf hg
  have : (toDual 𝕜 F) (f' + g') = (toDual 𝕜 F) f' + (toDual 𝕜 F) g' := by simp
  rw [this]; exact hf.add hg

theorem HasGradientWithinAt.add (hf : HasGradientWithinAt f f' s x)
    (hg : HasGradientWithinAt g g' s x) :
    HasGradientWithinAt (fun y => f y + g y) (f' + g') s x := by
  exact HasGradientAtFilter.add hf hg

theorem HasGradientAt.add (hf : HasGradientAt f f' x) (hg : HasGradientAt g g' x) :
    HasGradientAt (fun x => f x + g x) (f' + g') x := by
  exact HasGradientAtFilter.add hf hg

theorem gradient_add (hf : DifferentiableAt 𝕜 f x) (hg : DifferentiableAt 𝕜 g x) :
    ∇ (fun y => f y + g y) x = ∇ f x + ∇ g x :=
  (hf.hasGradientAt.add hg.hasGradientAt).gradient

theorem HasGradientAtFilter.add_const (hf : HasGradientAtFilter f f' x L) (c : 𝕜) :
    HasGradientAtFilter (fun y => f y + c) f' x L :=
  add_zero f' ▸ hf.add (hasGradientAtFilter_const _ _ _)

theorem HasGradientWithinAt.add_const (hf : HasGradientWithinAt f f' s x) (c : 𝕜) :
    HasGradientWithinAt (fun y => f y + c) f' s x := by
  exact HasGradientAtFilter.add_const hf c

theorem HasGradientAt.add_const (hf : HasGradientAt f f' x) (c : 𝕜) :
    HasGradientAt (fun x => f x + c) f' x := by
  exact HasGradientAtFilter.add_const hf c

theorem gradient_add_const (c : 𝕜) : ∇ (fun y => f y + c) x = ∇ f x := by
  unfold gradient
  simp only [EmbeddingLike.apply_eq_iff_eq]
  exact fderiv_add_const c

theorem HasGradientAtFilter.const_add (hf : HasGradientAtFilter f f' x L) (c : 𝕜) :
    HasGradientAtFilter (fun y => c + f y) f' x L :=
  zero_add f' ▸ (hasGradientAtFilter_const _ _ _).add hf

theorem HasGradientWithinAt.const_add (hf : HasGradientWithinAt f f' s x) (c : 𝕜) :
    HasGradientWithinAt (fun y => c + f y) f' s x := by
  exact HasGradientAtFilter.const_add hf c

theorem HasGradientAt.const_add (hf : HasGradientAt f f' x) (c : 𝕜) :
    HasGradientAt (fun x => c + f x) f' x := by
  exact HasGradientAtFilter.const_add hf c

theorem Gradient_const_add (c : 𝕜) : ∇ (fun y => c + f y) x = ∇ f x := by
  simp only [add_comm c, gradient_add_const]

end Add

section Sum

/-! ### Derivative of a finite sum of functions -/


open BigOperators Asymptotics

variable {ι : Type*} {u : Finset ι} {A : ι → F → 𝕜} {A' : ι → F}

theorem HasGradientAtFilter.sum (h : ∀ i ∈ u, HasGradientAtFilter (A i) (A' i) x L) :
    HasGradientAtFilter (fun y => ∑ i in u, A i y) (∑ i in u, A' i) x L := by
  have : ∑ i in u, (toDual 𝕜 F) (A' i) = (toDual 𝕜 F) (∑ i in u, A' i) := by
    rw [map_sum]
  rw [HasGradientAtFilter, ← this]; unfold HasGradientAtFilter at h
  exact HasFDerivAtFilter.sum h

theorem HasGradientWithinAt.sum (h : ∀ i ∈ u, HasGradientWithinAt (A i) (A' i) s x) :
    HasGradientWithinAt (fun y => ∑ i in u, A i y) (∑ i in u, A' i) s x := by
  exact HasGradientAtFilter.sum h

theorem HasGradientAt.sum (h : ∀ i ∈ u, HasGradientAt (A i) (A' i) x) :
    HasGradientAt (fun y => ∑ i in u, A i y) (∑ i in u, A' i) x := by
  exact HasGradientAtFilter.sum h

theorem gradient_sum (h : ∀ i ∈ u, DifferentiableAt 𝕜 (A i) x) :
    ∇ (fun y => ∑ i in u, A i y) x = ∑ i in u, ∇ (A i) x :=
  (HasGradientAt.sum fun i hi => (h i hi).hasGradientAt).gradient

end Sum

section Neg

/-! ### Gradient of the negative of a function -/

theorem HasGradientAtFilter.neg (h : HasGradientAtFilter f f' x L) :
    HasGradientAtFilter (fun x => -f x) (-f') x L := by
  have : -(toDual 𝕜 F) f' = (toDual 𝕜 F) (-f') := by simp
  rw [HasGradientAtFilter, ← this]; rw [HasGradientAtFilter] at h
  exact h.neg

theorem HasGradientWithinAt.neg (h : HasGradientWithinAt f f' s x) :
    HasGradientWithinAt (fun x => -f x) (-f') s x := by
  exact HasGradientAtFilter.neg h

theorem HasGradientAt.neg (h : HasGradientAt f f' x) :
    HasGradientAt (fun x => -f x) (-f') x := by
  exact HasGradientAtFilter.neg h

theorem gradient_neg : ∇ (fun y => - f y) x = - ∇ f x := by
  unfold gradient
  simp only [fderiv_neg, map_neg]

end Neg

section Sub

/-! ### Derivative of the difference of two functions -/

variable {f' : F} {g : F → 𝕜} {x : F} {g' : F}
variable {L : Filter F} {f : F → 𝕜} {L' : Filter 𝕜} {t : Set 𝕜}

theorem HasGradientAtFilter.sub (hf : HasGradientAtFilter f f' x L)
    (hg : HasGradientAtFilter g g' x L) :
    HasGradientAtFilter (fun x => f x - g x) (f' - g') x L := by
  simpa only [sub_eq_add_neg] using hf.add hg.neg

theorem HasGradientWithinAt.sub (hf : HasGradientWithinAt f f' s x)
    (hg : HasGradientWithinAt g g' s x) :
    HasGradientWithinAt (fun x => f x - g x) (f' - g') s x := by
  exact HasGradientAtFilter.sub hf hg

theorem HasGradientAt.sub (hf : HasGradientAt f f' x) (hg : HasGradientAt g g' x) :
    HasGradientAt (fun x => f x - g x) (f' - g') x := by
  exact HasGradientAtFilter.sub hf hg

theorem gradient_sub (hf : DifferentiableAt 𝕜 f x) (hg : DifferentiableAt 𝕜 g x) :
    ∇ (fun y => f y - g y) x = ∇ f x - ∇ g x :=
  (hf.hasGradientAt.sub hg.hasGradientAt).gradient

theorem HasGradientAtFilter.sub_const (hf : HasGradientAtFilter f f' x L) (c : 𝕜) :
    HasGradientAtFilter (fun x => f x - c) f' x L := by
  simpa only [sub_eq_add_neg] using hf.add_const (-c)

theorem HasGradientWithinAt.sub_const (hf : HasGradientWithinAt f f' s x) (c : 𝕜) :
    HasGradientWithinAt (fun x => f x - c) f' s x := by
  exact HasGradientAtFilter.sub_const hf c

theorem HasGradientAt.sub_const (hf : HasGradientAt f f' x) (c : 𝕜) :
    HasGradientAt (fun x => f x - c) f' x := by
  exact HasGradientAtFilter.sub_const hf c

theorem Gradient_sub_const (c : 𝕜) : ∇ (fun y => f y - c) x = ∇ f x := by
  simp only [sub_eq_add_neg, gradient_add_const]

theorem HasGradientAtFilter.const_sub (hf : HasGradientAtFilter f f' x L) (c : 𝕜) :
    HasGradientAtFilter (fun x => c - f x) (-f') x L := by
  simpa only [sub_eq_add_neg] using hf.neg.const_add c

theorem HasGradientWithinAt.const_sub (hf : HasGradientWithinAt f f' s x) (c : 𝕜) :
    HasGradientWithinAt (fun x => c - f x) (-f') s x := by
  exact HasGradientAtFilter.const_sub hf c

theorem HasGradientAt.const_sub (hf : HasGradientAt f f' x) (c : 𝕜) :
    HasGradientAt (fun x => c - f x) (-f') x := by
  exact HasGradientAtFilter.const_sub hf c

theorem gradient_const_sub (c : 𝕜) : ∇ (fun y => c - f y) x = - ∇ f x := by
  calc
    ∇ (fun y => c - f y) x = ∇ (fun y => - f y + c) x := by  congr; ext x; rw [sub_eq_neg_add]
    _ = - ∇ f x := by rw [gradient_add_const c, gradient_neg]

end Sub

section Mul

variable {a' b' c' d' : F} {a b c d : F → 𝕜} {x : F}

open ContinuousLinearMap

lemma equiv_lemma_mul : c x • (toDual 𝕜 F) d' + d x • (toDual 𝕜 F) c'
    = (toDual 𝕜 F) ((starRingEnd 𝕜) (c x) • d' + (starRingEnd 𝕜) (d x) • c'):= by
  simp
  congr <;> exact SemilinearMapClass.map_smul_inv _ _ _

theorem HasGradientAt.mul (hc : HasGradientAt c c' x) (hd : HasGradientAt d d' x) :
    HasGradientAt (fun y => c y * d y)
    ((starRingEnd 𝕜) (c x) • d' + (starRingEnd 𝕜) (d x) • c') x := by
  rw [hasGradientAt_iff_hasFDerivAt, ← equiv_lemma_mul]
  rw [hasGradientAt_iff_hasFDerivAt] at hc hd
  exact hc.mul hd

theorem HasGradientWithinAt.mul (hc : HasGradientWithinAt c c' s x)
    (hd : HasGradientWithinAt d d' s x) :
    HasGradientWithinAt (fun y => c y * d y) ((starRingEnd 𝕜) (c x) • d'
      + (starRingEnd 𝕜) (d x) • c') s x := by
  rw [hasGradientWithinAt_iff_hasFDerivWithinAt, ← equiv_lemma_mul]
  rw [hasGradientWithinAt_iff_hasFDerivWithinAt] at hc hd
  exact hc.mul hd

theorem gradient_mul (hc : DifferentiableAt 𝕜 c x) (hd : DifferentiableAt 𝕜 d x) :
    ∇ (fun y => c y * d y) x = (starRingEnd 𝕜) (c x) • ∇ d x
      + (starRingEnd 𝕜) (d x) • ∇ c x :=
  (hc.hasGradientAt.mul hd.hasGradientAt).gradient

variable [InnerProductSpace ℝ F] {a' b' c' d' : F} {a b c d : F → ℝ} {x : F}

theorem HasGradientAt.mul' (hc : HasGradientAt c c' x) (hd : HasGradientAt d d' x) :
    HasGradientAt (fun y => c y * d y) ((c x) • d' + (d x) • c') x :=
  HasGradientAt.mul hc hd

theorem HasGradientWithinAt.mul' (hc : HasGradientWithinAt c c' s x)
    (hd : HasGradientWithinAt d d' s x) :
    HasGradientWithinAt (fun y => c y * d y) ((c x) • d' + (d x) • c') s x :=
  hc.mul hd

theorem gradient_mul' (hc : DifferentiableAt ℝ c x) (hd : DifferentiableAt ℝ d x) :
    ∇ (fun y => c y * d y) x = (c x) • ∇ d x + (d x) • ∇ c x :=
  gradient_mul hc hd

end Mul

section Mul_const

variable {c' a': F} {c a: F → 𝕜} {x : F} (d b: 𝕜)

lemma equiv_lemma_mul_const : d • (toDual 𝕜 F) c' = (toDual 𝕜 F) ((starRingEnd 𝕜) d • c') := by
  rw [map_smulₛₗ, RingHomCompTriple.comp_apply, RingHom.id_apply]

theorem HasGradientWithinAt.mul_const (hc : HasGradientWithinAt c c' s x) :
    HasGradientWithinAt (fun y => c y * d) ((starRingEnd 𝕜) d • c') s x := by
  rw [hasGradientWithinAt_iff_hasFDerivWithinAt, ← equiv_lemma_mul_const]
  rw [hasGradientWithinAt_iff_hasFDerivWithinAt] at hc
  exact hc.mul_const d

theorem HasGradientAt.mul_const (hc : HasGradientAt c c' x) :
    HasGradientAt (fun y => c y * d) ((starRingEnd 𝕜) d • c') x := by
  rw [hasGradientAt_iff_hasFDerivAt, ← equiv_lemma_mul_const]
  rw [hasGradientAt_iff_hasFDerivAt] at hc
  exact hc.mul_const d

theorem gradient_mul_const (hc : DifferentiableAt 𝕜 c x) :
    ∇ (fun y => c y * d) x = (starRingEnd 𝕜) d • ∇ c x :=
  (hc.hasGradientAt.mul_const d).gradient

lemma equiv_lemma_const_mul : b • (toDual 𝕜 F) a' = (toDual 𝕜 F) ((starRingEnd 𝕜) b • a') := by
  rw [map_smulₛₗ, RingHomCompTriple.comp_apply, RingHom.id_apply]

theorem HasGradientWithinAt.const_mul (ha : HasGradientWithinAt a a' s x) :
    HasGradientWithinAt (fun y => b * a y) ((starRingEnd 𝕜) b • a') s x := by
  rw [hasGradientWithinAt_iff_hasFDerivWithinAt, ← equiv_lemma_const_mul]
  rw [hasGradientWithinAt_iff_hasFDerivWithinAt] at ha
  exact ha.const_mul b

theorem HasGradientAt.const_mul (ha : HasGradientAt a a' x) :
    HasGradientAt (fun y => b * a y) ((starRingEnd 𝕜) b • a') x := by
  rw [hasGradientAt_iff_hasFDerivAt, ← equiv_lemma_const_mul]
  rw [hasGradientAt_iff_hasFDerivAt] at ha
  exact ha.const_mul b

theorem gradient_const_mul (ha : DifferentiableAt 𝕜 a x) :
    ∇ (fun y => b * a y) x = (starRingEnd 𝕜) b • ∇ a x :=
  (ha.hasGradientAt.const_mul b).gradient

variable [InnerProductSpace ℝ F] {c' a': F} {c a: F → ℝ} {x : F} (d b: ℝ)

theorem HasGradientWithinAt.mul_const' (hc : HasGradientWithinAt c c' s x) :
    HasGradientWithinAt (fun y => c y * d) (d • c') s x :=
  HasGradientWithinAt.mul_const d hc

theorem HasGradientAt.mul_const' (hc : HasGradientAt c c' x) :
    HasGradientAt (fun y => c y * d) (d • c') x :=
  HasGradientAt.mul_const d hc

theorem gradient_mul_const' (hc : DifferentiableAt ℝ c x) :
    ∇ (fun y => c y * d) x = d • ∇ c x :=
  gradient_mul_const d hc

theorem HasGradientWithinAt.const_mul' (ha : HasGradientWithinAt a a' s x) :
    HasGradientWithinAt (fun y => b * a y) (b • a') s x :=
  HasGradientWithinAt.const_mul b ha

theorem HasGradientAt.const_mul' (ha : HasGradientAt a a' x) :
    HasGradientAt (fun y => b * a y) (b • a') x :=
  HasGradientAt.const_mul b ha

theorem gradient_const_mul' (ha : DifferentiableAt ℝ a x) :
    ∇ (fun y => b * a y) x = b • ∇ a x :=
  gradient_const_mul b ha

end Mul_const