/*
 * Copyright (C) 2026 Behdad Esfahbod
 *
 * This is part of HarfBuzz, a text shaping library.
 *
 * Permission is hereby granted, without written agreement and without
 * license or royalty fees, to use, copy, modify, and distribute this
 * software and its documentation for any purpose, provided that the
 * above copyright notice and the following two paragraphs appear in
 * all copies of this software.
 *
 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
 */

/* Paint-renderer fragment shader (Metal).
 *
 * Assumes the shared fragment helpers (hb-gpu-fragment.msl) and
 * the draw-renderer fragment helpers (hb-gpu-draw-fragment.msl)
 * are prepended to this source.
 */

/* Fetch the i'th gradient stop (2 texels per stop), resolving is_foreground.
 *
 * Texel layout (per the decode below): texel 0 holds (offset_q15, flags, _, _)
 * where flag bit 0 selects the foreground color; texel 1 holds RGBA as
 * signed Q15.  The stop offset is written through `offset` as [0,1];
 * the return value is a straight-alpha color. */
static float4
_hb_gpu_stop_color (device const short4* hb_gpu_atlas,
		    int stops_base, int i,
		    float4 foreground,
		    thread float &offset)
{
  int4 a = int4 (hb_gpu_atlas[stops_base + i * 2]);
  offset = float (a.r) / 32767.0;
  int4 b = int4 (hb_gpu_atlas[stops_base + i * 2 + 1]);
  if ((a.g & 1) != 0)
    /* is_foreground: keep the foreground RGB, modulate its alpha by
     * the stop's Q15 alpha. */
    return float4 (foreground.rgb, foreground.a * (float (b.a) / 32767.0));
  return float4 (b) / 32767.0;
}

/* Map the gradient parameter t by extend mode:
 * 1 = repeat, 2 = reflect, anything else = pad (clamp).
 * Numbering matches hb_paint_extend_t. */
static float
_hb_gpu_extend_t (float t, int extend)
{
  if (extend == 1)
    return t - floor (t);
  if (extend == 2)
  {
    /* Fold t into [0,2), then mirror the upper half back onto [0,1]. */
    float u = t - 2.0 * floor (t * 0.5);
    return u > 1.0 ? 2.0 - u : u;
  }
  return clamp (t, 0.0, 1.0);
}

/* Evaluate the color-stop ramp at parameter t (already extend-mapped).
 *
 * Interpolation between adjacent stops is done in premultiplied space,
 * then un-premultiplied for the straight-alpha return value.  t below
 * the first stop pads to the first stop; t above the last pads to the
 * last.  A record with no stops yields transparent black. */
static float4
_hb_gpu_eval_stops (device const short4* hb_gpu_atlas,
		    int stops_base, int stop_count,
		    float t,
		    float4 foreground)
{
  /* Guard a degenerate/corrupt gradient record: the unconditional
   * fetch of stop 0 below would otherwise read texels outside the
   * stop array. */
  if (stop_count < 1)
    return float4 (0.0);

  float off_prev;
  float4 col_prev = _hb_gpu_stop_color (hb_gpu_atlas, stops_base, 0,
					foreground, off_prev);
  if (t <= off_prev)
    return col_prev;

  for (int i = 1; i < stop_count; i++)
  {
    float off;
    float4 col = _hb_gpu_stop_color (hb_gpu_atlas, stops_base, i,
				     foreground, off);
    if (t <= off)
    {
      float span = off - off_prev;
      /* Coincident stops: snap to the earlier stop's color. */
      float f = span > 1e-6 ? (t - off_prev) / span : 0.0;
      /* Premultiply, lerp, un-premultiply. */
      float4 p0 = float4 (col_prev.rgb * col_prev.a, col_prev.a);
      float4 p1 = float4 (col.rgb * col.a, col.a);
      float4 pm = mix (p0, p1, f);
      return pm.a > 1e-6 ? float4 (pm.rgb / pm.a, pm.a) : float4 (0.0);
    }
    col_prev = col;
    off_prev = off;
  }
  return col_prev;
}

/* Apply the stored 2x2 M^-1 (row-major i16 Q10) to a vector. */
static float2
_hb_gpu_apply_minv (int4 m, float2 v)
{
  float4 mf = float4 (m) * (1.0 / 1024.0);
  return float2 (mf.x * v.x + mf.y * v.y,
		 mf.z * v.x + mf.w * v.y);
}

/* Sample a linear gradient at renderCoord.
 *
 * Record layout: texel 0 = (p0.x, p0.y, d.x, d.y) with d = p1 - p0;
 * texel 1 = M^-1 (Q10); stops follow at grad_base + 2.
 * The point is mapped back to design space via M^-1 and projected
 * onto d to get the ramp parameter. */
static float4
_hb_gpu_sample_linear (float2 renderCoord,
		       int grad_base, int stop_count, int extend,
		       float4 foreground,
		       device const short4* hb_gpu_atlas)
{
  int4 t0 = int4 (hb_gpu_atlas[grad_base]);
  int4 m = int4 (hb_gpu_atlas[grad_base + 1]);
  float2 p0_r = float2 (float (t0.r), float (t0.g));
  float2 d = float2 (float (t0.b), float (t0.a));

  float denom = dot (d, d);
  if (denom < 1e-6)
    return float4 (0.0); /* degenerate: p0 == p1 */

  float2 p = _hb_gpu_apply_minv (m, renderCoord - p0_r);
  float t = dot (p, d) / denom;
  t = _hb_gpu_extend_t (t, extend);
  return _hb_gpu_eval_stops (hb_gpu_atlas, grad_base + 2, stop_count,
			     t, foreground);
}

/* Sample a radial (two-point conical) gradient at renderCoord.
 *
 * Record layout: texel 0 = (c0.x, c0.y, cd.x, cd.y) with cd = c1 - c0;
 * texel 1 = (r0, r1, _, _); texel 2 = M^-1 (Q10); stops follow at
 * grad_base + 3.  Solves |p - (c0 + t*cd)| = r0 + t*dr for t, keeping
 * the root with a non-negative interpolated radius. */
static float4
_hb_gpu_sample_radial (float2 renderCoord,
		       int grad_base, int stop_count, int extend,
		       float4 foreground,
		       device const short4* hb_gpu_atlas)
{
  int4 t0 = int4 (hb_gpu_atlas[grad_base]);
  int4 t1 = int4 (hb_gpu_atlas[grad_base + 1]);
  int4 m = int4 (hb_gpu_atlas[grad_base + 2]);
  float2 c0_r = float2 (float (t0.r), float (t0.g));
  float2 cd = float2 (float (t0.b), float (t0.a));
  float r0 = float (t1.r);
  float r1 = float (t1.g);
  float dr = r1 - r0;

  float2 p = _hb_gpu_apply_minv (m, renderCoord - c0_r);

  /* Quadratic A*t^2 + B*t + C = 0 from the conical-gradient equation. */
  float A = dot (cd, cd) - dr * dr;
  float B = -2.0 * (dot (p, cd) + r0 * dr);
  float C = dot (p, p) - r0 * r0;

  float t;
  if (abs (A) > 1e-6)
  {
    float disc = B * B - 4.0 * A * C;
    if (disc < 0.0)
      return float4 (0.0); /* point is outside the gradient's cone */
    float sq = sqrt (disc);
    float t1r = (-B + sq) / (2.0 * A);
    float t2r = (-B - sq) / (2.0 * A);
    /* Prefer the root whose interpolated radius is non-negative. */
    t = (r0 + t1r * dr >= 0.0) ? t1r : t2r;
  }
  else
  {
    /* Linear case (|cd| == |dr|). */
    if (abs (B) < 1e-6)
      return float4 (0.0);
    t = -C / B;
  }

  t = _hb_gpu_extend_t (t, extend);
  return _hb_gpu_eval_stops (hb_gpu_atlas, grad_base + 3, stop_count,
			     t, foreground);
}

/* Sample a sweep (angular) gradient at renderCoord.
 *
 * Record layout: texel 0 = (c.x, c.y, a0_q14, a1_q14) with angles
 * stored as Q14 multiples of pi; texel 1 = M^-1 (Q10); stops follow
 * at grad_base + 2.  The fragment angle is normalized to [0,2) in
 * units of pi and remapped onto [a0, a1]. */
static float4
_hb_gpu_sample_sweep (float2 renderCoord,
		      int grad_base, int stop_count, int extend,
		      float4 foreground,
		      device const short4* hb_gpu_atlas)
{
  int4 t0 = int4 (hb_gpu_atlas[grad_base]);
  int4 m = int4 (hb_gpu_atlas[grad_base + 1]);
  float2 c_r = float2 (float (t0.r), float (t0.g));
  float a0 = float (t0.b) / 16384.0;
  float a1 = float (t0.a) / 16384.0;

  float span = a1 - a0;
  if (abs (span) < 1e-6)
    return float4 (0.0); /* degenerate: zero angular extent */

  float2 p = _hb_gpu_apply_minv (m, renderCoord - c_r);
  float ang = atan2 (p.y, p.x) / 3.14159265358979; /* [-1,1], units of pi */
  if (ang < 0.0)
    ang += 2.0; /* -> [0,2) */

  float t = (ang - a0) / span;
  t = _hb_gpu_extend_t (t, extend);
  return _hb_gpu_eval_stops (hb_gpu_atlas, grad_base + 2, stop_count,
			     t, foreground);
}

/* Composite premultiplied src over/with premultiplied dst.
 *
 * Mode numbers match hb_paint_composite_mode_t; SRC_OVER (3) is the
 * default `r` below.  Unsupported separable blend modes are
 * approximated with the nearest mode we do implement; DIFFERENCE /
 * EXCLUSION / HSL_* fall through to SRC_OVER. */
static float4
_hb_gpu_composite (float4 src, float4 dst, int mode)
{
  float4 r = src + dst * (1.0 - src.a); /* SRC_OVER default */

  /* Approximate unsupported modes with the nearest mode we implement. */
  if (mode == 14 || mode == 18 || mode == 19)
    mode = 23; /* OVERLAY / COLOR_BURN / HARD_LIGHT -> MULTIPLY */
  else if (mode == 17 || mode == 20)
    mode = 13; /* COLOR_DODGE / SOFT_LIGHT -> SCREEN */

  if (mode == 0)
    r = float4 (0.0); /* CLEAR */
  else if (mode == 1)
    r = src; /* SRC */
  else if (mode == 2)
    r = dst; /* DST */
  else if (mode == 4)
    r = dst + src * (1.0 - dst.a); /* DST_OVER */
  else if (mode == 5)
    r = src * dst.a; /* SRC_IN */
  else if (mode == 6)
    r = dst * src.a; /* DST_IN */
  else if (mode == 7)
    r = src * (1.0 - dst.a); /* SRC_OUT */
  else if (mode == 8)
    r = dst * (1.0 - src.a); /* DST_OUT */
  else if (mode == 9) /* SRC_ATOP */
    r = src * dst.a + dst * (1.0 - src.a);
  else if (mode == 10) /* DST_ATOP */
    r = dst * src.a + src * (1.0 - dst.a);
  else if (mode == 11) /* XOR */
    r = src * (1.0 - dst.a) + dst * (1.0 - src.a);
  else if (mode == 12) /* PLUS */
    r = min (src + dst, float4 (1.0));
  else if (mode == 13)
  {
    /* SCREEN (premul) */
    r.rgb = src.rgb + dst.rgb - src.rgb * dst.rgb;
    r.a = src.a + dst.a - src.a * dst.a;
  }
  else if (mode == 15)
  {
    /* DARKEN */
    r.rgb = min (src.rgb * dst.a, dst.rgb * src.a)
	    + src.rgb * (1.0 - dst.a)
	    + dst.rgb * (1.0 - src.a);
    r.a = src.a + dst.a - src.a * dst.a;
  }
  else if (mode == 16)
  {
    /* LIGHTEN */
    r.rgb = max (src.rgb * dst.a, dst.rgb * src.a)
	    + src.rgb * (1.0 - dst.a)
	    + dst.rgb * (1.0 - src.a);
    r.a = src.a + dst.a - src.a * dst.a;
  }
  else if (mode == 23)
  {
    /* MULTIPLY (premul) */
    r.rgb = src.rgb * (1.0 - dst.a)
	    + dst.rgb * (1.0 - src.a)
	    + src.rgb * dst.rgb;
    r.a = src.a + dst.a - src.a * dst.a;
  }

  /* SRC_OVER (3) and DIFFERENCE / EXCLUSION / HSL_* (21, 22, 24-27)
   * fall through to the SRC_OVER default. */
  return r;
}

/* Wrap _hb_gpu_slug with a sub-glyph extents bail-out.  Many paint
 * layers cover a small region of the outer glyph quad; for fragments
 * outside the layer's bbox (with an AA + MSAA-spread margin) the slug
 * coverage is exactly 0, so we can skip the band/curve walk entirely. */
float
_hb_gpu_slug_clipped (float2 renderCoord,
		      float2 pixelsPerEm,
		      uint glyphLoc_,
		      device const short4* hb_gpu_atlas)
{
  int4 header0 = hb_gpu_fetch (hb_gpu_atlas, int (glyphLoc_));
  float4 ext = float4 (header0) * HB_GPU_INV_UNITS;
  float2 margin = 2.0 / pixelsPerEm;
  if (any (renderCoord < ext.xy - margin) ||
      any (renderCoord > ext.zw + margin))
    return 0.0;
  return _hb_gpu_slug (renderCoord, pixelsPerEm, glyphLoc_, hb_gpu_atlas);
}

/* Combine slug coverages from all clip outlines on the layer.
 *
 * Factored out so the shader has one set of inlined slug walks instead
 * of two (one per LAYER op type).  flags bits: 0x100 = HAS_CLIP2;
 * 0x200 = HAS_CLIP3 (HAS_CLIP3 implies HAS_CLIP2). */
static float
_hb_gpu_layer_coverage (float2 renderCoord,
			float2 pixelsPerEm,
			int base, int flags,
			int clip1_payload, int clip2_payload, int clip3_payload,
			device const short4* hb_gpu_atlas)
{
  float cov = _hb_gpu_slug_clipped (renderCoord, pixelsPerEm,
				    uint (base + clip1_payload),
				    hb_gpu_atlas);
  if ((flags & 0x100) != 0)
  {
    cov *= _hb_gpu_slug_clipped (renderCoord, pixelsPerEm,
				 uint (base + clip2_payload),
				 hb_gpu_atlas);
    if ((flags & 0x200) != 0)
      cov *= _hb_gpu_slug_clipped (renderCoord, pixelsPerEm,
				   uint (base + clip3_payload),
				   hb_gpu_atlas);
  }
  return cov;
}

/* Maximum PUSH_GROUP nesting; pushes beyond this depth are dropped
 * (the matching POP composites against whatever is on the stack). */
#define HB_GPU_PAINT_GROUP_DEPTH 4

/* Evaluate a paint (COLR-style) glyph at renderCoord.
 *
 * Walks the op list encoded in the atlas at glyphLoc_: solid and
 * gradient layers are composited SRC_OVER into `acc`; PUSH_GROUP /
 * POP_GROUP bracket sub-graphs composited with an arbitrary mode.
 * Returns the premultiplied accumulated color; `coverage` is set to
 * the maximum layer coverage seen (for the caller's AA decisions). */
float4
hb_gpu_paint (float2 renderCoord,
	      uint glyphLoc_,
	      float4 foreground,
	      device const short4* hb_gpu_atlas,
	      thread float &coverage)
{
  /* fwidth once, at uniform control flow. */
  float2 pixelsPerEm = 1.0 / fwidth (renderCoord);

  int base = int (glyphLoc_);
  int4 h0 = int4 (hb_gpu_atlas[base]);     /* (num_ops, _, _, _) */
  int4 h2 = int4 (hb_gpu_atlas[base + 2]); /* (ops_offset, _, _, _) */
  int num_ops = h0.r;
  int cursor = base + h2.r;

  float4 acc = float4 (0.0);
  float4 group_stack[HB_GPU_PAINT_GROUP_DEPTH];
  int sp = 0;
  coverage = 0.0;

  for (int i = 0; i < num_ops; i++)
  {
    int4 op = int4 (hb_gpu_atlas[cursor]);
    int op_type = op.r;
    int aux = op.g;
    /* 32-bit payload split across two i16 lanes (hi, lo). */
    int payload = (op.b << 16) | (op.a & 0xffff);

    if (op_type == 0) /* LAYER_SOLID */
    {
      /* texel 1: (clip2_hi, clip2_lo, clip3_hi, clip3_lo) -- valid
       * per HAS_CLIP2 / HAS_CLIP3 flag bits.
       * texel 2: RGBA as signed Q15. */
      int4 op2 = int4 (hb_gpu_atlas[cursor + 1]);
      int clip2_payload = (op2.r << 16) | (op2.g & 0xffff);
      int clip3_payload = (op2.b << 16) | (op2.a & 0xffff);
      int4 ct = int4 (hb_gpu_atlas[cursor + 2]);
      float4 col = ((aux & 1) != 0) ?
		   float4 (foreground.rgb,
			   foreground.a * (float (ct.a) / 32767.0)) :
		   float4 (ct) / 32767.0;
      float cov = _hb_gpu_layer_coverage (renderCoord, pixelsPerEm,
					  base, aux,
					  payload, clip2_payload, clip3_payload,
					  hb_gpu_atlas);
      coverage = max (coverage, cov);
      float4 src = float4 (col.rgb * col.a, col.a) * cov;
      acc = src + acc * (1.0 - src.a);
      cursor += 3;
    }
    else if (op_type == 1) /* LAYER_GRADIENT */
    {
      int4 op2 = int4 (hb_gpu_atlas[cursor + 1]);
      int clip2_payload = (op2.r << 16) | (op2.g & 0xffff);
      int clip3_payload = (op2.b << 16) | (op2.a & 0xffff);
      /* texel 2: (grad_hi, grad_lo, extend, stop_count). */
      int4 op3 = int4 (hb_gpu_atlas[cursor + 2]);
      int grad_payload = (op3.r << 16) | (op3.g & 0xffff);
      int extend = op3.b;
      int stop_count = op3.a;
      int subtype = aux & 0xff; /* 0 = linear, 1 = radial, 2 = sweep */

      float4 col = float4 (0.0);
      if (subtype == 0)
	col = _hb_gpu_sample_linear (renderCoord, base + grad_payload,
				     stop_count, extend, foreground,
				     hb_gpu_atlas);
      else if (subtype == 1)
	col = _hb_gpu_sample_radial (renderCoord, base + grad_payload,
				     stop_count, extend, foreground,
				     hb_gpu_atlas);
      else if (subtype == 2)
	col = _hb_gpu_sample_sweep (renderCoord, base + grad_payload,
				    stop_count, extend, foreground,
				    hb_gpu_atlas);

      float cov = _hb_gpu_layer_coverage (renderCoord, pixelsPerEm,
					  base, aux,
					  payload, clip2_payload, clip3_payload,
					  hb_gpu_atlas);
      coverage = max (coverage, cov);
      float4 src = float4 (col.rgb * col.a, col.a) * cov;
      acc = src + acc * (1.0 - src.a);
      cursor += 3;
    }
    else if (op_type == 2) /* PUSH_GROUP */
    {
      if (sp < HB_GPU_PAINT_GROUP_DEPTH)
      {
	group_stack[sp] = acc;
	sp++;
      }
      acc = float4 (0.0);
      cursor += 1;
    }
    else if (op_type == 3) /* POP_GROUP */
    {
      if (sp > 0)
      {
	sp--;
	float4 src = acc;
	float4 dst = group_stack[sp];
	acc = _hb_gpu_composite (src, dst, aux); /* aux = composite mode */
      }
      cursor += 1;
    }
    else
    {
      /* Unknown op: stop walking rather than misinterpret the stream. */
      break;
    }
  }

  return acc;
}