//!HOOK LUMA
//!BIND HOOKED
//!DESC gaussian smoothed film grain
//!COMPUTE 32 32

#define INTENSITY 0.1
#define TAPS 2

const uint row_size = 2 * TAPS + 1;
const float weights[row_size] = {
#if TAPS == 1
    0.10650697891920,
    0.78698604216159,
    0.10650697891920,
#endif

#if TAPS == 2
    0.05448868454964,
    0.24420134200323,
    0.40261994689424,
    0.24420134200323,
    0.05448868454964,
#endif

#if TAPS == 3
    0.03663284536919,
    0.11128075847888,
    0.21674532140370,
    0.27068214949642,
    0.21674532140370,
    0.11128075847888,
    0.03663284536919,
#endif

#if TAPS == 4
    0.02763055063889,
    0.06628224528636,
    0.12383153680577,
    0.18017382291138,
    0.20416368871516,
    0.18017382291138,
    0.12383153680577,
    0.06628224528636,
    0.02763055063889,
#endif

#if TAPS == 5
    0.02219054849244,
    0.04558899978527,
    0.07981140824009,
    0.11906462996609,
    0.15136080967773,
    0.16396720767670,
    0.15136080967773,
    0.11906462996609,
    0.07981140824009,
    0.04558899978527,
    0.02219054849244,
#endif
};

const uvec2 isize = uvec2(gl_WorkGroupSize) + uvec2(2 * TAPS);
shared float grain[isize.y][isize.x];

// PRNG
float permute(float x)
{
    x = (34.0 * x + 1.0) * x;
    return fract(x * 1.0/289.0) * 289.0;
}

float seed(uvec2 pos)
{
    const float phi = 1.61803398874989;
    vec3 m = vec3(fract(phi * vec2(pos)), random) + vec3(1.0);
    return permute(permute(m.x) + m.y) + m.z;
}

float rand(inout float state)
{
    state = permute(state);
    return fract(state * 1.0/41.0);
}

// Turns uniform white noise into gaussian white noise by passing it
// through an approximation of the gaussian quantile function
float rand_gaussian(inout float state) {
    const float a0 = 0.151015505647689;
    const float a1 = -0.5303572634357367;
    const float a2 = 1.365020122861334;
    const float b0 = 0.132089632343748;
    const float b1 = -0.7607324991323768;

    float p = 0.95 * rand(state) + 0.025;
    float q = p - 0.5;
    float r = q * q;

    float g = q * (a2 + (a1 * r + a0) / (r*r + b1*r + b0));
    g *= 0.255121822830526; // normalize to [-1,1)
    return g;
}

void hook()
{
    // generate grain in `grain`
    uint num_threads = gl_WorkGroupSize.x * gl_WorkGroupSize.y;
    for (uint i = gl_LocalInvocationIndex; i < isize.y * isize.x; i += num_threads) {
        uvec2 pos = uvec2(i % isize.y, i / isize.y);
        float state = seed(gl_WorkGroupID.xy * gl_WorkGroupSize.xy + pos);
        grain[pos.y][pos.x] = rand_gaussian(state);
    }

    // make writes visible
    barrier();

    // convolve horizontally
    for (uint y = gl_LocalInvocationID.y; y < isize.y; y += gl_WorkGroupSize.y) {
        float hsum = 0;
        for (uint x = 0; x < row_size; x++) {
            float g = grain[y][gl_LocalInvocationID.x + x];
            hsum += weights[x] * g;
        }

        // update grain LUT
        grain[y][gl_LocalInvocationID.x + TAPS] = hsum;
    }

    barrier();

    // convolve vertically
    float vsum = 0.0;
    for (uint y = 0; y < row_size; y++) {
        float g = grain[gl_LocalInvocationID.y + y][gl_LocalInvocationID.x + TAPS];
        vsum += weights[y] * g;
    }

    vec4 color = HOOKED_tex(HOOKED_pos);
    color.rgb += vec3(INTENSITY * vsum);
    imageStore(out_image, ivec2(gl_GlobalInvocationID), color);
}