/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2013, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other oclMaterials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

//data (which is float) is aligend in 32 bytes
#define WIDTH_MULTIPLE (32 >> 2)

/////////////////////////////////////////////////////////
//------------------------------------------------------
// basicretinafilter
//////////////// _spatiotemporalLPfilter ////////////////
//_horizontalCausalFilter_addInput
kernel void horizontalCausalFilter_addInput(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int in_offset,
    const int out_offset,
    const float _tau,
    const float _a
)
{
    int gid = get_global_id(0);
    if(gid >= rows)
    {
        return;
    }

    global const float * iptr =
        input  + mad24(gid, elements_per_row, in_offset / 4);
    global float * optr =
        output + mad24(gid, elements_per_row, out_offset / 4);

    float res;
    float4 in_v4, out_v4, res_v4 = (float4)(0);
    //vectorize to increase throughput
    for(int i = 0; i < cols / 4; ++i, iptr += 4, optr += 4)
    {
        in_v4  = vload4(0, iptr);
        out_v4 = vload4(0, optr);

        res_v4.x = in_v4.x + _tau * out_v4.x + _a * res_v4.w;
        res_v4.y = in_v4.y + _tau * out_v4.y + _a * res_v4.x;
        res_v4.z = in_v4.z + _tau * out_v4.z + _a * res_v4.y;
        res_v4.w = in_v4.w + _tau * out_v4.w + _a * res_v4.z;

        vstore4(res_v4, 0, optr);
    }
    res = res_v4.w;
    // there may be left some
    for(int i = 0; i < cols % 4;  ++i, ++iptr, ++optr)
    {
        res = *iptr + _tau * *optr + _a * res;
        *optr = res;
    }
}

//_horizontalAnticausalFilter
kernel void horizontalAnticausalFilter(
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int out_offset,
    const float _a
)
{
    int gid = get_global_id(0);
    if(gid >= rows)
    {
        return;
    }

    global float * optr = output +
                          mad24(gid + 1, elements_per_row, - 1 + out_offset / 4);

    float4 result_v4 = (float4)(0), out_v4;
    float result = 0;
    // we assume elements_per_row is multple of WIDTH_MULTIPLE
    for(int i = 0; i < WIDTH_MULTIPLE; ++ i, -- optr)
    {
        if(i >= elements_per_row - cols)
        {
            result = *optr + _a * result;
        }
        *optr = result;
    }
    result_v4.x = result;
    optr -= 3;
    for(int i = WIDTH_MULTIPLE / 4; i < elements_per_row / 4; ++i, optr -= 4)
    {
        // shift left, `offset` is type `size_t` so it cannot be negative
        out_v4 = vload4(0, optr);

        result_v4.w = out_v4.w + _a * result_v4.x;
        result_v4.z = out_v4.z + _a * result_v4.w;
        result_v4.y = out_v4.y + _a * result_v4.z;
        result_v4.x = out_v4.x + _a * result_v4.y;

        vstore4(result_v4, 0, optr);
    }
}

//_verticalCausalFilter
kernel void verticalCausalFilter(
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int out_offset,
    const float _a
)
{
    int gid = get_global_id(0);
    if(gid >= cols)
    {
        return;
    }

    global float * optr = output + gid + out_offset / 4;
    float result = 0;
    for(int i = 0; i < rows; ++i, optr += elements_per_row)
    {
        result = *optr + _a * result;
        *optr = result;
    }
}

//_verticalCausalFilter
kernel void verticalAnticausalFilter_multGain(
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int out_offset,
    const float _a,
    const float _gain
)
{
    int gid = get_global_id(0);
    if(gid >= cols)
    {
        return;
    }

    global float * optr = output + (rows - 1) * elements_per_row + gid + out_offset / 4;
    float result = 0;
    for(int i = 0; i < rows; ++i, optr -= elements_per_row)
    {
        result = *optr + _a * result;
        *optr = _gain * result;
    }
}
//
// end of _spatiotemporalLPfilter
/////////////////////////////////////////////////////////////////////

//////////////// horizontalAnticausalFilter_Irregular ////////////////
kernel void horizontalAnticausalFilter_Irregular(
    global float * output,
    global float * buffer,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int out_offset,
    const int buffer_offset
)
{
    int gid = get_global_id(0);
    if(gid >= rows)
    {
        return;
    }

    global float * optr =
        output + mad24(rows - gid, elements_per_row, -1 + out_offset / 4);
    global float * bptr =
        buffer + mad24(rows - gid, elements_per_row, -1 + buffer_offset / 4);

    float4 buf_v4, out_v4, res_v4 = (float4)(0);
    float result = 0;
    // we assume elements_per_row is multple of WIDTH_MULTIPLE
    for(int i = 0; i < WIDTH_MULTIPLE; ++ i, -- optr, -- bptr)
    {
        if(i >= elements_per_row - cols)
        {
            result = *optr + *bptr * result;
        }
        *optr = result;
    }
    res_v4.x = result;
    optr -= 3;
    bptr -= 3;
    for(int i = WIDTH_MULTIPLE / 4; i < elements_per_row / 4; ++i, optr -= 4, bptr -= 4)
    {
        buf_v4 = vload4(0, bptr);
        out_v4 = vload4(0, optr);

        res_v4.w = out_v4.w + buf_v4.w * res_v4.x;
        res_v4.z = out_v4.z + buf_v4.z * res_v4.w;
        res_v4.y = out_v4.y + buf_v4.y * res_v4.z;
        res_v4.x = out_v4.x + buf_v4.x * res_v4.y;

        vstore4(res_v4, 0, optr);
    }
}

//////////////// verticalCausalFilter_Irregular ////////////////
kernel void verticalCausalFilter_Irregular(
    global float * output,
    global float * buffer,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int out_offset,
    const int buffer_offset
)
{
    int gid = get_global_id(0);
    if(gid >= cols)
    {
        return;
    }

    global float * optr = output + gid + out_offset / 4;
    global float * bptr = buffer + gid + buffer_offset / 4;
    float result = 0;
    for(int i = 0; i < rows; ++i, optr += elements_per_row, bptr += elements_per_row)
    {
        result = *optr + *bptr * result;
        *optr = result;
    }
}

//////////////// _adaptiveHorizontalCausalFilter_addInput ////////////////
kernel void adaptiveHorizontalCausalFilter_addInput(
    global const float * input,
    global const float * gradient,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int in_offset,
    const int grad_offset,
    const int out_offset
)
{
    int gid = get_global_id(0);
    if(gid >= rows)
    {
        return;
    }

    global const float * iptr =
        input + mad24(gid, elements_per_row, in_offset / 4);
    global const float * gptr =
        gradient + mad24(gid, elements_per_row, grad_offset / 4);
    global float * optr =
        output + mad24(gid, elements_per_row, out_offset / 4);

    float4 in_v4, grad_v4, out_v4, res_v4 = (float4)(0);
    for(int i = 0; i < cols / 4; ++i, iptr += 4, gptr += 4, optr += 4)
    {
        in_v4   = vload4(0, iptr);
        grad_v4 = vload4(0, gptr);

        res_v4.x = in_v4.x + grad_v4.x * res_v4.w;
        res_v4.y = in_v4.y + grad_v4.y * res_v4.x;
        res_v4.z = in_v4.z + grad_v4.z * res_v4.y;
        res_v4.w = in_v4.w + grad_v4.w * res_v4.z;

        vstore4(res_v4, 0, optr);
    }
    for(int i = 0; i < cols % 4; ++i, ++iptr, ++gptr, ++optr)
    {
        res_v4.w = *iptr + *gptr * res_v4.w;
        *optr = res_v4.w;
    }
}

//////////////// _adaptiveVerticalAnticausalFilter_multGain ////////////////
kernel void adaptiveVerticalAnticausalFilter_multGain(
    global const float * gradient,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const int grad_offset,
    const int out_offset,
    const float gain
)
{
    int gid = get_global_id(0);
    if(gid >= cols)
    {
        return;
    }

    int start_idx = mad24(rows - 1, elements_per_row, gid);

    global const float * gptr = gradient + start_idx + grad_offset / 4;
    global float * optr = output + start_idx + out_offset / 4;

    float result = 0;
    for(int i = 0; i < rows; ++i, gptr -= elements_per_row, optr -= elements_per_row)
    {
        result = *optr + *gptr * result;
        *optr = gain * result;
    }
}

//////////////// _localLuminanceAdaptation ////////////////
// FIXME:
//  This kernel seems to have precision problem on GPU
kernel void localLuminanceAdaptation(
    global const float * luma,
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float _localLuminanceAddon,
    const float _localLuminanceFactor,
    const float _maxInputValue
)
{
    int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    int offset = mad24(gidy, elements_per_row, gidx);

    float X0 = luma[offset] * _localLuminanceFactor + _localLuminanceAddon;
    float input_val = input[offset];
    // output of the following line may be different between GPU and CPU
    output[offset] = (_maxInputValue + X0) * input_val / (input_val + X0 + 0.00000000001f);
}
// end of basicretinafilter
//------------------------------------------------------
/////////////////////////////////////////////////////////


/////////////////////////////////////////////////////////
//------------------------------------------------------
// magno
// TODO: this kernel has too many buffer accesses, better to make it
//   vector read/write for fetch efficiency
kernel void amacrineCellsComputing(
    global const float * opl_on,
    global const float * opl_off,
    global float * prev_in_on,
    global float * prev_in_off,
    global float * out_on,
    global float * out_off,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float coeff
)
{
    int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }

    int offset = mad24(gidy, elements_per_row, gidx);
    opl_on      += offset;
    opl_off     += offset;
    prev_in_on  += offset;
    prev_in_off += offset;
    out_on      += offset;
    out_off     += offset;

    float magnoXonPixelResult = coeff * (*out_on + *opl_on - *prev_in_on);
    *out_on = fmax(magnoXonPixelResult, 0);
    float magnoXoffPixelResult = coeff * (*out_off + *opl_off - *prev_in_off);
    *out_off = fmax(magnoXoffPixelResult, 0);

    *prev_in_on = *opl_on;
    *prev_in_off = *opl_off;
}

/////////////////////////////////////////////////////////
//------------------------------------------------------
// parvo
// TODO: this kernel has too many buffer accesses, needs optimization
kernel void OPL_OnOffWaysComputing(
    global float4 * photo_out,
    global float4 * horiz_out,
    global float4 * bipol_on,
    global float4 * bipol_off,
    global float4 * parvo_on,
    global float4 * parvo_off,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx * 4 >= cols || gidy >= rows)
    {
        return;
    }
    // we assume elements_per_row must be multiples of 4
    int offset = mad24(gidy, elements_per_row >> 2, gidx);
    photo_out += offset;
    horiz_out += offset;
    bipol_on  += offset;
    bipol_off += offset;
    parvo_on  += offset;
    parvo_off += offset;

    float4 diff = *photo_out - *horiz_out;
    float4 isPositive;// = convert_float4(diff > (float4)(0.0f, 0.0f, 0.0f, 0.0f));
    isPositive.x = diff.x > 0.0f;
    isPositive.y = diff.y > 0.0f;
    isPositive.z = diff.z > 0.0f;
    isPositive.w = diff.w > 0.0f;
    float4 res_on  = isPositive * diff;
    float4 res_off = (isPositive - (float4)(1.0f)) * diff;

    *bipol_on = res_on;
    *parvo_on = res_on;

    *bipol_off = res_off;
    *parvo_off = res_off;
}

/////////////////////////////////////////////////////////
//------------------------------------------------------
// retinacolor
inline int bayerSampleOffset(int step, int rows, int x, int y)
{
    return mad24(y, step, x) +
           ((y % 2) + (x % 2)) * rows * step;
}


/////// colorMultiplexing //////
kernel void runColorMultiplexingBayer(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }

    int offset = mad24(gidy, elements_per_row, gidx);
    output[offset] = input[bayerSampleOffset(elements_per_row, rows, gidx, gidy)];
}

kernel void runColorDemultiplexingBayer(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }

    int offset = mad24(gidy, elements_per_row, gidx);
    output[bayerSampleOffset(elements_per_row, rows, gidx, gidy)] = input[offset];
}

kernel void demultiplexAssign(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }

    int offset = bayerSampleOffset(elements_per_row, rows, gidx, gidy);
    output[offset] = input[offset];
}


//// normalizeGrayOutputCentredSigmoide
kernel void normalizeGrayOutputCentredSigmoide(
    global const float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float meanval,
    const float X0
)

{
    int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    int offset = mad24(gidy, elements_per_row, gidx);

    float input_val = input[offset];
    output[offset] = meanval +
                     (meanval + X0) * (input_val - meanval) / (fabs(input_val - meanval) + X0);
}

//// normalize by photoreceptors density
kernel void normalizePhotoDensity(
    global const float * chroma,
    global const float * colorDensity,
    global const float * multiplex,
    global float * luma,
    global float * demultiplex,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float pG
)
{
    const int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);
    int index = offset;

    float Cr = chroma[index] * colorDensity[index];
    index += elements_per_row * rows;
    float Cg = chroma[index] * colorDensity[index];
    index += elements_per_row * rows;
    float Cb = chroma[index] * colorDensity[index];

    const float luma_res = (Cr + Cg + Cb) * pG;
    luma[offset] = luma_res;
    demultiplex[bayerSampleOffset(elements_per_row, rows, gidx, gidy)] =
        multiplex[offset] - luma_res;
}


//////// computeGradient ///////
// TODO:
// this function maybe accelerated by image2d_t or lds
kernel void computeGradient(
    global const float * luma,
    global float * gradient,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    int gidx = get_global_id(0) + 2, gidy = get_global_id(1) + 2;
    if(gidx >= cols - 2 || gidy >= rows - 2)
    {
        return;
    }
    int offset = mad24(gidy, elements_per_row, gidx);
    luma += offset;

    // horizontal and vertical local gradients
    const float v_grad = fabs(luma[elements_per_row] - luma[- elements_per_row]);
    const float h_grad = fabs(luma[1] - luma[-1]);

    // neighborhood horizontal and vertical gradients
    const float cur_val  = luma[0];
    const float v_grad_p = fabs(cur_val - luma[- 2 * elements_per_row]);
    const float h_grad_p = fabs(cur_val - luma[- 2]);
    const float v_grad_n = fabs(cur_val - luma[2 * elements_per_row]);
    const float h_grad_n = fabs(cur_val - luma[2]);

    const float horiz_grad = 0.5f * h_grad + 0.25f * (h_grad_p + h_grad_n);
    const float verti_grad = 0.5f * v_grad + 0.25f * (v_grad_p + v_grad_n);
    const bool is_vertical_greater = horiz_grad < verti_grad;

    gradient[offset + elements_per_row * rows] = is_vertical_greater ? 0.06f : 0.57f;
    gradient[offset                          ] = is_vertical_greater ? 0.57f : 0.06f;
}


/////// substractResidual ///////
kernel void substractResidual(
    global float * input,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float pR,
    const float pG,
    const float pB
)
{
    const int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    int indices [3] =
    {
        mad24(gidy, elements_per_row, gidx),
        mad24(gidy + rows, elements_per_row, gidx),
        mad24(gidy + 2 * rows, elements_per_row, gidx)
    };
    float vals[3] = {input[indices[0]], input[indices[1]], input[indices[2]]};
    float residu = pR * vals[0] + pG * vals[1] + pB * vals[2];

    input[indices[0]] = vals[0] - residu;
    input[indices[1]] = vals[1] - residu;
    input[indices[2]] = vals[2] - residu;
}

///// clipRGBOutput_0_maxInputValue /////
kernel void clipRGBOutput_0_maxInputValue(
    global float * input,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float maxVal
)
{
    const int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);
    float val = input[offset];
    val = clamp(val, 0.0f, maxVal);
    input[offset] = val;
}

//// normalizeGrayOutputNearZeroCentreredSigmoide ////
kernel void normalizeGrayOutputNearZeroCentreredSigmoide(
    global float * input,
    global float * output,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float maxVal,
    const float X0cube
)
{
    const int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);
    float currentCubeLuminance = input[offset];
    currentCubeLuminance = currentCubeLuminance * currentCubeLuminance * currentCubeLuminance;
    output[offset] = currentCubeLuminance * X0cube / (X0cube + currentCubeLuminance);
}

//// centerReductImageLuminance ////
kernel void centerReductImageLuminance(
    global float * input,
    const int cols,
    const int rows,
    const int elements_per_row,
    const float mean,
    const float std_dev
)
{
    const int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);

    float val = input[offset];
    input[offset] = (val - mean) / std_dev;
}

//// inverseValue ////
kernel void inverseValue(
    global float * input,
    const int cols,
    const int rows,
    const int elements_per_row
)
{
    const int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);
    input[offset] = 1.f / input[offset];
}

#define CV_PI 3.1415926535897932384626433832795

//// _processRetinaParvoMagnoMapping ////
kernel void processRetinaParvoMagnoMapping(
    global float * parvo,
    global float * magno,
    global float * output,
    const int cols,
    const int rows,
    const int halfCols,
    const int halfRows,
    const int elements_per_row,
    const float minDistance
)
{
    const int gidx = get_global_id(0), gidy = get_global_id(1);
    if(gidx >= cols || gidy >= rows)
    {
        return;
    }
    const int offset = mad24(gidy, elements_per_row, gidx);

    float distanceToCenter =
        sqrt(((float)(gidy - halfRows) * (gidy - halfRows) + (gidx - halfCols) * (gidx - halfCols)));

    float a = distanceToCenter < minDistance ?
              (0.5f + 0.5f * (float)cos(CV_PI * distanceToCenter / minDistance)) : 0;
    float b = 1.f - a;

    output[offset] = parvo[offset] * a + magno[offset] * b;
}