// Copyright // FSWORD73 AT HOTMAIL DOT COM
//
// This Source Code Form is subject to the terms of the Mozilla Public License,
// v. 2.0. If a copy of the MPL was not distributed with this file, You can
// obtain one at http://mozilla.org/MPL/2.0/.

// expected defines:
// BIASED (or not)

// Original workgroup sizing by hughperkins at gmail in DeepCL:
// globalId: [outPlane][inputPlane][filterRow][filterCol]
// per-thread iteration: [n][outputRow][outputCol]
//27    int workgroupsize = std::max(32, square(dim.filterSize)); // no point in wasting cores...
//28    int numWorkgroups = dim.inputPlanes * dim.numFilters;
//29    int globalSize = workgroupsize * numWorkgroups;
//30    globalSize = ((globalSize + workgroupsize - 1) / workgroupsize) * workgroupsize;

#define gNumFilters 8
#define gInputPlanes 4
#define gFilterSize 3
#define gFilterSizeSquared (gFilterSize * gFilterSize)
//#define gOutputSize 128
#define gInputSize 128
#define gMargin 0
#define FIXED_WORKGROUPSIZE 64
#define FIXED_WORKGROUPSIZE_SHIFT 6

void backprop_floats_fast_valid_thread(
        global const float *gradOutput,
        global const float *images,
        float *thiswchange,
#ifdef BIASED
        float *thisbiaschange,
#endif
        const int batchSize,
        const int gOutputSize,
        int localId,
        int globalId)
{
    *thiswchange = 0;

    // Decompose the workgroup index into a batch id plus
    // [outPlane][upstreamPlane][filterRow][filterCol].
    int batchId = (globalId >> FIXED_WORKGROUPSIZE_SHIFT) % batchSize;
    int globalIdOutput = (globalId >> FIXED_WORKGROUPSIZE_SHIFT) / batchSize;
    int IntraFilterOffset = globalIdOutput % gFilterSizeSquared;
    int filterRow = IntraFilterOffset / gFilterSize;
    int filterCol = IntraFilterOffset % gFilterSize;
    int filter2Id = globalIdOutput / gFilterSizeSquared;
    int outPlane = filter2Id / gInputPlanes;
    int upstreamPlane = filter2Id % gInputPlanes;
    int n = batchId;

    // Each of the 64 threads strides across the output image.
    int iterations = (gOutputSize * gOutputSize + FIXED_WORKGROUPSIZE - 1) >> FIXED_WORKGROUPSIZE_SHIFT;
    for (int i = 0; i < iterations; i++) {
        int offsetofOutput = i * FIXED_WORKGROUPSIZE + localId;
        if (offsetofOutput < (gOutputSize * gOutputSize)) {
            int outRow = offsetofOutput / gOutputSize;
            int upstreamRow = outRow - gMargin + filterRow;
            int outCol = offsetofOutput % gOutputSize;
            int upstreamCol = outCol - gMargin + filterCol;
            bool proceed = upstreamRow >= 0 && upstreamCol >= 0
                        && upstreamRow < gInputSize && upstreamCol < gInputSize;
            if (proceed) {
                int resultIndex = ((n * gNumFilters + outPlane) * gOutputSize + outRow) * gOutputSize + outCol;
                float error = gradOutput[resultIndex];
                int upstreamDataIndex = ((n * gInputPlanes + upstreamPlane) * gInputSize + upstreamRow) * gInputSize + upstreamCol;
                float upstreamResult = images[upstreamDataIndex];
                float thisimagethiswchange = upstreamResult * error;
                *thiswchange += thisimagethiswchange;
#ifdef BIASED
                *thisbiaschange += error;
#endif
            }
        }
    } // for loop
}

void Reduction_of_Weights(
        float *thiswchange,
        __local float *sdata,
#ifdef BIASED
        float *thisbiaschange,
#endif
        int localId,
        int globalId)
{
    // store into local
    sdata[localId] = *thiswchange;
    barrier(CLK_LOCAL_MEM_FENCE);
    // Tree reduction: halve the active threads each step.
    for (unsigned int s = FIXED_WORKGROUPSIZE >> 1; s > 0; s >>= 1) {
        if (localId < s) {
            sdata[localId] += sdata[localId + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (localId == 0) {
        *thiswchange = sdata[0];
    }

#ifdef BIASED
    // Reuse the same local buffer for the bias reduction.
    sdata[localId] = *thisbiaschange;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (unsigned int s = FIXED_WORKGROUPSIZE >> 1; s > 0; s >>= 1) {
        if (localId < s) {
            sdata[localId] += sdata[localId + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (localId == 0) {
        *thisbiaschange = sdata[0];
    }
#endif
}
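// Worked example of the globalId decomposition used by
// backprop_floats_fast_valid_thread above (illustrative only, assuming
// batchSize = 128 as in the CNNBench configuration at the bottom of this file):
//   globalId = 320017  ->  workgroup = globalId >> 6 = 5000, localId = 17
//   batchId           = 5000 % 128 = 8
//   globalIdOutput    = 5000 / 128 = 39
//   IntraFilterOffset = 39 % 9 = 3   ->  filterRow = 1, filterCol = 0
//   filter2Id         = 39 / 9 = 4   ->  outPlane = 1, upstreamPlane = 0
// i.e. thread 17 of workgroup 5000 accumulates the weight gradient of
// filter tap (1,0), outPlane 1, upstreamPlane 0, for image 8 of the batch.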
inline void AtomicAdd(volatile __global float *source, const float operand)
{
    // Compare-and-swap loop: OpenCL 1.x has no native float atomic add, so we
    // reinterpret the float bits as unsigned int and retry until the exchange succeeds.
    union {
        unsigned int intVal;
        float floatVal;
    } newVal;
    union {
        unsigned int intVal;
        float floatVal;
    } prevVal;
    do {
        prevVal.floatVal = *source;
        newVal.floatVal = prevVal.floatVal + operand;
    } while (atomic_cmpxchg((volatile __global unsigned int *)source,
                            prevVal.intVal, newVal.intVal) != prevVal.intVal);
}

//
// Global ID : [outPlane][inputPlane][filterRow][filterCol] * BatchSize * 64
// Per thread = (gOutputSize * gOutputSize) / 64
// workgroupsize = 64 threads == 1 image
// numWorkgroups = [outPlane][inputPlane][filterRow][filterCol] * BatchSize
// int globalSize = workgroupsize * numWorkgroups;
// Assuming gradWeights [outPlane][inputPlane][filterRow][filterCol] is cleared or preset,
// each workgroup issues one AtomicAdd, so one pass completes the gradient.
// Stage 1: calculate each pixel's contribution
// Stage 2: reduce the whole workgroup into 1 value
// Local LDS = 64;

#if 1
void __kernel test_kernel(
        __global const float *filter,
        __global const float *gradOutput,
        __global float *images,
        __global const float *dataBuf3,
        __global const float *dataBuf4,
#ifdef BIASED
        __global float *gradBiasWeights,
#else
        __global const float *dataBuf5,
#endif
        __global float *gradWeights,
        const int batchSize,
        const int gOutputSize,
        const int const3,
        const int const4,
        const int const5,
        const int const6)
#else
void __kernel backprop_floats_fast(
        __global const float *gradOutput,
        __global const float *images,
        __global float *gradWeights,
#ifdef BIASED
        __global float *gradBiasWeights,
#endif
        const float learningRateMultiplier,
        const int batchSize,
        const int gOutputSize)
#endif
{
    // The test_kernel signature carries no learningRateMultiplier argument, so it is hardcoded here.
    const float learningRateMultiplier = 0.0001f;
    int globalId = get_global_id(0);
    int localId = get_local_id(0);
    float thiswchange = 0;
    __local float thiswchanges[FIXED_WORKGROUPSIZE];
#ifdef BIASED
    float thisbiaschange = 0;
#endif

    // No invalid threads to mask off: globalSize is an exact multiple of FIXED_WORKGROUPSIZE (64).
    // if (globalId >= gNumFilters * gInputPlanes * gFilterSize * gFilterSize * batchSize * FIXED_WORKGROUPSIZE) {
    //     // Do nothing
    // }

    backprop_floats_fast_valid_thread(
        gradOutput,
        images,
        &thiswchange,
#ifdef BIASED
        &thisbiaschange,
#endif
        batchSize,
        gOutputSize,
        localId,
        globalId);

    // aggregate data to thread 0
    Reduction_of_Weights(
        &thiswchange,
        thiswchanges,
#ifdef BIASED
        &thisbiaschange,
#endif
        localId,
        globalId);

    // Thread 0 atomically adds its workgroup's partial sum into gradWeights.
    if (localId == 0) {
        int globalIdOutput = (globalId >> FIXED_WORKGROUPSIZE_SHIFT) / batchSize;
        // gradWeights: [outPlane][upstreamPlane][filterRow][filterCol]
        // aggregate over: [outRow][outCol][n]
        //gradWeights[globalIdOutput] = learningRateMultiplier * thiswchange;
        volatile __global float *source = gradWeights + globalIdOutput;
        AtomicAdd(source, learningRateMultiplier * thiswchange);
#ifdef BIASED
        int IntraFilterOffset = globalIdOutput % gFilterSizeSquared;
        int filterRow = IntraFilterOffset / gFilterSize;
        int filterCol = IntraFilterOffset % gFilterSize;
        int filter2Id = globalIdOutput / gFilterSizeSquared;
        int outPlane = filter2Id / gInputPlanes;
        int upstreamPlane = filter2Id % gInputPlanes;
        bool writeBias = upstreamPlane == 0 && filterRow == gMargin && filterCol == gMargin;
        if (writeBias) {
            //gradBiasWeights[outPlane] = learningRateMultiplier * thisbiaschange;
            source = gradBiasWeights + outPlane;
            AtomicAdd(source, learningRateMultiplier * thisbiaschange);
        }
#endif
    }
}
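/*
 * A minimal CPU reference sketch (not part of the original harness; names are
 * hypothetical) for validating gradWeights on small sizes. It mirrors the
 * indexing of backprop_floats_fast_valid_thread above, but writes each weight
 * directly instead of accumulating via AtomicAdd into a cleared buffer.
 *
 * void cpu_backprop_weights_reference(
 *         const float *gradOutput, const float *images, float *gradWeights,
 *         int batchSize, int gOutputSize)
 * {
 *     for (int outPlane = 0; outPlane < gNumFilters; outPlane++)
 *     for (int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++)
 *     for (int filterRow = 0; filterRow < gFilterSize; filterRow++)
 *     for (int filterCol = 0; filterCol < gFilterSize; filterCol++) {
 *         float sum = 0;
 *         for (int n = 0; n < batchSize; n++)
 *         for (int outRow = 0; outRow < gOutputSize; outRow++)
 *         for (int outCol = 0; outCol < gOutputSize; outCol++) {
 *             int upstreamRow = outRow - gMargin + filterRow;
 *             int upstreamCol = outCol - gMargin + filterCol;
 *             if (upstreamRow < 0 || upstreamCol < 0
 *                     || upstreamRow >= gInputSize || upstreamCol >= gInputSize) continue;
 *             float error = gradOutput[((n * gNumFilters + outPlane) * gOutputSize + outRow) * gOutputSize + outCol];
 *             float up = images[((n * gInputPlanes + upstreamPlane) * gInputSize + upstreamRow) * gInputSize + upstreamCol];
 *             sum += up * error;
 *         }
 *         int w = ((outPlane * gInputPlanes + upstreamPlane) * gFilterSize + filterRow) * gFilterSize + filterCol;
 *         gradWeights[w] = 0.0001f * sum;  // same hardcoded learningRateMultiplier as the kernel
 *     }
 * }
 */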
//CNNBench command line
//-dim 1 -lx 64 -ly 1 -gx 2359296 -gy 1 -f 3 -c1 128 -c2 128 -i 1 -x 4096 -y 4096
// Input 128x128, 4 planes, batchSize 128
// Output 128x128, 8 planes, batchSize 128
// Filter size = 3x3
// numWorkgroups = [outPlane][inputPlane][filterRow][filterCol] * BatchSize
// int globalSize = workgroupsize * numWorkgroups;
// globalSize = 64 * [8][4][3][3] * 128 = 2359296
//
// Input  = 128x128x4x128 batch = 4096 * 2048
// Output = 128x128x8x128 batch = 4096 * 4096
// force localthread_x = 64, globalthread_y = 1
// FilterSize = 3
// -constant1 = batchSize 128
// -constant2 = gOutputSize 128
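/*
 * A minimal host-side launch sketch (illustrative, not from the original
 * harness) matching the CNNBench numbers above. Error checking is omitted,
 * and "kernel", "queue", and the cl_mem buffers (filterBuf, ...) are assumed
 * to exist already; buffer args 0..6 follow the test_kernel parameter order.
 *
 *     size_t local = 64;                                         // FIXED_WORKGROUPSIZE
 *     size_t numWorkgroups = 8 * 4 * 3 * 3 * 128;                // [8][4][3][3] * batchSize
 *     size_t global = local * numWorkgroups;                     // 2359296, matches -gx
 *     cl_int batchSize = 128, outputSize = 128;
 *     clSetKernelArg(kernel, 0, sizeof(cl_mem), &filterBuf);     // ... args 1..6 likewise
 *     clSetKernelArg(kernel, 7, sizeof(cl_int), &batchSize);     // const int batchSize
 *     clSetKernelArg(kernel, 8, sizeof(cl_int), &outputSize);    // const int gOutputSize
 *     clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 */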