#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <curand.h>
#include <curand_kernel.h>

// Cutoff on the magnitude of beta * dE used when main() builds the flip
// probability table: below -LOG_INFINITY the probability is set to exactly 1,
// above +LOG_INFINITY to exactly 0, and exp() is only evaluated in between.
#define LOG_INFINITY 10

/*
 * Performs `iterCount` single-spin flip attempts per thread on an N x N
 * lattice with periodic (wrap-around) boundaries.
 *
 * Parameters:
 *   N         - lattice side length; the kernel returns immediately if N <= 0.
 *   S         - device array of N*N spins, row-major (index = N * row + col).
 *               Values are expected to be +1 / -1 so that the neighbour
 *               product `dS` stays within [-4, 4].
 *   sp        - device table of 9 flip probabilities, indexed by dS + 4.
 *   seed      - cuRAND seed shared by all threads; each thread uses its global
 *               1D thread index as the cuRAND sequence number.
 *   iterCount - number of flip attempts made by each thread.
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are used).
 *
 * NOTE: threads read and write S concurrently with no atomics or barriers, so
 * two threads may touch the same or neighbouring sites at the same time.
 * N*N must fit in an int, because site indices are computed in int.
 */
__global__
void isingModel(int N, char* S, float* sp, int seed, int iterCount ) {
  // Nothing to sample on an empty lattice; this also prevents `% 0` below.
  if (N <= 0) return;

  const int lastIdx = N - 1;         // index of the last row / column
  const int wrapRows = N * (N - 1);  // offset between the first and last row

  // Whether plain `char` is signed is implementation-defined. Access the spins
  // as `signed char` so a stored -1 always reads back as -1 (not 255), which
  // keeps dS + 4 inside the 9-entry `sp` table.
  signed char* spin = reinterpret_cast<signed char*>(S);

  curandState state;
  int thread_id = blockIdx.x*blockDim.x + threadIdx.x;
  curand_init(seed, thread_id, 0, &state);

  for (int iter = 0; iter < iterCount; iter++) {
    // Pick a random site (row x, column y).
    int x = curand(&state) % N;
    int y = curand(&state) % N;
    int pos = N * x + y;

    // Neighbour indices with periodic wrap-around at the lattice edges.
    int top    = (x == 0)       ? (pos + wrapRows) : (pos - N);
    int bottom = (x == lastIdx) ? (pos - wrapRows) : (pos + N);
    int left   = (y == 0)       ? (pos + lastIdx)  : (pos - 1);
    int right  = (y == lastIdx) ? (pos - lastIdx)  : (pos + 1);

    // Spin at the site times the sum of its four neighbours.
    int dS = spin[pos] * (spin[top] + spin[bottom] + spin[left] + spin[right]);

    // Flip the spin with the tabulated probability for this dS.
    if (curand_uniform(&state) < sp[dS + 4]) {
      spin[pos] = static_cast<signed char>(-spin[pos]);
    }
  }
}

// Terminates the program with a diagnostic when a CUDA runtime call fails.
#define ISING_CUDA_CHECK(call)                                               \
  do {                                                                       \
    cudaError_t ising_err_ = (call);                                         \
    if (ising_err_ != cudaSuccess) {                                         \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,       \
              cudaGetErrorString(ising_err_));                               \
      exit(EXIT_FAILURE);                                                    \
    }                                                                        \
  } while (0)

/*
 * Host driver for the Ising simulation.
 *
 * Command line: optional argv[1] = P, the number of GPU threads (one block).
 * stdin: first "N J" (lattice side, coupling constant), then any number of
 * "T iters repeats" triples. For each triple the kernel is launched `repeats`
 * times with iters / P flip attempts per thread (the remainder of the division
 * is dropped), and the average absolute magnetization and energy per site are
 * printed as "T<TAB>m<TAB>E" to both stdout and stderr.
 *
 * The lattice is randomized once and then carried over between triples.
 * Returns 0 on success, 1 on invalid input or allocation failure; a CUDA
 * failure terminates the process via ISING_CUDA_CHECK.
 */
int main(int argc, char** argv)
{
  int P = 100;
  if (argc>=2) P = atoi(argv[1]);

  int N=100;
  float J = 1.0;

  if (scanf("%d %f", &N, &J) != 2) {
    fprintf(stderr, "Failed to read lattice size N and coupling J from stdin\n");
    return 1;
  }

  fprintf(stderr, "N=%d; J=%f; P=%d\n", N, J, P);

  // P is used both as the block size and as a divisor of `iters`.
  if (P < 1) {
    fprintf(stderr, "Thread count P must be positive (got %d)\n", P);
    return 1;
  }
  // Site indices are computed in int on host and device, so N*N must fit.
  if (N < 1 || N > INT_MAX / N) {
    fprintf(stderr, "Lattice size N must satisfy 1 <= N*N <= %d (got N=%d)\n",
            INT_MAX, N);
    return 1;
  }

  const size_t cells = (size_t)N * N;
  const size_t latticeBytes = cells * sizeof(char);

  // Host spins are kept as signed char: plain char may be unsigned, in which
  // case a stored -1 would read back as 255 and corrupt the sums below.
  signed char *S = (signed char*)malloc(latticeBytes);
  if (S == NULL) {
    fprintf(stderr, "Failed to allocate %zu bytes for the lattice\n", latticeBytes);
    return 1;
  }
  float sp[9];  // flip probability for each dS in [-4, 4], indexed by dS + 4

  char *d_S;
  float *d_sp;
  ISING_CUDA_CHECK(cudaMalloc(&d_S, latticeBytes));
  ISING_CUDA_CHECK(cudaMalloc(&d_sp, sizeof(sp)));

  // Generating random lattice of +1 / -1 spins.
  for (size_t i = 0; i < cells; i++) {
    S[i] = (rand()%2*2)-1;
  }
  ISING_CUDA_CHECK(cudaMemcpy(d_S, S, latticeBytes, cudaMemcpyHostToDevice));

  float T;
  int iters;
  int repeats;

  while(scanf("%f%d%d", &T, &iters, &repeats)==3) {
    if (repeats <= 0) {
      // The averages below divide by `repeats`.
      fprintf(stderr, "Skipping T=%f: repeats must be positive (got %d)\n",
              T, repeats);
      continue;
    }

    // Calculating flip probabilities.
    for (int dS=-4;dS<=4;dS++) {
      // dS == 0 means no energy change at any temperature; handling it
      // explicitly avoids inf * 0 = NaN when T is 0.
      float beta_mul_dE = (dS == 0) ? 0.0f : (J/T) * 2 * dS;
      if (beta_mul_dE < -LOG_INFINITY) sp[dS+4] = 1;
      else if (beta_mul_dE > LOG_INFINITY) sp[dS+4] = 0;
      else sp[dS+4] = (1 / (exp(beta_mul_dE) + 1));
    }

    ISING_CUDA_CHECK(cudaMemcpy(d_sp, sp, sizeof(sp), cudaMemcpyHostToDevice));

    float avg_E_sum = 0;
    float m_sum = 0;
    for (int r = 0; r < repeats; r++) {
      // Do modelling on GPU.
      isingModel<<<1, P>>>(N, d_S, d_sp, rand(), iters / (P));
      // Launch-configuration errors (e.g. P above the device's block size
      // limit) are only reported through cudaGetLastError().
      ISING_CUDA_CHECK(cudaGetLastError());

      // Retrieving spins grid from GPU (blocks until the kernel has finished).
      ISING_CUDA_CHECK(cudaMemcpy(S, d_S, latticeBytes, cudaMemcpyDeviceToHost));

      // Calculating magnetization.
      long long spinSum = 0;
      for (size_t i = 0; i < cells; i++) {
        spinSum += S[i];
      }
      float magn = (1.0 * spinSum)/ (N*N);
      if (magn<0) {magn *= -1;}
      m_sum += magn;

      // Calculating energy: each horizontal and vertical bond is counted once,
      // including the wrap-around bonds. 64-bit because there are 2*N*N bonds.
      long long E_sum = 0;
      for (int i = 0; i < N; i++) {
        for (int j = 1; j < N; j++) {
          E_sum += S[i*N + (j-1)]*S[i*N + j];
          E_sum += S[(j-1)*N + i]*S[j*N + i];
        }
        E_sum += S[i*N + (N-1)]*S[i*N];
        E_sum += S[N*(N-1) + i]*S[i];
      }
      avg_E_sum += (1.0 * E_sum) / (N*N);
    }

    printf("%f\t%f\t%f\n", T, m_sum/repeats, -J*avg_E_sum/repeats);
    fprintf(stderr, "%f\t%f\t%f\n", T, m_sum/repeats, -J*avg_E_sum/repeats);
  }

  ISING_CUDA_CHECK(cudaFree(d_S));
  ISING_CUDA_CHECK(cudaFree(d_sp));
  free(S);
  return 0;
}
