#include <chrono>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <string>
#include <vector>

#include <cuda_runtime.h>

// Forward declaration of the host-side time-stepping routine; its definition is not visible in this
// part of the source.
// NOTE(review): presumably the definition allocates device memory and launches the CUDA kernels -
// confirm against the implementing source.
//
// Parameters, as used by main():
//   T0, T1                   - temperature fields with numberOfPoints entries each, passed by non-const
//                              reference. main() sets the boundary values on T1 before the call and reads
//                              T1 afterwards as the final field.
//   heatDiffusionCoefficient - diffusion coefficient used by main() to derive dt
//   dt, dx                   - time step and grid spacing
//   numberOfPoints           - number of grid points
//   numberOfTimeSteps        - number of time steps to run
void computeTemperature(std::vector<double> &T0, std::vector<double> &T1, double heatDiffusionCoefficient, double dt,
    double dx, int numberOfPoints, int numberOfTimeSteps);

int main(int argc, char* argv[]) {
    // make sure a CUDA capable device is available
    int device_count = 0;
    cudaError_t cudaStatus = cudaGetDeviceCount(&device_count);
    if (cudaStatus != cudaSuccess) {
        std::cerr << "cudaGetDeviceCount() error: " << cudaGetErrorString(cudaStatus) << std::endl;
        return 1;
    }
    std::cout << "Number of CUDA capable devices found: " << device_count << std::endl;

    // get the current device id
    int device_id = 0;
    cudaStatus = cudaGetDevice(&device_id);
    if (cudaStatus != cudaSuccess) {
        std::cerr << "cudaGetDevice() error: " << cudaGetErrorString(cudaStatus) << std::endl;
        return 1;
    }

    // use device id to print some information about the GPU that is being used
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device_id);

    std::cout << "Running on GPU " << device_id << ": " << prop.name << std::endl;
    std::cout << "Compute capability: " << prop.major << "." << prop.minor << std::endl;

    // input parameters
    const int numberOfPoints = static_cast<int>(atoi(argv[1]));
    const double leftBoundary = 0.0;
    const double rightBoundary = 1.0;
    const double CFL = 0.25;
    const double heatDiffusionCoefficient = 0.01;
    const double finalTime = static_cast<double>(atof(argv[2]));
    
    // computed input paramters
    const double dx = 1.0 / (numberOfPoints - 1);
    const double dt = CFL * dx * dx / heatDiffusionCoefficient;
    const int numberOfTimeSteps = static_cast<int>(finalTime / dt);

    // allocate memory for field arrays
    std::vector<double> T0(numberOfPoints); // T at time n
    std::vector<double> T1(numberOfPoints); // T at time n+1
    std::vector<double> x(numberOfPoints);

    // create mesh
    for (int i = 0; i < numberOfPoints; ++i) {
        x[i] = dx * i;
    }

    // initialise field arrays
    for (int i = 0; i < numberOfPoints; ++i) {
        T0[i] = 0.0;
        T1[i] = 0.0;
    }

    // set boundary conditions (important, set it for T1, not T0)
    T1[0] = leftBoundary;
    T1[numberOfPoints - 1] = rightBoundary;

    auto startTime = std::chrono::high_resolution_clock::now();
    
    // loop over all timesteps    
    computeTemperature(T0, T1, heatDiffusionCoefficient, dt, dx, numberOfPoints, numberOfTimeSteps);
    
    auto endTime = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime);
    std::cout << "Executation time: " << static_cast<double>(duration.count()) / 1000 << " seconds" << std::endl;
    
    // compute error
    double error = 0.0;
    for (int i = 0; i < numberOfPoints; ++i) {
        error += std::abs(T1[i] - x[i]);
    }
    error /= numberOfPoints;
    std::cout << "Error: " << error << std::endl;

    // // output results
    // auto finalTimeString = std::to_string(finalTime);
    // finalTimeString = finalTimeString.substr(0, finalTimeString.find("."));
    // std::string fileName = "results_CUDA_" + finalTimeString + ".csv";
    // std::ofstream file(fileName);
    // file << "x,T" << std::endl;
    // for (int i = 0; i < numberOfPoints; ++i) {
    //     file << x[i] << ", " << T1[i] << std::endl;
    // }
    // file.close();

    return 0;
}
