// NOTE(review): the header names were missing from the reviewed text; the four
// below are the standard headers required by the identifiers this file uses
// (printf, malloc/rand/exit, ceil-era math, clock_gettime). Confirm against
// the original file.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// 2D convolution kernel: one thread computes one output element.
//
// Launch layout: 2D grid of 2D blocks; the x dimension indexes columns
// (0..b-1) and the y dimension indexes rows (0..a-1). Threads that fall
// outside the a-by-b image do nothing, so the grid may overshoot.
//
// Parameters:
//   d_m       - input image, a rows by b columns, row-major, device memory
//   d_mask    - maskWidth by maskWidth mask, row-major, device memory
//   d_n       - output image, a rows by b columns, row-major, device memory
//   a, b      - number of rows / columns of the image
//   maskWidth - side length of the square mask
//
// Input elements that would lie outside the image are treated as zero
// (they are simply skipped). The inputs are declared __restrict__, so d_n
// must not alias d_m or d_mask.
__global__ void convolution_2D_Kernel(const float* __restrict__ d_m,
                                      const float* __restrict__ d_mask,
                                      float* d_n,
                                      size_t a, size_t b, size_t maskWidth)
{
    // Widen before multiplying so the global index cannot overflow 32 bits.
    const size_t col = (size_t) blockIdx.x * blockDim.x + threadIdx.x;
    const size_t row = (size_t) blockIdx.y * blockDim.y + threadIdx.y;

    // thread boundary check
    if(col >= b || row >= a)
    {
        return;
    }

    // Top-left corner of the mask footprint in image coordinates. These can
    // be negative near the border, so they are kept in a signed type instead
    // of relying on unsigned wraparound.
    const long long halo     = (long long) (maskWidth / 2);
    const long long firstRow = (long long) row - halo;
    const long long firstCol = (long long) col - halo;

    // Accumulate in a register. The output buffer comes straight from
    // cudaMalloc and is not zeroed, so it must be written, not added to.
    float acc = 0.0f;

    for(size_t k = 0; k < maskWidth; ++k)
    {
        const long long srcCol = firstCol + (long long) k;
        if(srcCol < 0 || srcCol >= (long long) b)
        {
            continue; // whole mask column lies outside the image
        }

        for(size_t l = 0; l < maskWidth; ++l)
        {
            const long long srcRow = firstRow + (long long) l;
            if(srcRow < 0 || srcRow >= (long long) a)
            {
                continue; // element lies outside the image
            }

            acc += d_m[(size_t) srcRow * b + (size_t) srcCol] * d_mask[l * maskWidth + k];
        }
    }

    d_n[row * b + col] = acc;
}

// CUDA error checking.
//
// Reads (and clears) the last CUDA runtime error; if there is one, prints it
// and terminates the process with EXIT_FAILURE.
//
// The message reports `line - 1`, so call sites in this file pass __LINE__
// from the line directly after the call being checked.
void errorCheck(unsigned int line)
{
    cudaError_t cudaError = cudaGetLastError();

    if(cudaError != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in line %u in file %s: %s\n", line - 1, __FILE__, cudaGetErrorString(cudaError));
        exit(EXIT_FAILURE);
    }
}

// Host function containing the kernel call.
//
// Copies the a-by-b image `m` and the maskWidth-by-maskWidth `mask` to the
// device, runs convolution_2D_Kernel, and copies the result into `n`.
// All three pointers are host buffers; `n` must hold a * b floats.
// Any CUDA error terminates the process via errorCheck.
void convolution_2D(float* m, float* mask, float* n, size_t a, size_t b, size_t maskWidth)
{
    // Nothing to compute for an empty image, and a grid with a zero
    // dimension would be an invalid launch configuration.
    if(a == 0 || b == 0)
    {
        return;
    }

    const unsigned int blockDimXY = 32;

    // Integer ceil-division: enough blocks to cover every column / row.
    dim3 numOfBlocks((unsigned int) ((b + blockDimXY - 1) / blockDimXY),
                     (unsigned int) ((a + blockDimXY - 1) / blockDimXY),
                     1);
    dim3 numOfThreads(blockDimXY, blockDimXY, 1);

    size_t bytes_m = a * b * sizeof(float);
    size_t bytes_mask = maskWidth * maskWidth * sizeof(float);
    size_t bytes_n = a * b * sizeof(float);

    float* d_m = NULL;
    float* d_mask = NULL;
    float* d_n = NULL;

    cudaMalloc((void**) &d_m, bytes_m);
    errorCheck(__LINE__);
    cudaMalloc((void**) &d_mask, bytes_mask);
    errorCheck(__LINE__);
    cudaMalloc((void**) &d_n, bytes_n);
    errorCheck(__LINE__);

    cudaMemcpy(d_m, m, bytes_m, cudaMemcpyHostToDevice);
    errorCheck(__LINE__);
    cudaMemcpy(d_mask, mask, bytes_mask, cudaMemcpyHostToDevice);
    errorCheck(__LINE__);

    convolution_2D_Kernel<<<numOfBlocks, numOfThreads>>>(d_m, d_mask, d_n, a, b, maskWidth);
    errorCheck(__LINE__);

    // The launch check above only catches configuration errors; wait for the
    // kernel so that execution faults are reported here rather than being
    // attributed to the copy below.
    cudaDeviceSynchronize();
    errorCheck(__LINE__);

    cudaMemcpy(n, d_n, bytes_n, cudaMemcpyDeviceToHost);
    errorCheck(__LINE__);

    cudaFree(d_m);
    errorCheck(__LINE__);
    cudaFree(d_mask);
    errorCheck(__LINE__);
    cudaFree(d_n);
    errorCheck(__LINE__);
}

// Benchmark driver: builds a random image (3840..4096 rows and columns) and a
// random 11x11 mask, runs the convolution once, and prints the wall-clock
// time of the whole host call (allocation, copies and kernel) in microseconds.
int main()
{
    struct timespec start, end;

    srand(time(NULL));

    size_t a = rand() % 257 + 3840;
    size_t b = rand() % 257 + 3840;
    size_t maskWidth = 11;

    float* m = (float*) malloc(a * b * sizeof(float));
    float* mask = (float*) malloc(maskWidth * maskWidth * sizeof(float));
    float* n = (float*) malloc(a * b * sizeof(float));

    if(m == NULL || mask == NULL || n == NULL)
    {
        fprintf(stderr, "Host memory allocation failed.\n");
        free(m);
        free(mask);
        free(n);
        return EXIT_FAILURE;
    }

    // Image values: integers in [-64, 64].
    for(size_t i = 0; i < a * b; ++i)
    {
        m[i] = (float) (rand() % 129 - 64);
    }

    // Mask values: multiples of 0.001 in [0, 1].
    for(size_t j = 0; j < maskWidth * maskWidth; ++j)
    {
        mask[j] = (float) (rand() % 1001) / 1000.0f;
    }

    // A monotonic clock is used for the interval so that wall-clock
    // adjustments during the run cannot distort the measurement.
    clock_gettime(CLOCK_MONOTONIC, &start);

    // do convolution
    convolution_2D(m, mask, n, a, b, maskWidth);

    clock_gettime(CLOCK_MONOTONIC, &end);

    long long execTime = (long long) (end.tv_sec - start.tv_sec) * 1000000LL
                       + (end.tv_nsec - start.tv_nsec) / 1000;

    printf("Execution time: %lld microseconds.", execTime);

    free(m);
    free(mask);
    free(n);

    return 0;
}