// NOTE(review): the header name after `#include` was lost in the source text;
// <stdio.h> is assumed because the program prints its results — confirm.
#include <stdio.h>
#include <stdlib.h>

// Evaluates a CUDA runtime call and terminates the program with a
// file/line-tagged message if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Writes the square of each element of d_in into d_out.
 *
 * Every thread handles exactly one element, selected by threadIdx.x alone.
 * blockIdx is not consulted and there is no bounds check, so the number of
 * threads per block must not exceed the number of elements in either buffer.
 *
 * d_out: device buffer receiving the squared values.
 * d_in:  device buffer holding the input values.
 */
__global__ void square(float* d_out, float* d_in) {
    int idx = threadIdx.x;
    float f = d_in[idx];
    d_out[idx] = f * f;
}

/*
 * Host driver: builds a small input array, squares it on the GPU with the
 * `square` kernel, copies the result back and prints it.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main(int argc, char** argv) {
    const int ARRAY_SIZE = 8;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // generate the input array on the host
    // NOTE(review): the loop body was lost in the source text; filling with
    // 0, 1, 2, ... is an assumption — confirm against the upstream file.
    float h_in[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++) {
        h_in[i] = float(i);
    }
    float h_out[ARRAY_SIZE];

    // NOTE(review): the declarations, allocations and host-to-device copy
    // below were missing from the source text. They are reconstructed from
    // the surviving uses of d_in, d_out, h_out and ARRAY_BYTES — confirm.

    // declare GPU memory pointers
    float* d_in = NULL;
    float* d_out = NULL;

    // allocate GPU memory
    CUDA_CHECK(cudaMalloc((void**)&d_in, ARRAY_BYTES));
    CUDA_CHECK(cudaMalloc((void**)&d_out, ARRAY_BYTES));

    // transfer the input array to the GPU
    CUDA_CHECK(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice));

    // launch the kernel: the kernel indexes by threadIdx.x only, so one
    // block with one thread per element covers the whole array.
    // NOTE(review): the launch configuration was garbled in the source
    // text; <<<1, ARRAY_SIZE>>> is an assumption — confirm.
    square<<<1, ARRAY_SIZE>>>(d_out, d_in);
    // Kernel launches return no status; fetch launch-configuration errors here.
    CUDA_CHECK(cudaGetLastError());

    // copy the result array back to the host (this blocking copy also
    // surfaces any error raised while the kernel was executing)
    CUDA_CHECK(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost));

    // print out the resulting array
    // NOTE(review): the source text is truncated inside this loop; the
    // output format (four tab-separated values per line) is an assumption.
    for (int i = 0; i < ARRAY_SIZE; i++) {
        printf("%f", h_out[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    // free GPU memory
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));

    return 0;
}