/// LSU EE 7700-2 (Spring 2013), GPU Microarchitecture
//

 /// Homework 3 - Solution
 //
 // Assignment in: http://www.ece.lsu.edu/koppel/gp/2013/hw03.pdf
 //
 /// Your Name: David M. Koppelman

#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <ctype.h>
#include <time.h>
#include <math.h>
#include <new>
#include <cuda_runtime.h>

#define N 4

 /// CUDA API Error-Checking Wrapper
 //
#define CE(call)                                                              \
 {                                                                            \
   const cudaError_t rv = call;                                               \
   if ( rv != cudaSuccess )                                                   \
     {                                                                        \
       printf("CUDA error %d, %s\n",rv,cudaGetErrorString(rv));               \
       exit(1);                                                               \
     }                                                                        \
 }

double
time_fp()
{
  struct timespec tp;
  clock_gettime(CLOCK_REALTIME,&tp);
  return ((double)tp.tv_sec)+((double)tp.tv_nsec) * 0.000000001;
}

// Make it easy to switch between float and double for vertex and matrix
// elements.
//
typedef float Elt_Type;

struct __align__(16) Vertex
{
  Elt_Type a[N];
};
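// The 16-byte alignment allows the compiler to move an entire Vertex
// (four floats here) with a single 128-bit load or store, e.g. when a
// Vertex is copied in the kernel, rather than four 4-byte accesses.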

struct App
{
  int num_threads;
  Elt_Type matrix[N][N];
  int array_size;  // Number of vertices.
  bool find_minimum_magnitude; // For problem 2.
  Vertex *v_in, *v_out;
  Vertex *d_v_in, *d_v_out;

  /// SOLUTION
  //
  //  Declare pointers to magnitudes (squared) and their indices.
  //  Each thread will write one element.  (If shared memory were
  //  used, each block would write one element; a sketch of that
  //  variant follows cuda_thread_start below.)
  //
  Elt_Type *thd_min_mag_sq;
  int *thd_min_mag_sq_idx;
  Elt_Type *d_thd_min_mag_sq;
  int *d_thd_min_mag_sq_idx;

};

// In host address space.
App app;

// In device constant address space.
__constant__ App d_app;

// The entry point for the GPU code.
//
__global__ void
cuda_thread_start()
{
  // Compute an id number that will be in the range from 0 to num_threads-1.
  //
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;

  // Number of elements that each thread should work on.  Elements left
  // over when array_size is not a multiple of num_threads are ignored.
  //
  const int elt_per_thread = d_app.array_size / d_app.num_threads;

  /// WARNING:
  //
  // The order in which the threads examine elements here is poorly
  // chosen and makes unnecessarily inefficient use of the memory
  // system.  A better ordering is sketched just below.
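
  // A better assignment (a sketch only, not used here) interleaves the
  // vertices among threads so that consecutive threads of a warp touch
  // consecutive elements of d_v_in and d_v_out:
  //
  //   for ( int h = tid;  h < d_app.array_size;  h += d_app.num_threads )
  //     { /* same loop body as below */ }
  //
  // With that ordering a warp's 16-byte Vertex accesses are contiguous
  // and can be coalesced into a few wide memory transactions.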

  // Compute element number to start at.
  //
  const int start = elt_per_thread * tid;

  // Compute element number to stop at.
  //
  const int stop = start + elt_per_thread;

  int min_mag_idx = -1;
  Elt_Type min_sum_of_sq = 0;

  // WARNING: This code accesses elements in an inefficient order.
  for ( int h=start; h<stop; h++ )
    {
      Vertex p = d_app.d_v_in[h];
      Vertex q;
      for ( int i=0; i<N; i++ )
        {
          q.a[i] = 0;
          for ( int j=0; j<N; j++ ) q.a[i] += d_app.matrix[i][j] * p.a[j];
        }
      d_app.d_v_out[h] = q;

      /// SOLUTION
      //
      //  Compute the magnitude squared of q and update the "min"
      //  variables if it is the smallest seen so far.
      //
      Elt_Type sum_of_sq = 0;
      for ( int i=0; i<N; i++ ) sum_of_sq += q.a[i] * q.a[i];
      if ( h == start || sum_of_sq < min_sum_of_sq )
        {
          min_sum_of_sq = sum_of_sq;
          min_mag_idx = h;
        }
    }

  /// SOLUTION
  //
  //  Write the thread's minimum value and its index to global memory.
  //
  d_app.d_thd_min_mag_sq[tid] = min_sum_of_sq;
  d_app.d_thd_min_mag_sq_idx[tid] = min_mag_idx;
}
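
 /// SOLUTION -- Shared-Memory Alternative (Sketch, Not Launched)
 //
 //  The App comments above mention a variant in which each block,
 //  rather than each thread, writes one minimum.  The kernel below is
 //  a minimal sketch of that idea; it is never launched by this
 //  program.  It assumes blockDim.x is a power of two no larger than
 //  1024 and, for brevity, reuses the per-thread arrays indexed by
 //  block number; a real version would allocate arrays sized to the
 //  number of blocks and have the CPU reduce over blocks rather than
 //  threads.
 //
__global__ void
cuda_thread_start_shared_sketch()
{
  __shared__ Elt_Type s_min[1024];
  __shared__ int s_idx[1024];

  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  const int elt_per_thread = d_app.array_size / d_app.num_threads;
  const int start = elt_per_thread * tid;
  const int stop = start + elt_per_thread;

  int min_mag_idx = -1;
  Elt_Type min_sum_of_sq = 0;

  // Per-thread minimum, computed as in cuda_thread_start.
  //
  for ( int h=start; h<stop; h++ )
    {
      Vertex p = d_app.d_v_in[h];
      Vertex q;
      Elt_Type sum_of_sq = 0;
      for ( int i=0; i<N; i++ )
        {
          q.a[i] = 0;
          for ( int j=0; j<N; j++ ) q.a[i] += d_app.matrix[i][j] * p.a[j];
          sum_of_sq += q.a[i] * q.a[i];
        }
      d_app.d_v_out[h] = q;
      if ( h == start || sum_of_sq < min_sum_of_sq )
        { min_sum_of_sq = sum_of_sq; min_mag_idx = h; }
    }

  // Each thread deposits its minimum in shared memory ...
  //
  s_min[threadIdx.x] = min_sum_of_sq;
  s_idx[threadIdx.x] = min_mag_idx;

  // ... then a tree reduction leaves the block minimum in element 0.
  //
  for ( int dist = blockDim.x >> 1; dist > 0; dist >>= 1 )
    {
      __syncthreads();
      if ( threadIdx.x < dist
           && s_min[threadIdx.x + dist] < s_min[threadIdx.x] )
        {
          s_min[threadIdx.x] = s_min[threadIdx.x + dist];
          s_idx[threadIdx.x] = s_idx[threadIdx.x + dist];
        }
    }

  // One global-memory write per block.
  //
  if ( threadIdx.x == 0 )
    {
      d_app.d_thd_min_mag_sq[blockIdx.x] = s_min[0];
      d_app.d_thd_min_mag_sq_idx[blockIdx.x] = s_idx[0];
    }
}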

void
print_gpu_info()
{
  // Get information about GPU and its ability to run CUDA.
  //
  int device_count;
  cudaGetDeviceCount(&device_count); // Get number of GPUs.
  if ( device_count == 0 )
    {
      fprintf(stderr,"No GPU found, exiting.\n");
      exit(1);
    }

  int dev = 0;
  CE(cudaGetDevice(&dev));
  printf("Using GPU %d\n",dev);

  cudaDeviceProp cuda_prop;  // Properties of cuda device (GPU, cuda version).

  /// Print information about the available GPUs.
  //
  {
    CE(cudaGetDeviceProperties(&cuda_prop,dev));
    printf
      ("GPU %d: %s @ %.2f GHz WITH %d MiB GLOBAL MEM\n",
       dev, cuda_prop.name, cuda_prop.clockRate/1e6,
       int(cuda_prop.totalGlobalMem >> 20));

    printf
      ("GPU %d: CC: %d.%d  MP: %2d  TH/WP: %3d  TH/BL: %4d\n",
       dev, cuda_prop.major, cuda_prop.minor,
       cuda_prop.multiProcessorCount,
       cuda_prop.warpSize,
       cuda_prop.maxThreadsPerBlock
       );

    printf
      ("GPU %d: SHARED: %5d  CONST: %5d  # REGS: %5d\n",
       dev,
       int(cuda_prop.sharedMemPerBlock), int(cuda_prop.totalConstMem),
       cuda_prop.regsPerBlock
       );
  }

  cudaFuncAttributes cfa_prob1; // Properties of code to run on device.
  CE( cudaFuncGetAttributes(&cfa_prob1,cuda_thread_start) );

  // Print information about time_step routine.
  //
  printf("\nCUDA Routine Resource Usage:\n");
  printf(" Our CUDA Thread: %6zd shared, %zd const, %zd loc, %d regs; "
         "%d max threads per block.\n",
         cfa_prob1.sharedSizeBytes,
         cfa_prob1.constSizeBytes,
         cfa_prob1.localSizeBytes,
         cfa_prob1.numRegs,
         cfa_prob1.maxThreadsPerBlock);
}

void*
pt_thread_start(void *arg)
{
  const int tid = (ptrdiff_t) arg;
  printf("Hello from %d\n",tid);
  const int elt_per_thread = app.array_size / app.num_threads;
  const int start = elt_per_thread * tid;
  const int stop = start + elt_per_thread;

  for ( int h=start; h<stop; h++ )
    {
      Vertex p = app.v_in[h];
      Vertex q;
      for ( int i=0; i<N; i++ )
        {
          q.a[i] = 0;
          for ( int j=0; j<N; j++ ) q.a[i] += app.matrix[i][j] * p.a[j];
        }
      app.v_out[h] = q;
    }

  return NULL;
}

int
main(int argc, char **argv)
{
  // Examine argument 1, the number of blocks (or pthreads).  A negative
  // value also requests the Problem 2 minimum-magnitude search.
  //
  const int arg1_int = argc < 2 ? 1 : atoi(argv[1]);
  const bool find_mag = arg1_int < 0;
  const int num_blocks = abs(arg1_int);

  const bool use_pthreads = false;

  // For Problem 2.
  app.find_minimum_magnitude = find_mag;

  // Examine argument 2, number of threads per block.
  //
  const int thd_per_block = argc < 3 ? 1 : atoi(argv[2]);
  app.num_threads = use_pthreads ? num_blocks : num_blocks * thd_per_block;

  // Examine argument 3, the number of vertices in units of Mi (2^20).
  // Fractional values okay.
  //
  app.array_size = argc < 4 ? 1 << 20 : int( atof(argv[3]) * (1<<20) );

  if ( app.num_threads <= 0 || app.array_size <= 0 )
    {
      printf("Usage: %s [ NUM_PTHREADS | - NUM_CUDA_BLOCKS ] [THD_PER_BLOCK] [DATA_SIZE_MiB]\n",
             argv[0]);
      exit(1);
    }
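
  // Example invocation (the executable name depends on how this file is
  // built):
  //
  //   ./hw3 -8 256 4
  //
  // launches 8 blocks of 256 threads on 4 Mi (2^22) vertices; the
  // negative first argument also enables the Problem 2 minimum-magnitude
  // search.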

  if ( !use_pthreads )
    print_gpu_info();

  const int array_size_bytes = app.array_size * sizeof(app.v_in[0]);

  // Allocate storage for CPU copy of data.
  //
  app.v_in = new Vertex[app.array_size];
  app.v_out = new Vertex[app.array_size];

  // Allocate storage for GPU copy of data.
  //
  CE( cudaMalloc( &app.d_v_in,  app.array_size * sizeof(Vertex) ) );
  CE( cudaMalloc( &app.d_v_out, app.array_size * sizeof(Vertex) ) );

  /// SOLUTION
  //
  //  Allocate storage on CPU and GPU for the minimum magnitude (sq) and
  //  its index.
  //
  app.thd_min_mag_sq_idx = new int[app.num_threads];
  app.thd_min_mag_sq = new Elt_Type[app.num_threads];
  CE( cudaMalloc( &app.d_thd_min_mag_sq_idx, app.num_threads * sizeof(int) ) );
  CE( cudaMalloc( &app.d_thd_min_mag_sq, app.num_threads * sizeof(Elt_Type) ) );

  printf
    ("\nPreparing for %d %s threads operating on %d vectors of %d elements.\n",
         app.num_threads,
         use_pthreads ? "CPU" : "GPU",
         app.array_size, N);

  // Initialize input array.
  //
  for ( int i=0; i<app.array_size; i++ )
    for ( int j=0; j<N; j++ ) app.v_in[i].a[j] = drand48();

  // Initialize transformation matrix.
  //
  for ( int i=0; i<N; i++ )
    for ( int j=0; j<N; j++ )
      app.matrix[i][j] = drand48();

  double elapsed_time_s;
  int minimum_mag_index = 0;     // For Problem 2.
  Elt_Type minimum_mag_val = 0;  // For Problem 2.

  if ( use_pthreads )
    {
      const double time_start = time_fp();

      // Allocate a structure to hold pthread thread ids.
      //
      pthread_t* const ptid = new pthread_t[app.num_threads];

      // Set up a pthread attribute, used for specifying options.
      //
      pthread_attr_t attr;
      pthread_attr_init(&attr);
      pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM);

      // Launch the threads.
      //
      for ( int i=0; i<app.num_threads; i++ )
        pthread_create(&ptid[i], &attr, pt_thread_start, (void*)(ptrdiff_t)i);

      // Wait for each thread to finish.
      //
      for ( int i=0; i<app.num_threads; i++ )
        pthread_join( ptid[i], NULL );

      elapsed_time_s = time_fp() - time_start;
    }
  else
    {
      // Prepare events used for timing.
      //
      cudaEvent_t gpu_start_ce, gpu_stop_ce;
      CE(cudaEventCreate(&gpu_start_ce));
      CE(cudaEventCreate(&gpu_stop_ce));

      // Copy input array from CPU to GPU.
      //
      CE( cudaMemcpy
          ( app.d_v_in, app.v_in, array_size_bytes, cudaMemcpyHostToDevice ) );

      // Copy App structure to GPU.
      //
      CE( cudaMemcpyToSymbol
          ( d_app, &app, sizeof(app), 0, cudaMemcpyHostToDevice ) );

      // Measure execution time starting "now", which is after the input
      // data has been sent to the GPU.
      //
      CE(cudaEventRecord(gpu_start_ce,0));

      printf("Launching with %d blocks of %d threads.\n",
             num_blocks, thd_per_block);

      // Tell CUDA to start our threads on the GPU.
      //
      cuda_thread_start<<<num_blocks,thd_per_block>>>();

      // Stop measuring execution time now, which is before the output
      // data is returned from the GPU.
      //
      CE(cudaEventRecord(gpu_stop_ce,0));
      CE(cudaEventSynchronize(gpu_stop_ce));
      float cuda_time_ms = -1.1;
      CE(cudaEventElapsedTime(&cuda_time_ms,gpu_start_ce,gpu_stop_ce));
      elapsed_time_s = cuda_time_ms * 0.001;

      // Copy output array from GPU to CPU.
      //
      CE( cudaMemcpy
          ( app.v_out, app.d_v_out, array_size_bytes, cudaMemcpyDeviceToHost) );


      // PROBLEM 2
      //
      // Insert code for reading magnitude information and having
      // CPU finish up finding the minimum.

      
      /// SOLUTION
      //
      //  Copy back per-thread minimum magnitudes (squared) and their indices.
      //
      CE( cudaMemcpy
          ( app.thd_min_mag_sq, app.d_thd_min_mag_sq,
            sizeof(Elt_Type) * app.num_threads, cudaMemcpyDeviceToHost) );
      CE( cudaMemcpy
          ( app.thd_min_mag_sq_idx, app.d_thd_min_mag_sq_idx,
            sizeof(int) * app.num_threads, cudaMemcpyDeviceToHost) );

      // Find the minimum magnitude squared and its index.
      //
      Elt_Type min_sos = app.thd_min_mag_sq[0];
      minimum_mag_index = app.thd_min_mag_sq_idx[0];
      for ( int i=1; i<app.num_threads; i++ )
        if ( app.thd_min_mag_sq[i] < min_sos )
          {
            min_sos = app.thd_min_mag_sq[i];
            minimum_mag_index = app.thd_min_mag_sq_idx[i];
          }
      
      // Take the square root to get the magnitude.
      //
      minimum_mag_val = sqrt(min_sos);
    }

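  // Bytes moved (each vertex is read once and written once) and
  // floating-point operations performed (N*N multiplies plus N*(N-1)
  // adds per matrix-vector product).
  //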
  const double data_size = app.array_size * sizeof(Vertex) * 2;
  const double fp_op_count = app.array_size * ( 2 * N * N - N  );

  printf("Elapsed time for %d threads and %d elements is %.3f µs\n",
         app.num_threads, app.array_size, 1e6 * elapsed_time_s);
  printf("Rate %.3f GFLOPS,  %.3f GB/s\n",
         1e-9 * fp_op_count / elapsed_time_s,
         1e-9 * data_size / elapsed_time_s);

  if ( app.find_minimum_magnitude )
    {
      // Compute correct answer.
      Elt_Type min_val = 0;
      int min_idx = -1;

      for ( int h=0; h<app.array_size; h++ )
        {
          Vertex p = app.v_in[h];
          Vertex q;
          for ( int i=0; i<N; i++ )
            {
              q.a[i] = 0;
              for ( int j=0; j<N; j++ ) q.a[i] += app.matrix[i][j] * p.a[j];
            }
          Elt_Type sos = 0;
          for ( int i=0; i<N; i++ ) sos += q.a[i]*q.a[i];
          Elt_Type mag = sqrt(sos);
          if ( min_idx < 0 || mag < min_val ) { min_val = mag; min_idx = h; }
        }
      Elt_Type diff = fabs(min_val-minimum_mag_val);
      printf
        ("\nMinimum mag is %s,  %d %s %d (correct)  %.4f %s %.4f (correct)\n",
         diff < 1e-5 ? "correct" : "**wrong**",
         minimum_mag_index,
         min_idx == minimum_mag_index ? "==" : "!=",
         min_idx,
         minimum_mag_val,
         min_val == minimum_mag_val ? "==" : diff < 1e-5 ? "~" : "!=",
         min_val
         );

    }

}