/// LSU EE 4702-1 (Fall 2015), GPU Programming
//

 /// Shared memory CUDA Example, without LSU ECE helper classes.

/// References
//
//  CUDA C Programming Guide (Section B.12 covers atomic functions):
//  https://docs.nvidia.com/cuda/cuda-c-programming-guide/
//

#if 0
/// Background

 /// Shared Memory
 //
 //  An address space that's shared amongst threads in a block.
 //    Members of a block can load values that other block members wrote.
 //
 //  The maximum amount of shared memory is 48 kiB per block in Kepler
 //  and Maxwell devices.
 //
 //  A variable is assigned to shared memory if it is declared using
 //  the __shared__ qualifier.
 //
 //  :Example: Declaration examples.

__shared__ int amount;
__shared__ float4 forces[12];

 /// Shared Memory Uses
 //
 //  Communication between threads.
 //  Caching of global memory
 //    (copying data to a place where it can be accessed quickly);
 //    see the sketch below.
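
 //  :Example: A minimal caching sketch; d_in here is a hypothetical
 //  device array.  Each thread copies one element into shared memory;
 //  after the barrier, any thread in the block can read any of the
 //  block's elements without another global memory access.

__shared__ float cache[1024];

cache[threadIdx.x] = d_in[threadIdx.x + blockIdx.x * blockDim.x];

__syncthreads();

// Read an element loaded by a different thread, at shared-memory speed.
float neighbor = cache[threadIdx.x ^ 1];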


  /// Atomic Operations
  //
  //  :Def: Atomic Operation
  //        An operation that appears to be either ..
  //        .. completely finished or ..
  //        .. not yet started.
  //        An atomic operation NEVER appears to be partially done.

  //  :Example:
  //
  //  The following "+=" operations are NOT atomic.
  //
  __shared__ int sum;

  if ( threadIdx.x == 0 )  sum = 0;
  if ( threadIdx.x == 40 ) sum += 40;
  if ( threadIdx.x == 70 ) sum += 70;
  //
  //  We need atomic operations to correctly perform the additions above.

  /// CUDA C Atomic Operations
  //
  //  Reference: CUDA C Programming Guide B.12
  //
  /// oldval = atomicAdd( valAddress, amount )
  //  Atomically add amount to *valAddress, return old value.
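
  //  :Example: The additions from the example above, now performed
  //  atomically.  A minimal sketch: the barrier is still needed so
  //  that no thread adds to sum before thread 0 has initialized it.
  //
  if ( threadIdx.x == 0 )  sum = 0;
  __syncthreads();
  if ( threadIdx.x == 40 ) atomicAdd( &sum, 40 );
  if ( threadIdx.x == 70 ) atomicAdd( &sum, 70 );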


 /// Coordinating Readers and Writers
 //
 //  Threads that read shared memory must wait until the writers have
 //  finished.  Use __syncthreads(), a block-wide barrier: no thread
 //  proceeds past it until every thread in the block has reached it.


 __syncthreads();

 atomicAdd(POINTER, AMT);
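
 //  :Example: A minimal sketch of reader/writer coordination, assuming
 //  one writer (thread 0) and many readers.
 //
 __shared__ int val;

 if ( threadIdx.x == 0 ) val = 123;

 __syncthreads();   // Readers wait here until the writer has finished.

 int x = val;       // Safe: every thread reads 123.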

#endif


#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <ctype.h>
#include <time.h>
#include <math.h>
#include <new>

#include <cuda_runtime.h>

 /// CUDA API Error-Checking Wrapper
///
#define CE(call)                                                              \
 {                                                                            \
   const cudaError_t rv = call;                                               \
   if ( rv != cudaSuccess )                                                   \
     {                                                                        \
       printf("CUDA error %d, %s\n",rv,cudaGetErrorString(rv));               \
       exit(1);                                                               \
     }                                                                        \
 }

double
time_fp()
{
  struct timespec tp;
  clock_gettime(CLOCK_REALTIME,&tp);
  return ((double)tp.tv_sec)+((double)tp.tv_nsec) * 0.000000001;
}

#if 0

__global__ void
cuda_thread_super_simple(int *output_data, int *input_data)
{
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int my_element = input_data[tid];

  /// Reasonable use of shared memory.
  //
  //  Make thread 12's element available to all threads in the block.

  __shared__ int a;

  if ( threadIdx.x == 12 ) a = my_element;

  __syncthreads();

  output_data[tid] = my_element + a;


  /// Bad use of shared memory.
  //
  //  Everyone writes trouble.

  __shared__ int trouble;

  trouble = my_element;

  __syncthreads();

  // All threads read the same value: the one written by whichever
  // thread got there last.

  output_data[tid] = my_element + trouble;


  /// Reasonable use of shared memory.
  //
  //  Share your array element with your neighbor.

  __shared__ int our_data[1024];

  our_data[threadIdx.x] = my_element;

  __syncthreads();

  output_data[tid] = my_element + our_data[threadIdx.x ^ 1];


}
#endif


struct Vector
{
  float a[4];
};

struct App
{
  int num_threads;
  int array_size;
  Vector *v_in;
  float *m_out;
  Vector *d_v_in;
  float *d_m_out;
  float *block_mag_sum;
  float *d_block_mag_sum;
};

// In host address space.
App app;

// In device constant address space.
__constant__ App d_app;

///
/// GPU Code (Kernel)
///

__global__ void
cuda_thread_start()
{
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;

  // Shared array, one element for each member of the block (1024 is
  // the maximum block size).
  __shared__ float our_mag_sums[1024];

  float my_mag_sum = 0;

  for ( int h=tid; h<d_app.array_size; h += d_app.num_threads )
    {
      Vector p = d_app.d_v_in[h];
      float sos = 0;

      for ( int i=0; i<4; i++ ) sos += p.a[i] * p.a[i];

      const float mag = sqrtf( sos );

      // Write magnitude to global memory.
      d_app.d_m_out[h] = mag;

      // Compute this thread's magnitude sum.
      my_mag_sum += mag;
    }

  // Save this thread's magnitude sum in shared memory.
  //
  our_mag_sums[threadIdx.x] = my_mag_sum;

  // Wait for all threads to do this.
  //
  __syncthreads();

  // All but the first warp are finished.
  //
  if ( threadIdx.x >= 32 ) return;

  // Threads in first warp (first 32) each compute sum for their lane.
  //
  float lane_mag_sum = 0;
  for ( int i=threadIdx.x; i<blockDim.x; i+=32 )
    lane_mag_sum += our_mag_sums[i];

  // Save the sum for this lane in shared memory.
  //
  our_mag_sums[threadIdx.x] = lane_mag_sum;

  // Have just thread 0 finish up.
  //
  if ( threadIdx.x != 0 ) return;

  // Compute the sum of the 32 per-lane sums.
  //
  float block_mag_sum = 0;
  for ( int i=0; i<32; i++ ) block_mag_sum += our_mag_sums[i];

  // Save this sum to global memory.  The CPU will add up the per-block sums.
  //
  d_app.d_block_mag_sum[blockIdx.x] = block_mag_sum;
}


///
/// Collect Information About GPU and Code
///

// Info about a specific kernel.
//
struct Kernel_Info {
  void (*func_ptr)();           // Pointer to kernel function.
  const char *name;             // ASCII version of kernel name.
  cudaFuncAttributes cfa;       // Kernel attributes reported by CUDA.
};

// Info about GPU and each kernel.
//
struct GPU_Info {
  double bw_Bps;
  static const int num_kernels_max = 4;
  int num_kernels;
  Kernel_Info ki[num_kernels_max];
};

GPU_Info gpu_info;

void
cuda_init()
{
  // Get information about GPU and its ability to run CUDA.
  //
  int device_count;
  cudaGetDeviceCount(&device_count); // Get number of GPUs.
  if ( device_count == 0 )
    {
      fprintf(stderr,"No GPU found, exiting.\n");
      exit(1);
    }

  cudaDeviceProp cuda_prop;  // Properties of cuda device (GPU, cuda version).

  /// Print information about the available GPUs.
  //
  for ( int dev=0; dev<device_count; dev++ )
    {
      CE(cudaGetDeviceProperties(&cuda_prop,dev));
      printf
        ("GPU %d: %s @ %.2f GHz WITH %d MiB GLOBAL MEM\n",
         dev, cuda_prop.name, cuda_prop.clockRate/1e6,
         int(cuda_prop.totalGlobalMem >> 20));

      const int cc_per_mp =
        cuda_prop.major == 1 ? 8 :
        cuda_prop.major == 2 ? ( cuda_prop.minor == 0 ? 32 : 48 ) :
        cuda_prop.major == 3 ? 192 :
        cuda_prop.major == 5 ? 128 : 0;

      // Off-chip bandwidth: two transfers per cycle (DDR), memory clock
      // given in kHz, bus width converted from bits to bytes.
      const double chip_bw_Bps = gpu_info.bw_Bps =
        2 * cuda_prop.memoryClockRate * 1000.0
        * ( cuda_prop.memoryBusWidth >> 3 );
      const double chip_sp_flops =
        1000.0 * cc_per_mp * cuda_prop.clockRate
        * cuda_prop.multiProcessorCount;

      printf
        ("GPU %d: CC: %d.%d  MP: %2d  CC/MP: %3d  TH/BL: %4d\n",
         dev, cuda_prop.major, cuda_prop.minor,
         cuda_prop.multiProcessorCount,
         cc_per_mp,
         cuda_prop.maxThreadsPerBlock);

      printf
        ("GPU %d: SHARED: %5d B  CONST: %5d B  # REGS: %5d\n",
         dev,
         int(cuda_prop.sharedMemPerBlock), int(cuda_prop.totalConstMem),
         cuda_prop.regsPerBlock);

      printf
        ("GPU %d: L2: %d kiB   MEM to L2: %.1f GB/s  SP %.1f GFLOPS  "
         "OP/ELT %.2f\n",
         dev,
         cuda_prop.l2CacheSize >> 10,
         chip_bw_Bps * 1e-9,
         chip_sp_flops * 1e-9,
         4 * chip_sp_flops / chip_bw_Bps);

    }

  // Choose GPU 0 because we don't have time to provide a way to let
  // the user choose.
  //
  int dev = 0;
  CE(cudaSetDevice(dev));
  printf("Using GPU %d\n",dev);

  gpu_info.num_kernels = 0;

#define GET_INFO(proc_name) {                                                 \
  const int idx = gpu_info.num_kernels++;                                     \
  if ( idx < gpu_info.num_kernels_max ) {                                     \
    gpu_info.ki[idx].name = #proc_name;                                       \
    gpu_info.ki[idx].func_ptr = (void(*)())proc_name;                         \
    CE( cudaFuncGetAttributes( &gpu_info.ki[idx].cfa, proc_name ) );          \
  }}

  GET_INFO(cuda_thread_start);

#undef GET_INFO

  // Print resource usage of each kernel.
  //
  printf("\nCUDA Routine Resource Usage:\n");

  for ( int i=0; i<gpu_info.num_kernels; i++ )
    {
      printf("For %s:\n", gpu_info.ki[i].name);
      printf("  %6zd shared, %zd const, %zd loc, %d regs; "
             "%d max threads per block.\n",
             gpu_info.ki[i].cfa.sharedSizeBytes,
             gpu_info.ki[i].cfa.constSizeBytes,
             gpu_info.ki[i].cfa.localSizeBytes,
             gpu_info.ki[i].cfa.numRegs,
             gpu_info.ki[i].cfa.maxThreadsPerBlock);
    }

  printf("\n");
}

///
/// Main Routine
///

int
main(int argc, char **argv)
{
  const int threads_per_block = argc < 2 ? 1 : atoi(argv[1]);
  const int blocks_per_grid = argc < 3 ? 1 : atoi(argv[2]);
  app.num_threads = threads_per_block * blocks_per_grid;

  app.array_size = argc < 4 ? 1 << 20 : int( atof(argv[3]) * (1<<20) );

  const int array_size_bytes = app.array_size * sizeof(app.v_in[0]);
  const int out_array_size_bytes = app.array_size * sizeof(app.m_out[0]);
  const int block_mag_sum_bytes =
    blocks_per_grid * sizeof(app.block_mag_sum[0]);

  // Print information about the GPU and kernel, but only when run
  // without arguments.
  //
  if ( argc < 2 ) cuda_init();

  // Allocate storage for CPU copy of data.
  //
  app.v_in = new Vector[app.array_size];
  app.m_out = new float[app.array_size];
  app.block_mag_sum = new float[blocks_per_grid];

  // Allocate storage for GPU copy of data.
  //
  CE( cudaMalloc( &app.d_v_in,  array_size_bytes     ) );
  CE( cudaMalloc( &app.d_m_out, out_array_size_bytes ) );
  CE( cudaMalloc( &app.d_block_mag_sum, block_mag_sum_bytes ) );

  printf("Preparing for %d threads %d elements using %d blocks of size %d.\n",
         app.num_threads, app.array_size,
         blocks_per_grid, threads_per_block);

  // Initialize input array.
  //
  for ( int i=0; i<app.array_size; i++ )
    for ( int j=0; j<4; j++ ) app.v_in[i].a[j] = drand48();

  const double time_start = time_fp();

  // Copy input array from CPU to GPU.
  //
  CE( cudaMemcpy
      ( app.d_v_in, app.v_in, array_size_bytes, cudaMemcpyHostToDevice ) );

  // Copy App structure to GPU.
  //
  CE( cudaMemcpyToSymbol
      ( d_app, &app, sizeof(app), 0, cudaMemcpyHostToDevice ) );

  /// Launch Kernel
  cuda_thread_start<<<blocks_per_grid,threads_per_block>>>();

  // Copy output arrays from GPU to CPU.
  //
  CE( cudaMemcpy
      ( app.m_out, app.d_m_out, out_array_size_bytes, cudaMemcpyDeviceToHost) );
  CE( cudaMemcpy
      ( app.block_mag_sum, app.d_block_mag_sum, block_mag_sum_bytes,
        cudaMemcpyDeviceToHost) );

  float mag_sum = 0;
  for ( int i=0; i<blocks_per_grid; i++ )
    mag_sum += app.block_mag_sum[i];

  // Amount of data moved to or from off-chip memory: each element's
  // Vector is read and its magnitude (a float) is written.
  const double data_size = app.array_size * ( sizeof(Vector) + sizeof(float) );
  // Rough estimate of floating-point operations per element.
  const double fp_op_count = app.array_size * 5;
  const double elapsed_time = time_fp() - time_start;

  float mag_sum_check = 0;
  for ( int i=0; i<app.array_size; i++ )
    mag_sum_check += app.m_out[i];

  const float mag_avg_check = mag_sum_check / app.array_size;
  const float mag_avg = mag_sum / app.array_size;

  if ( fabs(mag_avg_check-mag_avg) > 0.00001 )
    printf("** Averages don't check %.7f != %.7f (cpu)\n",
           mag_avg, mag_avg_check);

  printf("Elapsed time for %d threads and %d elements is %.3f µs\n",
         app.num_threads, app.array_size, 1e6 * elapsed_time);

  printf("Rate %.3f GFLOPS,  %.3f GB/s\n",
         1e-9 * fp_op_count / elapsed_time,
         1e-9 * data_size / elapsed_time);

}