/// LSU EE 4702-1 (Fall 2015), GPU Programming
//

 /// Simple CUDA Example, without LSU ECE helper classes.

/// References
//
//

#if 0
/// Background

//  The following background describes the view of NVIDIA GPUs
//  provided by the CUDA system.  "View" means how the hardware
//  appears to the application programmer.  For CUDA, this should
//  be very close to the actual hardware.

 /// Compute Capability (CC)
//
//   An NVIDIA numbering system that identifies the approximate
//   capabilities of the hardware.
//
//   Compute Capabilities
//
//   -- CC 1.0, 1.1, 1.2, 1.3
//      Tesla. (Not to be confused with the Tesla board.)
//      Now considered obsolete.
//
//   -- CC 2.0, 2.1
//      Fermi
//
//   -- CC 3.0, 3.5
//      Kepler
//      Product cycle ending. (17 November 2014,  9:24:12 CST)
//      Currently the highest performance for scientific computation.
//
//   -- CC 5.2
//      Maxwell
//      Product cycle beginning. (17 November 2014,  9:24:16 CST)
//      Currently the highest performance for single-precision computation.


 /// Hardware Organization
//
//   Streaming Multiprocessor (SM, SMX)
//
//   Functional Unit
//     A piece of hardware that can perform a particular set of operations.
//     Examples:
//
//        CUDA Core:  Can perform most single-precision non-divide FP.


 /// Thread Organization
//
//
//   Thread
//     Has one PC.
//      Sort of a brain, or maybe just a body that can be in one place
//      at one time.
//
//   Warp
//     A collection of 32 threads.
//     One day the size of warp may change but it's been 32 through CC 5.2.
//     Threads in a warp travel together.
//
//   Block
//     A collection of threads, number determined by user.
//     All threads in a block execute on the same multiprocessor.
//     Threads within a block share shared memory.
//
//   Grid
//     A collection of blocks.

 /// Kernel Launch
//
//   :Def: Kernel Launch
//         The initiation of execution of CUDA code.
//         Done by a CUDA API call.
//         Specify:
//           The name of the CUDA C procedure to start. (E.g., my_kernel();)
//           The grid size. (The number of blocks.)
//           The block size.
//
 ///  Launch Configuration Criteria
//
//    Number of blocks is a multiple of number of SMs.
//    Number of threads per block is a multiple of 32.


 /// 

#endif

#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <ctype.h>
#include <time.h>
#include <new>

#include <cuda_runtime.h>

 /// CUDA API Error-Checking Wrapper
///
#define CE(call)                                                              \
 {                                                                            \
   const cudaError_t rv = call;                                               \
   if ( rv != cudaSuccess )                                                   \
     {                                                                        \
       printf("CUDA error %d, %s\n",rv,cudaGetErrorString(rv));               \
       exit(1);                                                               \
     }                                                                        \
 }

double
time_fp()
{
  struct timespec tp;
  clock_gettime(CLOCK_REALTIME,&tp);
  return ((double)tp.tv_sec)+((double)tp.tv_nsec) * 0.000000001;
}

struct Vertex
{
  float a[4];
};

struct App
{
  int num_threads;
  int array_size;
  Vertex *v_in;
  float *m_out;
  float4 *d_v_in;
  float *d_m_out;
};

// In host address space.
App app;

// In device constant address space.
__constant__ App d_app;


///
/// GPU Code (Kernels)
///


__global__ void
cuda_thread_start_simple()
{
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;

  if ( tid >= d_app.num_threads ) return;

  const int elt_per_thread = d_app.array_size / d_app.num_threads;
  const int start = elt_per_thread * tid;
  const int stop = start + elt_per_thread;

  for ( int h=start; h<stop; h++ )
    {
      float4 p = d_app.d_v_in[h];
      float sos = 0;

      sos = p.x * p.x + p.y * p.y + p.z + p.z + p.w * p.w;

      d_app.d_m_out[h] = sqrtf( sos );
    }
}

__global__ void
cuda_thread_start_efficient()
{
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;

  if ( tid >= d_app.num_threads ) return;

  for ( int h=tid; h<d_app.array_size; h += d_app.num_threads )
    {
      float4 p = d_app.d_v_in[h];

      float sos = p.x * p.x + p.y * p.y + p.z + p.z + p.w * p.w;

      d_app.d_m_out[h] = sqrtf( sos );
    }
}


///
/// Collect Information About GPU and Code
///

// Info about a specific kernel.
//
struct Kernel_Info {
  void (*func_ptr)();           // Pointer to kernel function.
  const char *name;             // ASCII version of kernel name.
  cudaFuncAttributes cfa;       // Kernel attributes reported by CUDA.
};

// Info about GPU and each kernel.
//
struct GPU_Info {
  double bw_Bps;
  static const int num_kernels_max = 4;
  int num_kernels;
  Kernel_Info ki[num_kernels_max];
};

GPU_Info gpu_info;

void
cuda_init()
{
  // Get information about GPU and its ability to run CUDA.
  //
  int device_count;
  cudaGetDeviceCount(&device_count); // Get number of GPUs.
  if ( device_count == 0 )
    {
      fprintf(stderr,"No GPU found, exiting.\n");
      exit(1);
    }

  cudaDeviceProp cuda_prop;  // Properties of cuda device (GPU, cuda version).

  /// Print information about the available GPUs.
  //
  for ( int dev=0; dev<device_count; dev++ )
    {
      CE(cudaGetDeviceProperties(&cuda_prop,dev));
      printf
        ("GPU %d: %s @ %.2f GHz WITH %d MiB GLOBAL MEM\n",
         dev, cuda_prop.name, cuda_prop.clockRate/1e6,
         int(cuda_prop.totalGlobalMem >> 20));

      const int cc_per_mp =
        cuda_prop.major == 1 ? 8 :
        cuda_prop.major == 2 ? ( cuda_prop.minor == 0 ? 32 : 48 ) :
        cuda_prop.major == 3 ? 192 : 0;

      const double chip_bw_Bps = gpu_info.bw_Bps =
        2 * cuda_prop.memoryClockRate * 1000.0
        * ( cuda_prop.memoryBusWidth >> 3 );
      const double chip_sp_flops =
        1000.0 * cc_per_mp * cuda_prop.clockRate
        * cuda_prop.multiProcessorCount;

      printf
        ("GPU %d: CC: %d.%d  MP: %2d  CC/MP: %3d  TH/BL: %4d\n",
         dev, cuda_prop.major, cuda_prop.minor,
         cuda_prop.multiProcessorCount,
         cc_per_mp,
         cuda_prop.maxThreadsPerBlock);

      printf
        ("GPU %d: SHARED: %5d B  CONST: %5d B  # REGS: %5d\n",
         dev,
         int(cuda_prop.sharedMemPerBlock), int(cuda_prop.totalConstMem),
         cuda_prop.regsPerBlock);

      printf
        ("GPU %d: L2: %d kiB   MEM to L2: %.1f GB/s  SP %.1f GFLOPS  "
         "OP/ELT %.2f\n",
         dev,
         cuda_prop.l2CacheSize >> 10,
         chip_bw_Bps * 1e-9,
         chip_sp_flops * 1e-9,
         4 * chip_sp_flops / chip_bw_Bps);

    }

  // Choose GPU 0 because we don't have time to provide a way to let
  // the user choose.
  //
  int dev = 0;
  CE(cudaSetDevice(dev));
  printf("Using GPU %d\n",dev);

  gpu_info.num_kernels = 0;

#define GET_INFO(proc_name) {                                                 \
  const int idx = gpu_info.num_kernels++;                                     \
  if ( idx < gpu_info.num_kernels_max ) {                                     \
    gpu_info.ki[idx].name = #proc_name;                                       \
    gpu_info.ki[idx].func_ptr = (void(*)())proc_name;                         \
    CE(cudaFuncGetAttributes(&gpu_info.ki[idx].cfa,proc_name));               \
  }}

  GET_INFO(cuda_thread_start_simple);
  GET_INFO(cuda_thread_start_efficient);

#undef GET_INFO

  // Print information about time_step routine.
  //
  printf("\nCUDA Routine Resource Usage:\n");

  for ( int i=0; i<gpu_info.num_kernels; i++ )
    {
      printf("For %s:\n", gpu_info.ki[i].name);
      printf("  %6zd shared, %zd const, %zd loc, %d regs; "
             "%d max threads per block.\n",
             gpu_info.ki[i].cfa.sharedSizeBytes,
             gpu_info.ki[i].cfa.constSizeBytes,
             gpu_info.ki[i].cfa.localSizeBytes,
             gpu_info.ki[i].cfa.numRegs,
             gpu_info.ki[i].cfa.maxThreadsPerBlock);
    }

  printf("\n");
}

///
/// Main Routine
///

int
main(int argc, char **argv)
{
  const int nt_raw = argc < 2 ? 1 : atoi(argv[1]);
  app.num_threads = abs(nt_raw);

  app.array_size = argc < 3 ? 1 << 20 : int( atof(argv[2]) * (1<<20) );
  const int array_size_bytes = app.array_size * sizeof(app.v_in[0]);
  const int out_array_size_bytes = app.array_size * sizeof(app.m_out[0]);

  const bool simple = argc < 4 ? 1 : atof(argv[3]);

  if ( argc < 2 ) cuda_init();

  const int threads_per_block = 256;
  const int blocks_per_grid =
    ( app.num_threads + threads_per_block-1 ) / threads_per_block;

  // Allocate storage for CPU copy of data.
  //
  app.v_in = new Vertex[app.array_size];
  app.m_out = new float[app.array_size];

  // Allocate storage for GPU copy of data.
  //
  CE( cudaMalloc( &app.d_v_in,  array_size_bytes     ) );
  CE( cudaMalloc( &app.d_m_out, out_array_size_bytes ) );

  printf("Launching %d blocks * %d threads for %d elts using %s kernel.\n",
         blocks_per_grid, threads_per_block, app.array_size,
         simple ? "simple" : "efficient" );

  // Initialize input array.
  //
  for ( int i=0; i<app.array_size; i++ )
    for ( int j=0; j<4; j++ ) app.v_in[i].a[j] = drand48();

  const double time_start = time_fp();

  // Copy input array from CPU to GPU.
  //
  CE( cudaMemcpy
      ( app.d_v_in, app.v_in, array_size_bytes, cudaMemcpyHostToDevice ) );

  // Copy App structure to GPU.
  //
  CE( cudaMemcpyToSymbol
      ( d_app, &app, sizeof(app), 0, cudaMemcpyHostToDevice ) );



  /// Launch Kernel
  //
  if ( simple )

    cuda_thread_start_simple<<< blocks_per_grid, threads_per_block >>>();

  else

    cuda_thread_start_efficient<<< blocks_per_grid, threads_per_block >>>();

  // Copy output array from GPU to CPU.
  //
  CE( cudaMemcpy
      ( app.m_out, app.d_m_out, out_array_size_bytes, cudaMemcpyDeviceToHost) );

  const double data_size = app.array_size * ( sizeof(Vertex) + sizeof(float) );
  const double fp_op_count = app.array_size * 5;
  const double elapsed_time = time_fp() - time_start;

  printf("Elapsed time for %d threads and %d elements is %.3f µs\n",
         app.num_threads, app.array_size, 1e6 * elapsed_time);
  printf("Rate %.3f GFLOPS,  %.3f GB/s\n",
         1e-9 * fp_op_count / elapsed_time,
         1e-9 * data_size / elapsed_time);
}