```/// LSU EE 7700-2 (Spring 2013), GPU Microarchitecture
//

/// Homework 3 - Solution
//
// Assignment in: http://www.ece.lsu.edu/koppel/gp/2013/hw03.pdf
//
/// Your Name: David M. Koppelman

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <ctype.h>
#include <time.h>
#include <new>
#include <cuda_runtime.h>

#define N 4

/// CUDA API Error-Checking Wrapper
///
#define CE(call)                                                              \
{                                                                            \
const cudaError_t rv = call;                                               \
if ( rv != cudaSuccess )                                                   \
{                                                                        \
printf("CUDA error %d, %s\n",rv,cudaGetErrorString(rv));               \
exit(1);                                                               \
}                                                                        \
}

double
time_fp()
{
struct timespec tp;
clock_gettime(CLOCK_REALTIME,&tp);
return ((double)tp.tv_sec)+((double)tp.tv_nsec) * 0.000000001;
}

// Make it easy to switch between float and double for vertex and matrix
// elements.
//
typedef float Elt_Type;

struct __align__(16) Vertex
{
Elt_Type a[N];
};

struct App
{
Elt_Type matrix[N][N];
int array_size;  // Number of vertices.
bool find_minimum_magnitude; // For problem 2.
Vertex *v_in, *v_out;
Vertex *d_v_in, *d_v_out;

/// SOLUTION
//
//  Declare pointers to magnitudes (squared) and their indices.
//  Each thread will write on element. (If shared memory were
//  used each block would write one element.)
//
Elt_Type *thd_min_mag_sq;
int *thd_min_mag_sq_idx;
Elt_Type *d_thd_min_mag_sq;
int *d_thd_min_mag_sq_idx;

};

App app;

// In device constant address space.
__constant__ App d_app;

// The entry point for the GPU code.
//
__global__ void
{
// Compute an id number that will be in the range from 0 to num_threads-1.
//
const int tid = threadIdx.x + blockIdx.x * blockDim.x;

// Number of elements that each thread should work on.  We are ignoring
// rounding errors.
//

/// WARNING:
//
// The order in which the threads examine elements here is poorly
// chosen and will unnecessarily make inefficient use of the memory
// system.

// Compute element number to start at.
//
const int start = elt_per_thread * tid;

// Compute element number to stop at.
//
const int stop = start + elt_per_thread;

int min_mag_idx = -1;
Elt_Type min_sum_of_sq = 0;

// WARNING: This code accesses elements in an inefficient order.
for ( int h=start; h<stop; h++ )
{
Vertex p = d_app.d_v_in[h];
Vertex q;
for ( int i=0; i<N; i++ )
{
q.a[i] = 0;
for ( int j=0; j<N; j++ ) q.a[i] += d_app.matrix[i][j] * p.a[j];
}
d_app.d_v_out[h] = q;

/// SOLUTION
//
//  Compute the magnitude squared of q and update "min" variables
//  it it's the smallest yet.
//
Elt_Type sum_of_sq = 0;
for ( int i=0; i<N; i++ ) sum_of_sq += q.a[i] * q.a[i];
if ( h == start || sum_of_sq < min_sum_of_sq )
{
min_sum_of_sq = sum_of_sq;
min_mag_idx = h;
}
}

/// SOLUTION
//
//  Write thread-minimum value and it's index to global memory.
//
d_app.d_thd_min_mag_sq[tid] = min_sum_of_sq;
d_app.d_thd_min_mag_sq_idx[tid] = min_mag_idx;
}

void
print_gpu_info()
{
// Get information about GPU and its ability to run CUDA.
//
int device_count;
cudaGetDeviceCount(&device_count); // Get number of GPUs.
if ( device_count == 0 )
{
fprintf(stderr,"No GPU found, exiting.\n");
exit(1);
}

int dev = 0;
CE(cudaGetDevice(&dev));
printf("Using GPU %d\n",dev);

cudaDeviceProp cuda_prop;  // Properties of cuda device (GPU, cuda version).

/// Print information about the available GPUs.
//
{
CE(cudaGetDeviceProperties(&cuda_prop,dev));
printf
("GPU %d: %s @ %.2f GHz WITH %d MiB GLOBAL MEM\n",
dev, cuda_prop.name, cuda_prop.clockRate/1e6,
int(cuda_prop.totalGlobalMem >> 20));

printf
("GPU %d: CC: %d.%d  MP: %2d  TH/WP: %3d  TH/BL: %4d\n",
dev, cuda_prop.major, cuda_prop.minor,
cuda_prop.multiProcessorCount,
cuda_prop.warpSize,
);

printf
("GPU %d: SHARED: %5d  CONST: %5d  # REGS: %5d\n",
dev,
int(cuda_prop.sharedMemPerBlock), int(cuda_prop.totalConstMem),
cuda_prop.regsPerBlock
);
}

cudaFuncAttributes cfa_prob1; // Properties of code to run on device.

// Print information about time_step routine.
//
printf("\nCUDA Routine Resource Usage:\n");
printf(" Our CUDA Thread: %6zd shared, %zd const, %zd loc, %d regs; "
cfa_prob1.sharedSizeBytes,
cfa_prob1.constSizeBytes,
cfa_prob1.localSizeBytes,
cfa_prob1.numRegs,
}

void*
{
const int tid = (ptrdiff_t) arg;
printf("Hello from %d\n",tid);
const int start = elt_per_thread * tid;
const int stop = start + elt_per_thread;

for ( int h=start; h<stop; h++ )
{
Vertex p = app.v_in[h];
Vertex q;
for ( int i=0; i<N; i++ )
{
q.a[i] = 0;
for ( int j=0; j<N; j++ ) q.a[i] += app.matrix[i][j] * p.a[j];
}
app.v_out[h] = q;
}

return NULL;
}

int
main(int argc, char **argv)
{
// Examine argument 1, block size, if negative, find minimum magnitude.
//
const int arg1_int = argc < 2 ? 1 : atoi(argv[1]);
const bool find_mag = arg1_int < 0;
const int num_blocks = abs(arg1_int);

// For Problem 2.
app.find_minimum_magnitude = find_mag;

// Examine argument 2, number of threads per block.
//
const int thd_per_block = argc < 3 ? 1 : atoi(argv[2]);

// Examine argument 3, size of array in MiB. Fractional values okay.
//
app.array_size = argc < 4 ? 1 << 20 : int( atof(argv[3]) * (1<<20) );

if ( app.num_threads <= 0 || app.array_size <= 0 )
{
printf("Usage: %s [ NUM_PTHREADS | - NUM_CUDA_BLOCKS ] [THD_PER_BLOCK] [DATA_SIZE_MiB]\n",
argv[0]);
exit(1);
}

print_gpu_info();

const int array_size_bytes = app.array_size * sizeof(app.v_in[0]);

// Allocate storage for CPU copy of data.
//
app.v_in = new Vertex[app.array_size];
app.v_out = new Vertex[app.array_size];

// Allocate storage for GPU copy of data.
//
CE( cudaMalloc( &app.d_v_in,  app.array_size * sizeof(Vertex) ) );
CE( cudaMalloc( &app.d_v_out, app.array_size * sizeof(Vertex) ) );

/// SOLUTION
//
//  Allocate storage on CPU and GPU for the minimum magnitude (sq) and
//  its index.
//
CE( cudaMalloc( &app.d_thd_min_mag_sq_idx, app.num_threads * sizeof(int) ) );
CE( cudaMalloc( &app.d_thd_min_mag_sq, app.num_threads * sizeof(Elt_Type) ) );

printf
("\nPreparing for %d %s threads operating on %d vectors of %d elements.\n",
app.array_size, N);

// Initialize input array.
//
for ( int i=0; i<app.array_size; i++ )
for ( int j=0; j<N; j++ ) app.v_in[i].a[j] = drand48();

// Initialize transformation matrix.
//
for ( int i=0; i<N; i++ )
for ( int j=0; j<N; j++ )
app.matrix[i][j] = drand48();

double elapsed_time_s;
int minimum_mag_index = 0;     // For Problem 2.
Elt_Type minimum_mag_val = 0;  // For Problem 2.

{
const double time_start = time_fp();

//

// Set up a pthread attribute, used for specifying options.
//

//
for ( int i=0; i<app.num_threads; i++ )

// Wait for each thread to finish.
//
for ( int i=0; i<app.num_threads; i++ )

elapsed_time_s = time_fp() - time_start;
}
else
{
// Prepare events used for timing.
//
cudaEvent_t gpu_start_ce, gpu_stop_ce;
CE(cudaEventCreate(&gpu_start_ce));
CE(cudaEventCreate(&gpu_stop_ce));

// Copy input array from CPU to GPU.
//
CE( cudaMemcpy
( app.d_v_in, app.v_in, array_size_bytes, cudaMemcpyHostToDevice ) );

// Copy App structure to GPU.
//
CE( cudaMemcpyToSymbol
( d_app, &app, sizeof(app), 0, cudaMemcpyHostToDevice ) );

// Measure execution time starting "now", which is after data
// set to GPU.
//
CE(cudaEventRecord(gpu_start_ce,0));

printf("Launching with %d blocks of %d threads.\n",
num_blocks, thd_per_block);

// Tell CUDA to start our threads on the GPU.
//

// Stop measuring execution time now, which is before is data
// returned from GPU.
//
CE(cudaEventRecord(gpu_stop_ce,0));
CE(cudaEventSynchronize(gpu_stop_ce));
float cuda_time_ms = -1.1;
CE(cudaEventElapsedTime(&cuda_time_ms,gpu_start_ce,gpu_stop_ce));
elapsed_time_s = cuda_time_ms * 0.001;

// Copy output array from GPU to CPU.
//
CE( cudaMemcpy
( app.v_out, app.d_v_out, array_size_bytes, cudaMemcpyDeviceToHost) );

// PROBLEM 2
//
// Insert code for reading magnitude information and having
// CPU finish up finding the minimum.

/// SOLUTION
//
//  Copy back per-thread minimum magnitudes (squared) and their indices.
//
CE( cudaMemcpy
( app.thd_min_mag_sq, app.d_thd_min_mag_sq,
CE( cudaMemcpy
( app.thd_min_mag_sq_idx, app.d_thd_min_mag_sq_idx,

// Find the minimum magnitude squared and its index.
//
Elt_Type min_sos = app.thd_min_mag_sq[0];
minimum_mag_index = app.thd_min_mag_sq_idx[0];
for ( int i=1; i<app.num_threads; i++ )
if ( app.thd_min_mag_sq[i] < min_sos )
{
min_sos = app.thd_min_mag_sq[i];
minimum_mag_index = app.thd_min_mag_sq_idx[i];
}

// Take the square root to get the magnitude.
//
minimum_mag_val = sqrt(min_sos);
}

const double data_size = app.array_size * sizeof(Vertex) * 2;
const double fp_op_count = app.array_size * ( 2 * N * N - N  );

printf("Elapsed time for %d threads and %d elements is %.3f µs\n",
printf("Rate %.3f GFLOPS,  %.3f GB/s\n",
1e-9 * fp_op_count / elapsed_time_s,
1e-9 * data_size / elapsed_time_s);

if ( app.find_minimum_magnitude )
{
Elt_Type min_val = 0;
int min_idx = -1;

for ( int h=0; h<app.array_size; h++ )
{
Vertex p = app.v_in[h];
Vertex q;
for ( int i=0; i<N; i++ )
{
q.a[i] = 0;
for ( int j=0; j<N; j++ ) q.a[i] += app.matrix[i][j] * p.a[j];
}
Elt_Type sos = 0; for(int i=0; i<N; i++ ) sos+= q.a[i]*q.a[i];
Elt_Type mag = sqrt(sos);
if ( min_idx < 0 || mag < min_val ) { min_val = mag; min_idx = h; }
}
Elt_Type diff = fabs(min_val-minimum_mag_val);
printf
("\nMinimum mag is %s,  %d %s %d (correct)  %.4f %s %.4f (correct)\n",
diff < 1e-5 ? "correct" : "**wrong**",
minimum_mag_index,
min_idx == minimum_mag_index ? "==" : "!=",
min_idx,
minimum_mag_val,
min_val == minimum_mag_val ? "==" : diff < 1e-5 ? "~" : "!=",
min_val
);

}

}
```