/// LSU EE 4702-1 (Fall 2018), GPU Programming // /// Simple CUDA Example, without LSU ECE helper classes. /// References // // :ccpg8: CUDA C Programming Guide Version 8 // https://docs.nvidia.com/cuda/cuda-c-programming-guide #if 0 /// Background /// CUDA // // NVIDIA's system for programming NVIDIA GPUs. // // Intended for non-graphical computation, widely used for // scientific computation. // /// CUDA Components. // // - CUDA C // Language used for writing code that runs on the GPU. // // - CUDA Runtime API // Library used for managing the execution of code on the GPU. // // - CUDA Compiler Toolchain // The "compiler" nvcc, and related tools. // // - CUDA Compatible GPU // Probably just NVIDIA GPUs. /// CUDA C // // Language used for writing code that runs on the GPU. // // A file can contain both CUDA C and C for the host compiler ... // ... that is the case for this file. // // In this file CUDA C is in routine: cuda_thread_start() // // Syntactically similar to C++. // // Major Differences // // Executes as a hierarchy of threads. // // Specialized address spaces. /// CUDA C Runtime API // // Library calls used on CPU side to manage execution on GPU. // // Activities Performed with API // // o Send data from CPU to GPU. // o Start execution of GPU code. // o Send data from GPU to CPU. /// CUDA Address Spaces // /// Global Address Space // // Works like "regular" memory on CPU, but it's usually separated. // Uses the same hardware as OpenGL buffer objects. /// Constant Address Space // // Limited amount of storage, read-only on GPU. // Probably uses the same hardware as OpenGL uniforms. /// The Whole Process // // - Install CUDA toolkit and NVIDIA drivers. // - Write hello.cu. // - Compile with nvcc // - Run. /// Typical Program Structure // // - Prepare input data on CPU. // - Send data from CPU to GPU. // - Launch kernel. // - Send data from GPU to CPU. /// Multithreaded Execution // // :Def: Thread // A path of execution through a program. // Most beginner programs have one thread. // // :Def: Multithreaded Execution // Execution of a program using more than one thread. // On CPUs, program starts with one thread (in main), additional // threads are requested by programmer. #endif #include <string.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <errno.h> #include <ctype.h> #include <time.h> #include <new> #include <cuda_runtime.h> /// CUDA API Error-Checking Wrapper /// #define CE(call) \ { \ const cudaError_t rv = call; \ if ( rv != cudaSuccess ) \ { \ printf("CUDA error %d, %s\n",rv,cudaGetErrorString(rv)); \ exit(1); \ } \ } double time_fp() { struct timespec tp; clock_gettime(CLOCK_REALTIME,&tp); return ((double)tp.tv_sec)+((double)tp.tv_nsec) * 0.000000001; } struct App { int num_threads; int array_size; float *v_in; float *m_out; float *d_v_in; float *d_m_out; }; // In host address space. App app; // In device constant address space. __constant__ App d_app; __global__ void cuda_thread_start() { // Compute a unique ID for this thread. const int tid = threadIdx.x + blockIdx.x * blockDim.x; if ( tid >= d_app.num_threads ) return; // Warning: The order in which d_v_in is accessed is inefficient. // See demo-cuda-02-basics for a better ordering. // const int elt_per_thread = d_app.array_size / d_app.num_threads; const int start = elt_per_thread * tid; const int stop = start + elt_per_thread; for ( int h=start; h<stop; h++ ) d_app.d_m_out[h] = d_app.d_v_in[h] + 1; } int main(int argc, char **argv) { const int nt_raw = argc < 2 ? 1 : atoi(argv[1]); app.num_threads = abs(nt_raw); app.array_size = argc < 3 ? 1 << 20 : int( atof(argv[2]) * (1<<20) ); const int array_size_bytes = app.array_size * sizeof(app.v_in[0]); const int out_array_size_bytes = app.array_size * sizeof(app.m_out[0]); // Allocate storage for CPU copy of data. // app.v_in = new float[app.array_size]; app.m_out = new float[app.array_size]; // Allocate storage for GPU copy of data. // CE( cudaMalloc( &app.d_v_in, array_size_bytes ) ); CE( cudaMalloc( &app.d_m_out, out_array_size_bytes ) ); printf("Preparing for %d threads %d elements.\n", app.num_threads, app.array_size); // Initialize input array. // for ( int i=0; i<app.array_size; i++ ) app.v_in[i] = drand48(); const double time_start = time_fp(); // Copy input array from CPU to GPU. // CE( cudaMemcpy ( app.d_v_in, app.v_in, array_size_bytes, cudaMemcpyHostToDevice ) ); // Copy App structure to GPU. // CE( cudaMemcpyToSymbol ( d_app, &app, sizeof(app), 0, cudaMemcpyHostToDevice ) ); const int threads_per_block = 256; const int blocks_per_grid = ( app.num_threads + threads_per_block-1 ) / threads_per_block; /// KERNEL LAUNCH cuda_thread_start<<< blocks_per_grid, threads_per_block >>>(); // --- <-- CUDA C kernel launch syntax -> --- // Copy output array from GPU to CPU. // CE( cudaMemcpy ( app.m_out, app.d_m_out, out_array_size_bytes, cudaMemcpyDeviceToHost) ); const double data_size = app.array_size * ( sizeof(app.v_in[0]) + sizeof(app.m_out[0]) ); const double fp_op_count = app.array_size * 5; const double elapsed_time = time_fp() - time_start; printf("Elapsed time for %d threads and %d elements is %.3f µs\n", app.num_threads, app.array_size, 1e6 * elapsed_time); printf("Rate %.3f GFLOPS, %.3f GB/s\n", 1e-9 * fp_op_count / elapsed_time, 1e-9 * data_size / elapsed_time); }