/// LSU EE 7722 (Spring 2025), GPU Microarchitecture
//
/// Shared memory CUDA Example, without LSU ECE helper classes.

/// References
//
//  :ccpg12: CUDA C Programming Guide Version 12.0
//           https://docs.nvidia.com/cuda/cuda-c-programming-guide
//

#if 0
/// Background

/// Shared Address Space and Shared Memory
//
//   References
//     General description: CUDA C Programming Guide Section 3.2.3 (v10)
//     Amount of shared memory: CUDA C Programming Guide Table 13 (Appendix H)
//
//
// :Def: Shared Address Space
//   An address space provided by CUDA (through CC 9.x) in which:
//
//   - Each block has its own shared address space.
//
//   - The address space is shared by all threads in a block.
//
//   - Locations can be read and written.
//
//   - Size of space is 48 kiB in CC 2.x to CC 6.x.
//     Size of space is 96 kiB on CC 7.0 and 64 kiB on CC 7.5.
//     Size of space is 163 kiB on CC 8.0 and 99 kiB on CC 8.6 and 8.9.
//     Size of space is 227 kiB on CC 9.0.
//
//   - The shared address space uses shared memory.
//
//
// :Def: Shared Memory
//   Hardware used to implement the shared address space.
//
//   - Shared memory is part of the SM, so there are no communication limits.
//
//   - Amount of shared memory per SM (NOT per block) varies:
//      48 kiB   CC 2.0 - CC 3.5
//     112 kiB   CC 3.7
//      64 kiB   CC 5.0, 5.3, 6.0, 6.2, 7.5
//      96 kiB   CC 5.2, 6.1, 7.0
//     164 kiB   CC 8.0
//     100 kiB   CC 8.6, 8.9
//     228 kiB   CC 9.0
//
//   - Low latency (fast). As low as 12 cycles.
//
//   - High throughput.
//
//   - Banked organization. Throughput depends on access patterns.
//
//

/// Declaration and Use of Shared Memory
//
//  - Declare variables using the __shared__ qualifier.
//
//  - The declaration can be at procedure or global scope.
//
//  - Any type can be shared, including arrays.
//
//  - Pointers to shared variables can be taken.
//
// :Example: Declaration examples.

__shared__ int amount;
__shared__ float4 forces[12];


/// Shared Memory Uses
//
//   Communication between threads.
//     For example, to compute a block-wide sum.
//
//   Caching of global memory.
//     (Copying to a place where it can be accessed quickly and reliably.)


/// Barrier Synchronization  (__syncthreads, __syncwarp)
//
//   When a group is working together on a multi-step project ..
//   .. sometimes everyone must finish step x ..
//   .. before anyone can start on step x + 1.
//
//   The same thing holds for threads.
//
// :Def: Barrier
//   A place in the execution of code by a group of threads ..
//   .. which all must enter before any can exit.
//
//   A barrier is like a room with two doors, an entrance and an exit.
//     Initially the entrance is open and the exit is closed.
//     Threads enter the room but can't leave. (It's comfortable.)
//     When the last thread enters ..
//     .. the entrance closes ..
//     .. and the exit opens.
//
//   A barrier is an easy-to-use form of synchronization, but it is an
//   inefficient choice when more threads are forced to wait than are
//   necessary. That said, in most cases in this class a barrier will
//   be used.
//
//   CUDA provides several functions for placing barriers in your code.
//

/// __syncthreads()
//   Implements a block-wide barrier.

     x = a + b;          // Stuff before.
     __syncthreads();
     y = c + d;          // Stuff after.
//
//   - Must be called by all (active) threads in a block, or by none at all.
//   - No thread executes "Stuff after" until all threads have called
//     __syncthreads.
//

/// __syncwarp()
//   Implements a warp-wide barrier.
//
//   Uses of __syncwarp can be omitted in places where the compiler will
//   converge the threads in a warp -- IF YOU KNOW WHAT YOU'RE DOING,
//   OTHERWISE USE __syncwarp().
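
// :Example: A sketch (not part of this example's code) showing both uses
//   together: shared memory as a block-wide scratchpad, and __syncthreads
//   as the barrier separating the "everyone writes" step from the
//   "someone reads" step. The kernel name is hypothetical; it assumes
//   blockDim.x <= 1024 and a grid that exactly covers in[].

__global__ void
block_sum_sketch(float *out, const float *in)
{
  __shared__ float vals[1024];            // One element per thread in block.
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;

  vals[threadIdx.x] = in[tid];            // Step 1: every thread writes ..
  __syncthreads();                        // .. barrier: all writes finish ..

  if ( threadIdx.x == 0 )                 // .. before thread 0 reads them.
    {
      float sum = 0;
      for ( int i=0; i<blockDim.x; i++ ) sum += vals[i];
      out[blockIdx.x] = sum;              // Step 2: one block-wide sum.
    }
}
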
/// Atomic Operations
//
// :Def: Atomic Operation
//   An operation that appears to be either ..
//   .. completely finished or ..
//   .. not yet started.
//   An atomic operation NEVER appears to be partially done.

// :Example:
//
//   The "+=" operations below are NOT atomic.
//
   __shared__ int sum;
   if ( threadIdx.x == 0 ) sum = 0;
   if ( threadIdx.x == 40 ) sum += 40;
   if ( threadIdx.x == 70 ) sum += 70;
//
//   We need an atomic operation to perform the additions above.


/// CUDA C Atomic Operations
//
//   Reference: CUDA C Programming Guide B.12
//

/// oldval = atomicAdd( valAddress, amount )
//   Atomically add amount to *valAddress, return the old value.


/// Other Stuff
//
//   Need to coordinate readers and writers.

  __syncthreads();
  atomicAdd(POINTER, AMT);

#endif

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <ctype.h>
#include <time.h>
#include <math.h>
#include <new>

#include <cuda_runtime.h>
#include <gp/cuda-gpuinfo.h>

inline double
time_fp()
{
  struct timespec tp;
  clock_gettime(CLOCK_REALTIME,&tp);
  return ((double)tp.tv_sec)+((double)tp.tv_nsec) * 0.000000001;
}

#if 0
__global__ void
cuda_thread_super_simple(int *output_data, int *input_data)
{
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;

  int my_element = input_data[tid];

  /// Reasonable use of shared memory.
  //
  //  Make thread 12's element available to all threads in the block.

  __shared__ int thd_12_element;  // <----- Declaration of a shared variable.

  if ( threadIdx.x == 12 ) thd_12_element = my_element;

  __syncthreads();

  output_data[tid] = my_element + thd_12_element;


  /// Bad use of shared memory.
  //
  //  Everyone writes trouble.

  __shared__ int trouble;

  trouble = my_element;

  __syncthreads();

  // All threads read the same value, whichever write got there last.
  //
  output_data[tid] = my_element + trouble;


  /// Reasonable use of shared memory.
  //
  //  Share your array element with your neighbor.

  __shared__ int our_data[1024];

  our_data[threadIdx.x] = my_element;

  __syncthreads();

  output_data[tid] = my_element + our_data[ threadIdx.x ^ 1 ];


  /// Bad use of shared memory.
  //
  //  Bank conflicts.

  __shared__ int another_array[512 * 32];

  // All threads in a block will write to the same bank.
  // This will slow down execution by a factor of 32.
  //
  another_array[ threadIdx.x * 32 ] = my_element;
}
#endif


struct Vector { float a[4]; };

struct App
{
  int num_threads;
  int array_size;
  Vector *v_in;
  float *m_out;
  Vector *d_v_in;
  float *d_m_out;
  float *block_mag_sum;
  float *d_block_mag_sum;
};

// In host address space.
App app;

// In device constant address space.
__constant__ App d_app;


///
/// GPU Code (Kernel)
///

__global__ void
cuda_thread_start()
{
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;

  // Shared array, one element for each member of the block
  // (up to the maximum block size).
  //
  __shared__ float our_mag_sums[1024];

  float my_mag_sum = 0;

  for ( int h=tid; h<d_app.array_size; h += d_app.num_threads )
    {
      Vector p = d_app.d_v_in[h];

      float sos = 0;
      for ( int i=0; i<4; i++ ) sos += p.a[i] * p.a[i];

      const float mag = sqrtf( sos );

      // Write magnitude to global memory.
      //
      d_app.d_m_out[h] = mag;

      // Compute this thread's magnitude sum.
      //
      my_mag_sum += mag;
    }

  // Save this thread's magnitude sum in shared memory.
  //
  our_mag_sums[threadIdx.x] = my_mag_sum;

  // Wait for all threads to do this.
  //
  __syncthreads();

  // All but the first warp are finished.
  //
  if ( threadIdx.x >= 32 ) return;

  // Threads in the first warp (first 32) each compute a sum for their lane.
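  // Lane i sums the strided elements i, i+32, i+64, ... of our_mag_sums,
  // reducing blockDim.x partial sums down to at most 32 values.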
  //
  float lane_mag_sum = 0;
  for ( int i=threadIdx.x; i<blockDim.x; i+=32 )
    lane_mag_sum += our_mag_sums[i];

  // Save the sum for this lane in shared memory.
  //
  our_mag_sums[threadIdx.x] = lane_mag_sum;

  // Wait for all threads in the warp to finish.
  //
  __syncwarp();
  //
  // Because of the return above, only the first warp of each block is
  // still executing.
  //
  // The __syncwarp is not needed on devices before CC 7.0.

  // Have just thread 0 finish up.
  //
  if ( threadIdx.x != 0 ) return;

  // Compute the sum of the 32 lane sums.
  //
  float block_mag_sum = 0;
  for ( int i=0; i<32; i++ ) block_mag_sum += our_mag_sums[i];

  // Save this sum to global memory. The CPU will sum the per-block sums.
  //
  d_app.d_block_mag_sum[blockIdx.x] = block_mag_sum;
}


///
/// Collect Information About GPU and Code
///

void
cuda_init()
{
  GPU_Info gpu_info;

  gpu_info_print();

  // Choose GPU 0 because we don't have time to provide a way to let
  // the user choose.
  //
  int dev = gpu_choose_index();
  CE(cudaSetDevice(dev));
  printf("Using GPU %d\n",dev);
  gpu_info.get_gpu_info(dev);

  gpu_info.GET_INFO(cuda_thread_start);

  // Print information about the resource usage of each kernel.
  //
  printf("\nCUDA Routine Resource Usage:\n");

  for ( int i=0; i<gpu_info.num_kernels; i++ )
    {
      printf("For %s:\n", gpu_info.ki[i].name);
      printf("  %6zd shared, %zd const, %zd loc, %d regs; "
             "%d max threads per block.\n",
             gpu_info.ki[i].cfa.sharedSizeBytes,
             gpu_info.ki[i].cfa.constSizeBytes,
             gpu_info.ki[i].cfa.localSizeBytes,
             gpu_info.ki[i].cfa.numRegs,
             gpu_info.ki[i].cfa.maxThreadsPerBlock);
    }

  printf("\n");
}


///
/// Main Routine
///

int
main(int argc, char **argv)
{
  const int threads_per_block = argc < 2 ? 1 : atoi(argv[1]);
  const int blocks_per_grid = argc < 3 ? 1 : atoi(argv[2]);
  app.num_threads = threads_per_block * blocks_per_grid;

  app.array_size = argc < 4 ? 1 << 20 : int( atof(argv[3]) * (1<<20) );

  const int array_size_bytes = app.array_size * sizeof(app.v_in[0]);
  const int out_array_size_bytes = app.array_size * sizeof(app.m_out[0]);
  const int block_mag_sum_bytes =
    blocks_per_grid * sizeof(app.block_mag_sum[0]);

  if ( argc < 2 ) cuda_init();

  // Allocate storage for CPU copy of data.
  //
  app.v_in = new Vector[app.array_size];
  app.m_out = new float[app.array_size];
  app.block_mag_sum = new float[blocks_per_grid];

  // Allocate storage for GPU copy of data.
  //
  CE( cudaMalloc( &app.d_v_in, array_size_bytes ) );
  CE( cudaMalloc( &app.d_m_out, out_array_size_bytes ) );
  CE( cudaMalloc( &app.d_block_mag_sum, block_mag_sum_bytes ) );

  printf("Preparing for %d threads and %d elements using %d blocks of size %d.\n",
         app.num_threads, app.array_size, blocks_per_grid, threads_per_block);

  // Initialize input array.
  //
  for ( int i=0; i<app.array_size; i++ )
    for ( int j=0; j<4; j++ )
      app.v_in[i].a[j] = drand48();

  const double time_start = time_fp();

  // Copy input array from CPU to GPU.
  //
  CE( cudaMemcpy
      ( app.d_v_in, app.v_in, array_size_bytes, cudaMemcpyHostToDevice ) );

  // Copy App structure to GPU.
  //
  CE( cudaMemcpyToSymbol
      ( d_app, &app, sizeof(app), 0, cudaMemcpyHostToDevice ) );

  /// Launch Kernel
  //
  cuda_thread_start<<<blocks_per_grid,threads_per_block>>>();

  // Copy output arrays from GPU to CPU.
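  // Note: the kernel launch above is asynchronous, but the cudaMemcpy calls
  // below run in the same (default) stream and therefore wait for the kernel
  // to finish. The elapsed time measured after them thus includes kernel
  // execution, not just the copies.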
  //
  CE( cudaMemcpy
      ( app.m_out, app.d_m_out, out_array_size_bytes,
        cudaMemcpyDeviceToHost) );

  CE( cudaMemcpy
      ( app.block_mag_sum, app.d_block_mag_sum, block_mag_sum_bytes,
        cudaMemcpyDeviceToHost) );

  float mag_sum = 0;
  for ( int i=0; i<blocks_per_grid; i++ ) mag_sum += app.block_mag_sum[i];

  const double data_size =
    app.array_size * ( sizeof(Vector) + sizeof(float) );
  const double fp_op_count = app.array_size * 5;
  const double elapsed_time = time_fp() - time_start;

  float mag_sum_check = 0;
  for ( int i=0; i<app.array_size; i++ ) mag_sum_check += app.m_out[i];

  const float mag_avg_check = mag_sum_check / app.array_size;
  const float mag_avg = mag_sum / app.array_size;

  if ( fabs(mag_avg_check-mag_avg) > 0.00001 )
    printf("** Averages don't check %.7f != %.7f (cpu)\n",
           mag_avg, mag_avg_check);

  printf("Elapsed time for %d threads and %d elements is %.3f µs\n",
         app.num_threads, app.array_size, 1e6 * elapsed_time);

  printf("Rate %.3f GFLOPS, %.3f GB/s\n",
         1e-9 * fp_op_count / elapsed_time,
         1e-9 * data_size / elapsed_time);
}
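
#if 0
/// Alternative Ending for cuda_thread_start (a sketch, not part of the build)
//
//   Instead of writing one partial sum per block to d_block_mag_sum and
//   letting the CPU add them, thread 0 of each block could atomically add
//   its block's sum into a single global float. The d_mag_sum member used
//   below is hypothetical: it would have to be added to App, allocated with
//   cudaMalloc, and zeroed before the kernel launch.

  if ( threadIdx.x == 0 )
    atomicAdd( &d_app.d_mag_sum, block_mag_sum ); // float atomicAdd: CC >= 2.0.
#endif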