#include <cuda_runtime.h>
#include <gp/cuda-gpuinfo.h>
#include <ptable.h>
#include <nperf.h>
#include <misc.h>
#include <stdio.h>
#include <random>
#include <vector>
#include <algorithm>
#include <ranges>
// Additional standard headers for identifiers used below (some may already be
// provided indirectly by misc.h): bit_floor, map, string, assert, atoi/atof,
// drand48, fabs.
#include <bit>
#include <map>
#include <string>
#include <cassert>
#include <cstdlib>
#include <cmath>
typedef float elt_t;
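// Problem description shared by host and device: the number and length of the
// vectors and the device input and output arrays.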
struct App
{
int n_l;       // Number of vectors.
int nlimit_l;  // Vectors that fit in the allocation, including overrun space.
int d_l;       // Elements per vector.
elt_t *l_in_d, *l_out_d;  // Device input and output arrays.
};
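// Kernel-visible copy of App, written from the host with cudaMemcpyToSymbol.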
__constant__ App c_app;
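// Return the sum of thd_val over a group of group_size consecutive lanes using
// a butterfly reduction on __shfl_xor_sync; every participating lane gets the
// sum. Assumes group_size is a power of two no larger than the warp size. When
// all is false and the group does not span the whole warp, the shuffle mask is
// __activemask(); otherwise the full-warp mask is used.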
__device__ elt_t
group_sum(elt_t thd_val, uint group_size, bool all = false)
{
assert( group_size <= 32 );
const uint32_t mask =
!all && group_size > 1 && group_size < 32 ? __activemask() : ~0;
elt_t sum = thd_val;
for ( int dist = 1; dist < group_size; dist <<= 1 )
sum += __shfl_xor_sync(mask, sum, dist );
return sum;
}
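// Mean-center each vector: subtract its average from every element. A group of
// grp_size consecutive threads cooperates on one vector of d_l elements (d_l is
// assumed to be a multiple of grp_size), and the vectors are distributed over
// all groups in the grid in grid-stride fashion. A nonzero D_L makes the vector
// length a compile-time constant; D_L == 0 reads it from c_app at run time.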
template<int D_L = 0, int grp_size = 1>
__global__ void
norm_group(elt_t* __restrict__ l_out, const elt_t* __restrict__ l_in)
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int n_threads = blockDim.x * gridDim.x;
const int d_l = D_L ?: c_app.d_l;
const int n_l = c_app.n_l;
const int sub_lane = threadIdx.x % grp_size;
const int h_start = tid / grp_size;
for ( int h = h_start; h < n_l; h += n_threads / grp_size )
{
const size_t idx_vec_start = h * d_l;
const size_t idx_vec_thd_start = idx_vec_start + sub_lane;
elt_t thd_sum = 0;
for ( int i = 0; i < d_l; i += grp_size )
thd_sum += l_in[ idx_vec_thd_start + i ];
const elt_t sum = group_sum(thd_sum,grp_size);
const elt_t avg = sum / d_l;
for ( int i = 0; i < d_l; i += grp_size )
l_out[ idx_vec_thd_start + i ] = l_in[ idx_vec_thd_start + i ] - avg;
}
}
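// Unrolled variant of norm_group: each warp handles unroll_degree batches of
// wp_sz/grp_size vectors per iteration, accumulating every batch's partial sum
// before doing the group_sum reductions so that independent loads and shuffles
// can overlap. The last iteration can index a bounded number of vectors beyond
// n_l; main over-allocates the device arrays (overrun_size_bytes) to allow this.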
template<int D_L = 0, int grp_size = 1, int unroll_degree = 1>
__global__ void
norm_group_u(elt_t* __restrict__ l_out, const elt_t* __restrict__ l_in)
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int n_threads = blockDim.x * gridDim.x;
const int d_l = D_L ?: c_app.d_l;
const int n_l = c_app.n_l;
const int sub_lane = threadIdx.x % grp_size;
constexpr int wp_lg = 5;
constexpr int wp_sz = 1 << wp_lg;
const int lane = threadIdx.x % wp_sz;
const int wp_id = tid >> wp_lg;
constexpr int vec_p_wp = wp_sz / grp_size;
const int vec_p_wp_iter = vec_p_wp * unroll_degree;
const int h_wp_start = wp_id * vec_p_wp_iter;
const int inc = unroll_degree * n_threads / grp_size;
for ( int h_wp = h_wp_start; h_wp < n_l; h_wp += inc )
{
const int hi = h_wp + lane / grp_size;
elt_t thd_sum[unroll_degree]{};
for ( int j = 0; j < unroll_degree; j++ )
{
const int h = hi + j * vec_p_wp;
const size_t idx_vec_start = h * d_l;
const size_t idx_vec_thd_start = idx_vec_start + sub_lane;
for ( int i = 0; i < d_l; i += grp_size )
thd_sum[j] += l_in[ idx_vec_thd_start + i ];
}
elt_t sum[unroll_degree]{};
for ( int j = 0; j < unroll_degree; j++ )
sum[j] = group_sum(thd_sum[j],grp_size,true);
for ( int j = 0; j < unroll_degree; j++ )
{
const int h = hi + j * vec_p_wp;
const size_t idx_vec_start = h * d_l;
const size_t idx_vec_thd_start = idx_vec_start + sub_lane;
const elt_t avg = sum[j] / d_l;
for ( int i = 0; i < d_l; i += grp_size )
l_out[ idx_vec_thd_start+i ] = l_in[ idx_vec_thd_start + i ] - avg;
}
}
}
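// Fraction of the vectors the grid processes that are actually needed. The
// grid covers vec_per_iter vectors per pass over the data, so the final pass
// is partly wasted when n_l is not a multiple of vec_per_iter (1.0 = no waste).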
double
utilization_compute
(GPU_Info& info, App& app, int n_blocks, int thd_p_block, int grp_size_raw,
int unroll_degree_raw )
{
const int grp_size = grp_size_raw ?: 1;
const int unroll_degree = unroll_degree_raw ?: 1;
const int n_threads = n_blocks * thd_p_block;
const int64_t vec_per_iter = n_threads * unroll_degree / grp_size;
const int64_t n_iter_needed = ( app.n_l + vec_per_iter - 1 ) / vec_per_iter;
const int64_t n_vec_computable = n_iter_needed * vec_per_iter;
const double utilization = double(app.n_l) / n_vec_computable;
return utilization;
}
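// Print the available GPUs, select one, and return its properties.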
GPU_Info
print_gpu_and_kernel_info()
{
GPU_Info info;
gpu_info_print();
int dev = gpu_choose_index();
CE(cudaSetDevice(dev));
printf("Using GPU %d\n",dev);
info.get_gpu_info(dev);
return info;
}
int
main(int argc, char **argv)
{
App app;
NPerf_init();
GPU_Info info = print_gpu_and_kernel_info();
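// Command-line arguments:
//   argv[1]: number of blocks; omitted or 0 -> one block per MP,
//            negative -> that many blocks per MP.
//   argv[2]: threads per block; omitted or 0 -> sweep block sizes (up to 1024).
//   argv[3]: data size; negative -> that fraction of the L2 cache,
//            positive -> size in MiB. Default is a quarter of the L2.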
const uint num_mp = info.cuda_prop.multiProcessorCount;
const int arg1_int = argc < 2 ? num_mp : atoi(argv[1]);
const uint num_blocks =
arg1_int == 0 ? num_mp : arg1_int < 0 ? -arg1_int * num_mp : arg1_int;
const bool opt_p = true;
const int thd_per_block_arg = argc < 3 ? 0 : atoi(argv[2]);
const int thd_per_block_goal =
thd_per_block_arg == 0 ? 1024 : thd_per_block_arg;
const int num_threads = num_blocks * thd_per_block_goal;
const bool vary_warps = thd_per_block_arg == 0;
const int l2_size_bytes = info.cuda_prop.l2CacheSize;
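// Estimate of L2 bandwidth: 32 bytes per cycle for each of bit_floor(num_mp-1)
// units at the core clock rate.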
const float l2_bandwidth_Bps = 32 * bit_floor(num_mp-1) * info.clock_freq_hz;
const float default_n_l2_units = 0.25;
const float arg3_val = argc < 4 ? -default_n_l2_units : atof(argv[3]);
const int n_bytes_raw =
arg3_val < 0 ? -arg3_val * l2_size_bytes : arg3_val * ( 1 << 20 );
const int n_elts_max = n_bytes_raw / sizeof(elt_t);
if ( num_threads <= 0 || n_elts_max <= 0 )
{
printf("Usage: %s [ NUM_CUDA_BLOCKS ] [THD_PER_BLOCK|p] "
"[-DATA_SIZE_L2_UNITS|DATA_SIZE_MiB]\n",
argv[0]);
exit(1);
}
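// Performance counters to collect for each timed kernel launch.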
NPerf_metric_collect("sm__inst_executed.sum");
NPerf_metric_collect("gld_efficiency");
if ( opt_p )
{
NPerf_metric_collect
("sm__instruction_throughput.avg.pct_of_peak_sustained_elapsed");
NPerf_metric_collect("l1tex__m_xbar2l1tex_read_bytes.sum");
NPerf_metric_collect("l1tex__m_l1tex2xbar_write_bytes.sum");
NPerf_metric_collect("dram__bytes_read.sum");
NPerf_metric_collect("dram__bytes_write.sum");
NPerf_metric_collect("l1tex__t_requests.sum");
NPerf_metric_collect("l1tex__data_bank_conflicts_pipe_lsu.sum");
}
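// Size the device arrays with extra space (overrun_size_bytes) because
// norm_group_u can index vectors past n_l-1 on its final iteration.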
const size_t max_size_elts = n_elts_max;
const size_t max_size_bytes = max_size_elts * sizeof( app.l_in_d[0] );
const int wp_sz = 32;
const size_t unroll_max = 4;
const size_t vec_per_wp_max = wp_sz;
const size_t overrun_size_vecs = unroll_max * vec_per_wp_max;
const size_t vec_size_max = 1024;
const size_t overrun_size_bytes =
overrun_size_vecs * vec_size_max * sizeof( app.l_in_d[0] );
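// Describes one compiled kernel instantiation: its function pointer and the
// template arguments (compile-time d_l, group size, unroll degree) used to
// build it.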
struct App_Kernel_Info {
App_Kernel_Info
(int idx, Kernel_Info& k,const char *name_base, const char *name,
int grp_sz, int d_l, int unroll_degree = 0):
idx(idx),k_ptr(k.func_ptr), name(name), name_base(name_base),
grp_size(grp_sz), d_l(d_l), unroll_degree(unroll_degree){}
int idx;
GPU_Info_Func k_ptr;
const char *name;
const char *name_base;
int grp_size, d_l, unroll_degree;
};
vector<App_Kernel_Info> kernels;
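// The macros below instantiate the kernel templates and register each
// instantiation (base kernel, compile-time d_l, group size, unroll degree)
// in kernels.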
#define TDH_GRP_KERNEL3(k,kb,grp_sz,d_l) \
{ const int idx = kernels.size(); \
kernels.emplace_back(idx,info.GET_INFO(k),kb,#k,grp_sz,d_l); }
#define TDH_GRP_KERNEL2(k,grp_sz,d_l) \
TDH_GRP_KERNEL3((k<d_l,grp_sz>),#k,grp_sz,d_l);
#define TDH_GRP_KERNEL1(k,grp_sz) \
TDH_GRP_KERNEL2(k,grp_sz,0); TDH_GRP_KERNEL2(k,grp_sz,4); \
TDH_GRP_KERNEL2(k,grp_sz,8); TDH_GRP_KERNEL2(k,grp_sz,32);
#define TDH_GRP_KERNEL(k) \
TDH_GRP_KERNEL1(k,1); TDH_GRP_KERNEL1(k,2); TDH_GRP_KERNEL1(k,4); \
TDH_GRP_KERNEL1(k,8); TDH_GRP_KERNEL1(k,16); TDH_GRP_KERNEL1(k,32);
#define TDH_GRP_DEG_KERNEL4(k,kb,grp_sz,d_l,udeg) \
{ const int idx = kernels.size(); \
kernels.emplace_back(idx,info.GET_INFO(k),kb,#k,grp_sz,d_l,udeg); }
#define TDH_GRP_DEG_KERNEL3b(k,grp_sz,d_l,udeg) \
TDH_GRP_DEG_KERNEL4((k<d_l,grp_sz,udeg>),#k,grp_sz,d_l,udeg);
#define TDH_GRP_DEG_KERNEL3(k,grp_sz,d_l,udeg) \
TDH_GRP_DEG_KERNEL3b(norm_group_u,grp_sz,d_l,udeg);
#define TDH_GRP_DEG_KERNEL2b(k,grp_sz,d_l) \
TDH_GRP_DEG_KERNEL3(k,grp_sz,d_l,1); TDH_GRP_DEG_KERNEL3(k,grp_sz,d_l,2); \
TDH_GRP_DEG_KERNEL3(k,grp_sz,d_l,4); TDH_GRP_DEG_KERNEL3(k,grp_sz,d_l,8);
#define TDH_GRP_DEG_KERNEL2(grp_sz,d_l) \
TDH_GRP_KERNEL2(norm_group,grp_sz,d_l) \
TDH_GRP_DEG_KERNEL2b(norm_group_u,grp_sz,d_l)
#define TDH_GRP_DEG_KERNEL1(grp_sz) \
TDH_GRP_DEG_KERNEL2(grp_sz,0); TDH_GRP_DEG_KERNEL2(grp_sz,4); \
TDH_GRP_DEG_KERNEL2(grp_sz,8); TDH_GRP_DEG_KERNEL2(grp_sz,32);
#define THD_DL_KERNEL(d_l) \
{ \
constexpr int grp_base = d_l ?: 32; \
TDH_GRP_DEG_KERNEL2(grp_base,d_l); \
}
THD_DL_KERNEL(0);
THD_DL_KERNEL(4);
THD_DL_KERNEL(8);
THD_DL_KERNEL(32);
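// Index the registered kernels by base name and by compile-time d_l; sort each
// per-d_l list by group size.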
vector<string> knames = { "norm_group", "norm_group_u" };
map<string, map<int, vector<App_Kernel_Info*> > > kmap;
for ( auto& k: kernels ) kmap[k.name_base][k.d_l].push_back(&k);
for ( auto& n: knames ) assert( kmap.contains(n) );
map<int, vector<App_Kernel_Info*> > dlmap;
for ( auto& k: kernels ) dlmap[k.d_l].push_back(&k);
for ( auto& [name,k_name]: kmap )
for ( auto& [d_l,kd]: k_name )
ranges::sort( kd, {}, [](auto *a){ return a->grp_size; } );
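// Host-side copies: input data, kernel output, and CPU reference output.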
vector<elt_t> l_in(n_elts_max);
vector<elt_t> l_out(n_elts_max);
vector<elt_t> l_out_check(n_elts_max);
const size_t alloc_size_bytes = max_size_bytes + overrun_size_bytes;
CE( cudaMalloc( &app.l_in_d, alloc_size_bytes ) );
CE( cudaMalloc( &app.l_out_d, alloc_size_bytes ) );
for ( auto& e: l_in ) e = drand48();
double elapsed_time_s = 86400;  // Smallest kernel time seen so far.
const int output_width = stdout_width_get();
{
cudaEvent_t gpu_start_ce, gpu_stop_ce;
CE(cudaEventCreate(&gpu_start_ce));
CE(cudaEventCreate(&gpu_stop_ce));
CE( cudaMemcpy
( app.l_in_d, l_in.data(), max_size_bytes, cudaMemcpyHostToDevice ) );
printf("Launching with %d blocks of up to %d threads. \n",
num_blocks, thd_per_block_goal);
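// Sweep over vector lengths. For each d_l the input array is treated as n_l
// vectors of d_l elements.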
for ( int d_l: { 8, 32 } )
{
const int n_l = n_elts_max / d_l;
const size_t in_size_elts = d_l * n_l;
const size_t out_size_elts = in_size_elts;
const size_t in_size_bytes = in_size_elts * sizeof( elt_t );
const size_t out_size_bytes = in_size_bytes;
const int64_t num_ops_fp = 2 * in_size_elts;
const int64_t num_ops_ls = 2 * in_size_elts + out_size_elts;
const int64_t amt_data_bytes = in_size_bytes + out_size_bytes;
const int64_t alloc_size_vecs =
alloc_size_bytes / ( d_l * sizeof( elt_t ) );
app.n_l = n_l;
app.nlimit_l = alloc_size_vecs;
app.d_l = d_l;
CE( cudaMemcpyToSymbol
( c_app, &app, sizeof(app), 0, cudaMemcpyHostToDevice ) );
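// CPU reference result: subtract each vector's mean from its elements.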
for ( int i=0; i<n_l; i++ )
{
const auto idxs = views::iota( i*d_l, (i+1)*d_l );
elt_t sum = 0;
for ( auto j: idxs ) sum += l_in[j];
const elt_t avg = sum / d_l;
for ( auto j: idxs ) l_out_check[j] = l_in[j] - avg;
}
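// Select the kernels compiled for this d_l (or, if none, the run-time-d_l
// variants) whose group size does not exceed d_l.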
auto& dl_kernels = dlmap.contains(d_l) ? dlmap[d_l] : dlmap[0];
vector<App_Kernel_Info*> k_run;
for ( auto& k: dl_kernels )
{
if ( k->grp_size > d_l ) continue;
k_run.push_back(k);
}
for ( auto ki: k_run )
{
const int kernel = ki->idx;
cudaFuncAttributes& cfa = info.ki[kernel].cfa;
const auto func_ptr = info.ki[kernel].func_ptr;
const int wp_limit = cfa.maxThreadsPerBlock >> 5;
const int thd_limit = wp_limit << 5;
const int thd_per_block_no_vary = min(thd_per_block_goal,thd_limit);
pTable table;
vector<int> n_wps = { 4, 8, 12, 16, 24, 32 };
while ( n_wps.back() > wp_limit ) n_wps.pop_back();
if ( !vary_warps ) n_wps.resize(1);
const int wp_start = n_wps.front();
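// Launch and measure the kernel at each block size (warp count) in n_wps,
// producing one table row per launch.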
for ( int wp_cnt: n_wps )
{
const int thd_per_block =
vary_warps ? wp_cnt << 5 : thd_per_block_no_vary;
CE(cudaMemset(app.l_out_d,0,out_size_bytes));
CE(cudaEventRecord(gpu_start_ce,0));
typedef void (*KPtr)(elt_t*, const elt_t*);
for ( NPerf_data_reset(); NPerf_need_run_get(); )
KPtr( info.ki[kernel].func_ptr )
<<< num_blocks, thd_per_block >>>(app.l_out_d,app.l_in_d);
CE(cudaEventRecord(gpu_stop_ce,0));
CE(cudaEventSynchronize(gpu_stop_ce));
float cuda_time_ms = -1.1;
CE( cudaEventElapsedTime
( &cuda_time_ms, gpu_start_ce, gpu_stop_ce) );
const double this_elapsed_time_s =
NPerf_metrics_collection_get()
? NPerf_kernel_et_get() : cuda_time_ms * 0.001;
const double thpt_compute_gflops =
num_ops_fp / this_elapsed_time_s * 1e-9;
const double thpt_data_gbps =
amt_data_bytes / this_elapsed_time_s * 1e-9;
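// Time lower bounds implied by FP throughput, combined FP + load/store
// instruction issue, off-chip (DRAM) bandwidth, and L2 bandwidth. Each frac_*
// value is the corresponding bound as a fraction of the measured time
// (1.0 means that resource alone would account for the run; clamped at 2).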
const double fp_bw =
sizeof(elt_t) == 4 ? info.chip_sp_flops :
sizeof(elt_t) == 8 ? info.chip_dp_flops : 1;
const double chip_ls_ops = info.chip_sp_flops / 4;
const double t_bound_fp_s = num_ops_fp / fp_bw;
const double t_bound_ls_s = num_ops_ls / chip_ls_ops;
const double t_bound_insn_s = t_bound_fp_s + t_bound_ls_s;
const double t_bound_mem_s = amt_data_bytes / info.chip_bw_Bps;
const double t_bound_l2_s = amt_data_bytes / l2_bandwidth_Bps;
const double frac_fp =
min( 2.0, t_bound_fp_s / this_elapsed_time_s );
const double frac_insn =
min( 2.0, t_bound_insn_s / this_elapsed_time_s );
const double frac_mem =
min( 2.0, t_bound_mem_s / this_elapsed_time_s );
const double frac_l2 =
min( 2.0, t_bound_l2_s / this_elapsed_time_s );
const double comm_frac =
min(2.0,1e9 * thpt_data_gbps / info.chip_bw_Bps);
const double comm_l2_frac =
min(2.0,1e9 * thpt_data_gbps / l2_bandwidth_Bps );
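// Estimate active warps per MP from the kernel's occupancy limit and the
// number of blocks each MP actually receives.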
const int num_wps = ( thd_per_block + 31 ) >> 5;
const int max_bl_per_mp =
info.get_max_active_blocks_per_mp(kernel,thd_per_block);
const int bl_per_mp_available =
0.999 + double(num_blocks) / num_mp;
const int bl_per_mp =
min( bl_per_mp_available, max_bl_per_mp );
const int act_wps = num_wps * bl_per_mp;
const int act_thds_gpu =
min( num_mp * act_wps * 32, num_blocks * thd_per_block );
if ( wp_cnt == wp_start )
printf("\nKernel %s. %d regs. "
"D_L %d, Group sz %d, Unroll %d, n_l %d d_l %d\n",
ki->name_base,
info.ki[kernel].cfa.numRegs,
ki->d_l, ki->grp_size, ki->unroll_degree, n_l, d_l );
table.row_start();
table.entry("wp",num_wps);
if ( num_blocks > num_mp )
table.entry("ac",act_wps);
table.entry
("Utl","%4.2f",
utilization_compute
(info,app,num_blocks,thd_per_block,ki->grp_size,
ki->unroll_degree));
table.entry("t/µs","%4.0f", this_elapsed_time_s * 1e6);
table.entry
("I/el","%4.1f",
NPerf_metric_value_get("sm__inst_executed.sum") * 32.0
/ in_size_elts );
if ( opt_p )
{
if ( false )
table.entry
("%","%2.0f",
NPerf_metric_value_get
("sm__instruction_throughput"
".avg.pct_of_peak_sustained_elapsed") );
table.entry
("BXW","%4.1f",
NPerf_metric_value_get
("l1tex__data_bank_conflicts_pipe_lsu.sum") /
NPerf_metric_value_get ("l1tex__t_requests.sum"));
table.header_span_start("L2-Cache");
const double data_l2_l1_bytes =
NPerf_metric_value_get
("l1tex__m_xbar2l1tex_read_bytes.sum");
const double data_l1_l2_bytes =
NPerf_metric_value_get
("l1tex__m_l1tex2xbar_write_bytes.sum");
table.entry
("N*R", "%4.1f", data_l2_l1_bytes / in_size_bytes );
table.entry
("N*W", "%4.1f", data_l1_l2_bytes / out_size_bytes );
table.entry
("%pk", "%3.0f",
100.0 * ( data_l1_l2_bytes + data_l2_l1_bytes )
/ ( this_elapsed_time_s * l2_bandwidth_Bps ) );
table.entry
("GB/s", "%4.0f",
1e-9*
( NPerf_metric_value_get
("l1tex__m_xbar2l1tex_read_bytes.sum")
+ NPerf_metric_value_get
("l1tex__m_l1tex2xbar_write_bytes.sum") )
/ this_elapsed_time_s );
table.header_span_end();
table.header_span_start("DRAM");
if ( false )
table.entry
("N*RW", "%4.1f",
( NPerf_metric_value_get("dram__bytes_read.sum")
+ NPerf_metric_value_get("dram__bytes_write.sum") )
/ ( in_size_bytes + out_size_bytes ) );
table.entry
("GB/s","%4.0f",
1e-9 *
( NPerf_metric_value_get("dram__bytes_write.sum")
+ NPerf_metric_value_get("dram__bytes_read.sum") )
/ this_elapsed_time_s );
table.header_span_end();
}
table.entry("FP θ","%4.0f", thpt_compute_gflops);
const size_t max_st_len =
max(5, output_width - 1 - table.row_len_get() );
pStringF fmt("%%-%zds",max_st_len);
typedef struct { double f; char c; } Elt;
const bool l2_contained = amt_data_bytes < l2_size_bytes;
const double frac_data = l2_contained ? frac_l2 : frac_mem;
vector<Elt> segments =
{ { frac_fp, '+' }, { frac_insn, '-' }, { frac_data, '*' } };
string util_hdr = "=== Util: FP++ Insn-- ";
util_hdr += l2_contained ? "L2** " : "Mem** ";
if ( max_st_len > util_hdr.length() )
util_hdr += string(max_st_len - util_hdr.length(),'=');
ranges::sort( segments, {}, &Elt::f );
string bar;
for ( Elt& e: segments )
if ( size_t p = e.f * max_st_len + 0.5; p > bar.length() )
bar += string( p - bar.length(), e.c );
if ( bar.length() > max_st_len )
{
bar.resize(max_st_len);
bar[max_st_len-1] = '>';
}
table.entry(util_hdr,fmt, bar, pTable::pT_Left);
elapsed_time_s = min(this_elapsed_time_s,elapsed_time_s);
CE( cudaMemcpy
( l_out.data(), app.l_out_d, out_size_bytes,
cudaMemcpyDeviceToHost ) );
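// Check the device result against the CPU reference.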
int err_count = 0;
elt_t max_err = 0;
for ( int i=0; i<n_l; i++ )
for ( int j=0; j<d_l; j++ )
{
const int idx = i * d_l + j;
const elt_t err = fabs( l_out_check[idx] - l_out[idx] );
set_max( max_err, err );
if ( err > 1e-5 )
{
err_count++;
if ( err_count < 5 )
printf
( "Error at vec %d elt %d: "
"%.7f != %.7f (correct)\n",
i, j, l_out[idx], l_out_check[idx] );
}
}
if ( err_count )
printf("Total errors %d, max error %f\n",
err_count, max_err );
}
printf("%s",table.body_get());
}
}
}
}