#include <stdio.h>
#include <cuda_runtime.h>
#include <gp/cuda-gpuinfo.h>
#include <ptable.h>
#include <nperf.h>
typedef float wht_t;
typedef float acc_t;
struct Layer_Shape
{
int nn, nc, ni, no, nm;
};
constexpr Layer_Shape ls[] = { { 200, 20, 32, 32, 20 },
{ 200, 52, 48, 48, 52 } };
constexpr int n_shapes = sizeof(ls)/sizeof(ls[0]);
struct Layer
{
wht_t *w_h, *w_d;
wht_t *w2_h, *w2_d;
acc_t *ai_h, *ai_d;
acc_t *ao_h, *ao_d, *ao_check_h;
Layer_Shape s;
size_t sz_ai_elts, sz_ao_elts, sz_w_elts;
size_t sz_ai_bytes, sz_ao_bytes, sz_w_bytes;
};
template<int tpnn=0, int tpnc=0, int tpni=0>
__global__ void
dnn_base(Layer l)
{
Layer_Shape& s = l.s;
const int [[gnu::unused]] tid = blockIdx.x * blockDim.x + threadIdx.x;
const int [[gnu::unused]] num_threads = blockDim.x * gridDim.x;
const int nn = tpnn ? tpnn : s.nn; const int ni = tpni ? tpni : s.ni; const int nc = tpnc ? tpnc : s.nc; const int no = ni; const int nm = nc; assert( ni == s.ni );
assert( nc == s.nc );
assert( nn == s.nn );
acc_t* const ai = l.ai_d;
acc_t* const ao = l.ao_d;
wht_t [[gnu::unused]] * const w = l.w_d;
wht_t [[gnu::unused]] * const w2 = l.w2_d;
const int nnmo = nn * nm * no;
for ( int inmo = tid; inmo < nnmo; inmo += num_threads )
{
const int im = inmo % nm;
const int ino = inmo / nm;
const int in = ino % nn;
const int io = ino / nn;
acc_t ac = 0;
for ( int ic = 0; ic < nc; ic++ )
for ( int ii = 0; ii < ni; ii++ )
ac +=
ai[ ii + ni * ( ic + nc * in ) ]
* w[ im + nm * ( ii + ni * ( ic + nc * io ) ) ];
ao[ io + no * ( im + nm * in ) ] = ac;
}
}
template<int tpnn=0, int tpnc=0, int tpni=0, int bn=8, int bo=2, int bm=4>
__global__ void
dnn_fe(Layer l)
{
Layer_Shape& s = l.s;
const int [[gnu::unused]] tid = blockIdx.x * blockDim.x + threadIdx.x;
const int [[gnu::unused]] num_threads = blockDim.x * gridDim.x;
const int nn = tpnn ? tpnn : s.nn; const int ni = tpni ? tpni : s.ni; const int nc = tpnc ? tpnc : s.nc; const int no = ni; const int nm = nc; assert( ni == s.ni );
assert( nc == s.nc );
assert( nn == s.nn );
acc_t* const ai = (acc_t*) __builtin_assume_aligned(l.ai_d,16);
acc_t* const ao = (acc_t*) __builtin_assume_aligned(l.ao_d,16);
wht_t [[gnu::unused]] * const w = (wht_t*) __builtin_assume_aligned(l.w_d,16);
wht_t [[gnu::unused]] * const w2 =
(wht_t*) __builtin_assume_aligned(l.w2_d,16);
constexpr int ab = bo * bn * bm;
const int nnmo = nn * nm * no;
const int nnmo_ab = nnmo / ab;
const int nn_bn = nn / bn;
const int nm_bm = nm / bm;
for ( int inmo = tid; inmo < nnmo_ab; inmo += num_threads )
{
const int im_bm = inmo % nm_bm;
const int im0 = im_bm * bm;
const int ino = inmo / nm_bm;
const int in_bn = ino % nn_bn;
const int in0 = in_bn * bn;
const int io_bo = ino / nn_bn;
const int io0 = io_bo * bo;
acc_t ac[bo][bm][bn]{};
for ( int ic = 0; ic < nc; ic++ )
#pragma unroll 4
for ( int ii = 0; ii < ni; ii++ )
{
acc_t ain[bn];
for ( int i_bn = 0; i_bn < bn; i_bn++ )
{
const int in = in0 + i_bn;
ain[i_bn] = ai[ ii + ni * ( ic + nc * in ) ];
}
for ( int i_bm = 0; i_bm < bm; i_bm++ )
for ( int i_bo = 0; i_bo < bo; i_bo++ )
{
const int im = im0 + i_bm;
const int io = io0 + i_bo;
wht_t wht = w[ im + nm * ( ii + ni * ( ic + nc * io ) ) ];
for ( int i_bn = 0; i_bn < bn; i_bn++ )
ac[i_bo][i_bm][i_bn] += ain[i_bn] * wht;
}
}
for ( int i_bn = 0; i_bn < bn; i_bn++ )
for ( int i_bm = 0; i_bm < bm; i_bm++ )
for ( int i_bo = 0; i_bo < bo; i_bo++ )
{
const int io = io0 + i_bo;
const int in = in0 + i_bn;
const int im = im0 + i_bm;
ao[ io + no * ( im + nm * in ) ] = ac[i_bo][i_bm][i_bn];
}
}
}
void
layer_init(Layer &l)
{
Layer_Shape& s = l.s;
int &ni = s.ni; int &no = s.no; int &nc = s.nc; int &nm = s.nm; int &nn = s.nn;
srand48(2735);
l.sz_ai_elts = ni * nc * nn;
l.sz_ao_elts = no * nm * nn;
l.sz_w_elts = nc * nm * ni * no;
l.ai_h = new acc_t[l.sz_ai_elts];
l.ao_h = new acc_t[l.sz_ao_elts];
l.ao_check_h = new acc_t[l.sz_ao_elts];
l.w_h = new wht_t[l.sz_w_elts];
l.w2_h = new wht_t[l.sz_w_elts];
l.sz_ai_bytes = l.sz_ai_elts * sizeof(*l.ai_d);
l.sz_w_bytes = l.sz_w_elts * sizeof(*l.w_d);
l.sz_ao_bytes = l.sz_ao_elts * sizeof(*l.ao_d);
CE( cudaMalloc( &l.ai_d, l.sz_ai_bytes ) );
CE( cudaMalloc( &l.w_d, l.sz_w_bytes ) );
CE( cudaMalloc( &l.w2_d, l.sz_w_bytes ) );
CE( cudaMalloc( &l.ao_d, l.sz_ao_bytes ) );
acc_t* const ai = l.ai_h;
acc_t* const ao = l.ao_check_h;
wht_t* const w = l.w_h;
const bool debug = false;
const bool debug_w = false;
for ( int in = 0; in < nn; in++ )
for ( int ic = 0; ic < nc; ic++ )
for ( int ii = 0; ii < ni; ii++ )
{
size_t idx_ai = ii + ni * ( ic + nc * in );
ai[idx_ai] = debug ? ic : drand48();
}
for ( size_t i=0; i<l.sz_w_elts; i++ ) w[i] = debug_w ? 1.0 : drand48()-0.5;
# pragma omp parallel for
for ( size_t i=0; i<l.sz_ao_elts; i++ ) ao[i] = -1;
# pragma omp parallel for
for ( int in = 0; in < nn; in++ )
for ( int im = 0; im < nm; im++ )
for ( int io = 0; io < no; io++ )
{
acc_t ac = 0;
for ( int ic = 0; ic < nc; ic++ )
for ( int ii = 0; ii < ni; ii++ )
{
size_t idx_ai = ii + ni * ( ic + nc * in );
size_t idx_w = im + nm * ( ii + ni * ( ic + nc * io ) );
ac += ai[ idx_ai ] * w[ idx_w ];
}
ao[ io + no * ( im + nm * in ) ] = ac;
}
# pragma omp parallel for
for ( int im = 0; im < nm; im++ )
for ( int io = 0; io < no; io++ )
for ( int ic = 0; ic < nc; ic++ )
for ( int ii = 0; ii < ni; ii++ )
{
size_t idx_w = im + nm * ( ii + ni * ( ic + nc * io ) );
assert( idx_w < l.sz_w_elts );
size_t idx_w2 = im + nm * ( ii + ni * ( ic + nc * io ) );
assert( idx_w2 < l.sz_w_elts );
l.w2_h[idx_w2] = w[idx_w];
}
}
GPU_Info
print_gpu_and_kernel_info()
{
GPU_Info info;
gpu_info_print();
int dev = gpu_choose_index();
CE(cudaSetDevice(dev));
printf("Using GPU %d\n",dev);
info.get_gpu_info(dev);
return info;
}
int
main(int argc, char **argv)
{
NPerf_init();
GPU_Info info = print_gpu_and_kernel_info();
NPerf_metric_collect("inst_executed");
NPerf_metric_collect("l2_global_load_bytes");
NPerf_metric_collect("l2_write_transactions");
NPerf_metric_collect("dram_read_bytes");
NPerf_metric_collect("dram_write_bytes");
constexpr int wp_sz = 32;
struct App_Kernel_Info {
App_Kernel_Info
(Kernel_Info& k,const char *name, int i, int bnp, int bop, int bmp):
k_ptr(k.func_ptr),name_base(name),shape_idx{i},bn(bnp),bo(bop),bm(bmp){};
GPU_Info_Func k_ptr;
const char *name_base;
const int shape_idx;
const int bn, bo, bm;
};
vector<App_Kernel_Info> kernels;
#define EXAMINE_KERNEL(k,sidx,bn,bo,bm) \
{ const int idx = kernels.size(); \
kernels.emplace_back(info.GET_INFO((k)),#k,sidx,bn,bo,bm); }
#define SPECIFY_KERNEL(k,sidx) \
EXAMINE_KERNEL((k<ls[sidx].nn,ls[sidx].nc,ls[sidx].ni>),sidx,1,1,1);
#define SPECIALIZE_KERNEL(sidx) \
SPECIFY_KERNEL(dnn_base,sidx);
#define BLOCKIZE_KERNEL(sidx,bn,bo,bm) \
EXAMINE_KERNEL((dnn_fe<ls[sidx].nn,ls[sidx].nc,ls[sidx].ni,bn,bo,bm>),sidx,bn,bo,bm); \
#define BLOCKIZE_KERNELS(sidx) \
SPECIFY_KERNEL(dnn_base,sidx); \
BLOCKIZE_KERNEL(sidx,4,1,1); \
BLOCKIZE_KERNEL(sidx,2,1,2); \
BLOCKIZE_KERNEL(sidx,4,1,2); \
BLOCKIZE_KERNEL(sidx,4,1,4); \
BLOCKIZE_KERNEL(sidx,4,2,4); \
BLOCKIZE_KERNEL(sidx,8,2,4);
BLOCKIZE_KERNELS(0);
BLOCKIZE_KERNELS(1);
#undef SPECIALIZE_KERNEL
const bool want_kernel_info = true;
if ( want_kernel_info )
{
printf("\nCUDA Kernel Resource Usage:\n");
for ( int i=0; i<info.num_kernels; i++ )
{
printf("For %s:\n", info.ki[i].name);
printf(" %6zd shared, %zd const, %zd loc, %d regs; "
"%d max threads per block.\n",
info.ki[i].cfa.sharedSizeBytes,
info.ki[i].cfa.constSizeBytes,
info.ki[i].cfa.localSizeBytes,
info.ki[i].cfa.numRegs,
info.ki[i].cfa.maxThreadsPerBlock);
}
}
const int num_mp = info.cuda_prop.multiProcessorCount;
const int arg1_int = argc < 2 ? num_mp : atoi(argv[1]);
const int num_blocks =
arg1_int == 0 ? num_mp :
arg1_int < 0 ? -arg1_int * num_mp : arg1_int;
const int wp_per_block_arg = argc < 3 ? 0 : atoi(argv[2]);
const int wp_per_block_goal =
wp_per_block_arg == 0 ? 32 : wp_per_block_arg;
const int n_threads = num_blocks * wp_per_block_goal * wp_sz;
const bool vary_warps = wp_per_block_arg == 0;
Layer layers[n_shapes];
for ( int i=0; i<n_shapes; i++ )
{
Layer& l = layers[i];
Layer_Shape& s = l.s;
s = ls[i];
layer_init(l);
const size_t act_one_bytes =
( s.ni*s.nc + s.no*s.nm ) * sizeof(l.ai_h[0]);
const size_t act_all_bytes = act_one_bytes * s.nn;
printf("Layer shape %d: ni=no=%d. nc=nm=%d. nn=%d.\n",
i, s.ni, s.nc, s.nn);
printf(" Number elts: activations %zd, weights %zd\n",
l.sz_ai_elts + l.sz_ao_elts, l.sz_w_elts);
printf(" Weights size: %zu kiB L2 cache units: %.3f\n",
l.sz_w_bytes >> 10,
double(l.sz_w_bytes) / info.cuda_prop.l2CacheSize);
printf(" Act size one batch : %zu B L2 cache units: %.3f\n",
act_one_bytes,
double(act_one_bytes) / info.cuda_prop.l2CacheSize);
printf(" Act size all batches: %zu B L2 cache units: %.3f\n",
act_all_bytes,
double(act_all_bytes) / info.cuda_prop.l2CacheSize);
}
if ( n_threads <= 0 )
{
printf("Usage: %s [ NUM_CUDA_BLOCKS ] [WARPS_PER_BLOCK] "
"[COL PER MP]\n",
argv[0]);
exit(1);
}
const int output_width = stdout_width_get();
{
cudaEvent_t gpu_start_ce, gpu_stop_ce;
CE(cudaEventCreate(&gpu_start_ce));
CE(cudaEventCreate(&gpu_stop_ce));
for ( auto &l: layers )
{
CE( cudaMemcpy
( l.ai_d, l.ai_h, l.sz_ai_bytes, cudaMemcpyHostToDevice ) );
CE( cudaMemcpy
( l.w_d, l.w_h, l.sz_w_bytes, cudaMemcpyHostToDevice ) );
CE( cudaMemcpy
( l.w2_d, l.w2_h, l.sz_w_bytes, cudaMemcpyHostToDevice ) );
}
printf("Launching with %d blocks of up to %d warps. \n",
num_blocks, wp_per_block_goal);
for ( auto& aki: kernels )
{
const char* kname = aki.name_base;
const int sidx = aki.shape_idx;
Layer& l = layers[sidx];
const Layer_Shape s = l.s;
const int o_per_iter = aki.bn * aki.bo * aki.bm;
printf("%s bn=%d, bo=%d, bm=%d, out/iter=%d\n",
kname, aki.bn, aki.bo, aki.bm, o_per_iter);
pTable table(stdout);
Kernel_Info* const ki = &info.get_info(aki.k_ptr);
const int wp_limit = ki->cfa.maxThreadsPerBlock >> 5;
const int thd_limit = wp_limit << 5;
const int thd_per_block_no_vary =
min(wp_per_block_goal*wp_sz,thd_limit);
const int wp_start = 1;
const int wp_stop = vary_warps ? wp_limit : wp_start;
const int wp_inc = 1;
for ( int wp_cnt = wp_start; wp_cnt <= wp_stop; wp_cnt += wp_inc )
{
const int thd_per_block =
vary_warps ? wp_cnt << 5 : thd_per_block_no_vary;
if ( vary_warps && wp_cnt > 4 && wp_cnt & 0x3 ) continue;
const int64_t num_ops_fp = l.sz_w_elts * s.nn;
const int64_t num_ops_ls =
s.nn * l.sz_w_elts
+ s.no * s.nm * l.sz_ai_elts + l.sz_ao_elts;
const int64_t amt_data_bytes =
l.sz_w_bytes + l.sz_ai_bytes + l.sz_ao_bytes;
{
CE(cudaMemset(l.ao_d,0,l.sz_ao_bytes));
CE(cudaEventRecord(gpu_start_ce,0));
typedef void (*KPtr)(Layer);
for ( NPerf_data_reset(); NPerf_need_run_get(); )
KPtr(ki->func_ptr) <<< num_blocks, thd_per_block >>>(l);
CE(cudaEventRecord(gpu_stop_ce,0));
CE(cudaEventSynchronize(gpu_stop_ce));
float cuda_time_ms = -1.1;
CE( cudaEventElapsedTime
(&cuda_time_ms,gpu_start_ce,gpu_stop_ce) );
const double this_elapsed_time_s =
NPerf_metrics_collection_get()
? NPerf_kernel_et_get() : cuda_time_ms * 0.001;
const double thpt_compute_gflops =
num_ops_fp / this_elapsed_time_s * 1e-9;
const double thpt_data_gbps =
amt_data_bytes / this_elapsed_time_s * 1e-9;
const double chip_ls_ops = info.chip_sp_flops / 4;
const double t_bound_fp = num_ops_fp / info.chip_sp_flops;
const double t_bound_ls = num_ops_ls / chip_ls_ops;
const double t_bound_insn = t_bound_fp + t_bound_ls;
{
const double comp_frac = t_bound_insn / this_elapsed_time_s;
const double bw_frac =
1e9 * thpt_data_gbps / info.chip_bw_Bps;
const double fp_frac = t_bound_fp / this_elapsed_time_s;
const int num_wps = ( thd_per_block + 31 ) >> 5;
const int max_bl_per_mp =
ki->get_max_active_blocks_per_mp(thd_per_block);
const int bl_per_mp_available =
0.999 + double(num_blocks) / num_mp;
const int bl_per_mp =
min( bl_per_mp_available, max_bl_per_mp );
const int act_wps = num_wps * bl_per_mp;
pTable_Row row(table);
table.entry("nn","%2d",s.nn);
table.entry("nc","%2d",s.nc);
table.entry("ni","%2d",s.ni);
table.entry("wp",num_wps);
if ( num_blocks > num_mp )
table.entry("ac",act_wps);
if ( NPerf_metrics_collection_get() )
{
const double transaction_sz_bytes = 32;
double dram_rd_bytes =
NPerf_metric_value_get("dram_read_bytes");
double dram_wr_bytes =
NPerf_metric_value_get("dram_write_bytes");
double l2_rd_bytes =
NPerf_metric_value_get("l2_global_load_bytes");
double l2_wr_bytes =
NPerf_metric_value_get("l2_write_transactions")
* transaction_sz_bytes;
table.entry
("I/op","%4.1f",
NPerf_metric_value_get("inst_executed")
* 32.0 / num_ops_fp );
table.entry
("DUse","%4.1f",
( dram_rd_bytes + dram_wr_bytes ) / amt_data_bytes);
if ( false )
table.entry("DW","%4.1f", dram_wr_bytes / l.sz_ao_bytes );
table.entry
("2Use","%5.1f",
( l2_rd_bytes + l2_wr_bytes ) / amt_data_bytes);
if ( false )
table.entry("2W","%4.1f", l2_wr_bytes / l.sz_ao_bytes);
}
table.entry("t/µs","%6.0f", this_elapsed_time_s * 1e6);
table.entry("FP θ","%4.0f", thpt_compute_gflops);
if ( false )
table.entry("GB/s","%4.0f", thpt_data_gbps);
const size_t max_st_len =
max(5, output_width - 1 - table.row_len_get() );
pStringF fmt("%%-%zds",max_st_len);
string util_hdr =
"=== Util: FP++ Insn-- Data** ";
if ( max_st_len > util_hdr.length() )
util_hdr += string(max_st_len - util_hdr.length(),'=');
typedef struct { double f; char c; } Elt;
vector<Elt> segments =
{ { fp_frac, '+' }, { comp_frac, '-' }, { bw_frac, '*' } };
sort( segments.begin(), segments.end(),
[](Elt& a, Elt& b){ return a.f < b.f; } );
string bar;
for ( Elt& e: segments )
if ( size_t p = e.f * max_st_len + 0.5; p > bar.length() )
bar += string( p - bar.length(), e.c );
if ( bar.length() > max_st_len )
{
bar.resize(max_st_len);
bar[max_st_len-1] = '>';
}
table.entry(util_hdr,fmt, bar, pTable::pT_Left);
}
CE( cudaMemcpy
( l.ao_h, l.ao_d, l.sz_ao_bytes, cudaMemcpyDefault) );
{
int err_count = 0;
Layer_Shape& s = l.s;
const int no = s.no; const int nm = s.nm; const int nn = s.nn;
for ( int in = 0; in < nn; in++ )
for ( int im = 0; im < nm; im++ )
for ( int io = 0; io < no; io++ )
{
size_t idx = io + no * ( im + nm * in );
if ( fabs( l.ao_check_h[idx] - l.ao_h[idx] ) > 1e-4 )
{
err_count++;
if ( err_count < 5 )
printf
("Error at %d, %d, %d: "
"%.7f != %.7f (correct)\n",
in, im, io, l.ao_h[idx],
l.ao_check_h[idx]);
}}
if ( err_count )
printf("Total errors %d\n", err_count);
}
}
}
}
}
}