#include <cuda/std/bit>
#include <cuda_runtime.h>
#include <gp/cuda-gpuinfo.h>
#include <ptable.h>
#include <nperf.h>
#include <misc.h>
#include <stdio.h>
#include <stdlib.h>   // atoi, atof, drand48, exit
#include <math.h>     // fabs
#include <random>
#include <vector>
#include <deque>
#include <map>
#include <string>
#include <algorithm>
#include <ranges>
typedef uint32_t pClock_t;
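// Per-warp timing record. Lane 0 of each warp fills one Timing_Item with the
// SM cycle counter (clock()) and the SM id at kernel start and end; the host
// later uses these to estimate how long each SM was busy.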
constexpr size_t timing_item_size = 16;
struct __align__(timing_item_size) Timing_Item {
pClock_t time_start;
uint32_t smid_start;
pClock_t time_end;
uint32_t smid_end;
};
static_assert( timing_item_size == sizeof(Timing_Item) );
struct Timing { Timing_Item *timing_items; };
__constant__ Timing timing_dev;
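// Return the id of the SM (multiprocessor) executing the calling thread,
// read from the %smid special register via inline PTX.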
__device__ uint32_t
smid_get()
{
uint smid = 0;
asm( "mov.u32 %0, %%smid;" : "=r" (smid) );
return smid;
}
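// Record the start / end timestamp and SM id for warp wp. Only lane 0 writes
// the record; __syncwarp orders the timestamp with respect to the warp's
// work: in timing_start it is taken before the other lanes proceed, in
// timing_end only after all lanes have arrived.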
__device__ void
timing_start(const int wp)
{
constexpr int wp_sz = 32;
if ( threadIdx.x % wp_sz == 0 )
{
Timing_Item& ti = timing_dev.timing_items[wp];
ti.time_start = clock();
ti.smid_start = smid_get();
}
__syncwarp();
}
__device__ void
timing_end(const int wp)
{
constexpr int wp_sz = 32;
__syncwarp();
if ( threadIdx.x % wp_sz == 0 )
{
Timing_Item& ti = timing_dev.timing_items[wp];
ti.time_end = clock();
ti.smid_end = smid_get();
}
}
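// elt_t is the element type used throughout. App carries the problem size
// (n_l vectors of d_l elements each) and the device input / output arrays;
// a copy is placed in constant memory (c_app) before each set of launches.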
typedef float elt_t;
struct App
{
int n_l;
int d_l;
elt_t *l_in_d, *l_out_d;
};
__constant__ App c_app;
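// Baseline kernel: each thread handles whole vectors on its own, computing
// the mean of a d_l-element row of l_in and writing the mean-subtracted row
// to l_out. When template argument D_L is nonzero the vector length is a
// compile-time constant; D_L ?: c_app.d_l uses the GNU ?: extension to fall
// back to the run-time value.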
template< int D_L = 0 >
__global__ void
norm_base(elt_t* __restrict__ l_out, const elt_t* __restrict__ l_in)
{
constexpr int wp_sz = 32;
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int n_threads = blockDim.x * gridDim.x;
const int d_l = D_L ?: c_app.d_l;
const int n_l = c_app.n_l;
timing_start(tid/wp_sz);
for ( int h = tid; h < n_l; h += n_threads )
{
elt_t sum = 0;
for ( int i = 0; i < d_l; i++ ) sum += l_in[ h * d_l + i ];
const elt_t avg = sum / d_l;
for ( int i = 0; i < d_l; i++ )
l_out[ h * d_l + i ] = l_in[ h * d_l + i ] - avg;
}
timing_end(tid/wp_sz);
}
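// Sum thd_val over a group of group_size consecutive lanes (as used here, a
// power of two no larger than the warp size) using an xor-shuffle butterfly.
// E.g., for group_size = 4 the two iterations exchange at distances 1 and 2,
// after which every lane of the group holds the 4-lane total.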
__device__ elt_t
group_sum(elt_t thd_val, uint group_size)
{
assert( group_size <= 32 );
const uint32_t mask =
group_size > 1 && group_size < 32 ? __activemask() : ~0;
elt_t sum = thd_val;
for ( int dist = 1; dist < group_size; dist <<= 1 )
sum += __shfl_xor_sync(mask, sum, dist );
return sum;
}
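// One vector per group of grp_size lanes (grp_size <= warp size). Each lane
// loads elements with stride grp_size, the per-lane partial sums are
// combined with group_sum, and every lane then subtracts the mean from its
// share of the elements.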
template<int D_L = 0, int grp_size = 1>
__global__ void
norm_group(elt_t* __restrict__ l_out, const elt_t* __restrict__ l_in)
{
constexpr int wp_sz = 32;
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int n_threads = blockDim.x * gridDim.x;
const int d_l = D_L ?: c_app.d_l;
const int n_l = c_app.n_l;
timing_start(tid/wp_sz);
const int sub_lane = threadIdx.x % grp_size;
const int h_start = tid / grp_size;
for ( int h = h_start; h < n_l; h += n_threads / grp_size )
{
const size_t idx_vec_start = h * d_l;
const size_t idx_vec_thd_start = idx_vec_start + sub_lane;
elt_t thd_sum = 0;
for ( int i = 0; i < d_l; i += grp_size )
thd_sum += l_in[ idx_vec_thd_start + i ];
const elt_t sum = group_sum(thd_sum,grp_size);
const elt_t avg = sum / d_l;
for ( int i = 0; i < d_l; i += grp_size )
l_out[ idx_vec_thd_start + i ] = l_in[ idx_vec_thd_start + i ] - avg;
}
timing_end(tid/wp_sz);
}
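// Heuristic choice of group size for norm_hw04. As I read the arithmetic: it
// limits the number of vectors a block works on at once to roughly what fits
// in a third of the available L1 (n_vec_raw), derives the group size needed
// to stay under that limit, rounds it up to a power-of-two number of warps,
// and caps it at the block size.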
int
hw04_group_size_choose(int l1_size_bytes, uint block_size, int d_l)
{
[[ maybe_unused ]] constexpr int elt_size_bytes = sizeof(elt_t);
constexpr int wp_lg = 5;
[[ maybe_unused ]] constexpr int wp_sz = 1 << wp_lg;
const int block_size_pot = bit_ceil(block_size);
assert( block_size_pot == block_size );
const int l1_size_elts = l1_size_bytes / elt_size_bytes;
const int n_wps_bl = block_size >> wp_lg;
const int n_wps_vec = d_l >> wp_lg;
assert( n_wps_vec << wp_lg == d_l );
const int n_vec_raw = 2 * l1_size_elts / ( 6 * d_l );
const int grp_size_raw =
n_vec_raw ? ( block_size + n_vec_raw - 1 ) / n_vec_raw : block_size;
const uint grp_wps_raw = ( grp_size_raw + wp_sz - 1 ) >> wp_lg;
const int grp_wps_pot = bit_ceil(grp_wps_raw);
const int grp_wps =
grp_wps_pot > n_wps_bl ? n_wps_bl : grp_wps_pot;
const int grp_size = grp_wps * wp_sz;
assert( grp_size <= d_l );
assert( grp_size && grp_size <= block_size );
assert( block_size / grp_size * grp_size == block_size );
return grp_size;
}
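// Like norm_group, but the group working on a vector may span several warps
// (GRP_SIZE a multiple of 32). Each warp first reduces its own lanes with
// group_sum; when the group has more than one warp, the per-warp sums pass
// through shared memory (ssum) and warp 0 reduces them, again via group_sum.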
template<int D_L = 0, int GRP_SIZE = 1>
__global__ void
norm_hw04(elt_t* __restrict__ l_out, const elt_t* __restrict__ l_in)
{
constexpr int wp_sz = 32;
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int n_threads = blockDim.x * gridDim.x;
const int d_l = D_L ?: c_app.d_l;
const int n_l = c_app.n_l;
timing_start(tid/wp_sz);
constexpr int grp_size = GRP_SIZE;
const int sub_lane = threadIdx.x % grp_size;
const int h_start = tid / grp_size;
const int lane = threadIdx.x % wp_sz;
const int wp_idx = threadIdx.x / wp_sz;
constexpr int grp_wps = grp_size / wp_sz;
__shared__ elt_t ssum[32];
for ( int h = h_start; h < n_l; h += n_threads / grp_size )
{
const size_t idx_vec_start = h * d_l;
const size_t idx_vec_thd_start = idx_vec_start + sub_lane;
elt_t thd_sum = 0;
for ( int i = 0; i < d_l; i += grp_size )
thd_sum += l_in[ idx_vec_thd_start + i ];
const elt_t wp_sum = group_sum(thd_sum,32);
elt_t vec_sum;
if ( grp_wps == 1 )
{
vec_sum = wp_sum;
}
else
{
__syncthreads(); if ( !lane ) ssum[wp_idx] = wp_sum;
__syncthreads(); if ( !wp_idx )
ssum[threadIdx.x] = group_sum( ssum[threadIdx.x], grp_wps );
__syncthreads(); vec_sum = ssum[wp_idx];
}
const elt_t avg = vec_sum / d_l;
for ( int i = 0; i < d_l; i += grp_size )
l_out[ idx_vec_thd_start + i ] = l_in[ idx_vec_thd_start + i ] - avg;
}
timing_end(tid/wp_sz);
}
GPU_Info
print_gpu_and_kernel_info()
{
GPU_Info info;
gpu_info_print();
int dev = gpu_choose_index();
CE(cudaSetDevice(dev));
printf("Using GPU %d\n",dev);
info.get_gpu_info(dev);
return info;
}
int
main(int argc, char **argv)
{
App app;
NPerf_init();
GPU_Info info = print_gpu_and_kernel_info();
const uint num_mp = info.cuda_prop.multiProcessorCount;
constexpr int wp_lg = 5;
const int arg1_int = argc < 2 ? num_mp : atoi(argv[1]);
const uint num_blocks =
arg1_int == 0 ? num_mp : arg1_int < 0 ? -arg1_int * num_mp : arg1_int;
const bool opt_p = true;
const int thd_per_block_arg = argc < 3 ? 0 : atoi(argv[2]);
const int thd_per_block_goal =
thd_per_block_arg == 0 ? 1024 : thd_per_block_arg;
const int num_threads = num_blocks * thd_per_block_goal;
const bool vary_warps = thd_per_block_arg == 0;
const int wp_per_block_arg = thd_per_block_arg >> wp_lg;
const int l1_size_bytes = info.cuda_prop.sharedMemPerBlockOptin;
const int l2_size_bytes = info.cuda_prop.l2CacheSize;
const float l2_bandwidth_Bps = 32 * bit_floor(num_mp-1) * info.clock_freq_hz;
const float default_n_l2_units = 0.25;
const float arg3_val = argc < 4 ? -default_n_l2_units : atof(argv[3]);
const int n_bytes_raw =
arg3_val < 0 ? -arg3_val * l2_size_bytes : arg3_val * ( 1 << 20 );
const int n_elts_max = n_bytes_raw / sizeof(elt_t);
if ( num_threads <= 0 || n_elts_max <= 0 )
{
printf("Usage: %s [ NUM_CUDA_BLOCKS ] [THD_PER_BLOCK|p] "
"[-DATA_SIZE_L2_UNITS|DATA_SIZE_MiB]\n",
argv[0]);
exit(1);
}
NPerf_metric_collect("sm__inst_executed.sum");
NPerf_metric_collect("gld_efficiency");
if ( opt_p )
{
NPerf_metric_collect
("sm__instruction_throughput.avg.pct_of_peak_sustained_elapsed");
NPerf_metric_collect("l1tex__m_xbar2l1tex_read_bytes.sum");
NPerf_metric_collect("l1tex__m_l1tex2xbar_write_bytes.sum");
NPerf_metric_collect("dram__bytes_read.sum");
NPerf_metric_collect("dram__bytes_write.sum");
NPerf_metric_collect("l1tex__t_requests.sum");
NPerf_metric_collect("l1tex__data_bank_conflicts_pipe_lsu.sum");
}
const size_t max_size_elts = n_elts_max;
const size_t max_size_bytes = max_size_elts * sizeof( app.l_in_d[0] );
const size_t overrun_size_bytes = 1024 * sizeof( app.l_in_d[0] );
struct App_Kernel_Info {
App_Kernel_Info
(int idx, Kernel_Info& k,const char *name_base, const char *name,
int grp_sz, int d_l):
idx(idx),k_ptr(k.func_ptr), name(name), name_base(name_base),
grp_size(grp_sz), d_l(d_l){}
int idx;
GPU_Info_Func k_ptr;
const char *name;
const char *name_base;
int grp_size, d_l;
};
vector<App_Kernel_Info> kernels;
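// The macros below instantiate each kernel template for the combinations of
// compile-time vector length (d_l: 0 meaning "read from c_app", or 32) and
// group size used in the experiments, and record each instantiation in
// kernels together with its base name for lookup via kmap / kmap_gsz.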
#define PROCESS_KERNEL(k) \
{ const int idx = kernels.size(); \
kernels.emplace_back(idx,info.GET_INFO((k)),#k,#k,1,0); }
#define TDH_GRP_KERNEL3(k,kb,grp_sz,d_l) \
{ const int idx = kernels.size(); \
kernels.emplace_back(idx,info.GET_INFO(k),kb,#k,grp_sz,d_l); }
#define TDH_GRP_KERNEL2(k,grp_sz,d_l) \
TDH_GRP_KERNEL3((k<d_l,grp_sz>),#k,grp_sz,d_l);
#define TDH_GRP_KERNEL1(k,grp_sz) \
TDH_GRP_KERNEL2(k,grp_sz,0); TDH_GRP_KERNEL2(k,grp_sz,32);
#define TDH_GRP_KERNEL(k) TDH_GRP_KERNEL1(k,32);
#define TDH_GRPS_KERNEL(k) \
TDH_GRP_KERNEL1(k,32); TDH_GRP_KERNEL1(k,64); \
TDH_GRP_KERNEL1(k,128); TDH_GRP_KERNEL1(k,256); \
TDH_GRP_KERNEL1(k,512); TDH_GRP_KERNEL1(k,1024);
#define TDH_KERNEL3(k,kb,d_l) \
{ const int idx = kernels.size(); \
kernels.emplace_back(idx,info.GET_INFO(k),kb,#k,0,d_l); }
#define TDH_KERNEL2(k,d_l) TDH_KERNEL3((k<d_l>),#k,d_l);
#define TDH_KERNEL(k) \
TDH_KERNEL2(k,0); TDH_KERNEL2(k,32);
TDH_KERNEL(norm_base);
TDH_GRP_KERNEL(norm_group);
TDH_GRPS_KERNEL(norm_hw04);
vector<string> knames = { "norm_base", "norm_group", "norm_hw04" };
map<string, map<int, vector<App_Kernel_Info*> > > kmap;
map<string, map<int, App_Kernel_Info* > > kmap_gsz;
for ( auto& k: kernels ) kmap[k.name_base][k.d_l].push_back(&k);
for ( auto& k: kernels )
if ( !k.d_l ) kmap_gsz[k.name_base][k.grp_size] = &k;
for ( auto& n: knames ) assert( kmap.contains(n) );
for ( auto& n: knames ) assert( kmap_gsz.contains(n) );
for ( auto& [name,k_name]: kmap )
for ( auto& [d_l,kd]: k_name )
ranges::sort( kd, {}, [](auto *a){ return a->grp_size; } );
const double clock_period_us = 1e3 / info.cuda_prop.clockRate;
const int max_wps = num_blocks * 32;
vector<Timing_Item> timing_items(max_wps);
Timing timing;
const size_t timing_items_bytes =
timing_items.size() * sizeof(timing_items[0]);
CE( cudaMalloc( &timing.timing_items, timing_items_bytes ) );
CE( cudaMemcpyToSymbol
( timing_dev, &timing, sizeof(timing), 0, cudaMemcpyHostToDevice) );
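// Allocate the per-warp timing buffer on the device and publish its address
// through the __constant__ Timing structure used by timing_start/timing_end.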
vector<elt_t> l_in(n_elts_max);
vector<elt_t> l_out(n_elts_max);
vector<elt_t> l_out_check(n_elts_max);
CE( cudaMalloc( &app.l_in_d, max_size_bytes + overrun_size_bytes ) );
CE( cudaMalloc( &app.l_out_d, max_size_bytes + overrun_size_bytes ) );
for ( auto& e: l_in ) e = drand48();
// elapsed_time_s tracks the best (minimum) kernel time seen so far; start it
// at a large sentinel value (one day, in seconds).
double elapsed_time_s = 86400;
const int output_width = stdout_width_get();
{
cudaEvent_t gpu_start_ce, gpu_stop_ce;
CE(cudaEventCreate(&gpu_start_ce));
CE(cudaEventCreate(&gpu_stop_ce));
CE( cudaMemcpy
( app.l_in_d, l_in.data(), max_size_bytes, cudaMemcpyHostToDevice ) );
printf("Launching with %d blocks of up to %d threads. \n",
num_blocks, thd_per_block_goal);
for ( int d_l: { 1024, 2048, 4096 } )
{
const int n_l = n_elts_max / d_l;
assert( n_l > 0 );
const size_t in_size_elts = d_l * n_l;
const size_t out_size_elts = in_size_elts;
const size_t in_size_bytes = in_size_elts * sizeof( elt_t );
const size_t out_size_bytes = in_size_bytes;
const int64_t num_ops_fp = 2 * in_size_elts;
const int64_t num_ops_ls = 2 * in_size_elts + out_size_elts;
const int64_t amt_data_bytes = in_size_bytes + out_size_bytes;
const bool l2_contained = amt_data_bytes < l2_size_bytes * 1.1;
app.n_l = n_l;
app.d_l = d_l;
CE( cudaMemcpyToSymbol
( c_app, &app, sizeof(app), 0, cudaMemcpyHostToDevice ) );
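// Compute the reference result for this d_l on the host (OpenMP-parallel)
// so the device output can be checked after each kernel run.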
#pragma omp parallel for
for ( int i=0; i<n_l; i++ )
{
const auto idxs = views::iota( i*d_l, (i+1)*d_l );
elt_t sum = 0;
for ( auto j: idxs ) sum += l_in[j];
const elt_t avg = sum / d_l;
for ( auto j: idxs ) l_out_check[j] = l_in[j] - avg;
}
for ( auto kname: knames )
{
auto& k_grps = kmap_gsz[kname];
vector< pair<App_Kernel_Info*,int> > k_run_stuff;
size_t l1_avail_min_bytes = l1_size_bytes + 1;
int l1_avail_min_gp_size = -1;
for ( auto [grp_size, ki]: k_grps )
{
if ( grp_size > d_l ) continue;
const int kernel = ki->idx;
cudaFuncAttributes& cfa = info.ki[kernel].cfa;
const size_t l1_avail_bytes =
l1_size_bytes - cfa.sharedSizeBytes;
if ( set_min( l1_avail_min_bytes, l1_avail_bytes ) )
l1_avail_min_gp_size = grp_size;
const auto func_ptr = info.ki[kernel].func_ptr;
const int grp_wps = ki->grp_size >> wp_lg;
const int wp_limit = cfa.maxThreadsPerBlock >> 5;
deque<int> n_wps{ 1, 2, 4, 8, 16, 32 };
if ( !vary_warps ) n_wps = { wp_per_block_arg };
while ( n_wps.size() && n_wps.back() > wp_limit )
n_wps.pop_back();
while ( n_wps.size() && n_wps[0] < grp_wps ) n_wps.pop_front();
for ( int n_wp: n_wps ) k_run_stuff.emplace_back( ki, n_wp );
}
ranges::sort( k_run_stuff, [](auto& a, auto& b) {
return a.second == b.second ?
a.first->grp_size < b.first->grp_size : a.second < b.second;} );
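// k_run_stuff now holds (kernel variant, warps per block) pairs, sorted by
// block size and then by group size, so results print in a readable order.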
const int wp_start = k_run_stuff.front().second;
int block_size_last = 0;
int block_size_run = 0;
pTable table(stdout);
for ( auto& [ki, wp_cnt]: k_run_stuff )
{
const int kernel = ki->idx;
cudaFuncAttributes& cfa = info.ki[kernel].cfa;
const auto func_ptr = info.ki[kernel].func_ptr;
const int grp_wps = ki->grp_size >> wp_lg;
const int thd_per_block = wp_cnt << wp_lg;
const int grid_wps = wp_cnt * num_blocks;
const size_t l1_avail_bytes =
l1_size_bytes - cfa.sharedSizeBytes;
const int grp_wps_best =
hw04_group_size_choose( l1_avail_bytes, thd_per_block, d_l )
>> wp_lg;
CE(cudaMemset(app.l_out_d,0,out_size_bytes));
CE( cudaMemset( timing.timing_items,0,timing_items_bytes ));
NPerf_metrics_off();
typedef void (*KPtr)(elt_t*, const elt_t*);
KPtr( info.ki[kernel].func_ptr )
<<< num_blocks, thd_per_block >>>(app.l_out_d,app.l_in_d);
CE( cudaMemcpy
( timing_items.data(), timing.timing_items,
timing_items_bytes, cudaMemcpyDeviceToHost ) );
CE(cudaMemset(app.l_out_d,0,out_size_bytes));
NPerf_metrics_on();
CE(cudaEventRecord(gpu_start_ce,0));
for ( NPerf_data_reset(); NPerf_need_run_get(); )
KPtr( info.ki[kernel].func_ptr )
<<< num_blocks, thd_per_block >>>
(app.l_out_d,app.l_in_d);
CE(cudaEventRecord(gpu_stop_ce,0));
CE(cudaEventSynchronize(gpu_stop_ce));
float cuda_time_ms = -1.1;
CE( cudaEventElapsedTime
( &cuda_time_ms, gpu_start_ce, gpu_stop_ce) );
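// Reduce the per-warp timing records to a per-SM busy interval (earliest
// start to latest end among warps that stayed on one SM). Warps whose start
// and end SM ids differ are counted as migrations and excluded.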
map<int32_t,Timing_Item> sm_start_end;
int n_migs = 0;
for ( auto& ti: views::take(timing_items,grid_wps) )
if ( ti.smid_start != ti.smid_end )
{
n_migs++;
}
else
{
auto& tis = sm_start_end[ti.smid_start];
if ( tis.time_start == tis.time_end ) tis = ti;
set_min( tis.time_start, ti.time_start );
set_max( tis.time_end, ti.time_end );
}
if ( n_migs )
printf("-- Number of migrations: %d\n",n_migs);
assert( sm_start_end.size() == min(num_blocks,num_mp) );
int64_t et_sum = 0;
vector<clock_t> et;
for ( auto& [smid,tis]: sm_start_end )
{
const int64_t elapsed = tis.time_end - tis.time_start;
et_sum += elapsed;
et.push_back( elapsed );
}
ranges::sort( et, ranges::greater() );
const double et_clock_max_us = et[0] * clock_period_us;
const double et_clock_avg_us =
et_sum * clock_period_us / sm_start_end.size();
const double imbalance_measured =
et_clock_max_us ? et_clock_avg_us / et_clock_max_us : 0.0;
const double this_elapsed_time_s =
NPerf_metrics_collection_get()
? NPerf_kernel_et_get() : cuda_time_ms * 0.001;
const double thpt_compute_gflops =
num_ops_fp / this_elapsed_time_s * 1e-9;
const double thpt_data_gbps =
amt_data_bytes / this_elapsed_time_s * 1e-9;
const double fp_bw =
sizeof(elt_t) == 4 ? info.chip_sp_flops :
sizeof(elt_t) == 8 ? info.chip_dp_flops : 1;
const double chip_ls_ops = info.chip_sp_flops / 4;
const double t_bound_fp_s = num_ops_fp / fp_bw;
const double t_bound_ls_s = num_ops_ls / chip_ls_ops;
const double t_bound_insn_s = t_bound_fp_s + t_bound_ls_s;
const double t_bound_mem_s =
amt_data_bytes / info.chip_bw_Bps;
const double t_bound_l2_s =
amt_data_bytes / l2_bandwidth_Bps;
const double frac_fp =
min( 2.0, t_bound_fp_s / this_elapsed_time_s );
const double frac_insn =
min( 2.0, t_bound_insn_s / this_elapsed_time_s );
const double frac_mem =
min( 2.0, t_bound_mem_s / this_elapsed_time_s );
const double frac_l2 =
min( 2.0, t_bound_l2_s / this_elapsed_time_s );
const double comm_frac =
min(2.0,1e9 * thpt_data_gbps / info.chip_bw_Bps);
const double comm_l2_frac =
min(2.0,1e9 * thpt_data_gbps / l2_bandwidth_Bps );
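// The t_bound_* values are simple lower bounds on execution time implied by
// peak FP rate, load/store instruction rate, off-chip bandwidth, and the L2
// bandwidth estimate; the frac_* values compare each bound to the measured
// time (capped at 2) and drive the utilization bar printed below.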
const int num_wps = ( thd_per_block + 31 ) >> 5;
const int max_bl_per_mp =
info.get_max_active_blocks_per_mp(kernel,thd_per_block);
const int bl_per_mp_available =
0.999 + double(num_blocks) / num_mp;
const int bl_per_mp =
min( bl_per_mp_available, max_bl_per_mp );
const int act_wps = num_wps * bl_per_mp;
const int act_thds_gpu =
min( num_mp * act_wps * 32, num_blocks * thd_per_block );
if ( wp_cnt == wp_start )
printf
("\n%s: Uses %d registers. L1 Avail %zu B (gp=%d)\n"
"n_l %d d_l %d L2 Occ %.0f%% + %.0f%%\n",
info.ki[kernel].name,
info.ki[kernel].cfa.numRegs,
l1_avail_min_bytes, l1_avail_min_gp_size,
n_l, d_l,
1e2 * in_size_bytes / l2_size_bytes,
1e2 * out_size_bytes / l2_size_bytes);
if ( block_size_last != thd_per_block )
{
if ( block_size_run > 2 ) printf("\n");
block_size_last = thd_per_block;
block_size_run = 0;
}
block_size_run++;
table.row_start();
table.entry("wp",num_wps);
table.entry("gp",grp_wps);
if ( num_blocks > num_mp )
table.entry("ac",act_wps);
table.entry("Imm", "%4.2f", imbalance_measured);
table.entry("t/µs","%4.0f", this_elapsed_time_s * 1e6);
table.entry
("I/el","%4.1f",
NPerf_metric_value_get("sm__inst_executed.sum") * 32.0
/ in_size_elts );
if ( opt_p )
{
if ( false )
table.entry
("%","%2.0f",
NPerf_metric_value_get
("sm__instruction_throughput"
".avg.pct_of_peak_sustained_elapsed") );
table.entry
("BXW","%4.1f",
NPerf_metric_value_get
("l1tex__data_bank_conflicts_pipe_lsu.sum") /
NPerf_metric_value_get ("l1tex__t_requests.sum"));
table.header_span_start("L2-Cache");
const double data_l2_l1_bytes =
NPerf_metric_value_get
("l1tex__m_xbar2l1tex_read_bytes.sum");
const double data_l1_l2_bytes =
NPerf_metric_value_get
("l1tex__m_l1tex2xbar_write_bytes.sum");
table.entry
("N*R", "%4.1f", data_l2_l1_bytes / in_size_bytes );
if ( false )
table.entry
("N*W", "%4.1f", data_l1_l2_bytes / out_size_bytes );
table.entry
("%pk", "%3.0f",
100.0 * ( data_l1_l2_bytes + data_l2_l1_bytes )
/ ( this_elapsed_time_s * l2_bandwidth_Bps ) );
table.entry
("GB/s", "%4.0f",
1e-9*
( NPerf_metric_value_get
("l1tex__m_xbar2l1tex_read_bytes.sum")
+ NPerf_metric_value_get
("l1tex__m_l1tex2xbar_write_bytes.sum") )
/ this_elapsed_time_s );
table.header_span_end();
if ( !l2_contained )
{
table.header_span_start("DRAM");
if ( false )
table.entry
("N*RW", "%4.1f",
( NPerf_metric_value_get("dram__bytes_read.sum")
+NPerf_metric_value_get("dram__bytes_write.sum"))
/ ( in_size_bytes + out_size_bytes ) );
table.entry
("GB/s","%4.0f",
1e-9 *
( NPerf_metric_value_get("dram__bytes_write.sum")
+ NPerf_metric_value_get("dram__bytes_read.sum") )
/ this_elapsed_time_s );
table.header_span_end();
}
}
table.entry("FP θ","%4.0f", thpt_compute_gflops);
pStringF grp_wps_best_txt("%2d", grp_wps_best);
table.entry
("BG", "%2s",
grp_wps_best == grp_wps ? "==" : grp_wps_best_txt.s);
const size_t max_st_len =
max(5, output_width - 1 - table.row_len_get() );
pStringF fmt("%%-%zds",max_st_len);
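// Build the utilization bar: each segment's character ('+' FP, '-' insn,
// '*' data, L2 or DRAM) extends the bar to a length proportional to its
// fraction of the measured time, truncated to the available column width.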
typedef struct { double f; char c; } Elt;
const double frac_data = l2_contained ? frac_l2 : frac_mem;
vector<Elt> segments =
{ { frac_fp, '+' }, { frac_insn, '-' }, { frac_data, '*' } };
string util_hdr = "=== Util: FP++ Insn-- ";
util_hdr += l2_contained ? "L2** " : "Mem** ";
if ( max_st_len > util_hdr.length() )
util_hdr += string(max_st_len - util_hdr.length(),'=');
ranges::sort( segments, {}, &Elt::f );
string bar;
for ( Elt& e: segments )
if ( size_t p = e.f * max_st_len + 0.5; p > bar.length() )
bar += string( p - bar.length(), e.c );
if ( bar.length() > max_st_len )
{
bar.resize(max_st_len);
bar[max_st_len-1] = '>';
}
table.entry(util_hdr,fmt, bar, pTable::pT_Left);
table.row_end();
elapsed_time_s = min(this_elapsed_time_s,elapsed_time_s);
CE( cudaMemcpy
( l_out.data(), app.l_out_d, out_size_bytes,
cudaMemcpyDeviceToHost ) );
int err_count = 0;
elt_t max_err = 0;
for ( int i=0; i<n_l; i++ )
for ( int j=0; j<d_l; j++ )
{
const int idx = i * d_l + j;
const elt_t err =
fabs( l_out_check[idx] - l_out[idx] );
set_max( max_err, err );
if ( err > 1e-5 )
{
err_count++;
if ( err_count < 5 )
printf
( "Error at vec %d elt %d: "
"%.7f != %.7f (correct)\n",
i, j, l_out[idx], l_out_check[idx] );
}
}
if ( err_count )
printf("Total errors %d, max error %f\n",
err_count, max_err );
}
}
}
}
}