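// Matrix-Vector Product CUDA demo.
//
// Multiply each of app.num_vecs N-element input vectors by an M x N
// matrix, using several kernels that differ in how they buffer
// operands: global memory only, thread-local (register) buffers,
// shared-memory staging, and float4 vector loads.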
#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <ctype.h>
#include <time.h>
#include <new>
#include <cuda_runtime.h>
#include <assert.h>
#include <nperf.h>
#include "util.h"
#include <ptable.h>
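// Matrix dimensions: M rows by N columns, so inputs are N-element
// vectors and outputs are M-element vectors.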
#define N 16
#define M 16
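// Define SMALL when the block-wide shared-memory staging buffer
// needed by mxv_sh_easy (defined near the end of this file) will fit.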
#if ( N + M ) * 1024 * 2 < 32769
#define SMALL
#endif
typedef float Elt_Type;
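// Problem instance: host and device copies of the data. A copy of
// this structure is placed in constant memory as d_app.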
struct App
{
int num_vecs;
Elt_Type matrix[M][N];
Elt_Type *h_in, *h_out, *h_out_check;
Elt_Type *d_in, *d_out;
float4 *d_in_f4, *d_out_f4;
};
App app;
__constant__ App d_app;
typedef void (*KPtr)(Elt_Type *dout, const Elt_Type *din);
extern "C" __global__ void
mxv_g_only(Elt_Type* __restrict__ dout, const Elt_Type* __restrict__ din)
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int num_threads = blockDim.x * gridDim.x;
const int start = tid;
const int stop = d_app.num_vecs;
const int inc = num_threads;
for ( int h=start; h<stop; h += inc )
for ( int r=0; r<M; r++ )
{
Elt_Type elt = 0;
for ( int c=0; c<N; c++ ) elt += d_app.matrix[r][c] * din[ h * N + c ];
dout[ h * M + r ] = elt;
}
}
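// Input-buffered kernel: copy the N-element input vector into the
// thread-local array vin first, so each input element is loaded from
// global memory only once.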
extern "C" __global__ void
mxv_i_lbuf()
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int num_threads = blockDim.x * gridDim.x;
const int start = tid;
const int stop = d_app.num_vecs;
const int inc = num_threads;
for ( int h=start; h<stop; h += inc )
{
Elt_Type vin[N];
for ( int c=0; c<N; c++ ) vin[c] = d_app.d_in[ h * N + c ];
for ( int r=0; r<M; r++ )
{
Elt_Type elt = 0;
for ( int c=0; c<N; c++ ) elt += d_app.matrix[r][c] * vin[c];
d_app.d_out[ h * M + r ] = elt;
}
}
}
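// Output-buffered kernel: accumulate all M output elements in the
// thread-local array vout while consuming one input element at a
// time, then store the completed vector.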
extern "C" __global__ void
mxv_o_lbuf()
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int num_threads = blockDim.x * gridDim.x;
const int start = tid;
const int stop = d_app.num_vecs;
const int inc = num_threads;
for ( int h=start; h<stop; h += inc )
{
Elt_Type vout[M]{};
for ( int c=0; c<N; c++ )
{
const Elt_Type vin = d_app.d_in[ h * N + c ];
for ( int r=0; r<M; r++ ) vout[r] += d_app.matrix[r][c] * vin;
}
for ( int r=0; r<M; r++ ) d_app.d_out[ h * M + r ] = vout[ r ];
}
}
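// One output element per thread: threads work in groups of M, with
// thread r of a group computing only row r of its group's vectors.
// This raises available parallelism for large M, at the cost of each
// input vector being read by M threads.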
extern "C" __global__ void
mxv_o_per_thd()
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int num_threads = blockDim.x * gridDim.x;
const int start = tid / M;
const int r = tid % M;
const int stop = d_app.num_vecs;
const int inc = num_threads / M;
for ( int h=start; h<stop; h += inc )
{
Elt_Type vout = 0;
for ( int c=0; c<N; c++ )
vout += d_app.matrix[r][c] * d_app.d_in[ h * N + c ];
d_app.d_out[ h * M + r ] = vout;
}
}
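// Shared-memory kernel: the block stages CS-element chunks of the
// input and output vectors through the shared array vxfer so that
// global loads and stores are coalesced. CS is the number of elements
// in a 32-byte chunk.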
extern "C" __global__ void
mxv_sh()
{
const int CS = 32 / sizeof(Elt_Type);
const int num_threads = blockDim.x * gridDim.x;
const int bl_start = blockIdx.x * blockDim.x;
const int stop = d_app.num_vecs;
const int inc = num_threads;
const int thd_x_offset = threadIdx.x % CS;
const int thd_x_idx_st = threadIdx.x / CS;
const int64_t BLOCK_SIZE = blockDim.x;
constexpr int64_t MAX_BLOCK_SIZE = 1024;
__shared__ Elt_Type vxfer[MAX_BLOCK_SIZE][CS + 1]; // +1 pad to avoid bank conflicts.
for ( int hb = bl_start; hb<stop; hb += inc )
{
Elt_Type vout[M]{};
for ( int c=0; c<N; c += CS )
{
__syncthreads();
for ( int v = thd_x_idx_st; v < BLOCK_SIZE; v += BLOCK_SIZE/CS )
vxfer[ v ][ thd_x_offset ] =
d_app.d_in[ hb * N + v * N + c + thd_x_offset ];
__syncthreads();
Elt_Type vin[CS];
for ( int cc=0; cc<CS; cc++ ) vin[cc] = vxfer[threadIdx.x][cc];
for ( int r=0; r<M; r++ )
for ( int cc=0; cc<CS; cc++ )
if ( c+cc < N ) vout[r] += d_app.matrix[r][c+cc] * vin[cc];
}
for ( int r=0; r<M; r += CS )
{
__syncthreads();
for ( int rr=0; rr<CS; rr++ ) vxfer[threadIdx.x][rr] = vout[r+rr];
__syncthreads();
for ( int g=0; g<CS; g++ )
{
const int v = g * BLOCK_SIZE / CS + thd_x_idx_st;
if ( thd_x_offset + r < M )
d_app.d_out[ hb * M + v * M + r + thd_x_offset ] =
vxfer[ v ][ thd_x_offset ];
}
}
}
}
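// Shared-memory kernel, chunked outputs: each group of CS consecutive
// threads shares one vector. A thread stages one input element per
// chunk into vxfer and accumulates every CS-th output row. Note that
// the unsynchronized use of vxfer appears to assume that each
// CS-thread group lies within a single warp.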
constexpr int mxv_sh_ochunk_CS = 32 / sizeof(Elt_Type);
extern "C" __global__ void
mxv_sh_ochunk()
{
constexpr int CS = mxv_sh_ochunk_CS;
const int num_threads = blockDim.x * gridDim.x;
const int bl_start = blockIdx.x * blockDim.x / CS;
const int stop = d_app.num_vecs;
const int inc = num_threads / CS;
const int thd_c_offset = threadIdx.x % CS;
const int thd_r_offset = threadIdx.x % CS;
const int thd_v_offset = threadIdx.x / CS;
constexpr int MAX_BLOCK_SIZE = 1024;
__shared__ Elt_Type vxfer[MAX_BLOCK_SIZE];
constexpr int ML = ( M + CS - 1 ) / CS;
for ( int hb = bl_start; hb<stop; hb += inc )
{
Elt_Type vout[ML]{};
#pragma unroll
for ( int c=0; c<N; c += CS )
{
vxfer[threadIdx.x] =
d_app.d_in[ ( hb + thd_v_offset ) * N + c + thd_c_offset ];
Elt_Type vin[CS];
for ( int cc=0; cc<CS; cc++ )
vin[cc] = vxfer[ thd_v_offset * CS + cc ];
for ( int rr=0; rr<ML; rr++ )
{
const int r = rr * CS + thd_r_offset;
for ( int cc=0; cc<CS; cc++ )
if ( c+cc < N )
vout[rr] += d_app.matrix[r][c+cc] * vin[cc];
}
}
#pragma unroll
for ( int rr=0; rr<ML; rr++ )
if ( const int r = rr * CS + thd_r_offset; r < M )
d_app.d_out[ ( hb + thd_v_offset ) * M + r ] = vout[rr];
}
}
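// Vector-load kernel: buffer inputs and outputs locally, as in
// mxv_i_lbuf and mxv_o_lbuf, but move data with 16-byte float4
// accesses. Registered for timing only when N is a multiple of 4
// (see print_gpu_and_kernel_info).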
extern "C" __global__ void
mxv_vec_ld()
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int num_threads = blockDim.x * gridDim.x;
const int start = tid;
const int stop = d_app.num_vecs;
const int inc = num_threads;
for ( int h=start; h<stop; h += inc )
{
Elt_Type vin[N];
float4* const vin4 = (float4*) &vin[0];
for ( int c4=0; c4<N/4; c4++ )
vin4[c4] = d_app.d_in_f4[ h * ( N >> 2 ) + c4 ];
Elt_Type vout[M]{};
float4* const vout4 = (float4*) &vout[0];
for ( int r=0; r<M; r++ )
for ( int c=0; c<N; c++ )
vout[r] += d_app.matrix[r][c] * vin[c];
for ( int r4=0; r4<M/4; r4++ )
d_app.d_out_f4[ h * ( M >> 2 ) + r4 ] = vout4[r4];
}
}
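// Vector load/swap kernels: even/odd thread pairs cooperate on two
// consecutive vectors so that each thread's float4 accesses are
// contiguous in global memory; the halves that land in the wrong
// thread of a pair are exchanged through shared memory
// (use_shared true) or warp shuffles (use_shared false).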
template<bool use_shared = true>
__device__ void
mxv_vls()
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int num_threads = blockDim.x * gridDim.x;
const int offset = threadIdx.x & 1;
const int start = tid;
const int stop = d_app.num_vecs;
const int inc = num_threads;
__shared__ float4 v0[1024];
for ( int h=start; h<stop; h += inc )
{
const int h0 = h - offset;
__syncthreads();
Elt_Type vin[N];
float4* const vin4 = (float4*) &vin[0];
for ( int cc=0; cc<N; cc += 8 )
{
const int c = cc + 4 * offset;
float4 v0_o = d_app.d_in_f4[ ( h0 * N + c ) >> 2 ];
float4 v1_o = d_app.d_in_f4[ ( ( h0 + 1 ) * N + c ) >> 2 ];
float4 fswap = offset ? v0_o : v1_o;
if ( use_shared )
{
v0[threadIdx.x] = fswap;
fswap = v0[threadIdx.x ^ 1];
}
else
{
fswap.x = __shfl_xor_sync(~0,fswap.x,1);
fswap.y = __shfl_xor_sync(~0,fswap.y,1);
fswap.z = __shfl_xor_sync(~0,fswap.z,1);
fswap.w = __shfl_xor_sync(~0,fswap.w,1);
}
vin4[cc/4] = offset ? fswap : v0_o;
if ( cc + 4 < N ) vin4[cc/4+1] = offset ? v1_o : fswap;
}
Elt_Type vbuf[8];
float4* const vbuf4 = (float4*) &vbuf[0];
const int M8 = ( M + 7 ) & ~7; // M rounded up to a multiple of 8.
#pragma unroll
for ( int64_t r=0; r<M8; r++ )
{
const int bpos = r % 8;
Elt_Type elt = 0;
for ( int c=0; c<N; c++ ) elt += d_app.matrix[r][c] * vin[c];
vbuf[bpos] = elt;
if ( bpos == 7 )
{
float4 fswap = offset ? vbuf4[0] : vbuf4[1];
if ( use_shared )
{
v0[threadIdx.x] = fswap;
fswap = v0[threadIdx.x ^ 1];
}
else
{
fswap.x = __shfl_xor_sync(~0,fswap.x,1);
fswap.y = __shfl_xor_sync(~0,fswap.y,1);
fswap.z = __shfl_xor_sync(~0,fswap.z,1);
fswap.w = __shfl_xor_sync(~0,fswap.w,1);
}
float4 v0_o = offset ? fswap : vbuf4[0];
float4 v1_o = offset ? vbuf4[1] : fswap;
if ( const int rr = r - 7 + offset * 4; rr < M )
{
d_app.d_out_f4[ ( h0 * M + rr ) >> 2 ] = v0_o;
d_app.d_out_f4[ ( (h0+1) * M + rr ) >> 2 ] = v1_o;
}
}
}
}
}
extern "C" __global__ void mxv_vls_shared() {mxv_vls<true>();}
extern "C" __global__ void mxv_vls_shuffle() {mxv_vls<false>();}
#ifdef SMALL
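// Simple shared-memory kernel: stage the block's input vectors in
// vins with coalesced loads, compute using per-thread local arrays,
// then stage the outputs back through vins for coalesced stores.
// Compiled only when the buffer fits (SMALL).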
extern "C" __global__ void
mxv_sh_easy()
{
const int num_threads = blockDim.x * gridDim.x;
const int bl_start = blockIdx.x * blockDim.x;
const int stop = d_app.num_vecs;
const int inc = num_threads;
const int64_t BLOCK_SIZE = 1024;
#if N > M
const int sm_stride = N;
#else
const int sm_stride = M;
#endif
__shared__ Elt_Type vins[BLOCK_SIZE][sm_stride];
const int offset = threadIdx.x % N;
const int idx_start = threadIdx.x / N;
for ( int hb = bl_start; hb<stop; hb += inc )
{
for ( int g=0; g<N; g++ )
vins[ idx_start + g * blockDim.x/N ][ offset ] =
d_app.d_in[ hb * N + g * blockDim.x + threadIdx.x ];
__syncthreads();
Elt_Type vin[N];
for ( int c=0; c<N; c++ ) vin[c] = vins[ threadIdx.x ][ c ];
Elt_Type vout[M]{};
for ( int r=0; r<M; r++ )
for ( int c=0; c<N; c++ )
vout[r] += d_app.matrix[r][c] * vin[c];
__syncthreads();
for ( int r=0; r<M; r++ ) vins[ threadIdx.x ][ r ] = vout[ r ];
__syncthreads();
for ( int r=0; r<M; r++ )
d_app.d_out[ hb * M + r * blockDim.x + threadIdx.x ] =
vins[ threadIdx.x / M + r * blockDim.x/M ] [ threadIdx.x % M ];
__syncthreads();
}
}
#endif
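// Choose a GPU, register the kernels with the info structure, and
// print each kernel's resource usage (shared memory, registers, and
// the resulting limit on threads per block).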
GPU_Info
print_gpu_and_kernel_info()
{
GPU_Info info;
gpu_info_print();
int dev = gpu_choose_index();
CE(cudaSetDevice(dev));
printf("Using GPU %d\n",dev);
info.get_gpu_info(dev);
info.GET_INFO(mxv_g_only);
info.GET_INFO(mxv_i_lbuf);
info.GET_INFO(mxv_o_lbuf);
info.GET_INFO(mxv_o_per_thd);
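// Register the float4-based kernels only when the dimensions allow
// whole-vector 16-byte accesses.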
#if N / 4 == (N+3)/4
info.GET_INFO(mxv_vec_ld);
#endif
#if N / 4 == (N+3)/4 && M / 4 == (M+3)/4
info.GET_INFO(mxv_vls_shared);
info.GET_INFO(mxv_vls_shuffle);
#endif
info.GET_INFO(mxv_sh);
info.GET_INFO(mxv_sh_ochunk);
#ifdef SMALL
info.GET_INFO(mxv_sh_easy);
#endif
printf("\nCUDA Kernel Resource Usage:\n");
for ( int i=0; i<info.num_kernels; i++ )
{
printf("For %s:\n", info.ki[i].name);
printf(" %6zd shared, %zd const, %zd loc, %d regs; "
"%d max threads per block.\n",
info.ki[i].cfa.sharedSizeBytes,
info.ki[i].cfa.constSizeBytes,
info.ki[i].cfa.localSizeBytes,
info.ki[i].cfa.numRegs,
info.ki[i].cfa.maxThreadsPerBlock);
}
return info;
}
int
main(int argc, char **argv)
{
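// Command-line arguments, all optional:
// argv[1]: number of blocks; 0 for one per MP, negative for a
// multiple of the MP count.
// argv[2]: threads per block, or "p" to sweep over block sizes and
// collect extra performance metrics.
// argv[3]: input size; negative (or absent) for multiples of L2
// capacity, positive for Mi (2^20) vectors.
// For example (hypothetical binary name), "mxv 8 256 -2" runs 8
// blocks of 256 threads on two L2 caches' worth of vectors.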
const bool debug = false;
NPerf_init();
GPU_Info info = print_gpu_and_kernel_info();
const int num_mp = info.cuda_prop.multiProcessorCount;
const int arg1_int = argc < 2 ? num_mp : atoi(argv[1]);
const int num_blocks =
arg1_int == 0 ? num_mp :
arg1_int < 0 ? -arg1_int * num_mp : arg1_int;
const bool opt_p = argc >= 3 && string(argv[2]) == "p";
const int thd_per_block_arg = argc < 3 ? 1024 : opt_p ? 0 : atoi(argv[2]);
const int thd_per_block_goal =
thd_per_block_arg == 0 ? 1024 : thd_per_block_arg;
const int num_threads = num_blocks * thd_per_block_goal;
const bool vary_warps = thd_per_block_arg == 0;
const int l2_size_bytes = info.cuda_prop.l2CacheSize;
const int l2_size_elts = l2_size_bytes / ( sizeof(Elt_Type) * N );
const float default_num_vecs_l2_units = 2;
const float arg3_val = argc < 4 ? -default_num_vecs_l2_units : atof(argv[3]);
app.num_vecs =
arg3_val == 0 ? default_num_vecs_l2_units * l2_size_elts :
arg3_val < 0 ? -arg3_val * l2_size_elts : int( arg3_val * (1<<20) );
if ( num_threads <= 0 || app.num_vecs <= 0 )
{
printf("Usage: %s [ NUM_CUDA_BLOCKS ] [THD_PER_BLOCK|p] "
"[-DATA_SIZE_L2_UNITS|DATA_SIZE_MiB]\n",
argv[0]);
exit(1);
}
NPerf_metric_collect("sm__inst_executed.sum");
NPerf_metric_collect("gld_efficiency");
if ( opt_p )
{
NPerf_metric_collect
("sm__instruction_throughput.avg.pct_of_peak_sustained_elapsed");
NPerf_metric_collect("l1tex__m_xbar2l1tex_read_bytes.sum");
NPerf_metric_collect("l1tex__m_l1tex2xbar_write_bytes.sum");
NPerf_metric_collect("dram__bytes_read.sum");
NPerf_metric_collect("dram__bytes_write.sum");
NPerf_metric_collect("l1tex__t_requests.sum");
NPerf_metric_collect("l1tex__data_bank_conflicts_pipe_lsu.sum");
}
if ( false )
NPerf_metrics_off();
const size_t in_size_elts = size_t(app.num_vecs) * N;
const size_t in_size_bytes = in_size_elts * sizeof( app.h_in[0] );
const size_t out_size_elts = size_t(app.num_vecs) * M;
const size_t out_size_bytes = out_size_elts * sizeof( app.h_out[0] );
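// Extra space so that kernels which process a few vectors past
// num_vecs (to keep full blocks busy) do not overrun the arrays.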
const int overrun_size_elts = thd_per_block_goal * max(N,M);
const int overrun_size_bytes = overrun_size_elts * sizeof( app.h_out[0] );
app.h_in = new Elt_Type[ in_size_elts ];
app.h_out = new Elt_Type[ out_size_elts ];
app.h_out_check = new Elt_Type[ out_size_elts ];
CE( cudaMalloc( &app.d_in, in_size_bytes + overrun_size_bytes ) );
app.d_in_f4 = (float4*) app.d_in;
CE( cudaMalloc( &app.d_out, out_size_bytes + overrun_size_bytes ) );
app.d_out_f4 = (float4*) app.d_out;
printf("Matrix size: %d x %d. Vectors: %d, %.1f%% of L2 Cache.\n",
N, M, app.num_vecs,
100.0 * in_size_bytes / info.cuda_prop.l2CacheSize );
for ( int i=0; i<app.num_vecs; i++ )
for ( int c=0; c<N; c++ )
app.h_in[ i * N + c ] = debug ? Elt_Type(c) : drand48();
for ( int r=0; r<M; r++ )
for ( int c=0; c<N; c++ )
app.matrix[r][c] = debug ? r == c : drand48();
for ( int i=0; i<app.num_vecs; i++ )
for ( int r=0; r<M; r++ )
{
app.h_out_check[ i * M + r ] = 0;
for ( int c=0; c<N; c++ )
app.h_out_check[ i * M + r ] +=
app.h_in[ i * N + c ] * app.matrix[r][c];
}
const int64_t num_ops_fp = int64_t(M) * N * app.num_vecs;
const int64_t insns_addr = 4, insns_loop = 3;
const int64_t insns_ld_st = N + M;
const int64_t num_ops = num_ops_fp + insns_ld_st + insns_addr + insns_loop;
const int64_t amt_data_bytes = in_size_bytes + out_size_bytes;
double elapsed_time_s = 86400; // Best time so far; start absurdly high.
const int output_width = stdout_width_get();
#if 0
const double lat_mem_cyc = 345;
const double lat_iter_cyc = lat_mem_cyc + M * N + M;
const double lat_iter_s = lat_iter_cyc / ( info.cuda_prop.clockRate * 1e3 );
const double data_iter_B = sizeof(Elt_Type) * ( M + N );
const double p = info.chip_bw_Bps * lat_iter_s / data_iter_B;
const int sm_thpt_ls = 64;
const double ni_fp = N * M;
const double ni_mem = N + M;
const double t_issue_1 = ni_fp / (num_mp*info.cc_per_mp)
+ ni_mem / ( num_mp * sm_thpt_ls );
const double q = lat_iter_cyc / t_issue_1;
printf("Analysis for mxv_o_lbuf: L = %.1f ns, "
"p = %.1f wp/sm, q= %.1f wp/sm\n",
lat_iter_s * 1e9, p/32/num_mp, q/32/num_mp );
#endif
{
cudaEvent_t gpu_start_ce, gpu_stop_ce;
CE(cudaEventCreate(&gpu_start_ce));
CE(cudaEventCreate(&gpu_stop_ce));
CE( cudaMemcpy
( app.d_in, app.h_in, in_size_bytes, cudaMemcpyHostToDevice ) );
CE( cudaMemcpyToSymbol
( d_app, &app, sizeof(app), 0, cudaMemcpyHostToDevice ) );
printf("Launching with %d blocks of up to %d threads. \n",
num_blocks, thd_per_block_goal);
for ( int kernel = 0; kernel < info.num_kernels; kernel++ )
{
cudaFuncAttributes& cfa = info.ki[kernel].cfa;
const auto func_ptr = info.ki[kernel].func_ptr;
const int wp_limit = cfa.maxThreadsPerBlock >> 5;
const int thd_limit = wp_limit << 5;
const int thd_per_block_no_vary = min(thd_per_block_goal,thd_limit);
const int wp_start = 1;
const int wp_stop = vary_warps ? wp_limit : wp_start;
const int wp_inc = 4;
const int thd_per_vec =
func_ptr == mxv_o_per_thd ? M :
func_ptr == mxv_sh_ochunk ? mxv_sh_ochunk_CS : 1;
pTable table;
for ( int wp_cnt = wp_start; wp_cnt <= wp_stop;
wp_cnt += ( wp_cnt < 4 ? 1 : wp_inc ) )
{
const int thd_per_block =
vary_warps ? wp_cnt << 5 : thd_per_block_no_vary;
CE(cudaMemset(app.d_out,0,out_size_bytes));
CE(cudaEventRecord(gpu_start_ce,0));
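// Launch the kernel repeatedly until NPerf has collected all
// requested metrics (just once if metric collection is off).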
for ( NPerf_data_reset(); NPerf_need_run_get(); )
KPtr(info.ki[kernel].func_ptr)<<<num_blocks,thd_per_block>>>
(app.d_out,app.d_in);
CE(cudaEventRecord(gpu_stop_ce,0));
CE(cudaEventSynchronize(gpu_stop_ce));
float cuda_time_ms = -1.1;
CE(cudaEventElapsedTime(&cuda_time_ms,gpu_start_ce,gpu_stop_ce));
const double this_elapsed_time_s =
NPerf_metrics_collection_get()
? NPerf_kernel_et_get() : cuda_time_ms * 0.001;
const double thpt_compute_gflops =
num_ops / this_elapsed_time_s * 1e-9;
const double thpt_data_gbps =
amt_data_bytes / this_elapsed_time_s * 1e-9;
if ( vary_warps )
{
const double comp_frac =
1e9 * thpt_compute_gflops
/ ( sizeof(Elt_Type) == 4 ? info.chip_sp_flops :
sizeof(Elt_Type) == 8 ? info.chip_dp_flops : 1 );
const double comm_frac =
min(2.0,1e9 * thpt_data_gbps / info.chip_bw_Bps);
const int num_wps = ( thd_per_block + 31 ) >> 5;
const int max_bl_per_mp =
info.get_max_active_blocks_per_mp(kernel,thd_per_block);
const int bl_per_mp_available =
0.999 + double(num_blocks) / num_mp;
const int bl_per_mp =
min( bl_per_mp_available, max_bl_per_mp );
const int act_wps = num_wps * bl_per_mp;
const int act_thds_gpu =
min( num_mp * act_wps * 32, num_blocks * thd_per_block );
const double iter_per_thd =
double(thd_per_vec) * app.num_vecs / act_thds_gpu;
if ( wp_cnt == wp_start )
printf("\nKernel %s. Uses %d registers.\n",
info.ki[kernel].name, info.ki[kernel].cfa.numRegs );
table.row_start();
table.entry("wp",num_wps);
if ( num_blocks > num_mp )
table.entry("ac",act_wps);
table.entry("t/µs","%4.0f", this_elapsed_time_s * 1e6);
table.entry("Lw/µs","%5.1f",
this_elapsed_time_s*1e6 / iter_per_thd );
table.entry
("I/fp","%4.1f",
NPerf_metric_value_get("sm__inst_executed.sum")*32.0 / num_ops_fp );
if ( opt_p )
{
table.entry
("%","%2.0f",
NPerf_metric_value_get
("sm__instruction_throughput.avg.pct_of_peak_sustained_elapsed") );
table.entry
("BXW","%4.1f",
NPerf_metric_value_get
("l1tex__data_bank_conflicts_pipe_lsu.sum") /
NPerf_metric_value_get ("l1tex__t_requests.sum"));
table.header_span_start("L2-Cache");
table.entry
("N*R", "%4.1f",
NPerf_metric_value_get
("l1tex__m_xbar2l1tex_read_bytes.sum")
/ in_size_bytes );
table.entry
("N*W", "%4.1f",
NPerf_metric_value_get
("l1tex__m_l1tex2xbar_write_bytes.sum")
/ out_size_bytes );
table.entry
("GB/s", "%4.0f",
1e-9*
( NPerf_metric_value_get
("l1tex__m_xbar2l1tex_read_bytes.sum")
+ NPerf_metric_value_get
("l1tex__m_l1tex2xbar_write_bytes.sum") )
/ this_elapsed_time_s );
table.header_span_end();
table.header_span_start("DRAM");
table.entry
("N*RW", "%4.1f",
( NPerf_metric_value_get("dram__bytes_read.sum")
+ NPerf_metric_value_get("dram__bytes_write.sum") )
/ ( in_size_bytes + out_size_bytes ) );
table.entry
("GB/s","%4.0f",
1e-9 *
( NPerf_metric_value_get("dram__bytes_write.sum")
+ NPerf_metric_value_get("dram__bytes_read.sum") )
/ this_elapsed_time_s );
table.header_span_end();
}
const bool plot_bandwidth = true;
table.entry("FP θ","%4.0f", thpt_compute_gflops);
const int max_st_len =
max(5, output_width - 1 - table.row_len_get() );
pStringF fmt("%%-%ds",max_st_len);
string util_hdr =
plot_bandwidth ? "Data BW Util" : "FP Utilization";
const double frac = plot_bandwidth ? comm_frac : comp_frac;
util_hdr += string(max_st_len - util_hdr.length(),'-');
table.entry
(util_hdr,fmt,
string( size_t(max(0.0,frac*max_st_len)), '*' ),
pTable::pT_Left);
} else {
printf
("%-15s %2d wp %7.0f µs %8.3f GF %8.3f GB/s "
"%5.2f I/fp %5.1f%%\n",
info.ki[kernel].name,
(thd_per_block + 31 ) >> 5,
this_elapsed_time_s * 1e6,
thpt_compute_gflops, thpt_data_gbps,
NPerf_metric_value_get("sm__inst_executed.sum")
* 32 / num_ops_fp,
NPerf_metric_value_get("gld_efficiency")
);
}
elapsed_time_s = min(this_elapsed_time_s,elapsed_time_s);
CE( cudaMemcpy
( app.h_out, app.d_out, out_size_bytes,
cudaMemcpyDeviceToHost ) );
int err_count = 0;
for ( int i=0; i<app.num_vecs; i++ )
for ( int r=0; r<M; r++ )
{
const int idx = i * M + r;
if ( fabs( app.h_out_check[idx] - app.h_out[idx] ) > 1e-5 )
{
err_count++;
if ( err_count < 5 )
printf
( "Error at vec %d elt %d: %.7f != %.7f (correct)\n",
i, r, app.h_out[idx], app.h_out_check[idx] );
}
}
if ( err_count )
printf("Total errors %d\n", err_count);
}
printf("%s",table.body_get());
}
}
}