#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <ctype.h>
#include <time.h>
#include <new>
#include <cuda_runtime.h>
#include <assert.h>
#include <nperf.h>
#include "util.h"
#include <ptable.h>
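// Each kernel computes dout = matrix * din for num_vecs input vectors,
// where the matrix has M rows and N columns: din holds N elements per
// vector, dout holds M.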
#define N 8
#define M 8
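// Define SMALL when a 1024-thread block buffering N floats per thread
// fits in a 32 kiB shared-memory allocation, as mxv_sh_easy requires.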
#if N * 1024 * 4 <= 32768
#define SMALL
#endif
typedef float Elt_Type;
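// All problem data in one structure: the matrix plus host (h_) and
// device (d_) input/output arrays. The _f4 members alias d_in and
// d_out as float4 for the vector load/store kernels. Vectors are
// stored end to end, element c of vector h at index h * N + c.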
struct App
{
int num_vecs;
Elt_Type matrix[M][N];
Elt_Type *h_in, *h_out, *h_out_check;
Elt_Type *d_in, *d_out;
float4 *d_in_f4, *d_out_f4;
};
App app;
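// Copy of app in device constant memory, written with
// cudaMemcpyToSymbol before the kernels are launched.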
__constant__ App d_app;
// Generic kernel-pointer type used for the launches in main.
typedef void (*KPtr)(Elt_Type *dout, const Elt_Type *din);
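// Baseline kernel: each thread computes entire output vectors, reading
// input elements directly from global memory. Each input element is
// read M times, and consecutive threads access addresses N elements
// apart, so loads coalesce poorly.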
extern "C" __global__ void
mxv_g_only(Elt_Type* __restrict__ dout, const Elt_Type* __restrict__ din)
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int num_threads = blockDim.x * gridDim.x;
const int start = tid; const int stop = d_app.num_vecs;
const int inc = num_threads;
for ( int h=start; h<stop; h += inc )
for ( int r=0; r<M; r++ )
{
Elt_Type elt = 0;
for ( int c=0; c<N; c++ ) elt += d_app.matrix[r][c] * din[ h * N + c ];
dout[ h * M + r ] = elt;
}
}
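// Input local buffer: each thread first copies its input vector into
// registers (vin), so each input element is read from global memory
// once rather than once per output row.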
extern "C" __global__ void
mxv_i_lbuf()
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int num_threads = blockDim.x * gridDim.x;
const int start = tid; const int stop = d_app.num_vecs;
const int inc = num_threads;
for ( int h=start; h<stop; h += inc )
{
Elt_Type vin[N];
for ( int c=0; c<N; c++ ) vin[c] = d_app.d_in[ h * N + c ];
for ( int r=0; r<M; r++ )
{
Elt_Type elt = 0;
for ( int c=0; c<N; c++ ) elt += d_app.matrix[r][c] * vin[c];
d_app.d_out[ h * M + r ] = elt;
}
}
}
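// Output local buffer: accumulate all M output elements in registers
// while streaming the input vector one element at a time; each input
// element is read once and each output element is written once.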
extern "C" __global__ void
mxv_o_lbuf()
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int num_threads = blockDim.x * gridDim.x;
const int start = tid; const int stop = d_app.num_vecs;
const int inc = num_threads;
for ( int h=start; h<stop; h += inc )
{
Elt_Type vout[M];
for ( int c=0; c<N; c++ )
{
Elt_Type vin_c = d_app.d_in[ h * N + c ];
for ( int r=0; r<M; r++ )
{
Elt_Type elt = d_app.matrix[r][c] * vin_c;
if ( c == 0 ) vout[r] = elt;
else vout[ r ] += elt;
}
}
for ( int r=0; r<M; r++ ) d_app.d_out[h*M+r] = vout[r];
}
}
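// One output row per thread: each group of M consecutive threads
// shares one input vector, with thread tid % M computing row tid % M.
// This exposes more parallelism per vector, at the cost of every group
// member re-reading the full input vector.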
extern "C" __global__ void
mxv_o_per_thd()
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int num_threads = blockDim.x * gridDim.x;
const int start = tid / M;
const int stop = d_app.num_vecs;
const int inc = num_threads / M;
const int r = tid % M;
for ( int h=start; h<stop; h += inc )
{
Elt_Type vout = 0;
for ( int c=0; c<N; c++ )
{
Elt_Type vin_c = d_app.d_in[ h * N + c ];
Elt_Type elt = d_app.matrix[r][c] * vin_c;
vout += elt;
}
d_app.d_out[h*M+r] = vout;
}
}
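// Shared-memory staging: the block cooperatively loads the input
// vectors of all its threads into shared memory with coalesced reads,
// then each thread gathers its own vector. Assumes blockDim.x <=
// BLOCK_SIZE and, because __syncthreads appears inside the h loop,
// that all threads of a block execute the same number of iterations
// (num_vecs a multiple of the total number of threads).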
extern "C" __global__ void
mxv_sh()
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int num_threads = blockDim.x * gridDim.x;
const int start = tid; const int stop = d_app.num_vecs;
const int inc = num_threads;
const int BLOCK_SIZE = 1024;
for ( int h=start; h<stop; h += inc )
{
Elt_Type vin[N];
__shared__ Elt_Type vins[BLOCK_SIZE][N];
__syncthreads();
// Cooperatively load the input vectors of all blockDim.x threads
// using fully coalesced reads: consecutive threads read consecutive
// elements. Reads past num_vecs stay within the overrun area
// allocated by the host code.
const int h0 = h - threadIdx.x;  // First vector handled by this block.
for ( int i = threadIdx.x; i < blockDim.x * N; i += blockDim.x )
vins[ i / N ][ i % N ] = d_app.d_in[ h0 * N + i ];
__syncthreads();
for ( int c=0; c<N; c++ ) vin[c] = vins[threadIdx.x][c];
for ( int r=0; r<M; r++ )
{
Elt_Type elt = 0;
for ( int c=0; c<N; c++ ) elt += d_app.matrix[r][c] * vin[c];
d_app.d_out[ h * M + r ] = elt;
}
}
}
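// Chunked cooperation: groups of CS threads share one vector per
// iteration. Each group member loads one of CS consecutive input
// elements (coalesced), the group exchanges them through shared memory
// (vxfer), and each member accumulates every CS'th output row. The
// matrix reads assume M is a multiple of CS; only the output stores
// are guarded.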
const int mxv_sh_ochunk_CS = 8;
extern "C" __global__ void
mxv_sh_ochunk()
{
const int CS = mxv_sh_ochunk_CS;
const int num_threads = blockDim.x * gridDim.x;
const int bl_start = blockIdx.x * blockDim.x / CS;
const int stop = d_app.num_vecs;
const int inc = num_threads / CS;
const int thd_c_offset = threadIdx.x % CS;
const int thd_r_offset = threadIdx.x % CS;
const int thd_v_offset = threadIdx.x / CS;
const int MAX_BLOCK_SIZE = 1024;
__shared__ Elt_Type vxfer[MAX_BLOCK_SIZE];
const int ML = ( M + CS - 1 ) / CS;
for ( int hb = bl_start; hb<stop; hb += inc )
{
Elt_Type vout[ML];
for ( int rl=0; rl<ML; rl++ ) vout[rl] = 0;
#pragma unroll
for ( int c=0; c<N; c += CS )
{
vxfer[threadIdx.x] =
d_app.d_in[ ( hb + thd_v_offset ) * N + c + thd_c_offset ];
__syncwarp();  // Make the group's writes visible to its lanes.
Elt_Type vin[CS];
for ( int cc=0; cc<CS; cc++ )
vin[cc] = vxfer[ thd_v_offset * CS + cc ];
__syncwarp();  // Finish reads before the next iteration's writes.
for ( int rr=0; rr<ML; rr++ )
{
const int r = rr * CS + thd_r_offset;
for ( int cc=0; cc<CS; cc++ )
if ( c+cc < N )
vout[rr] += d_app.matrix[r][c+cc] * vin[cc];
}
}
#pragma unroll
for ( int rr=0; rr<ML; rr++ )
{
const int r = rr * CS + thd_r_offset;
if ( r < M )
d_app.d_out[ ( hb + thd_v_offset ) * M + r ] = vout[rr];
}
}
}
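// Vector (128-bit) loads and stores: the input and output arrays are
// accessed through float4 pointers, moving four elements per
// instruction; e.g., d_in_f4[ h * (N>>2) ] fetches elements 0..3 of
// vector h. Registered only when N and M are multiples of 4.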
extern "C" __global__ void
mxv_vec_ld()
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int num_threads = blockDim.x * gridDim.x;
const int start = tid;
const int stop = d_app.num_vecs;
const int inc = num_threads;
for ( int h=start; h<stop; h += inc )
{
Elt_Type vin[N];
for ( int c=0; c<N; c += 4 )
{
float4 f4 = d_app.d_in_f4[ h * ( N >> 2 ) + ( c >> 2 )];
vin[c] = f4.x;
vin[c+1] = f4.y;
vin[c+2] = f4.z;
vin[c+3] = f4.w;
}
Elt_Type vout[M];
for ( int r=0; r<M; r++ )
{
vout[r] = 0;
for ( int c=0; c<N; c++ )
vout[r] += d_app.matrix[r][c] * vin[c];
}
for ( int r=0; r<M; r+=4 )
{
float4 f4 = { vout[r], vout[r+1], vout[r+2], vout[r+3] };
d_app.d_out_f4[ ( h * M + r ) >> 2 ] = f4;
}
}
}
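// Helper types for moving float4 data through __shfl_xor_sync, which
// shuffles 32-bit integer lanes.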
typedef union { float f; int i; } float_int;
struct fi4 {
union { float x; int i; };
union { float y; int j; };
union { float z; int k; };
union { float w; int l; };
};
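// Paired-thread vector load/store: an even/odd thread pair handles two
// consecutive input vectors. Each thread issues two 128-bit (float4)
// loads at consecutive addresses, then the pair exchanges halves so
// that the thread of parity p ends up holding vector h0 + p. Outputs
// are exchanged and written the same way. Registered only when N and M
// are multiples of 4; the matrix reads further assume M is a multiple
// of 8, since only the stores are guarded.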
extern "C" __global__ void
mxv_vls()
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int num_threads = blockDim.x * gridDim.x;
const int offset = threadIdx.x & 1;
const int start = tid;
const int stop = d_app.num_vecs;
const int inc = num_threads;
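// Exchange data between paired threads using warp shuffle on CC >= 3.0
// devices; fall back to a shared-memory bounce on older ones.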
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
#undef USE_SHARED
#else
#define USE_SHARED
#endif
__shared__ float4 v0[1024];
for ( int h=start; h<stop; h += inc )
{
const int h0 = h - offset;
__syncthreads();
Elt_Type vin[N];
for ( int cc=0; cc<N; cc += 8 )
{
const int c = cc + 4 * offset;
#if 1  // Use inline PTX for the loads; set to 0 for plain float4 reads.
const int64_t addr =
(int64_t) &d_app.d_in_f4[ ( h0 * N + c ) >> 2 ];
float4 v0_o, v1_o;
asm( "ld.global.v4.f32 {%0,%1,%2,%3}, [%4]; // Hello, world!"
: "=f"(v0_o.x), "=f"(v0_o.y), "=f"(v0_o.z),"=f"(v0_o.w)
: "l" ( addr ) );
asm( "ld.global.v4.f32 {%0,%1,%2,%3}, [%4];"
: "=f"(v1_o.x), "=f"(v1_o.y), "=f"(v1_o.z),"=f"(v1_o.w)
: "l" ( addr + N * sizeof(v1_o.x) ) );
#else
float4 v0_o = d_app.d_in_f4[ ( h0 * N + c ) >> 2 ];
float4 v1_o = d_app.d_in_f4[ ( ( h0 + 1 ) * N + c ) >> 2 ];
#endif
#ifdef USE_SHARED
v0[threadIdx.x] = offset ? v0_o : v1_o;
float4 v_xfer = v0[threadIdx.x ^ 1];
float4 v_03 = offset ? v_xfer : v0_o;
float4 v_47 = offset ? v1_o : v_xfer;
#else
union { float4 fl4; fi4 fi4; } f40x, f41x;
f40x.fl4 = v0_o;
f41x.fl4 = v1_o;
fi4 f40 = f40x.fi4;
fi4 f41 = f41x.fi4;
fi4 fswap = offset ? f40 : f41;
const unsigned mask = ~0;
fswap.i = __shfl_xor_sync(mask,fswap.i,1);
fswap.j = __shfl_xor_sync(mask,fswap.j,1);
fswap.k = __shfl_xor_sync(mask,fswap.k,1);
fswap.l = __shfl_xor_sync(mask,fswap.l,1);
fi4 v_03 = offset ? fswap : f40;
fi4 v_47 = offset ? f41 : fswap;
#endif
vin[cc] = v_03.x;
vin[cc+1] = v_03.y;
vin[cc+2] = v_03.z;
vin[cc+3] = v_03.w;
if ( cc + 4 < N )
{
vin[cc+4] = v_47.x;
vin[cc+5] = v_47.y;
vin[cc+6] = v_47.z;
vin[cc+7] = v_47.w;
}
}
Elt_Type vbuf[8];
const int M8 = ( M + 7 ) & ~7;
#pragma unroll
for ( int64_t r=0; r<M8; r++ )
{
const int bpos = r % 8;
Elt_Type elt = 0;
for ( int c=0; c<N; c++ )
elt += d_app.matrix[r][c] * vin[c];
vbuf[bpos] = elt;
if ( bpos == 7 )
{
float4 vo_0 = { vbuf[0], vbuf[1], vbuf[2], vbuf[3] };
float4 vo_1 = { vbuf[4], vbuf[5], vbuf[6], vbuf[7] };
v0[threadIdx.x] = offset ? vo_0 : vo_1;
__syncwarp();  // Partner's write must be visible before the read.
float4 v_xfer = v0[threadIdx.x^1];
__syncwarp();  // And the read must finish before the next overwrite.
float4 v0_o = offset ? v_xfer : vo_0;
float4 v1_o = offset ? vo_1 : v_xfer;
const int rr = r - 7 + offset * 4;
if ( rr < M )
{
d_app.d_out_f4[ ( h0 * M + rr ) >> 2 ] = v0_o;
d_app.d_out_f4[ ( (h0+1) * M + rr ) >> 2 ] = v1_o;
}
}
}
}
}
#ifdef SMALL
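// Simpler shared-memory kernel: stage both the input and the output
// vectors in shared memory so that every global access is coalesced.
// Assumes blockDim.x is exactly BLOCK_SIZE (1024) and, due to the
// in-loop __syncthreads, that all threads execute the same number of
// iterations. Compiled only when the 32 kiB buffer fits (SMALL).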
extern "C" __global__ void
mxv_sh_easy()
{
const int num_threads = blockDim.x * gridDim.x;
const int bl_start = blockIdx.x * blockDim.x;
const int stop = d_app.num_vecs;
const int inc = num_threads;
const int64_t BLOCK_SIZE = 1024;
#if N > M
const int sm_stride = N;
#else
const int sm_stride = M;
#endif
__shared__ Elt_Type vins[BLOCK_SIZE][sm_stride];
const int offset = threadIdx.x % N;
const int idx_start = threadIdx.x / N;
for ( int hb = bl_start; hb<stop; hb += inc )
{
for ( int g=0; g<N; g++ )
vins[ idx_start + g * BLOCK_SIZE/N ][ offset ] =
d_app.d_in[ g * BLOCK_SIZE + hb * N + threadIdx.x ];
__syncthreads();
Elt_Type vin[N];
for ( int c=0; c<N; c++ ) vin[c] = vins[ threadIdx.x ][ c ];
Elt_Type vout[M];
for ( int r=0; r<M; r++ )
{
vout[r] = 0;
for ( int c=0; c<N; c++ )
vout[r] += d_app.matrix[r][c] * vin[c];
}
__syncthreads();
for ( int r=0; r<M; r++ ) vins[ threadIdx.x ][ r ] = vout[ r ];
__syncthreads();
for ( int r=0; r<M; r++ )
d_app.d_out[ hb * M + r * BLOCK_SIZE + threadIdx.x ] =
vins[ threadIdx.x / M + r * BLOCK_SIZE/M ] [ threadIdx.x % M ];
__syncthreads();  // Keep vins intact until all threads have read it.
}
}
#endif
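// Select a GPU, register the kernels, and print each kernel's resource
// usage (shared memory, registers, spill space, thread limit).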
GPU_Info
print_gpu_and_kernel_info()
{
GPU_Info info;
gpu_info_print();
int dev = gpu_choose_index();
CE(cudaSetDevice(dev));
printf("Using GPU %d\n",dev);
info.get_gpu_info(dev);
info.GET_INFO(mxv_g_only);
info.GET_INFO(mxv_i_lbuf);
info.GET_INFO(mxv_o_lbuf);
info.GET_INFO(mxv_o_per_thd);
info.GET_INFO(mxv_sh);
#if 0  // Set to 1 to also register the vectorized and shared-memory kernels.
#if N / 4 == (N+3)/4
info.GET_INFO(mxv_vec_ld);
#endif
#if N / 4 == (N+3)/4 && M / 4 == (M+3)/4
info.GET_INFO(mxv_vls);
#endif
info.GET_INFO(mxv_sh_ochunk);
#ifdef SMALL
info.GET_INFO(mxv_sh_easy);
#endif
#endif
printf("\nCUDA Kernel Resource Usage:\n");
for ( int i=0; i<info.num_kernels; i++ )
{
printf("For %s:\n", info.ki[i].name);
printf(" %6zd shared, %zd const, %zd loc, %d regs; "
"%d max threads per block.\n",
info.ki[i].cfa.sharedSizeBytes,
info.ki[i].cfa.constSizeBytes,
info.ki[i].cfa.localSizeBytes,
info.ki[i].cfa.numRegs,
info.ki[i].cfa.maxThreadsPerBlock);
}
return info;
}
int
main(int argc, char **argv)
{
const bool debug = false;
NPerf_init();
GPU_Info info = print_gpu_and_kernel_info();
const int num_mp = info.cuda_prop.multiProcessorCount;
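// Parse command-line arguments, all optional:
//   NUM_CUDA_BLOCKS: 0 for one block per MP, negative for that many
//     blocks per MP, positive for an absolute block count.
//   THD_PER_BLOCK: block size; 0 or "p" sweeps over block sizes, and
//     "p" also collects extra performance-counter metrics.
//   DATA_SIZE_MiB: number of input vectors in units of 2**20.
// Example invocation (binary name illustrative): mxv -2 p 16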
const int arg1_int = argc < 2 ? num_mp : atoi(argv[1]);
const int num_blocks =
arg1_int == 0 ? num_mp :
arg1_int < 0 ? -arg1_int * num_mp : arg1_int;
const bool opt_p = argc >= 3 && string(argv[2]) == "p";
const int thd_per_block_arg = argc < 3 ? 1024 : opt_p ? 0 : atoi(argv[2]);
const int thd_per_block_goal =
thd_per_block_arg == 0 ? 1024 : thd_per_block_arg;
const int num_threads = num_blocks * thd_per_block_goal;
const bool vary_warps = thd_per_block_arg == 0;
app.num_vecs = argc < 4 ? 1 << 20 : int( atof(argv[3]) * (1<<20) );
if ( num_threads <= 0 || app.num_vecs <= 0 )
{
printf("Usage: %s [ NUM_CUDA_BLOCKS ] [THD_PER_BLOCK|p] "
"[DATA_SIZE_MiB]\n",
argv[0]);
exit(1);
}
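// Performance counters to collect; gather the expensive throughput
// metrics only when option p is given.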
NPerf_metric_collect("inst_executed");
NPerf_metric_collect("gld_efficiency");
if ( opt_p )
{
NPerf_metric_collect("gst_efficiency");
NPerf_metric_collect("l2_read_throughput");
NPerf_metric_collect("l2_write_throughput");
NPerf_metric_collect("dram_read_throughput");
NPerf_metric_collect("dram_write_throughput");
}
if ( false )  // Change to true to run without metric collection.
NPerf_metrics_off();
const size_t in_size_elts = size_t(app.num_vecs) * N;
const size_t in_size_bytes = in_size_elts * sizeof( app.h_in[0] );
const size_t out_size_elts = size_t(app.num_vecs) * M;
const size_t out_size_bytes = out_size_elts * sizeof( app.h_out[0] );
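// Extra space so kernels that process vectors in block-sized batches
// can harmlessly read or write slightly past the last vector.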
const int overrun_size_elts = thd_per_block_goal * max(N,M);
const int overrun_size_bytes = overrun_size_elts * sizeof( app.h_out[0] );
app.h_in = new Elt_Type[ in_size_elts ];
app.h_out = new Elt_Type[ out_size_elts ];
app.h_out_check = new Elt_Type[ out_size_elts ];
CE( cudaMalloc( &app.d_in, in_size_bytes + overrun_size_bytes ) );
app.d_in_f4 = (float4*) app.d_in;
CE( cudaMalloc( &app.d_out, out_size_bytes + overrun_size_bytes ) );
app.d_out_f4 = (float4*) app.d_out;
printf("Matrix size: %d x %d. Vectors: %d. %d blocks of %d thds.\n",
N, M, app.num_vecs, num_blocks, thd_per_block_goal);
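// Initialize the input vectors and the matrix. In debug mode use an
// identity matrix and vin[c] = c so that errors are easy to spot.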
for ( int i=0; i<app.num_vecs; i++ )
for ( int c=0; c<N; c++ )
app.h_in[ i * N + c ] = debug ? Elt_Type(c) : drand48();
for ( int r=0; r<M; r++ )
for ( int c=0; c<N; c++ )
app.matrix[r][c] = debug ? r == c : drand48();
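// Compute the expected results on the CPU for verifying kernel output.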
for ( int i=0; i<app.num_vecs; i++ )
for ( int r=0; r<M; r++ )
{
app.h_out_check[ i * M + r ] = 0;
for ( int c=0; c<N; c++ )
app.h_out_check[ i * M + r ] +=
app.h_in[ i * N + c ] * app.matrix[r][c];
}
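// Operation count used for the throughput columns: M*N multiply-adds
// per vector, plus a small allowance for load/store, addressing, and
// loop instructions.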
const int64_t num_ops_fp = int64_t(M) * N * app.num_vecs;
const int64_t insns_addr = 4, insns_loop = 3;
const int64_t insns_ld_st = N + M;
const int64_t num_ops = num_ops_fp + insns_ld_st + insns_addr + insns_loop;
const int64_t amt_data_bytes = in_size_bytes + out_size_bytes;
double elapsed_time_s = 86400;  // Best time so far; start absurdly high.
const int output_width = stdout_width_get();
#if 0  // Rough latency/occupancy analysis, kept for reference.
const double lat_mem_cyc = 345;
const double lat_iter_cyc = lat_mem_cyc + M * N + M;
const double lat_iter_s = lat_iter_cyc / ( info.cuda_prop.clockRate * 1e3 );
const double data_iter_B = sizeof(Elt_Type) * ( M + N );
const double p = info.chip_bw_Bps * lat_iter_s / data_iter_B;
const int sm_thpt_ls = 64;
const double ni_fp = N * M;
const double ni_mem = N + M;
const double t_issue_1 = ni_fp / (num_mp*info.cc_per_mp)
+ ni_mem / ( num_mp * sm_thpt_ls );
const double q = lat_iter_cyc / t_issue_1;
printf("Analysis for mxv_o_lbuf: L = %.1f ns, "
"p = %.1f wp/sm, q= %.1f wp/sm\n",
lat_iter_s * 1e9, p/32/num_mp, q/32/num_mp );
#endif
{
cudaEvent_t gpu_start_ce, gpu_stop_ce;
CE(cudaEventCreate(&gpu_start_ce));
CE(cudaEventCreate(&gpu_stop_ce));
CE( cudaMemcpy
( app.d_in, app.h_in, in_size_bytes, cudaMemcpyHostToDevice ) );
CE( cudaMemcpyToSymbol
( d_app, &app, sizeof(app), 0, cudaMemcpyHostToDevice ) );
printf("Launching with %d blocks of up to %d threads. \n",
num_blocks, thd_per_block_goal);
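// For each kernel: launch over a range of block sizes (when
// THD_PER_BLOCK is p or 0), collect timing and NPerf metrics, verify
// the results, and emit one table row per configuration.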
for ( int kernel = 0; kernel < info.num_kernels; kernel++ )
{
cudaFuncAttributes& cfa = info.ki[kernel].cfa;
const auto func_ptr = info.ki[kernel].func_ptr;
const int wp_limit = cfa.maxThreadsPerBlock >> 5;
const int thd_limit = wp_limit << 5;
const int thd_per_block_no_vary = min(thd_per_block_goal,thd_limit);
const int wp_start = 1;
const int wp_stop = vary_warps ? wp_limit : wp_start;
const int wp_inc = 4;
const int thd_per_vec =
func_ptr == mxv_o_per_thd ? M :
func_ptr == mxv_sh_ochunk ? mxv_sh_ochunk_CS : 1;
pTable table;
for ( int wp_cnt = wp_start; wp_cnt <= wp_stop;
wp_cnt += ( wp_cnt < 4 ? 1 : wp_inc ) )
{
const int thd_per_block =
vary_warps ? wp_cnt << 5 : thd_per_block_no_vary;
CE(cudaMemset(app.d_out,0,out_size_bytes));
CE(cudaEventRecord(gpu_start_ce,0));
for ( NPerf_data_reset(); NPerf_need_run_get(); )
KPtr(info.ki[kernel].func_ptr)<<<num_blocks,thd_per_block>>>
(app.d_out,app.d_in);
CE(cudaEventRecord(gpu_stop_ce,0));
CE(cudaEventSynchronize(gpu_stop_ce));
float cuda_time_ms = -1.1;
CE(cudaEventElapsedTime(&cuda_time_ms,gpu_start_ce,gpu_stop_ce));
const double this_elapsed_time_s =
NPerf_metrics_collection_get()
? NPerf_kernel_et_get() : cuda_time_ms * 0.001;
const double thpt_compute_gflops =
num_ops / this_elapsed_time_s * 1e-9;
const double thpt_data_gbps =
amt_data_bytes / this_elapsed_time_s * 1e-9;
if ( vary_warps )
{
const double comp_frac =
1e9 * thpt_compute_gflops
/ ( sizeof(Elt_Type) == 4 ? info.chip_sp_flops :
sizeof(Elt_Type) == 8 ? info.chip_dp_flops : 1 );
const double comm_frac =
min(2.0,1e9 * thpt_data_gbps / info.chip_bw_Bps);
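// Estimate active warps per MP from the occupancy limit and the
// launch configuration, then derive the iterations executed per
// thread, used for the Lw (time per h-loop iteration) column.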
const int num_wps = ( thd_per_block + 31 ) >> 5;
const int max_bl_per_mp =
info.get_max_active_blocks_per_mp(kernel,thd_per_block);
const int bl_per_mp_available =
0.999 + double(num_blocks) / num_mp;
const int bl_per_mp =
min( bl_per_mp_available, max_bl_per_mp );
const int act_wps = num_wps * bl_per_mp;
const int act_thds_gpu =
min( num_mp * act_wps * 32, num_blocks * thd_per_block );
const double iter_per_thd =
double(thd_per_vec) * app.num_vecs / act_thds_gpu;
if ( wp_cnt == wp_start )
printf("Kernel %s:\n", info.ki[kernel].name);
table.row_start();
table.entry("wp",num_wps);
table.entry("ac",act_wps);
table.entry("t/µs","%6.0f", this_elapsed_time_s * 1e6);
table.entry("Lw/µs","%6.1f",
this_elapsed_time_s*1e6 / iter_per_thd );
table.entry
("I/op","%4.1f",
NPerf_metric_value_get("inst_executed") * 32.0 / num_ops );
if ( opt_p )
{
table.header_span_start("R-Eff-%");
table.entry
("Ld","%3.0f",
NPerf_metric_value_get("gld_efficiency"));
table.entry
("St","%3.0f",
NPerf_metric_value_get("gst_efficiency"));
table.header_span_end();
table.header_span_start("L2-Cache");
table.entry
("Rd Th","%5.1f",
NPerf_metric_value_get("l2_read_throughput") * 1e-9 );
table.entry
("Wr Th","%5.1f",
NPerf_metric_value_get("l2_write_throughput") * 1e-9 );
table.header_span_end();
table.header_span_start("DRAM");
table.entry
("Rd Th","%5.1f",
NPerf_metric_value_get("dram_read_throughput") * 1e-9 );
table.entry
("Wr Th","%5.1f",
NPerf_metric_value_get("dram_write_throughput") * 1e-9 );
table.header_span_end();
}
const bool plot_bandwidth = true;
table.entry("FP Th","%4.0f", thpt_compute_gflops);
table.entry("GB/s","%4.0f", thpt_data_gbps);
const int max_st_len =
max(5, output_width - 1 - table.row_len_get() );
pStringF fmt("%%-%ds",max_st_len);
string util_hdr =
plot_bandwidth ? "Data BW Util" : "FP Utilization";
const double frac = plot_bandwidth ? comm_frac : comp_frac;
if ( util_hdr.length() < size_t(max_st_len) )
util_hdr += string(max_st_len - util_hdr.length(),'-');
table.entry
(util_hdr,fmt,
string( size_t(max(0.0,frac*max_st_len)), '*' ),
pTable::pT_Left);
} else {
printf
("%-15s %2d wp %7.0f µs %8.3f GF %8.3f GB/s "
"%5.2f I/F %5.1f%%\n",
info.ki[kernel].name,
(thd_per_block + 31 ) >> 5,
this_elapsed_time_s * 1e6,
thpt_compute_gflops, thpt_data_gbps,
NPerf_metric_value_get("inst_executed") * 32 / num_ops,
NPerf_metric_value_get("gld_efficiency")
);
}
elapsed_time_s = min(this_elapsed_time_s,elapsed_time_s);
CE( cudaMemcpy
( app.h_out, app.d_out, out_size_bytes, cudaMemcpyDeviceToHost) );
int err_count = 0;
for ( int i=0; i<app.num_vecs; i++ )
for ( int r=0; r<M; r++ )
{
const int idx = i * M + r;
if ( fabs( app.h_out_check[idx] - app.h_out[idx] ) > 1e-5 )
{
err_count++;
if ( err_count < 5 )
printf("Error at vec %d elt %d: %.7f != %.7f (correct)\n",
i, r, app.h_out[idx], app.h_out_check[idx] );
}
}
if ( err_count )
printf("Total errors %d\n", err_count);
}
printf("%s",table.body_get());
}
}
}