#include <stdio.h>
#include <iostream>
#include <assert.h>
#include <ranges>
#include <random>
#include <vector>
#include <algorithm>
#include <gp/misc.h>
#include <thread>
#include <memory>
#include <fstream>
#include <filesystem>
#include <bit>        // bit_floor, used in mm_tiled_hw02p2.
#include <limits>     // numeric_limits, used for the error tolerance.
#include <pthread.h>  // pthread_self, passed to PAPI_thread_init.
#include <ptable.h>
#include <papi.h>
using namespace std;
struct Cache_Level_Info
{
int bytes, associativity, n_sets, line_size_bytes;
};
struct Cache_Info
{
Cache_Level_Info l1_core_data;
Cache_Level_Info l2_core;
Cache_Level_Info l3_chip;
};
typedef float elt_t;
#include "support.h"
enum Kernel_Version
{ KV_Unset = 0, KV_HW01, KV_HW02p1, KV_HW02p2, KV_ENUM_SIZE };
const char* const tv_string[] = { "Unset", "HW01", "HW02p1", "HW02p2"};
class App_Data {
public:
App_Data(){};
Cache_Info cache_info;
double clock_hz;        // Measured CPU clock, Hz.
double data_bw_Bps;     // Measured memory data bandwidth, bytes per second.
int nt;                 // Number of threads this run spawns.
int n_cores;            // Total cores on the node.
int hw_vec_width_bits;  // Detected vector width, bits.
int n_vec_unit_p_core;  // Vector units per core (assumed).
int n_hw_vec_regs;      // Architectural vector registers (assumed).
int n_samples;          // Timing samples collected per configuration.
};
struct Thd_Misc_Data {
int m1, vec_wid_elts;   // Macro-tile height and vector width (elements) reported by the kernel.
double duration_s;      // Wall-clock time this thread spent in the kernel.
int64_t n_insn;         // Instructions retired by this thread (from PAPI).
};
struct table_elt {
double et;
pString line;
table_elt(double e, const pString& l):et(e),line(l){};
};
class MM_Data {
public:
const int over = 16;  // Over-allocation (extra rows) applied to each array below.
MM_Data( App_Data& app, int d1, int d2, int d3, int s2a, int s3):
app( app ), d1( d1 ), d2( d2 ), d3( d3 ),
s2a( s2a ), s3( s3 ),
a( (over+d1)*d2 ), b( (d2+over)*d3 ),
ap( (over+d1)*s2a ), bp( (d2+over)*s3 ),
g( (d1+over)*d3 ), g_simple( (d1+over)*s3 ), gp( (d1+over)*s3 ),
table(stdout),
misc_data(app.nt)
{}
const App_Data& app;
const int d1, d2, d3;
const int s2a, s3;
int m3;                  // Macro-tile width, set by mm_tiled_do before each run.
Kernel_Version version;  // Kernel to run, set by mm_do before each run.
aligned_array<elt_t,64> a, b;          // Unpadded input matrices.
aligned_array<elt_t,64> ap, bp;        // Copies of a and b with padded row strides s2a and s3.
aligned_array<elt_t,64> g;
aligned_array<elt_t,64> g_simple, gp;  // Reference product and tiled-kernel product.
pTable table;
vector<table_elt> table_row_candidates;
vector<table_elt> table_lines;
vector<Thd_Misc_Data> misc_data;
};
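// Reference matrix multiply: a straightforward triple loop over the padded
// operands ap and bp, parallelized over rows with OpenMP. Its output,
// g_simple, is used to check the tiled kernels.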
void
mm_simple( MM_Data& mmd )
{
const elt_t* __restrict__ a = assume_aligned<64>( mmd.ap.data() );
const elt_t* __restrict__ b = assume_aligned<64>( mmd.bp.data() );
elt_t* __restrict__ g_simple = assume_aligned<64>( mmd.g_simple.data() );
const auto d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3, s2a = mmd.s2a, s3 = mmd.s3;
#pragma omp parallel for
for ( int r=0; r<d1; r++ )
for ( int c=0; c<d3; c++ )
{
elt_t e = 0;
for ( int k=0; k<d2; k++ ) e += a[ r*s2a + k ] * b[ k*s3 + c ];
g_simple[ r*s3 + c ] = e;
}
}
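// HW01 kernel: computes the product in t1 x t3 blocks, accumulating each
// block in an array of GCC vector-extension values. Row-tile stripes are
// assigned to threads cyclically (thread tid handles rr = tid*t1,
// (tid+nt)*t1, ...); every thread sweeps all d3 columns.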
template< int t1, int t2, int t3, int vec_wid_bits, int64_t n_vec_registers >
void
mm_tiled_hw01( int tid, MM_Data* mmdp )
{
MM_Data& mmd = *mmdp;
p_papi_insn papi_insn;
papi_insn.start();
const double t_start = time_wall_fp();
const int d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3;
const int s2a = mmd.s2a, s3 = mmd.s3;
assert( d1 % t1 == 0 ); assert( d2 % t2 == 0 ); assert( d3 % t3 == 0 );
constexpr int vec_wid_elts = vec_wid_bits / ( 8 * sizeof(elt_t) );
assert( t3 / vec_wid_elts * vec_wid_elts == t3 );
mmd.misc_data[tid].vec_wid_elts = vec_wid_elts;
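// vf: a vector of vec_wid_elts elements of elt_t (GCC vector_size extension).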
typedef elt_t vf [[ gnu::vector_size( vec_wid_elts * sizeof(elt_t) ) ]];
const elt_t* a = assume_aligned<64>( mmd.ap.data() );
const elt_t* b = assume_aligned<64>( mmd.bp.data() );
elt_t* g = assume_aligned<64>( mmd.gp.data() );
const vf* bv = assume_aligned<64>( (vf*)(b) );
vf* gv = assume_aligned<64>( (vf*)(g) );
mmd.misc_data[tid].m1 = t1;
for ( int rr = tid * t1; rr < d1; rr += mmd.app.nt * t1 )
for ( int cc = 0; cc < d3; cc += t3 )
{
vf ee[t1][t3/vec_wid_elts]{};
for ( int kk = 0; kk < d2; kk += t2 )
for ( int k = kk; k < kk + t2; k++ )
{
vf* brow = (vf*)assume_aligned<64>( &bv[ k*s3/vec_wid_elts ] );
for ( int c = cc; c < cc + t3; c += vec_wid_elts )
for ( int r = rr; r < rr + t1; r++ )
ee[r-rr][(c-cc)/vec_wid_elts] +=
a[ r*s2a + k ] * brow[ c / vec_wid_elts ];
}
for ( int r = rr; r < rr + t1; r++ )
for ( int c = cc; c < cc + t3; c += vec_wid_elts )
gv[ r * s3 / vec_wid_elts + c / vec_wid_elts ] =
ee[r-rr][(c-cc)/vec_wid_elts];
}
mmd.misc_data[tid].n_insn = papi_insn.stop();
mmd.misc_data[tid].duration_s = time_wall_fp() - t_start;
}
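// HW02 part 1: same inner kernel as HW01, but column tiles are assigned to
// threads cyclically (cc = tid*t3, (tid+nt)*t3, ...) and each thread sweeps
// all d1 rows of its columns.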
template< int t1, int t2, int t3, int vec_wid_bits, int64_t n_vec_registers >
void
mm_tiled_hw02p1( int tid, MM_Data* mmdp )
{
MM_Data& mmd = *mmdp;
p_papi_insn papi_insn;
papi_insn.start();
const double t_start = time_wall_fp();
const int d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3;
const int s2a = mmd.s2a, s3 = mmd.s3;
assert( d1 % t1 == 0 ); assert( d2 % t2 == 0 ); assert( d3 % t3 == 0 );
constexpr int vec_wid_elts = vec_wid_bits / ( 8 * sizeof(elt_t) );
assert( t3 / vec_wid_elts * vec_wid_elts == t3 );
mmd.misc_data[tid].vec_wid_elts = vec_wid_elts;
typedef elt_t vf [[ gnu::vector_size( vec_wid_elts * sizeof(elt_t) ) ]];
const elt_t* a = assume_aligned<64>( mmd.ap.data() );
const elt_t* b = assume_aligned<64>( mmd.bp.data() );
elt_t* g = assume_aligned<64>( mmd.gp.data() );
const vf* bv = assume_aligned<64>( (vf*)(b) );
vf* gv = assume_aligned<64>( (vf*)(g) );
mmd.misc_data[tid].m1 = d1;
for ( int cc = tid * t3; cc < d3; cc += mmd.app.nt * t3 )
for ( int rr = 0; rr < d1; rr += t1 )
{
vf ee[t1][t3/vec_wid_elts]{};
for ( int kk = 0; kk < d2; kk += t2 )
for ( int k = kk; k < kk + t2; k++ )
{
vf* brow = (vf*)assume_aligned<64>( &bv[ k*s3/vec_wid_elts ] );
for ( int c = cc; c < cc + t3; c += vec_wid_elts )
for ( int r = rr; r < rr + t1; r++ )
ee[r-rr][(c-cc)/vec_wid_elts] +=
a[ r*s2a + k ] * brow[ c / vec_wid_elts ];
}
for ( int r = rr; r < rr + t1; r++ )
for ( int c = cc; c < cc + t3; c += vec_wid_elts )
gv[ r * s3 / vec_wid_elts + c / vec_wid_elts ] =
ee[r-rr][(c-cc)/vec_wid_elts];
}
mmd.misc_data[tid].n_insn = papi_insn.stop();
mmd.misc_data[tid].duration_s = time_wall_fp() - t_start;
}
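// HW02 part 2: the product is partitioned into m1 x m3 macro-tiles, which
// are assigned to threads round-robin. The macro-tile height m1 starts at a
// power-of-two upper bound and is halved until the macro-tiles divide among
// the nt threads with at least 95% estimated utilization. Within a
// macro-tile the t1 x t3 register-tiled kernel of HW01 is reused.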
template< int t1, int t2, int t3,
int vec_wid_bits, int64_t n_vec_registers >
void
mm_tiled_hw02p2( int tid, MM_Data* mmdp )
{
MM_Data& mmd = *mmdp;
p_papi_insn papi_insn;
papi_insn.start();
const double t_start = time_wall_fp();
const int d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3;
const int s2a = mmd.s2a, s3 = mmd.s3;
assert( d1 % t1 == 0 ); assert( d2 % t2 == 0 ); assert( d3 % t3 == 0 );
constexpr int vec_wid_elts = vec_wid_bits / ( 8 * sizeof(elt_t) );
assert( t3 / vec_wid_elts * vec_wid_elts == t3 );
mmd.misc_data[tid].vec_wid_elts = vec_wid_elts;
typedef elt_t vf [[ gnu::vector_size( vec_wid_elts * sizeof(elt_t) ) ]];
const elt_t* a = assume_aligned<64>( mmd.ap.data() );
const elt_t* b = assume_aligned<64>( mmd.bp.data() );
elt_t* g = assume_aligned<64>( mmd.gp.data() );
const vf* bv = assume_aligned<64>( (vf*)(b) );
vf* gv = assume_aligned<64>( (vf*)(g) );
const int nt = mmd.app.nt;
const int m3 = mmd.m3;
const int tiles_per_col = d1/t1;
const int mtiles_per_row = d3 / m3;
const int mtiles_max = mtiles_per_row * tiles_per_col;
const int mtiles_max_p_thd = div_ceil( mtiles_max, nt );
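// Upper bound for the macro-tile height: bit_floor of the per-thread row
// allotment, capped at d1.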
const uint m1_max_1 = mtiles_max_p_thd * t1;
const int m1_max = min( uint(d1), bit_floor( m1_max_1 ) );
const double util_min = 0.95;
int m1 = m1_max;
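// Halve m1 until the macro-tiles divide among the nt threads with at least
// util_min estimated utilization, or until m1 reaches t1.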
while ( m1 > t1 )
{
const int n_mt = d1 / m1 * mtiles_per_row;
const int tiles_p_thd = div_ceil( n_mt, nt );
const double util = double(n_mt) / ( tiles_p_thd * nt );
if ( util >= util_min ) break;
m1 = max( t1, m1 / 2 );
}
mmd.misc_data[tid].m1 = m1;
const int mtiles_per_col = d1 / m1;
const int n_mtiles = mtiles_per_col * mtiles_per_row;
for ( int mtile_idx = tid; mtile_idx < n_mtiles; mtile_idx += nt )
{
const int mtile_c = mtile_idx % mtiles_per_row;
const int mtile_r = mtile_idx / mtiles_per_row;
const int rr_start = mtile_r * m1;
const int rr_stop = rr_start + m1;
const int cc_start = mtile_c * m3;
const int cc_stop = cc_start + m3;
for ( int rr = rr_start; rr < rr_stop; rr += t1 )
for ( int cc = cc_start; cc < cc_stop; cc += t3 )
{
vf ee[t1][t3/vec_wid_elts]{};
for ( int kk = 0; kk < d2; kk += t2 )
for ( int k = kk; k < kk + t2; k++ )
{
vf* brow = (vf*)assume_aligned<64>( &bv[ k*s3/vec_wid_elts ]);
for ( int c = cc; c < cc + t3; c += vec_wid_elts )
for ( int r = rr; r < rr + t1; r++ )
ee[r-rr][(c-cc)/vec_wid_elts] +=
a[ r*s2a + k ] * brow[ c / vec_wid_elts ];
}
for ( int r = rr; r < rr + t1; r++ )
for ( int c = cc; c < cc + t3; c += vec_wid_elts )
gv[ r*mmd.s3/vec_wid_elts + c/vec_wid_elts ] =
ee[r-rr][(c-cc)/vec_wid_elts];
}
}
mmd.misc_data[tid].n_insn = papi_insn.stop();
mmd.misc_data[tid].duration_s = time_wall_fp() - t_start;
}
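// Dispatch to the kernel selected by mmd.version, forwarding the tile-size
// and vector template arguments.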
template< auto... Ts >
void
mm_tiledr( int tid, MM_Data* mmdp )
{
MM_Data& mmd = *mmdp;
switch ( mmd.version ) {
case KV_HW01: mm_tiled_hw01<Ts...>(tid,mmdp); break;
case KV_HW02p1: mm_tiled_hw02p1<Ts...>(tid,mmdp); break;
case KV_HW02p2: mm_tiled_hw02p2<Ts...>(tid,mmdp); break;
default: assert( false );
}
}
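// Append the vector width (bits) and assumed register count to the template
// arguments based on the width detected at startup, then dispatch.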
template< auto... Ts >
void
mm_tiled( int tid, MM_Data* mmdp )
{
switch ( mmdp->app.hw_vec_width_bits ){
case 512: mm_tiledr<Ts...,512,32>(tid,mmdp); break;
case 256: mm_tiledr<Ts...,256,16>(tid,mmdp); break;
case 128: mm_tiledr<Ts...,128,16>(tid,mmdp); break;
default:
fprintf(stderr, "This code can only be run on a CPU with vector registers "
"that (that, not which) this code can detect. "
"Have a better one.\n");
exit(1);
}
}
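// Run one timed sample: clear gp, launch nt threads running mm_tiled, wait
// for them, check the result against g_simple within a tolerance scaled by
// d2, and append a formatted row of statistics (tile shape, L2 occupancy,
// duration, throughput, instruction count, utilization) to
// table_row_candidates.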
template< int t1, int t2 = t1, int t3 = t1 >
void
mm_tiled_do_sample( MM_Data& mmd )
{
const App_Data& app = mmd.app;
const auto d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3;
const auto nt = app.nt;
# pragma omp parallel for
for ( auto& elt: mmd.gp ) elt = 0;
vector<thread> threads;
const double t_start_s = time_wall_fp();
for ( int i=0; i<nt; i++ )
threads.emplace_back( mm_tiled<t1,t2,t3>, i, &mmd );
for ( auto& thd: threads ) thd.join();
const double t_end_s = time_wall_fp();
const bool re_check_cpu_clocks = false;
const double cpu_clock_hz =
re_check_cpu_clocks ? cpu_core_fastest_hz() : app.clock_hz;
int n_err = 0, n_err_z = 0;
double max_err = 0;
float tol = numeric_limits<elt_t>::epsilon() * d2 * 5;
for ( int r=0; r<d1; r++ )
for ( int c=0; c<d3; c++ )
{
const elt_t hc = mmd.g_simple[ r*mmd.s3 + c ];
const elt_t hd = mmd.gp[ r*mmd.s3 + c ];
const float d = fabs( hd - hc );
if ( d > max_err ) max_err = d;
if ( d > tol )
{
n_err++;
if ( hd == 0 ) n_err_z++;
}
}
bool verbose = false;
if ( n_err || verbose )
{
const size_t g_size = d1 * d3;
printf("Number of errors %d out of %zd. %d zeros\n",
n_err, g_size, n_err_z);
printf("Max err = %.9f, tolerance = %.9f\n", max_err, tol );
}
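// Estimated peak multiply-add throughput of the cores in use, used for the
// "% Peak" column.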
const int n_elt_p_vec = mmd.app.hw_vec_width_bits / ( 8 * sizeof(elt_t) );
const int fp_bwd_flop_cyc =
min(nt,mmd.app.n_cores) * mmd.app.n_vec_unit_p_core * n_elt_p_vec;
const double fp_bwd_gflop = fp_bwd_flop_cyc * cpu_clock_hz * 1e-9;
const double duration_s = t_end_s - t_start_s;
const double n_madd = double(d1) * d2 * d3;
const double fp_thpt_gflop = 1e-9 * n_madd / duration_s;
double dur_thds_total_s = 0;
for ( auto& md: mmd.misc_data ) dur_thds_total_s += md.duration_s;
const double util_measured = dur_thds_total_s / ( duration_s * nt );
const int m1 = mmd.misc_data[0].m1;
const int m3 = mmd.m3;
const int vec_wid_elts = mmd.misc_data[0].vec_wid_elts;
const int l2_elts = mmd.app.cache_info.l2_core.bytes / sizeof(elt_t);
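// Working-set sizes for a and b, reported below as fractions of the
// per-core L2.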
double cache_needed_a_elts = double(m1) * d2;  // Matches the m1*d2 column header below.
double cache_needed_b_elts = m3 * d2;
mmd.table.row_start();
mmd.table.header_span_start("Tile");
mmd.table.entry("t1", "%3d", t1);
mmd.table.entry("t3", "%3d", t3);
mmd.table.entry("m1", "%5d", m1);
mmd.table.entry("m3", "%4d", m3);
mmd.table.header_span_end();
mmd.table.header_span_start("L2 Occ");
mmd.table.entry("m1*d2", "%5.2f", cache_needed_a_elts / l2_elts);
mmd.table.entry("d2*m3", "%5.2f", cache_needed_b_elts / l2_elts);
mmd.table.header_span_end();
mmd.table.entry( "Dur/ms", "%7.2f", duration_s * 1000 );
mmd.table.header_span_start("FP Throughput");
mmd.table.entry( "GFLOPs", "%7.2f", fp_thpt_gflop );
mmd.table.entry( "% Peak", "%7.2f", 100 * fp_thpt_gflop / fp_bwd_gflop );
mmd.table.header_span_end();
if ( re_check_cpu_clocks )
{
mmd.table.header_span_start("Clk");
mmd.table.entry( "MHz", "%4.0f", cpu_clock_hz * 1e-6 );
mmd.table.header_span_end();
}
int64_t tot_insn = 0;
for ( auto& md: mmd.misc_data ) tot_insn += md.n_insn;
if ( false )
mmd.table.entry("VW", "%2d", vec_wid_elts);
mmd.table.header_span_start("Insn/");
mmd.table.entry("Vec I", "%5.2f", tot_insn*vec_wid_elts/n_madd );
mmd.table.header_span_end();
mmd.table.header_span_start("Util");
mmd.table.entry( "Mes", "%4.0f", 100.0 * util_measured );
mmd.table.header_span_end();
mmd.table_row_candidates.emplace_back( duration_s, mmd.table.row_end_take() );
}
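// Collect n_samples timed samples of one configuration and keep the row
// with the median execution time.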
template< auto... Ts>
void
mm_tiled_do_1( MM_Data& mmd )
{
const int n_samples = mmd.app.n_samples;
mmd.table_row_candidates.clear();
for ( int i=0; i<n_samples; i++ ) mm_tiled_do_sample<Ts...>(mmd);
ranges::sort( mmd.table_row_candidates, {}, &table_elt::et );
auto& sample = mmd.table_row_candidates[n_samples/2];
printf("%s\n", sample.line.s );
mmd.table_lines.push_back(move(sample));
}
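// Set the macro-tile width m3 for the chosen kernel version: the full matrix
// width for HW01, the register-tile width t3 for HW02 part 1, and a sweep of
// powers of two from t3 up to 256 for HW02 part 2.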
template< int t1, int t2, int t3 >
void
mm_tiled_do( MM_Data& mmd )
{
switch ( mmd.version ) {
case KV_HW01:
mmd.m3 = mmd.d3;
mm_tiled_do_1<t1,t2,t3>( mmd );
break;
case KV_HW02p1:
mmd.m3 = t3;
mm_tiled_do_1<t1,t2,t3>( mmd );
break;
case KV_HW02p2:
{
const int max_m3 = 256;
for ( int i_m3 = t3; i_m3 <= max_m3; i_m3 <<= 1 )
{
mmd.m3 = i_m3;
mm_tiled_do_1<t1,t2,t3>( mmd );
}
}
break;
default: assert( false );
}
}
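// Benchmark one problem size: allocate the operands, optionally pad the row
// strides to an odd number of cache lines (so strides are not powers of
// two), fill a and b with random values and copy them into their padded
// layouts, run the reference kernel once for checking, then time each tile
// shape for each kernel version and print the rows sorted by execution time.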
void
mm_do( App_Data& app, int d1, int d2, int d3, bool pad = false )
{
int s2a = d2, s3 = d3;
if ( pad )
{
[[ maybe_unused ]]
const int line_size_bytes = app.cache_info.l3_chip.line_size_bytes;
const int line_size_elts = line_size_bytes / sizeof(elt_t);
const int n_lines_d2 = d2 / line_size_elts;
int n_pri_d2 = n_lines_d2 | 1;
s2a = n_pri_d2 * line_size_elts;
const int n_lines_d3 = d3 / line_size_elts;
int n_pri_d3 = n_lines_d3 | 1;
s3 = n_pri_d3 * line_size_elts;
}
MM_Data mmd(app,d1,d2,d3,s2a,s3);
ranges::generate(mmd.a,rand_pm1);
ranges::generate(mmd.b,rand_pm1);
for ( int r=0; r<d1; r++ )
for ( int c=0; c<d2; c++ )
mmd.ap[ r * s2a + c ] = mmd.a[ r * d2 + c ];
for ( int r=0; r<d2; r++ )
for ( int c=0; c<d3; c++ )
mmd.bp[ r * s3 + c ] = mmd.b[ r * d3 + c ];
const double t_start_s = time_wall_fp();
mm_simple( mmd );
const double t_simple_end_s = time_wall_fp();
const double dur_simple_s = t_simple_end_s - t_start_s;
for ( const auto version: { KV_HW01, KV_HW02p1, KV_HW02p2 } )
{
mmd.version = version;
printf("\nMatrix %d ⨉ %d %d ⨉ %d. Duration simple: %.3f ms\n",
d1, d2, d2, d3, dur_simple_s * 1000 );
printf("Stride %d ⨉ %d %d ⨉ %d = %d x %d. %d thds. %s\n",
d1, s2a, d2, s3, d1, s3,
app.nt, tv_string[version]);
mmd.table = pTable(stdout);
mmd.table_lines.clear();
constexpr int t2 = 16;
switch( app.hw_vec_width_bits ){
case 256:
mm_tiled_do< 8,t2, 8>( mmd );
mm_tiled_do< 4,t2,16>( mmd );
mm_tiled_do< 2,t2,32>( mmd );
mm_tiled_do< 1,t2,64>( mmd );
break;
case 512:
mm_tiled_do<16,t2,16>( mmd );
mm_tiled_do< 8,t2,32>( mmd );
mm_tiled_do< 4,t2,64>( mmd );
mm_tiled_do< 2,t2,128>( mmd );
mm_tiled_do< 1,t2,256>( mmd );
break;
default:
assert( false );
}
ranges::sort( mmd.table_lines, {}, &table_elt::et );
int n_left = mmd.table_lines.size();
const int top_n = min(n_left,400);
n_left -= top_n;
const int bot_n = min(n_left,4);
if ( bot_n == 0 )
printf("** Sorted by Execution Time **\n");
else
printf("** The %d Lowest and %d Highest Execution Times **\n",
top_n, bot_n);
mmd.table.heading_show();
for ( auto& l: mmd.table_lines | views::take(top_n) )
printf("%s\n",l.line.s);
for ( auto& l: mmd.table_lines
| views::reverse | views::take(bot_n) | views::reverse )
printf("%s\n",l.line.s);
}
}
int
main( int argc, char **argv )
{
if ( int retval = PAPI_library_init(PAPI_VER_CURRENT);
retval != PAPI_VER_CURRENT )
{
printf("Library initialization error! \n");
exit(1);
}
CP( PAPI_thread_init(pthread_self) );
const PAPI_hw_info_t *hwinfo = PAPI_get_hardware_info();
App_Data app;
app.n_samples = 5;
app.n_cores = hwinfo->cores * hwinfo->sockets;
app.nt = argc == 1 ? max(1, app.n_cores - 2): atoi(argv[1]);
app.hw_vec_width_bits =
__builtin_cpu_supports("avx512f") ? 512 :
__builtin_cpu_supports("avx2") ? 256 : 128;
app.n_vec_unit_p_core = app.hw_vec_width_bits == 512 ? 2 : 1;
app.n_hw_vec_regs = app.hw_vec_width_bits == 512 ? 32 : 16;
app.data_bw_Bps = measure_data_bw_Bps();
app.clock_hz = cpu_core_fastest_hz();
app.cache_info = cpu_get_cache();
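// Estimate peak FP throughput for the cores in use (assumed vector units x
// lanes x measured clock) and form the compute-to-communication ratio:
// peak FLOPs per element of measured memory bandwidth.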
const double data_bw_eps = app.data_bw_Bps / sizeof(elt_t);
const int n_elt_p_vec = app.hw_vec_width_bits / ( 8 * sizeof(elt_t) );
const int fp_bwd_flop_cyc =
min(app.nt,app.n_cores) * app.n_vec_unit_p_core * n_elt_p_vec;
const double fp_bwd_flop_s = fp_bwd_flop_cyc * app.clock_hz;
const double comp_comm = fp_bwd_flop_s / max(1.0,data_bw_eps);
printf
("CPU model %s\n", cpu_info_get_model().c_str() );
printf
("Node has %d CPUs, CPU has %d cores, total %d cores. Measured clock %.2f GHz.\n",
hwinfo->sockets, hwinfo->cores, app.n_cores, 1e-9 * app.clock_hz);
printf("(This run spawns %d threads.)\n", app.nt);
printf("L1 Data Per Core %6d kiB, %2d-way, line size %d bytes.\n",
app.cache_info.l1_core_data.bytes >> 10,
app.cache_info.l1_core_data.associativity,
app.cache_info.l1_core_data.line_size_bytes);
printf("L2 Unif Per Core %6d kiB, %2d-way, line size %d bytes.\n",
app.cache_info.l2_core.bytes >> 10,
app.cache_info.l2_core.associativity,
app.cache_info.l2_core.line_size_bytes);
printf("L3 Unif Per Chip %6d kiB, %2d-way, line size %d bytes.\n",
app.cache_info.l3_chip.bytes >> 10,
app.cache_info.l3_chip.associativity,
app.cache_info.l3_chip.line_size_bytes);
printf
( "Vector width %d bits, %d vector registers, and\n"
"%d vector unit(s) per core. (Assumed.)\n",
app.hw_vec_width_bits, app.n_hw_vec_regs, app.n_vec_unit_p_core );
printf("Data bw %.3f GB/s (measured). For %d threads: comp/comm = %.1f\n",
1e-9*app.data_bw_Bps, app.nt, comp_comm);
mm_do(app,2048,1024,4096,true);
mm_do(app,2048,2048,4096,true);
mm_do(app,2048,4096,4096,true);
return 0;
}