#include <stdio.h>
#include <iostream>
#include <assert.h>
#include <ranges>
#include <random>
#include <vector>
#include <algorithm>
#include <gp/misc.h>
#include <thread>
#include <memory>
#include <fstream>
#include <filesystem>
#include <regex>
#include <ptable.h>
using namespace std;
struct Cache_Level_Info
{
int bytes, associativity, n_sets, line_size_bytes;
};
struct Cache_Info
{
Cache_Level_Info l1_core_data; Cache_Level_Info l2_core; Cache_Level_Info l3_chip; };
typedef float elt_t;
#include "support.h"
class App_Data {
public:
App_Data(){};
Cache_Info cache_info;
double clock_hz; double data_bw_Bps; int nt; int n_cores; int hw_vec_width_bits; int n_vec_unit_p_core; int n_hw_vec_regs; };
struct Thd_Misc_Data {
int t1v, vec_wid_elts;
double duration_s;
};
class MM_Data {
public:
const int over = 16; MM_Data( App_Data& app, int d1, int d2, int d3,
int s1a, int s1g, int s2a, int s2b, int s3b, int s3g):
app( app ), d1( d1 ), d2( d2 ), d3( d3 ),
s1a( s1a ), s1g( s1g ), s2a( s2a ), s2b( s2b), s3b( s3b ), s3g( s3g ),
a( (over+d1)*d2 ), b( (d2+over)*d3 ),
ap( (over+s1a)*s2a ), bp( (over+s2b)*s3b ),
g_simple( (over+d1)*d3 ), g( (over+d1)*d3 ), gp( (over+s1g)*s3g ),
table(stdout),
misc_data(app.nt)
{}
const App_Data& app;
const int d1, d2, d3;
const int s1a, s1g, s2a, s2b, s3b, s3g;
aligned_array<elt_t,64> a, b; aligned_array<elt_t,64> ap, bp; aligned_array<elt_t,64> g_simple, g; aligned_array<elt_t,64> gp;
pTable table;
vector<Thd_Misc_Data> misc_data;
};
void
mm_simple( MM_Data& mmd )
{
const elt_t* __restrict__ a = assume_aligned<64>( mmd.a.data() );
const elt_t* __restrict__ b = assume_aligned<64>( mmd.b.data() );
elt_t* __restrict__ g_simple = assume_aligned<64>( mmd.g_simple.data() );
const auto d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3;
#pragma omp parallel for
for ( int r=0; r<d1; r++ )
for ( int c=0; c<d3; c++ )
{
elt_t e = 0;
for ( int k=0; k<d2; k++ ) e += a[ r*d2 + k ] * b[ k*d3 + c ];
g_simple[ r*d3 + c ] = e;
}
}
template< int t1, int t2, int t3, int vec_wid_bits, int n_vec_registers >
void
mm_tiledr( int tid, MM_Data* mmdp )
{
MM_Data& mmd = *mmdp;
double t_start = time_wall_fp();
const int d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3;
[[maybe_unused]] const int s1a = mmd.s1a, s2a = mmd.s2a, s3b = mmd.s3b;
[[maybe_unused]] const int s1g = mmd.s1g, s2b = mmd.s2b, s3g = mmd.s3g;
assert( d1 % t1 == 0 ); assert( d2 % t2 == 0 ); assert( d3 % t3 == 0 );
constexpr int hw_vec_regs_bytes = vec_wid_bits * n_vec_registers / 8;
constexpr bool use_vec = t3 >= 8 && n_vec_registers;
constexpr int vec_wid_elts =
min( size_t(t3), vec_wid_bits / ( 8 * sizeof(elt_t) ) );
mmd.misc_data[tid].vec_wid_elts = vec_wid_elts;
typedef elt_t vf [[ gnu::vector_size( vec_wid_elts * sizeof(elt_t) ) ]];
const elt_t* a = assume_aligned<64>( mmd.ap.data() );
const elt_t* b = assume_aligned<64>( mmd.bp.data() );
elt_t* g = assume_aligned<64>( mmd.gp.data() );
const vf* bv = assume_aligned<64>( (vf*)(b) );
vf* gv = assume_aligned<64>( (vf*)(g) );
constexpr int n_vec_elt = hw_vec_regs_bytes / sizeof( elt_t ) / 2;
constexpr int t1v = min( t1, max( 1, n_vec_elt / t3 ) );
static_assert( t1 / t1v * t1v == t1 );
mmd.misc_data[tid].t1v = use_vec ? t1v : t1;
for ( int rr = tid * t1; rr < d1; rr += mmd.app.nt * t1 )
for ( int cc = 0; cc < d3; cc += t3 )
{
if ( !use_vec )
{
elt_t ee[t1][t3]{};
for ( int kk = 0; kk < d2; kk += t2 )
{
for ( int k = kk; k < kk + t2; k++ )
for ( int r = rr; r < rr + t1; r++ )
for ( int c = cc; c < cc + t3; c++ )
ee[r-rr][c-cc] += a[ r*s2a + k ] * b[ k*s3b + c ];
}
for ( int r = rr; r < rr + t1; r++ )
for ( int c = cc; c < cc + t3; c++ )
g[ r*s3g + c ] = ee[r-rr][c-cc];
}
else
{
for ( int rrr = rr; rrr < rr + t1; rrr += t1v )
{
vf ee[t1v][t3/vec_wid_elts]{};
for ( int kk = 0; kk < d2; kk += t2 )
for ( int k = kk; k < kk + t2; k++ )
{
vf* brow = (vf*)assume_aligned<64>
( &bv[ k*s3b/vec_wid_elts ] );
for ( int ccc = cc; ccc < cc + t3; ccc += vec_wid_elts )
{
for ( int r = rrr; r < rrr + t1v; r++ )
ee[r-rrr][(ccc-cc)/vec_wid_elts] +=
a[ r*s2a + k ] * brow[ ccc / vec_wid_elts ];
}
}
for ( int r = rrr; r < rrr + t1v; r++ )
for ( int c = cc; c < cc + t3; c += vec_wid_elts )
gv[ r*mmd.s3g/vec_wid_elts + c/vec_wid_elts ] =
ee[r-rrr][(c-cc)/vec_wid_elts];
}
}
}
mmd.misc_data[tid].duration_s = time_wall_fp() - t_start;
}
template< int t1, int t2, int t3 >
void
mm_tiled( int tid, MM_Data* mmdp )
{
switch ( mmdp->app.hw_vec_width_bits ){
case 512: mm_tiledr<t1,t2,t3,512,32>(tid,mmdp); break;
case 256: mm_tiledr<t1,t2,t3,256,16>(tid,mmdp); break;
case 128: mm_tiledr<t1,t2,t3,128,16>(tid,mmdp); break;
default: mm_tiledr<t1,t2,t3,64,0>(tid,mmdp); break;
}
}
template< int t1, int t2 = t1, int t3 = t1 >
void
mm_tiled_do( MM_Data& mmd )
{
const App_Data& app = mmd.app;
const auto d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3;
const auto nt = app.nt;
# pragma omp parallel for
for ( auto& elt: mmd.gp ) elt = 0;
vector<thread> threads;
const double t_start_s = time_wall_fp();
for ( int i=0; i<nt; i++ )
threads.emplace_back( mm_tiled<t1,t2,t3>, i, &mmd );
for ( auto& thd: threads ) thd.join();
const double t_end_s = time_wall_fp();
const bool re_check_cpu_clocks = true;
const double cpu_clock_hz =
re_check_cpu_clocks ? cpu_core_fastest_hz() : app.clock_hz;
int n_err = 0, n_err_z = 0;
double max_err = 0;
float tol = numeric_limits<elt_t>::epsilon() * d2 * 5;
for ( int r=0; r<d1; r++ )
for ( int c=0; c<d3; c++ )
{
const elt_t hc = mmd.g_simple[ r*d3 + c];
const elt_t hd = mmd.gp[ r*mmd.s3g + c ];
const float d = fabs( hd - hc );
if ( d > max_err ) max_err = d;
if ( d > tol )
{
n_err++;
if ( hd == 0 ) n_err_z++;
}
}
bool verbose = false;
if ( n_err || verbose )
{
const size_t g_size = d1 * d3;
printf("Number of errors %d out of %zd. %d zeros\n",
n_err, g_size, n_err_z);
printf("Max err = %.9f, tolerance = %.9f\n", max_err, tol );
}
const double data_bw_eps = mmd.app.data_bw_Bps / sizeof(elt_t);
const int n_elt_p_vec = mmd.app.hw_vec_width_bits / ( 8 * sizeof(elt_t) );
const int fp_bwd_flop_cyc =
min(nt,mmd.app.n_cores) * mmd.app.n_vec_unit_p_core * n_elt_p_vec;
const double fp_bwd_gflop = fp_bwd_flop_cyc * cpu_clock_hz * 1e-9;
const double duration_s = t_end_s - t_start_s;
const double n_madd = double(d1) * d2 * d3;
const double fp_thpt_gflop = 1e-9 * n_madd / duration_s;
const int n_panels = d1 / t1;
assert( n_panels * t1 == d1 ); const int n_panels_per_thread = ( n_panels + nt - 1 ) / nt;
double util_expected = double(n_panels) / ( n_panels_per_thread * nt );
double dur_thds_total_s = 0;
for ( auto& md: mmd.misc_data ) dur_thds_total_s += md.duration_s;
double util_measured = dur_thds_total_s / ( duration_s * nt );
const size_t point_e_cache_size_min_elts =
t1 * t3 + t3 + 1;
const size_t point_e_n_executions = ( d1 / t1 ) * ( d2 / t2 ) * ( d3 / t3 );
const size_t point_e_xfer_elts =
t1 * t3 * ( d1 / t1 ) * ( d3 / t3 ) + point_e_n_executions * ( t1 * t2 + t2 * t3 );
[[ maybe_unused ]] const int t1v = mmd.misc_data[0].t1v;
[[ maybe_unused ]] const int vec_wid_elts = mmd.misc_data[0].vec_wid_elts;
size_t point_v_cache_size_min_elts =
t1v * t3 + t1v + vec_wid_elts;
size_t n_point_v = d1/t1v * d3/t3;
size_t point_v_xfer_elts =
d1 * d3
+ n_point_v * ( t1v * d2 + d2 * t3 );
size_t point_c_cache_size_min_elts =
t1v * t3 + t1v + d2 * t3;
size_t n_point_c = d1/t1 * d3/t3;
size_t point_c_xfer_elts =
d1 * d3
+ n_point_c * ( t1 * d2 + d2 * t3 );
size_t point_p_cache_size_min_elts =
t1v * t3 + t1 * d2 + d2 * t3;
size_t n_point_p = d1 / t1;
size_t point_p_xfer_elts =
d1 * d3
+ n_point_p * ( t1 * d2 + d2 * d3 );
const double data_limit_elts = data_bw_eps * duration_s;
pTable_Row row(mmd.table);
mmd.table.header_span_start("Tile");
mmd.table.entry("t1", "%3d", t1);
mmd.table.entry("t2", "%2d", t2);
mmd.table.entry("t3", "%3d", t3);
mmd.table.header_span_end();
mmd.table.entry( "Dur/ms", "%7.2f", duration_s * 1000 );
mmd.table.header_span_start("FP Throughput");
mmd.table.entry( "GFLOPs", "%7.2f", fp_thpt_gflop );
mmd.table.entry( "% Peak", "%7.2f", 100 * fp_thpt_gflop / fp_bwd_gflop );
mmd.table.header_span_end();
if ( re_check_cpu_clocks )
{
mmd.table.header_span_start("Clk");
mmd.table.entry( "MHz", "%4.0f", cpu_clock_hz * 1e-6 );
mmd.table.header_span_end();
}
mmd.table.header_span_start("Utiliz");
mmd.table.entry( "Exp", "%3.0f", 100.0 * util_expected );
mmd.table.entry( "Mes", "%3.0f", 100.0 * util_measured );
mmd.table.header_span_end();
const size_t l2_size_elts = app.cache_info.l2_core.bytes / sizeof(elt_t);
[[ maybe_unused ]]
const size_t l3_size_elts = app.cache_info.l3_chip.bytes / sizeof(elt_t);
mmd.table.header_span_start("E");
mmd.table.entry( "%L2", "%3.0f",
1e2 * point_e_cache_size_min_elts / l2_size_elts );
mmd.table.entry( "%BW", "%3.0f", 1e2 * point_e_xfer_elts / data_limit_elts );
mmd.table.header_span_end();
mmd.table.header_span_start("V");
mmd.table.entry( "%L2", "%3.0f",
1e2 * point_v_cache_size_min_elts / l2_size_elts );
mmd.table.entry( "%BW", "%4.0f", 1e2 * point_v_xfer_elts / data_limit_elts );
mmd.table.header_span_end();
mmd.table.header_span_start("C");
mmd.table.entry( "%L2", "%3.0f",
1e2 * point_c_cache_size_min_elts / l2_size_elts );
mmd.table.entry( "%BW", "%3.0f", 1e2 * point_c_xfer_elts / data_limit_elts );
mmd.table.header_span_end();
mmd.table.header_span_start("P");
mmd.table.entry( "%L2", "%3.0f",
1e2 * point_p_cache_size_min_elts / l2_size_elts );
mmd.table.entry( "%BW", "%3.0f", 1e2 * point_p_xfer_elts / data_limit_elts );
mmd.table.header_span_end();
}
void
mm_do( App_Data& app, int d1, int d2, int d3, bool pad = false )
{
int s1a = d1, s2a = d2, s3b = d3;
int s1g = d1, s2b = d2, s3g = d3;
if ( pad )
{
[[ maybe_unused ]]
const int line_size_bytes = app.cache_info.l3_chip.line_size_bytes;
const int line_size_elts = line_size_bytes / sizeof(elt_t);
const int n_lines_d2 = d2 / line_size_elts;
int n_pri_d2 = n_lines_d2 | 1;
s2a = n_pri_d2 * line_size_elts;
const int n_lines_d3 = d3 / line_size_elts;
int n_pri_d3 = n_lines_d3 | 1;
s3b = s3g = n_pri_d3 * line_size_elts;
}
MM_Data mmd(app,d1,d2,d3,s1a,s1g,s2a,s2b,s3b,s3g);
ranges::generate(mmd.a,rand_pm1);
ranges::generate(mmd.b,rand_pm1);
for ( int r=0; r<d1; r++ )
for ( int c=0; c<d2; c++ )
mmd.ap[ r * s2a + c ] = mmd.a[ r * d2 + c ];
for ( int r=0; r<d2; r++ )
for ( int c=0; c<d3; c++ )
mmd.bp[ r * s3b + c ] = mmd.b[ r * d3 + c ];
const double t_start_s = time_wall_fp();
mm_simple( mmd );
const double t_simple_end_s = time_wall_fp();
const double dur_simple_s = t_simple_end_s - t_start_s;
printf("\nMatrix %d ⨉ %d %d ⨉ %d. Duration simple: %.3f ms\n",
d1, d2, d2, d3, dur_simple_s * 1000 );
printf("Stride %d ⨉ %d %d ⨉ %d = %d x %d. Padding %sRequested\n",
s1a, s2a, s2b, s3b, s1g, s3g,
pad ? "" : "Not ");
int64_t n_madd = int64_t(d1) * d2 * d3;
if ( n_madd < int64_t(2048) * 2048 * 2048 )
{
mm_tiled_do<1,16,1>( mmd );
mm_tiled_do<4,16,4>( mmd );
}
mm_tiled_do<8,16,8>( mmd );
mm_tiled_do<8,16,16>( mmd );
mm_tiled_do<8,16,32>( mmd );
mm_tiled_do<8,16,64>( mmd );
mm_tiled_do<16,16,8>( mmd );
mm_tiled_do<16>( mmd );
mm_tiled_do<16,16,32>( mmd );
mm_tiled_do<16,16,64>( mmd );
mm_tiled_do<16,16,128>( mmd );
mm_tiled_do<16,16,256>( mmd );
mm_tiled_do<32,16,8>( mmd );
mm_tiled_do<32,16,16>( mmd );
mm_tiled_do<32,16,32>( mmd );
mm_tiled_do<32,16,64>( mmd );
mm_tiled_do<32,16,128>( mmd );
mm_tiled_do<64,16,8>( mmd );
mm_tiled_do<64,16,16>( mmd );
mm_tiled_do<64,16,32>( mmd );
mm_tiled_do<64,16,64>( mmd );
mm_tiled_do<64,16,128>( mmd );
mm_tiled_do<128,16,8>( mmd );
mm_tiled_do<128,16,16>( mmd );
mm_tiled_do<128,16,32>( mmd );
mm_tiled_do<128,16,64>( mmd );
mm_tiled_do<128,16,128>( mmd );
mm_tiled_do<256,16,8>( mmd );
mm_tiled_do<256,16,16>( mmd );
mm_tiled_do<256,16,32>( mmd );
mm_tiled_do<256,16,64>( mmd );
mm_tiled_do<256,16,128>( mmd );
}
int
main( int argc, char **argv )
{
App_Data app;
app.n_cores = thread::hardware_concurrency();
app.nt = argc == 1 ? app.n_cores : atoi(argv[1]);
app.hw_vec_width_bits =
__builtin_cpu_supports("avx512f") ? 512 :
__builtin_cpu_supports("avx2") ? 256 : 128;
app.n_vec_unit_p_core = app.hw_vec_width_bits == 512 ? 2 : 1; app.n_hw_vec_regs = app.hw_vec_width_bits == 512 ? 32 : 16;
app.data_bw_Bps = measure_data_bw_Bps(); app.clock_hz = cpu_core_fastest_hz();
app.cache_info = cpu_get_cache();
const double data_bw_eps = app.data_bw_Bps / sizeof(elt_t);
const int n_elt_p_vec = app.hw_vec_width_bits / ( 8 * sizeof(elt_t) );
const int fp_bwd_flop_cyc =
min(app.nt,app.n_cores) * app.n_vec_unit_p_core * n_elt_p_vec;
const double fp_bwd_flop_s = fp_bwd_flop_cyc * app.clock_hz;
const double comp_comm = fp_bwd_flop_s / max(1.0,data_bw_eps);
printf
("CPU model %s\n", cpu_info_get_model().c_str() );
printf
("CPU has %d cores at %.2f GHz. (This run spawns %d threads.)\n",
app.n_cores, 1e-9 * app.clock_hz, app.nt);
printf("L1 Data Per Core %6d kiB, %2d-way, line size %d bytes.\n",
app.cache_info.l1_core_data.bytes >> 10,
app.cache_info.l1_core_data.associativity,
app.cache_info.l1_core_data.line_size_bytes);
printf("L2 Unif Per Core %6d kiB, %2d-way, line size %d bytes.\n",
app.cache_info.l2_core.bytes >> 10,
app.cache_info.l2_core.associativity,
app.cache_info.l2_core.line_size_bytes);
printf("L3 Unif Per Chip %6d kiB, %2d-way, line size %d bytes.\n",
app.cache_info.l3_chip.bytes >> 10,
app.cache_info.l3_chip.associativity,
app.cache_info.l3_chip.line_size_bytes);
printf
( "Vector width %d bits, %d vector registers, and\n"
"%d vector unit(s) per core. (Assumed.)\n",
app.hw_vec_width_bits, app.n_hw_vec_regs, app.n_vec_unit_p_core );
printf("Data bw %.3f GB/s (measured). For %d threads: comp/comm = %.1f\n",
1e-9*app.data_bw_Bps, app.nt, comp_comm);
mm_do(app,256,256,256);
mm_do(app,1024,1024,1024);
mm_do(app,1024,1024,1024,true);
mm_do(app,2048,2048,4096);
mm_do(app,2048,2048,4096,true);
}