/// LSU EE 7722 GPU Microarchitecture
//
 /// Spring 2025
 /// Homework 2 -- SOLUTION
 //
 //  Assignment: https://www.ece.lsu.edu/koppel/gp/2025/hw02.pdf
 //  Solution Discussion: https://www.ece.lsu.edu/koppel/gp/2025/hw02_sol.pdf
 //
 //  Solution is throughout this file.
 //
 //  Modified this file only.

#include <stdio.h>
#include <iostream>
#include <assert.h>
#include <ranges>
#include <random>
#include <vector>
#include <algorithm>
#include <gp/misc.h>
#include <thread>
#include <memory>
#include <fstream>
#include <filesystem>
#include <ptable.h>
#include <papi.h>

using namespace std;


struct Cache_Level_Info
{
  int bytes, associativity, n_sets, line_size_bytes;
};

struct Cache_Info
{
  Cache_Level_Info l1_core_data; // Level 1 Data Cache. Each core has 1.
  Cache_Level_Info l2_core;  // Level 2 Cache. Each core has 1.
  Cache_Level_Info l3_chip;  // Level 3 Cache. Each CPU has 1, shared by cores.
};

typedef float elt_t;

#include "support.h"

enum Kernel_Version
{ KV_Unset = 0, KV_HW01, KV_HW02p1,
  KV_HW02p2,
  KV_ENUM_SIZE };
const char* const tv_string[] = { "Unset", "HW01", "HW02p1", "HW02p2"};

class App_Data {
public:
  App_Data(){};

  Cache_Info cache_info;
  double clock_hz;       // Fastest core frequency after exercising.
  double data_bw_Bps;    // Measured off-chip bandwidth.
  int nt;                // Number of threads to use.
  int n_cores;           // Number of cores on CPU(s).
  int hw_vec_width_bits; // Hardware vector width.
  int n_vec_unit_p_core; // Number of vector units per core.
  int n_hw_vec_regs;     // Number of hardware vector registers.
  int n_samples;         // Number of runs per configuration.
};

struct Thd_Misc_Data {

  int m1, vec_wid_elts;

  double duration_s;
  int64_t n_insn;

};


struct table_elt {
  double et;
  pString line;
  table_elt(double e, const pString& l):et(e),line(l){};
};

class MM_Data {
public:
  const int over = 16;  // Additional rows added to protect overruns.
  MM_Data( App_Data& app, int d1, int d2, int d3, int s2a, int s3):
    app( app ), d1( d1 ), d2( d2 ), d3( d3 ),
    s2a( s2a ), s3( s3 ),
    a( (over+d1)*d2 ), b( (d2+over)*d3 ),
    ap( (over+d1)*s2a ), bp( (d2+over)*s3 ),
    g( (d1+over)*d3 ), g_simple( (d1+over)*s3 ), gp( (d1+over)*s3 ),
    table(stdout),
    misc_data(app.nt)
  {}

  const App_Data& app;

  // Array Dimensions
  //
  const int d1, d2, d3;

  // Array Strides
  //
  const int s2a, s3;

  // Width of Macro Tile.
  //
  int m3;

  Kernel_Version version;

  // Aligned Array Objects
  //
  // All are allocated so their starting address is a multiple of 64.
  // This makes them suitable for AVX512 vectors, as well as smaller
  // vectors.
  //
  // They can be accessed in the same way as std::vector containers.
  //
  aligned_array<elt_t,64> a, b;         // a  is d1 ⨉ d2,  b is d2 ⨉ d3.
  aligned_array<elt_t,64> ap, bp;       // ap is d1 ⨉ s2a,  bp is d2 ⨉ s3.
  aligned_array<elt_t,64> g;            // g is d1 ⨉ d3.
  aligned_array<elt_t,64> g_simple, gp; // gp is d1 ⨉ s3

  // Object used for printing neat tables.
  //
  pTable table;

  vector<table_elt> table_row_candidates;

  vector<table_elt> table_lines; // Best lines in table.

  // Vector for storing per-thread data.
  //
  vector<Thd_Misc_Data> misc_data;

};


void
mm_simple( MM_Data& mmd )
{
  // Compute product of a and b using a simple method.

  // Matrix a: d1 rows, d2 columns.
  // Matrix b: d2 rows, d3 columns.
  // Matrix g: d1 rows, d3 columns.

  // Convenient and Compiler-Friendly Abbreviations
  //
  const elt_t* __restrict__ a = assume_aligned<64>( mmd.ap.data() );
  const elt_t* __restrict__ b = assume_aligned<64>( mmd.bp.data() );
  elt_t* __restrict__ g_simple = assume_aligned<64>( mmd.g_simple.data() );
  //
  // Note: assume_aligned allows more efficient use of vector
  // instructions. __restrict__ provides more freedom for rearranging
  // load and store instructions.

  const auto d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3, s2a = mmd.s2a, s3 = mmd.s3;

  // Use OpenMP pragmata to tell compiler to parallelize loop.
  //
#pragma omp parallel for
  for ( int r=0; r<d1; r++ )
    for ( int c=0; c<d3; c++ )
      {
        elt_t e = 0;
        for ( int k=0; k<d2; k++ ) e += a[ r*s2a + k ] * b[ k*s3 + c ];
        g_simple[ r*s3 + c ] = e;
      }
}


template< int t1, int t2, int t3, int vec_wid_bits, int64_t n_vec_registers >
void
mm_tiled_hw01( int tid, MM_Data* mmdp )
{
  MM_Data& mmd = *mmdp;

  p_papi_insn papi_insn;
  papi_insn.start();     // Start counting the number of instructions.
  const double t_start = time_wall_fp();

  const int d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3;
  const int s2a = mmd.s2a, s3 = mmd.s3;

  // Code won't work if dimension is not a multiple of tile size.
  assert( d1 % t1 == 0 ); assert( d2 % t2 == 0 ); assert( d3 % t3 == 0 );

  constexpr int vec_wid_elts = vec_wid_bits / ( 8 * sizeof(elt_t) );
  assert( t3 / vec_wid_elts * vec_wid_elts == t3 );
  mmd.misc_data[tid].vec_wid_elts = vec_wid_elts;

  // Define vector type vf.
  //
  typedef elt_t vf [[ gnu::vector_size( vec_wid_elts * sizeof(elt_t) ) ]];
  //
  // This type, vf, is used to help the compiler do a better job
  // emitting vector (SSE, AVX2, or in the lab AVX512) instructions.

  // Convenient Abbreviations
  //
  const elt_t* a = assume_aligned<64>( mmd.ap.data() );
  const elt_t* b = assume_aligned<64>( mmd.bp.data() );
  elt_t* g = assume_aligned<64>( mmd.gp.data() );
  //
  // Note that this code always uses the padded arrays.
  //
  // The assume_aligned template tells the compiler to assume
  // that a, b, and c, are multiples of 64.

  // Abbreviations for Vector Pointers to b and g.
  //
  const vf* bv = assume_aligned<64>( (vf*)(b) );
  vf* gv = assume_aligned<64>( (vf*)(g) );
  //
  // Note: b and bv hold exactly the same address. But the compiler
  // treats b as a pointer to elt_t, while the compiler treats bv as a
  // pointer to vf. Remember that vf is a vector of elt_t elements.

  // Number of rows in macro tile.
  //
  mmd.misc_data[tid].m1 = t1;

  // Tiled Matrix Multiplication
  //
  for ( int rr = tid * t1;  rr < d1;  rr += mmd.app.nt * t1 )

    // Compute g for t1 rows starting at row rr.

    for ( int cc = 0;  cc < d3;  cc += t3 )
      {
        // Compute One Complete t1 ⨉ t3 Output Tile (of g)

        // Storage for the tile. Each element is a vector.
        //
        vf ee[t1][t3/vec_wid_elts]{};

        for ( int kk = 0;  kk < d2;  kk += t2 )

          for ( int k = kk;  k < kk + t2;  k++ )
            {
              // Tell compiler to trust that brow is aligned
              // for vectors.
              vf* brow = (vf*)assume_aligned<64>( &bv[ k*s3/vec_wid_elts ] );

              for ( int c = cc;  c < cc + t3;  c += vec_wid_elts )
                for ( int r = rr;  r < rr + t1;  r++ )
                  ee[r-rr][(c-cc)/vec_wid_elts] +=
                    a[ r*s2a + k ] * brow[ c / vec_wid_elts ];
              // Note: In product above each element of
              // brow is multiplied by the same value,
              // a[r*s2a+k].
            }

        // Write completed t1 * t3 tile to g.
        //
        for ( int r = rr;  r < rr + t1;  r++ )
          for ( int c = cc;  c < cc + t3;  c += vec_wid_elts )
            gv[ r * s3 / vec_wid_elts + c / vec_wid_elts ] =
              ee[r-rr][(c-cc)/vec_wid_elts];
      }

  mmd.misc_data[tid].n_insn = papi_insn.stop();
  mmd.misc_data[tid].duration_s = time_wall_fp() - t_start;
}



template< int t1, int t2, int t3, int vec_wid_bits, int64_t n_vec_registers >
void
mm_tiled_hw02p1( int tid, MM_Data* mmdp )
{
  MM_Data& mmd = *mmdp;

  p_papi_insn papi_insn;
  papi_insn.start();     // Start counting the number of instructions.
  const double t_start = time_wall_fp();

  const int d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3;
  const int s2a = mmd.s2a, s3 = mmd.s3;

  // Code won't work if dimension is not a multiple of tile size.
  assert( d1 % t1 == 0 ); assert( d2 % t2 == 0 ); assert( d3 % t3 == 0 );

  constexpr int vec_wid_elts = vec_wid_bits / ( 8 * sizeof(elt_t) );
  assert( t3 / vec_wid_elts * vec_wid_elts == t3 );
  mmd.misc_data[tid].vec_wid_elts = vec_wid_elts;

  // Define vector type vf.
  //
  typedef elt_t vf [[ gnu::vector_size( vec_wid_elts * sizeof(elt_t) ) ]];
  //
  // This type, vf, is used to help the compiler do a better job
  // emitting vector (SSE, AVX2, or in the lab AVX512) instructions.

  // Convenient Abbreviations
  //
  const elt_t* a = assume_aligned<64>( mmd.ap.data() );
  const elt_t* b = assume_aligned<64>( mmd.bp.data() );
  elt_t* g = assume_aligned<64>( mmd.gp.data() );
  //
  // Note that this code always uses the padded arrays.
  //
  // The assume_aligned template tells the compiler to assume
  // that a, b, and c, are multiples of 64.

  // Abbreviations for Vector Pointers to b and g.
  //
  const vf* bv = assume_aligned<64>( (vf*)(b) );
  vf* gv = assume_aligned<64>( (vf*)(g) );
  //
  // Note: b and bv hold exactly the same address. But the compiler
  // treats b as a pointer to elt_t, while the compiler treats bv as a
  // pointer to vf. Remember that vf is a vector of elt_t elements.


  /// SOLUTION -- Problem 1
  //
  //  Set m1 to d1, since each thread iterates over d1 columns (using
  //  tiles of size t1).
  //
  mmd.misc_data[tid].m1 = d1;


  /// SOLUTION -- Problem 1
  //
  //  - Outer loop iterates over columns (was rows in hw01).
  //
  //  - Assign initial column based on tid ..
  //    .. no two threads have the same value of cc.
  //
  //  - Inner loop iterated through every row (in each thread).
  //
  //
  // Tiled Matrix Multiplication
  //
  for ( int cc = tid * t3;  cc < d3;  cc += mmd.app.nt * t3 )
    for ( int rr = 0;  rr < d1;  rr += t1 )
      {
        // Compute One Complete t1 ⨉ t3 Output Tile (of g)

        // Storage for the tile. Each element is a vector.
        //
        vf ee[t1][t3/vec_wid_elts]{};

        for ( int kk = 0;  kk < d2;  kk += t2 )

          for ( int k = kk;  k < kk + t2;  k++ )
            {
              // Tell compiler to trust that brow is aligned
              // for vectors.
              vf* brow = (vf*)assume_aligned<64>( &bv[ k*s3/vec_wid_elts ] );

              for ( int c = cc;  c < cc + t3;  c += vec_wid_elts )
                for ( int r = rr;  r < rr + t1;  r++ )
                  ee[r-rr][(c-cc)/vec_wid_elts] +=
                    a[ r*s2a + k ] * brow[ c / vec_wid_elts ];
              // Note: In product above each element of
              // brow is multiplied by the same value,
              // a[r*s2a+k].
            }

        // Write completed t1 * t3 tile to g.
        //
        for ( int r = rr;  r < rr + t1;  r++ )
          for ( int c = cc;  c < cc + t3;  c += vec_wid_elts )
            gv[ r * s3 / vec_wid_elts + c / vec_wid_elts ] =
              ee[r-rr][(c-cc)/vec_wid_elts];
      }

  mmd.misc_data[tid].n_insn = papi_insn.stop();
  mmd.misc_data[tid].duration_s = time_wall_fp() - t_start;
}

template< int t1, int t2, int t3,
          int vec_wid_bits, int64_t n_vec_registers >
void
mm_tiled_hw02p2( int tid, MM_Data* mmdp )
{
  MM_Data& mmd = *mmdp;

  p_papi_insn papi_insn;
  papi_insn.start();     // Start counting the number of instructions.
  const double t_start = time_wall_fp();

  const int d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3;
  const int s2a = mmd.s2a, s3 = mmd.s3;

  // Code won't work if dimension is not a multiple of tile size.
  assert( d1 % t1 == 0 ); assert( d2 % t2 == 0 ); assert( d3 % t3 == 0 );

  constexpr int vec_wid_elts = vec_wid_bits / ( 8 * sizeof(elt_t) );
  assert( t3 / vec_wid_elts * vec_wid_elts == t3 );
  mmd.misc_data[tid].vec_wid_elts = vec_wid_elts;

  // Define vector type vf.
  //
  typedef elt_t vf [[ gnu::vector_size( vec_wid_elts * sizeof(elt_t) ) ]];
  //
  // This type, vf, is used to help the compiler do a better job
  // emitting vector (SSE, AVX2, or in the lab AVX512) instructions.

  // Convenient Abbreviations
  //
  const elt_t* a = assume_aligned<64>( mmd.ap.data() );
  const elt_t* b = assume_aligned<64>( mmd.bp.data() );
  elt_t* g = assume_aligned<64>( mmd.gp.data() );
  //
  // Note that this code always uses the padded arrays.
  //
  // The assume_aligned template tells the compiler to assume
  // that a, b, and c, are multiples of 64.

  // Abbreviations for Vector Pointers to b and g.
  //
  const vf* bv = assume_aligned<64>( (vf*)(b) );
  vf* gv = assume_aligned<64>( (vf*)(g) );
  //
  // Note: b and bv hold exactly the same address. But the compiler
  // treats b as a pointer to elt_t, while the compiler treats bv as a
  // pointer to vf. Remember that vf is a vector of elt_t elements.


  /// SOLUTION -- Problem 2
  //
  //  Choose a value of m1 that:
  //
  //    - Is a power of 2 <= d1.
  //
  //    - Allows for a utilization of at least 95%.
  //
  const int nt = mmd.app.nt;
  const int m3 = mmd.m3;
  const int tiles_per_col = d1/t1;

  const int mtiles_per_row = d3 / m3;
  const int mtiles_max = mtiles_per_row * tiles_per_col;
  const int mtiles_max_p_thd = div_ceil( mtiles_max, nt );

  // Value of m1 for which there would be one macrotile per thread ..
  // .. though that value of m1 might not work because it is > d1, etc.
  //
  const uint m1_max_1 = mtiles_max_p_thd * t1;
  //
  // Choose a workable value of m1 by forcing it to be a power of 2
  // and making sure it is <= d1.
  //
  const int m1_max = min( uint(d1), bit_floor( m1_max_1 ) );
  //
  // Though this value of m1 would work, there might be imbalance.

  // Try to find a value of m1 that allows for a utilization of at
  // least util_min (95%).
  //
  const double util_min = 0.95;
  int m1 = m1_max;
  while ( m1 > t1 )
    {
      const int n_mt = d1 / m1 * mtiles_per_row;
      const int tiles_p_thd = div_ceil( n_mt, nt );
      const double util = double(n_mt) / ( tiles_p_thd * nt );
      if ( util >= util_min ) break;
      m1 = max( t1, m1 / 2 );
    }

  // Write the chosen macrotile width so that it can be shown in the output.
  //
  mmd.misc_data[tid].m1 = m1;


  /// SOLUTION -- Problem 2
  //
  //  Iterate over macrotiles.
  //
  //  With the iteration order below the CPU (including all threads)
  //  will be accessing a limited number of rows of a (good) but many
  //  columns of b (bad).

  const int mtiles_per_col = d1 / m1;
  const int n_mtiles = mtiles_per_col * mtiles_per_row;

  for ( int mtile_idx = tid; mtile_idx < n_mtiles; mtile_idx += nt )
    {
      // Choose next macro tile.
      //
      const int mtile_c = mtile_idx % mtiles_per_row;
      const int mtile_r = mtile_idx / mtiles_per_row;

      // Get row and column start and end of macro tile.
      //
      const int rr_start = mtile_r * m1;
      const int rr_stop = rr_start + m1;
      const int cc_start = mtile_c * m3;
      const int cc_stop = cc_start + m3;

      // Iterate over the t1*t3 tiles within the macrotile.
      //
      for ( int rr = rr_start;  rr < rr_stop;  rr += t1 )
        for ( int cc = cc_start; cc < cc_stop; cc += t3 )
          {
            // The code below is little changed from the original routine.

            // Compute One Complete t1 ⨉ t3 Output Tile (of g)
            //
            vf ee[t1][t3/vec_wid_elts]{};

            for ( int kk = 0;  kk < d2;  kk += t2 )

              for ( int k = kk;  k < kk + t2;  k++ )
                {
                  // Tell compiler to trust that brow is aligned
                  // for vectors.
                  vf* brow = (vf*)assume_aligned<64>( &bv[ k*s3/vec_wid_elts ]);

                  for ( int c = cc;  c < cc + t3; c += vec_wid_elts )
                    for ( int r = rr; r < rr + t1; r++ )
                      ee[r-rr][(c-cc)/vec_wid_elts] +=
                        a[ r*s2a + k ] * brow[ c / vec_wid_elts ];
                }

            // Write completed m1 * t3 tile to g.
            //
            for ( int r = rr;  r < rr + t1;  r++ )
              for ( int c = cc;  c < cc + t3;  c += vec_wid_elts )
                gv[ r*mmd.s3/vec_wid_elts + c/vec_wid_elts ] =
                  ee[r-rr][(c-cc)/vec_wid_elts];
          }
  }

  mmd.misc_data[tid].n_insn = papi_insn.stop();
  mmd.misc_data[tid].duration_s = time_wall_fp() - t_start;
}

template< auto... Ts >
void
mm_tiledr( int tid, MM_Data* mmdp )
{
  MM_Data& mmd = *mmdp;
  switch ( mmd.version ) {
  case KV_HW01: mm_tiled_hw01<Ts...>(tid,mmdp); break;
  case KV_HW02p1: mm_tiled_hw02p1<Ts...>(tid,mmdp); break;
  case KV_HW02p2: mm_tiled_hw02p2<Ts...>(tid,mmdp); break;
  default: assert( false );
  }
}

template< auto... Ts >
void
mm_tiled( int tid, MM_Data* mmdp )
{
  switch ( mmdp->app.hw_vec_width_bits ){
  case 512: mm_tiledr<Ts...,512,32>(tid,mmdp); break;
  case 256: mm_tiledr<Ts...,256,16>(tid,mmdp); break;
  case 128: mm_tiledr<Ts...,128,16>(tid,mmdp); break;
  default:
    fprintf(stderr, "This code can only be run on a CPU with vector registers "
            "that (that, not which) this code can detect. "
            "Have a better one.\n");
    exit(1);
  }
}


template< int t1, int t2 = t1, int t3 = t1 >
void
mm_tiled_do_sample( MM_Data& mmd )
{
  const App_Data& app = mmd.app;
  const auto d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3;
  const auto nt = app.nt;

  /// Initialize Output Array to Zero
  //
# pragma omp parallel for
  for ( auto& elt: mmd.gp ) elt = 0;

  ///
  /// Launch Threads that Compute Matrix Product
  //
  vector<thread> threads;

  // Start threads ..
  //
  const double t_start_s = time_wall_fp();
  for ( int i=0; i<nt; i++ )
    threads.emplace_back( mm_tiled<t1,t2,t3>, i, &mmd );
  //
  // ..and then wait for threads to finish.
  //
  for ( auto& thd: threads ) thd.join();
  const double t_end_s = time_wall_fp();

  const bool re_check_cpu_clocks = false;
  const double cpu_clock_hz =
    re_check_cpu_clocks ? cpu_core_fastest_hz() : app.clock_hz;

  //
  /// Check Matrix Product
  //
  int n_err = 0, n_err_z = 0;
  double max_err = 0;

  // Compute error tolerance.
  float tol = numeric_limits<elt_t>::epsilon() * d2 * 5;
  for ( int r=0; r<d1; r++ )
    for ( int c=0; c<d3; c++ )
      {
        const elt_t hc = mmd.g_simple[ r*mmd.s3 + c]; // Assumed Correct
        const elt_t hd = mmd.gp[ r*mmd.s3 + c ];
        const float d = fabs( hd - hc );
        if ( d > max_err ) max_err = d;
        if ( d > tol )
          {
            n_err++;
            if ( hd == 0 ) n_err_z++;
          }
      }

  bool verbose = false;
  if ( n_err || verbose )
    {
      const size_t g_size = d1 * d3;
      printf("Number of errors %d out of %zd. %d zeros\n",
             n_err, g_size, n_err_z);
      printf("Max err = %.9f,  tolerance = %.9f\n", max_err, tol );
    }


  ///
  /// Print Information About Tile and Performance
  ///
  //  const double data_bw_eps = mmd.app.data_bw_Bps / sizeof(elt_t);
  const int n_elt_p_vec = mmd.app.hw_vec_width_bits / ( 8 * sizeof(elt_t) );
  const int fp_bwd_flop_cyc =
    min(nt,mmd.app.n_cores) * mmd.app.n_vec_unit_p_core * n_elt_p_vec;
  const double fp_bwd_gflop = fp_bwd_flop_cyc * cpu_clock_hz * 1e-9;
  const double duration_s = t_end_s - t_start_s;
  const double n_madd = double(d1) * d2 * d3;
  const double fp_thpt_gflop = 1e-9 * n_madd / duration_s;


  double dur_thds_total_s = 0;
  for ( auto& md: mmd.misc_data ) dur_thds_total_s += md.duration_s;
  const double util_measured = dur_thds_total_s / ( duration_s * nt );

  // The following values may be useful for solving Problem 2.

  const int m1 = mmd.misc_data[0].m1;
  const int m3 = mmd.m3;
  const int vec_wid_elts = mmd.misc_data[0].vec_wid_elts;

  const int l2_elts = mmd.app.cache_info.l2_core.bytes / sizeof(elt_t);
  double cache_needed_a_elts = t1 * d2;
  double cache_needed_b_elts = m3 * d2;

  mmd.table.row_start();

  mmd.table.header_span_start("Tile");
  mmd.table.entry("t1", "%3d", t1);
  //  mmd.table.entry("t2", "%2d", t2);
  mmd.table.entry("t3", "%3d", t3);
  mmd.table.entry("m1", "%5d", m1);
  mmd.table.entry("m3", "%4d", m3);
  mmd.table.header_span_end();
  mmd.table.header_span_start("L2 Occ");
  mmd.table.entry("m1*d2", "%5.2f", cache_needed_a_elts / l2_elts);
  mmd.table.entry("d2*m3", "%5.2f", cache_needed_b_elts / l2_elts);
  mmd.table.header_span_end();
  mmd.table.entry( "Dur/ms", "%7.2f", duration_s * 1000 );
  mmd.table.header_span_start("FP Throughput");
  mmd.table.entry( "GFLOPs", "%7.2f", fp_thpt_gflop );
  mmd.table.entry( "% Peak", "%7.2f", 100 * fp_thpt_gflop / fp_bwd_gflop );
  mmd.table.header_span_end();
  if ( re_check_cpu_clocks )
    {
      mmd.table.header_span_start("Clk");
      mmd.table.entry( "MHz", "%4.0f", cpu_clock_hz * 1e-6 );
      mmd.table.header_span_end();
    }

  int64_t tot_insn = 0;
  for ( auto& md: mmd.misc_data ) tot_insn += md.n_insn;

  if ( false )
    mmd.table.entry("VW", "%2d", vec_wid_elts);

  mmd.table.header_span_start("Insn/");
  mmd.table.entry("Vec I", "%5.2f", tot_insn*vec_wid_elts/n_madd );
  mmd.table.header_span_end();

  mmd.table.header_span_start("Util");
  mmd.table.entry( "Mes", "%4.0f", 100.0 * util_measured );
  mmd.table.header_span_end();

  mmd.table_row_candidates.emplace_back( duration_s, mmd.table.row_end_take() );

}

template< auto... Ts>
void
mm_tiled_do_1( MM_Data& mmd )
{
  const int n_samples = mmd.app.n_samples;

  mmd.table_row_candidates.clear();
  for ( int i=0; i<n_samples; i++ ) mm_tiled_do_sample<Ts...>(mmd);
  ranges::sort( mmd.table_row_candidates, {}, &table_elt::et );
  auto& sample = mmd.table_row_candidates[n_samples/2];
  printf("%s\n", sample.line.s );
  mmd.table_lines.push_back(move(sample));

}

template< int t1, int t2, int t3 >
void
mm_tiled_do( MM_Data& mmd )
{
  switch ( mmd.version ) {
  case KV_HW01:
    mmd.m3 = mmd.d3;
    mm_tiled_do_1<t1,t2,t3>( mmd );
    break;
  case KV_HW02p1:
    mmd.m3 = t3;
    mm_tiled_do_1<t1,t2,t3>( mmd );
    break;
  case KV_HW02p2:
    {
      const int max_m3 = 256;
      for ( int i_m3 = t3; i_m3 <= max_m3; i_m3 <<= 1 )
        {
          mmd.m3 = i_m3;
          mm_tiled_do_1<t1,t2,t3>( mmd );
        }
    }
    break;
  default: assert( false );
  }
}

void
mm_do( App_Data& app, int d1, int d2, int d3, bool pad = false )
{
  /// Multiply  d1 * d2 matrix by d2 * d3 Matrix Using a Variety of Tilings

  // Set strides equal to dimensions. This results in a matrix without padding.
  //
  int s2a = d2, s3 = d3;

  if ( pad )
    {
      // Set strides to avoid aliasing in set-associative cache.

      // Helpful Information

      // Line size. Assume all levels use the size line size.
      //
      [[ maybe_unused ]]
      const int line_size_bytes = app.cache_info.l3_chip.line_size_bytes;
      //
      // See Cache_Info structure (near top of file) for more information.


      const int line_size_elts = line_size_bytes / sizeof(elt_t);
      const int n_lines_d2 = d2 / line_size_elts;
      int n_pri_d2 = n_lines_d2 | 1;
      s2a = n_pri_d2 * line_size_elts;
      const int n_lines_d3 = d3 / line_size_elts;
      int n_pri_d3 = n_lines_d3 | 1;
      s3 = n_pri_d3 * line_size_elts;

    }

  // Construct a Matrix Multiply Data Object
  //
  MM_Data mmd(app,d1,d2,d3,s2a,s3);

  // Fill a and b matrices with random numbers.
  //
  ranges::generate(mmd.a,rand_pm1);
  ranges::generate(mmd.b,rand_pm1);

  //  Copy data from the a and b matrices to their padded counterparts.
  //
  for ( int r=0; r<d1; r++ )
    for ( int c=0; c<d2; c++ )
      mmd.ap[ r * s2a + c ] =  mmd.a[ r * d2 + c ];
  for ( int r=0; r<d2; r++ )
    for ( int c=0; c<d3; c++ )
      mmd.bp[ r * s3 + c ] =  mmd.b[ r * d3 + c ];

  const double t_start_s = time_wall_fp();

  // Compute matrix product on unpadded arrays using a simple method.
  //
  mm_simple( mmd );

  const double t_simple_end_s = time_wall_fp();
  const double dur_simple_s = t_simple_end_s - t_start_s;

  for ( const auto version: { KV_HW01, KV_HW02p1, KV_HW02p2 } )
    {
      mmd.version = version;

      printf("\nMatrix %d ⨉ %d  %d ⨉ %d.  Duration simple: %.3f ms\n",
             d1, d2, d2, d3, dur_simple_s * 1000 );
      printf("Stride %d ⨉ %d  %d ⨉ %d  = %d x %d. %d thds.  %s\n",
             d1, s2a, d2, s3, d1, s3,
             app.nt, tv_string[version]);

      mmd.table = pTable(stdout);
      mmd.table_lines.clear();
      constexpr int t2 = 16;

      switch( app.hw_vec_width_bits ){
      case 256:

        mm_tiled_do< 8,t2, 8>( mmd );    //  6
        mm_tiled_do< 4,t2,16>( mmd );    //  6
        mm_tiled_do< 2,t2,32>( mmd );    //  6
        mm_tiled_do< 1,t2,64>( mmd );    //  6
        break;

      case 512:

        mm_tiled_do<16,t2,16>( mmd );     //  8
        mm_tiled_do< 8,t2,32>( mmd );     //  8
        mm_tiled_do< 4,t2,64>( mmd );     //  8
        mm_tiled_do< 2,t2,128>( mmd );    //  8
        mm_tiled_do< 1,t2,256>( mmd );    //  8
        break;

      default:
        assert( false );
      }

      ranges::sort( mmd.table_lines, {}, &table_elt::et );
      int n_left = mmd.table_lines.size();
      const int top_n = min(n_left,400);
      n_left -= top_n;
      const int bot_n = min(n_left,4);

      if ( bot_n == 0 )
        printf("** Sorted by Execution Time **\n");
      else
        printf("** The %d Lowest and %d Highest Execution Times **\n",
               top_n, bot_n);
      mmd.table.heading_show();
      for ( auto& l: mmd.table_lines | views::take(top_n) )
        printf("%s\n",l.line.s);
      for ( auto& l: mmd.table_lines
              | views::reverse | views::take(bot_n) | views::reverse )
        printf("%s\n",l.line.s);
    }

}


int
main( int argc, char **argv )
{
  if ( int retval = PAPI_library_init(PAPI_VER_CURRENT);
       retval != PAPI_VER_CURRENT )
    {
      printf("Library initialization error! \n");
      exit(1);
    }

  CP( PAPI_thread_init(pthread_self) );

  const PAPI_hw_info_t *hwinfo = PAPI_get_hardware_info();

  App_Data app;

  // Number of times to run each configuration. The median time is used.
  app.n_samples = 5;

  app.n_cores = hwinfo->cores * hwinfo->sockets;

  // Get number of threads to spawn from the command-line argument.
  //
  // If no argument given, use number of cores as the default number of threads.
  //
  app.nt = argc == 1 ? max(1, app.n_cores - 2): atoi(argv[1]);

  app.hw_vec_width_bits =
    __builtin_cpu_supports("avx512f") ? 512 :
    __builtin_cpu_supports("avx2") ? 256 : 128;

  // The number of vector units computed per core below is correct
  // for some LSU ECE lab computers, but is not generally correct.
  //
  app.n_vec_unit_p_core = app.hw_vec_width_bits == 512 ? 2 : 1; // Fix
  app.n_hw_vec_regs = app.hw_vec_width_bits == 512 ? 32 : 16;

  // Note: cpu_core_fastest_hz must be called right after measure_data_bw_Bps.
  app.data_bw_Bps = measure_data_bw_Bps(); app.clock_hz = cpu_core_fastest_hz();

  app.cache_info = cpu_get_cache();

  // Compute Computation to Communication Ratio
  //
  const double data_bw_eps = app.data_bw_Bps / sizeof(elt_t);
  const int n_elt_p_vec = app.hw_vec_width_bits / ( 8 * sizeof(elt_t) );
  const int fp_bwd_flop_cyc =
    min(app.nt,app.n_cores) * app.n_vec_unit_p_core * n_elt_p_vec;
  const double fp_bwd_flop_s = fp_bwd_flop_cyc * app.clock_hz;
  const double comp_comm = fp_bwd_flop_s / max(1.0,data_bw_eps);

  printf
    ("CPU model %s\n", cpu_info_get_model().c_str() );
  printf
    ("Node has %d CPUs, CPU has %d cores, total %d cores. Measured clock %.2f GHz.\n",
     hwinfo->sockets, hwinfo->cores, app.n_cores, 1e-9 * app.clock_hz);
  printf("(This run spawns %d threads.)\n", app.nt);

  printf("L1 Data Per Core %6d kiB, %2d-way, line size %d bytes.\n",
         app.cache_info.l1_core_data.bytes >> 10,
         app.cache_info.l1_core_data.associativity,
         app.cache_info.l1_core_data.line_size_bytes);
  printf("L2 Unif Per Core %6d kiB, %2d-way, line size %d bytes.\n",
         app.cache_info.l2_core.bytes >> 10,
         app.cache_info.l2_core.associativity,
         app.cache_info.l2_core.line_size_bytes);
  printf("L3 Unif Per Chip %6d kiB, %2d-way, line size %d bytes.\n",
         app.cache_info.l3_chip.bytes >> 10,
         app.cache_info.l3_chip.associativity,
         app.cache_info.l3_chip.line_size_bytes);

  printf
    ( "Vector width %d bits, %d vector registers, and\n"
      "%d vector unit(s) per core. (Assumed.)\n",
      app.hw_vec_width_bits, app.n_hw_vec_regs, app.n_vec_unit_p_core );

  printf("Data bw %.3f GB/s (measured). For %d threads:  comp/comm = %.1f\n",
         1e-9*app.data_bw_Bps, app.nt, comp_comm);


  // Compute Matrix * Matrix Products
  //
  //  mm_do(app,256,256,256,true);
  mm_do(app,2048,1024,4096,true);
  mm_do(app,2048,2048,4096,true);
  mm_do(app,2048,4096,4096,true);

  return 0;
}