/// LSU EE 7722 GPU Microarchitecture
//
 /// Spring 2025
 /// Homework 1 -- SOLUTION
 //
 //  Assignment: https://www.ece.lsu.edu/koppel/gp/2025/hw01.pdf
 //
 //  Solution goes throughout this file.
 //
 //  Modify this file only.

#include <stdio.h>
#include <iostream>
#include <assert.h>
#include <ranges>
#include <random>
#include <vector>
#include <algorithm>
#include <gp/misc.h>
#include <thread>
#include <memory>
#include <fstream>
#include <filesystem>
#include <regex>
#include <ptable.h>

using namespace std;

struct Cache_Level_Info
{
  int bytes, associativity, n_sets, line_size_bytes;
};

struct Cache_Info
{
  Cache_Level_Info l1_core_data; // Level 1 Data Cache. Each core has 1.
  Cache_Level_Info l2_core;  // Level 2 Cache. Each core has 1.
  Cache_Level_Info l3_chip;  // Level 3 Cache. Each CPU has 1, shared by cores.
};

typedef float elt_t;

#include "support.h"

class App_Data {
public:
  App_Data(){};

  Cache_Info cache_info;
  double clock_hz;       // Fastest core frequency after exercising.
  double data_bw_Bps;    // Measured off-chip bandwidth.
  int nt;                // Number of threads to use.
  int n_cores;           // Number of cores on CPU(s).
  int hw_vec_width_bits; // Hardware vector width.
  int n_vec_unit_p_core; // Number of vector units per core.
  int n_hw_vec_regs;     // Number of hardware vector registers.
};

struct Thd_Misc_Data {

  int t1v, vec_wid_elts;

  /// Problem 2: Add additional elements if needed.
  ///
  /// SOLUTION -- Problem 1 -- Provide a variable to hold per-thread time.
  //
  double duration_s;

};

class MM_Data {
public:
  const int over = 16;  // Additional rows added to protect overruns.
  MM_Data( App_Data& app, int d1, int d2, int d3,
           int s1a, int s1g, int s2a, int s2b, int s3b, int s3g):
    app( app ), d1( d1 ), d2( d2 ), d3( d3 ),
    s1a( s1a ), s1g( s1g ), s2a( s2a ), s2b( s2b), s3b( s3b ), s3g( s3g ),
    a( (over+d1)*d2 ), b( (d2+over)*d3 ),
    ap( (over+s1a)*s2a ), bp( (over+s2b)*s3b ),
    g_simple( (over+d1)*d3 ), g( (over+d1)*d3 ), gp( (over+s1g)*s3g ),
    table(stdout),
    misc_data(app.nt)
  {}

  const App_Data& app;

  // Array Dimensions
  //
  const int d1, d2, d3;

  // Array Strides
  //
  const int s1a, s1g, s2a, s2b, s3b, s3g;

  // Aligned Array Objects
  //
  // All are allocated so their starting address is a multiple of 64.
  // This makes them suitable for AVX512 vectors, as well as smaller
  // vectors.
  //
  // They can be accessed in the same way as std::vector containers.
  //
  aligned_array<elt_t,64> a, b;    // a  is d1 ⨉ d2,  b is d2 ⨉ d3.
  aligned_array<elt_t,64> ap, bp;  // ap is s1a ⨉ s2a,  bp is s2b ⨉ s3b.
  aligned_array<elt_t,64> g_simple, g;  // Both are d1 ⨉ d3.
  aligned_array<elt_t,64> gp;      // gp is s1g ⨉ s3g

  // Object used for printing neat tables.
  //
  pTable table;

  // Vector for storing per-thread data.
  //
  vector<Thd_Misc_Data> misc_data;

};


void
mm_simple( MM_Data& mmd )
{
  // Compute product of a and b using a simple method.

  // Matrix a: d1 rows, d2 columns.
  // Matrix b: d2 rows, d3 columns.
  // Matrix g: d1 rows, d3 columns.

  // Convenient and Compiler-Friendly Abbreviations
  //
  const elt_t* __restrict__ a = assume_aligned<64>( mmd.a.data() );
  const elt_t* __restrict__ b = assume_aligned<64>( mmd.b.data() );
  elt_t* __restrict__ g_simple = assume_aligned<64>( mmd.g_simple.data() );
  //
  // Note: assume_aligned allows more efficient use of vector
  // instructions. __restrict__ provides more freedom for rearranging
  // load and store instructions.

  const auto d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3;

  // Use OpenMP pragmata to tell compiler to parallelize loop.
  //
#pragma omp parallel for
  for ( int r=0; r<d1; r++ )
    for ( int c=0; c<d3; c++ )
      {
        elt_t e = 0;
        for ( int k=0; k<d2; k++ ) e += a[ r*d2 + k ] * b[ k*d3 + c ];
        g_simple[ r*d3 + c ] = e;
      }
}


template< int t1, int t2, int t3, int vec_wid_bits, int n_vec_registers >
void
mm_tiledr( int tid, MM_Data* mmdp )
{
  // Matrix a: d1 rows, d2 columns.
  // Matrix b: d2 rows, d3 columns.
  // Matrix g: d1 rows, d3 columns.

  MM_Data& mmd = *mmdp;

  /// SOLUTION -- Problem 1
  //
  //  Record start time of this thread.
  //
  double t_start = time_wall_fp();

  const int d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3;
  [[maybe_unused]] const int s1a = mmd.s1a, s2a = mmd.s2a, s3b = mmd.s3b;
  [[maybe_unused]] const int s1g = mmd.s1g, s2b = mmd.s2b, s3g = mmd.s3g;

  // Code won't work if dimension is not a multiple of tile size.
  assert( d1 % t1 == 0 ); assert( d2 % t2 == 0 ); assert( d3 % t3 == 0 );

  constexpr int hw_vec_regs_bytes = vec_wid_bits * n_vec_registers / 8;
  constexpr bool use_vec = t3 >= 8 && n_vec_registers;

  // Note: This vector width is for loading and storing data. It is
  // not always the same as the hardware vector width.
  //
  constexpr int vec_wid_elts =
    min( size_t(t3), vec_wid_bits / ( 8 * sizeof(elt_t) ) );
  mmd.misc_data[tid].vec_wid_elts = vec_wid_elts;

  // Define vector type vf.
  //
  typedef elt_t vf [[ gnu::vector_size( vec_wid_elts * sizeof(elt_t) ) ]];
  //
  // This type, vf, is used to help the compiler do a better job
  // emitting vector (SSE, AVX2, or in the lab AVX512) instructions.

  // Convenient Abbreviations
  //
  const elt_t* a = assume_aligned<64>( mmd.ap.data() );
  const elt_t* b = assume_aligned<64>( mmd.bp.data() );
  elt_t* g = assume_aligned<64>( mmd.gp.data() );
  //
  // Note that this code always uses the padded arrays. This works on
  // the unmodified assignment because the padded array dimensions,
  // s1a,...,s3g, are initially set to the true array dimensions,
  // d1,d2,d3.
  //
  // The assume_aligned template tells the compiler to assume
  // that a, b, and c, are multiples of 64.

  // Abbreviations for Vector Pointers to b and g.
  //
  const vf* bv = assume_aligned<64>( (vf*)(b) );
  vf* gv = assume_aligned<64>( (vf*)(g) );
  //
  // Note: b and bv hold exactly the same address. But the compiler
  // treats b as a pointer to elt_t, while the compiler treats bv as a
  // pointer to vf. Remember that vf is a vector of elt_t elements.

  // Compute number of elements that can be stored using half
  // the number of vector registers.
  //
  constexpr int n_vec_elt = hw_vec_regs_bytes / sizeof( elt_t ) / 2;

  // Choose t1v so that there are enough vector registers
  // for a t1v by t3 tile.
  //
  constexpr int t1v = min( t1, max( 1, n_vec_elt / t3 ) );
  static_assert( t1 / t1v * t1v == t1 );
  mmd.misc_data[tid].t1v = use_vec ? t1v : t1;


  /// Tiled Matrix Multiplication
  //
  for ( int rr = tid * t1;  rr < d1;  rr += mmd.app.nt * t1 )

    /// Point P  (P is for Panel)

    for ( int cc = 0;  cc < d3;  cc += t3 )
      {
        //  Complete t1 ⨉ t3 Output Tile (of g) Computed Below
        //
        // The tile is from row rr to row rr + t1 - 1.  (See r loops below.)
        // The tile is from column cc to column cc + t3 - 1.

        if ( !use_vec )
          {
            // Compute the t1*t3 tile the conventional way, reading
            // and writing scalars.

            // Storage for the t1*t3 tile of g, each element is a scalar.
            //
            elt_t ee[t1][t3]{};
            //
            // Depending on the tile size, this storage will be in the
            // form of registers or memory.

            // One Complete t1 ⨉ t3 Output Tile (of g) Computed Below

            for ( int kk = 0;  kk < d2;  kk += t2 )
              {
                // One Complete t1 ⨉ t3 Computation Tile Computed Below

                /// Point E

                // Loops below multiply one t1*t2 tile by a t2*t3 tile ..
                // .. resulting in a t1*t3 tile ..
                // .. which is accumulated in ee.
                //
                for ( int k = kk;  k < kk + t2;  k++ )
                  for ( int r = rr;  r < rr + t1;  r++ )
                    for ( int c = cc;  c < cc + t3;  c++ )
                      /// SOLUTION -- Problem 3
                      //
                      //  Use s2a in place of d2 and s3b in place of d3.
                      //
                      ee[r-rr][c-cc] += a[ r*s2a + k ] * b[ k*s3b + c ];
              }

            // Write completed t1*t3 tile to g.
            //
            for ( int r = rr;  r < rr + t1;  r++ )
              for ( int c = cc;  c < cc + t3;  c++ )
                /// SOLUTION -- Problem 3
                //
                //  Use s3b in place of d3.
                //
                g[ r*s3g + c ] = ee[r-rr][c-cc];
          }
        else
          {
            // Compute the t1*t3 tile using vec_wid_elts component vectors.
            // This is done to help the compiler generate code using
            // vector (SIMD) instructions. (For the lab computers these
            // will be Intel AVX512 instructions.)
            //


            /// One Complete t1 ⨉ t3 Output Tile (of g) Computed Below

            /// Point C

            for ( int rrr = rr;  rrr < rr + t1;  rrr += t1v )
              {
                // One Complete t1v ⨉ t3 Output Tile (of g) Computed Below

                /// Point V

                // Storage for the tile. Each element is a vector.
                //
                vf ee[t1v][t3/vec_wid_elts]{};
                //
                // This array uses half the available vector registers,
                // leaving the rest of other purposes.

                for ( int kk = 0;  kk < d2;  kk += t2 )

                  for ( int k = kk;  k < kk + t2;  k++ )
                    {
                      // Tell compiler to trust that brow is aligned
                      // for vectors.

                      /// SOLUTION -- Problem 3
                      //
                      //  Use s3b in place of d3.
                      //
                      vf* brow = (vf*)assume_aligned<64>
                        ( &bv[ k*s3b/vec_wid_elts ] );

                      for ( int ccc = cc;  ccc < cc + t3;  ccc += vec_wid_elts )
                        {
                          for ( int r = rrr; r < rrr + t1v; r++ )

                            /// SOLUTION -- Problem 3
                            //
                            //  Use s2a in place of d2.
                            //
                            ee[r-rrr][(ccc-cc)/vec_wid_elts] +=
                              a[ r*s2a + k ] * brow[ ccc / vec_wid_elts ];
                          // Note: In product above each element of
                          // bvec is multiplied by the same value,
                          // a[r*d2+k].
                        }
                    }

                // Write completed t1v * t3 tile to g.
                //
                for ( int r = rrr;  r < rrr + t1v;  r++ )
                  for ( int c = cc;  c < cc + t3;  c += vec_wid_elts )

                    /// SOLUTION -- Problem 3
                    //
                    //  Use s3b in place of d3.
                    //
                    gv[ r*mmd.s3g/vec_wid_elts + c/vec_wid_elts ] =
                      ee[r-rrr][(c-cc)/vec_wid_elts];

              }
          }
      }

  /// SOLUTION -- Problem 1
  //
  //  Compute duration of this thread and write it to per-thread data
  //  structure.
  //
  mmd.misc_data[tid].duration_s = time_wall_fp() - t_start;
}

template< int t1, int t2, int t3 >
void
mm_tiled( int tid, MM_Data* mmdp )
{
  // Determine capacity of vector registers in bytes, and use that as
  // a template parameter for mm_tiledr.

  switch ( mmdp->app.hw_vec_width_bits ){
  case 512: mm_tiledr<t1,t2,t3,512,32>(tid,mmdp); break;
  case 256: mm_tiledr<t1,t2,t3,256,16>(tid,mmdp); break;
  case 128: mm_tiledr<t1,t2,t3,128,16>(tid,mmdp); break;
  default:  mm_tiledr<t1,t2,t3,64,0>(tid,mmdp); break;
  }
}

template< int t1, int t2 = t1, int t3 = t1 >
void
mm_tiled_do( MM_Data& mmd )
{
  const App_Data& app = mmd.app;
  const auto d1 = mmd.d1, d2 = mmd.d2, d3 = mmd.d3;
  const auto nt = app.nt;

  /// Initialize Output Array to Zero
  //
# pragma omp parallel for
  for ( auto& elt: mmd.gp ) elt = 0;

  ///
  /// Launch Threads that Compute Matrix Product
  //
  vector<thread> threads;

  // Start threads ..
  //
  const double t_start_s = time_wall_fp();
  for ( int i=0; i<nt; i++ )
    threads.emplace_back( mm_tiled<t1,t2,t3>, i, &mmd );
  //
  // ..and then wait for threads to finish.
  //
  for ( auto& thd: threads ) thd.join();
  const double t_end_s = time_wall_fp();

  const bool re_check_cpu_clocks = true;
  const double cpu_clock_hz =
    re_check_cpu_clocks ? cpu_core_fastest_hz() : app.clock_hz;

  //
  /// Check Matrix Product
  //
  int n_err = 0, n_err_z = 0;
  double max_err = 0;

  // Compute error tolerance.
  float tol = numeric_limits<elt_t>::epsilon() * d2 * 5;
  for ( int r=0; r<d1; r++ )
    for ( int c=0; c<d3; c++ )
      {
        const elt_t hc = mmd.g_simple[ r*d3 + c]; // Assumed Correct
        //  elt_t hd = mmd.g[ r*d3 + c ];

        /// SOLUTION -- Problem 3
        //
        //  Use s3g in place of d3.
        //
        const elt_t hd = mmd.gp[ r*mmd.s3g + c ];
        const float d = fabs( hd - hc );
        if ( d > max_err ) max_err = d;
        if ( d > tol )
          {
            n_err++;
            if ( hd == 0 ) n_err_z++;
          }
      }

  bool verbose = false;
  if ( n_err || verbose )
    {
      const size_t g_size = d1 * d3;
      printf("Number of errors %d out of %zd. %d zeros\n",
             n_err, g_size, n_err_z);
      printf("Max err = %.9f,  tolerance = %.9f\n", max_err, tol );
    }


  ///
  /// Print Information About Tile and Performance
  ///
  const double data_bw_eps = mmd.app.data_bw_Bps / sizeof(elt_t);
  const int n_elt_p_vec = mmd.app.hw_vec_width_bits / ( 8 * sizeof(elt_t) );
  const int fp_bwd_flop_cyc =
    min(nt,mmd.app.n_cores) * mmd.app.n_vec_unit_p_core * n_elt_p_vec;
  const double fp_bwd_gflop = fp_bwd_flop_cyc * cpu_clock_hz * 1e-9;
  const double duration_s = t_end_s - t_start_s;
  const double n_madd = double(d1) * d2 * d3;
  const double fp_thpt_gflop = 1e-9 * n_madd / duration_s;


  /// SOLUTION -- Problem 1
  //
  //  Compute expected and measured utilization.
  //
  //  In these comments and the name of some variables the word
  //  *panel* refers to a group of t1 rows.
  //
  //  Each iteration of the rr loop in mm_tiledr computes one panel.
  //
  //  To compute utilization, work will be measured in units of panels.
  //
  const int n_panels = d1 / t1;
  assert( n_panels * t1 == d1 ); // Make sure that t1 divides d1 evenly.
  //
  //  n_panels is the total amount of work that needs to be done.
  //
  //  Each thread computes either ⌈ n_panels / nt ⌉ (upper bound) or ⌊
  //  n_panels / nt ⌋ (lower bound) panels.
  //
  //  Compute the larger number, which is the amount of work done
  //  by the busiest threads:
  //
  const int n_panels_per_thread = ( n_panels + nt - 1 ) / nt;
  //
  //  Note that (x+y-1)/y is x/y rounded up.
  //
  //  To compute the expected utilization divide the total work by the
  //  amount of work done if every thread had as much work as the
  //  busiest.
  //
  double util_expected = double(n_panels) / ( n_panels_per_thread * nt );
  //
  //  Note that utilization is bad when nt > n_panels because in that
  //  case some threads do no work at all.

  //  Next compute measured utilization.
  //
  //  First compute the sum of the per-thread durations collected in
  //  mm_tiledr:
  //
  double dur_thds_total_s = 0;
  for ( auto& md: mmd.misc_data ) dur_thds_total_s += md.duration_s;
  //
  //  Remember that duration_s is the time period from just before the
  //  first thread is spawned until after the last thread finishes.
  //
  //  Ideally the value of mmd.misc_data[i].duration_s for each thread
  //  is equal to duration_s. In that situation we'd like the measured
  //  utilization to be 1 (and it would be with the way its computed
  //  below).
  //
  //  In reality some threads finish earlier, and when n_panels < nt, some
  //  threads have no work at all. Also, mmd.misc_data[i].duration_s does
  //  not include the time needed to start a thread or to shut it down.
  //
  //  Since dur_thds_total_s is a sum of each
  //  mmd.misc_data[i].duration_s, measured utilization will be
  //  computed by dividing dur_thds_total_s by duration_s * nt, which
  //  is what dur_thds_total_s would be in the ideal case.
  //
  double util_measured = dur_thds_total_s / ( duration_s * nt );

  const size_t point_e_cache_size_min_elts =
    t1 * t3  // For ee (though registers might be used for ee)
    + t3     // For b[ k*d3 + c ].
    + 1;     // For a[ r*d2 + k ].

  const size_t point_e_n_executions = ( d1 / t1 ) * ( d2 / t2 ) * ( d3 / t3 );
  const size_t point_e_xfer_elts =
    t1 * t3 * ( d1 / t1 ) * ( d3 / t3 )  // Yes, it's just d1 * d3.
    + point_e_n_executions * ( t1 * t2 + t2 * t3 );


  // The following values may be useful for solving Problem 2.

  [[ maybe_unused ]] const int t1v = mmd.misc_data[0].t1v;
  [[ maybe_unused ]] const int vec_wid_elts = mmd.misc_data[0].vec_wid_elts;



  /// SOLUTION -- Problem 2
  //
  //  Most of the solution IS here, but data will need to
  //  be collected elsewhere.

  // [✔] For Point V, find the minimum cache size needed for perfect
  // reuse (so there is at most one miss per access to a, b).
  //
  size_t point_v_cache_size_min_elts =
    t1v * t3        // ee
    + t1v           // a
    + vec_wid_elts; // b

  // [✔] Compute the number of elements transferred to or from memory
  // assuming perfect reuse at point V but no reuse elsewhere. Account
  // for all threads.
  //
  // First compute the number of time Point V will be reached, accounting
  // for all threads:
  //
  size_t n_point_v = d1/t1v * d3/t3;
  //
  // The amount transferred includes the output array, plus
  // the amount accessed in Point V times the number of times it
  // is executed:
  //
  size_t point_v_xfer_elts =
    d1 * d3
    + n_point_v * ( t1v * d2     // a
                    + d2 * t3 ); // b

  // [✔] For Point C, find the minimum cache size needed for perfect
  // reuse (so there is at most one miss per access to a, b).
  //
  size_t point_c_cache_size_min_elts =
   t1v * t3   // ee
   + t1v      // a
   + d2 * t3; // b

  // [✔] Compute the number of elements transferred to or from memory
  // assuming perfect reuse at point C but no reuse elsewhere. Account
  // for all threads.
  //
  size_t n_point_c = d1/t1 * d3/t3;
  size_t point_c_xfer_elts =
    d1 * d3
    + n_point_c * ( t1 * d2      // a
                    + d2 * t3 ); // b

  // [✔] For Point P, find the minimum cache size needed for perfect
  // reuse (so there is at most one miss per access to a, b).
  //
  size_t point_p_cache_size_min_elts =
   t1v * t3   // ee
   + t1 * d2  // a
   + d2 * t3; // b

  // [✔] Compute the number of elements transferred to or from memory
  // assuming perfect reuse at point P but no reuse elsewhere. Account
  // for all threads.
  //
  size_t n_point_p = d1 / t1;
  size_t point_p_xfer_elts =
    d1 * d3
    + n_point_p * ( t1 * d2      // a
                    + d2 * d3 ); // b


  const double data_limit_elts = data_bw_eps * duration_s;

  pTable_Row row(mmd.table);

  mmd.table.header_span_start("Tile");
  mmd.table.entry("t1", "%3d", t1);
  mmd.table.entry("t2", "%2d", t2);
  mmd.table.entry("t3", "%3d", t3);
  mmd.table.header_span_end();
  mmd.table.entry( "Dur/ms", "%7.2f", duration_s * 1000 );
  mmd.table.header_span_start("FP Throughput");
  mmd.table.entry( "GFLOPs", "%7.2f", fp_thpt_gflop );
  mmd.table.entry( "% Peak", "%7.2f", 100 * fp_thpt_gflop / fp_bwd_gflop );
  mmd.table.header_span_end();
  if ( re_check_cpu_clocks )
    {
      mmd.table.header_span_start("Clk");
      mmd.table.entry( "MHz", "%4.0f", cpu_clock_hz * 1e-6 );
      mmd.table.header_span_end();
    }


  mmd.table.header_span_start("Utiliz");
  mmd.table.entry( "Exp", "%3.0f", 100.0 * util_expected );
  mmd.table.entry( "Mes", "%3.0f", 100.0 * util_measured );
  mmd.table.header_span_end();

  const size_t l2_size_elts = app.cache_info.l2_core.bytes / sizeof(elt_t);
  [[ maybe_unused ]]
  const size_t l3_size_elts = app.cache_info.l3_chip.bytes / sizeof(elt_t);

  mmd.table.header_span_start("E");
  mmd.table.entry( "%L2", "%3.0f",
                   1e2 * point_e_cache_size_min_elts / l2_size_elts );
  mmd.table.entry( "%BW", "%3.0f", 1e2 * point_e_xfer_elts / data_limit_elts );
  mmd.table.header_span_end();

  mmd.table.header_span_start("V");
  mmd.table.entry( "%L2", "%3.0f",
                   1e2 * point_v_cache_size_min_elts / l2_size_elts );
  mmd.table.entry( "%BW", "%4.0f", 1e2 * point_v_xfer_elts / data_limit_elts );
  mmd.table.header_span_end();

  mmd.table.header_span_start("C");
  mmd.table.entry( "%L2", "%3.0f",
                   1e2 * point_c_cache_size_min_elts / l2_size_elts );
  mmd.table.entry( "%BW", "%3.0f", 1e2 * point_c_xfer_elts / data_limit_elts );
  mmd.table.header_span_end();

  mmd.table.header_span_start("P");
  mmd.table.entry( "%L2", "%3.0f",
                   1e2 * point_p_cache_size_min_elts / l2_size_elts );
  mmd.table.entry( "%BW", "%3.0f", 1e2 * point_p_xfer_elts / data_limit_elts );
  mmd.table.header_span_end();

}

void
mm_do( App_Data& app, int d1, int d2, int d3, bool pad = false )
{
  /// Multiply  d1 * d2 matrix by d2 * d3 Matrix Using a Variety of Tilings

  // Set strides equal to dimensions. This results in a matrix without padding.
  //
  int s1a = d1, s2a = d2, s3b = d3;
  int s1g = d1, s2b = d2, s3g = d3;

  if ( pad )
    {
      // Set strides to avoid aliasing in set-associative cache.

      // Helpful Information

      // Line size. Assume all levels use the size line size.
      //
      [[ maybe_unused ]]
      const int line_size_bytes = app.cache_info.l3_chip.line_size_bytes;
      //
      // See Cache_Info structure (near top of file) for more information.

      /// Problem 3
      //
      // Some of solution goes here.

      /// SOLUTION -- Problem 3
      //
      //  Compute the number of cache lines occupied by a d2-element
      //  row.
      //
      const int line_size_elts = line_size_bytes / sizeof(elt_t);
      const int n_lines_d2 = d2 / line_size_elts;
      //
      //  Round up to an odd number of lines.
      //
      int n_pri_d2 = n_lines_d2 | 1;
      //
      //  Then set the stride to the number of elements that
      //  can fit into n_pri_d2 lines.
      //
      s2a = n_pri_d2 * line_size_elts;
      //
      //  Do the same thing for dimension d3.
      //
      const int n_lines_d3 = d3 / line_size_elts;
      int n_pri_d3 = n_lines_d3 | 1;
      s3b = s3g = n_pri_d3 * line_size_elts;
    }

  // Construct a Matrix Multiply Data Object
  //
  MM_Data mmd(app,d1,d2,d3,s1a,s1g,s2a,s2b,s3b,s3g);

  // Fill a and b matrices with random numbers.
  //
  ranges::generate(mmd.a,rand_pm1);
  ranges::generate(mmd.b,rand_pm1);

  ///  Problem 3
  /// SOLUTION -- Problem 3
  //
  //  Copy data from the a and b matrices to their padded counterparts.
  //
  //  [✔] Modify the code for strided storage in ap and bp.
  //
  //  Replace d2 with s2a when writing ap and replace d3 with s3b when
  //  writing bp.
  //
  for ( int r=0; r<d1; r++ )
    for ( int c=0; c<d2; c++ )
      mmd.ap[ r * s2a + c ] =  mmd.a[ r * d2 + c ];
  for ( int r=0; r<d2; r++ )
    for ( int c=0; c<d3; c++ )
      mmd.bp[ r * s3b + c ] =  mmd.b[ r * d3 + c ];

  const double t_start_s = time_wall_fp();

  // Compute matrix product on unpadded arrays using a simple method.
  //
  mm_simple( mmd );

  const double t_simple_end_s = time_wall_fp();
  const double dur_simple_s = t_simple_end_s - t_start_s;
  printf("\nMatrix %d ⨉ %d  %d ⨉ %d.  Duration simple: %.3f ms\n",
         d1, d2, d2, d3, dur_simple_s * 1000 );
  printf("Stride %d ⨉ %d  %d ⨉ %d  = %d x %d. Padding %sRequested\n",
         s1a, s2a, s2b, s3b, s1g, s3g,
         pad ? "" : "Not ");

  int64_t n_madd = int64_t(d1) * d2 * d3;

  // To save time skip the smaller tilings when matrix is large.
  if ( n_madd < int64_t(2048) * 2048 * 2048 )
    {
      mm_tiled_do<1,16,1>( mmd );
      mm_tiled_do<4,16,4>( mmd );
    }

  mm_tiled_do<8,16,8>( mmd );
  mm_tiled_do<8,16,16>( mmd );
  mm_tiled_do<8,16,32>( mmd );
  mm_tiled_do<8,16,64>( mmd );
  mm_tiled_do<16,16,8>( mmd );
  mm_tiled_do<16>( mmd );
  mm_tiled_do<16,16,32>( mmd );
  mm_tiled_do<16,16,64>( mmd );
  mm_tiled_do<16,16,128>( mmd );
  mm_tiled_do<16,16,256>( mmd );
  mm_tiled_do<32,16,8>( mmd );
  mm_tiled_do<32,16,16>( mmd );
  mm_tiled_do<32,16,32>( mmd );
  mm_tiled_do<32,16,64>( mmd );
  mm_tiled_do<32,16,128>( mmd );
  mm_tiled_do<64,16,8>( mmd );
  mm_tiled_do<64,16,16>( mmd );
  mm_tiled_do<64,16,32>( mmd );
  mm_tiled_do<64,16,64>( mmd );
  mm_tiled_do<64,16,128>( mmd );
  mm_tiled_do<128,16,8>( mmd );
  mm_tiled_do<128,16,16>( mmd );
  mm_tiled_do<128,16,32>( mmd );
  mm_tiled_do<128,16,64>( mmd );
  mm_tiled_do<128,16,128>( mmd );
  mm_tiled_do<256,16,8>( mmd );
  mm_tiled_do<256,16,16>( mmd );
  mm_tiled_do<256,16,32>( mmd );
  mm_tiled_do<256,16,64>( mmd );
  mm_tiled_do<256,16,128>( mmd );

}


int
main( int argc, char **argv )
{
  App_Data app;

  app.n_cores = thread::hardware_concurrency();
  //
  // Note: This is only correct when SMT (hyperthreading) is off.

  // Get number of threads to spawn from the command-line argument.
  //
  // If no argument given, use number of cores as the default number of threads.
  //
  app.nt = argc == 1 ? app.n_cores : atoi(argv[1]);

  app.hw_vec_width_bits =
    __builtin_cpu_supports("avx512f") ? 512 :
    __builtin_cpu_supports("avx2") ? 256 : 128;

  // The number of vector units computed per core below is correct
  // for some LSU ECE lab computers, but is not generally correct.
  //
  app.n_vec_unit_p_core = app.hw_vec_width_bits == 512 ? 2 : 1; // Fix
  app.n_hw_vec_regs = app.hw_vec_width_bits == 512 ? 32 : 16;

  // Note: cpu_core_fastest_hz must be called right after measure_data_bw_Bps.
  app.data_bw_Bps = measure_data_bw_Bps(); app.clock_hz = cpu_core_fastest_hz();

  app.cache_info = cpu_get_cache();

  // Compute Computation to Communication Ratio
  //
  const double data_bw_eps = app.data_bw_Bps / sizeof(elt_t);
  const int n_elt_p_vec = app.hw_vec_width_bits / ( 8 * sizeof(elt_t) );
  const int fp_bwd_flop_cyc =
    min(app.nt,app.n_cores) * app.n_vec_unit_p_core * n_elt_p_vec;
  const double fp_bwd_flop_s = fp_bwd_flop_cyc * app.clock_hz;
  const double comp_comm = fp_bwd_flop_s / max(1.0,data_bw_eps);

  printf
    ("CPU model %s\n", cpu_info_get_model().c_str() );
  printf
    ("CPU has %d cores at %.2f GHz.  (This run spawns %d threads.)\n",
     app.n_cores, 1e-9 * app.clock_hz, app.nt);

  printf("L1 Data Per Core %6d kiB, %2d-way, line size %d bytes.\n",
         app.cache_info.l1_core_data.bytes >> 10,
         app.cache_info.l1_core_data.associativity,
         app.cache_info.l1_core_data.line_size_bytes);
  printf("L2 Unif Per Core %6d kiB, %2d-way, line size %d bytes.\n",
         app.cache_info.l2_core.bytes >> 10,
         app.cache_info.l2_core.associativity,
         app.cache_info.l2_core.line_size_bytes);
  printf("L3 Unif Per Chip %6d kiB, %2d-way, line size %d bytes.\n",
         app.cache_info.l3_chip.bytes >> 10,
         app.cache_info.l3_chip.associativity,
         app.cache_info.l3_chip.line_size_bytes);

  printf
    ( "Vector width %d bits, %d vector registers, and\n"
      "%d vector unit(s) per core. (Assumed.)\n",
      app.hw_vec_width_bits, app.n_hw_vec_regs, app.n_vec_unit_p_core );

  printf("Data bw %.3f GB/s (measured). For %d threads:  comp/comm = %.1f\n",
         1e-9*app.data_bw_Bps, app.nt, comp_comm);


  // Compute Matrix * Matrix Products
  //
  mm_do(app,256,256,256);
  mm_do(app,1024,1024,1024);
  mm_do(app,1024,1024,1024,true);
  mm_do(app,2048,2048,4096);
  mm_do(app,2048,2048,4096,true);

}