/// LSU EE 4702
//
 /// CUDA Demo 04
 //
 //  This code demonstrates different methods of all-to-all access in
 //  CUDA. such as the accesses to array in the code below:
 //
 //  for ( int i=0; i<n; i++ ) for ( j=0; j<n; j++ ) sum ++ array[i] * array[j];
 //
 //  See routines time_step_intersect_1 and time_step_intersect_2 in
 //  demo-cuda-04-acc-pat-cuda.cu. Instead of array, this code
 //  accesses helix_position.


 ///  Note: Requires OpenGL 4.5 and CUDA


#if 0
/// Code Outline

 /// Physics Model
//
 /// Wire Segment
//
//   Shape: cylinder.



#endif

///  Keyboard Commands
 //
 /// Object (Eye, Light, Ball) Location or Push
 //   Arrows, Page Up, Page Down
 //   Will move object or push end of spring, depending on mode:
 //   'e': Move eye.
 //   'b': Grab top (last segment of) of spring.
 //   'l': Move light.
 //   'B': Release top (last segment of) spring.
 //
 /// Eye Direction
 //   Home, End, Delete, Insert
 //   Turn the eye direction.
 //   Home should rotate eye direction up, End should rotate eye
 //   down, Delete should rotate eye left, Insert should rotate eye
 //   right.  The eye direction vector is displayed in the upper left.

 /// Simulation Options
 //  (Also see variables below.)
 //
 //  'a'    Switch between CPU and GPU physics.
 //  'i'    Cycle three methods of interpenetration handling:
 //           GPU Physics Mode: None, GPU-Prob-1, GPU-Prob-2.
 //           CPU Physics Mode: None, CPU, CPU (last two equivalent).
 //  's'    Toggle alternative use of shared memory.

 //  'b'    Grab free end (last segment) of spring. Once grabbed, can
 //         be controlled with arrow keys (see above).
 //  'B'    Release spring.
 //  'p'    Pause simulation.
 //  'g'    Toggle gravity on and off.
 //  'y'    Toggle state of Boolean opt_tryout1, use it as you like.
 //  'Y'    Toggle state of Boolean opt_tryout2, use it as you like.

 //  'F11'  Change size of text.
 //  'F12'  Write screenshot to file.

 /// Variables
 //   Selected program variables can be modified using the keyboard.
 //   Use "Tab" to cycle through the variable to be modified, the
 //   name of the variable is displayed next to "VAR" on the bottom
 //   line of green text.

 //  'Tab' Cycle to next variable.
 //  '`'   Cycle to previous variable.
 //  '+'   Increase variable value.
 //  '-'   Decrease variable value.
 //
 //  -- Documented Variables --
 //
 //     Spring Constant (opt_spring_constant)
 //       Spring constant used for the spring.
 //       Higher values makes the spring tighter, but at some maximum
 //       the simulation will become unstable.
 //
 //     Seg / Block (interp_seg_per_block)
 //       The number of segments per CUDA block that the interpenetration
 //       routine will use.



#define GL_GLEXT_PROTOTYPES
#define GLX_GLXEXT_PROTOTYPES

#define GL_GLEXT_LEGACY
#include <GL/gl.h>
#include <GL/glext.h>

#include <GL/glu.h>
#include <GL/freeglut.h>

// Include files provided for this course.
//
#include <gp/util.h>
#include <gp/glextfuncs.h>
#include <gp/coord.h>
#include <gp/shader.h>
#include <gp/pstring.h>
#include <gp/misc.h>
#include <gp/gl-buffer.h>
#include <gp/texture-util.h>
#include "shapes.h"
#include <gp/colors.h>

#include <gp/cuda-util.h>

#include "demo-cuda-04-acc-pat.cuh"

// Define storage buffer binding indices and attribute locations.
//
#define UNIF_IDX_BULDGE_LOC 0
#define UNIF_IDX_BULDGE_DIST_THRESH 1
#define UNIF_IDX_WIRE_RADIUS 2
#define ATTR_IDX_HELIX_INDICES 1
#define SB_COORD 1
#define SB_LZ 2
#define SB_LY 3


enum GPU_Physics_Method
  { GP_cpu, GP_cuda_1, GP_cuda_2, GPU_cuda_ez,
    GP_ENUM_SIZE };
const char* const gpu_physics_method_str[] =
  { "CPU", "CUDA-M1", "CUDA-M2", "CUDA-EZ" };

struct Wire_Segment {
  pCoor position;
  pQuat orientation;

  pVect velocity;
  pVect omega;

  // Local-space axes.
  pVect lx, ly, lz;

  // Global-space coordinates of neighbor.
  pVect nx, ny, nz;
  pCoor pos_left, pos_right;

  pVect force;
  pVect torque;

};



class World {
public:
  World(pOpenGL_Helper &fb):ogl_helper(fb){init();}
  void init();
  static void render_w(void *moi){ ((World*)moi)->frame_callback(); }
  void frame_callback();
  void physics_advance();
  void render();
  void cb_keyboard();
  void modelview_update();

  // Class providing utilities, such as showing text.
  //
  pOpenGL_Helper& ogl_helper;

  // Class for easy keyboard control of variables.
  //
  pVariable_Control variable_control;

  // Class for showing frame timing.
  //
  pFrame_Timer frame_timer;

  pCoor light_location;
  float opt_light_intensity;

  pCoor helix_location;
  Wire_Segment *helix_end_ws;  // Last (end) segment of helix.
  bool opt_end_fixed;   // If true position of "end" of helix is fixed in space.
  float helix_radius;   // Radius of helix.
  float wire_radius;    // Radius of wire forming helix.
  int seg_per_helix_revolution;
  int seg_per_wire_revolution;
  int revolutions_per_helix;  // Number of times helix wraps around.

  bool coords_stale;
  bool buffer_objects_stale;

  vector<int> helix_indices;
  GLuint helix_indices_bo;

  // Coordinates of helix. (Helix runs through center of wire.)
  //
  vector<pCoor> helix_coords;
  GLuint helix_coords_bo;
  int helix_coords_size;

  vector<pCoor> helix_lz, helix_ly;
  GLuint helix_lz_bo;
  GLuint helix_ly_bo;

  vector<int> wire_surface_indices;
  GLuint wire_surface_indices_bo;

  pShader *sp_fixed;          // Fixed functionality.
  pShader *sp_geo_shade;

  enum { MI_Eye, MI_Light, MI_Ball, MI_Ball_V, MI_COUNT } opt_move_item;

  pCoor eye_location;
  pVect eye_direction;
  pMatrix modelview;

  GLuint texture_id_syllabus;

  // Physics
  double world_time, last_frame_wall_time;
  float delta_t;
  bool opt_pause;
  bool opt_single_frame;      // Simulate for one frame.
  bool opt_single_time_step;  // Simulate for one time step.

  vector<Wire_Segment> wire_segments;

  pVect gravity_accel;
  pQuat helix_rn_trans;
  float helix_seg_hlength;
  float helix_seg_hlength_inv;
  float opt_spring_constant;
  float opt_helix_density;
  bool opt_gravity;
  int opt_use_shared;
  bool opt_test;
  bool opt_tryout1, opt_tryout2;
  float helix_seg_mass;
  float helix_seg_mass_inv;
  float helix_seg_ma_axis;
  float helix_seg_ma_perp_axis;
  void helix_apply_force_at
  (Wire_Segment *ws, pCoor pos, pVect dir, float magnitude);
  pVect helix_get_vel_at(Wire_Segment *ws, pCoor pos);

  float helix_spring_energy;

  void time_step_cpu();

  // CUDA Physics

  int opt_physics_method;
  bool opt_ez_method_safety;  // If true use super-slow ez kernel.
  int ez_prev_physics;        // Physics method in use when switched to ez.
  bool cuda_initialized;
  bool cuda_constants_stale;
  cudaEvent_t frame_start_ce, frame_stop_ce;
  cudaEvent_t interp_start_ce, interp_stop_ce;
  int block_size_max;
  int interp_seg_per_block;
  cudaDeviceProp cuda_prop;  // Properties of cuda device (GPU, cuda version).
  GPU_Info gpu_info;
  int cuda_time_step_count;
  int64_t time_inter_cyc;

  int inter_block_size, inter_nblocks;
  int inter_dynamic_sm_size_bytes;

  vector<pCoor> helix_position;
  vector<pVect> helix_velocity;
  vector<pQuat> helix_orientation;
  vector<pVect> helix_omega;

  pCoor *d_helix_position;
  pVect *d_helix_velocity;
  pQuat *d_helix_orientation;
  pVect *d_helix_omega;

  void cuda_init();
};

void
World::init()
{
  coords_stale = true;

  seg_per_helix_revolution = 80;
  seg_per_wire_revolution = 20;
  revolutions_per_helix = 8;

  const int num_seg = revolutions_per_helix * seg_per_helix_revolution;

  opt_spring_constant = num_seg / 640.0 * 900000;
  gravity_accel = pVect(0,-.98,0);
  opt_helix_density = 1;

  //  variable_control.insert(seg_per_helix_revolution,"Seg Per Helix Rev");
  //  variable_control.insert(seg_per_wire_revolution,"Seg Per Wire Rev");

  buffer_objects_stale = true;
  helix_coords_bo = 0;
  wire_surface_indices_bo = 0;

  eye_location = pCoor(-1.8,6.9,23.4);
  eye_direction = pVect(0,0,-1);

  opt_light_intensity = 1.5;
  light_location = pCoor(12.2,4.0,6.9);

  opt_end_fixed = false;
  helix_location = pCoor(0,0,-5);
  helix_radius = 5;
  wire_radius = 0.6;
  variable_control.insert(wire_radius,"Wire Radius");

  //  variable_control.insert(opt_light_intensity,"Light Intensity");

  opt_move_item = MI_Eye;

  texture_id_syllabus = pBuild_Texture_File("gpup.png",false,255);

  // Declared like a programmable shader, but used for fixed-functionality.
  //
  sp_fixed = new pShader();

  const char* const file = "demo-cuda-04-acc-pat-shdr.cc";

  sp_geo_shade = new pShader
    (file,           // File holding shader program.
     "vs_main_helix();",      // Name of vertex shader main routine.
     "gs_main_helix();",
     "fs_main_phong();"       // Name of fragment shader main routine.
     );

  modelview_update();

  frame_timer.work_unit_set("Steps / s");
  opt_physics_method = GP_cuda_1;


  variable_control.insert(opt_spring_constant,"Spring Constant");

  delta_t = 1.0 / ( 30 * 200 );
  opt_pause = false;
  opt_single_time_step = false;
  opt_single_frame = false;

  world_time = 0;
  last_frame_wall_time = time_wall_fp();
  opt_gravity = true;
  opt_use_shared = false;
  opt_test = false;
  opt_tryout1 = false;
  opt_tryout2 = false;

  interp_seg_per_block = 32;
  variable_control.insert_power_of_2
    (interp_seg_per_block,"Seg / Block",1,64);

  variable_control.insert_power_of_2
    (inter_block_size, "Interp Block Size", 32, 1024);
  cuda_init();
}

void
World::modelview_update()
{
  pMatrix_Translate center_eye(-eye_location);
  pMatrix_Rotation rotate_eye(eye_direction,pVect(0,0,-1));
  modelview = rotate_eye * center_eye;
}

void
World::frame_callback()
{
  // This routine called whenever window needs to be updated.

  // Get any waiting keyboard commands.
  //
  cb_keyboard();

  physics_advance();

  render();
}

void
World::physics_advance()
{
  if ( wire_segments.size() == 0 ) return;

  // Run the physical simulation if not paused.
  //
  frame_timer.phys_start();
  const double time_now = time_wall_fp();

  // Update data on GPU if necessary.
  //
  if ( cuda_constants_stale )
    {
      cuda_constants_stale = false;
      const size_t phys_helix_segments = wire_segments.size();

      // Allocate storage and write GPU variables with addresses of storage.
      //
      if ( helix_position.size() != phys_helix_segments )
        {
          helix_position.resize(phys_helix_segments);
          helix_velocity.resize(phys_helix_segments);
          helix_orientation.resize(phys_helix_segments);
          helix_omega.resize(phys_helix_segments);
          if ( d_helix_position )
            {
              CE( cudaFree( d_helix_position ) );
              CE( cudaFree( d_helix_velocity ) );
              CE( cudaFree( d_helix_orientation ) );
              CE( cudaFree( d_helix_omega ) );
            }
          CE( cudaMalloc
              ( &d_helix_position,
                phys_helix_segments * sizeof(d_helix_position[0]) ) );
          CE( cudaMalloc
              ( &d_helix_velocity,
                phys_helix_segments * sizeof(d_helix_velocity[0]) ) );
          CE( cudaMalloc
              ( &d_helix_orientation,
                phys_helix_segments * sizeof(d_helix_orientation[0]) ) );
          CE( cudaMalloc
              ( &d_helix_omega,
                phys_helix_segments * sizeof(d_helix_omega[0]) ) );

          cuda_array_addrs_set
            ((float4*)d_helix_position, (float3*)d_helix_velocity,
             (float4*)d_helix_orientation, (float3*)d_helix_omega);
        }

      // Copy data from our CPU-friendly wire_segments array of
      // structures to GPU-friendly individual arrays. Note that this
      // loop copies data from one part of CPU memory to
      // another. Since the ultimate destination this extra CPU->CPU
      // copying is wasteful, but that's acceptable because it only
      // needs to be done rarely, at start up and when the user
      // changes a variable.
      //
#pragma omp parallel for
      for ( size_t i=0; i<phys_helix_segments; i++ )
        {
          Wire_Segment* const ws = &wire_segments[i];
          helix_position[i] = ws->position;
          helix_velocity[i] = ws->velocity;
          helix_orientation[i] = ws->orientation;
          helix_omega[i] = ws->omega;
        }

      // Copy data from CPU to GPU.
      //
#define TOCUDA(var) \
      CE( cudaMemcpy( d_##var, var.data(), var.size() * sizeof(var[0]), \
                      cudaMemcpyHostToDevice ) );
      TOCUDA(helix_position);
      TOCUDA(helix_velocity);
      TOCUDA(helix_orientation);
      TOCUDA(helix_omega);

      // Write scalar variables to a structure, and then
      // send structure to GPU.
      //
      Helix_Info hi;
      const float delta_t_mass_inv = delta_t * helix_seg_mass_inv;
      const float delta_t_ma_axis = delta_t / helix_seg_ma_axis;
      const float delta_t_ma_perp_axis = delta_t / helix_seg_ma_perp_axis;

#define SET(m) hi.m = m;
      SET(opt_gravity);
      SET(opt_tryout1);
      SET(opt_tryout2);
      SET(opt_end_fixed);
      SET(opt_use_shared);
      SET(opt_spring_constant);
      SET(delta_t);
      SET(delta_t_mass_inv);
      SET(delta_t_ma_axis);
      SET(delta_t_ma_perp_axis);
      SET(phys_helix_segments);
      SET(wire_radius);
      SET(helix_seg_hlength);
      SET(helix_seg_mass_inv);
      SET(gravity_accel);
      SET(helix_rn_trans);
#undef SET
      TO_DEV(hi);

      Timing_Data timing_data;
      timing_data.inter_time = 0;
      timing_data.inter_count = 0;
      TO_DEV(timing_data);
    }


  if ( !opt_pause || opt_single_frame || opt_single_time_step )
    {
      const int phys_helix_segments = wire_segments.size();

      // Use a small block size under the assumption that
      // phys_helix_segments is small.
      const int block_size = 128;  // Not for intersection code.
      const int grid_size =
        ( phys_helix_segments + block_size - 1 ) / block_size;

      if ( opt_physics_method != GP_cpu )
        CE(cudaEventRecord(frame_start_ce,0));

      // Evolve simulated state in time steps of duration delta_t until
      // simulated time reaches current time.
      //

      // Amount of time since the user saw the last frame.
      //
      const double wall_delta_t = time_now - last_frame_wall_time;

      // Compute amount by which to advance simulation state for this frame.
      //
      const double duration =
        opt_single_time_step ? delta_t :
        opt_single_frame || ogl_helper.animation_record
        ? ogl_helper.frame_period :
        wall_delta_t;

      const double wall_time_deadline =
        time_wall_fp() + ogl_helper.frame_period;

      const double world_time_target = world_time + duration;
      int iter_count = 0;

      inter_nblocks = phys_helix_segments/interp_seg_per_block;
      const int thd_per_a = inter_block_size / interp_seg_per_block;
      inter_dynamic_sm_size_bytes =
        inter_block_size * sizeof(pVect) +
        ( opt_use_shared ? thd_per_a * sizeof(pVect) : 0 );

      while ( world_time < world_time_target )
        {
          iter_count++;
          if ( opt_physics_method != GP_cpu )
            {
              cuda_time_step_count++;
              time_step_launch(grid_size,block_size);
              time_step_intersect_launch
                (inter_nblocks, inter_block_size,
                 opt_physics_method, inter_dynamic_sm_size_bytes);
              time_step_update_pos_launch(grid_size,block_size);
            }
          else
            time_step_cpu();
          world_time += delta_t;
          if ( time_wall_fp() > wall_time_deadline ) break;
        }

      frame_timer.work_amt_set(iter_count);

      // Reset these, just in case they were set.
      //
      opt_single_frame = opt_single_time_step = false;

      if ( opt_physics_method != GP_cpu )
        {
          // Collect amount of time CUDA took to compute this frame.
          //

          Timing_Data timing_data;
          FROM_DEV(timing_data);
          time_inter_cyc =
            timing_data.inter_time / max(1,timing_data.inter_count);

          CE(cudaEventRecord(frame_stop_ce,0));
          CE(cudaEventSynchronize(frame_stop_ce));
          float cuda_time = -1.1;
          CE(cudaEventElapsedTime(&cuda_time,frame_start_ce,frame_stop_ce));
          frame_timer.cuda_frame_time_set(cuda_time);

          // Copy data back to CPU. Some or all of this copying can be
          // eliminated. As things are currently written, there is no
          // need to copy back the velocity and orientation unless
          // simulation is switching to CPU physics. If the same GPU
          // were used for physics and graphics, there would also be
          // no need to copy back the position and orientation. (A
          // CUDA global memory can be mapped to an OpenGL buffer
          // object.)
          //
#define FROMCUDA(var) \
          CE( cudaMemcpy( var.data(), d_##var, var.size() * sizeof(var[0]), \
                          cudaMemcpyDeviceToHost ) );
          FROMCUDA(helix_position);
          FROMCUDA(helix_velocity);
          FROMCUDA(helix_orientation);
          FROMCUDA(helix_omega);

#pragma omp parallel for
          for ( int i=0; i<phys_helix_segments; i++ )
            {
              Wire_Segment* const ws = &wire_segments[i];
              ws->position = helix_position[i];
              ws->velocity = helix_velocity[i];
              ws->orientation = helix_orientation[i];
              ws->omega = helix_omega[i];
            }
        }

      frame_timer.phys_end();

      // Compute vectors used for drawing wire, and copy helix coordinate.
      //
#pragma omp parallel for
      for ( int i=0; i<phys_helix_segments; i++ )
        {
          Wire_Segment* const ws = &wire_segments[i];

          // Copy helix coordinates to another array before sending
          // back to GPU. This step could easily be eliminated since
          // the coordinates are already linearized in helix_position.
          //
          helix_coords[i] = ws->position;

          // Compute vectors used for drawing wire.
          //
          pMatrix_Rotation c_rot(ws->orientation);
          helix_lz[i] = c_rot.cv(2);
          helix_ly[i] = c_rot.cv(1);
        }

      // Send data back to GPU.

      glBindBuffer(GL_ARRAY_BUFFER, helix_coords_bo);
      glBufferData
        (GL_ARRAY_BUFFER,
         helix_coords.size()*4*sizeof(helix_coords[0]),
         helix_coords.data(), GL_STATIC_DRAW);
      glBindBuffer(GL_ARRAY_BUFFER, helix_lz_bo);
      glBufferData
        (GL_ARRAY_BUFFER,
         helix_coords_size*sizeof(helix_lz[0]),
         helix_lz.data(), GL_STATIC_DRAW);
      glBindBuffer(GL_ARRAY_BUFFER, helix_ly_bo);
      glBufferData
        (GL_ARRAY_BUFFER,
         helix_coords_size*sizeof(helix_ly[0]),
         helix_ly.data(), GL_STATIC_DRAW);
    }

  last_frame_wall_time = time_now;

  frame_timer.phys_end();
}

void
World::render()
{
  // Start a timer object used for tuning this code.
  //
  frame_timer.frame_start();

  glClearColor(0,0,0,0);
  glClearDepth(1.0);
  glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT );

  glEnable(GL_DEPTH_TEST);
  glDepthFunc(GL_LESS);

  glShadeModel(GL_SMOOTH);

  ogl_helper.fbprintf("%s\n",frame_timer.frame_rate_text_get());

  if ( opt_physics_method != GP_cpu )
    {
      const int bl_p_sm_max =
        gpu_info.get_max_active_blocks_per_mp
        (opt_physics_method,inter_block_size,inter_dynamic_sm_size_bytes);
      const int wp_p_bl = ( inter_block_size + 31 ) >> 5;
      const double bl_p_sm_avail =
        double(inter_nblocks) / gpu_info.cuda_prop.multiProcessorCount;

      const double bl_p_sm = min( bl_p_sm_max + 0.0, bl_p_sm_avail );

      ogl_helper.fbprintf
        ("N Seg: %3zd  Interp routine: Bl Sz %3d  N Blocks %4d  "
         "Bl/SM %2d %4.1f   Wp/SM %4.1f   Bl Time %ld cyc\n",
         wire_segments.size(), inter_block_size, inter_nblocks,
         bl_p_sm_max, bl_p_sm, bl_p_sm * wp_p_bl,
         time_inter_cyc);
    }

  ogl_helper.fbprintf
    ("Eye location: [%5.1f, %5.1f, %5.1f]  "
     "Eye direction: [%+.2f, %+.2f, %+.2f]\n",
     eye_location.x, eye_location.y, eye_location.z,
     eye_direction.x, eye_direction.y, eye_direction.z);

  pVariable_Control_Elt* const cvar = variable_control.current;
  ogl_helper.fbprintf("VAR %s = %.5f  (TAB or '`' to change, +/- to adjust)\n",
                      cvar->name,cvar->get_val());

  ogl_helper.fbprintf
    ("Light location: [%5.1f, %5.1f, %5.1f]  "
     "Helix Location[%5.1f, %5.1f, %5.1f]\n",
     light_location.x, light_location.y, light_location.z,
     helix_location.x, helix_location.y, helix_location.z
     );

  if ( !sp_geo_shade->pobject )
    ogl_helper.fbprintf
      ("Programmable GPU API: %savailable.  GPU Code: %s\n",
       ptr_glCreateShader ? "" : "not",
       sp_geo_shade->pobject ? "okay" : "problem");

  const bool blink_visible = int64_t(time_wall_fp()*3) & 1;
# define BLINK(txt,pad) ( blink_visible ? txt : pad )

  ogl_helper.fbprintf
    ("Physics: %7s ('aA')  Pause: %s ('p')  Gravity: %s ('g')  Grabbed: %s ('%s')  Shared Mem: %d ('s')  Tryouts: %d %d ('y' 'Y')\n",
     opt_physics_method == GP_cpu
     ? BLINK(gpu_physics_method_str[opt_physics_method],"")
     : gpu_physics_method_str[opt_physics_method],
     opt_pause ? BLINK("ON","  ") : "OFF",
     opt_gravity ? "ON" : "OFF",
     opt_end_fixed ? "YES" : "NO",
     opt_end_fixed ? "B" : "b",
     opt_use_shared,
     opt_tryout1,
     opt_tryout2);

  //  ogl_helper.fbprintf("Spring Energy: %f\n", helix_spring_energy);

  const int win_width = ogl_helper.get_width();
  const int win_height = ogl_helper.get_height();
  const float aspect = float(win_width) / win_height;

  glMatrixMode(GL_MODELVIEW);
  glLoadIdentity();
  glLoadTransposeMatrixf(modelview);

  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
  // Frustum: left, right, bottom, top, near, far
  glFrustum(-.8,.8,-.8/aspect,.8/aspect,1,5000);

  glEnable(GL_LIGHTING);

  const pColor white(1,1,1);
  const pColor red(1,0,0);
  const pColor blue(0,0,1);
  const pColor lsu_spirit_purple(0x580da6);
  const pColor lsu_spirit_gold(0xf9b237);

  glEnable(GL_LIGHT0);
  glLightfv(GL_LIGHT0, GL_POSITION, light_location);

  glLightfv(GL_LIGHT0, GL_DIFFUSE, white * opt_light_intensity);

  // Set lighting parameters for when shader is not used, as for the
  // big triangle.
  //
  glEnable(GL_COLOR_MATERIAL);
  glColorMaterial(GL_FRONT_AND_BACK,GL_AMBIENT_AND_DIFFUSE);
  glLightModeli(GL_LIGHT_MODEL_TWO_SIDE,1);

  pError_Check();

  // Set parameters that apply to a texture (texture_id_syllabus).
  //
  glTexParameteri
    (GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR_MIPMAP_LINEAR );
  glTexParameteri
    (GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR );

  // Set parameter for texture unit.
  //
  glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_MODULATE);

  sp_fixed->use();

  ///
  /// Paint Single Triangle.
  ///

  pColor color_tri(0x7815b6); // Red, Green, Blue
  glColor3fv( color_tri );

  glEnable(GL_TEXTURE_2D);
  glBindTexture(GL_TEXTURE_2D,texture_id_syllabus);

  //  Indicate type of primitive.
  //
  glBegin(GL_TRIANGLES);

  // Specify vertices for a triangle.
  //

  pCoor p1( 9.5, -5, -1.2 );
  pCoor p2( 0,    5, -3 );
  pCoor p3( 9,    6, -7 );
  pNorm triangle_normal = cross(p3,p2,p1);

  // Specify normal and vertex using course-defined objects pCoor and
  // pNorm. OpenGL sees these as pointers to floats.

  glNormal3fv(triangle_normal);
  glTexCoord2f(0.95,1.0); glVertex3fv(p1);
  glTexCoord2f(0.00,0.1); glVertex3fv(p2);
  glTexCoord2f(0.90,0.0); glVertex3fv(p3);

  glEnd();

  glDisable(GL_TEXTURE_2D);


  ///
  /// Construct a Helix
  ///

  if ( coords_stale )
    {
      // Recompute helix coordinates, etc.

      coords_stale = false;
      cuda_constants_stale = true;
      buffer_objects_stale = true;

      // Reset existing storage.
      helix_coords.clear();
      helix_indices.clear();
      wire_surface_indices.clear();
      wire_segments.clear();

      const int segments_per_helix =
        revolutions_per_helix * seg_per_helix_revolution;

      const double delta_eta = 2 * M_PI / seg_per_helix_revolution;
      const double delta_y = 4 * wire_radius / seg_per_helix_revolution;

      int wire_surface_idx = 0;
      pVect vZero(0,0,0);

      const float helix_seg_hlength_xy = helix_radius * tan(0.5 * delta_eta);
      helix_seg_hlength =
        sqrtf( helix_seg_hlength_xy * helix_seg_hlength_xy
               + 0.25 * delta_y * delta_y );

      const float angle_seg = asinf(delta_y /( 2 * helix_seg_hlength ));
      helix_seg_hlength_inv = 1 / helix_seg_hlength;

      pQuat tilt_up_seg(pVect(0,0,1),angle_seg);
      pQuat tilt_up_inv(pVect(0,0,1),-angle_seg);

      pQuat rot(pVect(0,-1,0),delta_eta);

      // Compute the transform to segment's right neighbor's coordinate space.
      helix_rn_trans = tilt_up_inv * rot * tilt_up_seg;

      const float wire_rad_sq = wire_radius * wire_radius;

      // Compute mass.
      helix_seg_mass =
        opt_helix_density * M_PI * wire_rad_sq * 2 * helix_seg_hlength;
      helix_seg_mass_inv = 1 / helix_seg_mass;

      // Compute moments of inertia, assuming perfect cylinder.
      helix_seg_ma_axis = helix_seg_mass * wire_rad_sq / 2;
      helix_seg_ma_perp_axis = helix_seg_mass *
        ( 4 * helix_seg_hlength * helix_seg_hlength + 3 * wire_rad_sq ) / 12;

      for ( int i = 0; i < segments_per_helix; i++ )
        {
          const bool last_i_iteration = i + 1 == segments_per_helix;

          const double eta = i * delta_eta;

          // Point on core.
          pCoor p0( helix_radius * cos(eta),
                    i * delta_y,
                    helix_radius * sin(eta));

          helix_coords.push_back(p0);

          // Initialize a helix (wire) segment, used for physical simulation.
          //
          wire_segments.emplace_back(); // Eagerly awaiting C++17 on RHEL.
          Wire_Segment* const ws = &wire_segments.back();
          ws->position = p0;
          ws->velocity = vZero;
          ws->omega = vZero;     // Rotation rate.

          // Rotate cylinder coordinates to world coordinates.
          pQuat rot(pVect(0,-1,0),eta + M_PI / 2);
          ws->orientation = rot * tilt_up_seg;

          for ( int j = 0; j < seg_per_wire_revolution; j++ )
            {
              const int idx = wire_surface_idx++;
              helix_indices.push_back(i);
              helix_indices.push_back(j);
              if ( last_i_iteration ) continue;
              // Insert indices for triangle with one vertex on eta.
              wire_surface_indices.push_back(idx); // This vertex.
              wire_surface_indices.push_back(idx + seg_per_wire_revolution);
            }
        }

      helix_coords_size = helix_coords.size();
      helix_end_ws = &wire_segments.back();
    }

  // If necessary, update data in buffer objects.
  if ( buffer_objects_stale )
    {
      buffer_objects_stale = false;

      // Generate buffer id (name), if necessary.
      //
      glGenBuffers(1,&helix_indices_bo);
      glGenBuffers(1,&helix_coords_bo);
      glGenBuffers(1,&helix_lz_bo);
      glGenBuffers(1,&helix_ly_bo);
      glGenBuffers(1,&wire_surface_indices_bo);

      helix_ly.resize(helix_coords_size);
      helix_lz.resize(helix_coords_size);

      glBindBuffer(GL_ARRAY_BUFFER, helix_coords_bo);
      glBufferData
        (GL_ARRAY_BUFFER,
         helix_coords.size()*4*sizeof(helix_coords[0]),
         helix_coords.data(), GL_STATIC_DRAW);

      glBindBuffer(GL_ARRAY_BUFFER, helix_indices_bo);
      glBufferData
        (GL_ARRAY_BUFFER,
         2 * helix_indices.size() * sizeof(helix_indices[0]),
         helix_indices.data(), GL_STATIC_DRAW);

      glBindBuffer(GL_ARRAY_BUFFER, wire_surface_indices_bo);
      glBufferData
        (GL_ARRAY_BUFFER,
         wire_surface_indices.size()*sizeof(wire_surface_indices[0]),
         wire_surface_indices.data(),GL_STATIC_DRAW);

      // Tell GL that subsequent array pointers refer to host storage.
      //
      glBindBuffer(GL_ARRAY_BUFFER, 0);

      pError_Check();
    }

  sp_geo_shade->use();

  ///
  /// Paint a Helix
  ///

  glBindBufferBase(GL_SHADER_STORAGE_BUFFER,SB_COORD,helix_coords_bo);
  glBindBufferBase(GL_SHADER_STORAGE_BUFFER,SB_LZ,helix_lz_bo);
  glBindBufferBase(GL_SHADER_STORAGE_BUFFER,SB_LY,helix_ly_bo);

  glUniform1f(UNIF_IDX_WIRE_RADIUS,wire_radius);  GE();

  glMatrixMode(GL_MODELVIEW);
  glPushMatrix();

  glTranslatef(helix_location.x,helix_location.y,helix_location.z);
  glRotatef(60,0,1,0);

  // Specify color. Since it's not an array the same color
  // will be used for all vertices, which is what we want.
  //
  glUniform4fv(4,1,lsu_spirit_gold);
  glUniform4fv(5,1,color_red);

  glBindBuffer(GL_ARRAY_BUFFER, helix_indices_bo);
  glEnableVertexAttribArray(ATTR_IDX_HELIX_INDICES);
  glVertexAttribIPointer
    (ATTR_IDX_HELIX_INDICES,
     2, // Two components (as in a 2-element vector).
     GL_INT,
     0, // Tightly packed.
     0);

  glBindBuffer(GL_ARRAY_BUFFER, 0); // Avoid surprises.

  glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, wire_surface_indices_bo);

  glDrawElements
    (GL_TRIANGLE_STRIP,
     wire_surface_indices.size() - 4 * seg_per_wire_revolution,
     GL_UNSIGNED_INT,
     (const GLvoid*) ( 2 * seg_per_wire_revolution * sizeof(GL_UNSIGNED_INT)));

  glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);

  glDisableVertexAttribArray(ATTR_IDX_HELIX_INDICES);

  glPopMatrix();

  sp_fixed->use();

  // Render Marker for Light Source
  //
  insert_tetrahedron(light_location,0.5);

  pError_Check();

  frame_timer.frame_end();

  ogl_helper.user_text_reprint();
  glutSwapBuffers();
}

void
World::time_step_cpu()
{
  pVect vZero(0,0,0);
  pVect gravity_force = helix_seg_mass_inv * gravity_accel;
  helix_spring_energy = 0;

  const int phys_helix_segments = wire_segments.size();

  /// Initialize wire segment variables for time step.
  //
#pragma omp parallel for
  for ( int i=0; i<phys_helix_segments; i++ )
    {
      Wire_Segment* const cseg = &wire_segments[i];
      pMatrix_Rotation c_rot(cseg->orientation);

      // Put local-space axes of cseg into convenient variables.
      cseg->lx = c_rot.cv(0);  // Extract column 0 of matrix c_rot.
      cseg->ly = c_rot.cv(1);
      cseg->lz = c_rot.cv(2);
      // Compute and store local-space axes of cseg's right neighbor.
      pQuat cn_rot_q = cseg->orientation * helix_rn_trans;
      pMatrix_Rotation cn_rot(cn_rot_q);
      cseg->ny = cn_rot.cv(1);
      cseg->nz = cn_rot.cv(2);

      pVect ctr_to_right = helix_seg_hlength * cseg->lx;

      // Compute location of ends of cylinder.
      cseg->pos_left = cseg->position - ctr_to_right;
      cseg->pos_right = cseg->position + ctr_to_right;

      cseg->force = opt_gravity ? gravity_force : vZero;
      cseg->torque = vZero;
    }

  /// Apply forces due to helix segment to the "right" (+x in local space).
  //
  for ( int i=0; i<phys_helix_segments-1; i++ )
    {
      Wire_Segment* const cseg = &wire_segments[i];    // Us
      Wire_Segment* const rseg = &wire_segments[i+1];  // Our right neighbor.

      // Find distance between us and right neighbor at three points
      // on cylinder surface.  We hope that the compiler unrolls this
      // loop to avoid computing sin and cos every time.
      const int pieces = 3;
      const float delta_theta = 2 * M_PI / pieces;
      for ( int j=0; j<pieces; j++ )
        {
          const float theta = delta_theta * j;
          pCoor c_pt =
            cseg->pos_right + cseg->nz * cosf(theta) + cseg->ny * sinf(theta);
          pCoor r_pt =
            rseg->pos_left + rseg->lz * cosf(theta) + rseg->ly * sinf(theta);
          pNorm dist(c_pt,r_pt);
          const float force_mag = dist.magnitude * opt_spring_constant;
          pCoor mid_pt = 0.5 * ( c_pt + r_pt );
          helix_apply_force_at(cseg,mid_pt,dist,force_mag);
          helix_apply_force_at(rseg,mid_pt,dist,-force_mag);
        }
    }

  const float delta_t_mass_inv = delta_t * helix_seg_mass_inv;
  const float delta_t_ma_axis = delta_t / helix_seg_ma_axis;
  const float delta_t_ma_perp_axis = delta_t / helix_seg_ma_perp_axis;


  /// Quick and Dirty and Slow Interpenetration Routine
  //
  {
    // Rule out interpenetration between segments that are closer
    // together than MIN_IDX_DIST. In other words, assume that
    // segment 14 and segment 15 cannot interpenetrate each other
    // (but of course they are in contact in a normal way).
    //
    const int min_idx_dist = 0.999 + wire_radius / helix_seg_hlength;

    const float four_wire_radius_sq = 4 * wire_radius * wire_radius;

    // Use a brute force, O(n^2), search of all possible pairs of
    // segments.
    //
    for ( int a_idx=1; a_idx<phys_helix_segments; a_idx++ )
      for ( int b_idx=a_idx+min_idx_dist; b_idx<phys_helix_segments; b_idx++ )
        {
          Wire_Segment* const aseg = &wire_segments[a_idx];
          Wire_Segment* const bseg = &wire_segments[b_idx];
          pNorm dist(aseg->position,bseg->position);

          // Use a bounding sphere to quickly rule out contact.
          //
          if ( dist.mag_sq > four_wire_radius_sq ) continue;

          // As a crude approximation, compute depth of
          // interpenetration using bounding sphere.  Better results
          // would be obtained with a cylinder/cylinder intersection
          // computation.
          //
          const float pen_depth = 2 * wire_radius - dist.magnitude;
          pVect sep_force = pen_depth * opt_spring_constant * dist;
          aseg->force -= sep_force;
          bseg->force += sep_force;
        }
  }

  /// Using force and torque, update velocity, omega, position, and orientation.
  //
#pragma omp parallel for
  for ( int i=1; i<phys_helix_segments; i++ )
    {
      Wire_Segment* const cseg = &wire_segments[i];
      cseg->velocity *= 0.9999;
      cseg->omega *= 0.9999;
      cseg->velocity += delta_t_mass_inv * cseg->force;
      if ( opt_end_fixed && i + 1 == phys_helix_segments )
        cseg->velocity = vZero;
      cseg->position += delta_t * cseg->velocity;
      const float torque_axial_mag = dot( cseg->torque, cseg->lx );
      pVect torque_axial = torque_axial_mag * cseg->lx;
      pVect do_axial = delta_t_ma_axis * torque_axial;
      pVect torque_other = cseg->torque - torque_axial;
      pVect do_other = delta_t_ma_perp_axis * torque_other;
      cseg->omega += do_axial + do_other;
      pNorm axis(cseg->omega);
      cseg->orientation =
        pQuat(axis,delta_t * axis.magnitude) * cseg->orientation;
    }
}


void
World::helix_apply_force_at
(Wire_Segment *s, pCoor pos, pVect dir, float magnitude)
{
  s->force += magnitude * dir;
  helix_spring_energy += fabs(magnitude);
  pVect arm(s->position,pos);
  pVect axis = cross( arm, dir );
  pVect amt = magnitude * axis;
  helix_spring_energy += amt.mag();
  s->torque += amt;
}

pVect
World::helix_get_vel_at(Wire_Segment *s, pCoor pos)
{
  pVect arm(s->position,pos);
  pVect vel_rot = cross( s->omega, arm );
  return vel_rot + s->velocity;
}

//
// Collect GPU and Kernel Info
//

void
World::cuda_init()
{
  cuda_constants_stale = true;
  cuda_time_step_count = 0;

  inter_block_size = 256; // Will be automatically updated.
  inter_nblocks = 1;  // Will be automatically updated.

  // Get information about GPU and its ability to run CUDA.
  //
  int device_count;
  cudaGetDeviceCount(&device_count); // Get number of GPUs.
  if ( device_count == 0 )
    {
      fprintf(stderr,"No GPU found, exiting.\n");
      exit(1);
    }

  // Print information about the available GPUs.
  //
  gpu_info_print();

  // Choose GPU 0 because we don't have time to provide a way to let
  // the user choose.
  //
  int dev = gpu_choose_index();
  CE(cudaSetDevice(dev));
  printf("Using GPU %d\n",dev);

  // Prepare events used for timing.
  //
  CE(cudaEventCreate(&frame_start_ce));
  CE(cudaEventCreate(&frame_stop_ce));

  gpu_info.get_gpu_info(dev);
  cuda_setup(&gpu_info);

  // Print information about time_step routine.
  //
  printf("\nCUDA Routine Resource Usage:\n");

  for ( int i=0; i<gpu_info.num_kernels; i++ )
    {
      printf("For %s:\n", gpu_info.ki[i].name);
      printf("  %6zd shared, %zd const, %zd loc, %d regs; "
             "%d max threads per block.\n",
             gpu_info.ki[i].cfa.sharedSizeBytes,
             gpu_info.ki[i].cfa.constSizeBytes,
             gpu_info.ki[i].cfa.localSizeBytes,
             gpu_info.ki[i].cfa.numRegs,
             gpu_info.ki[i].cfa.maxThreadsPerBlock);
    }

  d_helix_position = NULL;
  d_helix_velocity = NULL;
  d_helix_orientation = NULL;
  d_helix_omega = NULL;
}


void
World::cb_keyboard()
{
  if ( !ogl_helper.keyboard_key ) return;
  pVect adjustment(0,0,0);
  pVect user_rot_axis(0,0,0);
  const bool shift = ogl_helper.keyboard_shift;
  const float move_amt = shift ? 2.0 : 0.4;

  const float wire_radius_prev = wire_radius;
  const float helix_radius_prev = helix_radius;

  switch ( ogl_helper.keyboard_key ) {
  case FB_KEY_LEFT: adjustment.x = -move_amt; break;
  case FB_KEY_RIGHT: adjustment.x = move_amt; break;
  case FB_KEY_PAGE_UP: adjustment.y = move_amt; break;
  case FB_KEY_PAGE_DOWN: adjustment.y = -move_amt; break;
  case FB_KEY_DOWN: adjustment.z = move_amt; break;
  case FB_KEY_UP: adjustment.z = -move_amt; break;
  case FB_KEY_DELETE: user_rot_axis.y = 1; break;
  case FB_KEY_INSERT: user_rot_axis.y =  -1; break;
  case FB_KEY_HOME: user_rot_axis.x = 1; break;
  case FB_KEY_END: user_rot_axis.x = -1; break;

  case 'A':
    opt_ez_method_safety = !opt_ez_method_safety;
    if ( opt_ez_method_safety )
      {
        ez_prev_physics = opt_physics_method;
        opt_physics_method = GPU_cuda_ez;
      }
    else
      {
        opt_physics_method = ez_prev_physics;
      }
    break;

  case 'a':
    opt_physics_method++;
    if ( opt_physics_method >= GPU_cuda_ez ) opt_physics_method = GP_cpu;
    if ( opt_physics_method > GP_cpu ) cuda_constants_stale = true;
    break;

  case 'b':
    opt_end_fixed = true; opt_move_item = MI_Ball;
    cuda_constants_stale = true;
    break;
  case 'B':
    opt_end_fixed = false;
    cuda_constants_stale = true;
    break;
  case 'e': case 'E': opt_move_item = MI_Eye; break;
  case 'l': case 'L': opt_move_item = MI_Light; break;

  case 'g': case 'G': opt_gravity = !opt_gravity;
    cuda_constants_stale = true;
    break;

  case 's': case 'S': opt_use_shared = !opt_use_shared;
    cuda_constants_stale = true;
    break;

  case 'p': case 'P':
    opt_pause = !opt_pause;
    if ( !opt_pause ) world_time = time_wall_fp();
    break;

  case 'y': opt_tryout1 = !opt_tryout1; break;
  case 'Y': opt_tryout2 = !opt_tryout2; break;


  case ' ':
    if ( shift ) opt_single_time_step = true; else opt_single_frame = true;
    opt_pause = true;
    break;

  case 9: variable_control.switch_var_right(); break;
  case 96: variable_control.switch_var_left(); break; // `, until S-TAB works.
  case '-':case '_': variable_control.adjust_lower();
    cuda_constants_stale = true;
    break;
  case '+':case '=': variable_control.adjust_higher();
    cuda_constants_stale = true;
    break;
  default: printf("Unknown key, %d\n",ogl_helper.keyboard_key); break;
  }

  if ( wire_radius_prev != wire_radius || helix_radius_prev != helix_radius )
    coords_stale = true;

  // Update eye_direction based on keyboard command.
  //
  if ( user_rot_axis.x || user_rot_axis.y )
    {
      pMatrix_Rotation rotall(eye_direction,pVect(0,0,-1));
      user_rot_axis *= invert(rotall);
      eye_direction *= pMatrix_Rotation(user_rot_axis, M_PI * 0.03);
      modelview_update();
    }

  // Update eye_location based on keyboard command.
  //
  if ( adjustment.x || adjustment.y || adjustment.z )
    {
      const double angle =
        fabs(eye_direction.y) > 0.99
        ? 0 : atan2(eye_direction.x,-eye_direction.z);
      pMatrix_Rotation rotall(pVect(0,1,0),-angle);
      adjustment *= rotall;

      switch ( opt_move_item ){
      case MI_Light: light_location += adjustment; break;
      case MI_Eye: eye_location += adjustment; break;
      case MI_Ball:
        cuda_constants_stale = true;
        if ( opt_end_fixed )
          helix_end_ws->position += adjustment;
        else
          helix_location += adjustment;
        break;
      default: break;
      }
      modelview_update();
    }
}


int
main(int argv, char **argc)
{
  pOpenGL_Helper popengl_helper(argv,argc);
  World world(popengl_helper);

  // Specify default frame update rate.
  //
  // Default rate used if API won't allow updating on each
  // display device frame.
  //
  popengl_helper.rate_set(30);

  // Start
  //
  popengl_helper.display_cb_set(world.render_w,&world);
}