/// LSU EE 7700-1 (Sp 2009), Graphics Processors
//
 /// Balloon Simulation

// $Id:$

/// Purpose
//
//   Demonstrate use of gpu for physics.


/// What Code Does

// The code in this file simulates a /balloon/: an elastic /surface/
// that encloses a /gas/. Outside the balloon is /air/, and there
// is a /platform/ for the balloon to bounce on.  Many physical
// parameters can be varied affecting the balloon bouncyness and
// buoyancy.

// By default the balloon simulation (physics) is performed on the GPU
// using CUDA using a one-pass algorithm, but the user can switch
// between a second CUDA algorithm, using the OpenGL API for physics,
// or having the CPU do the physics.

// The platform consists of tiles, some are purple-tinted mirrors
// (showing a reflection of the balloon), the others show the course
// syllabus.  The balloon can cast a shadow on the tiles.


///  Keyboard Commands
 //
 /// Object (Eye, Light, Balloon) Location or Push
 //   Arrows, Page Up, Page Down
 //   Will move object or push balloon, depending on mode:
 //   'e': Move eye.
 //   'l': Move light.
 //   'b': Move balloon. (Change position but not velocity.)
 //   'B': Push balloon. (Add velocity.)
 //
 /// Eye Direction
 //   Home, End, Delete, Insert
 //   Turn the eye direction.
 //   Home should rotate eye direction up, End should rotate eye
 //   down, Delete should rotate eye left, Insert should rotate eye
 //   right.  The eye direction vector is displayed in the upper left.

 /// Simulation Options
 //  (Also see variables below.)
 //
 //  'p'    Pause simulation. (Press again to resume.)
 //  's'    Stop balloon (but not vibration).
 //  'S'    Freeze balloon. (Set velocity of all vertices to zero.)
 //  'a'    Cycle between different physics algorithms and processors (GPU/CPU).
 //  'A'    Switch between CPU and GPU physics.
 //  'g'    Turn gravity on and off.
 //  'n'    Switch between textured and striped balloon surface.
 //  'F12'  Write screenshot to file balloon.png.

 /// Variables
 //   Selected program variables can be modified using the keyboard.
 //   Use "Tab" to cycle through the variable to be modified, the
 //   name of the variable is displayed next to "VAR" on the bottom
 //   line of green text.

 //  'Tab' Cycle to next variable.
 //  '`'   Cycle to previous variable.
 //  '+'   Increase variable value.
 //  '-'   Decrease variable value.
 //
 //  VAR Light Intensity - The light intensity.
 //  VAR Gas Amount - Amount of gas in balloon.
 //  VAR Gravity - Gravitational acceleration. (Turn on/off using 'g'.)
 //  VAR Air Resistance - Viscosity of air.
 //  VAR Gas Particle Mass - Initially matches air mass.
 //  VAR Spring Constant - Warning: watch Oversample as this is increased.
 //  VAR Repulsion Constant - Determines stiffness of surface.
 //  VAR Surface Mass -  Warning: watch Oversample as this is reduced.


/// Bugs and Incomplete Features

// Code is only sparsely commented.

// Some code in this file should be moved to other files. 

// The reflection and shadow are incorrect when the balloon
// is below the platform.

// At high spring constant values the simulation leaks momentum.

// The repulsion used to model stiffness should be augmented or replaced
// with forces based on angle (not just distance).


/// To Do

// Detect and prevent interpenetration.

// Use something like Verlet integration to update position.

// Provide presets of physical parameters, for example, to simulate a
// hot-air balloon, a partially deflated basketball, etc.



#define GL_GLEXT_PROTOTYPES
#define GLX_GLXEXT_PROTOTYPES

#include <GL/gl.h>
#include <GL/glext.h>
#include <GL/glx.h>
#include <GL/glxext.h>
#include <GL/glu.h>
#include <GL/freeglut.h>
#include <Magick++.h>

#include <cuda_runtime.h>
#include <cuda_gl_interop.h>

#include "../opengl/util.h"
#include "../opengl/glextfuncs.h"
#include "../opengl/coord.h"
#include "../opengl/shader.h"
#include "../opengl/pstring.h"
#include "../opengl/misc.h"

#include "balloon.cuh"


///
/// CUDA and OpenGL Support
///


 /// CUDA API Error-Checking Wrapper
///
#define CE(call)                                                              \
 {                                                                            \
   const cudaError_t rv = call;                                               \
   if ( rv != cudaSuccess )                                                   \
     {                                                                        \
       pStringF msg("CUDA error %d, %s\n",rv,cudaGetErrorString(rv));         \
       pError_Msg(msg.s);                                                     \
     }                                                                        \
 }

void vec_set(float3& a, pCoor b) {a.x = b.x; a.y = b.y; a.z = b.z;}
void vec_sets(pCoor& a, float3 b) {a.x = b.x; a.y = b.y; a.z = b.z; a.w=1;}
void vec_sets(pVect& a, float3 b) {a.x = b.x; a.y = b.y; a.z = b.z; }


 ///
 /// Class for managing an OpenGL ARRAY_BUFFER
 ///

template <typename T>
class pBuffer_Object {
public:
  pBuffer_Object(){ data = NULL; init(); }
  ~pBuffer_Object()
  {
    if ( data ) free(data);
    glDeleteBuffers(created,bids);
  }

private:
  void init()
  {
    btarget = GL_ARRAY_BUFFER;
    glGenBuffers(2, bids);
    current = 0;
    bid = bids[current];
    created = 0;
    pError_Check();
    usage_hint = GL_DYNAMIC_COPY;
  }

public:
  T* alloc(int elements_p, GLenum hint = GL_DYNAMIC_COPY)
  {
    usage_hint = hint;
    if ( data ) pError_Msg("Double allocation of pBuffer_Object.");
    elements = elements_p;
    chars = elements * sizeof(T);
    data = new T[elements];
    created = 1;
    alloc_gpu_buffer();
    glBindBuffer(btarget,0);
    return data;
  }

  void take(PStack<T>& stack, GLenum hint = GL_DYNAMIC_COPY,
            GLenum default_target = GL_ARRAY_BUFFER)
  {
    usage_hint = hint;
    btarget = default_target;
    if ( data ) pError_Msg("Double allocation of pBuffer_Object.");
    elements = stack.occ();
    chars = elements * sizeof(T);
    data = stack.take_storage();
    created = 1;
    alloc_gpu_buffer();
    glBindBuffer(btarget,0);
  }

  void prepare_two_buffers()
  {
    ASSERTS( created == 1 );
    created = 2; bid_swap(); alloc_gpu_buffer(); bid_swap();
  }

private:
  void alloc_gpu_buffer()
  {
    bind();
    glBufferData(btarget,chars,NULL,usage_hint);
    pError_Check();
  }

public:
  void to_gpu()
  {
    bind();
    glBufferData(btarget, chars, data, usage_hint);
    pError_Check();
  }

  void from_gpu()
  {
    bind();
    T* const from_data = (T*)glMapBuffer(GL_ARRAY_BUFFER,GL_READ_ONLY);
    pError_Check();
    memcpy(data,from_data,chars);
    glUnmapBuffer(btarget);
    glBindBuffer(btarget,0);
  }

  void bind(GLenum target){ glBindBuffer(target,bid); }
  void bind(){ glBindBuffer(btarget,bid); }

  GLuint bid_read() const { return bids[current]; }
  GLuint bid_write()
  {
    bid_swap(); alloc_gpu_buffer(); bid_swap();
    return bids[1-current];
  }
  GLuint bid_fresh()
  {
    alloc_gpu_buffer();
    return bid;
  }

  void bid_swap()
  {
    current = 1 - current;
    bid = bids[current];
  }

  T& operator [] (int idx) { return data[idx]; }

  GLuint bids[2];
  GLuint bid;
  GLenum usage_hint;
  GLenum btarget;
  int created, current;
  T *data;
  int elements, chars;
};


 ///
 /// Class for managing CUDA device memory.
 ///

template <typename T>
class pCUDA_Memory {
public:
  pCUDA_Memory()
  {
    data = NULL;  dev_addr[0] = dev_addr[1] = NULL;  current = 0;  bid = 0;
    bo_ptr = NULL;
  }
  ~pCUDA_Memory()
  {
    if ( data ) free(data);
    if ( bid )
      {
        CE(cudaGLUnmapBufferObject(bid));
        CE(cudaGLUnregisterBufferObject(bid));
        glDeleteBuffers(1,&bid);
      }
  }

  T* alloc(int elements_p)
  {
    if ( data ) pError_Msg("Double allocation of pCUDA_Memory.");
    elements = elements_p;
    chars = elements * sizeof(T);
    data = new T[elements];
    return data;
  }

  void take(PStack<T>& stack)
  {
    if ( data ) pError_Msg("Double allocation of pCUDA_Memory.");
    elements = stack.occ();
    chars = elements * sizeof(T);
    data = stack.take_storage();
  }

  T& operator [] (int idx) const { return data[idx]; }

private:
  void alloc_maybe() { if ( !dev_addr[current] ) alloc_gpu_buffer(); }

  void alloc_gpu_buffer()
  {
    ASSERTS( !dev_addr[current] );
    CE(cudaMalloc(&dev_addr[current],chars));
  }

  void alloc_gl_buffer()
  {
    if ( bid ) return;
    glGenBuffers(1,&bid);
    glBindBuffer(GL_ARRAY_BUFFER,bid);
    glBufferData(GL_ARRAY_BUFFER,chars,NULL,GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER,0);
    CE(cudaGLRegisterBufferObject(bid));
  }

public:
  T* get_dev_addr() { alloc_maybe(); return (T*)dev_addr[current];}
  T* get_dev_addr_read() { return get_dev_addr(); }
  T* get_dev_addr_write()
  {
    if ( !dev_addr[1-current] ) { swap(); alloc_gpu_buffer(); swap(); }
    return (T*)dev_addr[1-current];
  }

  void to_cuda()
  {
    if ( !dev_addr[current] ) alloc_gpu_buffer();
    CE(cudaMemcpy(dev_addr[current], data, chars, cudaMemcpyHostToDevice));
  }

  void from_cuda()
  {
    CE(cudaMemcpy(data, dev_addr[current], chars, cudaMemcpyDeviceToHost));
  }

  void cuda_to_gl()
  {
    alloc_gl_buffer();
    // Due to a bug in CUDA 2.1 this is slower than copying through host.
    CE(cudaGLMapBufferObject(&bo_ptr,bid));
    CE(cudaMemcpy(bo_ptr, dev_addr[current], chars, cudaMemcpyDeviceToDevice));
    CE(cudaGLUnmapBufferObject(bid));
  }

  void swap() { current = 1 - current; }
  void set_primary() { current = 0; }

  // Stuff below should be private to avoid abuse.
  void *dev_addr[2], *bo_ptr;
  GLuint bid;
  int current;
  T *data;
  int elements, chars;
};


 ///
 /// Ad-Hoc Class for Reading Images
 ///

using namespace Magick;

class P_Image_Read
{
public:
  P_Image_Read(const char *path, int transp):
    image(path),image_loaded(false),data(NULL)
  {
    width = image.columns();
    height = image.rows();
    size = width * height;
    if ( !width || !height ) return;
    if ( transp == 255 )
      image.transparent(Color("White"));
    pp = image.getPixels(0,0,width,height);
    for ( int i = 0; i < size; i++ ) pp[i].opacity = MaxRGB - pp[i].opacity;
    gl_fmt = GL_BGRA;
    gl_type = sizeof(PixelPacket) == 8 ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE;
    data = (unsigned char*) pp;
    image_loaded = true;
  };
  void color_invert()
  {
    for ( int i = 0; i < size; i++ )
      {
        PixelPacket& p = pp[i];
        const int sum = p.red + p.blue + p.green;
        p.opacity = (typeof p.opacity)( MaxRGB - sum * 0.3333333 );
        p.red = p.blue = p.green = MaxRGB;
      }
  }
  Image image;
  PixelPacket *pp;
  bool image_loaded;
  int width, height, maxval, size;
  unsigned char *data;
  int gl_fmt;
  int gl_type;
private:
};


 ///
 /// Create and initialize texture object using image file.
 ///

GLuint
pBuild_Texture_File
(const char *name, bool invert = false, int transp = 256 )
{
  // Read image from file.
  //
  P_Image_Read image(name,transp);
  if ( !image.image_loaded ) return 0;

  // Invert colors. (E.g., to show text as white on black.)
  //
  if ( invert ) image.color_invert();

  GLuint tid;
  glGenTextures(1,&tid);
  glBindTexture(GL_TEXTURE_2D,tid);
  glTexParameteri(GL_TEXTURE_2D, GL_GENERATE_MIPMAP, 1);

  // Load data into the texture object.
  //
  glTexImage2D
    (GL_TEXTURE_2D,
     0,                // Level of Detail (0 is base).
     GL_RGBA,          // Internal format to be used for texture.
     image.width, image.height,
     0,                // Border
     image.gl_fmt,     // GL_BGRA: Format of data read by this call.
     image.gl_type,    // GL_UNSIGNED_BYTE: Size of component.
     (void*)image.data);
  pError_Check();

  return tid;
}


///
/// Main Data Structures
///
//
// class World: All data about scene.
// class Balloon: Data about a balloon.
//
// See also balloon.cuh


class World;

struct Balloon_Triangle {
  int pi, qi, ri;
  int pi_opp, qi_opp, ri_opp;
  bool pi_less, qi_less, ri_less;
  pColor color;
  pVect normal;
  float length_relaxed;
  int block_number; // Used for preparing cuda data.
  bool a_vtx(int v) const { return v == pi || v == qi || v == ri; }
  int third_vtx(int v1, int v2) const
  { return ( v1 == pi && v2 == qi || v2 == pi && v1 == qi ? ri :
             v1 == pi && v2 == ri || v2 == pi && v1 == ri ? qi :
             v1 == qi && v2 == ri || v2 == qi && v1 == ri ? pi : -1 ); }
};

struct Balloon_Rep_Pair {
  // Repulsion pairs.
  int pi, qi;
};

struct Balloon_Vertex {
  Balloon_Vertex(){ edge_out_count = edge_in_count = 0; }

  // Constant Data (from time step to time step)

  float mass_inv;
  float mass;
  int edge_out[7];
  int edge_out_count;
  PStack<int> triangles;
  pCoor tex_coor;

  // Data changed each time step.

  pCoor pos;
  pVect vel;

  // Maybe generated and used in same time step.

  pCoor pos_prev;
  pVect force;
  pVect force_spring;
  pVect force_pressure;
  pVect force_rep; // Repulsion.
  pVect surface_normal;

  // Only used during initialization.
  int edge_in_count;
  int edge_in[6];
  double eta;
  double theta;
  int ring;
};

// Structural Data for OpenGL Physics, One per Triangle
//
struct GLP_Tri_Strc {
  float pi, qi, ri;             // Index of triangle's vertices.
  float length_relaxed;
  int pi_opp, qi_opp, ri_opp;
};  // 7 * 4 = 28 bytes

// Structural Data for OpenGL Physics, One per Vertex
//
struct GLP_Vtx_Strc {
  float self_idx;
  float left_idx;                     // Not used.
  uint16_t neighbors[VTX_TRI_DEG_MAX]; // Index of vertex's triangles.
}; // 8 + 16 = 24 bytes. Yuck!


// GPU-Computed Balloon Data for Triangles
//
struct GLP_Tri_Data {
  pCoor surface_normal; // Magnitude is area of incident triangles.
  pCoor force_p;
  pCoor force_q;
  pCoor force_r;
};

// See also balloon.cuh.


// GPU-Computed Balloon Data for Vertices
//
struct GLP_Vtx_Data {
  pCoor surface_normal; // Magnitude is area of incident triangles.
  pCoor vel;
  pCoor pos;
  pCoor padding;
};

// See also balloon.cuh.


enum Data_Location { DL_CPU = 0x1, DL_GLP = 0x2, DL_CUDA = 0x4 };
enum GPU_Physics_Method
  { GP_cpu, GP_glp, GP_cuda_1_pass, GP_cuda_2_pass, GP_ENUM_SIZE };
const char* const gpu_physics_method_str[] =
  { "CPU", "OpenGL 2 Pass", "CUDA 1 Pass", "CUDA 2 Pass" };


class Balloon {
public:
  Balloon(World& w):world(w)
  {
    cuda_initialized = false;
    glp_initialized = false;
  }
  ~Balloon(){ }
  void init(pCoor center, double radius);

  // Called each time user changes a configuration variable, such as gravity.
  void update_for_config();

  void init_glp();

  void init_cuda();
  void cuda_data_partition();

  // Advance (time-step) simulated time.
  //
  void time_step_cpu(int steps);
  void time_step_cpu_once();
  void time_step_gpu(int steps);  // Call appropriate time_step routine.
  void time_step_glp(int steps);   // Use OpenGL API for physics.
  void time_step_cuda(int steps); // Use CUDA API for physics.

  void gpu_data_to_cpu();
  void glp_data_to_cpu();
  void cuda_data_to_cpu();
  void cpu_data_to_glp();
  bool cpu_data_to_cuda();  // Return true if data transferred.

  // User Interaction
  //
  void translate(pVect amt); // Instantly move balloon.
  void push(pVect amt);      // Instantly add velocity.
  void stop()                //  Stop motion but not other motion.
  {
    pVect avg_vel = velocity_avg();
    for ( int i=0; i<point_count; i++ ) points[i].vel -= avg_vel;
  }
  void freeze()             //  Stop all motion.
  {
    for ( int i=0; i<point_count; i++ ) points[i].vel = pVect(0,0,0);
  }

  pCoor centroid_compute()
  {
    pCoor point_sum(0,0,0);
    for ( int i=0; i<point_count; i++ ) point_sum += points[i].pos;
    point_sum.homogenize();
    centroid = point_sum;
    return centroid;
  }
  float pressure_air(float msl)
  {
    return opt_gravity ? exp( - 0.2 * air_particle_mass * msl ) : 1.0;
  }
  float pressure_gas(float msl, float factorp = 0)
  {
    const float factor = factorp ? factorp : gas_pressure_factor;
    return opt_gravity ? factor * exp( - gas_m_over_temp * msl ) : factor;
  }
  void pressure_compute()
  {
    const float exp_air = pressure_air(centroid.y);
    const float exp_gas = pressure_gas(centroid.y,1);
    const float eff_volume = fabs( volume );
    gas_pressure_factor = pressure_factor_coeff / eff_volume;
    pressure = gas_pressure_factor * exp_gas / exp_air;
    density_air =
      ( pressure_air(centroid.y - 0.5) - pressure_air(centroid.y + 0.5) )
      / opt_gravity_accel;
    density_gas =
      ( pressure_gas(centroid.y - 0.5) - pressure_gas(centroid.y + 0.5) )
      / opt_gravity_accel;
  }
  pVect velocity_avg()
  {
    pVect vel_avg(0,0,0);
    for ( int i=0; i<point_count; i++ ) vel_avg += points[i].vel;
    vel_avg *= 1.0/point_count;
    return vel_avg;
  }

  World& world;

  // Structural Data
  //
  float radius;                 // Initial radius.
  float nom_volume; // Volume based on initial radius

  // Balloon Structure
  //
  PStack<Balloon_Vertex> points;
  PStack<Balloon_Triangle> triangles;
  PStack<Balloon_Rep_Pair> rep_pairs;
  pBuffer_Object<GLuint> point_indices;
  pBuffer_Object<float> tex_coords;
  int point_count;
  int tri_count;
  int rep_pair_count;
  int tethered_idx;


  // Unchanging (or user set) Physical Constants
  //
  float spring_constant;
  float rep_constant;
  float air_resistance;
  float surface_mass;
  float gas_amount;
  float gas_particle_mass;
  float air_particle_mass;
  float temperature;
  float opt_gravity_accel;
  float damping_v;              // CPU and gpu. Higher is less damping.
  float damping_factor;         // CPU only code.

  // User-Set Options (in addition to physical constants above).
  //
  bool opt_gravity;             // If false, no gravity.
  bool opt_damping;             // Only used in cpu code. See also damping_v
  bool opt_cpu_interleave;  // When gpu physics on do 1 cpu time step / frame.

  // Computed after each change to user-set physical quantity.
  //
  float temp_ratio;  // Temperature ratio.
  float gas_mass_per_vertex;
  float pressure_factor_coeff;
  float gas_pressure_factor;
  double oversample;       // Harmonic (approx) divided by time step delta t.
  double tightness;
  float damping_factor_per_step;
  float point_mass;
  float point_mass_inv;

  // Computed each time step.
  //
  float volume;
  float area;
  pVect weight;
  pCoor centroid;
  float gas_m_over_temp;  // Coefficient in pressure formula.
  float pressure;
  float density_air, density_gas;

  // Computed but not yet correct.
  //
  double e_spring, e_kinetic;
  double energy, e_zero;

  GLuint texid_pse, texid_syl;

  int cpu_iteration;
  bool need_cpu_iteration;
  bool length_relaxed_update;

  int data_location;
  GLuint data_bid;
  int data_stride;

  bool glp_initialized;
  pBuffer_Object<GLP_Vtx_Data> glp_vtx_data;
  pBuffer_Object<GLP_Tri_Data> glp_tri_data;
  pBuffer_Object<GLP_Vtx_Strc> glp_vtx_strc;
  pBuffer_Object<GLP_Tri_Strc> glp_tri_strc;

  GLuint query_transform_feedback_id;
  GLuint glp_vtx_data_tid;
  GLuint glp_tri_data_tid;
  GLuint framebuffer_id, renderbuffer_id;
  pShader vs_plan_c;
  GLint sat_indices, sat_volume, sat_pos, sat_vel;
  GLint sun_constants_sc;
  GLint sun_constants_gas, sun_constants_dt, sun_platform;
  GLint svl_surface_normal, svl_force_or_v, svl_pos, svl_force_r;
  GLint stx_data_vtx, stx_data_tri;

  // CUDA Stuff
  //
  bool cuda_initialized;
  bool cuda_constants_stale;
  pCUDA_Memory<CUDA_Tri_Strc> cuda_tri_strc;
  pCUDA_Memory<CUDA_Tri_Work_Strc> cuda_tri_work_strc;
  pCUDA_Memory<CUDA_Tri_Data> cuda_tri_data;
  pCUDA_Memory<CUDA_Vtx_Strc> cuda_vtx_strc;
  pCUDA_Memory<CUDA_Vtx_Data> cuda_vtx_data;
  pCUDA_Memory<float> cuda_tower_volumes;
  pCUDA_Memory<float3> cuda_centroid_parts;
  int tri_work_per_vtx;
  int tri_work_per_vtx_lg;
  dim3 Dg_tri, Db_tri, Dg_vtx, Db_vtx;
};

class World {
public:
  World(pOpenGL_Helper &fb):ogl_helper(fb),balloon(*this){init();}
  void init();
  static void render_w(void *moi){ ((World*)moi)->render(); }
  void render();
  void cb_keyboard();
  void modelview_update();
  void shadow_update();
  void shadow_transform_create(pMatrix& m, pCoor light);

  pOpenGL_Helper& ogl_helper;
  pVariable_Control variable_control;
  pFrame_Timer frame_timer;
  cudaEvent_t frame_start_ce, frame_stop_ce;
  double world_time;
  double delta_t;               // Duration of time step.

  // Tiled platform for balloon.
  //
  float platform_xmin, platform_xmax, platform_zmin, platform_zmax;
  pBuffer_Object<pVect> platform_tile_coords;
  pBuffer_Object<float> platform_tex_coords;

  Balloon balloon;
  pCoor light_location;
  float opt_light_intensity;
  enum { MI_Eye, MI_Light, MI_Balloon, MI_Balloon_V, MI_COUNT } opt_move_item;
  bool opt_pause;
  int opt_physics_method;
  int opt_physics_method_last;
  bool opt_surface_smooth;

  pCoor eye_location;
  pVect eye_direction;
  pMatrix modelview;
  pMatrix modelview_shadow;
  pMatrix transform_mirror;

  pShader vs_fixed;
};

void
World::init()
{
  const double radius = 5;
  pCoor center(13.7,12,-15.4);
  frame_timer.work_unit_set("Steps / s");
  world_time = 0;
  delta_t = 1.0 / ( 32 * 30 );
  balloon.data_location = DL_CPU;
  balloon.data_bid = 0;
  eye_location = pCoor(24.2,11.6,-38.7);
  eye_direction = pVect(-0.42,-0.09,0.9);
  opt_move_item = MI_Eye;
  opt_light_intensity = 100.2;
  opt_physics_method = GP_cuda_1_pass;
  opt_surface_smooth = true;
  platform_xmin = -40; platform_xmax = 40;
  platform_zmin = -40; platform_zmax = 40;
  light_location = pCoor(platform_xmax,platform_xmax,platform_zmin);
  balloon.need_cpu_iteration = true;
  balloon.length_relaxed_update = true;
  balloon.damping_v = 0.1;
  balloon.cpu_iteration = 0;
  balloon.opt_gravity = true;
  balloon.opt_damping = false;
  balloon.opt_cpu_interleave = false;
  balloon.damping_factor = 0.2;
  balloon.spring_constant = 40.0;
  balloon.rep_constant = 0.1;
  balloon.air_resistance = 0.001;
  balloon.gas_amount = 0;
  balloon.surface_mass = 1;
  balloon.e_zero = 0;
  balloon.opt_gravity_accel = 9.8;
  balloon.gas_particle_mass = 0.01;
  balloon.air_particle_mass = 0.01;
  balloon.temperature = 300;
  opt_pause = false;
  variable_control.insert(balloon.gas_amount,"Gas Amount");
  variable_control.insert(balloon.opt_gravity_accel,"Gravity");
  //  variable_control.insert(balloon.temperature,"Temperature");
  //  variable_control.insert(balloon.damping_v,"Damping Factor");
  variable_control.insert(balloon.air_resistance,"Air Resistance");
  variable_control.insert(opt_light_intensity,"Light Intensity");
  variable_control.insert(balloon.gas_particle_mass,"Gas Particle Mass");
  variable_control.insert(balloon.spring_constant,"Spring Constant");
  variable_control.insert(balloon.rep_constant,"Repulsion Constant");
  variable_control.insert(balloon.surface_mass,"Surface Mass");
  balloon.init(center,radius);

  modelview_update();

  const float tile_count = 19;
  const float ep = 1.00001;
  const float xdelta = ( platform_xmax - platform_xmin ) / tile_count * ep;
  const float zdelta = ( platform_zmax - platform_zmin ) / tile_count * ep;

  const float trmin = 0.05;
  const float trmax = 0.7;
  const float tsmin = 0;
  const float tsmax = 0.4;

  PStack<pVect> p_tile_coords;
  PStack<pVect> p1_tile_coords;
  PStack<float> p_tex_coords;
  bool even = true;

  for ( float x = platform_xmin; x < platform_xmax; x += xdelta )
    for ( float z = platform_zmin; z < platform_zmax; z += zdelta )
      {
        PStack<pVect>& t_coords = even ? p_tile_coords : p1_tile_coords;
        p_tex_coords += trmax; p_tex_coords += tsmax;
        t_coords += pVect(x,-0.01,z);
        p_tex_coords += trmax; p_tex_coords += tsmin;
        t_coords += pVect(x,-0.01,z+zdelta);
        p_tex_coords += trmin; p_tex_coords += tsmin;
        t_coords += pVect(x+xdelta,-0.01,z+zdelta);
        p_tex_coords += trmin; p_tex_coords += tsmax;
        t_coords += pVect(x+xdelta,-0.01,z);
        even = !even;
      }

  while ( pVect* const v = p1_tile_coords.iterate() ) p_tile_coords += *v;

  platform_tile_coords.take(p_tile_coords);
  platform_tile_coords.to_gpu();
  platform_tex_coords.take(p_tex_coords);
  platform_tex_coords.to_gpu();
}


void
Balloon::init(pCoor center, double r)
{
  radius = r;
  nom_volume = 4.0/3.0 * M_PI * r * r *r;
  const int equator_points = 60;
  const int slice_points_min = 6;
  static const double two_pi = 2 * M_PI;
  const double equator_interpoint_radians = two_pi / equator_points;
  const double equator_interpoint = r * equator_interpoint_radians;
  const double epsilon = 0.00001;
  point_count = 0;

  Balloon_Vertex* const pole_south = points.pushi();
  pole_south->mass_inv = 1;
  pole_south->theta = 10;
  pole_south->pos = center + pVect(0,0,r);
  pole_south->vel = pVect(0,0,0);
  pole_south->ring = 0;
  pole_south->eta = 0;
  point_count++;

  const double first_eta =
    asin( double(slice_points_min) / equator_points );
  const int rings =
    int(0.5 + ( M_PI - 2 * first_eta ) / equator_interpoint_radians);
  const double delta_eta = ( M_PI - 2 * first_eta ) / max(1,rings);
  int ring_count = 0;
  PStack<int> rings_first_idx;
  rings_first_idx += 0;

  for ( double eta = first_eta + epsilon;
        eta <= M_PI-first_eta;
        eta += delta_eta - epsilon )
    {
      ring_count++;
      const double z = r * cos(eta);
      const double slice_r = r * sin(eta);
      const int slice_points = int( two_pi * slice_r / equator_interpoint );
      const double delta_theta = two_pi / slice_points;
      const int ring_first_idx = point_count;
      const int lower_ring_first_idx = rings_first_idx.peek();
      ASSERTS( ring_count != 1 || slice_points == slice_points_min );
      ASSERTS( ring_count != rings + 1 || slice_points == slice_points_min );

      rings_first_idx += point_count;

      int lower_ring_idx = point_count - 1;
      if ( points[lower_ring_idx].theta < two_pi - delta_theta )
        lower_ring_idx = lower_ring_first_idx;
      const int lower_ring_first_connected = lower_ring_idx;
      double lower_ring_angle_adj =
        lower_ring_idx == lower_ring_first_idx ? 0 : -two_pi;
      const double theta_first = ring_count & 1 ? 0 : delta_theta * 0.5;

      int slice_idx = 0;
      for ( double theta = theta_first; theta < two_pi - 0.001;
            theta += delta_theta )
        {
          Balloon_Vertex* const point = points.pushi();
          point->mass_inv = 1;
          point->ring = ring_count;
          point->eta = eta;
          point->theta = theta;
          point->pos = center +
            pVect(slice_r * cos(theta), slice_r * sin(theta), z );
          point->vel = pVect(0,0,0);
          point->edge_out[point->edge_out_count++] =
            point_count - 1 + ( slice_idx ? 0 : slice_points );

          const double next_theta = theta + delta_theta;
          while ( true )
            {
              point->edge_out[point->edge_out_count++] = lower_ring_idx;
              const bool lr_last = lower_ring_idx + 1 == ring_first_idx;
              const int next_idx =
                lr_last ? lower_ring_first_idx : lower_ring_idx + 1;
              Balloon_Vertex* const n_lr_next = &points[next_idx];
              const double next_angle_adj =
                lower_ring_angle_adj + ( lr_last ? two_pi : 0 );
              const double lower_ring_angle = n_lr_next->theta + next_angle_adj;
              if ( lower_ring_angle > next_theta + 0.0001 ) break;
              if ( slice_idx > 1 &&
                   lower_ring_idx == lower_ring_first_connected ) break;
              lower_ring_idx = next_idx;
              lower_ring_angle_adj = next_angle_adj;
            }

          slice_idx++;  point_count++;
        }
    }

  {
    Balloon_Vertex* const pole_north = points.pushi();
    pole_north->mass_inv = 1;
    pole_north->theta = 10;
    pole_north->pos = center + pVect(0,0,-r);
    pole_north->vel = pVect(0,0,0);
    pole_north->ring = ++ring_count;
    pole_north->eta = M_PI;
    const int lower_ring_first_idx = rings_first_idx.peek();
    for ( int lower_ring_idx = lower_ring_first_idx;
          lower_ring_idx != point_count; lower_ring_idx++ )
      pole_north->edge_out[pole_north->edge_out_count++] = lower_ring_idx;
    pole_north->edge_out[pole_north->edge_out_count++] = lower_ring_first_idx;
    point_count++;
  }

  tethered_idx = 0;
  float min_y = pole_south->pos.y;
  for ( int idx = 0;  idx < point_count;  idx++ )
    {
      Balloon_Vertex* const p = &points[idx];
      ASSERTS( p->mass_inv > 0 );
      p->mass = 1.0 / p->mass_inv;

      if ( p->pos.y < min_y ) { tethered_idx = idx;  min_y = p->pos.y; }

      for ( int j = 0;  j < p->edge_out_count;  j++ )
        {
          const int qi = p->edge_out[j];
          Balloon_Vertex* const q = &points[qi];
          q->edge_in[q->edge_in_count++] = idx;
        }
    }

  PStack<GLuint> p_indices;
  const double tex_eta_min = 0.25 * M_PI;
  const double tex_eta_max = 0.75 * M_PI;
  const double tex_theta_min = 0;
  const double tex_theta_max = two_pi;
  const double eta_to_s = 1.0 / ( tex_eta_max - tex_eta_min );
  const double theta_to_s = 1.0 / ( tex_theta_max - tex_theta_min );

  PStack<float> gpu_tex_coords;

  for ( int idx = 0;  idx < point_count; idx++ )
    {
      Balloon_Vertex* const p = &points[idx];
      pColor color;

      p->tex_coor.x = ( p->theta - tex_theta_min ) * theta_to_s;
      p->tex_coor.y = ( p->eta - tex_eta_min ) * eta_to_s;
      gpu_tex_coords += p->tex_coor.x;
      gpu_tex_coords += p->tex_coor.y;

      switch ( p->ring & 0x3 ) {
      case 0: color = pColor(0.9,.1,.1); break;
      case 1: color = pColor(0.1,0.9,0.1); break;
      case 2: color = pColor(0.1,0.1,0.9); break;
      case 3: color = pColor(0.1,0.9,0.9); break;
      default: color = pColor(0.5,0.5,0.5); break;
      }

      const bool within_ring_first = !idx || p->ring != points[idx-1].ring;
      const bool within_ring_last =
        idx + 1 == point_count || p->ring != points[idx+1].ring;

      for ( int e = 0;  e < p->edge_out_count - 1;  e++ )
        {
          const int qi = p->edge_out[e];
          Balloon_Vertex* const q = &points[qi];
          const int ri = p->edge_out[e+1];
          Balloon_Vertex* const r = &points[ri];
          for ( int qe=0; qe < q->edge_out_count-1; qe++ )
            { ASSERTS( q->edge_out[qe] != ri ); }
          for ( int re=0; re < r->edge_out_count-1; re++ )
            { ASSERTS( r->edge_out[re] != idx ); }
          const int tri_idx = triangles.occ();
          Balloon_Triangle* const tri = triangles.pushi();
          tri->pi = idx;  tri->qi = qi;  tri->ri = ri;
          tri->color =
            within_ring_first && e == 0 ? pColor(0.9,0.9,0.9) :
            within_ring_last && e == p->edge_out_count -2
            ? pColor(0.1,0.1,0.1) : color;
          p->triangles += tri_idx;
          p_indices += idx;
          p_indices += qi;
          p_indices += ri;
        }
    }
  tri_count = triangles.occ();
  for ( int idx=0; idx<tri_count; idx++ )
    {
      Balloon_Triangle* const tri = &triangles[idx];
      points[tri->qi].triangles += idx;
      points[tri->ri].triangles += idx;
    }

  for ( int i=0; i<tri_count; i++ )
    {
      Balloon_Triangle* const tri = &triangles[i];
      const int pi = tri->pi;
      const int qi = tri->qi;
      const int ri = tri->ri;
      int opp_p = -1, opp_q = -1, opp_r = -1;
      for ( int j=0; j<tri_count; j++ )
        {
          if ( i == j ) continue;
          Balloon_Triangle* const tri2 = &triangles[j];
          if ( opp_r < 0 ) opp_r = tri2->third_vtx(pi,qi);
          if ( opp_p < 0 ) opp_p = tri2->third_vtx(ri,qi);
          if ( opp_q < 0 ) opp_q = tri2->third_vtx(ri,pi);
        }
      ASSERTS( opp_p != -1 && opp_q != -1 && opp_r != -1 );

#define OP_SET(v)                                                             \
      tri->v##i_opp = opp_##v;                                                \
      tri->v##i_less = v##i < opp_##v;                                        \
      if ( tri->v##i_less )                                                   \
        {                                                                     \
          Balloon_Rep_Pair* const rp = rep_pairs.pushi();                     \
          rp->pi = v##i;  rp->qi = opp_##v;                                   \
        }

      OP_SET(p); OP_SET(q); OP_SET(r);
    }

  rep_pair_count = rep_pairs.occ();

  texid_pse = pBuild_Texture_File("mult.png",false,255);
  //  texid_pse = pBuild_Texture_File("shot-emacs.png",false,255);
  tex_coords.take(gpu_tex_coords,GL_STATIC_DRAW);
  tex_coords.to_gpu();
  if ( 1 )
    texid_syl = pBuild_Texture_File("gp.png",false,255);
  else
    texid_syl = pBuild_Texture_File
      ("/home/faculty/koppel/teach/gpup09/gpup.png",false,255);

  point_indices.take(p_indices,GL_STATIC_DRAW,GL_ELEMENT_ARRAY_BUFFER);
  point_indices.to_gpu();

  glp_vtx_data.alloc(point_count,GL_DYNAMIC_COPY);
  glp_vtx_data.prepare_two_buffers();
  glp_tri_data.alloc(tri_count,GL_DYNAMIC_COPY);
  glp_tri_data.prepare_two_buffers();
  glp_vtx_strc.alloc(point_count,GL_STATIC_DRAW);
  glp_tri_strc.alloc(tri_count,GL_STATIC_DRAW);
  cuda_tri_strc.alloc(tri_count);
  cuda_vtx_strc.alloc(point_count);

  for ( int idx = 0;  idx < point_count; idx++ )
    {
      Balloon_Vertex* const p = &points[idx];
      GLP_Vtx_Strc* const vd = &glp_vtx_strc[idx];
      CUDA_Vtx_Strc* const vc = &cuda_vtx_strc[idx];
      vd->self_idx = float( idx );
      vd->left_idx = 0.5;
      int np = 0;
      for ( int ti = 0; p->triangles.iterate(ti); )
        {
          Balloon_Triangle* const tri = &triangles[ti];
          const int pos = tri->pi == idx ? 0 : tri->qi == idx ? 1 : 2;
          const int pos_packed = 4 * ti + pos;
          vd->neighbors[np] = pos_packed;
          ASSERTS( vd->neighbors[np] == pos_packed );
          np++;
        }
      ASSERTS( np < VTX_TRI_DEG_MAX );
      while ( np < VTX_TRI_DEG_MAX ) vd->neighbors[np++] = -1;
      typeof vc->n0* const vcn = &vc->n0;
      for ( int i=0; i<VTX_TRI_DEG_MAX; i++ ) vcn[i] = vd->neighbors[i];
    }

  for ( int idx=0; idx<tri_count; idx++ )
    {
      Balloon_Triangle* const tri = &triangles[idx];
      GLP_Tri_Strc* const td = &glp_tri_strc[idx];
      CUDA_Tri_Strc* const tc = &cuda_tri_strc[idx];
#     define CPY_IDX(I) \
       td->I = (typeof td->I)(tri->I);  tc->I = (typeof tc->I)(tri->I);
      CPY_IDX(pi);     CPY_IDX(qi);     CPY_IDX(ri);
      CPY_IDX(pi_opp); CPY_IDX(qi_opp); CPY_IDX(ri_opp);
      CPY_IDX(length_relaxed);
#     undef CPY_IDX
    }
}

void
Balloon::update_for_config()
{
  temp_ratio = temperature / 300;
  point_mass = surface_mass / point_count;
  point_mass_inv = 1.0 / point_mass;
  damping_factor_per_step = pow(damping_factor,world.delta_t);
  const double ell = pow(volume,1/3.);
  const double a = sqrt( 2.0 * ell * spring_constant * point_mass_inv  );
  oversample = M_PI / ( 2 * a * world.delta_t );
  tightness = a;
  gas_m_over_temp = 0.2 * gas_particle_mass / temp_ratio;
  pressure_factor_coeff = gas_amount * temp_ratio;
  const double mass_gas =
    ( pressure_gas(centroid.y - 0.5, pressure_factor_coeff)
      - pressure_gas(centroid.y + 0.5, pressure_factor_coeff) )
    / opt_gravity_accel;
  gas_mass_per_vertex = mass_gas / point_count;
}

void
World::modelview_update()
{
  pMatrix_Translate center_eye(-eye_location);
  pMatrix_Rotation rotate_eye(eye_direction,pVect(0,0,-1));
  modelview = rotate_eye * center_eye;
  shadow_update();
}

void
World::shadow_update()
{
  // These routines need to be made more general.
  pCoor platform_point(platform_xmin,0,platform_zmin);
  pVect platform_normal(0,1,0);
  shadow_transform_create(modelview_shadow,light_location);
  pCoor eye_loc_mirror(eye_location.x, -eye_location.y, eye_location.z);
  pMatrix reflect; reflect.set_identity(); reflect.rc(1,1) = -1;
  transform_mirror = modelview * reflect * invert(modelview);
}

void
World::shadow_transform_create(pMatrix& m, pCoor light_location)
{
  pVect platform_normal(0,1,0);
  pVect eye_normal(0,0,-1);
  pMatrix_Translate center_light(-light_location);
  pNorm axis(-platform_normal,eye_normal);
  const double angle = asin(axis.magnitude);
  pMatrix_Rotation rotate_platform(axis,angle);
  pMatrix frustum; frustum.set_zero();
#if 0
  const bool light_at_infinity = light_location.w == 0;
  if ( light_at_infinity )
    {
      frustum.rc(0,0) = frustum.rc(1,1) = frustum.rc(3,3) = 1;
    }
  else
#endif
    {
      frustum.rc(0,0) = frustum.rc(1,1) = light_location.y;
      frustum.rc(3,2) = -1;
    }
  pMatrix_Translate restore_z(0,0,-light_location.y);
  pMatrix step1 = rotate_platform * center_light;
  pMatrix to_platform = restore_z * frustum * rotate_platform * center_light;
  pMatrix_Rotation un_rotate_platform(axis,-angle);
  pMatrix_Translate un_center_light(light_location);
  pMatrix from_platform = un_center_light * un_rotate_platform;
  pMatrix project = from_platform * to_platform;
  modelview_shadow = modelview * from_platform * to_platform;

  // Compute coordinates to help with debugging.
  //
  pCoor test_pt(1.1,0,2.2);
  pCoor test_pt2(1.1,1,2.2);
  pCoor test_pt_a = step1 * test_pt;
  pCoor test_pt_b = to_platform * test_pt;  test_pt_b.homogenize();
  pCoor test_pt_pr = project * test_pt;  test_pt_pr.homogenize();
  pCoor test_pt2_pr = project * test_pt2;  test_pt2_pr.homogenize();
}

void
Balloon::translate(pVect amt)
{
  gpu_data_to_cpu();
  for ( int idx = 0;  idx < point_count; idx++ )
    points[idx].pos += amt;
  data_location = DL_CPU;
  data_bid = 0;
}

void
Balloon::push(pVect amt)
{
  gpu_data_to_cpu();
  for ( int idx = 0;  idx < point_count; idx++ )
    points[idx].vel += amt;
  data_location = DL_CPU;
  data_bid = 0;
}

void
Balloon::time_step_cpu(int steps)
{
  for ( int i=0; i<steps; i++ ) time_step_cpu_once();
}

void
Balloon::time_step_cpu_once()
{
  const double friction_coefficient = 0.04;
  const double bounce_factor = 0.0;
  const double delta_t = world.delta_t;
  pVect gravity(0,-opt_gravity_accel,0);
  if ( !opt_gravity ) gravity = pVect(0,0,0);
  const bool first_iteration = cpu_iteration == 0;
  cpu_iteration++;
  need_cpu_iteration = false;

  double volume_x2 = 0;
  double area_x2 = 0;
  double kinetic_energy_total = 0;
  double spring_energy_factor_total = 0;
  pVect surface_error2(0,0,0);
  centroid = pCoor(0,0,0,0);
  weight = pVect(0,0,0);

  for ( int idx = 0;  idx < point_count; idx++ )
    {
      Balloon_Vertex* const p = &points[idx];
      centroid += p->pos;
      kinetic_energy_total += p->vel.mag();
      p->force_spring = pVect(0,0,0);
      p->force_rep = pVect(0,0,0);
      p->surface_normal = pVect(0,0,0);
    }
  centroid.homogenize();

  for ( int i = 0;  i < tri_count;  i++ )
    {
      Balloon_Triangle* const tri = &triangles[i];
      Balloon_Vertex* const p = &points[tri->pi];
      Balloon_Vertex* const q = &points[tri->qi];
      Balloon_Vertex* const r = &points[tri->ri];

      pCoor center = 1./3 * ( p->pos + q->pos + r->pos );
      pVect pqr_cross(q->pos,p->pos,r->pos);

      p->surface_normal += pqr_cross;
      q->surface_normal += pqr_cross;
      r->surface_normal += pqr_cross;
      surface_error2 += pqr_cross;

      const float tower_volume_x2 = -pqr_cross.y * center.y;
      volume_x2 += tower_volume_x2;

      const float triangle_area_x2 = pqr_cross.mag();
      area_x2 += triangle_area_x2;

      pVect p_to_c(p->pos,center);
      pVect q_to_c(q->pos,center);
      pVect r_to_c(r->pos,center);

      const float perimeter = p_to_c.mag() + q_to_c.mag() + r_to_c.mag();

      if ( length_relaxed_update )
        {
          tri->length_relaxed =
            first_iteration ? perimeter * 0.5 : perimeter;
        }

      const float eff_length = max(0.0f, perimeter - tri->length_relaxed );
      const float spring_force = eff_length * spring_constant;

      p->force_spring += spring_force * p_to_c;
      q->force_spring += spring_force * q_to_c;
      r->force_spring += spring_force * r_to_c;

      const double spring_energy = eff_length;
      spring_energy_factor_total += spring_energy;
    }

  for ( int i=0; i<rep_pair_count; i++ )
    {
      Balloon_Rep_Pair* const rp = &rep_pairs[i];
      Balloon_Vertex* const p = &points[rp->pi];
      Balloon_Vertex* const q = &points[rp->qi];
      pNorm p_to_q(p->pos,q->pos);
      const double dist_sq_inv = rep_constant / max(0.001,p_to_q.mag_sq);
      pVect rep_force(dist_sq_inv * p_to_q);
      p->force_rep -= rep_force;
      q->force_rep += rep_force;
    }

  length_relaxed_update = false;
  volume = volume_x2 / 2.0;
  area = area_x2 / 2.0;

  if ( first_iteration )
    {
      const float exp_air = pressure_air(centroid.y);
      const float exp_gas = pressure_gas(centroid.y,1);

      double pf_sum = 0; // Pressure factor.
      double area_sum_x6 = 0;
      for ( int i=0; i<point_count; i++ )
        {
          Balloon_Vertex* const p = &points[i];
          pNorm inward(p->surface_normal);
          const double pf_balance = dot(p->force_spring,inward);
          pf_sum += pf_balance;
          area_sum_x6 += inward.magnitude;
        }

      const double area_sum = area_sum_x6 / 6;

      gas_amount = damping_v *
        ( pf_sum / area_sum + exp_air ) * volume / ( temp_ratio * exp_gas );

      update_for_config(); // Recompute pressure_factor_coeff.
    }

  pressure_compute();

  const double spring_energy =
    12 * pow(point_mass,-0.5) * spring_energy_factor_total;
  const double kinetic_energy =
    point_mass * kinetic_energy_total;
  e_spring = spring_energy;
  e_kinetic = kinetic_energy;
  energy = e_spring + e_kinetic;

  pVect surface_error(0,0,0);
  for ( int i=0; i<point_count; i++ )
    {
      Balloon_Vertex* const p = &points[i];
      surface_error += p->surface_normal;
    }

  for ( int i=0; i<point_count; i++ )
    {
      Balloon_Vertex* const p = &points[i];
      const float gas_pressure = pressure_gas(p->pos.y);
      const float air_pressure = pressure_air(p->pos.y);

      p->surface_normal *= 1./6;

      p->force_pressure =
        ( air_pressure - gas_pressure ) * p->surface_normal;

      p->force = p->force_pressure;

      pNorm vel_norm(-p->vel);
      const double facing_area = max(0.0,dot(vel_norm,p->surface_normal));
      pVect force_ar = - air_resistance * facing_area * p->vel;

      pVect gforce = point_mass * p->mass * gravity;
      p->force += gforce;

      weight += p->force;
      p->force += force_ar;

      pVect force_ns = p->force; // Force non-spring.
      pVect force_s = p->force_spring + p->force_rep;

      p->force += force_s;

      const float mass_wgas_inv_dt =
        delta_t / ( point_mass * p->mass + gas_mass_per_vertex );

      pVect delta_vns = mass_wgas_inv_dt * force_ns;
      pVect delta_vs = mass_wgas_inv_dt * force_s;
      pVect delta_v = delta_vns + delta_vs;

      //  pVect pos_verlet = p->pos - pos_prev + delta_t * delta_v;

      p->pos_prev = p->pos;
      p->pos += ( p->vel +  0.5 * delta_v ) * delta_t;
      p->vel += damping_v * delta_vs + delta_vns;

    }

  if ( opt_damping )
    {
      pVect vel_avg = velocity_avg();

      for ( int i=0; i<point_count; i++ )
        {
          Balloon_Vertex* const p = &points[i];
          pVect local_vel = p->vel - vel_avg;
          p->vel = damping_factor_per_step * local_vel;
        }
      pVect vel_avg2 = velocity_avg();
      pVect vel_fix = vel_avg - vel_avg2;
      for ( int i=0; i<point_count; i++ ) points[i].vel += vel_fix;
    }

  if ( first_iteration ) e_zero = energy;

  for ( int idx = 0;  idx < point_count; idx++ )
    {
      Balloon_Vertex* const p = &points[idx];
      if ( p->pos.x < world.platform_xmin
           || p->pos.x > world.platform_xmax
           || p->pos.z < world.platform_zmin
           || p->pos.z > world.platform_zmax ) continue;
      if ( p->pos.y > 0 ) continue;
      if ( p->pos_prev.y < 0 ) continue;
      p->pos.y = 0;
      p->vel.y = - bounce_factor * p->vel.y;
      const float gas_pressure = pressure_gas(p->pos.y);
      pVect gforce = point_mass * p->mass * gravity;
      const float f_y =
        gforce.y + p->force_spring.y - gas_pressure * p->surface_normal.y;
      if ( f_y >= 0 ) continue;
      const float friction_force = -f_y * friction_coefficient;
      const float delta_v = friction_force * delta_t / ( point_mass*p->mass );
      const pNorm xzvel(p->vel.x,0,p->vel.z);
      if ( xzvel.magnitude <= delta_v ) {
        p->vel.x = 0;  p->vel.z = 0;
      }
      else
        p->vel -= delta_v * xzvel;
    }
}


#define TRY_XF_FEEDBACK(routine,vertex_count)                                 \
  for ( int feedback_tries = 0; ; feedback_tries++ )                          \
    {                                                                         \
      bool check = false;                                                     \
      glBeginTransformFeedbackNV(GL_POINTS); pError_Check();                  \
      if ( check )                                                            \
        glBeginQuery                                                          \
          (GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN_NV,                       \
           query_transform_feedback_id );                                     \
      routine;                                                                \
      glEndTransformFeedbackNV();  pError_Check();                            \
      if ( !check ) break;                                                    \
      glEndQuery(GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN_NV);                \
      int done_points = -1;                                                   \
      glGetQueryObjectiv                                                      \
        (query_transform_feedback_id,GL_QUERY_RESULT,&done_points);           \
      pError_Check();                                                         \
      if ( vertex_count == done_points ) break;                               \
      printf("Warning, xf only got %d points, trying again.\n",done_points);  \
      if ( feedback_tries > 10 ) pError_Exit();                               \
    }

void
Balloon::time_step_gpu(int steps)
{
  if ( world.opt_physics_method == GP_glp ) time_step_glp(steps);
  else                                      time_step_cuda(steps);
}

void
Balloon::init_glp()
{
  glp_initialized = true;

  glGenTextures(1,&glp_tri_data_tid);
  glGenTextures(1,&glp_vtx_data_tid);
  pError_Check();
  glGenQueries(1,&query_transform_feedback_id);
  pError_Check();

  glGenFramebuffersEXT(1,&framebuffer_id);
  glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, framebuffer_id);
  glGenRenderbuffersEXT(1, &renderbuffer_id);
  glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, renderbuffer_id);

  // GL_MAX_RENDERBUFFER_SIZE_EXT  Maximum size of either dimension.
  glRenderbufferStorageEXT  // p 610
    (GL_RENDERBUFFER_EXT, GL_FLOAT_RGBA_NV, 2, 2);
  glFramebufferRenderbufferEXT
    (GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_RENDERBUFFER_EXT,
     renderbuffer_id);

  glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0 );
  glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, 0 );
  pError_Check();

  vs_plan_c.init("balloon-shader.cc","main_physics_plan_c();");
  stx_data_vtx = vs_plan_c.uniform_location("tex_data_vtx");
  stx_data_tri = vs_plan_c.uniform_location("tex_data_tri");
  sun_constants_sc = vs_plan_c.uniform_location("constants_sc");
  sun_constants_gas = vs_plan_c.uniform_location("constants_gas");
  sun_constants_dt = vs_plan_c.uniform_location("constants_dt");
  sun_platform = vs_plan_c.uniform_location("platform");
  sat_volume = vs_plan_c.attribute_location("volume");
  sat_indices = vs_plan_c.attribute_location("in_indices");
  sat_pos = vs_plan_c.attribute_location("in_pos");
  sat_vel = vs_plan_c.attribute_location("in_vel");
  svl_surface_normal = vs_plan_c.varying_location("out_surface_normal");
  svl_force_or_v = vs_plan_c.varying_location("out_force_or_v");
  svl_pos = vs_plan_c.varying_location("out_pos");
  svl_force_r = vs_plan_c.varying_location("out_force_r");
  vs_plan_c.print_active_varying();
  vs_plan_c.validate_once();
}

void
Balloon::time_step_glp(int steps)
{
  if ( !glp_initialized ) init_glp();

  cpu_data_to_glp();

  glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, framebuffer_id);

  glDisable(GL_DEPTH_TEST);
  glDisable(GL_STENCIL_TEST);
  glDisable(GL_TEXTURE_2D);
  glDisable(GL_ALPHA_TEST);
  glEnable(GL_BLEND);
  glBlendEquation(GL_FUNC_ADD);
  glBlendFunc(GL_ONE,GL_ONE);
  glClampColorARB(GL_CLAMP_VERTEX_COLOR_ARB,GL_FALSE);
  pError_Check();
  glClearColor( 0.0, 0.0, 0.0, 0.0 );

  pError_Check();

  vs_plan_c.use();

  glUniform1i(stx_data_vtx,0);
  glUniform1i(stx_data_tri,1);

  glUniform4f
    (sun_constants_sc,spring_constant,damping_v,
     pressure_factor_coeff,
     gas_m_over_temp);

  glUniform4f
    (sun_constants_gas,
     air_resistance, gas_mass_per_vertex,
     air_particle_mass, opt_gravity ? opt_gravity_accel : 0.0 );

  glUniform4f
    (sun_constants_dt,
     world.delta_t, rep_constant, point_mass, point_mass_inv);

  glUniform4f
    (sun_platform,
     world.platform_xmin, world.platform_xmax,
     world.platform_zmin, world.platform_zmax);

  const GLint svl_p1[] =
    { svl_surface_normal, svl_force_or_v, svl_pos, svl_force_r };
  glTransformFeedbackVaryingsNV
    (vs_plan_c.pobject, 4, &svl_p1[0], GL_INTERLEAVED_ATTRIBS_NV);
  pError_Check();

  GLP_Vtx_Data before = glp_vtx_data.data[0];

  if ( steps ) data_location = DL_GLP;

  pError_Check();

  glEnableClientState(GL_VERTEX_ARRAY);

  glActiveTexture(GL_TEXTURE0);  pError_Check();
  glBindTexture(GL_TEXTURE_BUFFER_EXT,glp_vtx_data_tid);   pError_Check();

  glActiveTexture(GL_TEXTURE1);
  glBindTexture(GL_TEXTURE_BUFFER_EXT,glp_tri_data_tid);  pError_Check();

  glEnableVertexAttribArray(sat_indices);

  for ( int i=0; i<steps; i++ )
    {
      const bool skip_volume = i + 1 != steps && true && ( i & 0x3 );

      //
      // Pass 1, Triangles
      //

      glActiveTexture(GL_TEXTURE0);
      glTexBufferEXT // Attaches to the active buffer texture.
        (GL_TEXTURE_BUFFER_EXT, GL_RGBA32F_ARB, glp_vtx_data.bid_read());

      glActiveTexture(GL_TEXTURE1);
      glBindTexture(GL_TEXTURE_BUFFER_EXT,0);

      const int tstride = sizeof(glp_tri_strc[0]);

      glp_tri_strc.bind();
      glVertexAttribIPointerEXT(sat_indices, 4, GL_INT, tstride, (void*)16);

      glVertexPointer(4, GL_FLOAT, sizeof(glp_tri_strc[0]), 0);
      glBindBuffer(GL_ARRAY_BUFFER,0);

      glBindBufferBaseNV
        (GL_TRANSFORM_FEEDBACK_BUFFER_NV, 0, glp_tri_data.bid_fresh());
      pError_Check();

      if ( skip_volume )
        {
          glEnable(GL_RASTERIZER_DISCARD_NV);
        }
      else
        {
          glDisable(GL_RASTERIZER_DISCARD_NV);
          glClear(GL_COLOR_BUFFER_BIT);
        }
      TRY_XF_FEEDBACK( glDrawArrays(GL_POINTS,0,tri_count), tri_count);

      if ( !skip_volume )
        {
          glReadBuffer(GL_COLOR_ATTACHMENT0_EXT); pError_Check();
          pCoor pb[4];
          glReadPixels(0,0,2,2,GL_RGBA,GL_FLOAT,&pb[0]);
          pError_Check();
          centroid = (1.0/tri_count)*pb[0];
          volume = 0.5 * pb[0].w;
        }

      if ( false )
        {
          glp_tri_data.from_gpu();
          GLP_Tri_Data after_sf = glp_tri_data.data[0];
          pError_Msg("Check.");
        }

      //
      // Pass 2, Vertices
      //

      glEnable(GL_RASTERIZER_DISCARD_NV);
      glVertexAttrib1f(sat_volume,volume); pError_Check();

      glActiveTexture(GL_TEXTURE0);
      glBindTexture(GL_TEXTURE_BUFFER_EXT,0);

      glActiveTexture(GL_TEXTURE1);
      glBindTexture(GL_TEXTURE_BUFFER_EXT,glp_tri_data_tid);  pError_Check();
      glTexBufferEXT
        (GL_TEXTURE_BUFFER_EXT, GL_RGBA32F_ARB, glp_tri_data.bid);
      pError_Check();

      glp_vtx_strc.bind();
      const int vstride = sizeof(glp_vtx_strc[0]);
      glVertexPointer(2, GL_FLOAT, vstride, 0);

      glVertexAttribIPointerEXT(sat_indices, 4, GL_INT, vstride, (void*)8);

      const int dvstride = sizeof(GLP_Vtx_Data);

      glp_vtx_data.bind();
      glVertexAttribPointer(sat_vel, 4, GL_FLOAT, false, dvstride, (void*)16);
      glVertexAttribPointer(sat_pos, 4, GL_FLOAT, false, dvstride, (void*)32);
      glEnableVertexAttribArray(sat_pos);
      glEnableVertexAttribArray(sat_vel);

      glBindBuffer(GL_ARRAY_BUFFER,0);
      glBindBufferBaseNV
        (GL_TRANSFORM_FEEDBACK_BUFFER_NV, 0, glp_vtx_data.bid_write());
      pError_Check();

      TRY_XF_FEEDBACK( glDrawArrays(GL_POINTS,0,point_count), point_count);

      glDisableVertexAttribArray(sat_pos);
      glDisableVertexAttribArray(sat_vel);

      glp_vtx_data.bid_swap();
    }

  if ( false )
    {
      GLP_Vtx_Data after = glp_vtx_data.data[0];
      pError_Msg("Check.");
    }

  world.vs_fixed.use();

  glDisableClientState(GL_VERTEX_ARRAY);
  glDisableVertexAttribArray(sat_indices);

  glBindBuffer(GL_ARRAY_BUFFER,0);

  glClampColorARB(GL_CLAMP_VERTEX_COLOR_ARB,GL_TRUE);
  glDisable(GL_RASTERIZER_DISCARD_NV);
  glActiveTexture(GL_TEXTURE0);
  glBindTexture(GL_TEXTURE_BUFFER_EXT,0);
  glActiveTexture(GL_TEXTURE1);
  glBindTexture(GL_TEXTURE_BUFFER_EXT,0);

  glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0 );

  data_bid = glp_vtx_data.bid_read();
  data_stride = sizeof(glp_vtx_data[0]);
  pressure_compute();

  pError_Check();
}

void
Balloon::cuda_data_partition()
{
  const int block_lg = CUDA_VTX_BLOCK_LG;
  const int block_size = 1 << block_lg;
  const int block_mask = block_size - 1;
  const int block_count = int(0.9999 + double(point_count) / block_size);
  for ( int i=0; i<tri_count; i++ ) triangles[i].block_number = -1;
  PStack<CUDA_Tri_Work_Strc> tri_work;
  PStack<int> work_sizes;
  int work_max = 0;
  for ( int i=0; i<block_count; i++ )
    {
      const int start = i * block_size;
      const int stop = start + block_size;
      int uniq_tris = 0;
      for ( int vi=start; vi<stop; vi++ )
        {
          Balloon_Vertex* const p = &points[vi];
          for ( int ti = 0; p->triangles.iterate(ti); )
            {
              Balloon_Triangle* const tri = &triangles[ti];
              if ( tri->block_number == i ) continue;
              tri->block_number = i;
              uniq_tris++;
              CUDA_Tri_Work_Strc* const tw = tri_work.pushi();
              tw->pi = int(tri->pi); tw->qi = int(tri->qi);
              tw->ri = int(tri->ri);
              tw->vi_opp0 = tw->vi_opp1 = tw->vi_opp2 = -1;
              tw->length_relaxed = float(ti);
            }
        }
      work_sizes += uniq_tris;
      work_max = max(work_max,uniq_tris);
    }

  tri_work_per_vtx_lg = int(0.998 + log2(double(work_max)/block_size));
  tri_work_per_vtx = 1 << tri_work_per_vtx_lg;
  ASSERTS( tri_work_per_vtx_lg <= 3 );
  const int work_per_block = tri_work_per_vtx * block_size;
  const int work_count = work_per_block * block_count;
  cuda_tri_work_strc.alloc( work_count );
  int ci = 0, wi = 0;
  CUDA_Tri_Work_Strc tw_pad; memset(&tw_pad,-1,sizeof(tw_pad));
  int waste = 0;
  for ( int amt = 0; work_sizes.iterate(amt); )
    {
      for ( int i=0; i<amt; i++ ) cuda_tri_work_strc[ci++] = tri_work[wi++];
      int pad = work_per_block - amt;
      waste += pad;
      while ( pad-- ) cuda_tri_work_strc[ci++] = tw_pad;
    }
  for ( int i=0; i<work_count; i++ ) cuda_tri_work_strc[i].pull_i = 0;
  int max_pull = 0;
  const int pull_limit = 4;
  for ( int i=0; i<work_count; i++ )
    {
      CUDA_Tri_Work_Strc* const tw = &cuda_tri_work_strc[i];
      if ( tw->pi == -1 ) continue;
      const int ti = int(tw->length_relaxed);
      Balloon_Triangle* const tri = &triangles[ti];
      typeof tri->pi_opp* const t_optr = &tri->pi_opp;
      const int round = i % tri_work_per_vtx;
      const int blk = i / tri_work_per_vtx >> block_lg;
      const int tid = i / tri_work_per_vtx & block_mask;
      typeof tw->pi* const w_vptr = &tw->pi;
      for ( int v=0; v<3; v++ )
        {
          const int vi = w_vptr[v];
          const int vi_blk = vi >> block_lg;
          if ( vi_blk != blk ) continue;
          const int vi_tri = vi * tri_work_per_vtx + round;
          CUDA_Tri_Work_Strc* const tv = &cuda_tri_work_strc[vi_tri];
          typeof tv->vi_opp0* const w_optr = &tv->vi_opp0;
          typeof tv->pull_tid_0* const w_pptr = &tv->pull_tid_0;
          ASSERTS( tv->pull_i < pull_limit );
          w_optr[tv->pull_i] = t_optr[v];
          w_pptr[tv->pull_i] = tid;
          tv->pull_i++;
          if ( tv->pull_i > max_pull ) max_pull = tv->pull_i;
        }
    }
  for ( int i=0; i<work_count; i++ )
    {
      CUDA_Tri_Work_Strc* const tw = &cuda_tri_work_strc[i];
      tw->pull_i <<= 1;
      if ( tw->pi == -1 ) continue;
      const int ti = int(tw->length_relaxed);
      Balloon_Triangle* const tri = &triangles[ti];
      if ( tri->block_number != block_count )
        {
          tw->pull_i |= 1;
          tri->block_number = block_count;
        }
      tw->length_relaxed = tri->length_relaxed;
    }
}

template <typename T> void to_dev_ds(const char* const dst_name, T src)
{
  T cpy = src;
  CE(cudaMemcpyToSymbol(dst_name, &cpy, sizeof(T), 0, cudaMemcpyHostToDevice));
}

void to_dev_ds(const char* const dst_name, double& src)
{
  ASSERTS( false );
}

#define TO_DEV_DS(dst,src) to_dev_ds(#dst,src);
#define TO_DEV(var) to_dev_ds<typeof var>(#var,var)
#define TO_DEV_OM(obj,memb) to_dev_ds(#memb,obj.memb)
#define TO_DEV_OM_F(obj,memb) to_dev_ds(#memb,float(obj.memb))

void
Balloon::init_cuda()
{
  cuda_initialized = true;
  cuda_constants_stale = true;
  int device_count;
  cudaGetDeviceCount(&device_count);
  ASSERTS( device_count );
  const int dev = 0;
  cudaDeviceProp prop;
  CE(cudaGetDeviceProperties(&prop,dev));
  CE(cudaGLSetGLDevice(dev));
  printf
    ("GPU: %s @ %.2f GHz WITH %d MiB GLOBAL MEM\n",
     prop.name, prop.clockRate/1e6, prop.totalGlobalMem >> 20);
  printf
    ("CAP: %d.%d  NUM MP: %d  TH/BL: %d  SHARED: %d  CONST: %d  "
     "# REGS: %d\n",
     prop.major, prop.minor,
     prop.multiProcessorCount, prop.maxThreadsPerBlock,
     prop.sharedMemPerBlock, prop.totalConstMem,
     prop.regsPerBlock
     );

  CE(cudaEventCreate(&world.frame_start_ce));
  CE(cudaEventCreate(&world.frame_stop_ce));

  cuda_vtx_strc.to_cuda();
  TO_DEV_DS(vtx_strc,cuda_vtx_strc.get_dev_addr());

  cuda_tri_strc.to_cuda();
  TO_DEV_DS(tri_strc,cuda_tri_strc.get_dev_addr());

  cuda_tri_data.alloc(tri_count);
  TO_DEV_DS(tri_data,cuda_tri_data.get_dev_addr());

  cuda_vtx_data.alloc(point_count);

  const int max_block_count =
    int(0.5
        + double(tri_count)/min(CUDA_TRI_BLOCK_SIZE,CUDA_VTX_BLOCK_SIZE));

  cuda_tower_volumes.alloc(max_block_count);
  TO_DEV_DS(tower_volumes,cuda_tower_volumes.get_dev_addr());

  cuda_centroid_parts.alloc(max_block_count);
  TO_DEV_DS(centroid_parts,cuda_centroid_parts.get_dev_addr());

  cuda_data_partition();
  cuda_tri_work_strc.to_cuda();
  TO_DEV_DS(tri_work_strc,cuda_tri_work_strc.get_dev_addr());
  TO_DEV(tri_work_per_vtx);
  TO_DEV(tri_work_per_vtx_lg);

  Dg_tri.x = int(ceil(double(tri_count)/CUDA_TRI_BLOCK_SIZE));
  Dg_tri.y = Dg_tri.z = 1;
  Db_tri.x = CUDA_TRI_BLOCK_SIZE;
  Db_tri.y = Db_tri.z = 1;

  Dg_vtx.x = int(ceil(double(point_count)/CUDA_VTX_BLOCK_SIZE));
  Dg_vtx.y = Dg_vtx.z = 1;
  Db_vtx.x = CUDA_VTX_BLOCK_SIZE;
  Db_vtx.y = Db_vtx.z = 1;

  for ( int i=0; i<2; i++ )
    {
      dim3 Dg = i==0 ? Dg_tri : Dg_vtx;
      dim3 Db = i==0 ? Db_tri : Db_vtx;
      const char* const pass = i==0 ? "Triangle" : "Vertex";

      const double warp_per_block = double(Db.x) / 32;
      const double block_per_mp = double(Dg.x) / prop.multiProcessorCount;
      const double warp_per_mp = ceil(warp_per_block) * block_per_mp;
      printf("\n%s pass block size %d thds, %d warps.\n",
             pass, Db.x, Db.x >> 5);
      printf("Grid size %d blks,  %.1f blks / MP,  %.1f warps / MP.\n",
             Dg.x,
             block_per_mp, warp_per_mp );
      const double warp_util = double(min(32u,Db.x))/32;
      const int data_dist = 2;
      const int latency_fp_warps = 24 / 4;
      const int latency_mem_warps = 400 / 4;

      // Assume no switch between blocks to hide fp latency.
      //
      const double lat_coverage_fp =
        min( warp_per_block * data_dist / latency_fp_warps, 1.0 );

      // Assume gpu does switch between blocks to hide mem latency.
      //
      const double lat_coverage_mem =
        min( warp_per_mp * data_dist / latency_mem_warps, 1.0 );

      printf("Approx CP util fp  %.3f * %.3f = %.3f (%.3f) for avg dist %d\n",
             warp_util, lat_coverage_fp,
             warp_util * lat_coverage_fp,
             warp_util * lat_coverage_fp * prop.multiProcessorCount,
             data_dist);

      printf("Approx CP util mem %.3f * %.3f = %.3f (%.3f) for avg dist %d\n",
             warp_util, lat_coverage_mem,
             warp_util * lat_coverage_mem,
             warp_util * lat_coverage_mem * prop.multiProcessorCount,
             data_dist);
    }
}


void
Balloon::time_step_cuda(int steps)
{
  static int cuda_iteration = 0;
  cuda_iteration++;

  if ( !cuda_initialized ) init_cuda();

  CE(cudaEventRecord(world.frame_start_ce,0));
  cpu_data_to_cuda();

  if ( cuda_constants_stale )
    {
      cuda_constants_stale = false;
      TO_DEV_DS(volume_cpu,volume);  // For debugging, not used by gpu.
      TO_DEV(tri_count);
      TO_DEV(point_count);

      TO_DEV(spring_constant);
      TO_DEV(damping_v);
      TO_DEV(pressure_factor_coeff);
      TO_DEV(gas_m_over_temp);

      TO_DEV(air_resistance);
      TO_DEV(gas_mass_per_vertex);
      TO_DEV(air_particle_mass);
      float gravity_mag = opt_gravity ? opt_gravity_accel : 0;
      TO_DEV(gravity_mag);
      TO_DEV(opt_gravity);

      TO_DEV_OM_F(world,delta_t);
      TO_DEV(rep_constant);
      TO_DEV(point_mass);
      TO_DEV(point_mass_inv);

      TO_DEV_OM(world,platform_xmin);
      TO_DEV_OM(world,platform_xmax);
      TO_DEV_OM(world,platform_zmin);
      TO_DEV_OM(world,platform_zmax);
      cuda_tower_volumes.set_primary();
    }

  const CUDA_Vtx_Data vtest = cuda_vtx_data[42];

  if ( steps ) data_location = DL_CUDA;
  const bool two_pass = world.opt_physics_method == GP_cuda_2_pass;

  for ( int i=0; i<steps; i++ )
    {
      CUDA_Vtx_Data* const vtx_data_in_d = cuda_vtx_data.get_dev_addr_read();
      CUDA_Vtx_Data* const vtx_data_out_d = cuda_vtx_data.get_dev_addr_write();

      if ( two_pass )
        {
          pass_triangles_launch
            (Dg_tri, Db_tri, vtx_data_in_d, cuda_vtx_data.chars);

          pass_vertices_launch
            (Dg_vtx, Db_vtx,
             cuda_tri_data.get_dev_addr(), vtx_data_out_d,
             cuda_tri_data.chars);
        }
      else
        {
          float* const tv_in = cuda_tower_volumes.get_dev_addr_read();
          float* const tv_out = cuda_tower_volumes.get_dev_addr_write();

          pass_unified_launch
            (Dg_vtx, Db_vtx,
             vtx_data_in_d, vtx_data_out_d, tv_in, tv_out,
             cuda_tri_data.chars, cuda_vtx_data.chars);

          cuda_tower_volumes.swap();
        }

      cuda_vtx_data.swap();
    }

  if ( false )
    {
      // Due to a CUDA bug this is slower than just copying through host.
      cuda_vtx_data.cuda_to_gl();
      //  cuda_centroid_parts.from_cuda();
      data_bid = cuda_vtx_data.bid;
      data_stride = sizeof(cuda_vtx_data[0]);
    }
  else
    {
      cuda_data_to_cpu();
    }

  {
    cuda_tower_volumes.from_cuda();
    const int blocks = two_pass ? Dg_tri.x : Dg_vtx.x;
    volume = 0;
    for ( int i=0; i<blocks; i++ ) volume += cuda_tower_volumes[i];
  }

  CE(cudaEventRecord(world.frame_stop_ce,0));
  CE(cudaEventSynchronize(world.frame_stop_ce));
  float cuda_time = -1.1;
  CE(cudaEventElapsedTime(&cuda_time,world.frame_start_ce,world.frame_stop_ce));
  world.frame_timer.cuda_frame_time_set(cuda_time);
  if ( !opt_cpu_interleave )
    {
      centroid_compute();
      pressure_compute();
    }

  CUDA_Vtx_Data vtest_aftera = cuda_vtx_data[42];

#undef TO_DEV
#undef TO_DEV_OM
}

void
Balloon::gpu_data_to_cpu()
{
  if ( world.opt_physics_method == GP_glp ) glp_data_to_cpu();
  else                                      cuda_data_to_cpu();
}

void
Balloon::glp_data_to_cpu()
{
  if ( data_location & DL_CPU ) return;
  data_location |= DL_CPU;
  glp_vtx_data.from_gpu();
  for ( int idx=0; idx<point_count; idx++ )
    {
      Balloon_Vertex* const p = &points[idx];
      GLP_Vtx_Data* const g = &glp_vtx_data[idx];
      p->pos = g->pos;
      p->vel = g->vel;
      p->surface_normal = g->surface_normal;
    }
}

void
Balloon::cuda_data_to_cpu()
{
  if ( data_location & DL_CPU ) return;
  data_location |= DL_CPU;
  cuda_vtx_data.from_cuda();
  for ( int idx=0; idx<point_count; idx++ )
    {
      Balloon_Vertex* const p = &points[idx];
      CUDA_Vtx_Data* const g = &cuda_vtx_data[idx];
      vec_sets(p->pos,g->pos);
      vec_sets(p->vel,g->vel);
      vec_sets(p->surface_normal,g->surface_normal);
    }
}

void
Balloon::cpu_data_to_glp()
{
  if ( data_location & DL_GLP ) return;
  data_location |= DL_GLP;

  for ( int idx=0; idx<point_count; idx++ )
    {
      Balloon_Vertex* const p = &points[idx];
      GLP_Vtx_Data* const g = &glp_vtx_data[idx];
      g->pos = p->pos;
      g->vel = p->vel;
      g->surface_normal = p->surface_normal;
    }

  for ( int idx=0; idx<tri_count; idx++ )
    {
      Balloon_Triangle* const tri = &triangles[idx];
      GLP_Tri_Strc* const td = &glp_tri_strc[idx];
      td->length_relaxed = tri->length_relaxed;
    }

  glp_vtx_data.to_gpu();
  glp_vtx_strc.to_gpu();
  glp_tri_strc.to_gpu();
  glBindBuffer(GL_ARRAY_BUFFER,0);
}

bool
Balloon::cpu_data_to_cuda()
{
  if ( data_location & DL_CUDA ) return false;
  data_location |= DL_CUDA;

  for ( int idx=0; idx<point_count; idx++ )
    {
      Balloon_Vertex* const p = &points[idx];
      CUDA_Vtx_Data* const g = &cuda_vtx_data[idx];
      vec_set(g->pos,p->pos);
      vec_set(g->vel,p->vel);
    }

  for ( int idx=0; idx<tri_count; idx++ )
    {
      Balloon_Triangle* const tri = &triangles[idx];
      CUDA_Tri_Strc* const tc = &cuda_tri_strc[idx];
      tc->length_relaxed = tri->length_relaxed;
    }

  for ( int idx=1; idx<cuda_tower_volumes.elements; idx++ )
    cuda_tower_volumes[idx] = 0;
  cuda_tower_volumes[0] = volume;
  cuda_tower_volumes.set_primary();
  cuda_tower_volumes.to_cuda();
  cuda_tower_volumes.swap();
  cuda_tower_volumes.to_cuda();
  cuda_tower_volumes.set_primary();

  cuda_tri_strc.to_cuda();  // OPT: Only send if dirty.
  cuda_vtx_data.to_cuda();
  cuda_vtx_strc.to_cuda();  // OPT: Only send if dirty.
  return true;
}


// Display a tetrahedron, used to indicate light position.
//
void
insert_tetrahedron(pCoor& loc, float size)
{
  pCoor v0(loc.x,loc.y,loc.z);
  pCoor v1(loc.x,loc.y-size,loc.z+size);
  pCoor v2(loc.x-.866*size,loc.y-size,loc.z-0.5*size);
  pCoor v3(loc.x+.866*size,loc.y-size,loc.z-0.5*size);
  static pColor c1(0xffffff);
  static pColor c2(0xff00);

  glDisable(GL_LIGHTING);

#define TRI(va,vb,vc)                                                         \
  {                                                                           \
    pVect n = cross(va,vb,vc);                                                \
    glNormal3fv(n);                                                           \
    glColor3fv(c1);  glVertex3fv(va);                                         \
    glColor3fv(c2);  glVertex3fv(vb);                                         \
    glVertex3fv(vc);                                                          \
  }

  glBegin(GL_TRIANGLES);
  TRI(v0,v1,v2); TRI(v0,v2,v3); TRI(v0,v3,v1);
  glEnd();

# undef TRI

  glEnable(GL_LIGHTING);
}

void
tube_tapered(pCoor base, float radius, pVect to_apex)
{
  const int sides = 10;
  const double delta_theta = 2 * M_PI / sides;
  const double base_radius = 1;
  const double apex_radius = 0.1;
  const double apex_height = 1;
  const double alpha = atan2(apex_height,base_radius-apex_radius);
  const double vec_z = sin(alpha);
  const float to_height = to_apex.mag();
  glMatrixMode(GL_MODELVIEW);
  glPushMatrix();

  pVect from_apex(0,0,1);
  pVect rn(from_apex,to_apex);
  const float rot_angle = pangle(from_apex,to_apex);
  glTranslatef(base.x,base.y,base.z);
  glRotatef(rot_angle * 180.0 / M_PI,rn.x,rn.y,rn.z);
  glScalef(radius,radius,to_height);
  glBegin(GL_QUAD_STRIP);
  for ( int i=0; i<=sides; i++ )
    {
      const double theta = delta_theta * i;
      const double cos_t = cos(theta);
      const double sin_t = sin(theta);
      glNormal3f( cos_t, sin_t, vec_z );
      glVertex3f( apex_radius * cos_t, apex_radius * sin_t, apex_height);
      glVertex3f( base_radius * cos_t, base_radius * sin_t, 0);
    }
  glEnd();
  glPopMatrix();
}


void
World::render()
{
  cb_keyboard();
  frame_timer.frame_start();

  if ( opt_physics_method == GP_cpu ) balloon.gpu_data_to_cpu();

  if ( world_time == 0 ) world_time = time_wall_fp();

  if ( opt_pause )
    {
      world_time = time_wall_fp();
    }
  else
    {
      // Advance simulated time.
      //
      const double time_start = time_wall_fp();
      const double sim_time_needed = time_start - world_time;
      delta_t = 1.0 / ( 30 * ( opt_physics_method ? 40 : 20 ) ) ;
      const int time_steps_needed = int( sim_time_needed / delta_t );
      const int time_steps = min(time_steps_needed,100);
      balloon.update_for_config();
      if ( opt_physics_method &&
           ( balloon.need_cpu_iteration || balloon.opt_cpu_interleave ) )
        balloon.time_step_cpu_once();
      if ( opt_physics_method ) balloon.time_step_gpu(time_steps);
      else balloon.time_step_cpu(time_steps);
      frame_timer.work_amt_set(time_steps);
      world_time += delta_t * time_steps;
    }

  // Rescue balloon if it is sinking into the abyss.
  //
  if ( balloon.centroid.y < -50 )
    {
      pVect rescue_vector = pCoor(0,12,-12) - balloon.centroid;
      balloon.translate(rescue_vector);
    }

  const pColor white(0xffffff);
  const pColor gray(0x303030);
  const pColor lsu_business_purple(0x7f5ca2);
  const pColor lsu_spirit_purple(0x580da6);
  const pColor lsu_spirit_gold(0xf9b237);
  const pColor lsu_official_purple(0x2f0462);
  const pColor dark(0);

  const int win_width = ogl_helper.get_width();
  const int win_height = ogl_helper.get_height();
  const float aspect = float(win_width) / win_height;

  glMatrixMode(GL_MODELVIEW);
  glLoadTransposeMatrixf(modelview);

  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
  // Frustum: left, right, bottom, top, near, far
  glFrustum(-.8,.8,-.8/aspect,.8/aspect,1,5000);

  glViewport(0, 0, win_width, win_height);
  pError_Check();

  glClearColor(0,0,0,0.5);
  glClearDepth(1.0);
  glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT );

  glEnable(GL_DEPTH_TEST);
  glDepthFunc(GL_LESS);
  glDisable(GL_BLEND);

  glLightModeli(GL_LIGHT_MODEL_TWO_SIDE,1);
  glLightfv(GL_LIGHT0, GL_POSITION, light_location);

  glLightf(GL_LIGHT0, GL_CONSTANT_ATTENUATION, 0.3);
  glLightf(GL_LIGHT0, GL_LINEAR_ATTENUATION, 1.0);
  glLightf(GL_LIGHT0, GL_QUADRATIC_ATTENUATION, 0);

  pColor ambient_color(0x999999);

  glLightModelfv(GL_LIGHT_MODEL_AMBIENT, ambient_color);
  glLightfv(GL_LIGHT0, GL_DIFFUSE, white * opt_light_intensity);
  glLightfv(GL_LIGHT0, GL_AMBIENT, dark);
  glLightfv(GL_LIGHT0, GL_SPECULAR, white * opt_light_intensity);

  glEnable(GL_LIGHT0);
  glEnable(GL_LIGHTING);

  glEnable(GL_COLOR_MATERIAL);
  glColorMaterial(GL_FRONT_AND_BACK,GL_AMBIENT_AND_DIFFUSE);

  glShadeModel(GL_SMOOTH);

  pColor color_ball(0x666666);
  pColor scolor_ball(0x111111);
  const float shininess_ball = 5;

  // Common to all textures.
  //
  glActiveTexture(GL_TEXTURE0);
  glEnable(GL_TEXTURE_2D);
  glTexParameterf(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,
                  GL_LINEAR_MIPMAP_LINEAR);
  glTexParameterf(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR);
  glTexEnvi(GL_TEXTURE_ENV,GL_TEXTURE_ENV_MODE,GL_MODULATE);
  glTexParameterf(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_REPEAT);
  glTexParameterf(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_REPEAT);

  glEnable(GL_RESCALE_NORMAL);
  glEnable(GL_NORMALIZE);

  ogl_helper.fbprintf("%s\n",frame_timer.frame_rate_text_get());

  ogl_helper.fbprintf
    ("Physics Computation: %s ('a' to change) "
     "  + %d cpu timestep / frame ('x' to change)\n",
     gpu_physics_method_str[opt_physics_method],
     balloon.opt_cpu_interleave);

  ogl_helper.fbprintf
    ("Eye location: [%5.1f, %5.1f, %5.1f]  "
     "Eye direction: [%+.2f, %+.2f, %+.2f]\n",
     eye_location.x, eye_location.y, eye_location.z,
     eye_direction.x, eye_direction.y, eye_direction.z);

  pCoor cent = balloon.centroid;
  pCoor vel = balloon.velocity_avg();

  ogl_helper.fbprintf
    ("Centroid  [%5.1f,%5.1f,%5.1f] Vel [%+5.1f,%+5.1f,%+5.1f]  "
     "Gas Amt %.2f  Volume  %.2f  Pressure %.2f\n",
     cent.x,cent.y,cent.z,
     vel.x,vel.y,vel.z,
     balloon.gas_amount,
     balloon.volume / balloon.nom_volume,
     balloon.pressure
     );

  ogl_helper.fbprintf
    ("Weight (Surf+Gas-Displ Air=W) (%6.2f + %6.2f - %6.2f = %6.2f)\n",
     balloon.opt_gravity_accel * balloon.surface_mass,
     balloon.opt_gravity_accel * balloon.volume * balloon.density_gas,
     balloon.opt_gravity_accel * balloon.volume * balloon.density_air,
     balloon.opt_gravity_accel * balloon.surface_mass
     + balloon.opt_gravity_accel * balloon.volume * balloon.density_gas
     - balloon.opt_gravity_accel * balloon.volume * balloon.density_air
     );

  ogl_helper.fbprintf("Oversample %3.1f\n", balloon.oversample);

  pVariable_Control_Elt* const cvar = variable_control.current;
  ogl_helper.fbprintf("VAR %s = %.5f  (TAB or '`' to change, +/- to adjust)\n",
                      cvar->name,cvar->var[0]);

  const int half_elements = platform_tile_coords.elements >> 3 << 2;
  const int vstride = sizeof(Balloon_Vertex);

  if ( opt_surface_smooth )
    {
      //
      // Render balloon reflection.  (Will be blended with dark tiles.)
      //

      // Write stencil at location of dark (mirrored) tiles.
      //
      glDisable(GL_LIGHTING);
      glEnable(GL_STENCIL_TEST);
      glStencilFunc(GL_NEVER,2,2);
      glStencilOp(GL_REPLACE,GL_KEEP,GL_KEEP);
      platform_tile_coords.bind();
      glVertexPointer(3, GL_FLOAT, sizeof(platform_tile_coords.data[0]), 0);
      glEnableClientState(GL_VERTEX_ARRAY);
      glDrawArrays(GL_QUADS,half_elements+4,half_elements-4);
      glEnable(GL_LIGHTING);

      // Prepare to write only stenciled locations.
      //
      glStencilFunc(GL_EQUAL,2,2);
      glStencilOp(GL_KEEP,GL_KEEP,GL_KEEP);

      // Use a transform that reflects objects to other size of platform.
      //
      glMatrixMode(GL_PROJECTION);
      glPushMatrix();
      glMultTransposeMatrixf(transform_mirror);

      // Reflected front face should still be treated as the front face.
      //
      glFrontFace(GL_CW);

      glColor3fv(color_ball);
      glMaterialfv(GL_FRONT_AND_BACK,GL_SPECULAR,scolor_ball);
      glMaterialf(GL_FRONT_AND_BACK,GL_SHININESS,shininess_ball);

      glBindTexture(GL_TEXTURE_2D,balloon.texid_pse);

      balloon.tex_coords.bind();
      glTexCoordPointer(2,GL_FLOAT,0,NULL);
      glEnableClientState(GL_TEXTURE_COORD_ARRAY);

      if ( balloon.data_bid )
        {
          glBindBuffer(GL_ARRAY_BUFFER,balloon.data_bid);
          glVertexPointer
            (3, GL_FLOAT, balloon.data_stride, (void*)( 2 * sizeof(pCoor) ));
          glNormalPointer(GL_FLOAT, balloon.data_stride, NULL );
        }
      else
        {
          glBindBuffer(GL_ARRAY_BUFFER,0);
          glVertexPointer(4, GL_FLOAT, vstride, &balloon.points[0].pos);
          glNormalPointer(GL_FLOAT, vstride, &balloon.points[0].surface_normal);
        }

      glEnableClientState(GL_VERTEX_ARRAY);
      glEnableClientState(GL_NORMAL_ARRAY);

      balloon.point_indices.bind(GL_ELEMENT_ARRAY_BUFFER);

      glDrawElements
        (GL_TRIANGLES,balloon.point_indices.elements,GL_UNSIGNED_INT, NULL);

      glDisableClientState(GL_NORMAL_ARRAY);
      glDisableClientState(GL_VERTEX_ARRAY);
      glDisableClientState(GL_TEXTURE_COORD_ARRAY);
      glBindBuffer(GL_ARRAY_BUFFER,0);

      glFrontFace(GL_CCW);
      glPopMatrix();
      glDisable(GL_STENCIL_TEST);
    }

  {
    //
    // Write framebuffer stencil with balloon's shadow.
    //

    // Use transform that maps vertices to platform surface.
    //
    glMatrixMode(GL_MODELVIEW);
    glPushMatrix();
    glLoadTransposeMatrixf(modelview_shadow);

    glDisable(GL_LIGHTING);
    glDisable(GL_TEXTURE_2D);

    glEnable(GL_STENCIL_TEST);
    glStencilFunc(GL_NEVER,1,-1); // ref, mask
    glStencilOp(GL_REPLACE,GL_KEEP,GL_KEEP);  // sfail, dfail, dpass

    if ( balloon.data_bid )
      {
        glBindBuffer(GL_ARRAY_BUFFER,balloon.data_bid);
        glVertexPointer
          (3, GL_FLOAT, balloon.data_stride, (void*)( 2 * sizeof(pCoor) ));
      }
    else
      glVertexPointer(4, GL_FLOAT, vstride, &balloon.points[0].pos);
    glEnableClientState(GL_VERTEX_ARRAY);
    balloon.point_indices.bind(GL_ELEMENT_ARRAY_BUFFER);
    glDrawElements
      (GL_TRIANGLES,balloon.point_indices.elements,GL_UNSIGNED_INT, NULL);
    glDisableClientState(GL_VERTEX_ARRAY);
    glBindBuffer(GL_ARRAY_BUFFER,0);

    glEnable(GL_LIGHTING);
    glDisable(GL_STENCIL_TEST);
    glPopMatrix();
  }

  // Setup texture for platform.
  //
  glBindTexture(GL_TEXTURE_2D,balloon.texid_syl);

  // Blend dark tiles with existing balloon reflection.
  //
  glEnable(GL_STENCIL_TEST);
  glBlendEquation(GL_FUNC_ADD);
  glBlendFunc(GL_CONSTANT_ALPHA,GL_ONE_MINUS_CONSTANT_ALPHA); // src, dst
  glBlendColor(0,0,0,0.5);

  glDepthFunc(GL_ALWAYS);
  glNormal3f(0,1,0);

  if ( opt_surface_smooth )
    {
      glEnable(GL_TEXTURE_2D);
      platform_tex_coords.bind();
      glTexCoordPointer(2, GL_FLOAT,2*sizeof(float), 0);
      glEnableClientState(GL_TEXTURE_COORD_ARRAY);
    }

  platform_tile_coords.bind();
  glVertexPointer
    (3, GL_FLOAT,sizeof(platform_tile_coords.data[0]), 0);
  glEnableClientState(GL_VERTEX_ARRAY);

  for ( int pass = 0;  pass < 2;  pass++ )
    {
      if ( pass == 0 )
        {
          // Prepare to write unshadowed parts of frame buffer.
          //
          glStencilFunc(GL_NOTEQUAL,1,1);
        }
      else
        {
          // Prepare to write shadowed parts of frame buffer.
          //
          glStencilFunc(GL_EQUAL,1,1);
          glLightf(GL_LIGHT0, GL_LINEAR_ATTENUATION, 6.0);
        }

      if ( opt_surface_smooth ) glEnable(GL_TEXTURE_2D);

      // Write lighter-colored, textured tiles.
      //
      glMaterialfv(GL_FRONT_AND_BACK,GL_SPECULAR,gray);
      glMaterialf(GL_FRONT_AND_BACK,GL_SHININESS,2.0);
      glColor3f(0.35,0.35,0.35);
      glDrawArrays(GL_QUADS,0,half_elements+4);

      // Write darker-colored, untextured, mirror tiles.
      //
      glEnable(GL_BLEND);
      glMaterialfv(GL_FRONT_AND_BACK,GL_SPECULAR,white);
      glMaterialf(GL_FRONT_AND_BACK,GL_SHININESS,20);
      glDisable(GL_TEXTURE_2D);
      glColor3fv(lsu_spirit_purple);
      glDrawArrays(GL_QUADS,half_elements+4,half_elements-4);
      glDisable(GL_BLEND);
    }

  glDisableClientState(GL_TEXTURE_COORD_ARRAY);
  glDisableClientState(GL_VERTEX_ARRAY);
  glBindBuffer(GL_ARRAY_BUFFER,0);

  glDepthFunc(GL_LESS);
  glDisable(GL_TEXTURE_2D);
  glDisable(GL_STENCIL_TEST);
  glLightf(GL_LIGHT0, GL_LINEAR_ATTENUATION, 1.0);
  glMaterialf(GL_BACK,GL_SHININESS,shininess_ball);

  //
  // Render Balloon
  //
  if ( opt_surface_smooth )
    {
      // With Textures

      const int vstride = sizeof(Balloon_Vertex);

      glEnable(GL_TEXTURE_2D);
      glBindTexture(GL_TEXTURE_2D,balloon.texid_pse);

      glColor3fv(color_ball);
      glMaterialfv(GL_BACK,GL_SPECULAR,scolor_ball);
      glColorMaterial(GL_BACK,GL_AMBIENT_AND_DIFFUSE);
      pColor color_red(0.9,0.2,0.2);
      glMaterialfv(GL_FRONT,GL_AMBIENT_AND_DIFFUSE,color_red);
      glMaterialfv(GL_FRONT,GL_SPECULAR,dark);

      balloon.tex_coords.bind();
      glTexCoordPointer(2,GL_FLOAT,0,NULL);
      glEnableClientState(GL_TEXTURE_COORD_ARRAY);

      if ( balloon.data_bid )
        {
          glBindBuffer(GL_ARRAY_BUFFER,balloon.data_bid);
          glVertexPointer
            (3, GL_FLOAT, balloon.data_stride, (void*)( 2 * sizeof(pCoor) ));
          glNormalPointer(GL_FLOAT, balloon.data_stride, NULL );
        }
      else
        {
          glBindBuffer(GL_ARRAY_BUFFER,0);
          glVertexPointer(4, GL_FLOAT, vstride, &balloon.points[0].pos);
          glNormalPointer(GL_FLOAT, vstride, &balloon.points[0].surface_normal);
        }

      glEnableClientState(GL_VERTEX_ARRAY);
      glEnableClientState(GL_NORMAL_ARRAY);
      balloon.point_indices.bind(GL_ELEMENT_ARRAY_BUFFER);

      glDrawElements
        (GL_TRIANGLES,balloon.point_indices.elements,GL_UNSIGNED_INT, NULL);

      glDisableClientState(GL_VERTEX_ARRAY);
      glDisableClientState(GL_NORMAL_ARRAY);
      glDisableClientState(GL_TEXTURE_COORD_ARRAY);
      glDisable(GL_TEXTURE_2D);
      glBindBuffer(GL_ARRAY_BUFFER,0);
    }
  else
    {
      // With Colored Stripes

      balloon.gpu_data_to_cpu();
      glMaterialfv(GL_BACK,GL_SPECULAR,scolor_ball);
      glBegin(GL_TRIANGLES);

      for ( int idx = 0;  idx < balloon.tri_count; idx++ )
        {
          Balloon_Triangle* const tri = &balloon.triangles[idx];
          Balloon_Vertex* const p = &balloon.points[tri->pi];
          Balloon_Vertex* const q = &balloon.points[tri->qi];
          Balloon_Vertex* const r = &balloon.points[tri->ri];

          glColor3fv(tri->color);
          glMaterialfv(GL_FRONT_AND_BACK,GL_SPECULAR,tri->color);

          const bool true_normal = false;
          if ( !true_normal )
            {
              pVect norm(q->pos,p->pos,r->pos);
              norm.normalize();
              glNormal3fv(norm);
            }

          if ( true_normal ) glNormal3fv(p->surface_normal);
          glVertex4fv(p->pos);
          if ( true_normal ) glNormal3fv(q->surface_normal);
          glVertex4fv(q->pos);
          if ( true_normal ) glNormal3fv(r->surface_normal);
          glVertex4fv(r->pos);

        }
      glEnd();

    }

  insert_tetrahedron(light_location,0.05);

  pError_Check();

  glColor3f(0,1,0); // This sets the text color. Don't know why.

  frame_timer.frame_end();

  glutSwapBuffers();
}


void
World::cb_keyboard()
{
  if ( !ogl_helper.keyboard_key ) return;
  pVect adjustment(0,0,0);
  pVect user_rot_axis(0,0,0);
  const float move_amt = 0.4;

  balloon.gpu_data_to_cpu();
  balloon.data_location = DL_CPU;
  balloon.data_bid = 0;
  balloon.cuda_constants_stale = true;

  switch ( ogl_helper.keyboard_key ) {
  case FB_KEY_LEFT: adjustment.x = -move_amt; break;
  case FB_KEY_RIGHT: adjustment.x = move_amt; break;
  case FB_KEY_PAGE_UP: adjustment.y = move_amt; break;
  case FB_KEY_PAGE_DOWN: adjustment.y = -move_amt; break;
  case FB_KEY_DOWN: adjustment.z = move_amt; break;
  case FB_KEY_UP: adjustment.z = -move_amt; break;
  case FB_KEY_DELETE: user_rot_axis.y = 1; break;
  case FB_KEY_INSERT: user_rot_axis.y =  -1; break;
  case FB_KEY_HOME: user_rot_axis.x = 1; break;
  case FB_KEY_END: user_rot_axis.x = -1; break;
  case 'A':
    if ( opt_physics_method )
      { opt_physics_method_last = opt_physics_method;
        opt_physics_method = GP_cpu; }
    else
      { opt_physics_method = opt_physics_method_last; }
    break;
  case 'a':
    opt_physics_method++;
    if ( opt_physics_method == GP_ENUM_SIZE ) opt_physics_method = 0;
    break;
  case 'b': opt_move_item = MI_Balloon; break;
  case 'B': opt_move_item = MI_Balloon_V; break;
  case 'd': case 'D': balloon.opt_damping = !balloon.opt_damping; break;
  case 'e': case 'E': opt_move_item = MI_Eye; break;
  case 'g': case 'G': balloon.opt_gravity = !balloon.opt_gravity; break;
  case 'l': case 'L': opt_move_item = MI_Light; break;
  case 'n': case 'N': opt_surface_smooth = !opt_surface_smooth; break;
  case 'p': case 'P': opt_pause = !opt_pause; break;
  case 'r': case 'R': balloon.length_relaxed_update = true; break;
  case 's': balloon.stop(); break;
  case 'S': balloon.freeze(); break;
  case 'x': balloon.opt_cpu_interleave = !balloon.opt_cpu_interleave; break;
  case 9: variable_control.switch_var_right(); break;
  case 96: variable_control.switch_var_left(); break; // `, until S-TAB works.
  case '-':case '_': variable_control.adjust_lower(); break;
  case '+':case '=': variable_control.adjust_higher(); break;
  default: printf("Unknown key, %d\n",ogl_helper.keyboard_key); break;
  }

  // Update eye_direction based on keyboard command.
  //
  if ( user_rot_axis.x || user_rot_axis.y )
    {
      pMatrix_Rotation rotall(eye_direction,pVect(0,0,-1));
      user_rot_axis *= invert(rotall);
      eye_direction *= pMatrix_Rotation(user_rot_axis, M_PI * 0.03);
      modelview_update();
    }

  // Update eye_location based on keyboard command.
  //
  if ( adjustment.x || adjustment.y || adjustment.z )
    {
      //  pMatrix_Rotation rotall(eye_direction,pVect(0,0,-1));
      const double angle =
        fabs(eye_direction.y) > 0.99
        ? 0 : atan2(eye_direction.x,-eye_direction.z);
      pMatrix_Rotation rotall(pVect(0,1,0),-angle);
      adjustment *= rotall;

      switch ( opt_move_item ){
      case MI_Balloon: balloon.translate(adjustment); break;
      case MI_Balloon_V: balloon.push(adjustment); break;
      case MI_Light: light_location += adjustment; break;
      case MI_Eye: eye_location += adjustment; break;
      default: break;
      }
      modelview_update();
    }
}


int
main(int argv, char **argc)
{
  pOpenGL_Helper popengl_helper(argv,argc);
  World world(popengl_helper);

  popengl_helper.rate_set(30);
  popengl_helper.display_cb_set(world.render_w,&world);
}