/// LSU EE 7700-1 (Sp 2009), Graphics Processors
//
 /// Balloon Simulation

// $Id:$

 /// Purpose
//
//   Demonstrate use of gpu for physics.


#include "balloon.cuh"

__constant__ CUDA_Tri_Strc* tri_strc;
__constant__ CUDA_Vtx_Strc* vtx_strc;
__constant__ CUDA_Tri_Data* tri_data;
__constant__ float* tower_volumes;
__constant__ float3* centroid_parts;

texture<float4> vtx_data_tex;
texture<float4> tri_data_tex;

__constant__ CUDA_Tri_Work_Strc* tri_work_strc;
__constant__ int tri_work_per_vtx;
__constant__ int tri_work_per_vtx_lg;

__constant__ float volume_cpu;
__constant__ int tri_count;
__constant__ int point_count;
__constant__ bool opt_gravity;

__constant__ float spring_constant;
__constant__ float damping_v;
__constant__ float pressure_factor_coeff;
__constant__ float gas_m_over_temp;

__constant__ float air_resistance;
__constant__ float gas_mass_per_vertex;
__constant__ float air_particle_mass;
__constant__ float gravity_mag;

__constant__ float delta_t;
__constant__ float rep_constant;
__constant__ float point_mass;
__constant__ float point_mass_inv;

__constant__ float platform_xmin;
__constant__ float platform_xmax;
__constant__ float platform_zmin;
__constant__ float platform_zmax;


__device__ int
div_p2_ceil(int num, int den_lg)
{
  const int quot = num >> den_lg;
  return quot << den_lg == num ? quot : quot + 1;
}

__device__ float3
vec_add(float3 a, float3 b){return make_float3(a.x+b.x,a.y+b.y,a.z+b.z);}
__device__ float3
vec_add3(float3 a, float3 b, float3 c) { return vec_add(a,vec_add(b,c)); }
__device__ void vec_addto(float3& a, float3 b)
{
  float3 sum = vec_add(a,b);
  a = sum;
}

__device__ float3
vec_sub(float3 a, float3 b){return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);}

__device__ float3
vec_scale(float s, float3 a) {return make_float3(s*a.x,s*a.y,s*a.z);}

__device__ float
dot(float3 a, float3 b){return a.x*b.x + a.y*b.y + a.z*b.z;}

__device__ float length(float3 a) {return sqrtf(dot(a,a));}

__device__ float3
normalize(float3 a) { return vec_scale(rsqrtf(dot(a,a)),a); }

__device__ float3
cross(float3 a, float3 b)
{
  return make_float3
    ( a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x );
}

__device__ float3
cross3(float3 a, float3 b, float3 c)
{
  float3 ab = vec_sub(a,b);
  float3 cb = vec_sub(c,b);
  return cross(ab,cb);
}

 ///
 /// Reduction Routine
 ///
 // Computes a sum of floats over all threads in the block.
 //
__device__ float
reduce(int block_lg, float *shared_array, float my_value, bool all)
{
  const int tid = threadIdx.x;
  const int block_lg_h = block_lg >> 1;
  const int block_lg_l = block_lg - block_lg_h;
  const int lower_size = 1 << block_lg_l;

  float vol_sum = shared_array[tid] = my_value;
  __syncthreads();

  // Round 1
  //
  if ( tid < lower_size )
    {
      // Note: CUDA is not good at unrolling loops or optimizing once
      // unrolled. For that matter, it's not good at scheduling
      // either. (CUDA 2.1)  That's why to loops below are hand unrolled.
      // Thankfully CUDA does optimize out the if statements.
      //
#define ITER(i) vol_sum += shared_array[ tid + (i << block_lg_l ) ]
#define ITER2(i) { ITER(i); ITER(i+1); }
#define ITER4(i) { ITER2(i); ITER2(i+2); }
#define ITER8(i) { ITER4(i); ITER4(i+4); }
      if ( block_lg_h >= 1 ) ITER(1);
      if ( block_lg_h >= 2 ) ITER2(2);
      if ( block_lg_h >= 3 ) ITER4(4);
      if ( block_lg_h >= 4 ) ITER8(8);
      if ( block_lg_h >= 5 ) { ITER8(16); ITER8(24); }
#undef ITER

      shared_array[tid] = vol_sum;
    }

  // Round 2
  //
  __syncthreads();

  if ( tid == 0 )
    {
#define ITER(i) vol_sum += shared_array[i];
      if ( block_lg_l >= 1 ) ITER(1);
      if ( block_lg_l >= 2 ) ITER2(2);
      if ( block_lg_l >= 3 ) ITER4(4);
      if ( block_lg_l >= 4 ) ITER8(8);
      if ( block_lg_l >= 5 ) { ITER8(16); ITER8(24); }
#undef ITER
    }

  if ( !all ) return vol_sum;
  if ( tid == 0 ) shared_array[0] = vol_sum;
  __syncthreads();
  return shared_array[0];
}


 ///
 /// Texture Access Convenience Functions
 ///

// These functions access data via the texture cache and place them in
// appropriate data structures. (Texture cache fetches can not return
// structures and cannot return vectors of length 3.)

__device__ float3 vtx_data_pos(int idx)
{
  const int idx_tex = idx * 3 + 2;
  const float4 pos4 = tex1Dfetch(vtx_data_tex, idx_tex);
  return make_float3(pos4.x,pos4.y,pos4.z);
}

__device__ float3 vtx_data_vel(int idx)
{
  const int idx_tex = idx * 3 + 1;
  const float4 pos4 = tex1Dfetch(vtx_data_tex, idx_tex);
  return make_float3(pos4.x,pos4.y,pos4.z);
}

__device__ float3 tri_data_surface_normal(int idx)
{
  const int idx_tex = idx * 3;
  const float4 sn = tex1Dfetch(tri_data_tex, idx_tex);
  return make_float3(sn.x,sn.y,sn.z);
}

__device__ float3 tri_data_force(int idx, int member)
{
  const int idx_tex_base = idx * 3;
  float4 el, eh;
  switch (member) {
  case 0:                       // force_p
    el = tex1Dfetch(tri_data_tex,idx_tex_base);
    eh = tex1Dfetch(tri_data_tex,idx_tex_base+1);
    return make_float3(el.w,eh.x,eh.y);
  case 1:                       // force_q
    el = tex1Dfetch(tri_data_tex,idx_tex_base+1);
    eh = tex1Dfetch(tri_data_tex,idx_tex_base+2);
    return make_float3(el.z,el.w,eh.x);
  case 2:                       // force_r
    el = tex1Dfetch(tri_data_tex,idx_tex_base+2);
    return make_float3(el.y,el.z,el.w);
  default:                      // Unreachable.
    return make_float3(0,0,0);
  }
}


 ///
 /// Compute Repulsion Force
 ///
//
// Used by one- and two-pass algorithms.
//
__device__ float3
repforce_compute(float3 p_pos, int po_idx)
{
  const float3 po_pos = vtx_data_pos(po_idx);
  const float3 p_to_q = vec_sub(p_pos,po_pos);
  const float mag_sq = dot(p_to_q,p_to_q);
  const float dist_sq_inv = rep_constant / max(0.001,mag_sq);
  const float3 p_to_q_n = normalize(p_to_q);
  return vec_scale(dist_sq_inv, p_to_q_n);
}


///
/// Two-Pass Code
///

__global__ void pass_triangles();
__global__ void pass_vertices(CUDA_Vtx_Data *vtx_data_out);


 __host__ void
pass_triangles_launch
 (dim3 dg, dim3 db, CUDA_Vtx_Data *vtx_data, size_t vtx_data_size)
 {
   size_t offset;
   cudaBindTexture(&offset, vtx_data_tex, vtx_data, vtx_data_size);
   pass_triangles<<<dg,db>>>();
 }

__global__ void
pass_triangles()
{
  const int tid = threadIdx.x;
  const int ti = blockIdx.x * blockDim.x + threadIdx.x;
  __shared__ float volumes[CUDA_TRI_BLOCK_SIZE];
  if ( ti >= tri_count ) volumes[tid] = 0;
  __syncthreads();
  if ( ti >= tri_count ) return;

  const CUDA_Tri_Strc ts = tri_strc[ti];

  const float3 ppos = vtx_data_pos(ts.pi);
  const float3 qpos = vtx_data_pos(ts.qi);
  const float3 rpos = vtx_data_pos(ts.ri);

  const float3 center = vec_scale(1.0/3.0, vec_add3(ppos,qpos,rpos));
  const float3 pqr_cross = cross3(qpos,ppos,rpos);

  const float triangle_area_x2 = length(pqr_cross);
  const float tower_volume_x2 = -pqr_cross.y * center.y;

  float3 force_p = repforce_compute(ppos,ts.pi_opp);
  float3 force_q = repforce_compute(qpos,ts.qi_opp);
  float3 force_r = repforce_compute(rpos,ts.ri_opp);

  const float3 p_to_c = vec_sub(center,ppos);
  const float3 q_to_c = vec_sub(center,qpos);
  const float3 r_to_c = vec_sub(center,rpos);

  const float perimeter = length(p_to_c) + length(q_to_c) + length(r_to_c);

  tri_data[ti].surface_normal = pqr_cross;

  const float length_relaxed = ts.length_relaxed;
  const float eff_length = max(0.0f, perimeter - length_relaxed );
  const float spring_force = eff_length * spring_constant;

  tri_data[ti].force_p = vec_add(force_p, vec_scale(spring_force, p_to_c));
  tri_data[ti].force_q = vec_add(force_q, vec_scale(spring_force, q_to_c));
  tri_data[ti].force_r = vec_add(force_r, vec_scale(spring_force, r_to_c));

  const float vol_sum =
    reduce(CUDA_TRI_BLOCK_LG,volumes,tower_volume_x2 * 0.5f,false);
  if ( threadIdx.x == 0 ) tower_volumes[ blockIdx.x ] = vol_sum;
}

__host__ void
pass_vertices_launch
(dim3 dg, dim3 db, CUDA_Tri_Data *tri_data, CUDA_Vtx_Data *vtx_out,
 size_t tri_data_size)
{
  size_t offset;
  cudaBindTexture(&offset, tri_data_tex, tri_data, tri_data_size);
  pass_vertices<<<dg,db>>>(vtx_out);
  cudaUnbindTexture(vtx_data_tex);
  cudaUnbindTexture(tri_data_tex);
}

__global__ void
pass_vertices(CUDA_Vtx_Data *vtx_data_out)
{
  const int tid = threadIdx.x;
  const int vi = blockIdx.x * blockDim.x + threadIdx.x;

  __shared__ float volumes[CUDA_VTX_BLOCK_SIZE];

  const int grid_dim_tri = div_p2_ceil(tri_count,CUDA_TRI_BLOCK_LG);
  const int vol_per_thread = div_p2_ceil(grid_dim_tri,CUDA_VTX_BLOCK_LG);
  const int start = tid * vol_per_thread;
  const int stop = min(grid_dim_tri, start + vol_per_thread);
  float my_vol = 0;
  for ( int i=start; i<stop; i++ ) my_vol += tower_volumes[i];
  const float volume = reduce(CUDA_VTX_BLOCK_LG,volumes,my_vol,true);

  const float friction_coefficient = .04;
  const float bounce_factor = 0.0;
  const float mass = 1.0;

  const float3 gravity = make_float3(0.0,-gravity_mag,0.0);

  const float3 pos = vtx_data_pos(vi);
  const float3 vel = vtx_data_vel(vi);
  const CUDA_Vtx_Strc vs = vtx_strc[vi];

  float3 force_spring = make_float3(0.0,0.0,0.0);
  float3 surface_normal_sum = make_float3(0.0,0.0,0.0);

#define TRI_BODY(i)                                                           \
  { const int idx_packed = vs.n##i;                                           \
    if ( idx_packed != -1 )                                                   \
      {                                                                       \
        const int idx_base = idx_packed >> 2;                                 \
        const int idx_force = idx_packed & 0x3;                               \
        const float3 surface_normal_t = tri_data_surface_normal(idx_base);    \
        vec_addto(surface_normal_sum, surface_normal_t);                      \
        const float3 force_x = tri_data_force(idx_base,idx_force);            \
        vec_addto(force_spring, force_x);                                     \
      }}

#if 1
  TRI_BODY(0); TRI_BODY(1); TRI_BODY(2); TRI_BODY(3);
  TRI_BODY(4); TRI_BODY(5); TRI_BODY(6); TRI_BODY(7);
#else
  for ( int i=0; i<VTX_TRI_DEG_MAX; i++ ) TRI_BODY(i);
#endif
#undef TRI_BODY

  float3 surface_normal = vec_scale(1./6., surface_normal_sum);

  float pressure_factor = pressure_factor_coeff / fabs(volume);

  float pressure =
    opt_gravity
    ? pressure_factor * exp( - gas_m_over_temp * pos.y )
    : pressure_factor;
  float air_pressure =
    opt_gravity ? exp( - 0.2f * air_particle_mass * pos.y ) : 1.0;

  float3 force_pressure = vec_scale(air_pressure - pressure, surface_normal);

  float3 force = force_pressure;

  float3 vel_norm = normalize(vel);
  float facing_area = max(0.0f,-dot(vel_norm,surface_normal));
  float3 force_ar = vec_scale( - air_resistance * facing_area, vel);

  float3 gforce = vec_scale(point_mass * mass, gravity);
  vec_addto(force, gforce);

  vec_addto(force, force_ar);

  float3 force_ns = force;

  vec_addto(force, force_spring);

  float mass_wgas_inv_dt =
    delta_t / ( point_mass * mass + gas_mass_per_vertex );

  float3 delta_vns = vec_scale(mass_wgas_inv_dt, force_ns);
  float3 delta_vs = vec_scale(mass_wgas_inv_dt, force_spring);
  float3 delta_v = vec_add(delta_vns, delta_vs);

  float3 pos_next =
    vec_add(pos, vec_scale(delta_t, vec_add( vel,  vec_scale(0.5f,delta_v) )));

  float3 vel_next = vec_add3(vel,  vec_scale(damping_v, delta_vs), delta_vns);

  const bool platform_aligned =
    pos_next.x >= platform_xmin && pos_next.x <= platform_xmax
    && pos_next.z >= platform_zmin && pos_next.z <= platform_zmax;
  const bool above_to_below =  pos_next.y <= 0.0f && pos.y >= 0.0f;

  if ( platform_aligned && above_to_below )
    {
      pos_next.y = 0.0;
      vel_next.y = - bounce_factor * vel_next.y;
      const float f_y =
        min(0.0, gforce.y + force_spring.y - pressure * surface_normal.y);
      const float friction_force = -f_y * friction_coefficient;
      const float delta_v = friction_force * delta_t / ( point_mass * mass );
      const float3 xzvel = make_float3(vel_next.x,0,vel_next.z);
      if ( length(xzvel) <= delta_v ) {
        vel_next.x = 0.0;  vel_next.z = 0.0;
      }
      else
        vec_addto(vel_next, vec_scale( -delta_v, normalize(xzvel) ));
    }

  vtx_data_out[vi].surface_normal = surface_normal;
  vtx_data_out[vi].vel = vel_next;
  vtx_data_out[vi].pos = pos_next;
}



///
/// One-Pass Code
///

struct CUDA_Tri_Shared {
  // Note: make sure GCD(16,sizeof(this)/4) = 1, e.g., not a power of 2!
  float3 center;
  float spring_force;
  float3 surface_normal;
};

__global__ void pass_unified
(CUDA_Vtx_Data *vtx_data_out, float *tv_in, float *tv_out);

__host__ void
pass_unified_launch
(dim3 dg, dim3 db,
 CUDA_Vtx_Data *vtx_data_in, CUDA_Vtx_Data *vtx_out,
 float *tv_in, float *tv_out,
 size_t tri_data_size, size_t vtx_data_size )
{
  size_t offset;
  cudaBindTexture(&offset, vtx_data_tex, vtx_data_in, vtx_data_size);
  pass_unified<<<dg,db>>>(vtx_out,tv_in,tv_out);
  cudaUnbindTexture(vtx_data_tex);
}

__global__ void
pass_unified
(CUDA_Vtx_Data *vtx_data_out, float *tower_volumes_in, float *tower_volumes_out)
{
  const int tid = threadIdx.x;
  const int vtx_bk_base = __mul24(blockIdx.x , blockDim.x);
  const int vi = vtx_bk_base + tid;
  const int work_idx_base = __mul24(vi , tri_work_per_vtx);
  const int block_lg = CUDA_VTX_BLOCK_LG;
  const int block_size = CUDA_VTX_BLOCK_SIZE;

  __shared__ float volumes[block_size], volumes_read[block_size];
  __shared__ CUDA_Tri_Shared tri_shared[block_size];

  // This routine computes information (area, surface normal, and a
  // force) for several triangles and up to one vertex. Each vertex
  // uses information from up to eight triangles. The triangles that a
  // vertex needs are computed somewhere in the same block, but not
  // necessarily in the same thread, so vertices get the triangle
  // information they need through shared memory.

  ///
  /// Triangle Round
  ///
  // Note that several triangles are processed by a thread.

  float local_volume_x2 = 0;
  float3 force_spring = make_float3(0,0,0);
  float3 surface_normal_sum = make_float3(0,0,0);

  const float3 pos =
    vi < point_count ? vtx_data_pos(vi) : make_float3(0,0,0);

  for ( int i=0; i<tri_work_per_vtx; i++ )
    {
      const CUDA_Tri_Work_Strc ts = tri_work_strc[work_idx_base+i];
      const int ts_pull_i =  ts.pull_i;

      /// Compute information for a triangle (if there is one).
      //
      if ( ts.pi != -1 )
        {
          const float3 ppos = vtx_data_pos(ts.pi);
          const float3 qpos = vtx_data_pos(ts.qi);
          const float3 rpos = vtx_data_pos(ts.ri);

          const float3 center = vec_scale(1.0/3.0, vec_add3(ppos,qpos,rpos));
          const float3 pqr_cross = cross3(qpos,ppos,rpos);
          const float tower_volume_x2 = -pqr_cross.y * center.y;

          const bool use_vol = ts_pull_i & 1;
          if ( use_vol ) local_volume_x2 += tower_volume_x2;

          const float3 p_to_c = vec_sub(center,ppos);
          const float3 q_to_c = vec_sub(center,qpos);
          const float3 r_to_c = vec_sub(center,rpos);
          const float perimeter = length(p_to_c)+length(q_to_c)+length(r_to_c);
          const float length_relaxed = ts.length_relaxed;
          const float eff_length = max(0.0f, perimeter - length_relaxed );
          const float spring_force = eff_length * spring_constant;

          tri_shared[tid].center = center;
          tri_shared[tid].surface_normal = pqr_cross;
          tri_shared[tid].spring_force = spring_force;
        }

      __syncthreads();

      const int pull_i = ts_pull_i >> 1;

      /// Accumulate sum of triangle information computed above.
      //
      // Note that loops are hand unrolled to overcome compiler
      // limitations.
      //
#define VTX_PULL(idx)                                                         \
      if ( idx < pull_i )                                                     \
        {                                                                     \
          const int tri_id = ts.pull_tid_##idx;                               \
          const float3 center = tri_shared[tri_id].center;                    \
          const float3 surface_normal_t = tri_shared[tri_id].surface_normal;  \
          vec_addto(surface_normal_sum,surface_normal_t);                     \
          vec_addto(force_spring,repforce_compute(pos,ts.vi_opp##idx));       \
          const float3 p_to_c = vec_sub(center,pos);                          \
          const float spring_force = tri_shared[tri_id].spring_force;         \
          vec_addto(force_spring, vec_scale(spring_force, p_to_c));           \
        }

      VTX_PULL(0); VTX_PULL(1); VTX_PULL(2); VTX_PULL(3);
    }

  /// Write Volumes
  //
  // Compute sum of volumes of each thread this block, then write to a
  // global array for use in next time step.
  //
  const float vol_sum = reduce(block_lg,volumes,local_volume_x2 * 0.5f,false);
  if ( tid == 0 ) tower_volumes_out[ blockIdx.x ] = vol_sum;

  /// Read Volumes
  //
  // Retrieve volumes of blocks written in the last time step and
  // compute their sum.
  //
  const int grid_dim_vol = gridDim.x;
  const int vol_per_thread = div_p2_ceil(gridDim.x,block_lg);
  const int start = tid * vol_per_thread;
  const int stop = min(grid_dim_vol, start + vol_per_thread);
  const int tid_limit = min(block_size, gridDim.x);
  float my_vol = 0;
  for ( int i=start; i<stop; i++ ) my_vol += tower_volumes_in[i];
  const float volume = reduce(block_lg,volumes_read,my_vol,true);

  ///
  /// Vertex Round
  ///
  // Compute new position and velocity.

  if ( vi >= point_count ) return;

  const float friction_coefficient = .04;
  const float bounce_factor = 0.0;
  const float mass = 1.0;

  const float3 gravity = make_float3(0.0,-gravity_mag,0.0);

  const float3 vel = vtx_data_vel(vi);

  const float3 surface_normal = vec_scale((1./6.), surface_normal_sum);

  float pressure_factor = pressure_factor_coeff / fabs(volume);

  float pressure =
    opt_gravity
    ? pressure_factor * exp( - gas_m_over_temp * pos.y )
    : pressure_factor;
  float air_pressure =
    opt_gravity ? exp( - 0.2f * air_particle_mass * pos.y ) : 1.0;

  float3 force_pressure = vec_scale(air_pressure - pressure, surface_normal);

  float3 force = force_pressure;

  float3 vel_norm = normalize(vel);
  float facing_area = max(0.0f,-dot(vel_norm,surface_normal));
  float3 force_ar = vec_scale( - air_resistance * facing_area, vel);

  float3 gforce = vec_scale(point_mass * mass, gravity);
  vec_addto(force, gforce);

  vec_addto(force, force_ar);

  float3 force_ns = force;

  vec_addto(force, force_spring);

  float mass_wgas_inv_dt =
    delta_t / ( point_mass * mass + gas_mass_per_vertex );

  float3 delta_vns = vec_scale(mass_wgas_inv_dt, force_ns);
  float3 delta_vs = vec_scale(mass_wgas_inv_dt, force_spring);
  float3 delta_v = vec_add(delta_vns, delta_vs);

  float3 pos_next =
    vec_add(pos, vec_scale(delta_t, vec_add( vel,  vec_scale(0.5,delta_v) )));

  float3 vel_next = vec_add3(vel,  vec_scale(damping_v, delta_vs), delta_vns);

  const bool platform_aligned =
    pos_next.x >= platform_xmin && pos_next.x <= platform_xmax
    && pos_next.z >= platform_zmin && pos_next.z <= platform_zmax;
  const bool above_to_below =  pos_next.y <= 0.0f && pos.y >= 0.0f;

  if ( platform_aligned && above_to_below )
    {
      pos_next.y = 0.0;
      vel_next.y = - bounce_factor * vel_next.y;
      const float f_y =
        min(0.0f, gforce.y + force_spring.y - pressure * surface_normal.y);
      const float friction_force = -f_y * friction_coefficient;
      const float delta_v = friction_force * delta_t / ( point_mass * mass );
      const float3 xzvel = make_float3(vel_next.x,0,vel_next.z);
      if ( length(xzvel) <= delta_v ) {
        vel_next.x = 0.0;  vel_next.z = 0.0;
      }
      else
        vec_addto(vel_next, vec_scale( -delta_v, normalize(xzvel) ));
    }

  vtx_data_out[vi].surface_normal = surface_normal;
  vtx_data_out[vi].vel = vel_next;
  vtx_data_out[vi].pos = pos_next;
}