/// LSU EE X70X-X (Fall 2012), GPU Programming // /// CUDA code for computing intersections and time-stepping physics model. // $Id:$ /// Purpose // // Demonstrate Several Graphical and Simulation Techniques. // This file contains GPU/cuda code. // See demo-x-collide.cc for main program. #include <gp/cuda-util-kernel.h> #include "k-main.cuh" /// /// Variables Read or Written By With Host Code /// /// Ball Information Structure // // This is in soa (structure of arrays) form, rather than // in the programmer-friendly aos (array of structure) form. // In soa form it is easier for multiple thread to read contiguous // blocks of data. // __constant__ CUDA_Ball_X balls_x; /// /// Ball Contact (tact) Pair Information /// /// Balls needed by block. // // This array identifies those balls that will be used by each block // during each contact pass. When a thread starts balls are placed in // shared memory, then contact between a pair of balls is tested for // and resolved. // __constant__ int *block_balls_needed; /// Shared memory array holding balls updated cooperating threads in a block. #undef USE_STRUCT #ifdef USE_STRUCT extern __shared__ CUDA_Phys_W sm_balls[]; #else extern __shared__ float3 sm_balls[]; __shared__ uchar4 sm_balls_misc[300]; #endif /// Pairs of Balls to Check // __constant__ SM_Idx2 *tacts_schedule; /// Box/Box Intersect // __constant__ XX_Pair *xx_pairs; __constant__ float4 *xx_sects_center; __constant__ float4 *xx_sects_dir; __constant__ float4 *xx_sects_debug; __constant__ float3 gravity_accel_dt; __constant__ float opt_bounce_loss, opt_bounce_loss_box; __constant__ float opt_friction_coeff, opt_friction_roll; __constant__ float opt_air_resistance; __constant__ bool opt_platform_curved; __constant__ float platform_xmin, platform_xmax; __constant__ float platform_zmin, platform_zmax; __constant__ float platform_xmid, platform_xrad; __constant__ float delta_t; __constant__ float elasticity_inv_dt; __constant__ bool opt_debug, opt_debug2; __constant__ CUDA_Wheel wheel; extern __shared__ float block_torque_dt[]; static __host__ void collect_symbols(); /// /// Useful Functions and Types /// typedef float3 pCoor; typedef float3 pVect; __device__ float3 make_float3(float4 f4){return make_float3(f4.x,f4.y,f4.z);} __device__ float3 m3(float4 a){ return make_float3(a); } __device__ float3 xyz(float4 a){ return m3(a); } __device__ float4 m4(float3 v, float w) { return make_float4(v.x,v.y,v.z,w); } __device__ pVect operator +(pVect a,pVect b) { return make_float3(a.x+b.x,a.y+b.y,a.z+b.z); } __device__ pVect operator -(pVect a,pVect b) { return make_float3(a.x-b.x,a.y-b.y,a.z-b.z); } __device__ pVect operator -(float4 a,float4 b) { return make_float3(a.x-b.x,a.y-b.y,a.z-b.z); } __device__ pVect operator -(pCoor a,float4 b) { return make_float3(a.x-b.x,a.y-b.y,a.z-b.z); } __device__ pVect operator *(float s, pVect v) {return make_float3(s*v.x,s*v.y,s*v.z);} __device__ float4 operator *(float s, float4 v) {return make_float4(s*v.x,s*v.y,s*v.z,s*v.w);} __device__ pVect operator *(pVect u, pVect v) {return make_float3(u.x*v.x,u.y*v.y,u.z*v.z);} __device__ pVect operator -(pVect v) { return make_float3(-v.x,-v.y,-v.z); } __device__ float3 operator -=(float3& a, pVect b) {a = a - b; return a;} __device__ float3 operator +=(float3& a, pVect b) {a = a + b; return a;} struct pNorm { pVect v; float mag_sq, magnitude; }; __device__ pVect operator *(float s, pNorm n) { return s * n.v;} // Make a Coordinate __device__ pCoor mc(float x, float y, float z){ return make_float3(x,y,z); } __device__ pCoor mc(float4 c){ return make_float3(c.x,c.y,c.z); } __device__ void set_f3(float3& a, float4 b){a.x = b.x; a.y = b.y; a.z = b.z;} __device__ void set_f4(float4& a, float3 b) {a.x = b.x; a.y = b.y; a.z = b.z; a.w = 1;} __device__ void set_f4(float4& a, float3 b, float c) {a.x = b.x; a.y = b.y; a.z = b.z; a.w = c;} // Make a Vector __device__ pVect mv(float x, float y, float z){ return make_float3(x,y,z); } __device__ pVect mv(float3 a, float3 b) { return b-a; } __device__ pVect mv(float a) { return make_float3(a,a,a); } __device__ float dot(float4 a, float4 b) { return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;} __device__ float dot(pVect a, pVect b){ return a.x*b.x + a.y*b.y + a.z*b.z;} __device__ float dot(pVect a, pNorm b){ return dot(a,b.v); } __device__ float dot(pNorm a, pVect b){ return dot(a.v,b); } __device__ float dot3(float4 a, float4 b){ return dot(m3(a),m3(b)); } __device__ float mag_sq(pVect v){ return dot(v,v); } __device__ float length(pVect a) {return sqrtf(mag_sq(a));} __device__ pVect normalize(pVect a) { return rsqrtf(mag_sq(a))*a; } // Make a Normal (a structure containing a normalized vector and length) __device__ pNorm mn(pVect v) { pNorm n; n.mag_sq = mag_sq(v); if ( n.mag_sq == 0 ) { n.magnitude = 0; n.v.x = n.v.y = n.v.z = 0; } else { n.magnitude = sqrtf(n.mag_sq); n.v = (1.0f/n.magnitude) * v; } return n; } __device__ pNorm mn(float4 a, float4 b) {return mn(b-a);} __device__ pNorm mn(pCoor a, pCoor b) {return mn(b-a);} __device__ pNorm mn(float x, float y, float z) {return mn(mv(x,y,z));} __device__ pNorm mn(float4 v4) { pNorm n; n.v = m3(v4); n.magnitude = v4.w; return n; } __device__ pNorm mn(float3 v3, float mag) { pNorm n; n.v = v3; n.magnitude = mag; return n; } // The unary - operator doesn't seem to work when used in an argument. __device__ pNorm operator -(pNorm n) { pNorm m; m.magnitude = n.magnitude; m.mag_sq = n.mag_sq; m.v = -n.v; return m; } struct pQuat { float w; pVect v; }; // Make Quaternion __device__ float4 mq(pNorm axis, float angle) { return m4( __sinf(angle/2) * axis.v, __cosf(angle/2) ); } __device__ float4 quat_normalize(float4 q) { float len_sq = dot(q,q); float norm_factor = 1.0f / sqrtf(len_sq); return norm_factor * q; } // Make float4 __device__ float4 m4(pQuat q){ return make_float4(q.v.x,q.v.y,q.v.z,q.w); } __device__ float4 m4(pNorm v, float w) { return m4(v.v,w); } __device__ pVect fabs(pVect v){ return mv(fabs(v.x),fabs(v.y),fabs(v.z)); } __device__ float min(pVect v){ return min(min(v.x,v.y),v.z); } __device__ float max(pVect v){ return max(max(v.x,v.y),v.z); } __device__ float sum(pVect v){ return v.x+v.y+v.z; } // Cross Product of Two Vectors __device__ float3 cross(float3 a, float3 b) { return make_float3 ( a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x ); } __device__ pVect cross(pVect a, pNorm b){ return cross(a,b.v); } __device__ pVect cross(pNorm a, pVect b){ return cross(a.v,b); } __device__ pVect crossf3(float4 a, float4 b) { return cross(m3(a),m3(b)); } // Cross Product of Vectors Between Coordinates __device__ float3 cross3(float3 a, float3 b, float3 c) { float3 ab = a - b; float3 cb = c - b; return cross(ab,cb); } __device__ pVect cross3(pVect a, pVect b, pNorm c) { return cross3(a,b,c.v); } __device__ float4 quat_mult(float4 a, float4 b) { float w = a.w * b.w - dot3(a,b); float3 v = a.w * m3(b) + b.w * m3(a) + crossf3(a,b); return make_float4(v.x,v.y,v.z,w); }; __device__ void pMatrix_set_rotation(pcMatrix3x3& m, pVect u, float theta) { const float cos_theta = __cosf(theta); const float sin_theta = sqrtf(1.0f - cos_theta * cos_theta ); m.r0.x = u.x * u.x + cos_theta * ( 1 - u.x * u.x ); m.r0.y = u.x * u.y * ( 1 - cos_theta ) - u.z * sin_theta; m.r0.z = u.z * u.x * ( 1 - cos_theta ) + u.y * sin_theta; m.r1.x = u.x * u.y * ( 1 - cos_theta ) + u.z * sin_theta; m.r1.y = u.y * u.y + cos_theta * ( 1 - u.y * u.y ); m.r1.z = u.y * u.z * ( 1 - cos_theta ) - u.x * sin_theta; m.r2.x = u.z * u.x * ( 1 - cos_theta ) - u.y * sin_theta; m.r2.y = u.y * u.z * ( 1 - cos_theta ) + u.x * sin_theta; m.r2.z = u.z * u.z + cos_theta * ( 1 - u.z * u.z ); } __device__ float3 operator *(pcMatrix3x3 m, float3 coor) { return make_float3(dot(m.r0,coor), dot(m.r1,coor), dot(m.r2,coor)); } // /// Ball Physics Functions // // See demo-x-collide.cc for details. __device__ pVect point_rot_vel(float3 omega, float r, pNorm direction) { /// Return velocity of point on surface of sphere of radius r. // return r * cross( omega, direction ); } __device__ float get_fdt_to_do(float r, float mass_inv) { return 2.5f * mass_inv / r; } __device__ float3 tan_force_dt (pNorm tact_dir, float3 force_dt, float fdt_to_do) { /// Change rotation rate due to force_dt at tact_dir in direction force_dir. // return cross(tact_dir, fdt_to_do * force_dt ); } /// /// Major Ball Physics Routines /// // A time step is computed using two kernels, pass_pairs and // pass_platform. The pass_pairs kernel, which might be launched // several times, handles collisions between balls. The pass_platform // kernel handles collision between balls and the platform, and also // updates position and orientation, and spins the wheel. __device__ bool tile_ball_collide (CUDA_Tile_W& tile, CUDA_Ball_W& ball, pCoor& tact_pos, pVect& tact_dir) { // If tile in contact with ball return true and write contact // point on tile to tact_pos and ball-center-to-tact-pos direction // to tact_dir. pVect tile_to_ball = mv(tile.pt_ll,ball.position); // Distance from tile's plane to the ball. const float dist = dot(tile_to_ball,tile.normal); const float radius = ball.radius; if ( fabs(dist) > radius ) return false; // The closest point on tile plane to the ball. pCoor pt_closest = ball.position - dist * tile.normal; // How far up the tile in the y direction the center of the ball sits const float dist_ht = dot(tile.norm_up,tile_to_ball); if ( dist_ht < -radius ) return false; if ( dist_ht > tile.height + radius ) return false; // How far up the tile in the x direction the center of the ball sits const float dist_wd = dot(tile.norm_rt,tile_to_ball); if ( dist_wd < -radius ) return false; if ( dist_wd > tile.width + radius ) return false; // If ball touching tile surface (not including an edge or corner) // then set up the pseudo ball for collision handling if ( dist_ht >= 0 && dist_ht <= tile.height && dist_wd >= 0 && dist_wd <= tile.width ) { tact_pos = pt_closest; tact_dir = dist > 0 ? -tile.normal : tile.normal; return true; } float3 pt_lr = tile.pt_ll + tile.width * tile.norm_rt; float3 pt_ul = tile.pt_ll + tile.height * tile.norm_up; float3 pt_ur = pt_lr + tile.height * tile.norm_up; // Test whether the ball is touching a corner if ( ( dist_ht < 0 || dist_ht > tile.height ) && ( dist_wd < 0 || dist_wd > tile.width) ) { pCoor ref_pt; // We need to place the pseudo ball based upon the vector from // ball position to the corner. First step is to figure out which // corner. if ( dist_ht < 0 && dist_wd < 0 ) { ref_pt = tile.pt_ll; } else if ( dist_ht < 0 && dist_wd > tile.width ) { ref_pt = pt_lr; } else if ( dist_ht > tile.height && dist_wd < 0 ) { ref_pt = pt_ul; } else { ref_pt = pt_ur; } tact_pos = ref_pt; tact_dir = normalize(mv(ball.position,ref_pt)); return true; } // Else the ball is touching an edge const bool tact_horiz = dist_ht < 0 || dist_ht > tile.height; const pVect corner_to_tact = tact_horiz ? dist_wd * tile.norm_rt : dist_ht * tile.norm_up; const pCoor ref_pt = tact_horiz ? ( dist_ht < 0 ? tile.pt_ll : pt_ul ) : ( dist_wd < 0 ? tile.pt_ll : pt_lr ); // Find the closest edge point of the tile to the ball tact_pos = ref_pt + corner_to_tact; tact_dir = normalize(mv(ball.position,tact_pos)); return true; } __device__ void wheel_collect_tile_force(CUDA_Tile_W& tile, pCoor tact, pVect delta_mo) { pVect to_center = mv(wheel.center,tact); // Formula below needs to be checked. const float torque_dt = dot(wheel.axis_dir,cross(to_center,delta_mo)); tile.torque += torque_dt; } /// /// Collision (Penetration) Detection and Resolution Routines /// // Used in both passes. __device__ bool penetration_balls_resolve (CUDA_Ball_W& ball1, CUDA_Ball_W& ball2, bool b2_real, Force_Types ft) { /// Update velocity and angular momentum for a pair of balls in contact. // Later, separate friction and other forces. if ( ft == FT_Friction ) return false; pVect zero_vec = mv(0,0,0); pNorm dist = mn(ball1.position,ball2.position); float3 v1 = ball1.velocity; float3 v2 = ball2.velocity; float3 omega1 = ball1.omega; float3 omega2 = ball2.omega; const float mass_inv1 = ball1.mass_inv; const float mass_inv2 = ball2.mass_inv; const float r1 = ball1.radius; const float r2 = ball2.radius; const float radii_sum = r1 + r2; if ( dist.magnitude >= radii_sum ) return false; /// WARNING: This doesn't work: somefunc(-dist); pNorm ndist = -dist; // Compute relative (approach) velocity. // pVect prev_appr_vel = ball1.prev_velocity - ball2.prev_velocity; const float prev_approach_speed = dot( prev_appr_vel, dist ); const float loss_factor = 1 - opt_bounce_loss; // Compute change in speed based on how close balls touching, ignoring // energy loss. // const float appr_force_dt_no_loss = ( radii_sum - dist.magnitude ) * ( radii_sum - dist.magnitude ) * elasticity_inv_dt; // Change in speed accounting for energy loss. Only applied when // balls separating. // const float appr_force_dt = prev_approach_speed > 0 ? appr_force_dt_no_loss : loss_factor * appr_force_dt_no_loss; const float appr_deltas_1 = appr_force_dt * mass_inv1; /// Update Linear Velocity // v1 -= appr_deltas_1 * dist; if ( b2_real ) v2 += appr_force_dt * mass_inv2 * dist; const float fdt_to_do_1 = get_fdt_to_do(r1,mass_inv1); const float fdt_to_do_2 = get_fdt_to_do(r2,mass_inv2); // Find speed on surface of balls at point of contact. // pVect tact1_rot_vel = point_rot_vel(omega1,r1,dist); pVect tact2_rot_vel = point_rot_vel(omega2,r2,ndist); // Find relative velocity of surfaces at point of contact // in the plane formed by their surfaces. // pVect tan_vel = prev_appr_vel - prev_approach_speed * dist; pNorm tact_vel_dir = mn(tact1_rot_vel - tact2_rot_vel + tan_vel); // Find change in velocity due to friction. // const float fric_force_dt_potential = appr_force_dt_no_loss * opt_friction_coeff; const float mass_inv_sum = b2_real ? mass_inv1 + mass_inv2 : mass_inv1; const float force_dt_limit = tact_vel_dir.magnitude / ( 3.5f * mass_inv_sum ); // If true, surfaces are not sliding or will stop sliding after // frictional forces applied. (If a ball surface isn't sliding // against another surface than it must be rolling.) // const bool will_roll = force_dt_limit <= fric_force_dt_potential; const float sliding_fric_force_dt = will_roll ? force_dt_limit : fric_force_dt_potential; const float dv_tolerance = 0.000001f; const float sliding_fric_dv_1 = sliding_fric_force_dt * mass_inv1; const float3 sliding_fric_fdt_vec = sliding_fric_force_dt * tact_vel_dir; if ( sliding_fric_dv_1 > dv_tolerance ) { // Apply tangential force (resulting in angular momentum change) and // linear force (resulting in velocity change). // omega1 += tan_force_dt(dist, sliding_fric_fdt_vec, -fdt_to_do_1); v1 -= sliding_fric_dv_1 * tact_vel_dir; } const float sliding_fric_dv_2 = sliding_fric_force_dt * mass_inv2; if ( b2_real && sliding_fric_dv_2 > dv_tolerance ) { // Apply frictional forces for ball 2. // omega2 += tan_force_dt(ndist, sliding_fric_fdt_vec, fdt_to_do_2); v2 += sliding_fric_dv_2 * tact_vel_dir;; } { /// Torque // // // Account for forces of surfaces twisting against each // other. (For example, if one ball is spinning on top of // another.) // const float appr_omega = dot(omega2,dist) - dot(omega1,dist); const float fdt_to_do_sum = b2_real ? fdt_to_do_1 + fdt_to_do_2 : fdt_to_do_1; const float fdt_limit = fabs(appr_omega) / fdt_to_do_sum; const bool rev = appr_omega < 0; const float fdt_raw = min(fdt_limit,fric_force_dt_potential); const pVect fdt_v = ( rev ? -fdt_raw : fdt_raw ) * dist; omega1 += fdt_to_do_1 * fdt_v; if ( b2_real ) omega2 -= fdt_to_do_2 * fdt_v; } ball1.velocity = v1; ball1.omega = omega1; if ( !b2_real ) return true; ball2.velocity = v2; ball2.omega = omega2; return true; { /// Rolling Friction // // The rolling friction model used here is ad-hoc. pVect tan_b12_vel = b2_real ? 0.5f * tan_vel : zero_vec; const float torque_limit_sort_of = appr_force_dt_no_loss * sqrt( radii_sum - dist.mag_sq / radii_sum ); // * sqrt( ball1.radius - 0.25 * dist.mag_sq * r_inv ); pVect tact1_rot_vel = point_rot_vel(omega1,r1,dist); pVect tact1_roll_vel = tact1_rot_vel + tan_b12_vel; pNorm tact1_roll_vel_dir = mn(tact1_roll_vel); pVect lost_vel = zero_vec; const float rfric_loss_dv_1 = torque_limit_sort_of * 2.5f * mass_inv1 * ( tact1_roll_vel_dir.magnitude * opt_friction_roll / ( 1 + tact1_roll_vel_dir.magnitude * opt_friction_roll ) ); pVect lost_vel1 = min(tact1_roll_vel_dir.magnitude, rfric_loss_dv_1) * tact1_roll_vel_dir; lost_vel = -lost_vel1; if ( b2_real ) { pVect tact2_rot_vel = point_rot_vel(omega2,r2,ndist); pVect tact2_roll_vel = tact2_rot_vel - tan_b12_vel; pNorm tact2_roll_vel_dir = mn(tact2_roll_vel); const float rfric_loss_dv_2 = torque_limit_sort_of * 2.5f * mass_inv2 * ( tact2_roll_vel_dir.magnitude * opt_friction_roll / ( 1 + tact2_roll_vel_dir.magnitude * opt_friction_roll ) ); pVect lost_vel2 = min(tact2_roll_vel_dir.magnitude, rfric_loss_dv_2 ) * tact2_roll_vel_dir; lost_vel += lost_vel2; } omega1 += tan_force_dt(dist, 0.4f / mass_inv1 * lost_vel, fdt_to_do_1); if ( b2_real ) omega2 += tan_force_dt(dist, 0.4f / mass_inv2 * lost_vel, fdt_to_do_2); } return true; } // // Generic operations used by box code. // __device__ float3 sign_mask(int idx, float3 v) { return make_float3 (idx & 4 ? v.x : -v.x, idx & 2 ? v.y : -v.y, idx & 1 ? v.z : -v.z ); } // Multiply transpose of matrix m by column vector v. __device__ float3 mm_transpose(pcMatrix3x3 m, float3 v) { return v.x * m.r0 + v.y * m.r1 + v.z * m.r2; } __device__ float set_min(float &a, float b) { if ( b < a ) a = b; return a; } __device__ float set_max(float &a, float b) { if ( b > a ) a = b; return a; } // Set matrix m to a rotation matrix based on quaternion q. __device__ void pMatrix_set_rotation(pcMatrix3x3& m, float4 q) { m.r0.x = 1.f - 2.f * q.y * q.y - 2.f * q.z * q.z; m.r0.y = 2.f * q.x * q.y - 2.f * q.w * q.z; m.r0.z = 2.f * q.x * q.z + 2.f * q.w * q.y; m.r1.x = 2.f * q.x * q.y + 2.f * q.w * q.z; m.r1.y = 1.f - 2.f * q.x * q.x - 2.f * q.z * q.z; m.r1.z = 2.f * q.y * q.z - 2.f * q.w * q.x; m.r2.x = 2.f * q.x * q.z - 2.f * q.w * q.y; m.r2.y = 2.f * q.y * q.z + 2.f * q.w * q.x; m.r2.z = 1.f - 2.f * q.x * q.x - 2.f * q.y * q.y; } // Set transpose of matrix m to a rotation matrix based on quaternion q. __device__ void pMatrix_set_rotation_transpose(pcMatrix3x3& m, float4 q) { m.r0.x = 1.f - 2.f * q.y * q.y - 2.f * q.z * q.z; m.r1.x = 2.f * q.x * q.y - 2.f * q.w * q.z; m.r2.x = 2.f * q.x * q.z + 2.f * q.w * q.y; m.r0.y = 2.f * q.x * q.y + 2.f * q.w * q.z; m.r1.y = 1.f - 2.f * q.x * q.x - 2.f * q.z * q.z; m.r2.y = 2.f * q.y * q.z - 2.f * q.w * q.x; m.r0.z = 2.f * q.x * q.z - 2.f * q.w * q.y; m.r1.z = 2.f * q.y * q.z + 2.f * q.w * q.x; m.r2.z = 1.f - 2.f * q.x * q.x - 2.f * q.y * q.y; } // // Box operations. // struct pLine { __device__ pLine() {}; __device__ pLine(pCoor s, pVect d, float l):start(s),dir(d),len(l){}; pCoor start; pVect dir; float len; }; __device__ int8_t get_edge_vtx_idx(int edge) { // Index: xyz (z is LSB). #if 1 const int axis = edge >> 2; const int mask = 0xc >> axis; const int face_vtx = edge & 3; const int box_vtx_check = ( face_vtx & mask ) + face_vtx; return box_vtx_check; #else static const int8_t bi[12] = { 0, 1, 2, 3, 0, 1, 4, 5, 0, 2, 4, 6 }; return bi[edge]; #endif } __device__ float3 box_get_vertices(CUDA_Box_W& box, int vertex) { return box.position + mm_transpose(box.rot_inv,sign_mask(vertex,box.to_111)); } __device__ float3 box_get_axis_norm(CUDA_Box_W& box, int axis) { return axis == 0 ? box.rot_inv.r0 : axis == 1 ? box.rot_inv.r1 : box.rot_inv.r2; } __device__ float3 box_get_face_norm(CUDA_Box_W& box, int face) { pVect norm_raw = box_get_axis_norm(box,face>>1); return face & 1 ? norm_raw : -norm_raw; } __device__ float box_get_axis_len(CUDA_Box_W& box, int axis) { return 2.0f * ( axis == 0 ? box.to_111.x : axis == 1 ? box.to_111.y : box.to_111.z ); } __device__ float box_get_axis_area(CUDA_Box_W& box, int d) { return 4 * ( d == 0 ? box.to_111.x * box.to_111.y : d == 1 ? box.to_111.z * box.to_111.x : box.to_111.y * box.to_111.z ); } __device__ pLine box_get_edge(CUDA_Box_W& box, int edge) { const int axis = edge >> 2; const int8_t box_vtx = get_edge_vtx_idx(edge); return pLine(box_get_vertices(box,box_vtx), box_get_axis_norm(box,axis), box_get_axis_len(box,axis)); } __device__ void box_set_mi_vec(CUDA_Box_W& box,float3 to_111) { pVect dsq = to_111 * to_111; float dsqs = dsq.x + dsq.y + dsq.z; float mass_factor = 1.0f / ( box.mass_inv * 3.0f ); box.mi_vec = mass_factor * ( mv(dsqs) - dsq ); } __device__ void box_set_mi_vec(CUDA_Box_W& box) { box_set_mi_vec(box,box.to_111); } __device__ float box_get_moment_of_inertia_inv(CUDA_Box_W& box, pNorm axis); __device__ float3 box_get_vel(CUDA_Box_W&box, float3 pos) { pVect cent_to_pt = mv(box.position,pos); pVect rot_vel = cross(box.omega,cent_to_pt); return rot_vel + box.velocity; } __device__ void box_geometry_update(CUDA_Box_W& box) { pMatrix_set_rotation_transpose(box.rot_inv, box.orientation); box_set_mi_vec(box); } __device__ void box_apply_force_dt(CUDA_Box_W& box, float3 tact, float3 force) { if ( box.mass_inv == 0 ) return; box.velocity += box.mass_inv * force; pVect cent_to_tact = mv(box.position,tact); pVect torque = cross(cent_to_tact,force); pNorm torqueN = mn(torque); float mi_inv = box_get_moment_of_inertia_inv(box,torqueN); box.omega += mi_inv * torque; } __device__ float box_get_moment_of_inertia_inv(CUDA_Box_W& box, pNorm axis) { if ( axis.mag_sq < 1e-11f || box.mass_inv == 0 ) return 0; pVect tl = box.rot_inv * axis.v; pVect tls = tl * tl; float mi = dot(tls,box.mi_vec); return 1.0f / mi; } __device__ float box_get_moment_of_inertia_inv(CUDA_Box_W& box, float3 tact, pNorm dir) { pVect cent_to_tact = mv(box.position,tact); pNorm torque_axis = mn(cross(cent_to_tact,dir)); return box_get_moment_of_inertia_inv(box,torque_axis); } __device__ void box_apply_force_fric_dt (CUDA_Box_W& box, float3 tact, pNorm force_dir, float force_mag_dt) { box_apply_force_dt(box,tact,force_mag_dt*force_dir); } __device__ CUDA_SectTT sect_init() { CUDA_SectTT sect; sect.exists = false; return sect; } #include "k-boxes.h" /// /// Pass Box/Box Intersect /// __global__ void pass_xx_intersect(int xx_pairs_count); __host__ void pass_xx_intersect_launch(dim3 dg, dim3 db, int xx_pairs_count) { const int shared_amt = 0; pass_xx_intersect<<<dg,db,shared_amt>>>(xx_pairs_count); } __device__ void penetration_boxes_resolve_force (CUDA_Box_W& box1, CUDA_Box_W& box2, float3 pos, pNorm sep_normal) { const float pen_dist = 0.1f * sep_normal.magnitude; pVect vel1 = box_get_vel(box1,pos); pVect vel2 = box_get_vel(box2,pos); pVect velto1 = vel2 - vel1; const float sep_vel = dot(velto1,sep_normal.v); const float loss_factor = 1 - opt_bounce_loss_box; const float force_dt_no_loss = elasticity_inv_dt * pen_dist; const bool separating = sep_vel >= 0; const float appr_force_dt = separating ? force_dt_no_loss * loss_factor : force_dt_no_loss; pVect sep_force = appr_force_dt * sep_normal.v; box_apply_force_dt(box1, pos, -sep_force ); box_apply_force_dt(box2, pos, sep_force ); } __device__ void penetration_boxes_resolve_fric (CUDA_Box_W& box1, CUDA_Box_W& box2, float3 pos, pNorm sep_normal) { const float pen_dist = 0.1f * sep_normal.magnitude; const float force_dt_no_loss = elasticity_inv_dt * pen_dist; const float fric_force_dt_potential = force_dt_no_loss * opt_friction_coeff; /// Torque // // // Account for forces of surfaces twisting against each // other. (For example, if one box is spinning on top of // another.) // const float appr_omega = dot(box2.omega,sep_normal) - dot(box1.omega,sep_normal); { const float mi1_inv = box_get_moment_of_inertia_inv(box1,sep_normal); const float mi2_inv = box_get_moment_of_inertia_inv(box2,sep_normal); const float fdt_limit = fabs(appr_omega) / ( mi1_inv + mi2_inv ); const bool rev = appr_omega < 0; const float fdt_raw = min(fdt_limit,fric_force_dt_potential); const pVect fdt_v = ( rev ? -fdt_raw : fdt_raw ) * sep_normal; box1.omega += mi1_inv * fdt_v; box2.omega -= mi2_inv * fdt_v; } pVect vel1b = box_get_vel(box1,pos); pVect vel2b = box_get_vel(box2,pos); pVect velto1b = vel2b - vel1b; const float sep_velb = dot(velto1b,sep_normal); pNorm tan_vel = mn(velto1b - sep_velb * sep_normal); const float fdt_limit = 0.5f * tan_vel.magnitude / ( box1.mass_inv + box2.mass_inv + box_get_moment_of_inertia_inv(box1,pos,tan_vel) + box_get_moment_of_inertia_inv(box2,pos,tan_vel) ); const float fric_force_dt = min(fdt_limit,fric_force_dt_potential); box_apply_force_fric_dt(box1,pos, tan_vel, fric_force_dt); box_apply_force_fric_dt(box2,pos, -tan_vel, fric_force_dt); } __device__ bool penetration_boxes_resolve (CUDA_Phys_W& phys1, CUDA_Phys_W& phys2, int tsidx, Force_Types ft) { /// Update velocity and angular momentum for a pair of boxes in contact. CUDA_Box_W& box1 = phys1.box; CUDA_Box_W& box2 = phys2.box; float4 dir_and_mag = xx_sects_dir[tsidx]; if ( dir_and_mag.w == 0 ) return false; float4 center_and_um = xx_sects_center[tsidx]; float3 center = m3(center_and_um); pNorm sep_normal = mn(dir_and_mag); if ( ft & FT_NonFriction ) penetration_boxes_resolve_force(box1,box2,center,sep_normal); if ( ft & FT_Friction ) penetration_boxes_resolve_fric(box1,box2,center,sep_normal); return true; } /// /// Pairs Pass /// // // Resolve ball collisions with each other. __global__ void pass_pairs (int prefetch_offset, int schedule_offset, int round_cnt, int max_balls_per_thread, int balls_per_block, Force_Types ft); __host__ void pass_pairs_launch (dim3 dg, dim3 db, int prefetch_offset, int schedule_offset, int round_cnt, int max_balls_per_thread, int balls_per_block, Force_Types ft) { #ifdef USE_STRUCT const int shared_amt = balls_per_block * sizeof(CUDA_Phys_W); #else const int shared_amt = balls_per_block * sizeof(sm_balls[0]) * 8; #endif pass_pairs<<<dg,db,shared_amt>>> (prefetch_offset, schedule_offset, round_cnt, max_balls_per_thread, balls_per_block, ft); } #ifndef USE_STRUCT struct SM_Offsets { int idx_pos; int idx_vel; int idx_omega; int idx_prev_vel; int idx_rad_etc; int idx_to_111; int idx_ori_xyz; int factor; }; __device__ CUDA_Phys_W get_sm_ball(SM_Offsets& smo, int idx) { CUDA_Phys_W phys; const int sidx = idx * smo.factor; phys.box.velocity = sm_balls[smo.idx_vel+sidx]; phys.box.prev_velocity = sm_balls[smo.idx_prev_vel+sidx]; phys.box.position = sm_balls[smo.idx_pos+sidx]; phys.box.omega = sm_balls[smo.idx_omega+sidx]; phys.box.radius = sm_balls[smo.idx_rad_etc+sidx].x; phys.box.mass_inv = sm_balls[smo.idx_rad_etc+sidx].y; phys.read_only = phys.box.mass_inv == 0; return phys; } __device__ void upgrade_sm_box(CUDA_Phys_W& phys, SM_Offsets& smo, int idx) { const int sidx = idx * smo.factor; float4 ori; set_f4(ori,sm_balls[smo.idx_ori_xyz+sidx], sm_balls[smo.idx_rad_etc+sidx].z); pMatrix_set_rotation_transpose(phys.box.rot_inv,ori); float3 to_111 = sm_balls[smo.idx_to_111+sidx]; phys.box.to_111 = to_111; box_set_mi_vec(phys.box); } __device__ void put_sm_phys(SM_Offsets& smo, int sidx, CUDA_Phys_W& phys) { sm_balls[smo.idx_vel+sidx] = phys.ball.velocity; sm_balls[smo.idx_omega+sidx] = phys.ball.omega; } #endif __global__ void pass_pairs(int prefetch_offset, int schedule_offset, int round_cnt, int max_balls_per_thread, int balls_per_block, Force_Types ft) { const int tid = threadIdx.x; // Initialized variables used to access balls_needed and tacts_schedule // arrays. // const int si_block_size = blockIdx.x * max_balls_per_thread * blockDim.x; const int si_block_base = prefetch_offset + si_block_size + tid; const int sp_block_size = blockIdx.x * round_cnt * blockDim.x; const int sp_block_base = schedule_offset + sp_block_size + tid; /// Prefetch objects to shared memory. // #ifdef USE_STRUCT for ( int i=0; i<max_balls_per_thread; i++ ) { int idx = tid + i * blockDim.x; if ( idx >= balls_per_block ) continue; const int m_idx = block_balls_needed[ si_block_base + i * blockDim.x ]; CUDA_Phys_W& phys = sm_balls[idx]; CUDA_Ball_W& ball = phys.ball; CUDA_Box_W& box = phys.box; phys.m_idx = m_idx; if ( m_idx < 0 ) continue; int4 tact_counts = balls_x.tact_counts[m_idx]; phys.pt_type = tact_counts.x; phys.contact_count = tact_counts.y; phys.debug_pair_calls = tact_counts.z; phys.part_of_wheel = bool(tact_counts.w & 2); phys.read_only = tact_counts.w & 1; ball.velocity = xyz(balls_x.velocity[m_idx]); ball.prev_velocity = xyz(balls_x.prev_velocity[m_idx]); ball.position = xyz(balls_x.position[m_idx]); ball.omega = xyz(balls_x.omega[m_idx]); float4 ball_props = balls_x.ball_props[m_idx]; ball.radius = ball_props.x; ball.mass_inv = ball_props.y; ball.pad1 = ball_props.z; ball.pad2 = ball_props.w; if ( phys.pt_type == PT_Box ) { set_f3(box.to_111, balls_x.to_111[m_idx]); box.orientation = balls_x.orientation[m_idx]; box_geometry_update(box); } } #else SM_Offsets smo; smo.idx_pos = 0; smo.idx_vel = 1; smo.idx_omega = 2; smo.idx_prev_vel = 3; smo.idx_rad_etc = 4; smo.idx_to_111 = 5; smo.idx_ori_xyz = 6; smo.factor = 7; for ( int i=0; i<max_balls_per_thread; i++ ) { int idx = tid + i * blockDim.x; if ( idx >= balls_per_block ) continue; const int m_idx = block_balls_needed[ si_block_base + i * blockDim.x ]; if ( m_idx < 0 ) continue; int4 tact_counts = balls_x.tact_counts[m_idx]; const int pt_type = tact_counts.x; sm_balls_misc[idx].x = tact_counts.x; // pt_type sm_balls_misc[idx].y = tact_counts.y; // contact count sm_balls_misc[idx].z = tact_counts.z; // debug_pair_calls sm_balls_misc[idx].w = tact_counts.w; // Part of wheel is bit 0x2 const int sidx = idx * smo.factor; sm_balls[smo.idx_vel+sidx] = m3(balls_x.velocity[m_idx]); sm_balls[smo.idx_prev_vel+sidx] = m3(balls_x.prev_velocity[m_idx]); sm_balls[smo.idx_pos+sidx] = m3(balls_x.position[m_idx]); sm_balls[smo.idx_omega+sidx] = m3(balls_x.omega[m_idx]); float4 props =balls_x.ball_props[m_idx]; sm_balls[smo.idx_rad_etc+sidx] = m3(props); if ( pt_type == PT_Box ) { sm_balls[smo.idx_to_111+sidx] = m3(balls_x.to_111[m_idx]); const float4 orientation = balls_x.orientation[m_idx]; sm_balls[smo.idx_ori_xyz+sidx] = m3(orientation); sm_balls[smo.idx_rad_etc+sidx].z = orientation.w; } } #endif const pVect zero_vec = mv(0,0,0); /// Resolve Collisions // for ( int round=0; round<round_cnt; round++ ) { const int tsidx = sp_block_base + round * blockDim.x; SM_Idx2 indices = tacts_schedule[ tsidx ]; const int ix = indices.x; const int iy = indices.y; // Wait for all threads to reach this point (to avoid having // two threads operate on the same ball simultaneously). // __syncthreads(); if ( indices.x == indices.y ) continue; #ifdef USE_STRUCT CUDA_Phys_W& physx = sm_balls[ix]; CUDA_Phys_W& physy = sm_balls[iy]; const unsigned char ptx = physx.pt_type; const unsigned char pty = physy.pt_type; #else const int six = ix * smo.factor; const int siy = iy * smo.factor; CUDA_Phys_W physx = get_sm_ball(smo,ix); CUDA_Phys_W physy = get_sm_ball(smo,iy); const int ptx = sm_balls_misc[ix].x; const int pty = sm_balls_misc[iy].x; #endif if ( ft & FT_NonFriction ) { #ifdef USE_STRUCT physx.debug_pair_calls++; physy.debug_pair_calls++; #else sm_balls_misc[ix].z++; sm_balls_misc[iy].z++; #endif } char rv; if ( ptx == PT_Box && pty == PT_Box ) { #ifndef USE_STRUCT upgrade_sm_box(physx,smo,ix); upgrade_sm_box(physy,smo,iy); #endif rv = penetration_boxes_resolve(physx,physy,tsidx,ft); } else if ( ptx == PT_Ball && pty == PT_Box ) { #ifndef USE_STRUCT upgrade_sm_box(physy,smo,iy); #endif rv = penetration_box_ball_resolve(physy,physx,ft); } else if ( pty == PT_Ball ) { CUDA_Ball_W& ballx = physx.ball; CUDA_Ball_W& bally = physy.ball; rv = penetration_balls_resolve(ballx,bally,true,ft); } else if ( pty == PT_Box ) { // Note: Tile / Box collisions not yet handled. rv = 0; } else { CUDA_Ball_W& ballx = physx.ball; CUDA_Tile_W& tiley = physy.tile; pCoor tact_pos; pVect tact_dir; rv = tile_ball_collide(tiley, ballx, tact_pos, tact_dir); if ( !rv ) continue; CUDA_Ball_W pball; pball.radius = 1; pball.omega = pball.prev_velocity = pball.velocity = zero_vec; pball.position = tact_pos + tact_dir; pVect vbefore = physx.ball.velocity; penetration_balls_resolve(ballx, pball, false, ft); pVect delta_mo = ( 1.0f / ballx.mass_inv ) * ( ballx.velocity - vbefore ); #ifdef USE_STRUCT const bool part_of_wheel = physy.part_of_wheel; #else const bool part_of_wheel = sm_balls_misc[iy].w & 2; #endif if ( part_of_wheel ) { wheel_collect_tile_force(tiley, tact_pos, delta_mo); // Note: Need to fix this. } #ifndef USE_STRUCT put_sm_phys(smo,six,physx); sm_balls_misc[ix].y += 1; continue; #endif } #ifdef USE_STRUCT physx.contact_count += rv; physy.contact_count += rv; #else put_sm_phys(smo,six,physx); put_sm_phys(smo,siy,physy); sm_balls_misc[ix].y += rv; sm_balls_misc[iy].y += rv; #endif } __syncthreads(); /// Copy Ball Data to Memory // for ( int i=0; i<max_balls_per_thread; i++ ) { int idx = tid + i * blockDim.x; if ( idx >= balls_per_block ) continue; #ifdef USE_STRUCT CUDA_Phys_W& phys = sm_balls[idx]; const int m_idx = phys.m_idx; if ( m_idx < 0 ) continue; if ( phys.read_only ) continue; #else const int sidx = idx * smo.factor; const int m_idx = block_balls_needed[ si_block_base + i * blockDim.x ]; if ( m_idx < 0 ) continue; const float mass_inv = sm_balls[smo.idx_rad_etc+sidx].y; const bool read_only = mass_inv == 0; if ( read_only ) continue; #endif #ifdef USE_STRUCT CUDA_Ball_W& ball = phys.ball; int4 tact_counts; tact_counts.x = phys.pt_type; tact_counts.y = phys.contact_count; tact_counts.z = phys.debug_pair_calls; tact_counts.w = phys.part_of_wheel; balls_x.tact_counts[m_idx] = tact_counts; const char pt_type = phys.pt_type; set_f4(balls_x.velocity[m_idx], ball.velocity); if ( pt_type == PT_Tile ) continue; set_f4(balls_x.omega[m_idx], ball.omega); #else balls_x.tact_counts[m_idx].y = sm_balls_misc[idx].y; balls_x.tact_counts[m_idx].z = sm_balls_misc[idx].z; const unsigned char pt_type = sm_balls_misc[idx].x; set_f4(balls_x.velocity[m_idx], sm_balls[smo.idx_vel+sidx]); if ( pt_type == PT_Tile ) continue; set_f4(balls_x.omega[m_idx], sm_balls[smo.idx_omega+sidx]); #endif } } /// /// Platform Pass /// // // Resolve ball collisions with platform, also update ball position // and orientation. __device__ void platform_collision(CUDA_Phys_W& phys); __device__ void platform_collision_box(CUDA_Phys_W& phys); __global__ void pass_platform(int ball_count); __device__ void pass_platform_ball(CUDA_Phys_W& phys, int idx); __device__ void pass_platform_tile(CUDA_Phys_W& phys, int idx); __device__ void pass_platform_box(CUDA_Phys_W& phys, int idx); __host__ cudaError_t cuda_get_attr_plat_pairs (struct cudaFuncAttributes *attr_platform, struct cudaFuncAttributes *attr_pairs, struct cudaFuncAttributes *attr_xx_intersect) { collect_symbols(); // Return attributes of CUDA functions. The code needs the // maximum number of threads. cudaError_t e1 = cudaFuncGetAttributes(attr_platform,pass_platform); if ( e1 ) return e1; cudaError_t e2 = cudaFuncGetAttributes(attr_pairs,pass_pairs); if ( e2 ) return e2; cudaError_t e3 = cudaFuncGetAttributes(attr_xx_intersect,pass_xx_intersect); return e3; } __host__ void pass_platform_launch (dim3 dg, dim3 db, int ball_count) { const int block_lg = 32 - __builtin_clz(db.x-1); const int shared_amt = sizeof(float) << block_lg; pass_platform<<<dg,db,shared_amt>>>(ball_count); } __global__ void pass_platform(int ball_count) { /// Main CUDA routine for resolving collisions with platform and /// updating ball position and orientation. // One ball per thread. const int idx_base = blockIdx.x * blockDim.x; const int idx = idx_base + threadIdx.x; if ( idx >= ball_count ) return; CUDA_Phys_W phys; /// Copy ball data from memory to local variables. // // Local variables hopefully will be in GPU registers, not // slow local memory. // int4 tact_counts = balls_x.tact_counts[idx]; phys.pt_type = tact_counts.x; phys.contact_count = tact_counts.y; phys.part_of_wheel = tact_counts.w & 1; if ( phys.pt_type == PT_Ball ) pass_platform_ball(phys, idx); else if ( phys.pt_type == PT_Box ) pass_platform_box(phys, idx); else pass_platform_tile(phys, idx); /// Copy other updated data to memory. // tact_counts.y = phys.contact_count << 8; tact_counts.z = tact_counts.z << 16; balls_x.tact_counts[idx] = tact_counts; } __device__ void pass_platform_ball(CUDA_Phys_W& phys, int idx) { // One ball per thread. CUDA_Ball_W& ball = phys.ball; /// Copy ball data from memory to local variables. // // Local variables hopefully will be in GPU registers, not // slow local memory. // ball.prev_velocity = xyz(balls_x.prev_velocity[idx]); ball.velocity = xyz(balls_x.velocity[idx]) + gravity_accel_dt; set_f3(ball.position,balls_x.position[idx]); set_f3(ball.omega, balls_x.omega[idx]); float4 ball_props = balls_x.ball_props[idx]; ball.radius = ball_props.x; ball.mass_inv = ball_props.y; /// Handle Ball/Platform Collision // if ( opt_platform_curved ) platform_collision(phys); /// Handle Air Resistance // const float area = M_PI * ball.radius * ball.radius; pNorm force = mn( -area * opt_air_resistance * ball.velocity ); const float v_change = exp( -force.magnitude * ball.mass_inv * delta_t ); ball.velocity = v_change * ball.velocity; /// Update Position and Orientation // ball.position += 0.5f * delta_t * ( ball.prev_velocity + ball.velocity ); pNorm axis = mn(ball.omega); balls_x.orientation[idx] = quat_normalize ( quat_mult ( mq( axis, delta_t * axis.magnitude ), balls_x.orientation[idx] )); /// Copy other updated data to memory. // set_f4(balls_x.velocity[idx], ball.velocity); set_f4(balls_x.prev_velocity[idx], ball.velocity); set_f4(balls_x.omega[idx], ball.omega); set_f4(balls_x.position[idx], ball.position, ball.radius); } __device__ void pass_platform_tile(CUDA_Phys_W& phys, int idx) { if ( !phys.part_of_wheel ) return; const int tid = threadIdx.x; float4 tile_props = balls_x.velocity[idx]; float torque = tile_props.z; block_torque_dt[tid] = torque; tile_props.z = 0; balls_x.velocity[idx] = tile_props; float omega = wheel.omega[0]; const float3 pt_ll = xyz(balls_x.position[idx]); const float3 norm_rt = xyz(balls_x.omega[idx]); const float3 norm_up = xyz(balls_x.prev_velocity[idx]); const float3 normal = xyz(balls_x.ball_props[idx]); float torque_sum = 0; // Assuming that all are on same warp. :-) for ( int i=wheel.idx_start; i<wheel.idx_stop; i++ ) torque_sum += block_torque_dt[i]; omega -= torque_sum * wheel.moment_of_inertia_inv; const float friction_delta_omega = wheel.friction_torque * wheel.moment_of_inertia_inv * delta_t; if ( fabs(omega) <= friction_delta_omega ) omega = 0; else if ( omega > 0 ) omega -= friction_delta_omega; else omega += friction_delta_omega; const float delta_theta = omega * delta_t; pcMatrix3x3 rot; pMatrix_set_rotation(rot,wheel.axis_dir,delta_theta); const float3 rpt_ll = wheel.center + rot * ( pt_ll - wheel.center ); const float3 rnorm_rt = rot * norm_rt; const float3 rnorm_up = rot * norm_up; const float3 rnormal = rot * normal; set_f4(balls_x.position[idx],rpt_ll); set_f4(balls_x.omega[idx], rnorm_rt); set_f4(balls_x.prev_velocity[idx], rnorm_up); set_f4(balls_x.ball_props[idx], rnormal); if ( idx == wheel.idx_start ) wheel.omega[0] = omega; } __device__ void pass_platform_box(CUDA_Phys_W& phys, int idx) { // One box per thread. CUDA_Box_W& box = phys.box; /// Copy data from memory to local variables. // // Local variables hopefully will be in GPU registers, not // slow local memory. // float4 box_props = balls_x.ball_props[idx]; box.mass_inv = box_props.y; if ( box.mass_inv == 0 ) return; // Read only. box.prev_velocity = xyz(balls_x.prev_velocity[idx]); box.velocity = xyz(balls_x.velocity[idx]) + gravity_accel_dt; set_f3(box.position,balls_x.position[idx]); set_f3(box.omega, balls_x.omega[idx]); set_f3(box.to_111, balls_x.to_111[idx]); box.orientation = balls_x.orientation[idx]; box_geometry_update(box); /// Handle Ball/Platform Collision // if ( opt_platform_curved ) platform_collision_box(phys); /// Handle Air Resistance // pVect force = mv(0,0,0); for ( int d=0; d<3; d++ ) { const pVect face_normal = box_get_axis_norm(box,2-d); const float amt = dot( face_normal, box.velocity ); const float area = box_get_axis_area(box,d); force += amt * area * face_normal; } pNorm force_dir = mn(force); const float v_dir = dot(force_dir,box.velocity); const float resistance = force_dir.magnitude * opt_air_resistance; const float v_change = expf(- resistance * box.mass_inv * delta_t ); box.velocity -= v_dir * (1.0f - v_change ) * force_dir; { float3 lsq = box.to_111 * box.to_111; pVect amoment1 = mv( lsq.y + lsq.z, lsq.x + lsq.z, lsq.x + lsq.y ); float3 omega = box.omega; pVect amoment = amoment1 * box.to_111; pVect omega_l = box.rot_inv * omega; const float torque = opt_air_resistance * dot(amoment,fabs(omega_l)); const float mi = box_get_moment_of_inertia_inv(box,mn(omega)); const float o_change = exp(- torque * mi * delta_t ); box.omega = o_change * omega; } /// Update Position and Orientation // box.position += 0.5f * delta_t * ( box.prev_velocity + box.velocity ); pNorm axis = mn(box.omega); balls_x.orientation[idx] = quat_normalize ( quat_mult ( mq( axis, delta_t * axis.magnitude ), box.orientation )); /// Copy other updated data to memory. // set_f4(balls_x.velocity[idx], box.velocity); set_f4(balls_x.prev_velocity[idx], box.velocity); set_f4(balls_x.omega[idx], box.omega); set_f4(balls_x.position[idx], box.position, box_props.x); } __device__ void platform_collision_box(CUDA_Phys_W& phys) { CUDA_Box_W& box = phys.box; float radius = length(box.to_111); if ( box.position.y - radius >= 0 ) return; if ( box.position.z + radius <= platform_zmin ) return; if ( box.position.z - radius >= platform_zmax ) return; float3 axis = mv(platform_xmid,0,box.position.z); pVect btoa = mv(box.position,axis); if ( dot(btoa,btoa) < (platform_xrad-radius)*(platform_xrad-radius) ) return; box_geometry_update(box); int inside = 0; int outside_under = 0; float pen_dists[8]; CUDA_SectTT psects[5]; int ps_next = 0; float min_pd = 0; // For vertices between ends. float max_pd = 0; // Find vertices that are under the platform. // for ( int v=0; v<8; v++ ) { int v_bit = 1 << v; float3 pos = box_get_vertices(box,v); if ( pos.y > 0 ) { pen_dists[v] = 0; continue; } float3 axis = mc(platform_xmid,0,pos.z); pNorm tact_dir = mn(axis,pos); float pen_dist = tact_dir.magnitude - platform_xrad; pen_dists[v] = pen_dist; if ( pos.z < platform_zmin || pos.z > platform_zmax ) { if ( pen_dist > 0 ) outside_under |= v_bit; continue; } set_min(min_pd,pen_dist); set_max(max_pd,pen_dist); if ( pen_dist > 1 ) continue; inside |= v_bit; if ( pen_dist <= 0 ) continue; CUDA_SectTT* sect = &psects[ps_next++]; sect->start = pos; sect->dir = tact_dir.v; sect->pen_dist = pen_dist; } bool object_inside = max_pd < -min_pd; if ( !object_inside ) return; // Examine vertices that are off the edge of the platform (in the // z direction), to see if an adjoining edge intersects the platform // edge. // for ( int v=0; v<8; v++ ) { int v_bit = 1 << v; if ( ! ( v_bit & outside_under ) ) continue; // Outside Vertex (beyond z_max or z_min). // pCoor pos = box_get_vertices(box,v); float pen_dist_out = pen_dists[v]; float v_z = pos.z; float ref_z = v_z >= platform_zmax ? platform_zmax : platform_zmin; float outside_z_len = fabs(v_z - ref_z); // Look for adjoining vertices that are over the platform. // for ( int axis = 0; axis < 3; axis++ ) { int vn = v ^ ( 1 << axis ); int vn_bit = 1 << vn; if ( ! ( inside & vn_bit ) ) continue; float pen_len = pen_dists[vn] - pen_dist_out; // Inside Vertex pCoor pos_in = box_get_vertices(box,vn); // Compute the contact point at penetration distance. // float z_len = fabs(v_z - pos_in.z); if ( z_len < 0.0001f ) continue; float scale = outside_z_len / z_len; pVect to_inside = mv(pos,pos_in); pCoor tact = pos + scale * to_inside; float pen_tact = pen_dist_out + scale * pen_len; if ( pen_tact <= 0 ) continue; CUDA_SectTT* sect = &psects[ps_next++]; sect->start = tact; sect->pen_dist = pen_tact; pNorm dir = mn(cross(to_inside,mv(-tact.y,tact.x,0))); sect->dir = pen_len >= 0 ? normalize(mv(tact.x,tact.y,0)) : dir.v; } } // if ( ps_next > 0 ) phys.contact_count++; for ( int i=0; i<ps_next; i++ ) { CUDA_SectTT *sect = &psects[i]; pCoor pos = sect->start; pVect tact_dir = sect->dir; pNorm ctopos = mn(box.position,pos); pVect vel = box_get_vel(box,pos); float pen_dist = sect->pen_dist; float rad_vel = dot(vel,tact_dir); double loss_factor = 1 - opt_bounce_loss; float force_dt_no_loss = elasticity_inv_dt * pen_dist; float max_fdt_in = rad_vel / box.mass_inv; float appr_force_dt = rad_vel > 0 ? min(max_fdt_in,force_dt_no_loss) : force_dt_no_loss * loss_factor; box_apply_force_dt(box,pos, - appr_force_dt * tact_dir ); } for ( int i=0; i<ps_next; i++ ) { CUDA_SectTT *sect = &psects[i]; pCoor pos = sect->start; pVect tact_dir = sect->dir; float pen_dist = sect->pen_dist; float force_dt_no_loss = elasticity_inv_dt * pen_dist; pVect vel2 = box_get_vel(box,pos); float rad_vel2 = dot(vel2,tact_dir); pNorm tan_vel = mn( vel2 - rad_vel2 * tact_dir ); float mi_inv = box_get_moment_of_inertia_inv(box,pos,tan_vel); float fdt_limit = tan_vel.magnitude / ( box.mass_inv + mi_inv ); float fric_force_dt_no_loss = force_dt_no_loss * opt_friction_coeff; float fric_force_dt = min(fdt_limit, fric_force_dt_no_loss); box_apply_force_fric_dt(box,pos, tan_vel, -fric_force_dt); } } __device__ void platform_collision(CUDA_Phys_W& phys) { /// Check if ball in contact with platform, if so apply forces. CUDA_Ball_W& ball = phys.ball; pCoor pos = ball.position; const float r = ball.radius; bool collision_possible = pos.y < r && pos.x >= platform_xmin - r && pos.x <= platform_xmax + r && pos.z >= platform_zmin - r && pos.z <= platform_zmax + r; if ( !collision_possible ) return; CUDA_Ball_W pball; pCoor axis = mc(platform_xmid,0,pos.z); const float short_xrad = platform_xrad - r; const float short_xrad_sq = short_xrad * short_xrad; const float long_xrad = platform_xrad + r; const float long_xrad_sq = long_xrad * long_xrad; // Test for different ways ball can touch platform. If contact // is found find position of an artificial platform ball (pball) // that touches the real ball at the same place and angle as // the platform. This pball will be used for the ball-ball penetration // routine, penetration_balls_resolve. if ( pos.y > 0 ) { // Possible contact with upper edge of platform. // pCoor tact = mc(pos.x > platform_xmid ? platform_xmax : platform_xmin, 0, pos.z); pNorm tact_dir = mn(pos,tact); if ( tact_dir.mag_sq >= r * r ) return; pball.position = tact + r * tact_dir; } else if ( pos.z > platform_zmax || pos.z < platform_zmin ) { // Possible contact with side (curved) edges of platform. // pNorm ball_dir = mn(axis,pos); if ( ball_dir.mag_sq <= short_xrad_sq ) return; const float zedge = pos.z > platform_zmax ? platform_zmax : platform_zmin; pCoor axis_edge = mc(platform_xmid,0,zedge); pCoor tact = axis_edge + platform_xrad * ball_dir; pNorm tact_dir = mn(pos,tact); if ( tact_dir.mag_sq >= r * r ) return; pball.position = tact + r * tact_dir; } else { // Possible contact with surface of platform. // pNorm tact_dir = mn(axis,pos); if ( tact_dir.mag_sq <= short_xrad_sq || tact_dir.mag_sq >= long_xrad_sq ) return; pball.position = axis + ( platform_xrad + ( tact_dir.magnitude < platform_xrad ? r : -r ) ) * tact_dir; } // Finish initializing platform ball, and call routine to // resolve penetration. // pVect zero_vec = mv(0,0,0); pball.omega = zero_vec; pball.prev_velocity = pball.velocity = zero_vec; pball.radius = ball.radius; pball.mass_inv = ball.mass_inv; if ( penetration_balls_resolve(phys.ball,pball,false,FT_All) ) phys.contact_count++; } /// Compute Phys Proximity Pairs // Mapping from z-sort index to ball array index. __constant__ int *z_sort_indices; // Pre-computed z_max values. __constant__ float *z_sort_z_max; // Computed proximity values, sent to CPU. __constant__ int64_t *cuda_prox; // An array that can be used to pass values back to the CPU for // use in debugging. __constant__ float3 *pass_sched_debug; texture<float4> balls_pos_tex; texture<float4> balls_vel_tex; __global__ void pass_sched(int ball_count, float lifetime_delta_t); __device__ float ball_min_z_get (float3 position, float3 velocity, float radius, float lifetime_delta_t); __host__ bool pass_sched_launch (dim3 dg, dim3 db, int ball_count, float lifetime_delta_t, void *pos_array_dev, void *vel_array_dev) { size_t offset; const size_t size = ball_count * sizeof(float4); const cudaChannelFormatDesc fd = cudaCreateChannelDesc(32,32,32,32,cudaChannelFormatKindFloat); cudaBindTexture(&offset, balls_pos_tex, pos_array_dev, fd, size); if ( offset ) return false; cudaBindTexture(&offset, balls_vel_tex, vel_array_dev, fd, size); if ( offset ) return false; pass_sched<<<dg,db>>>(ball_count,lifetime_delta_t); return true; } __global__ void pass_sched(int ball_count, float lifetime_delta_t) { // Determine which balls that are in proximity to a ball. This // routine only works for balls, if a tile is found an I-give-up // value is returned, and the CPU will have to determine proximity. const int idx_base = blockIdx.x * blockDim.x; // idx9 is an index into z-sorted arrays. const int idx9 = idx_base + threadIdx.x; if ( idx9 >= ball_count ) return; // bidx9 is an index into the balls arrays. const int bidx9 = z_sort_indices[idx9]; // If bidx9 is negative then Phys at index bidx9 is not a ball, // so just return a give-up code 't' (tile). if ( bidx9 < 0 ) { cuda_prox[idx9] = ( 't' << 8 ) | 0xff; return; } // Fetch position, radius (packed in position vector), and velocity. // const float4 pos_rad9 = tex1Dfetch(balls_pos_tex,bidx9); const float3 pos9 = xyz(pos_rad9); const float radius9 = pos_rad9.w; const float4 vel9_pad = tex1Dfetch(balls_vel_tex,bidx9); const float3 vel9 = xyz(vel9_pad); const float z_min = ball_min_z_get(pos9,vel9,radius9,lifetime_delta_t); // Number of nearby balls. int proximity_cnt = 0; // Reason for giving up, 0 means we didn't give up (yet). char incomplete = 0; // The list of balls in proximity, packed into a single integer. Prox_Offsets offsets = 0; for ( int idx1 = idx9-1; !incomplete && idx1 >= 0; idx1-- ) { const float z_max = z_sort_z_max[idx1]; // Break if this and subsequent z-ordered balls could not // possibly be in proximity. if ( z_max < z_min ) break; const int bidx1 = z_sort_indices[idx1]; // If there's a tile here give up. // (t is for tile) if ( bidx1 < 0 ) { incomplete = 't'; continue; } const float4 pos_rad = tex1Dfetch(balls_pos_tex,bidx1); const float3 pos1 = xyz(pos_rad); const float4 vel_pad1 = tex1Dfetch(balls_vel_tex,bidx1); const float3 vel1 = xyz(vel_pad1); const float radius1 = pos_rad.w; // Use the pNorm constructor to compute the distance between two balls. pNorm dist = mn(pos1,pos9); // Balls are considered in proximity if they can be // this close over schedule lifetime. const float region_length_small = 1.11f * ( radius9 + radius1 ); // Check if balls will be close enough over lifetime. pVect delta_v = vel9 - vel1; const float delta_d = lifetime_delta_t * length(delta_v); const float dist2 = dist.magnitude - delta_d; if ( dist2 > region_length_small ) continue; // At this point the balls are considered in proximity, now // squeeze the value of bidx1 into eight bits by taking // the difference of z-sort indices, which should be close // together. const int offset = idx9 - idx1; // Ooops, exceeded the limit on the number of proximities. // (f is for full) if ( proximity_cnt >= cuda_prox_per_ball ) incomplete = 'f'; // Ooops, the offset won't fit into 8 bits. // (o is for overflow) else if ( offset >= 255 ) incomplete = 'o'; // Everything is fine, slide the offset on to the list. else offsets = ( offsets << 8 ) | offset; proximity_cnt++; } // If code could not compute all proximities replace offsets with // the error code. if ( incomplete ) offsets = ( incomplete << 8 ) | 0xff; cuda_prox[idx9] = offsets; } __device__ float ball_min_z_get (float3 position, float3 velocity, float radius, float lifetime_delta_t) { const float m = fabs(velocity.x) + fabs(velocity.y) + fabs(velocity.z); const float z_min = position.z + position.x - m * lifetime_delta_t - 2 * radius; return z_min; } static __host__ void collect_symbols() { CU_SYM(balls_x); CU_SYM(block_balls_needed); CU_SYM(tacts_schedule); CU_SYM(xx_pairs); CU_SYM(xx_sects_center); CU_SYM(xx_sects_dir); CU_SYM(xx_sects_debug); CU_SYM(gravity_accel_dt); CU_SYM(opt_bounce_loss); CU_SYM(opt_bounce_loss_box); CU_SYM(opt_friction_coeff); CU_SYM(opt_friction_roll); CU_SYM(opt_air_resistance); CU_SYM(opt_platform_curved); CU_SYM(platform_xmin); CU_SYM(platform_xmax); CU_SYM(platform_zmin); CU_SYM(platform_zmax); CU_SYM(platform_xmid); CU_SYM(platform_xrad); CU_SYM(delta_t); CU_SYM(elasticity_inv_dt); CU_SYM(opt_debug); CU_SYM(opt_debug2); CU_SYM(wheel); CU_SYM(z_sort_indices); CU_SYM(z_sort_z_max); CU_SYM(cuda_prox); CU_SYM(pass_sched_debug); }