CLamsee - Nbody

This tutorial demonstrates the implementation of a nbody simulation using the CLamsee slang. A n-body simulation numerically approximates a dynamical system of particles that interact with each other. Typical examples are simulations in astrophysics, where bodies interact through gravitational forces. The session for this tutorial is inlcuded in the ViSlang binary package but you can also download it here. CLamsee is a a general slang for parallel computing that consists of a subset of OpenCL C, an introduction can be found here.

Implementation

The implementation is adapted from the NVIDIA OpenCL SDK and explained in detail in the book GPU Gems 3.

Device Code

using Device; float4 bodyBodyInteraction(float4 ai, float4 bi, float4 bj, float softeningSquared) { float4 r; r.x = bi.x - bj.x; r.y = bi.y - bj.y; r.z = bi.z - bj.z; float distSqr = r.x * r.x + r.y * r.y + r.z * r.z; distSqr += softeningSquared; float invDist = rsqrt(distSqr); float invDistCube = invDist * invDist * invDist; float s = bj.w * invDistCube; ai.x += r.x * s; ai.y += r.y * s; ai.z += r.z * s; return ai; } float4 gravitation(float4 myPos, float4 accel, float softeningSquared, local float4 *sharedPos) { int blockDimx = get_local_size(0); int threadIdxy = get_local_id(1); for (int counter = 0; counter < blockDimx;counter = counter ) { accel = bodyBodyInteraction(accel, sharedPos[counter + (mul24(blockDimx, threadIdxy))], myPos, softeningSquared); counter += 1; accel = bodyBodyInteraction(accel, sharedPos[counter + (mul24(blockDimx, threadIdxy))], myPos, softeningSquared); counter += 1; accel = bodyBodyInteraction(accel, sharedPos[counter + (mul24(blockDimx, threadIdxy))], myPos, softeningSquared); counter += 1; accel = bodyBodyInteraction(accel, sharedPos[counter + (mul24(blockDimx, threadIdxy))], myPos, softeningSquared); counter += 1; } return accel; } float4 computeBodyAccel_MT(float4 bodyPos, global float4 *positions, int numBodies, float softeningSquared, local float4 *sharedPos) { float4 acc = {0,0,0,0}; int threadIdxx = get_local_id(0); int threadIdxy = get_local_id(1); int blockIdxx = get_group_id(0); int blockIdxy = get_group_id(1); int gridDimx = get_num_groups(0); int blockDimx = get_local_size(0); int blockDimy = get_local_size(1); int numTiles = numBodies / mul24(blockDimx, blockDimy); for (int tile = blockIdxy; tile < numTiles + blockIdxy; tile +=1) { int wrap = -1; if((blockIdxx + (blockDimy* tile)+ threadIdxy) < gridDimx) { wrap = blockIdxx + mul24(blockDimy, tile) + threadIdxy; } else { wrap = blockIdxx + mul24(blockDimy, tile) + threadIdxy - gridDimx; } sharedPos[threadIdxx + blockDimx * threadIdxy] = positions[wrap * blockDimx + threadIdxx]; barrier(CLK_LOCAL_MEM_FENCE); acc = gravitation(bodyPos, acc, softeningSquared, sharedPos); barrier(CLK_LOCAL_MEM_FENCE); } sharedPos[threadIdxx + mul24(blockDimx, threadIdxy)].x = acc.x; sharedPos[threadIdxx + mul24(blockDimx, threadIdxy)].y = acc.y; sharedPos[threadIdxx + mul24(blockDimx, threadIdxy)].z = acc.z; barrier(CLK_LOCAL_MEM_FENCE); if (get_local_id(0) == 0) { for (int i = 1; i < blockDimy; i+=1) { acc.x += sharedPos[threadIdxx + mul24(blockDimx, i)].x; acc.y += sharedPos[threadIdxx + mul24(blockDimx, i)].y; acc.z += sharedPos[threadIdxx + mul24(blockDimx, i)].z; } } return acc; } kernel integrateBodies_MT(global float4 *newPos, global float4 *newVel, global float4 *oldPos, global float4 *oldVel, float deltaTime, float damping, float softeningSquared, int numBodies, int sharedMemSize) { local float4 sharedPos[sharedMemSize]; int threadIdxx = get_local_id(0); int threadIdxy = get_local_id(1); int blockIdxx = get_group_id(0); int blockIdxy = get_group_id(1); int gridDimx = get_num_groups(0); int blockDimx = get_local_size(0); int blockDimy = get_local_size(1); int index = mul24(blockIdxx, blockDimx) + threadIdxx; float4 pos = oldPos[index]; float4 accel = computeBodyAccel_MT(pos, oldPos, numBodies, softeningSquared, sharedPos); float4 vel = oldVel[index]; vel.x += accel.x * deltaTime; vel.y += accel.y * deltaTime; vel.z += accel.z * deltaTime; vel.x *= damping; vel.y *= damping; vel.z *= damping; // new position = old position + velocity * deltaTime pos.x += vel.x * deltaTime; pos.y += vel.y * deltaTime; pos.z += vel.z * deltaTime; // store new position and velocity newPos[index] = pos; newVel[index] = vel; }

Host Code

// initialize parameters numBodies = 8192; Device.createGLArray('oldPos', numBodies); float4 [numBodies] newPos; float4 [numBodies] newVel; float4 [numBodies] oldVel; vs.declareFloat('deltaTime', 0.016, 0, 0.1); vs.declareFloat('damping', 0.998, 0.9, 1); vs.declareFloat('softeningSquared', 0.01, 0.00001, 10); // generate random positions and velocities Arrays.randomizeBodies(oldPos, 2, 10); Arrays.fillRandom(oldVel, 11); void setup(float x, float y) { integer p = x; integer q = y; Device.setLocalWorkSize(p,q); Device.setGlobalWorkSize(numBodies,q); integer sharedMemSize = p*q; } setup(64,4); boolean run = false; void simulation() { while(run) { Device.integrateBodies_MT(newPos, newVel, oldPos, oldVel, deltaTime, damping, softeningSquared, numBodies, sharedMemSize); Device.integrateBodies_MT(oldPos, oldVel, newPos, newVel, deltaTime, damping, softeningSquared, numBodies, sharedMemSize); } } // call kernel run -> simulation(); run = true;

Copyright © 2015-2017 vislang.net