Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate gpuLocalTreeWalk and Ewald Kernel Launches #186

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e2393c7
clearRegisteredPieces called before tree build
spencerw Sep 25, 2024
7dec4c5
CUDA streams assigned to TreePieces after tree build
spencerw Sep 25, 2024
ba603d6
Move clearRegisteredPieces inside buildTree
spencerw Sep 25, 2024
8bbeccb
Remove unused code
spencerw Sep 25, 2024
049e4fa
Move assignCUDAStreams() call inside of buildTree()
spencerw Sep 30, 2024
8ddb491
Remove DataTransfer calls from TreePieceCellListDataTransferLocal whe…
spencerw Oct 22, 2024
a5e500b
Hack to launch gpuLocalTreeWalk from DataManager
spencerw Oct 22, 2024
83961cf
Hack to run remote gravity without bookkeeping
spencerw Oct 23, 2024
f24d350
Restore bookkeeping around GPU local tree walk
spencerw Nov 12, 2024
bb0d5bb
Use bare callback for local walk
spencerw Nov 13, 2024
5eb603a
First attempt at consolidated Ewald GPU kernel
spencerw Nov 13, 2024
85191fe
Merge remote-tracking branch 'remotes/origin/dm_tp' into kernelfix
spencerw Nov 14, 2024
37ba423
More fixes to consolidated Ewald
spencerw Nov 18, 2024
336b959
Ensure device memory pointers are passed to TreePieces even though co…
spencerw Nov 19, 2024
ee3d927
Call finishBucket after local tree walk, ensure smooth happens before…
spencerw Nov 19, 2024
aa919f8
Pass nReplicas and fPeriod to DataManagerLocalTreeWalk
spencerw Nov 20, 2024
40de4f5
Removed unused Ewald GPU code
spencerw Nov 20, 2024
6b012e5
Ewald GPU data to pinned host memory, remove markers
spencerw Nov 20, 2024
e4cb17f
Comments and code cleanup
spencerw Nov 20, 2024
2866fdb
Fix for bucket bookkeeping
spencerw Nov 21, 2024
bce757c
Temporarily move initiatePrefetch to commenceCalculateGravityLocal
spencerw Dec 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 1 addition & 140 deletions Compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1391,146 +1391,6 @@ template<class type> int calcParticleForces(TreePiece *tp, int b, int activeRung
return computed;
}

#ifdef GPU_LOCAL_TREE_WALK

// XXX This appears to be identical to cudaCallback(), I think it can
// be deleted --trq.
void cudaCallbackForAllBuckets(void *param, void *msg) {
CudaRequest *data = (CudaRequest *)param;
int *affectedBuckets = data->affectedBuckets;
TreePiece *tp = (TreePiece*)data->tp;
DoubleWalkState *state = (DoubleWalkState *)data->state;
int bucket;

int numBucketsDone = data->numBucketsPlusOne-1;

// bucket computations finished
//
for(int i = 0; i < numBucketsDone; i++){
bucket = affectedBuckets[i];
state->counterArrays[0][bucket]--;
tp->finishBucket(bucket);
}

// free data structures
if(numBucketsDone > 0){
delete [] data->affectedBuckets;
}
freePinnedHostMemory(data->list);
freePinnedHostMemory(data->bucketMarkers);
freePinnedHostMemory(data->bucketStarts);
freePinnedHostMemory(data->bucketSizes);

#ifdef CUDA_PRINT_ERRORS
printf("cudaCallbackForAllBuckets: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif

delete ((CkCallback *)data->cb);
delete data;
}

/**
 * This function is designed to send an ignition signal to the GPU manager.
 * To make only minor changes to the existing ChaNGa code, we mimic a
 * nodeGravityCompute request. This request will eventually call our GPU
 * local tree walk kernel.
 *
 * @param state       Walk state; cast to DoubleWalkState so that
 *                    counterArrays[0] can be incremented per active bucket.
 * @param tp          TreePiece whose buckets, device pointers and stream are
 *                    copied into the request.
 * @param activeRung  A bucket is treated as active when its `rungs` value is
 *                    greater than or equal to this.
 * @param startBucket First bucket index considered (inclusive).
 * @param endBucket   End of the bucket range (exclusive).
 */
void ListCompute::sendLocalTreeWalkTriggerToGpu(State *state, TreePiece *tp,
    int activeRung, int startBucket, int endBucket) {
  int numFilledBuckets = 0;
  for (int i = startBucket; i < endBucket; ++i) {
    if (tp->bucketList[i]->rungs >= activeRung) {
      ++numFilledBuckets;
    }
  }

  // Not necessary to call the GPU kernel if there is no active bucket in the
  // current tree piece
  if (numFilledBuckets == 0) {
    return;
  }

  int *affectedBuckets = new int[numFilledBuckets];

  // Set up a series of dummy parameters to match existing function interfaces
  int dummyTotalNumInteractions = 1;
  int dummyCurBucket = 0;
  ILCell *dummyFlatlists = NULL;
  int *dummyNodeMarkers = NULL;
  int *dummyStarts = NULL;
  int *dummySizes = NULL;

  // XXX I think this can be deleted --trq.
#ifdef PINNED_HOST_MEMORY
  allocatePinnedHostMemory((void **)&dummyFlatlists, dummyTotalNumInteractions *
                           sizeof(ILCell));
  allocatePinnedHostMemory((void **)&dummyNodeMarkers, (numFilledBuckets+1) *
                           sizeof(int));
  allocatePinnedHostMemory((void **)&dummyStarts, numFilledBuckets *
                           sizeof(int));
  allocatePinnedHostMemory((void **)&dummySizes, numFilledBuckets * sizeof(int));
#else
  // This branch previously referred to `dummy_totalNumInteractions`, a name
  // that is never declared, so it could not compile without
  // PINNED_HOST_MEMORY. Use the variable declared above.
  dummyFlatlists = (ILCell *) malloc(dummyTotalNumInteractions*sizeof(ILCell));
  dummyNodeMarkers = (int *) malloc((numFilledBuckets+1)*sizeof(int));
  dummyStarts = (int *) malloc(numFilledBuckets*sizeof(int));
  dummySizes = (int *) malloc(numFilledBuckets*sizeof(int));
#endif

  // No need to memset the interaction list array since we're not using it at all
  // And, we can't directly memset it.
  ILCell temp_ilc;
  memcpy(&dummyFlatlists[0], &temp_ilc, dummyTotalNumInteractions *
         sizeof(ILCell));
  for (int i = startBucket; i < endBucket; ++i) {
    if (tp->bucketList[i]->rungs >= activeRung) {
      // One outstanding request for this bucket; the completion callback
      // decrements the same counter.
      ((DoubleWalkState *)state)->counterArrays[0][i] ++;
      dummyNodeMarkers[dummyCurBucket] = tp->bucketList[i]->nodeArrayIndex;
      dummyStarts[dummyCurBucket] = 0;
      dummySizes[dummyCurBucket] = 0;
      affectedBuckets[dummyCurBucket] = i;
      dummyCurBucket++;
    }
  }

  CudaRequest *request = new CudaRequest;

  request->d_localMoments = tp->d_localMoments;
  request->d_localParts = tp->d_localParts;
  request->d_localVars = tp->d_localVars;
  request->sMoments = tp->sMoments;
  request->sCompactParts = tp->sCompactParts;
  request->sVarParts = tp->sVarParts;
  request->stream = tp->stream;

  request->numBucketsPlusOne = numFilledBuckets+1;
  request->affectedBuckets = affectedBuckets;
  request->tp = (void *)tp;
  request->state = (void *)state;
  request->node = true;
  request->remote = false;
  request->firstParticle = tp->FirstGPUParticleIndex;
  request->lastParticle = tp->LastGPUParticleIndex;
  // The DataManager serializes the local tree so that the root of the local
  // tree will always be the 0th element in the moments array.
  request->rootIdx = 0;
  request->theta = theta;
  request->thetaMono = thetaMono;
  request->nReplicas = tp->nReplicas;
  request->fperiod = tp->fPeriod.x;
  request->fperiodY = tp->fPeriod.y;
  request->fperiodZ = tp->fPeriod.z;
  request->cb = new CkCallback(cudaCallbackForAllBuckets, request);

  request->list = (void *)dummyFlatlists;
  request->bucketMarkers = dummyNodeMarkers;
  request->bucketStarts = dummyStarts;
  request->bucketSizes = dummySizes;
  request->numInteractions = dummyTotalNumInteractions;

  TreePieceCellListDataTransferLocal(request);
}
#endif //GPU_LOCAL_TREE_WALK

/// @brief Check for computation
/// Computation can be done on buckets indexed from start to end
/// @param state_ State to be checked
Expand Down Expand Up @@ -2130,6 +1990,7 @@ void ListCompute::sendNodeInteractionsToGpu(DoubleWalkState *state,
#ifdef HAPI_TRACE
tp->localNodeInteractions += state->nodeLists.totalNumInteractions;
#endif
// TODO does this ever trigger when GPU_LOCAL_TREE_WALK is defined?
TreePieceCellListDataTransferLocal(data);
}
else if(type == Remote && !state->resume){
Expand Down
3 changes: 0 additions & 3 deletions Compute.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,9 +185,6 @@ class ListCompute : public Compute{
/// Sets the bUseCpu flag to 1.
void enableCpu() {bUseCpu = 1;}

#ifdef CUDA
#ifdef GPU_LOCAL_TREE_WALK
void sendLocalTreeWalkTriggerToGpu(State *state, TreePiece *tp, int activeRung, int startBucket, int endBucket);
#endif //GPU_LOCAL_TREE_WALK
void sendNodeInteractionsToGpu(DoubleWalkState *state, TreePiece *tp);
void sendPartInteractionsToGpu(DoubleWalkState *state, TreePiece *tp);
#endif
Expand Down
155 changes: 124 additions & 31 deletions DataManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ void DataManager::init() {
root = NULL;
oldNumChunks = 0;
chunkRoots = NULL;
cleanupTreePieces = true;
#ifdef CUDA
treePiecesDone = 0;
treePiecesDonePrefetch = 0;
Expand Down Expand Up @@ -162,21 +161,20 @@ void DataManager::notifyPresence(Tree::GenericTreeNode *root, TreePiece *tp) {
/// \brief Clear registeredTreePieces on this node.
///
/// Removes every entry from registeredTreePieces, sets the
/// cleanupTreePieces flag, and then calls contribute() with \a cb.
/// \param cb Callback handed to contribute() once the list has been cleared.
void DataManager::clearRegisteredPieces(const CkCallback& cb) {
registeredTreePieces.removeAll();
cleanupTreePieces = true;
contribute(cb);
}

#ifdef CUDA
// This gets called before a tree build happens and ensures that
// registeredTreePieces doesn't get cleared during combineLocalTrees
// if we are about to do a gravity calculation on the GPU
void DataManager::unmarkTreePiecesForCleanup(const CkCallback& cb) {
cleanupTreePieces = false;
contribute(cb);
void DataManager::assignCUDAStreams(const CkCallback& cb) {
int tpIdx;
for(int i = 0; i < registeredTreePieces.size(); i++) {
tpIdx = registeredTreePieces[i].treePiece->getIndex();
treePieces[tpIdx].assignCUDAStream((intptr_t) &streams[tpIdx % numStreams]);
}
contribute(cb);
}
#endif


/// \brief Build a local tree inside the node.
///
/// This will be an exact superset of all the trees in this
Expand Down Expand Up @@ -223,10 +221,6 @@ void DataManager::combineLocalTrees(CkReductionMsg *msg) {
}
root = buildProcessorTree(totalChares, &gtn[0]);

if (cleanupTreePieces) {
registeredTreePieces.removeAll();
}

#ifdef PRINT_MERGED_TREE
ostringstream dmName;
dmName << "dm_" << CkMyNode();
Expand Down Expand Up @@ -500,30 +494,131 @@ void DataManager::serializeLocalTree(){
CmiUnlock(__nodelock);
}

/// @brief Record when all TreePieces have finished their Ewald initialization
/// Launch the Ewald kernel on the GPU
///
/// Each call bumps treePiecesEwaldReady under the node lock. Only the call
/// that brings the counter up to the number of registered TreePieces resets
/// it and goes on to launch the kernel; every other call returns early.
/// @param largephase Not used in this function body.
void DataManager::startEwaldGPU(int largephase) {
  CmiLock(__nodelock);
  treePiecesEwaldReady++;
  const bool everyPieceReady =
      (treePiecesEwaldReady == registeredTreePieces.length());
  if (everyPieceReady) {
    treePiecesEwaldReady = 0;
  }
  CmiUnlock(__nodelock);

  if (!everyPieceReady) {
    return;
  }

  // finishEwaldGPU() is the completion callback handed to the launch below.
  localTransferCallback
    = new CkCallback(CkIndex_DataManager::finishEwaldGPU(), CkMyNode(), dMProxy);

  DataManagerEwald(d_localParts, d_localVars, ewt, cachedData, savedNumTotalParticles-1, streams[0], localTransferCallback);
}

/// @brief Callback from Ewald kernel launch on GPU
///
/// Deletes the callback object, frees the h_idata, ewt and cachedData
/// buffers, then marks every bucket of every registered TreePiece as
/// finished and calls finishBucket() on it.
void DataManager::finishEwaldGPU() {
  delete localTransferCallback;

  freePinnedHostMemory(h_idata);
  freePinnedHostMemory(ewt);
  freePinnedHostMemory(cachedData);

  for (int pieceIdx = 0; pieceIdx < registeredTreePieces.length(); pieceIdx++) {
    int bucketIdx = 0;
    // The bucket count is re-read on every pass, as in a plain for-loop test.
    while (bucketIdx < registeredTreePieces[pieceIdx].treePiece->getNumBuckets()) {
      registeredTreePieces[pieceIdx].treePiece->bucketReqs[bucketIdx].finished = 1;
      registeredTreePieces[pieceIdx].treePiece->finishBucket(bucketIdx);
      bucketIdx++;
    }
  }
}

/// @brief Callback from local tree walk on GPU
/// Call finishBucket for all buckets and TreePieces
/// Start Ewald calculation if enabled
///
/// Deletes the callback object, calls cudaFinishAllBuckets() on every
/// registered TreePiece, allocates the host buffers used by the Ewald
/// messages (h_idata, ewt, cachedData), resets treePiecesEwaldReady, and,
/// when bEwald is set on the first registered TreePiece, sends one EwaldMsg
/// per registered TreePiece via calculateEwald().
void DataManager::finishLocalWalk() {
  delete localTransferCallback;

  for(int i = 0; i < registeredTreePieces.length(); i++){
    registeredTreePieces[i].treePiece->cudaFinishAllBuckets();
  }

  // The size used to be written `sizeof(EwaldData)*savedNumTotalParticles-1`,
  // which parses as (sizeof * N) - 1 bytes, i.e. one byte short of N
  // elements. Request a whole number of elements instead.
  // NOTE(review): these buffers are freed in finishEwaldGPU(); if bEwald is
  // false that callback is presumably never reached and they would leak.
  // Confirm, and consider allocating only when Ewald is enabled.
  allocatePinnedHostMemory((void **)&h_idata, sizeof(EwaldData)*savedNumTotalParticles);
  allocatePinnedHostMemory((void **)&ewt, sizeof(EwtData)*NEWH);
  allocatePinnedHostMemory((void **)&cachedData, sizeof(EwaldReadOnlyData));

  treePiecesEwaldReady = 0;
  for(int i = 0; i < registeredTreePieces.length(); i++){
    int in = registeredTreePieces[i].treePiece->getIndex();
    if(registeredTreePieces[0].treePiece->bEwald) {
      EwaldMsg *msg = new (8*sizeof(int)) EwaldMsg;
      msg->fromInit = false;
      // Make priority lower than smooth.
      *((int *)CkPriorityPtr(msg)) = 3*numTreePieces + in + 1;
      CkSetQueueing(msg,CK_QUEUEING_IFIFO);
      // Each TreePiece gets its own h_idata slot; ewt and cachedData are shared.
      msg->h_idata = &h_idata[i];
      msg->cachedData = cachedData;
      msg->ewt = ewt;
      treePieces[in].calculateEwald(msg);
    }
  }
}

/// @brief Callback from local data transfer to GPU
/// Indicate the transfer is done, and start the local gravity walks
/// on the treepieces on this node.
/// Indicate the transfer is done, and start the local gravity walk
/// in one big kernel launch
void DataManager::startLocalWalk() {
delete localTransferCallback;

// Notify TreePieces of device memory pointers for remote gravity
for(int i = 0; i < registeredTreePieces.length(); i++){
if(verbosity > 1) CkPrintf("[%d] GravityLocal %d\n", CkMyPe(), i);
int in = registeredTreePieces[i].treePiece->getIndex();
treePieces[in].commenceCalculateGravityLocal((intptr_t)d_localMoments,
(intptr_t)d_localParts,
(intptr_t)d_localVars,
(intptr_t)streams, numStreams,
sMoments, sCompactParts, sVarParts);
if(registeredTreePieces[0].treePiece->bEwald) {
EwaldMsg *msg = new (8*sizeof(int)) EwaldMsg;
msg->fromInit = false;
// Make priority lower than gravity or smooth.
*((int *)CkPriorityPtr(msg)) = 3*numTreePieces + in + 1;
CkSetQueueing(msg,CK_QUEUEING_IFIFO);
treePieces[in].calculateEwald(msg);
}
registeredTreePieces[i].treePiece->assignGPUGravityPtrs((intptr_t)d_localMoments,
(intptr_t)d_localParts,
(intptr_t)d_localVars,
sMoments, sCompactParts, sVarParts);
treePieces[in].commenceCalculateGravityLocal();
}

localTransferCallback
= new CkCallback(CkIndex_DataManager::finishLocalWalk(), CkMyNode(), dMProxy);

CudaRequest *request = new CudaRequest;

request->d_localMoments = d_localMoments;
request->d_localParts = d_localParts;
request->d_localVars = d_localVars;
request->sMoments = sMoments;
request->sCompactParts = sCompactParts;
request->sVarParts = sVarParts;
request->stream = streams[0];

request->numBucketsPlusOne = 0;
request->affectedBuckets = 0;
request->tp = this;
request->state = NULL;
request->node = true;
request->remote = false;
request->firstParticle = 0;
request->lastParticle = savedNumTotalParticles-1;

request->rootIdx = 0;
request->theta = theta;
request->thetaMono = thetaMono;
request->nReplicas = registeredTreePieces[0].treePiece->nReplicas;
request->fperiod = registeredTreePieces[0].treePiece->fPeriod.x;
request->fperiodY = registeredTreePieces[0].treePiece->fPeriod.y;
request->fperiodZ = registeredTreePieces[0].treePiece->fPeriod.z;
request->cb = localTransferCallback;

request->list = NULL;
request->bucketMarkers = NULL;
request->bucketStarts = NULL;
request->bucketSizes = NULL;
request->numInteractions = 0;

DataManagerLocalTreeWalk(request);

freePinnedHostMemory(bufLocalMoments);
freePinnedHostMemory(bufLocalParts);
freePinnedHostMemory(bufLocalVars);
Expand Down Expand Up @@ -1008,7 +1103,6 @@ void DataManager::transferParticleVarsBack(){
cudaFree(d_localVars);
cudaFree(d_remoteMoments);
cudaFree(d_remoteParts);
cleanupTreePieces = true;

#ifdef CUDA_PRINT_ERRORS
printf("transferParticleVarsBack: %s\n", cudaGetErrorString( cudaGetLastError() ) );
Expand Down Expand Up @@ -1069,7 +1163,6 @@ void DataManager::updateParticlesFreeMemory(UpdateParticlesStruct *data)
}
delete (data->cb);
delete data;
registeredTreePieces.length() = 0;
}
CmiUnlock(__nodelock);
}
Expand Down
Loading
Loading