26 #ifndef _GALOIS_GLUONSUB_H_
27 #define _GALOIS_GLUONSUB_H_
29 #include <unordered_map>
38 #ifdef GALOIS_ENABLE_GPU
48 #ifdef GALOIS_USE_BARE_MPI
49 extern BareMPI bare_mpi;
82 template <
typename GraphTy>
92 constexpr
static const char*
const RNAME =
"Gluon";
99 std::pair<unsigned, unsigned> cartesianGrid;
100 bool partitionAgnostic;
119 std::vector<std::vector<size_t>> masterNodes;
122 std::vector<std::vector<size_t>>& mirrorNodes;
124 size_t maxSharedSize;
126 #ifdef GALOIS_USE_BARE_MPI
127 std::vector<MPI_Group> mpi_identity_groups;
139 void reset_bitset(SyncType syncType,
141 size_t numMasters = userGraph.numMasters();
142 if (numMasters > 0) {
145 if (syncType == syncBroadcast) {
148 assert(syncType == syncReduce);
150 if (numMasters < userGraph.size()) {
156 if (syncType == syncReduce) {
157 if (userGraph.size() > 0) {
165 void inline incrementEvilPhase() {
181 void exchangeProxyInfo() {
185 for (
unsigned x = 0; x < numHosts; ++x) {
190 gSerialize(b, mirrorNodes[x]);
195 for (
unsigned x = 0; x < numHosts; ++x) {
206 incrementEvilPhase();
213 void sendInfoToHost() {
216 uint64_t global_total_mirror_nodes =
217 userGraph.size() - userGraph.numMasters();
218 uint64_t global_total_owned_nodes = userGraph.numMasters();
221 for (
unsigned x = 0; x < numHosts; ++x) {
226 gSerialize(b, global_total_mirror_nodes, global_total_owned_nodes);
231 for (
unsigned x = 0; x < numHosts; ++x) {
240 uint64_t total_mirror_nodes_from_others;
241 uint64_t total_owned_nodes_from_others;
243 total_owned_nodes_from_others);
244 global_total_mirror_nodes += total_mirror_nodes_from_others;
245 global_total_owned_nodes += total_owned_nodes_from_others;
247 incrementEvilPhase();
249 assert(userGraph.globalSize() == global_total_owned_nodes);
252 reportProxyStats(global_total_mirror_nodes, global_total_owned_nodes);
260 void setupCommunication() {
274 for (uint32_t h = 0; h < masterNodes.size(); ++h) {
278 masterNodes[h][n] = userGraph.getLID(masterNodes[h][n]);
280 #if GALOIS_COMM_STATS
286 for (uint32_t h = 0; h < mirrorNodes.size(); ++h) {
290 mirrorNodes[h][n] = userGraph.getLID(mirrorNodes[h][n]);
292 #if GALOIS_COMM_STATS
302 for (
auto x = 0U; x < masterNodes.size(); ++x) {
305 std::string master_nodes_str =
306 "MasterNodesFrom_" + std::to_string(
id) +
"_To_" + std::to_string(x);
307 galois::runtime::reportStatCond_Tsum<MORE_DIST_STATS>(
308 RNAME, master_nodes_str, masterNodes[x].size());
309 if (masterNodes[x].size() > maxSharedSize) {
310 maxSharedSize = masterNodes[x].size();
314 for (
auto x = 0U; x < mirrorNodes.size(); ++x) {
317 std::string mirror_nodes_str =
318 "MirrorNodesFrom_" + std::to_string(x) +
"_To_" + std::to_string(
id);
319 galois::runtime::reportStatCond_Tsum<MORE_DIST_STATS>(
320 RNAME, mirror_nodes_str, mirrorNodes[x].size());
321 if (mirrorNodes[x].size() > maxSharedSize) {
322 maxSharedSize = mirrorNodes[x].size();
341 void reportProxyStats(uint64_t global_total_mirror_nodes,
342 uint64_t GALOIS_UNUSED(global_total_owned_nodes)) {
343 float replication_factor =
344 (float)(global_total_mirror_nodes + userGraph.globalSize()) /
345 (
float)userGraph.globalSize();
349 galois::runtime::reportStatCond_Single<MORE_DIST_STATS>(
350 RNAME,
"TotalNodes", userGraph.globalSize());
351 galois::runtime::reportStatCond_Single<MORE_DIST_STATS>(
352 RNAME,
"TotalGlobalMirrorNodes", global_total_mirror_nodes);
363 #ifdef GALOIS_USE_BARE_MPI
364 if (bare_mpi == noBareMPI)
367 #ifdef GALOIS_USE_LCI
370 MPI_Comm_rank(MPI_COMM_WORLD, &taskRank);
371 if ((
unsigned)taskRank !=
id)
374 MPI_Comm_size(MPI_COMM_WORLD, &numTasks);
375 if ((
unsigned)numTasks != numHosts)
379 MPI_Group world_group;
380 MPI_Comm_group(MPI_COMM_WORLD, &world_group);
381 mpi_identity_groups.resize(numHosts);
383 for (
unsigned x = 0; x < numHosts; ++x) {
384 const int g[1] = {(int)x};
385 MPI_Group_incl(world_group, 1, g, &mpi_identity_groups[x]);
390 case nonBlockingBareMPI:
393 case oneSidedBareMPI:
424 GraphTy& _userGraph,
unsigned host,
unsigned numHosts,
bool _transposed,
425 std::pair<unsigned, unsigned> _cartesianGrid = std::make_pair(0u, 0u),
426 bool _partitionAgnostic =
false,
428 : galois::runtime::
GlobalObject(this), userGraph(_userGraph), id(host),
429 transposed(_transposed), isVertexCut(userGraph.is_vertex_cut()),
430 cartesianGrid(_cartesianGrid), partitionAgnostic(_partitionAgnostic),
431 substrateDataMode(_enforcedDataMode), numHosts(numHosts), num_run(0),
432 num_round(0), currentBVFlag(nullptr),
433 mirrorNodes(userGraph.getMirrorNodes()) {
434 if (cartesianGrid.first != 0 && cartesianGrid.second != 0) {
435 GALOIS_ASSERT(cartesianGrid.first * cartesianGrid.second == numHosts,
436 "Cartesian split doesn't equal number of hosts");
438 galois::gInfo(
"Gluon optimizing communication for 2-D cartesian cut: ",
439 cartesianGrid.first,
" x ", cartesianGrid.second);
443 assert(cartesianGrid.first == 0 && cartesianGrid.second == 0);
452 masterNodes.resize(numHosts);
455 "GraphCommSetupTime", RNAME);
456 Tgraph_construct_comm.
start();
457 setupCommunication();
458 Tgraph_construct_comm.
stop();
479 template <SyncType syncType>
480 void getOffsetsFromBitset(
const std::string& loopName,
483 size_t& bit_set_count)
const {
485 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
486 std::string offsets_timer_str(syncTypeStr +
"Offsets_" +
494 std::vector<unsigned int> t_prefix_bit_counts(
activeThreads);
499 unsigned int block_size = bitset_comm.
size() / nthreads;
500 if ((bitset_comm.
size() % nthreads) > 0)
502 assert((block_size * nthreads) >= bitset_comm.
size());
504 unsigned int start = tid * block_size;
505 unsigned int end = (tid + 1) * block_size;
506 if (end > bitset_comm.
size())
507 end = bitset_comm.
size();
509 unsigned int count = 0;
510 for (
unsigned int i = start; i < end; ++i) {
511 if (bitset_comm.
test(i))
515 t_prefix_bit_counts[tid] = count;
520 t_prefix_bit_counts[i] += t_prefix_bit_counts[i - 1];
523 bit_set_count = t_prefix_bit_counts[activeThreads - 1];
527 if (bit_set_count > 0) {
528 offsets.
resize(bit_set_count);
533 unsigned int block_size = bitset_comm.
size() / nthreads;
534 if ((bitset_comm.
size() % nthreads) > 0)
536 assert((block_size * nthreads) >= bitset_comm.
size());
538 unsigned int start = tid * block_size;
539 unsigned int end = (tid + 1) * block_size;
540 if (end > bitset_comm.
size())
541 end = bitset_comm.
size();
543 unsigned int count = 0;
544 unsigned int t_prefix_bit_count;
546 t_prefix_bit_count = 0;
548 t_prefix_bit_count = t_prefix_bit_counts[tid - 1];
551 for (
unsigned int i = start; i < end; ++i) {
552 if (bitset_comm.
test(i)) {
553 offsets[t_prefix_bit_count + count] = i;
584 template <
typename FnTy, SyncType syncType>
585 void getBitsetAndOffsets(
const std::string& loopName,
586 const std::vector<size_t>& indices,
590 size_t& bit_set_count,
592 if (substrateDataMode !=
onlyData) {
594 std::string syncTypeStr =
595 (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
596 std::string doall_str(syncTypeStr +
"Bitset_" + loopName);
605 size_t lid = indices[n];
606 if (bitset_compute.
test(lid)) {
610 #if GALOIS_COMM_STATS
616 getOffsetsFromBitset<syncType>(loopName, bitset_comm, offsets,
621 get_data_mode<typename FnTy::ValTy>(bit_set_count, indices.size());
624 template <
typename SyncFnTy>
625 size_t getMaxSendBufferSize(uint32_t numShared) {
626 if (substrateDataMode ==
gidsData) {
627 return sizeof(
DataCommMode) +
sizeof(
size_t) +
sizeof(size_t) +
628 (numShared *
sizeof(
unsigned int)) +
sizeof(
size_t) +
629 (numShared *
sizeof(
typename SyncFnTy::ValTy));
631 return sizeof(
DataCommMode) +
sizeof(
size_t) +
sizeof(size_t) +
632 (numShared *
sizeof(
unsigned int)) +
sizeof(
size_t) +
633 (numShared *
sizeof(
typename SyncFnTy::ValTy));
635 size_t bitset_alloc_size = ((numShared + 63) / 64) *
sizeof(uint64_t);
639 + bitset_alloc_size +
sizeof(size_t) +
640 (numShared *
sizeof(
typename SyncFnTy::ValTy));
642 size_t bitset_alloc_size = ((numShared + 63) / 64) *
sizeof(uint64_t);
646 + bitset_alloc_size +
sizeof(size_t) +
647 (numShared *
sizeof(
typename SyncFnTy::ValTy));
665 template <SyncType syncType>
666 void convertLIDToGID(
const std::string& loopName,
667 const std::vector<size_t>& indices,
669 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
670 std::string doall_str(syncTypeStr +
"_LID2GID_" +
676 static_cast<uint32_t
>(userGraph.getGID(indices[offsets[n]]));
678 #if GALOIS_COMM_STATS
692 template <SyncType syncType>
693 void convertGIDToLID(
const std::string& loopName,
695 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
696 std::string doall_str(syncTypeStr +
"_GID2LID_" +
701 [&](
size_t n) { offsets[n] = userGraph.getLID(offsets[n]); },
702 #if GALOIS_COMM_STATS
724 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
typename VecTy,
726 typename std::enable_if<!BitsetFnTy::is_vector_bitset()>::type* =
nullptr>
727 void getSendBuffer(std::string loopName,
unsigned x,
729 auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes;
731 if (BitsetFnTy::is_valid()) {
732 syncExtract<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(
733 loopName, x, sharedNodes[x], b);
735 syncExtract<syncType, SyncFnTy, VecTy, async>(loopName, x, sharedNodes[x],
739 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
740 std::string statSendBytes_str(syncTypeStr +
"SendBytes_" +
746 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
typename VecTy,
748 typename std::enable_if<BitsetFnTy::is_vector_bitset()>::type* =
nullptr>
749 void getSendBuffer(std::string loopName,
unsigned x,
751 auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes;
753 syncExtract<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(
754 loopName, x, sharedNodes[x], b);
756 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
757 std::string statSendBytes_str(syncTypeStr +
"SendBytesVector_" +
780 template <
bool async, SyncType syncType,
typename VecType>
781 void serializeMessage(std::string loopName,
DataCommMode data_mode,
782 size_t bit_set_count, std::vector<size_t>& indices,
786 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
787 std::string serialize_timer_str(syncTypeStr +
"SerializeMessage_" +
790 serialize_timer_str.c_str(), RNAME);
791 if (data_mode ==
noData) {
794 gSerialize(b, data_mode);
798 offsets.
resize(bit_set_count);
799 convertLIDToGID<syncType>(loopName, indices, offsets);
800 val_vec.resize(bit_set_count);
802 gSerialize(b, data_mode, bit_set_count, offsets, val_vec);
805 offsets.
resize(bit_set_count);
806 val_vec.resize(bit_set_count);
808 gSerialize(b, data_mode, bit_set_count, offsets, val_vec);
811 val_vec.resize(bit_set_count);
813 gSerialize(b, data_mode, bit_set_count, bit_set_comm, val_vec);
817 gSerialize(b, data_mode, val_vec);
846 template <SyncType syncType,
typename VecType>
847 void deserializeMessage(std::string loopName,
DataCommMode data_mode,
849 size_t& bit_set_count,
852 size_t& buf_start,
size_t& retval, VecType& val_vec) {
853 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
854 std::string serialize_timer_str(syncTypeStr +
"DeserializeMessage_" +
857 serialize_timer_str.c_str(), RNAME);
858 Tdeserialize.
start();
866 convertGIDToLID<syncType>(loopName, offsets);
890 unsigned gridRowID()
const {
return (
id / cartesianGrid.second); }
892 unsigned gridRowID(
unsigned hid)
const {
893 return (hid / cartesianGrid.second);
896 unsigned gridColumnID()
const {
return (
id % cartesianGrid.second); }
898 unsigned gridColumnID(
unsigned hid)
const {
899 return (hid % cartesianGrid.second);
905 bool isNotCommPartnerCVC(
unsigned host, SyncType syncType,
908 assert(cartesianGrid.first != 0);
909 assert(cartesianGrid.second != 0);
912 if (syncType == syncReduce) {
913 switch (writeLocation) {
915 return (gridColumnID() != gridColumnID(host));
917 return (gridRowID() != gridRowID(host));
919 assert((gridRowID() == gridRowID(host)) ||
920 (gridColumnID() == gridColumnID(host)));
921 return ((gridRowID() != gridRowID(host)) &&
922 (gridColumnID() != gridColumnID(host)));
927 switch (readLocation) {
929 return (gridColumnID() != gridColumnID(host));
931 return (gridRowID() != gridRowID(host));
933 assert((gridRowID() == gridRowID(host)) ||
934 (gridColumnID() == gridColumnID(host)));
935 return ((gridRowID() != gridRowID(host)) &&
936 (gridColumnID() != gridColumnID(host)));
942 if (syncType == syncReduce) {
943 switch (writeLocation) {
945 return (gridRowID() != gridRowID(host));
947 return (gridColumnID() != gridColumnID(host));
949 assert((gridRowID() == gridRowID(host)) ||
950 (gridColumnID() == gridColumnID(host)));
951 return ((gridRowID() != gridRowID(host)) &&
952 (gridColumnID() != gridColumnID(host)));
957 switch (readLocation) {
959 return (gridRowID() != gridRowID(host));
961 return (gridColumnID() != gridColumnID(host));
963 assert((gridRowID() == gridRowID(host)) ||
964 (gridColumnID() == gridColumnID(host)));
965 return ((gridRowID() != gridRowID(host)) &&
966 (gridColumnID() != gridColumnID(host)));
// Returns whether this host has nothing to send to `host` for the given
// sync phase. For a reduce we send mirror data, for a broadcast we send
// master data, hence the choice of sharedNodes below.
// NOTE(review): interior lines of this function are elided in this view;
// the surrounding control flow (which branch runs when) is not visible.
989 bool nothingToSend(
unsigned host, SyncType syncType,
// Proxies shared with `host`: mirrors for a reduce, masters for a broadcast.
991 auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes;
// No shared proxies with that host => nothing to send.
994 return (sharedNodes[host].size() == 0);
// Even with shared proxies, the cartesian-vertex-cut check below can rule
// the host out as a communication partner (presumably only taken when a
// cartesian grid is configured -- TODO confirm, enclosing branch elided).
997 if (sharedNodes[host].size() > 0) {
998 return isNotCommPartnerCVC(host, syncType, writeLocation, readLocation);
// Returns whether this host has nothing to receive from `host` for the
// given sync phase. Mirror image of nothingToSend: for a reduce we receive
// into masters, for a broadcast into mirrors.
// NOTE(review): interior lines of this function are elided in this view.
1018 bool nothingToRecv(
unsigned host, SyncType syncType,
// Proxies shared with `host`: masters for a reduce, mirrors for a broadcast.
1020 auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes;
// No shared proxies with that host => nothing to receive.
1023 return (sharedNodes[host].size() == 0);
// Cartesian-vertex-cut filtering: a host with shared proxies may still not
// be a communication partner (enclosing branch elided -- TODO confirm).
1025 if (sharedNodes[host].size() > 0) {
1026 return isNotCommPartnerCVC(host, syncType, writeLocation, readLocation);
1047 template <
typename SyncFnTy>
1048 void reportRedundantSize(std::string loopName, std::string syncTypeStr,
1049 uint32_t totalToSend,
size_t bitSetCount,
1051 size_t redundant_size =
1052 (totalToSend - bitSetCount) *
sizeof(
typename SyncFnTy::ValTy);
1053 size_t bit_set_size = (bitSetComm.
get_vec().size() *
sizeof(uint64_t));
1055 if (redundant_size > bit_set_size) {
1056 std::string statSavedBytes_str(syncTypeStr +
"SavedBytes_" +
1059 galois::runtime::reportStatCond_Tsum<MORE_DIST_STATS>(
1060 RNAME, statSavedBytes_str, (redundant_size - bit_set_size));
// Extracts FnTy's field value for local node `lid`.
// Reduce path: captures the current value first (the lines between 1083 and
// 1087 are elided here; presumably the field is reset and `val` returned,
// matching the vecIndex overload -- TODO confirm).
// Broadcast path (line 1087): plain extract, no reset.
1080 template <
typename FnTy, SyncType syncType>
1081 inline typename FnTy::ValTy extractWrapper(
size_t lid) {
1082 if (syncType == syncReduce) {
// Capture the value before the (elided) reset so the caller still sees it.
1083 auto val = FnTy::extract(lid, userGraph.getData(lid));
1087 return FnTy::extract(lid, userGraph.getData(lid));
// Extracts element `vecIndex` of FnTy's vector field for local node `lid`.
// Reduce path: extract the current element, then reset it (line 1110) so a
// subsequent reduction starts from the identity; the captured `val` is
// presumably returned on an elided line -- TODO confirm.
// Broadcast path (line 1113): plain extract, no reset.
1106 template <
typename FnTy, SyncType syncType>
1107 inline typename FnTy::ValTy extractWrapper(
size_t lid,
unsigned vecIndex) {
1108 if (syncType == syncReduce) {
1109 auto val = FnTy::extract(lid, userGraph.getData(lid), vecIndex);
// Reset this element after capturing it.
1110 FnTy::reset(lid, userGraph.getData(lid), vecIndex);
1113 return FnTy::extract(lid, userGraph.getData(lid), vecIndex);
1138 template <
typename FnTy, SyncType syncType,
typename VecTy,
1139 bool identity_offsets =
false,
bool parallelize =
true>
1140 void extractSubset(
const std::string& loopName,
1141 const std::vector<size_t>& indices,
size_t size,
1143 VecTy& val_vec,
size_t start = 0) {
1145 std::string syncTypeStr =
1146 (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
1147 std::string doall_str(syncTypeStr +
"ExtractVal_" + loopName);
1151 [&](
unsigned int n) {
1152 unsigned int offset;
1153 if (identity_offsets)
1156 offset = offsets[n];
1157 size_t lid = indices[offset];
1158 val_vec[n - start] = extractWrapper<FnTy, syncType>(lid);
1160 #if GALOIS_COMM_STATS
1165 for (
unsigned n = start; n < start + size; ++n) {
1166 unsigned int offset;
1167 if (identity_offsets)
1170 offset = offsets[n];
1172 size_t lid = indices[offset];
1173 val_vec[n - start] = extractWrapper<FnTy, syncType>(lid);
1205 template <
typename FnTy, SyncType syncType,
typename VecTy,
1206 bool identity_offsets =
false,
bool parallelize =
true,
1207 bool vecSync =
false,
1208 typename std::enable_if<vecSync>::type* =
nullptr>
1209 void extractSubset(
const std::string& loopName,
1210 const std::vector<size_t>& indices,
size_t size,
1212 VecTy& val_vec,
unsigned vecIndex,
size_t start = 0) {
1213 val_vec.resize(size);
1216 std::string syncTypeStr =
1217 (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
1218 std::string doall_str(syncTypeStr +
"ExtractValVector_" + loopName);
1222 [&](
unsigned int n) {
1223 unsigned int offset;
1224 if (identity_offsets)
1227 offset = offsets[n];
1228 size_t lid = indices[offset];
1229 val_vec[n - start] = extractWrapper<FnTy, syncType>(lid, vecIndex);
1231 #if GALOIS_COMM_STATS
1236 for (
unsigned n = start; n < start + size; ++n) {
1237 unsigned int offset;
1238 if (identity_offsets)
1241 offset = offsets[n];
1242 size_t lid = indices[offset];
1243 val_vec[n - start] = extractWrapper<FnTy, syncType>(lid, vecIndex);
1272 template <
typename FnTy,
typename SeqTy, SyncType syncType,
1273 bool identity_offsets =
false,
bool parallelize =
true>
1274 void extractSubset(
const std::string& loopName,
1275 const std::vector<size_t>& indices,
size_t size,
1280 std::string syncTypeStr =
1281 (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
1282 std::string doall_str(syncTypeStr +
"ExtractVal_" + loopName);
1286 [&](
unsigned int n) {
1287 unsigned int offset;
1288 if (identity_offsets)
1291 offset = offsets[n];
1293 size_t lid = indices[offset];
1294 gSerializeLazy(b, lseq, n - start,
1295 extractWrapper<FnTy, syncType>(lid));
1297 #if GALOIS_COMM_STATS
1302 for (
unsigned int n = start; n < start + size; ++n) {
1303 unsigned int offset;
1304 if (identity_offsets)
1307 offset = offsets[n];
1308 size_t lid = indices[offset];
1309 gSerializeLazy(b, lseq, n - start, extractWrapper<FnTy, syncType>(lid));
1326 template <
typename FnTy, SyncType syncType>
1328 if (syncType == syncReduce) {
1329 return FnTy::extract_reset_batch(x, b.
getVec().
data());
1331 return FnTy::extract_batch(x, b.
getVec().
data());
1353 template <
typename FnTy, SyncType syncType>
1356 if (syncType == syncReduce) {
1357 return FnTy::extract_reset_batch(x, b.
getVec().
data(), &s, &data_mode);
1359 return FnTy::extract_batch(x, b.
getVec().
data(), &s, &data_mode);
// Applies a received value to local node `lid`.
// Reduce path: combines via FnTy::reduce; when the reduction reports a
// change (line 1382), the compute bitset -- if in use, i.e. non-empty --
// is marked so the change is visible to later syncs. Line 1388 is a second
// unconditional FnTy::reduce whose enclosing (elided) branch is not visible
// here -- presumably the async variant; TODO confirm.
// Broadcast path (line 1390): writes the value directly via FnTy::setVal.
1378 template <
typename FnTy, SyncType syncType,
bool async>
1379 inline void setWrapper(
size_t lid,
typename FnTy::ValTy val,
1381 if (syncType == syncReduce) {
1382 if (FnTy::reduce(lid, userGraph.getData(lid), val)) {
// An empty bitset means bitset tracking is disabled for this sync.
1383 if (bit_set_compute.
size() != 0)
1384 bit_set_compute.
set(lid);
1388 FnTy::reduce(lid, userGraph.getData(lid), val);
1390 FnTy::setVal(lid, userGraph.getData(lid), val);
// Applies a received value to element `vecIndex` of the vector field of
// local node `lid`. Mirrors the scalar setWrapper:
// Reduce path: combines via FnTy::reduce; on a reported change the compute
// bitset (if non-empty, i.e. tracking enabled) is marked. Line 1421 is a
// second unconditional FnTy::reduce whose enclosing (elided) branch is not
// visible -- presumably the async variant; TODO confirm.
// Broadcast path (line 1423): writes the element directly via FnTy::setVal.
1410 template <
typename FnTy, SyncType syncType,
bool async>
1411 inline void setWrapper(
size_t lid,
typename FnTy::ValTy val,
1413 unsigned vecIndex) {
1414 if (syncType == syncReduce) {
1415 if (FnTy::reduce(lid, userGraph.getData(lid), val, vecIndex)) {
// An empty bitset means bitset tracking is disabled for this sync.
1416 if (bit_set_compute.
size() != 0)
1417 bit_set_compute.
set(lid);
1421 FnTy::reduce(lid, userGraph.getData(lid), val, vecIndex);
1423 FnTy::setVal(lid, userGraph.getData(lid), val, vecIndex);
1450 template <
typename IndicesVecTy,
typename FnTy, SyncType syncType,
1451 typename VecTy,
bool async,
bool identity_offsets =
false,
1452 bool parallelize =
true>
1453 void setSubset(
const std::string& loopName,
const IndicesVecTy& indices,
1458 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
1459 std::string doall_str(syncTypeStr +
"SetVal_" +
1465 [&](
unsigned int n) {
1466 unsigned int offset;
1467 if (identity_offsets)
1470 offset = offsets[n];
1471 auto lid = indices[offset];
1472 setWrapper<FnTy, syncType, async>(lid, val_vec[n - start],
1475 #if GALOIS_COMM_STATS
1480 for (
unsigned int n = start; n < start + size; ++n) {
1481 unsigned int offset;
1482 if (identity_offsets)
1485 offset = offsets[n];
1486 auto lid = indices[offset];
1487 setWrapper<FnTy, syncType, async>(lid, val_vec[n - start],
1525 template <
typename IndicesVecTy,
typename FnTy, SyncType syncType,
1526 typename VecTy,
bool async,
bool identity_offsets =
false,
1527 bool parallelize =
true,
bool vecSync =
false,
1528 typename std::enable_if<vecSync>::type* =
nullptr>
1529 void setSubset(
const std::string& loopName,
const IndicesVecTy& indices,
1533 unsigned vecIndex,
size_t start = 0) {
1534 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
1535 std::string doall_str(syncTypeStr +
"SetValVector_" +
1541 [&](
unsigned int n) {
1542 unsigned int offset;
1543 if (identity_offsets)
1546 offset = offsets[n];
1547 auto lid = indices[offset];
1548 setWrapper<FnTy, syncType, async>(lid, val_vec[n - start],
1549 bit_set_compute, vecIndex);
1551 #if GALOIS_COMM_STATS
1556 for (
unsigned int n = start; n < start + size; ++n) {
1557 unsigned int offset;
1558 if (identity_offsets)
1561 offset = offsets[n];
1562 auto lid = indices[offset];
1563 setWrapper<FnTy, syncType, async>(lid, val_vec[n - start],
1564 bit_set_compute, vecIndex);
1580 template <
typename FnTy, SyncType syncType,
bool async>
1582 if (syncType == syncReduce) {
1609 template <
typename FnTy, SyncType syncType,
bool async>
1612 if (syncType == syncReduce) {
1643 template <SyncType syncType,
typename SyncFnTy,
typename VecTy,
bool async,
1645 typename SyncFnTy::ValTy>::value>::type* =
nullptr>
1646 void syncExtract(std::string loopName,
unsigned from_id,
1647 std::vector<size_t>& indices,
1649 uint32_t num = indices.size();
1650 static VecTy val_vec;
1652 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
1653 std::string extract_timer_str(syncTypeStr +
"Extract_" +
1657 std::string extract_batch_timer_str(syncTypeStr +
"ExtractBatch_" +
1660 extract_batch_timer_str.c_str(), RNAME);
1669 (num *
sizeof(
typename SyncFnTy::ValTy)));
1671 Textractbatch.start();
1672 bool batch_succeeded =
1673 extractBatchWrapper<SyncFnTy, syncType>(from_id, b);
1674 Textractbatch.stop();
1676 if (!batch_succeeded) {
1678 val_vec.reserve(maxSharedSize);
1679 val_vec.resize(num);
1681 auto lseq = gSerializeLazySeq(
1684 extractSubset<SyncFnTy, decltype(lseq), syncType, true, true>(
1685 loopName, indices, num, offsets, b, lseq);
1688 (num *
sizeof(
typename SyncFnTy::ValTy)));
1700 std::string metadata_str(syncTypeStr +
"MetadataMode_" +
1701 std::to_string(data_mode) +
"_" +
1703 galois::runtime::reportStatCond_Single<MORE_DIST_STATS>(RNAME, metadata_str,
1724 template <SyncType syncType,
typename SyncFnTy,
typename VecTy,
bool async,
1726 typename SyncFnTy::ValTy>::value>::type* =
nullptr>
1727 void syncExtract(std::string loopName,
unsigned from_id,
1728 std::vector<size_t>& indices,
1730 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
1731 std::string extract_timer_str(syncTypeStr +
"Extract_" +
1735 std::string extract_batch_timer_str(syncTypeStr +
"ExtractBatch_" +
1738 extract_batch_timer_str.c_str(), RNAME);
1742 uint32_t num = indices.size();
1743 static VecTy val_vec;
1751 (num *
sizeof(
typename SyncFnTy::ValTy)));
1753 Textractbatch.start();
1754 bool batch_succeeded =
1755 extractBatchWrapper<SyncFnTy, syncType>(from_id, b);
1756 Textractbatch.stop();
1758 if (!batch_succeeded) {
1760 val_vec.reserve(maxSharedSize);
1761 val_vec.resize(num);
1764 extractSubset<SyncFnTy, syncType, VecTy, true, true>(
1765 loopName, indices, num, dummyVector, val_vec);
1769 (num *
sizeof(
typename SyncFnTy::ValTy)));
1782 std::string metadata_str(syncTypeStr +
"MetadataMode_" +
1783 std::to_string(data_mode) +
"_" +
1785 galois::runtime::reportStatCond_Single<MORE_DIST_STATS>(RNAME, metadata_str,
1807 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
typename VecTy,
1809 typename std::enable_if<!BitsetFnTy::is_vector_bitset()>::type* =
nullptr>
1810 void syncExtract(std::string loopName,
unsigned from_id,
1811 std::vector<size_t>& indices,
1813 uint32_t num = indices.size();
1815 static VecTy val_vec;
1818 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
1819 std::string extract_timer_str(syncTypeStr +
"Extract_" +
1823 std::string extract_alloc_timer_str(syncTypeStr +
"ExtractAlloc_" +
1826 extract_alloc_timer_str.c_str(), RNAME);
1827 std::string extract_batch_timer_str(syncTypeStr +
"ExtractBatch_" +
1830 extract_batch_timer_str.c_str(), RNAME);
1837 size_t bit_set_count = 0;
1838 Textractalloc.start();
1839 b.
reserve(getMaxSendBufferSize<SyncFnTy>(num));
1840 Textractalloc.stop();
1842 Textractbatch.start();
1843 bool batch_succeeded = extractBatchWrapper<SyncFnTy, syncType>(
1844 from_id, b, bit_set_count, data_mode);
1845 Textractbatch.stop();
1849 if (!batch_succeeded) {
1850 Textractalloc.start();
1852 bit_set_comm.
reserve(maxSharedSize);
1853 offsets.
reserve(maxSharedSize);
1854 val_vec.reserve(maxSharedSize);
1855 bit_set_comm.
resize(num);
1857 val_vec.resize(num);
1858 Textractalloc.stop();
1861 getBitsetAndOffsets<SyncFnTy, syncType>(
1862 loopName, indices, bit_set_compute, bit_set_comm, offsets,
1863 bit_set_count, data_mode);
1866 bit_set_count = indices.size();
1867 extractSubset<SyncFnTy, syncType, VecTy, true, true>(
1868 loopName, indices, bit_set_count, offsets, val_vec);
1869 }
else if (data_mode !=
1871 extractSubset<SyncFnTy, syncType, VecTy, false, true>(
1872 loopName, indices, bit_set_count, offsets, val_vec);
1874 serializeMessage<async, syncType>(loopName, data_mode, bit_set_count,
1875 indices, offsets, bit_set_comm,
1878 if (data_mode ==
noData) {
1881 gSerialize(b, data_mode);
1883 }
else if (data_mode ==
gidsData) {
1884 b.resize(
sizeof(
DataCommMode) +
sizeof(bit_set_count) +
1885 sizeof(
size_t) + (bit_set_count *
sizeof(
unsigned int)) +
1887 (bit_set_count *
sizeof(
typename SyncFnTy::ValTy)));
1889 b.resize(
sizeof(
DataCommMode) +
sizeof(bit_set_count) +
1890 sizeof(
size_t) + (bit_set_count *
sizeof(
unsigned int)) +
1892 (bit_set_count *
sizeof(
typename SyncFnTy::ValTy)));
1894 size_t bitset_alloc_size = ((num + 63) / 64) *
sizeof(uint64_t);
1895 b.resize(
sizeof(
DataCommMode) +
sizeof(bit_set_count) +
1898 + bitset_alloc_size +
sizeof(
size_t) +
1899 (bit_set_count *
sizeof(
typename SyncFnTy::ValTy)));
1902 (num *
sizeof(
typename SyncFnTy::ValTy)));
1906 reportRedundantSize<SyncFnTy>(loopName, syncTypeStr, num, bit_set_count,
1918 std::string metadata_str(syncTypeStr +
"MetadataMode_" +
1919 std::to_string(data_mode) +
"_" +
1921 galois::runtime::reportStatCond_Single<MORE_DIST_STATS>(RNAME, metadata_str,
1946 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
typename VecTy,
1948 typename std::enable_if<BitsetFnTy::is_vector_bitset()>::type* =
nullptr>
1949 void syncExtract(std::string loopName,
unsigned, std::vector<size_t>& indices,
1951 uint32_t num = indices.size();
1953 static VecTy val_vec;
1956 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
1957 std::string extract_timer_str(syncTypeStr +
"ExtractVector_" +
1965 bit_set_comm.
reserve(maxSharedSize);
1966 offsets.
reserve(maxSharedSize);
1967 val_vec.reserve(maxSharedSize);
1968 bit_set_comm.
resize(num);
1970 val_vec.resize(num);
1976 for (
unsigned i = 0; i < BitsetFnTy::numBitsets(); i++) {
1978 bit_set_comm.
reset();
1980 size_t bit_set_count = 0;
1985 getBitsetAndOffsets<SyncFnTy, syncType>(
1986 loopName, indices, bit_set_compute, bit_set_comm, offsets,
1987 bit_set_count, data_mode);
1994 bit_set_count = indices.size();
1995 extractSubset<SyncFnTy, syncType, VecTy, true, true, true>(
1996 loopName, indices, bit_set_count, offsets, val_vec, i);
1997 }
else if (data_mode !=
2000 extractSubset<SyncFnTy, syncType, VecTy, false, true, true>(
2001 loopName, indices, bit_set_count, offsets, val_vec, i);
2004 reportRedundantSize<SyncFnTy>(loopName, syncTypeStr, num, bit_set_count,
2006 serializeMessage<async, syncType>(loopName, data_mode, bit_set_count,
2007 indices, offsets, bit_set_comm,
2026 #ifdef GALOIS_USE_BARE_MPI
2031 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
2032 typename VecTy,
bool async>
2033 void sync_mpi_send(std::string loopName) {
2034 static std::vector<galois::runtime::SendBuffer> b;
2035 static std::vector<MPI_Request> request;
2037 request.resize(numHosts, MPI_REQUEST_NULL);
2039 for (
unsigned h = 1; h < numHosts; ++h) {
2040 unsigned x = (
id + h) % numHosts;
2042 if (nothingToSend(x, syncType, writeLocation, readLocation))
2046 MPI_Test(&request[x], &ready, MPI_STATUS_IGNORE);
2048 assert(b[x].size() > 0);
2049 MPI_Wait(&request[x], MPI_STATUS_IGNORE);
2051 if (b[x].size() > 0) {
2052 b[x].getVec().clear();
2055 getSendBuffer<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(loopName, x,
2058 MPI_Isend((uint8_t*)b[x].linearData(), b[x].size(), MPI_BYTE, x, 32767,
2059 MPI_COMM_WORLD, &request[x]);
2062 if (BitsetFnTy::is_valid()) {
2063 reset_bitset(syncType, &BitsetFnTy::reset_range);
2071 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
2072 typename VecTy,
bool async>
2073 void sync_mpi_put(std::string loopName,
const MPI_Group& mpi_access_group,
2074 const std::vector<MPI_Win>& window) {
2076 MPI_Win_start(mpi_access_group, 0, window[
id]);
2078 std::vector<galois::runtime::SendBuffer> b(numHosts);
2079 std::vector<size_t> size(numHosts);
2080 uint64_t send_buffers_size = 0;
2082 for (
unsigned h = 1; h < numHosts; ++h) {
2083 unsigned x = (
id + h) % numHosts;
2085 if (nothingToSend(x, syncType, writeLocation, readLocation))
2088 getSendBuffer<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(loopName, x,
2091 size[x] = b[x].size();
2092 send_buffers_size += size[x];
2093 MPI_Put((uint8_t*)&size[x],
sizeof(
size_t), MPI_BYTE, x, 0,
2094 sizeof(
size_t), MPI_BYTE, window[
id]);
2095 MPI_Put((uint8_t*)b[x].linearData(), size[x], MPI_BYTE, x,
sizeof(
size_t),
2096 size[x], MPI_BYTE, window[
id]);
2100 net.incrementMemUsage(send_buffers_size);
2102 MPI_Win_complete(window[
id]);
2103 net.decrementMemUsage(send_buffers_size);
2105 if (BitsetFnTy::is_valid()) {
2106 reset_bitset(syncType, &BitsetFnTy::reset_range);
2124 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
2125 typename VecTy,
bool async>
2126 void syncNetSend(std::string loopName) {
2132 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
2133 std::string statNumMessages_str(syncTypeStr +
"NumMessages_" +
2136 size_t numMessages = 0;
2137 for (
unsigned h = 1; h < numHosts; ++h) {
2138 unsigned x = (
id + h) % numHosts;
2140 if (nothingToSend(x, syncType, writeLocation, readLocation))
2143 getSendBuffer<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(loopName, x,
2146 if ((!async) || (b.
size() > 0)) {
2147 size_t syncTypePhase = 0;
2148 if (async && (syncType == syncBroadcast))
2159 if (BitsetFnTy::is_valid()) {
2160 reset_bitset(syncType, &BitsetFnTy::reset_range);
2179 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
2180 typename VecTy,
bool async>
2181 void syncSend(std::string loopName) {
2182 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
2187 syncNetSend<writeLocation, readLocation, syncType, SyncFnTy, BitsetFnTy,
2188 VecTy, async>(loopName);
2212 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
typename VecTy,
2214 typename std::enable_if<!BitsetFnTy::is_vector_bitset()>::type* =
nullptr>
2216 std::string loopName) {
2217 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
2218 std::string set_timer_str(syncTypeStr +
"Set_" +
2221 std::string set_batch_timer_str(syncTypeStr +
"SetBatch_" +
2224 set_batch_timer_str.c_str(), RNAME);
2227 static VecTy val_vec;
2230 auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes;
2231 uint32_t num = sharedNodes[from_id].
size();
2241 if (data_mode !=
noData) {
2244 bool batch_succeeded =
2245 setBatchWrapper<SyncFnTy, syncType, async>(from_id, buf, data_mode);
2249 if (!batch_succeeded) {
2250 size_t bit_set_count = num;
2251 size_t buf_start = 0;
2255 deserializeMessage<syncType>(loopName, data_mode, num, buf,
2256 bit_set_count, offsets, bit_set_comm,
2257 buf_start, retval, val_vec);
2259 bit_set_comm.reserve(maxSharedSize);
2260 offsets.reserve(maxSharedSize);
2261 val_vec.reserve(maxSharedSize);
2266 size_t bit_set_count2;
2267 getOffsetsFromBitset<syncType>(loopName, bit_set_comm, offsets,
2269 assert(bit_set_count == bit_set_count2);
2273 setSubset<decltype(sharedNodes[from_id]), SyncFnTy, syncType, VecTy,
2274 async,
true,
true>(loopName, sharedNodes[from_id],
2275 bit_set_count, offsets, val_vec,
2278 setSubset<decltype(sharedNodes[from_id]), SyncFnTy, syncType, VecTy,
2279 async,
true,
true>(loopName, sharedNodes[from_id],
2280 bit_set_count, offsets, val_vec,
2281 bit_set_compute, buf_start);
2282 }
else if (data_mode ==
gidsData) {
2283 setSubset<decltype(offsets), SyncFnTy, syncType, VecTy, async,
true,
2284 true>(loopName, offsets, bit_set_count, offsets, val_vec,
2287 setSubset<decltype(sharedNodes[from_id]), SyncFnTy, syncType, VecTy,
2288 async,
false,
true>(loopName, sharedNodes[from_id],
2289 bit_set_count, offsets, val_vec,
2324 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
typename VecTy,
2326 typename std::enable_if<BitsetFnTy::is_vector_bitset()>::type* =
nullptr>
2328 std::string loopName) {
2329 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
2330 std::string set_timer_str(syncTypeStr +
"SetVector_" +
2335 static VecTy val_vec;
2338 auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes;
2339 uint32_t num = sharedNodes[from_id].
size();
2345 for (
unsigned i = 0; i < BitsetFnTy::numBitsets(); i++) {
2350 if (data_mode !=
noData) {
2351 size_t bit_set_count = num;
2352 size_t buf_start = 0;
2356 deserializeMessage<syncType>(loopName, data_mode, num, buf,
2357 bit_set_count, offsets, bit_set_comm,
2358 buf_start, retval, val_vec);
2363 size_t bit_set_count2;
2364 getOffsetsFromBitset<syncType>(loopName, bit_set_comm, offsets,
2366 assert(bit_set_count == bit_set_count2);
2373 setSubset<decltype(sharedNodes[from_id]), SyncFnTy, syncType, VecTy,
2374 async,
true,
true,
true>(loopName, sharedNodes[from_id],
2375 bit_set_count, offsets, val_vec,
2376 bit_set_compute, i);
2378 setSubset<decltype(sharedNodes[from_id]), SyncFnTy, syncType, VecTy,
2379 true, async,
true,
true,
true>(
2380 loopName, sharedNodes[from_id], bit_set_count, offsets, val_vec,
2381 bit_set_compute, i, buf_start);
2382 }
else if (data_mode ==
gidsData) {
2383 setSubset<decltype(offsets), SyncFnTy, syncType, VecTy, async,
true,
2384 true,
true>(loopName, offsets, bit_set_count, offsets,
2385 val_vec, bit_set_compute, i);
2387 setSubset<decltype(sharedNodes[from_id]), SyncFnTy, syncType, VecTy,
2388 async,
false,
true,
true>(loopName, sharedNodes[from_id],
2389 bit_set_count, offsets, val_vec,
2390 bit_set_compute, i);
2401 #ifdef GALOIS_USE_BARE_MPI
2406 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy>
2407 void sync_mpi_recv_post(std::vector<MPI_Request>& request,
2408 const std::vector<std::vector<uint8_t>>& rb) {
2409 for (
unsigned h = 1; h < numHosts; ++h) {
2410 unsigned x = (
id + numHosts - h) % numHosts;
2411 if (nothingToRecv(x, syncType, writeLocation, readLocation))
2414 MPI_Irecv((uint8_t*)rb[x].data(), rb[x].size(), MPI_BYTE, x, 32767,
2415 MPI_COMM_WORLD, &request[x]);
2423 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
2424 typename VecTy,
bool async>
2425 void sync_mpi_recv_wait(std::string loopName,
2426 std::vector<MPI_Request>& request,
2427 const std::vector<std::vector<uint8_t>>& rb) {
2428 for (
unsigned h = 1; h < numHosts; ++h) {
2429 unsigned x = (
id + numHosts - h) % numHosts;
2430 if (nothingToRecv(x, syncType, writeLocation, readLocation))
2434 MPI_Wait(&request[x], &status);
2437 MPI_Get_count(&status, MPI_BYTE, &size);
2441 syncRecvApply<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(x, rbuf,
2450 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
2451 typename VecTy,
bool async>
2452 void sync_mpi_get(std::string loopName,
const std::vector<MPI_Win>& window,
2453 const std::vector<std::vector<uint8_t>>& rb) {
2454 for (
unsigned h = 1; h < numHosts; ++h) {
2455 unsigned x = (
id + numHosts - h) % numHosts;
2456 if (nothingToRecv(x, syncType, writeLocation, readLocation))
2459 MPI_Win_wait(window[x]);
2462 memcpy(&size, rb[x].data(),
sizeof(
size_t));
2465 rb[x].begin() +
sizeof(
size_t) + size);
2467 MPI_Win_post(mpi_identity_groups[x], 0, window[x]);
2469 syncRecvApply<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(x, rbuf,
2488 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
2489 typename VecTy,
bool async>
2490 void syncNetRecv(std::string loopName) {
2497 size_t syncTypePhase = 0;
2498 if (syncType == syncBroadcast)
2507 syncRecvApply<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(
2508 p->first, p->second, loopName);
2512 for (
unsigned x = 0; x < numHosts; ++x) {
2515 if (nothingToRecv(x, syncType, writeLocation, readLocation))
2525 syncRecvApply<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(
2526 p->first, p->second, loopName);
2528 incrementEvilPhase();
2545 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
2546 typename VecTy,
bool async>
2547 void syncRecv(std::string loopName) {
2548 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
2553 syncNetRecv<writeLocation, readLocation, syncType, SyncFnTy, BitsetFnTy,
2554 VecTy, async>(loopName);
2561 #ifdef GALOIS_USE_BARE_MPI
2566 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
2567 typename VecTy,
bool async>
2568 void syncNonblockingMPI(std::string loopName,
2569 bool use_bitset_to_send =
true) {
2570 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
2576 static std::vector<std::vector<uint8_t>> rb;
2577 static std::vector<MPI_Request> request;
2579 if (rb.size() == 0) {
2581 auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes;
2582 rb.resize(numHosts);
2583 request.resize(numHosts, MPI_REQUEST_NULL);
2585 for (
unsigned h = 1; h < numHosts; ++h) {
2586 unsigned x = (
id + numHosts - h) % numHosts;
2587 if (nothingToRecv(x, syncType, writeLocation, readLocation))
2590 size_t size = getMaxSendBufferSize<SyncFnTy>(sharedNodes[x].size());
2597 sync_mpi_recv_post<writeLocation, readLocation, syncType, SyncFnTy,
2598 BitsetFnTy>(request, rb);
2602 if (use_bitset_to_send) {
2603 sync_mpi_send<writeLocation, readLocation, syncType, SyncFnTy, BitsetFnTy,
2604 VecTy, async>(loopName);
2606 sync_mpi_send<writeLocation, readLocation, syncType, SyncFnTy,
2612 sync_mpi_recv_wait<writeLocation, readLocation, syncType, SyncFnTy,
2613 BitsetFnTy, VecTy, async>(loopName, request, rb);
2621 SyncType syncType,
typename SyncFnTy,
typename BitsetFnTy,
2622 typename VecTy,
bool async>
2623 void syncOnesidedMPI(std::string loopName,
bool use_bitset_to_send =
true) {
2624 std::string syncTypeStr = (syncType == syncReduce) ?
"Reduce" :
"Broadcast";
2630 static std::vector<MPI_Win> window;
2631 static MPI_Group mpi_access_group;
2632 static std::vector<std::vector<uint8_t>> rb;
2634 if (window.size() == 0) {
2636 auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes;
2637 window.resize(numHosts);
2638 rb.resize(numHosts);
2640 uint64_t recv_buffers_size = 0;
2641 for (
unsigned x = 0; x < numHosts; ++x) {
2642 size_t size = getMaxSendBufferSize<SyncFnTy>(sharedNodes[x].size());
2644 recv_buffers_size += size;
2647 MPI_Info_create(&info);
2648 MPI_Info_set(info,
"no_locks",
"true");
2649 MPI_Info_set(info,
"same_disp_unit",
"true");
2651 MPI_Win_create(rb[x].data(), size, 1, info, MPI_COMM_WORLD, &window[x]);
2653 MPI_Info_free(&info);
2656 net.incrementMemUsage(recv_buffers_size);
2658 for (
unsigned h = 1; h < numHosts; ++h) {
2659 unsigned x = (
id + numHosts - h) % numHosts;
2660 if (nothingToRecv(x, syncType, writeLocation, readLocation))
2664 MPI_Win_post(mpi_identity_groups[x], 0, window[x]);
2669 std::vector<int> access_hosts;
2670 for (
unsigned h = 1; h < numHosts; ++h) {
2671 unsigned x = (
id + h) % numHosts;
2673 if (nothingToSend(x, syncType, writeLocation, readLocation))
2676 access_hosts.push_back(x);
2678 MPI_Group world_group;
2679 MPI_Comm_group(MPI_COMM_WORLD, &world_group);
2681 MPI_Group_incl(world_group, access_hosts.size(), access_hosts.data(),
2687 if (use_bitset_to_send) {
2688 sync_mpi_put<writeLocation, readLocation, syncType, SyncFnTy, BitsetFnTy,
2689 VecTy, async>(loopName, mpi_access_group, window);
2691 sync_mpi_put<writeLocation, readLocation, syncType, SyncFnTy,
2693 loopName, mpi_access_group, window);
2698 sync_mpi_get<writeLocation, readLocation, syncType, SyncFnTy, BitsetFnTy,
2699 VecTy, async>(loopName, window, rb);
2719 typename ReduceFnTy,
typename BitsetFnTy,
bool async>
2720 inline void reduce(std::string loopName) {
2725 typedef typename ReduceFnTy::ValTy T;
2727 typename std::conditional<galois::runtime::is_memory_copyable<T>::value,
2731 TsyncReduce.start();
2733 #ifdef GALOIS_USE_BARE_MPI
2737 syncSend<writeLocation, readLocation, syncReduce, ReduceFnTy, BitsetFnTy,
2738 VecTy, async>(loopName);
2739 syncRecv<writeLocation, readLocation, syncReduce, ReduceFnTy, BitsetFnTy,
2740 VecTy, async>(loopName);
2741 #ifdef GALOIS_USE_BARE_MPI
2743 case nonBlockingBareMPI:
2744 syncNonblockingMPI<writeLocation, readLocation, syncReduce, ReduceFnTy,
2745 BitsetFnTy, VecTy, async>(loopName);
2747 case oneSidedBareMPI:
2748 syncOnesidedMPI<writeLocation, readLocation, syncReduce, ReduceFnTy,
2749 BitsetFnTy, VecTy, async>(loopName);
2770 typename BroadcastFnTy,
typename BitsetFnTy,
bool async>
2771 inline void broadcast(std::string loopName) {
2776 typedef typename BroadcastFnTy::ValTy T;
2778 typename std::conditional<galois::runtime::is_memory_copyable<T>::value,
2779 galois::PODResizeableArray<T>,
2782 TsyncBroadcast.
start();
2784 bool use_bitset =
true;
2786 if (currentBVFlag !=
nullptr) {
2791 currentBVFlag =
nullptr;
2796 currentBVFlag =
nullptr;
2797 }
else if (readLocation ==
readAny &&
2802 GALOIS_DIE(
"readAny + use of bitvector flag without none_invalid "
2803 "should never happen");
2807 #ifdef GALOIS_USE_BARE_MPI
2812 syncSend<writeLocation, readLocation, syncBroadcast, BroadcastFnTy,
2813 BitsetFnTy, VecTy, async>(loopName);
2815 syncSend<writeLocation, readLocation, syncBroadcast, BroadcastFnTy,
2818 syncRecv<writeLocation, readLocation, syncBroadcast, BroadcastFnTy,
2819 BitsetFnTy, VecTy, async>(loopName);
2820 #ifdef GALOIS_USE_BARE_MPI
2822 case nonBlockingBareMPI:
2823 syncNonblockingMPI<writeLocation, readLocation, syncBroadcast,
2824 BroadcastFnTy, BitsetFnTy, VecTy, async>(loopName,
2827 case oneSidedBareMPI:
2828 syncOnesidedMPI<writeLocation, readLocation, syncBroadcast, BroadcastFnTy,
2829 BitsetFnTy, VecTy, async>(loopName, use_bitset);
2836 TsyncBroadcast.stop();
2847 template <
typename SyncFnTy,
typename BitsetFnTy,
bool async>
2848 inline void sync_src_to_src(std::string loopName) {
2851 if (transposed || isVertexCut) {
2852 reduce<writeSource, readSource, SyncFnTy, BitsetFnTy, async>(loopName);
2853 broadcast<writeSource, readSource, SyncFnTy, BitsetFnTy, async>(loopName);
2865 template <
typename SyncFnTy,
typename BitsetFnTy,
bool async>
2866 inline void sync_src_to_dst(std::string loopName) {
2871 reduce<writeSource, readDestination, SyncFnTy, BitsetFnTy, async>(
2874 broadcast<writeSource, readDestination, SyncFnTy, BitsetFnTy, async>(
2879 reduce<writeSource, readDestination, SyncFnTy, BitsetFnTy, async>(
2882 broadcast<writeSource, readDestination, SyncFnTy, BitsetFnTy, async>(
2895 template <
typename SyncFnTy,
typename BitsetFnTy,
bool async>
2896 inline void sync_src_to_any(std::string loopName) {
2899 if (transposed || isVertexCut) {
2900 reduce<writeSource, readAny, SyncFnTy, BitsetFnTy, async>(loopName);
2902 broadcast<writeSource, readAny, SyncFnTy, BitsetFnTy, async>(loopName);
2913 template <
typename SyncFnTy,
typename BitsetFnTy,
bool async>
2914 inline void sync_dst_to_src(std::string loopName) {
2920 reduce<writeDestination, readSource, SyncFnTy, BitsetFnTy, async>(
2923 broadcast<writeDestination, readSource, SyncFnTy, BitsetFnTy, async>(
2926 reduce<writeDestination, readSource, SyncFnTy, BitsetFnTy, async>(
2929 broadcast<writeDestination, readSource, SyncFnTy, BitsetFnTy, async>(
2943 template <
typename SyncFnTy,
typename BitsetFnTy,
bool async>
2944 inline void sync_dst_to_dst(std::string loopName) {
2947 if (!transposed || isVertexCut) {
2948 reduce<writeDestination, readDestination, SyncFnTy, BitsetFnTy, async>(
2950 broadcast<writeDestination, readDestination, SyncFnTy, BitsetFnTy, async>(
2963 template <
typename SyncFnTy,
typename BitsetFnTy,
bool async>
2964 inline void sync_dst_to_any(std::string loopName) {
2967 if (!transposed || isVertexCut) {
2968 reduce<writeDestination, readAny, SyncFnTy, BitsetFnTy, async>(loopName);
2970 broadcast<writeDestination, readAny, SyncFnTy, BitsetFnTy, async>(loopName);
2981 template <
typename SyncFnTy,
typename BitsetFnTy,
bool async>
2982 inline void sync_any_to_src(std::string loopName) {
2985 reduce<writeAny, readSource, SyncFnTy, BitsetFnTy, async>(loopName);
2986 if (transposed || isVertexCut) {
2987 broadcast<writeAny, readSource, SyncFnTy, BitsetFnTy, async>(loopName);
2999 template <
typename SyncFnTy,
typename BitsetFnTy,
bool async>
3000 inline void sync_any_to_dst(std::string loopName) {
3003 reduce<writeAny, readDestination, SyncFnTy, BitsetFnTy, async>(loopName);
3005 if (!transposed || isVertexCut) {
3006 broadcast<writeAny, readDestination, SyncFnTy, BitsetFnTy, async>(
3019 template <
typename SyncFnTy,
typename BitsetFnTy,
bool async>
3020 inline void sync_any_to_any(std::string loopName) {
3022 reduce<writeAny, readAny, SyncFnTy, BitsetFnTy, async>(loopName);
3023 broadcast<writeAny, readAny, SyncFnTy, BitsetFnTy, async>(loopName);
3046 inline void sync(std::string loopName) {
3052 if (partitionAgnostic) {
3053 sync_any_to_any<SyncFnTy, BitsetFnTy, async>(loopName);
3057 sync_src_to_src<SyncFnTy, BitsetFnTy, async>(loopName);
3059 sync_src_to_dst<SyncFnTy, BitsetFnTy, async>(loopName);
3061 sync_src_to_any<SyncFnTy, BitsetFnTy, async>(loopName);
3065 sync_dst_to_src<SyncFnTy, BitsetFnTy, async>(loopName);
3067 sync_dst_to_dst<SyncFnTy, BitsetFnTy, async>(loopName);
3069 sync_dst_to_any<SyncFnTy, BitsetFnTy, async>(loopName);
3073 sync_any_to_src<SyncFnTy, BitsetFnTy, async>(loopName);
3075 sync_any_to_dst<SyncFnTy, BitsetFnTy, async>(loopName);
3077 sync_any_to_any<SyncFnTy, BitsetFnTy, async>(loopName);
3093 template <ReadLocation rl,
typename SyncFnTy,
typename BitsetFnTy>
3094 struct SyncOnDemandHandler {
3098 void call() {
GALOIS_DIE(
"invalid read location for sync on demand"); }
3107 template <
typename SyncFnTy,
typename BitsetFnTy>
3108 struct SyncOnDemandHandler<
readSource, SyncFnTy, BitsetFnTy> {
3122 std::string loopName,
const BITVECTOR_STATUS&) {
3124 substrate->sync_any_to_src<SyncFnTy, BitsetFnTy>(loopName);
3126 substrate->sync_src_to_src<SyncFnTy, BitsetFnTy>(loopName);
3128 substrate->sync_dst_to_src<SyncFnTy, BitsetFnTy>(loopName);
3141 template <
typename SyncFnTy,
typename BitsetFnTy>
3156 std::string loopName,
const BITVECTOR_STATUS&) {
3158 substrate->sync_any_to_dst<SyncFnTy, BitsetFnTy>(loopName);
3160 substrate->sync_src_to_dst<SyncFnTy, BitsetFnTy>(loopName);
3162 substrate->sync_dst_to_dst<SyncFnTy, BitsetFnTy>(loopName);
3175 template <
typename SyncFnTy,
typename BitsetFnTy>
3176 struct SyncOnDemandHandler<
readAny, SyncFnTy, BitsetFnTy> {
3190 std::string loopName,
3191 const BITVECTOR_STATUS& bvFlag) {
3195 if (!(src_write && dst_write)) {
3204 substrate->sync_src_to_any<SyncFnTy, BitsetFnTy>(loopName);
3208 substrate->sync_src_to_dst<SyncFnTy, BitsetFnTy>(loopName);
3209 substrate->sync_src_to_src<SyncFnTy, BitsetFnTy>(loopName);
3213 substrate->sync_src_to_src<SyncFnTy, BitsetFnTy>(loopName);
3214 substrate->sync_src_to_dst<SyncFnTy, BitsetFnTy>(loopName);
3216 GALOIS_DIE(
"invalid bitvector flag setting in syncOnDemand");
3219 substrate->sync_src_to_src<SyncFnTy, BitsetFnTy>(loopName);
3221 substrate->sync_src_to_dst<SyncFnTy, BitsetFnTy>(loopName);
3223 }
else if (dst_write) {
3226 substrate->sync_dst_to_any<SyncFnTy, BitsetFnTy>(loopName);
3228 substrate->sync_dst_to_dst<SyncFnTy, BitsetFnTy>(loopName);
3229 substrate->sync_dst_to_src<SyncFnTy, BitsetFnTy>(loopName);
3231 substrate->sync_dst_to_src<SyncFnTy, BitsetFnTy>(loopName);
3232 substrate->sync_dst_to_dst<SyncFnTy, BitsetFnTy>(loopName);
3234 GALOIS_DIE(
"invalid bitvector flag setting in syncOnDemand");
3237 substrate->sync_dst_to_src<SyncFnTy, BitsetFnTy>(loopName);
3239 substrate->sync_dst_to_dst<SyncFnTy, BitsetFnTy>(loopName);
3253 if (src_read && dst_read) {
3255 substrate->sync_any_to_any<SyncFnTy, BitsetFnTy>(loopName);
3257 substrate->sync_any_to_dst<SyncFnTy, BitsetFnTy>(loopName);
3258 substrate->sync_any_to_src<SyncFnTy, BitsetFnTy>(loopName);
3260 substrate->sync_any_to_src<SyncFnTy, BitsetFnTy>(loopName);
3261 substrate->sync_any_to_dst<SyncFnTy, BitsetFnTy>(loopName);
3263 GALOIS_DIE(
"invalid bitvector flag setting in syncOnDemand");
3265 }
else if (src_read) {
3266 substrate->sync_any_to_src<SyncFnTy, BitsetFnTy>(loopName);
3268 substrate->sync_any_to_dst<SyncFnTy, BitsetFnTy>(loopName);
3281 #ifdef GALOIS_ENABLE_GPU
3283 using GraphNode =
typename GraphTy::GraphNode;
3284 using edge_iterator =
typename GraphTy::edge_iterator;
3285 using EdgeTy =
typename GraphTy::EdgeType;
3288 template <
bool isVoidType,
3289 typename std::enable_if<isVoidType>::type* =
nullptr>
3290 inline void setMarshalEdge(
MarshalGraph& GALOIS_UNUSED(m),
3291 const size_t GALOIS_UNUSED(index),
3292 const edge_iterator& GALOIS_UNUSED(e)) {
3296 template <
bool isVoidType,
3297 typename std::enable_if<!isVoidType>::type* =
nullptr>
3298 inline void setMarshalEdge(
MarshalGraph& m,
const size_t index,
3299 const edge_iterator& e) {
3300 m.
edge_data[index] = userGraph.getEdgeData(e);
3305 m.
nnodes = userGraph.size();
3306 m.
nedges = userGraph.sizeEdges();
3307 m.
numOwned = userGraph.numMasters();
3318 if (std::is_void<EdgeTy>::value) {
3321 if (!std::is_same<EdgeTy, edge_data_type>::value) {
3322 galois::gWarn(
"Edge data type mismatch between CPU and GPU\n");
3331 [&](
const GraphNode& nodeID) {
3333 m.
node_data[nodeID] = userGraph.getGID(nodeID);
3334 m.
row_start[nodeID] = *(userGraph.edge_begin(nodeID));
3335 for (
auto e = userGraph.edge_begin(nodeID);
3336 e != userGraph.edge_end(nodeID); e++) {
3338 setMarshalEdge<std::is_void<EdgeTy>::value>(m, edgeID, e);
3339 m.edge_dst[edgeID] = userGraph.getEdgeDst(e);
3350 (
unsigned int*)calloc(masterNodes.size(),
sizeof(
unsigned int));
3353 (
unsigned int**)calloc(masterNodes.size(),
sizeof(
unsigned int*));
3356 for (uint32_t h = 0; h < masterNodes.size(); ++h) {
3359 if (masterNodes[h].size() > 0) {
3361 (
unsigned int*)calloc(masterNodes[h].size(),
sizeof(
unsigned int));
3363 std::copy(masterNodes[h].begin(), masterNodes[h].end(),
3371 (
unsigned int*)calloc(mirrorNodes.size(),
sizeof(
unsigned int));
3374 (
unsigned int**)calloc(mirrorNodes.size(),
sizeof(
unsigned int*));
3376 for (uint32_t h = 0; h < mirrorNodes.size(); ++h) {
3379 if (mirrorNodes[h].size() > 0) {
3381 (
unsigned int*)calloc(mirrorNodes[h].size(),
sizeof(
unsigned int));
3383 std::copy(mirrorNodes[h].begin(), mirrorNodes[h].end(),
3392 userGraph.deallocate();
3394 #endif // het galois def
3413 template <
ReadLocation readLocation,
typename SyncFnTy,
3416 std::string loopName) {
3424 SyncOnDemandHandler<readLocation, SyncFnTy, BitsetFnTy>::call(
3425 this, fieldFlags, loopName, *currentBVFlag);
3427 currentBVFlag =
nullptr;
3465 #if GALOIS_PER_ROUND_STATS
3466 return std::string(std::to_string(num_run) +
"_" +
3467 std::to_string(num_round));
3469 return std::string(std::to_string(num_run));
3481 #if GALOIS_PER_ROUND_STATS
3482 return std::string(std::string(loop_name) +
"_" + std::to_string(num_run) +
3483 "_" + std::to_string(num_round));
3485 return std::string(std::string(loop_name) +
"_" + std::to_string(num_run));
3501 unsigned alterID)
const {
3502 #if GALOIS_PER_ROUND_STATS
3503 return std::string(std::string(loop_name) +
"_" + std::to_string(alterID) +
3504 "_" + std::to_string(num_run) +
"_" +
3505 std::to_string(num_round));
3507 return std::string(std::string(loop_name) +
"_" + std::to_string(alterID) +
3508 "_" + std::to_string(num_run));
3518 template <
typename FnTy>
3521 auto mirrorRanges = userGraph.getMirrorRanges();
3522 for (
auto r : mirrorRanges) {
3523 if (r.first == r.second)
3525 assert(r.first < r.second);
3528 bool batch_succeeded = FnTy::reset_batch(r.first, r.second - 1);
3531 if (!batch_succeeded) {
3534 [&](uint32_t lid) {
FnTy::reset(lid, userGraph.getData(lid)); },
3546 #ifdef GALOIS_CHECKPOINT
3630 template <
typename GraphTy>
3635 #endif // header guard
Contains macros for easily defining common Galois sync structures and the field flags class used for ...
BITVECTOR_STATUS bitvectorStatus
Status of the bitvector in terms of if it can be used to sync the field.
Definition: SyncStructures.h:76
Contains forward declarations and the definition of the MarshalGraph class, which is used to marshal ...
void reserve(uint64_t n)
Reserves capacity for the bitset.
Definition: libgalois/include/galois/DynamicBitset.h:90
std::string get_run_identifier() const
Get a run identifier using the set run and set round.
Definition: GluonSubstrate.h:3464
write at source and/or destination
Definition: GluonSubstrate.h:59
void set_num_round(const uint32_t round)
Set the round number for use in the run identifier.
Definition: GluonSubstrate.h:3454
bool src_to_src() const
Return true if src2src is set.
Definition: SyncStructures.h:90
void gInfo(Args &&...args)
Prints an info string from a sequence of things.
Definition: gIO.h:55
__global__ void bitset_reset_range(DynamicBitset *__restrict__ bitset, size_t vec_begin, size_t vec_end, bool test1, size_t bit_index1, uint64_t mask1, bool test2, size_t bit_index2, uint64_t mask2)
Definition: DeviceEdgeSync.h:298
write at source
Definition: GluonSubstrate.h:55
unsigned int getActiveThreads() noexcept
Returns the number of threads in use.
Definition: Threads.cpp:37
WriteLocation
Enumeration for specifiying write location for sync calls.
Definition: GluonSubstrate.h:53
Definition: DataCommMode.h:35
GluonSubstrate()=delete
Delete default constructor: this class NEEDS to have a graph passed into it.
void set_num_run(const uint32_t runNum)
Set the run number.
Definition: GluonSubstrate.h:3440
void resize(size_t n)
Definition: PODResizeableArray.h:142
Concurrent dynamically allocated bitset.
Definition: libgalois/include/galois/DynamicBitset.h:47
Buffer for serialization of data.
Definition: Serialize.h:56
void reserve(size_t s)
Reserve more space in the serialize buffer.
Definition: Serialize.h:110
unsigned int index_type
Definition: EdgeHostDecls.h:33
std::string get_run_identifier(std::string loop_name, unsigned alterID) const
Get a run identifier using the set run and set round and append to the passed in string in addition t...
Definition: GluonSubstrate.h:3500
unsigned int * num_mirror_nodes
Definition: HostDecls.h:53
read at destination
Definition: GluonSubstrate.h:66
unsigned int numNodesWithEdges
Definition: HostDecls.h:43
void clear_read_dst()
Sets write dst flags to false.
Definition: SyncStructures.h:128
Contains the DynamicBitSet class and most of its implementation.
ReadLocation
Enumeration for specifiying read location for sync calls.
Definition: GluonSubstrate.h:62
void gDeserialize(DeSerializeBuffer &buf, T1 &&t1, Args &&...args)
Deserialize data in a buffer into a series of objects.
Definition: Serialize.h:1032
const char * loopname
Definition: Executor_ParaMeter.h:145
#define GALOIS_DIE(...)
Definition: gIO.h:96
Definition: DataCommMode.h:37
void reportStat_Tsum(const S1 ®ion, const S2 &category, const T &value)
Definition: Statistics.h:562
vTy & getVec()
Returns vector of data stored in this serialize buffer.
Definition: Serialize.h:115
bool test(size_t index) const
Check a bit to see if it is currently set.
Definition: libgalois/include/galois/DynamicBitset.h:192
unsigned int numOwned
Definition: HostDecls.h:41
void reserve(size_t n)
Definition: PODResizeableArray.h:129
edge_data_type * edge_data
Definition: HostDecls.h:50
A class to be inherited from so that all child classes will have a tracked unique ID...
Definition: GlobalObj.h:43
read at source
Definition: GluonSubstrate.h:64
#define GALOIS_ASSERT(cond,...)
Like assert but unconditionally executed.
Definition: gIO.h:102
unsigned numHosts
Definition: HostDecls.h:46
void reset_mirrorField()
Given a sync structure, reset the field specified by the structure to the 0 of the reduction on mirro...
Definition: GluonSubstrate.h:3519
void resize(uint64_t n)
Resizes the bitset.
Definition: libgalois/include/galois/DynamicBitset.h:78
Contains the DataCommMode enumeration and a function that chooses a data comm mode based on its argum...
std::vector< T, Pow2Alloc< T >> Vector
[STL vector using Pow_2_VarSizeAlloc]
Definition: gstl.h:52
size_t nnodes
Definition: HostDecls.h:39
unsigned int * num_master_nodes
Definition: HostDecls.h:51
bool src_invalid(BITVECTOR_STATUS bv_flag)
Return true if the sources are invalid in bitvector flag.
Definition: SyncStructures.cpp:30
void reportStat_Single(const S1 ®ion, const S2 &category, const T &value)
Definition: Statistics.h:544
const Ty max(std::atomic< Ty > &a, const Ty &b)
Definition: AtomicHelpers.h:40
void reset()
Gets the space taken by the bitset.
Definition: libgalois/include/galois/DynamicBitset.h:110
int id
Definition: HostDecls.h:45
index_type * edge_dst
Definition: HostDecls.h:48
Contains declaration of DistStatManager, which reports runtime statistics of a distributed applicatio...
size_type size() const
Returns the size of the serialize buffer.
Definition: Serialize.h:125
unsigned int ** master_nodes
Definition: HostDecls.h:52
void gPrint(Args &&...args)
Prints a sequence of things.
Definition: gIO.h:47
unsigned int activeThreads
Definition: Threads.cpp:26
NetworkInterface & getSystemNetworkInterface()
Get the network interface.
Definition: Network.cpp:131
size_t size() const
Gets the size of the bitset.
Definition: libgalois/include/galois/DynamicBitset.h:99
A structure representing an empty bitset.
Definition: libgalois/include/galois/DynamicBitset.h:413
void sync(std::string loopName)
Main sync call exposed to the user that calls the correct sync function based on provided template ar...
Definition: GluonSubstrate.h:3046
bool src_to_dst() const
Return true if src2dst is set.
Definition: SyncStructures.h:93
size_type size() const
Definition: PODResizeableArray.h:125
void reset(Ty &var, Ty val)
Definition: AtomicHelpers.h:202
send no data
Definition: DataCommMode.h:34
none of the bitvector is invalid
Definition: SyncStructures.h:46
Definition: DataCommMode.h:38
void do_all(const RangeFunc &rangeMaker, FunctionTy &&fn, const Args &...args)
Standard do-all loop.
Definition: Loops.h:71
bool set(size_t index)
Set a bit in the bitset.
Definition: libgalois/include/galois/DynamicBitset.h:206
unsigned int node_data_type
Definition: EdgeHostDecls.h:34
Definition: DataCommMode.h:36
void syncOnDemand(galois::runtime::FieldFlags &fieldFlags, std::string loopName)
Given a structure that contains flags signifying what needs to be synchronized, syncOnDemand will syn...
Definition: GluonSubstrate.h:3415
void clear_read_src()
Sets write src flags to false.
Definition: SyncStructures.h:122
GlobalObject(const GlobalObject &)=delete
void start()
Definition: Timer.cpp:82
void on_each(FunctionTy &&fn, const Args &...args)
Low-level parallel loop.
Definition: Loops.h:86
index_type * row_start
Definition: HostDecls.h:47
Defines the GlobalObject class, which is a base class that other classes inherit from to be assigned ...
write at destination
Definition: GluonSubstrate.h:57
substrate::Barrier & getHostBarrier()
Returns a host barrier, which is a regular MPI-Like Barrier for all hosts.
Definition: libdist/src/Barrier.cpp:109
unsigned getOffset() const
Gets the current offset into the deserialize buffer.
Definition: Serialize.h:210
void resize(size_t bytes)
Definition: Serialize.h:103
BITVECTOR_STATUS
Bitvector status enum specifying validness of certain things in bitvector.
Definition: SyncStructures.h:45
Each field has a FieldFlags object that indicates synchronization status of that field.
Definition: SyncStructures.h:65
bool dst_invalid(BITVECTOR_STATUS bv_flag)
Return true if the destinations are invalid in bitvector flag.
Definition: SyncStructures.cpp:35
DataCommMode enforcedDataMode
Specifies what format to send metadata in.
Definition: GluonSubstrate.cpp:29
uint32_t evilPhase
Variable that keeps track of which network send/recv phase a program is currently on...
Definition: Network.cpp:36
uint32_t get_run_num() const
Get the set run number.
Definition: GluonSubstrate.h:3447
GluonSubstrate(GraphTy &_userGraph, unsigned host, unsigned numHosts, bool _transposed, std::pair< unsigned, unsigned > _cartesianGrid=std::make_pair(0u, 0u), bool _partitionAgnostic=false, DataCommMode _enforcedDataMode=DataCommMode::noData)
Constructor for GluonSubstrate.
Definition: GluonSubstrate.h:423
bool dst_to_dst() const
Return true if dst2dst is set.
Definition: SyncStructures.h:99
std::string get_run_identifier(std::string loop_name) const
Get a run identifier using the set run and set round and append to the passed in string.
Definition: GluonSubstrate.h:3480
Buffer for deserialization of data.
Definition: Serialize.h:147
auto iterate(C &cont)
Definition: Range.h:323
Definition: HostDecls.h:38
pointer data()
Definition: PODResizeableArray.h:174
void gWarn(Args &&...args)
Prints a warning string from a sequence of things.
Definition: gIO.h:63
unsigned edge_data_type
Definition: EdgeHostDecls.h:35
size_t nedges
Definition: HostDecls.h:40
DataCommMode
Enumeration of data communication modes that can be used in synchronization.
Definition: DataCommMode.h:33
unsigned int beginMaster
Definition: HostDecls.h:42
unsigned int ** mirror_nodes
Definition: HostDecls.h:54
Contains the BareMPI enum and the command line option that controls bare MPI usage.
const auto & get_vec() const
Returns the underlying bitset representation to the user.
Definition: libgalois/include/galois/DynamicBitset.h:63
Definition: DataCommMode.h:39
node_data_type * node_data
Definition: HostDecls.h:49
Indicates if T is memory copyable.
Definition: ExtraTraits.h:64
Definition: DataCommMode.h:40
bool dst_to_src() const
Return true if dst2src is set.
Definition: SyncStructures.h:96
vTy & getVec()
Get the underlying vector storing the data of the deserialize buffer.
Definition: Serialize.h:244
void stop()
Definition: Timer.cpp:87
Galois Timer that automatically reports stats upon destruction Provides statistic interface around ti...
Definition: Timer.h:63
Gluon communication substrate that handles communication given a user graph.
Definition: GluonSubstrate.h:83
read at source and/or destination
Definition: GluonSubstrate.h:68