20 #ifndef _GALOIS_RUNTIME_TILEDEXECUTOR_H_
21 #define _GALOIS_RUNTIME_TILEDEXECUTOR_H_
23 #include "galois/config.h"
31 template <
typename Graph,
bool UseExp = false>
33 static constexpr
int numDims = 2;
36 using GNode =
typename Graph::GraphNode;
37 using iterator =
typename Graph::iterator;
38 using edge_iterator =
typename Graph::edge_iterator;
39 using Point = std::array<size_t, numDims>;
44 SimpleAtomic() : value(0) {}
45 SimpleAtomic(
const SimpleAtomic& o) : value(o.value.load()) {}
46 T relaxedLoad() {
return value.load(std::memory_order_relaxed); }
47 void relaxedAdd(T delta) {
48 value.store(relaxedLoad() + delta, std::memory_order_relaxed);
61 SimpleAtomic<unsigned> updates;
68 struct GetDst :
public std::unary_function<edge_iterator, GNode> {
71 GetDst(Graph* _g) : g(_g) {}
73 GNode
operator()(edge_iterator ii)
const {
return g->getEdgeDst(ii); }
77 using edge_dst_iterator =
78 boost::transform_iterator<GetDst, no_deref_iterator>;
85 std::array<std::vector<SpinLock>, numDims> locks;
86 std::vector<Task> tasks;
100 void nextPoint(Point& p,
int dim,
int delta) {
101 assert(dim < numDims);
104 while (p[dim] >= locks[dim].size()) {
105 p[dim] -= locks[dim].size();
118 Task* getTask(
const Point& p) {
119 Task* t = &tasks[p[0] + p[1] * locks[0].size()];
121 assert(t < &tasks[numTasks]);
122 assert(t >= &tasks[0]);
140 Task* probeBlockWithLock(Point& start,
int dim,
size_t n) {
143 for (
size_t i = 0; i < n; ++i) {
144 Task* t = getTask(p);
146 assert(p[0] == t->coord[0]);
147 assert(p[1] == t->coord[1]);
148 assert(t->coord[0] < locks[0].size());
149 assert(t->coord[1] < locks[1].size());
151 if (t->updates.relaxedLoad() < maxUpdates) {
152 if (std::try_lock(locks[0][t->coord[0]], locks[1][t->coord[1]]) < 0) {
153 if (t->updates.relaxedLoad() < maxUpdates) {
154 t->updates.relaxedAdd(1);
160 for (
int i = 0; i < numDims; ++i) {
161 locks[i][t->coord[i]].unlock();
166 nextPoint(p, dim, 1);
184 Task* probeBlockWithoutLock(Point& start,
int dim,
size_t n) {
187 for (
size_t i = 0; i < n; ++i) {
188 Task* t = getTask(p);
190 assert(p[0] == t->coord[0]);
191 assert(p[1] == t->coord[1]);
192 assert(t->coord[0] < locks[0].size());
193 assert(t->coord[1] < locks[1].size());
195 if (t->updates.relaxedLoad() < maxUpdates) {
196 if (t->updates.value.fetch_add(1) < maxUpdates) {
202 nextPoint(p, dim, 1);
227 Task* probeBlock(Point& start,
int dim,
size_t n) {
231 return probeBlockWithLock(start, dim, n);
233 return probeBlockWithoutLock(start, dim, n);
254 Task* nextBlock(Point& start,
bool inclusive) {
260 for (
int times = 0; times < 2; ++times) {
261 Point limit{{locks[0].size(), locks[1].size()}};
263 int inclusiveDelta = (inclusive && times == 0) ? 0 : 1;
268 for (
int i = 0; i < numDims; ++i) {
270 nextPoint(p, i, inclusiveDelta);
272 if ((t = probeBlock(p, i, limit[i] - inclusiveDelta))) {
284 for (
int i = 0; i < numDims; ++i) {
294 while (std::any_of(limit.begin(), limit.end(),
295 [](
size_t x) {
return x > 0; })) {
296 for (
int i = 0; i < numDims; ++i) {
297 if (limit[i] > 1 && (t = probeBlock(p, i, limit[i] - 1))) {
303 for (
int i = 0; i < numDims; ++i) {
326 template <
bool UseDense,
typename Function>
327 void executeBlock(Function& fn, Task& task,
328 typename std::enable_if<UseDense>::type* = 0) {
331 for (
auto ii = task.startX; ii != task.endX; ++ii) {
332 for (
auto jj = g.begin() + task.startY,
333 ej = g.begin() + task.endYInclusive + 1;
351 template <
bool UseDense,
typename Function>
352 void executeBlock(Function& fn, Task& task,
353 typename std::enable_if<!UseDense>::type* = 0) {
356 for (
auto ii = task.startX; ii != task.endX; ++ii) {
363 edge_dst_iterator dbegin(nbegin, getDst);
364 edge_dst_iterator dend(nend, getDst);
381 for (
auto jj = std::lower_bound(dbegin, dend, task.startY); jj != dend;) {
404 edge_iterator edge = *jj.base();
405 if (*jj > task.endYInclusive)
430 template <
bool UseDense,
typename Function>
431 void executeLoopExp(Function fn,
unsigned tid,
unsigned total) {
432 Point numBlocks{locks[0].size(), locks[1].size()};
439 for (
int i = 0; i < numDims; ++i) {
440 block[i] = (numBlocks[i] + total - 1) / total;
441 start[i] =
std::min(block[i] * tid, numBlocks[i] - 1);
446 int dim = numBlocks[0] < numBlocks[1] ? 1 : 0;
447 int odim = (dim + 1) % 2;
449 size_t maxRounds = numBlocks[dim];
451 for (
size_t rounds = 0; rounds < maxRounds; ++rounds) {
452 Point p{start[0], start[1]};
453 nextPoint(p, dim, rounds);
456 std::min(block[odim] * (tid + 1), numBlocks[odim]) - start[odim];
457 for (
size_t tries = 0; tries < ntries; ++tries) {
458 Task* t = probeBlock(p, 0, 1);
460 executeBlock<UseDense>(fn, *t);
463 for (
int i = 0; i < numDims; ++i)
464 locks[i][t->coord[i]].unlock();
468 for (
int i = 0; i < numDims; ++i)
478 template <
bool UseDense,
typename Function>
479 void executeLoopExp2(Function fn,
unsigned tid,
unsigned total) {
480 Point numBlocks{{locks[0].size(), locks[1].size()}};
483 for (
int i = 0; i < numDims; ++i) {
484 block[i] = (numBlocks[i] + total - 1) / total;
485 start[i] =
std::min(block[i] * tid, numBlocks[i] - 1);
489 int dim = numBlocks[0] < numBlocks[1] ? 1 : 0;
490 int odim = (dim + 1) % 2;
491 size_t maxRounds = numBlocks[dim];
493 for (
size_t round = 0; round < maxRounds; ++round) {
494 Point base{{start[0], start[1]}};
495 nextPoint(base, dim, round);
496 for (
size_t tries = 0; tries < numBlocks[odim]; ++tries) {
497 size_t index = tries + base[odim];
498 if (index >= numBlocks[odim])
499 index -= numBlocks[odim];
501 nextPoint(p, dim, round);
502 nextPoint(p, odim, index);
503 nextPoint(p, dim, index);
505 Task* t = probeBlock(p, 0, 1);
508 executeBlock<UseDense>(fn, *t);
511 for (
int i = 0; i < numDims; ++i)
512 locks[i][t->coord[i]].unlock();
534 template <
bool UseDense,
typename Function>
535 void executeLoopOrig(Function fn,
unsigned tid,
unsigned total) {
536 Point numBlocks{{locks[0].size(), locks[1].size()}};
542 for (
int i = 0; i < numDims; ++i) {
543 block[i] = (numBlocks[i] + total - 1) / total;
544 start[i] =
std::min(block[i] * tid, numBlocks[i] - 1);
547 unsigned coresPerSocket =
563 for (
int i = 0;; ++i) {
564 Task* t = nextBlock(p, i == 0);
569 executeBlock<UseDense>(fn, *t);
574 for (
int i = 0; i < numDims; ++i) {
575 locks[i][t->coord[i]].unlock();
591 template <
bool UseDense,
typename Function>
592 void executeLoop(Function fn,
unsigned tid,
unsigned total) {
596 executeLoopOrig<UseDense>(fn, tid, total);
611 void initializeTasks(iterator firstX, iterator lastX, iterator firstY,
612 iterator lastY,
size_t sizeX,
size_t sizeY) {
613 const size_t numXBlocks =
614 (std::distance(firstX, lastX) + sizeX - 1) / sizeX;
615 const size_t numYBlocks =
616 (std::distance(firstY, lastY) + sizeY - 1) / sizeY;
617 const size_t numBlocks = numXBlocks * numYBlocks;
622 locks[0].resize(numXBlocks);
623 locks[1].resize(numYBlocks);
624 tasks.resize(numBlocks);
628 for (
size_t i = 0; i < numBlocks; ++i) {
629 Task& task = tasks[i];
630 task.coord = {{i % numXBlocks, i / numXBlocks}};
631 std::tie(task.startX, task.endX) =
639 task.endYInclusive = *e - 1;
651 template <
bool UseDense,
typename Function>
656 void operator()(
unsigned tid,
unsigned total) {
657 self->executeLoop<UseDense>(fn, tid, total);
663 : g(g), cutoff(cutoff),
692 template <
typename Function>
693 void execute(iterator firstX, iterator lastX, iterator firstY, iterator lastY,
694 size_t sizeX,
size_t sizeY, Function fn,
bool _useLocks,
695 unsigned numIterations = 1) {
696 initializeTasks(firstX, lastX, firstY, lastY, sizeX, sizeY);
697 numTasks = tasks.size();
698 maxUpdates = numIterations;
699 useLocks = _useLocks;
701 Process<false, Function> p{
this, fn};
706 if (std::any_of(tasks.begin(), tasks.end(),
707 [
this](Task& t) {
return t.updates.value < maxUpdates; })) {
730 template <
typename Function>
732 iterator lastY,
size_t sizeX,
size_t sizeY, Function fn,
733 bool _useLocks,
int numIterations = 1) {
734 initializeTasks(firstX, lastX, firstY, lastY, sizeX, sizeY);
735 numTasks = tasks.size();
736 maxUpdates = numIterations;
737 useLocks = _useLocks;
738 Process<true, Function> p{
this, fn};
742 if (std::any_of(tasks.begin(), tasks.end(),
743 [
this](Task& t) {
return t.updates.value < maxUpdates; })) {
ThreadPool & getThreadPool(void)
return a reference to system thread pool
Definition: ThreadPool.cpp:259
unsigned int getActiveThreads() noexcept
Returns the number of threads in use.
Definition: Threads.cpp:37
void executeDense(iterator firstX, iterator lastX, iterator firstY, iterator lastY, size_t sizeX, size_t sizeY, Function fn, bool _useLocks, int numIterations=1)
Execute a function on a provided X set of nodes and Y set of nodes for a certain number of iterations...
Definition: TiledExecutor.h:731
~Fixed2DGraphTiledExecutor()
Report the number of probe block failures to statistics.
Definition: TiledExecutor.h:669
std::pair< IterTy, IterTy > block_range(IterTy b, IterTy e, unsigned id, unsigned num)
Returns a continuous block from the range based on the number of divisions and the id of the block re...
Definition: gstl.h:244
unsigned getMaxCores() const
Definition: ThreadPool.h:179
substrate::Barrier & getBarrier(unsigned activeThreads)
Have a pre-instantiated barrier available for use.
Definition: Substrate.cpp:24
Fixed2DGraphTiledExecutor(Graph &g, int cutoff=0)
Definition: TiledExecutor.h:662
void execute(iterator firstX, iterator lastX, iterator firstY, iterator lastY, size_t sizeX, size_t sizeY, Function fn, bool _useLocks, unsigned numIterations=1)
Execute a function on a provided X set of nodes and Y set of nodes for a certain number of iterations...
Definition: TiledExecutor.h:693
Definition: TiledExecutor.h:32
Modify an iterator so that *it == it.
Definition: NoDerefIterator.h:31
void reportStat_Single(const S1 ®ion, const S2 &category, const T &value)
Definition: Statistics.h:544
Definition: PaddedLock.h:35
void operator()(void)
Definition: Executor_ParaMeter.h:417
const Ty min(std::atomic< Ty > &a, const Ty &b)
Definition: AtomicHelpers.h:70
void on_each(FunctionTy &&fn, const Args &...args)
Low-level parallel loop.
Definition: Loops.h:86
T & reduce()
Returns the final reduction value.
Definition: Reduction.h:102
unsigned getMaxSockets() const
Definition: ThreadPool.h:180
void gWarn(Args &&...args)
Prints a warning string from a sequence of things.
Definition: gIO.h:63