TiledExecutor.h
/*
 * This file belongs to the Galois project, a C++ library for exploiting
 * parallelism. The code is being released under the terms of the 3-Clause BSD
 * License (a copy is located in LICENSE.txt at the top-level directory).
 *
 * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.
 * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS
 * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF
 * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF
 * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH
 * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances
 * shall University be liable for incidental, special, indirect, direct or
 * consequential damages or loss of profits, interruption of business, or
 * related expenses which may arise from use of Software or Documentation,
 * including but not limited to those resulting from defects in Software and/or
 * Documentation, or loss or inaccuracy of data of any kind.
 */

#ifndef _GALOIS_RUNTIME_TILEDEXECUTOR_H_
#define _GALOIS_RUNTIME_TILEDEXECUTOR_H_

#include <algorithm>
#include <array>
#include <atomic>
#include <cassert>
#include <cstddef>
#include <functional>
#include <iterator>
#include <mutex>
#include <tuple>
#include <vector>

#include <boost/iterator/transform_iterator.hpp>

#include "galois/config.h"
#include "galois/Galois.h"
#include "galois/LargeArray.h"
#include "galois/NoDerefIterator.h"
#include "galois/substrate/PaddedLock.h"

namespace galois {
namespace runtime {

template <typename Graph, bool UseExp = false>
class Fixed2DGraphTiledExecutor {
  static constexpr int numDims = 2; // code is specialized to 2 dimensions

  using SpinLock = galois::substrate::PaddedLock<true>;
  using GNode = typename Graph::GraphNode;
  using iterator = typename Graph::iterator;
  using edge_iterator = typename Graph::edge_iterator;
  using Point = std::array<size_t, numDims>;

  // Copyable wrapper around std::atomic<T> with relaxed load/add helpers
  // (std::atomic itself is not copyable, which storing Tasks in a std::vector
  // requires).
  template <typename T>
  struct SimpleAtomic {
    std::atomic<T> value;
    SimpleAtomic() : value(0) {}
    SimpleAtomic(const SimpleAtomic& o) : value(o.value.load()) {}
    T relaxedLoad() { return value.load(std::memory_order_relaxed); }
    void relaxedAdd(T delta) {
      value.store(relaxedLoad() + delta, std::memory_order_relaxed);
    }
  };

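  // A Task is one 2D tile of work: the X nodes in [startX, endX), the Y nodes
  // in [startY, endYInclusive], its (x, y) position in the block grid, and a
  // counter of how many times the tile has been processed.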
  struct Task {
    iterator startX;
    iterator endX;
    GNode startY;
    GNode endYInclusive;
    Point coord;
    SimpleAtomic<unsigned> updates;
  };

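  // Functor that maps an edge iterator to the destination node of that edge;
  // used below with boost::transform_iterator so that iterating over a node's
  // edges yields destination nodes directly.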
  struct GetDst : public std::unary_function<edge_iterator, GNode> {
    Graph* g;
    GetDst() {}
    GetDst(Graph* _g) : g(_g) {}

    GNode operator()(edge_iterator ii) const { return g->getEdgeDst(ii); }
  };

  using no_deref_iterator = galois::NoDerefIterator<edge_iterator>;
  using edge_dst_iterator =
      boost::transform_iterator<GetDst, no_deref_iterator>;

  Graph& g;
  int cutoff;                          // XXX: UseExp
  galois::substrate::Barrier& barrier; // XXX: UseExp
  // std::array<galois::LargeArray<SpinLock>, numDims> locks;
  // galois::LargeArray<Task> tasks;
  std::array<std::vector<SpinLock>, numDims> locks; // one lock per X block and per Y block
  std::vector<Task> tasks; // 2D grid of tasks, stored at index x + y * numXBlocks
  size_t numTasks;
  unsigned maxUpdates; // maximum number of times a block may be processed
  bool useLocks;
  galois::GAccumulator<unsigned> failedProbes;

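  // Advance point p by delta blocks along dimension dim, wrapping around the
  // edge of the block grid.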
  void nextPoint(Point& p, int dim, int delta) {
    assert(dim < numDims);
    p[dim] += delta;
    // wrap around if the step went past the end of this dimension
    while (p[dim] >= locks[dim].size()) {
      p[dim] -= locks[dim].size();
    }
  }

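  // Return the task at grid point p. Tasks are stored row-major, i.e. at
  // index x + y * numXBlocks (numXBlocks == locks[0].size()).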
  Task* getTask(const Point& p) {
    Task* t = &tasks[p[0] + p[1] * locks[0].size()];

    assert(t < &tasks[numTasks]);
    assert(t >= &tasks[0]);

    return t;
  }

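  // Starting at `start`, probe up to n blocks along dimension dim. A block is
  // claimed only if both of its dimension locks can be acquired and it has not
  // yet reached maxUpdates; on success the block is returned still locked (the
  // caller unlocks it) and `start` is updated to its position. Returns nullptr
  // if no block could be claimed.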
  Task* probeBlockWithLock(Point& start, int dim, size_t n) {
    Point p = start;

    for (size_t i = 0; i < n; ++i) {
      Task* t = getTask(p);

      assert(p[0] == t->coord[0]);
      assert(p[1] == t->coord[1]);
      assert(t->coord[0] < locks[0].size());
      assert(t->coord[1] < locks[1].size());

      if (t->updates.relaxedLoad() < maxUpdates) {
        // std::try_lock returns -1 only if both locks were acquired
        if (std::try_lock(locks[0][t->coord[0]], locks[1][t->coord[1]]) < 0) {
          // re-check under the locks before claiming the block
          if (t->updates.relaxedLoad() < maxUpdates) {
            t->updates.relaxedAdd(1);
            start = p;
            return t;
          }

          // block already finished; release the locks
          // TODO add to worklist
          for (int j = 0; j < numDims; ++j) {
            locks[j][t->coord[j]].unlock();
          }
        }
      }

      nextPoint(p, dim, 1);
    }

    failedProbes += 1;
    return nullptr;
  }

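  // Lock-free variant of the probe: starting at `start`, scan up to n blocks
  // along dimension dim and claim the first one whose update counter is still
  // below maxUpdates, using fetch_add to arbitrate between threads. Returns
  // nullptr if no block could be claimed.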
  Task* probeBlockWithoutLock(Point& start, int dim, size_t n) {
    Point p = start;

    for (size_t i = 0; i < n; ++i) {
      Task* t = getTask(p);

      assert(p[0] == t->coord[0]);
      assert(p[1] == t->coord[1]);
      assert(t->coord[0] < locks[0].size());
      assert(t->coord[1] < locks[1].size());

      if (t->updates.relaxedLoad() < maxUpdates) {
        if (t->updates.value.fetch_add(1) < maxUpdates) {
          // the block had not reached max updates at the point of the fetch
          start = p;
          return t;
        }
      }
      nextPoint(p, dim, 1);
    }

    failedProbes += 1;
    return nullptr;
  }

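  // Probe up to n blocks along dimension dim starting at `start`, dispatching
  // to the locking or lock-free variant depending on useLocks.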
  Task* probeBlock(Point& start, int dim, size_t n) {
    assert(dim < 2);

    if (useLocks) {
      return probeBlockWithLock(start, dim, n);
    } else {
      return probeBlockWithoutLock(start, dim, n);
    }
  }

  // TODO (Loc) this function needs an overhaul; right now it's too hacky and
  // imprecise
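  // Find the next block to work on, searching from `start` (inclusive of the
  // starting block on the first pass when `inclusive` is true). First probes
  // along each dimension from the current position, then sweeps the rest of
  // the grid diagonally; returns nullptr when no unfinished block can be
  // claimed.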
  Task* nextBlock(Point& start, bool inclusive) {
    Task* t;

    // Repeat twice to make sure there really are no unused blocks left.
    // TODO this method of termination detection is hacky and imprecise;
    // find a better way
    for (int times = 0; times < 2; ++times) {
      Point limit{{locks[0].size(), locks[1].size()}};

      int inclusiveDelta = (inclusive && times == 0) ? 0 : 1;

      // The first pass (inclusive == true) INCLUDES the starting block;
      // otherwise, probe the following blocks in the x and y directions.
      for (int i = 0; i < numDims; ++i) {
        Point p = start;
        nextPoint(p, i, inclusiveDelta);

        if ((t = probeBlock(p, i, limit[i] - inclusiveDelta))) {
          start = p;
          return t;
        }
      }

      // If the probes above failed, every block in both directions
      // (left->right, up->down) from the current point is locked and/or has
      // reached its max updates.
      Point p = start;
      // Advance diagonally and restart the search from there.
      for (int i = 0; i < numDims; ++i) {
        nextPoint(p, i, 1);
      }

      // The loop below sweeps the entire grid looking for a block to work on;
      // in some cases a block will be probed more than once.
      // TODO probably suboptimal: once either limit reaches 0 the whole grid
      // has likely been examined already, in which case the remaining
      // iterations are doing extra work.
      while (std::any_of(limit.begin(), limit.end(),
                         [](size_t x) { return x > 0; })) {
        for (int i = 0; i < numDims; ++i) {
          if (limit[i] > 1 && (t = probeBlock(p, i, limit[i] - 1))) {
            start = p;
            return t;
          }
        }

        for (int i = 0; i < numDims; ++i) {
          if (limit[i] > 0) {
            limit[i] -= 1;
            nextPoint(p, i, 1);
          }
        }
      }
    }

    return nullptr;
  }

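  // Dense variant: apply fn to every (x, y) node pair in the block, regardless
  // of whether an edge connects the two nodes.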
  template <bool UseDense, typename Function>
  void executeBlock(Function& fn, Task& task,
                    typename std::enable_if<UseDense>::type* = 0) {
    for (auto ii = task.startX; ii != task.endX; ++ii) {
      for (auto jj = g.begin() + task.startY,
                ej = g.begin() + task.endYInclusive + 1;
           jj != ej; ++jj) {
        fn(*ii, *jj);
      }
    }
  }

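  // Sparse variant: for each x node in the block, apply fn only to the edges
  // whose destinations fall inside the block's Y range.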
  template <bool UseDense, typename Function>
  void executeBlock(Function& fn, Task& task,
                    typename std::enable_if<!UseDense>::type* = 0) {
    GetDst getDst{&g};

    for (auto ii = task.startX; ii != task.endX; ++ii) {
      no_deref_iterator nbegin(
          g.edge_begin(*ii, galois::MethodFlag::UNPROTECTED));
      no_deref_iterator nend(g.edge_end(*ii, galois::MethodFlag::UNPROTECTED));

      // iterate over the edges, with edge_dst_iterator transforming each edge
      // into its destination node
      edge_dst_iterator dbegin(nbegin, getDst);
      edge_dst_iterator dend(nend, getDst);

      // TODO check if we want to use experimental
      // if (UseExp && cutoff < 0 &&
      //     std::distance(g.edge_begin(*ii, galois::MethodFlag::UNPROTECTED),
      //                   g.edge_end(*ii, galois::MethodFlag::UNPROTECTED)) >=
      //         -cutoff) {
      //   continue;
      // } else if (UseExp && cutoff > 0 &&
      //            std::distance(
      //                g.edge_begin(*ii, galois::MethodFlag::UNPROTECTED),
      //                g.edge_end(*ii, galois::MethodFlag::UNPROTECTED)) <
      //                cutoff) {
      //   continue;
      // }

      // lower_bound assumes each node's edges are sorted by destination
      for (auto jj = std::lower_bound(dbegin, dend, task.startY); jj != dend;) {
        // if (UseExp) {
        //   constexpr int numTimes = 1;
        //   constexpr int width = 1;
        //   bool done = false;
        //   for (int times = 0; times < numTimes; ++times) {
        //     for (int i = 0; i < width; ++i) {
        //       edge_iterator edge = *(jj + i).base();
        //       if (*(jj + i) > task.endYInclusive) {
        //         done = true;
        //         break;
        //       }
        //       fn(*ii, *(jj + i), edge);
        //     }
        //   }
        //   if (done)
        //     break;
        //   for (int i = 0; jj != dend && i < width; ++jj, ++i)
        //     ;
        //   if (jj == dend)
        //     break;
        // } else {
        edge_iterator edge = *jj.base();
        if (*jj > task.endYInclusive)
          break;

        fn(*ii, *jj, edge);
        ++jj;
        // }
      }
    }
  }

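  // Experimental bulk-synchronous loop: each thread starts on a block of the
  // diagonal and processes one diagonal of blocks per round, with a barrier
  // between rounds.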
  template <bool UseDense, typename Function>
  void executeLoopExp(Function fn, unsigned tid, unsigned total) {
    Point numBlocks{locks[0].size(), locks[1].size()};
    Point block;
    Point start;

    // TODO this assigns each thread a block along the diagonal, which is
    // probably NOT what you want in this executor since each thread will then
    // walk along the diagonal; fix this
    for (int i = 0; i < numDims; ++i) {
      block[i] = (numBlocks[i] + total - 1) / total;         // blocks per thread
      start[i] = std::min(block[i] * tid, numBlocks[i] - 1); // block to start at
    }

    // Move the diagonal along dim each round:
    // if there are more y blocks than x blocks, dim is 1 (i.e. y), else 0
    int dim = numBlocks[0] < numBlocks[1] ? 1 : 0;
    int odim = (dim + 1) % 2;
    // number of blocks in the dim dimension
    size_t maxRounds = numBlocks[dim];

    for (size_t rounds = 0; rounds < maxRounds; ++rounds) {
      Point p{start[0], start[1]};
      nextPoint(p, dim, rounds);

      size_t ntries =
          std::min(block[odim] * (tid + 1), numBlocks[odim]) - start[odim];
      for (size_t tries = 0; tries < ntries; ++tries) {
        Task* t = probeBlock(p, 0, 1); // probe only the block we are currently on
        if (t) {
          executeBlock<UseDense>(fn, *t);

          if (useLocks) {
            for (int i = 0; i < numDims; ++i)
              locks[i][t->coord[i]].unlock();
          }
        }

        for (int i = 0; i < numDims; ++i)
          nextPoint(p, i, 1);
      }

      barrier.wait();
    }
  }

  // TODO examine this
  // bulk synchronous diagonals: dynamic assignment within diagonals
  template <bool UseDense, typename Function>
  void executeLoopExp2(Function fn, unsigned tid, unsigned total) {
    Point numBlocks{{locks[0].size(), locks[1].size()}};
    Point block;
    Point start;
    for (int i = 0; i < numDims; ++i) {
      block[i] = (numBlocks[i] + total - 1) / total;
      start[i] = std::min(block[i] * tid, numBlocks[i] - 1);
    }

    // Move the diagonal along dim each round
    int dim = numBlocks[0] < numBlocks[1] ? 1 : 0;
    int odim = (dim + 1) % 2;
    size_t maxRounds = numBlocks[dim];

    for (size_t round = 0; round < maxRounds; ++round) {
      Point base{{start[0], start[1]}};
      nextPoint(base, dim, round);
      for (size_t tries = 0; tries < numBlocks[odim]; ++tries) {
        size_t index = tries + base[odim];
        if (index >= numBlocks[odim])
          index -= numBlocks[odim];
        Point p{};
        nextPoint(p, dim, round);
        nextPoint(p, odim, index);
        nextPoint(p, dim, index);

        Task* t = probeBlock(p, 0, 1);
        if (!t)
          continue;
        executeBlock<UseDense>(fn, *t);

        if (useLocks) {
          for (int i = 0; i < numDims; ++i)
            locks[i][t->coord[i]].unlock();
        }
      }

      barrier.wait();
    }
  }

  // TODO this function is imprecise by virtue of nextBlock being a bad
  // function
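  // Main work loop for one thread: start from a block on the diagonal
  // (adjusted by socket when locks are used) and repeatedly ask nextBlock for
  // an unfinished block, executing each one until none remain.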
  template <bool UseDense, typename Function>
  void executeLoopOrig(Function fn, unsigned tid, unsigned total) {
    Point numBlocks{{locks[0].size(), locks[1].size()}};
    Point block;
    Point start;

    // Find each thread's starting point; essentially this assigns each
    // thread to a block on the diagonal to begin with.
    for (int i = 0; i < numDims; ++i) {
      block[i] = (numBlocks[i] + total - 1) / total;         // blocks per thread
      start[i] = std::min(block[i] * tid, numBlocks[i] - 1); // block to start at
    }

    unsigned coresPerSocket = galois::substrate::getThreadPool().getMaxCores() /
                              galois::substrate::getThreadPool().getMaxSockets();

    // If using locks, readjust this thread's starting Y location to the
    // location of the thread's socket.
    if (useLocks) {
      start = {{start[0],
                std::min(block[1] *
                             galois::substrate::getThreadPool().getSocket(tid) *
                             coresPerSocket,
                         numBlocks[1] - 1)}};
    }

    Point p = start;

    for (int i = 0;; ++i) {
      Task* t = nextBlock(p, i == 0);
      // TODO: Replace with sparse worklist, etc.
      if (!t)
        break;

      executeBlock<UseDense>(fn, *t);

      // Unlock the task block if using locks (nextBlock returns the task
      // with its block locked).
      if (useLocks) {
        for (int i = 0; i < numDims; ++i) {
          locks[i][t->coord[i]].unlock();
        }
      }
    }
  }

  template <bool UseDense, typename Function>
  void executeLoop(Function fn, unsigned tid, unsigned total) {
    // if (false && UseExp)
    //   executeLoopExp2<UseDense>(fn, tid, total);
    // else
    executeLoopOrig<UseDense>(fn, tid, total);
  }

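  // Partition the X range into blocks of size sizeX and the Y range into
  // blocks of size sizeY, allocate one lock per X block and per Y block, and
  // assign each task the node ranges it is responsible for.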
  void initializeTasks(iterator firstX, iterator lastX, iterator firstY,
                       iterator lastY, size_t sizeX, size_t sizeY) {
    const size_t numXBlocks =
        (std::distance(firstX, lastX) + sizeX - 1) / sizeX;
    const size_t numYBlocks =
        (std::distance(firstY, lastY) + sizeY - 1) / sizeY;
    const size_t numBlocks = numXBlocks * numYBlocks;

    // locks[0].create(numXBlocks);
    // locks[1].create(numYBlocks);
    // tasks.create(numBlocks);
    locks[0].resize(numXBlocks);
    locks[1].resize(numYBlocks);
    tasks.resize(numBlocks);

    // TODO parallelize this?
    // assign each block the X and Y that it is responsible for
    for (size_t i = 0; i < numBlocks; ++i) {
      Task& task = tasks[i];
      task.coord = {{i % numXBlocks, i / numXBlocks}};
      std::tie(task.startX, task.endX) =
          galois::block_range(firstX, lastX, task.coord[0], numXBlocks);
      iterator s;
      iterator e;
      std::tie(s, e) =
          galois::block_range(firstY, lastY, task.coord[1], numYBlocks);
      // XXX: Works for CSR graphs
      task.startY = *s;
      task.endYInclusive = *e - 1;
    }
  }

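  // Functor passed to galois::on_each; runs the per-thread work loop.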
  template <bool UseDense, typename Function>
  struct Process {
    Fixed2DGraphTiledExecutor* self;
    Function fn;

    void operator()(unsigned tid, unsigned total) {
      self->executeLoop<UseDense>(fn, tid, total);
    }
  };

public:
  Fixed2DGraphTiledExecutor(Graph& g, int cutoff = 0)
      : g(g), cutoff(cutoff),
        barrier(galois::runtime::getBarrier(galois::getActiveThreads())) {}

  //! Reports the number of probe (block acquisition) failures to the
  //! statistics system.
  ~Fixed2DGraphTiledExecutor() {
    galois::runtime::reportStat_Single("TiledExecutor", "ProbeFailures",
                                       failedProbes.reduce());
  }

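  //! Executes the given function over the sparse tiles induced by the X node
  //! range [firstX, lastX) and the Y node range [firstY, lastY), using tiles
  //! of sizeX by sizeY nodes, for up to numIterations passes over each tile.
  //! In this (sparse) version the function is invoked once per edge that falls
  //! inside a tile, as fn(xNode, yNode, edgeIterator).
  //!
  //! A minimal usage sketch (illustrative only: the graph type, the
  //! NUM_ITEM_NODES boundary between the two node partitions, and the lambda
  //! body are assumptions, not part of this header):
  //!
  //! @code
  //! using Graph = galois::graphs::LC_CSR_Graph<int, double>;
  //! Graph graph; // assume a bipartite graph loaded elsewhere
  //!
  //! galois::runtime::Fixed2DGraphTiledExecutor<Graph> executor(graph);
  //! executor.execute(
  //!     graph.begin(), graph.begin() + NUM_ITEM_NODES,  // X nodes
  //!     graph.begin() + NUM_ITEM_NODES, graph.end(),    // Y nodes
  //!     1000, 1000,                                     // tile sizes
  //!     [&](Graph::GraphNode x, Graph::GraphNode y, Graph::edge_iterator e) {
  //!       // per-edge work, e.g. one SGD update using the edge's data
  //!     },
  //!     true /* useLocks */);
  //! @endcode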
  template <typename Function>
  void execute(iterator firstX, iterator lastX, iterator firstY, iterator lastY,
               size_t sizeX, size_t sizeY, Function fn, bool _useLocks,
               unsigned numIterations = 1) {
    initializeTasks(firstX, lastX, firstY, lastY, sizeX, sizeY);
    numTasks = tasks.size();
    maxUpdates = numIterations;
    useLocks = _useLocks;

    Process<false, Function> p{this, fn};

    galois::on_each(p);

    // TODO remove after worklist fix
    if (std::any_of(tasks.begin(), tasks.end(),
                    [this](Task& t) { return t.updates.value < maxUpdates; })) {
      galois::gWarn("Missing tasks");
    }
  }

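  //! Dense counterpart of execute(): the function is called for every (x, y)
  //! node pair in each tile rather than only for existing edges, so fn is
  //! invoked as fn(xNode, yNode).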
  template <typename Function>
  void executeDense(iterator firstX, iterator lastX, iterator firstY,
                    iterator lastY, size_t sizeX, size_t sizeY, Function fn,
                    bool _useLocks, int numIterations = 1) {
    initializeTasks(firstX, lastX, firstY, lastY, sizeX, sizeY);
    numTasks = tasks.size();
    maxUpdates = numIterations;
    useLocks = _useLocks;
    Process<true, Function> p{this, fn};
    galois::on_each(p);

    // TODO remove after worklist fix
    if (std::any_of(tasks.begin(), tasks.end(),
                    [this](Task& t) { return t.updates.value < maxUpdates; })) {
      galois::gWarn("Missing tasks");
    }
  }
};

} // namespace runtime
} // namespace galois

#endif