DeviceEdgeSync.h
/*
 * This file belongs to the Galois project, a C++ library for exploiting
 * parallelism. The code is being released under the terms of the 3-Clause BSD
 * License (a copy is located in LICENSE.txt at the top-level directory).
 *
 * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.
 * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS
 * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF
 * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF
 * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH
 * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances
 * shall University be liable for incidental, special, indirect, direct or
 * consequential damages or loss of profits, interruption of business, or
 * related expenses which may arise from use of Software or Documentation,
 * including but not limited to those resulting from defects in Software and/or
 * Documentation, or loss or inaccuracy of data of any kind.
 */

/**
 * @file DeviceEdgeSync.h
 *
 * CUDA device-side helpers for synchronizing edge field data in the GPU
 * runtime.
 */

#pragma once

#include "galois/cuda/DynamicBitset.h"
#include "galois/cuda/EdgeContext.h"
#include "galois/runtime/DataCommMode.h"
#include "cub/util_allocator.cuh"

#ifdef GALOIS_CUDA_CHECK_ERROR
#define check_cuda_kernel                                                      \
  check_cuda(cudaDeviceSynchronize());                                         \
  check_cuda(cudaGetLastError());
#else
#define check_cuda_kernel check_cuda(cudaGetLastError());
#endif

enum SharedType { sharedMaster, sharedMirror };
enum UpdateOp { setOp, addOp, minOp };

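/**
 * Sets the launch configuration used by the kernels in this file: 256 threads
 * per one-dimensional block and 8 blocks per streaming multiprocessor.
 */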
void kernel_sizing(dim3& blocks, dim3& threads) {
  threads.x = 256;
  threads.y = threads.z = 1;
  blocks.x = ggc_get_nSM() * 8;
  blocks.y = blocks.z = 1;
}

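/**
 * Gather kernels: copy array[indices[i]] into subset[i] for every i in
 * [0, subset_size) using a grid-stride loop. The second overload adds one
 * level of indirection, reading indices[offsets[i]] instead of indices[i].
 */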
template <typename DataType>
__global__ void batch_get_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 DataType* __restrict__ subset,
                                 const DataType* __restrict__ array) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    subset[src] = array[index];
  }
}

template <typename DataType, typename OffsetIteratorType>
__global__ void batch_get_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 DataType* __restrict__ subset,
                                 const DataType* __restrict__ array) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    subset[src] = array[index];
  }
}

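/**
 * Same gather as batch_get_subset, but each gathered element of array is then
 * overwritten with reset_value (used by batch_get_shared_edge when the field
 * must be reset after being read for a reduction).
 */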
template <typename DataType>
__global__ void batch_get_reset_subset(index_type subset_size,
                                       const unsigned int* __restrict__ indices,
                                       DataType* __restrict__ subset,
                                       DataType* __restrict__ array,
                                       DataType reset_value) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    subset[src] = array[index];
    array[index] = reset_value;
  }
}

template <typename DataType, typename OffsetIteratorType>
__global__ void batch_get_reset_subset(index_type subset_size,
                                       const unsigned int* __restrict__ indices,
                                       const OffsetIteratorType offsets,
                                       DataType* __restrict__ subset,
                                       DataType* __restrict__ array,
                                       DataType reset_value) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    subset[src] = array[index];
    array[index] = reset_value;
  }
}

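/**
 * Scatter kernels: batch_set_subset writes subset[i] into array[indices[i]],
 * while batch_add_subset, batch_min_subset, and batch_max_subset combine the
 * incoming value with the existing one (sum, minimum, maximum). Unless the
 * destination is a mirror copy, the corresponding bit of is_array_updated is
 * set so later synchronization rounds know the element changed. Each kernel
 * also has an overload that indexes through an offsets iterator.
 */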
template <typename DataType, SharedType sharedType>
__global__ void batch_set_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    array[index] = subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}

template <typename DataType, SharedType sharedType, typename OffsetIteratorType>
__global__ void batch_set_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    array[index] = subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}

template <typename DataType, SharedType sharedType>
__global__ void batch_add_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    array[index] += subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}

template <typename DataType, SharedType sharedType, typename OffsetIteratorType>
__global__ void batch_add_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    array[index] += subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}

template <typename DataType, SharedType sharedType>
__global__ void batch_min_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    if (array[index] > subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}

template <typename DataType, SharedType sharedType, typename OffsetIteratorType>
__global__ void batch_min_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    if (array[index] > subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}

template <typename DataType, SharedType sharedType>
__global__ void batch_max_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    if (array[index] < subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}

template <typename DataType, SharedType sharedType, typename OffsetIteratorType>
__global__ void batch_max_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    if (array[index] < subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}

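/**
 * Fills array[begin, end) with val.
 */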
template <typename DataType>
__global__ void batch_reset(DataType* __restrict__ array, index_type begin,
                            index_type end, DataType val) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = end;
  for (index_type src = begin + tid; src < src_end; src += nthreads) {
    array[src] = val;
  }
}

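/**
 * Projects the per-field dirty bitset onto a shared subset: bit i of
 * is_subset_updated is set whenever bit indices[i] of is_array_updated is set.
 */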
__global__ void
batch_get_subset_bitset(index_type subset_size,
                        const unsigned int* __restrict__ indices,
                        DynamicBitset* __restrict__ is_subset_updated,
                        DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    if (is_array_updated->test(index)) {
      is_subset_updated->set(src);
    }
  }
}

// inclusive range
__global__ void bitset_reset_range(DynamicBitset* __restrict__ bitset,
                                   size_t vec_begin, size_t vec_end, bool test1,
                                   size_t bit_index1, uint64_t mask1,
                                   bool test2, size_t bit_index2,
                                   uint64_t mask2) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;

  for (size_t src = vec_begin + tid; src < vec_end; src += nthreads) {
    bitset->batch_reset(src);
  }

  if (tid == 0) {
    if (test1) {
      bitset->batch_bitwise_and(bit_index1, mask1);
    }
    if (test2) {
      bitset->batch_bitwise_and(bit_index2, mask2);
    }
  }
}

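/**
 * Clears bits [begin, end] (inclusive) of field->is_updated on the device.
 * Whole 64-bit words that lie completely inside the range are zeroed by
 * bitset_reset_range; the partially covered word at each boundary is cleared
 * with a bitwise-AND mask (mask1/mask2) instead. For example, with begin = 5
 * and end = 130 (and end not the last bit): word 1 (bits 64-127) is zeroed,
 * mask1 = 0x1F keeps only bits 0-4 of word 0, and mask2 = ~0x7 clears bits
 * 128-130 of word 2.
 */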
template <typename DataType>
void reset_bitset_field(struct CUDA_Context_Field_Edges<DataType>* field,
                        size_t begin, size_t end) {
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);
  const DynamicBitset* bitset_cpu = field->is_updated.cpu_rd_ptr();
  assert(begin <= (bitset_cpu->size() - 1));
  assert(end <= (bitset_cpu->size() - 1));

  size_t vec_begin = (begin + 63) / 64;
  size_t vec_end;

  if (end == (bitset_cpu->size() - 1))
    vec_end = bitset_cpu->vec_size();
  else
    vec_end = (end + 1) / 64; // floor

  size_t begin2 = vec_begin * 64;
  size_t end2 = vec_end * 64;

  bool test1;
  size_t bit_index1;
  uint64_t mask1;

  bool test2;
  size_t bit_index2;
  uint64_t mask2;

  if (begin2 > end2) {
    test2 = false;

    if (begin < begin2) {
      test1 = true;
      bit_index1 = begin / 64;
      size_t diff = begin2 - begin;
      assert(diff < 64);
      mask1 = ((uint64_t)1 << (64 - diff)) - 1;

      // create or mask
      size_t diff2 = end - end2 + 1;
      assert(diff2 < 64);
      mask2 = ~(((uint64_t)1 << diff2) - 1);
      mask1 |= ~mask2;
    } else {
      test1 = false;
    }
  } else {
    if (begin < begin2) {
      test1 = true;
      bit_index1 = begin / 64;
      size_t diff = begin2 - begin;
      assert(diff < 64);
      mask1 = ((uint64_t)1 << (64 - diff)) - 1;
    } else {
      test1 = false;
    }

    if (end >= end2) {
      test2 = true;
      bit_index2 = end / 64;
      size_t diff = end - end2 + 1;
      assert(diff < 64);
      mask2 = ~(((uint64_t)1 << diff) - 1);
    } else {
      test2 = false;
    }
  }

  bitset_reset_range<<<blocks, threads>>>(field->is_updated.gpu_rd_ptr(),
                                          vec_begin, vec_end, test1, bit_index1,
                                          mask1, test2, bit_index2, mask2);
}

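/**
 * Resets elements [begin, end) of the edge field's device data array to val.
 */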
template <typename DataType>
void reset_data_field(struct CUDA_Context_Field_Edges<DataType>* field,
                      size_t begin, size_t end, DataType val) {
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);

  batch_reset<DataType><<<blocks, threads>>>(
      field->data.gpu_wr_ptr(), (index_type)begin, (index_type)end, val);
}

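/**
 * Compacts the positions of the set bits in `bitset` into `offsets` with
 * cub::DeviceSelect::Flagged (identity values selected using the bitset as
 * flags) and stores the number of set bits in *num_set_bits on the host.
 */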
void get_offsets_from_bitset(index_type bitset_size,
                             unsigned int* __restrict__ offsets,
                             DynamicBitset* __restrict__ bitset,
                             size_t* __restrict__ num_set_bits) {
  cub::CachingDeviceAllocator g_allocator(
      true); // Caching allocator for device memory
  DynamicBitsetIterator flag_iterator(bitset);
  IdentityIterator offset_iterator;
  Shared<size_t> num_set_bits_ptr;
  num_set_bits_ptr.alloc(1);
  void* d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  // first call only computes the required temporary storage size
  cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
                             offset_iterator, flag_iterator, offsets,
                             num_set_bits_ptr.gpu_wr_ptr(true), bitset_size);
  check_cuda_kernel;
  CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
  // CUDA_SAFE_CALL(cudaMalloc(&d_temp_storage, temp_storage_bytes));
  cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
                             offset_iterator, flag_iterator, offsets,
                             num_set_bits_ptr.gpu_wr_ptr(true), bitset_size);
  check_cuda_kernel;
  // CUDA_SAFE_CALL(cudaFree(d_temp_storage));
  if (d_temp_storage)
    CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
  *num_set_bits = *num_set_bits_ptr.cpu_rd_ptr();
}

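/**
 * Gathers the edge-field values shared with host from_id into the field's
 * staging buffer and serializes them into send_buffer in onlyData mode (mode,
 * element count, then the raw values). When reset is true, the gathered
 * elements of the local field are overwritten with i.
 */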
template <typename DataType, SharedType sharedType, bool reset>
void batch_get_shared_edge(struct CUDA_Context_Common_Edges* ctx,
                           struct CUDA_Context_Field_Edges<DataType>* field,
                           unsigned from_id, uint8_t* send_buffer,
                           DataType i = 0) {
  struct CUDA_Context_Shared_Edges* shared;
  if (sharedType == sharedMaster) {
    shared = &ctx->master;
  } else { // sharedMirror
    shared = &ctx->mirror;
  }
  DeviceOnly<DataType>* shared_data = &field->shared_data;
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);

  // ggc::Timer timer("timer"), timer1("timer1"), timer2("timer2");
  // timer.start();
  // timer1.start();
  size_t v_size = shared->num_edges[from_id];
  if (reset) {
    batch_get_reset_subset<DataType><<<blocks, threads>>>(
        v_size, shared->edges[from_id].device_ptr(), shared_data->device_ptr(),
        field->data.gpu_wr_ptr(), i);
  } else {
    batch_get_subset<DataType><<<blocks, threads>>>(
        v_size, shared->edges[from_id].device_ptr(), shared_data->device_ptr(),
        field->data.gpu_rd_ptr());
  }
  check_cuda_kernel;
  // timer1.stop();
  // timer2.start();
  DataCommMode data_mode = onlyData;
  memcpy(send_buffer, &data_mode, sizeof(data_mode));
  memcpy(send_buffer + sizeof(data_mode), &v_size, sizeof(v_size));
  shared_data->copy_to_cpu(
      (DataType*)(send_buffer + sizeof(data_mode) + sizeof(v_size)), v_size);
  // timer2.stop();
  // timer.stop();
  // fprintf(stderr, "Get %u->%u: Time (ms): %llu + %llu = %llu\n",
  //         ctx->id, from_id,
  //         timer1.duration_ms(), timer2.duration_ms(),
  //         timer.duration_ms());
}

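/**
 * Serializes a synchronization message into send_buffer. The layout, in
 * order, is:
 *   data_mode
 *   bit_set_count                          (unless data_mode == onlyData)
 *   bit_set_count, offsets[]               (gidsData / offsetsData)
 *   num_shared, vec_size, bitset words[]   (bitsetData)
 *   bit_set_count, data[]                  (always last)
 */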
template <typename DataType>
void serializeMessage(struct CUDA_Context_Common_Edges* ctx,
                      DataCommMode data_mode, size_t bit_set_count,
                      size_t num_shared, DeviceOnly<DataType>* shared_data,
                      uint8_t* send_buffer) {
  if (data_mode == noData) {
    // do nothing
    return;
  }

  size_t offset = 0;

  // serialize data_mode
  memcpy(send_buffer, &data_mode, sizeof(data_mode));
  offset += sizeof(data_mode);

  if (data_mode != onlyData) {
    // serialize bit_set_count
    memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));
    offset += sizeof(bit_set_count);
  }

  if ((data_mode == gidsData) || (data_mode == offsetsData)) {
    // serialize offsets vector
    memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));
    offset += sizeof(bit_set_count);
    ctx->offsets.copy_to_cpu((unsigned int*)(send_buffer + offset),
                             bit_set_count);
    offset += bit_set_count * sizeof(unsigned int);
  } else if ((data_mode == bitsetData)) {
    // serialize bitset
    memcpy(send_buffer + offset, &num_shared, sizeof(num_shared));
    offset += sizeof(num_shared);
    size_t vec_size = ctx->is_updated.cpu_rd_ptr()->vec_size();
    memcpy(send_buffer + offset, &vec_size, sizeof(vec_size));
    offset += sizeof(vec_size);
    ctx->is_updated.cpu_rd_ptr()->copy_to_cpu(
        (uint64_t*)(send_buffer + offset));
    offset += vec_size * sizeof(uint64_t);
  }

  // serialize data vector
  memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));
  offset += sizeof(bit_set_count);
  shared_data->copy_to_cpu((DataType*)(send_buffer + offset), bit_set_count);
  // offset += bit_set_count * sizeof(DataType);
}

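/**
 * Like the onlyData overload above, but first computes which shared edges are
 * dirty (unless enforcedDataMode forces onlyData), picks a data communication
 * mode from the dirty count, gathers only the required values, and serializes
 * the result into send_buffer. The element count and the chosen mode are
 * returned through v_size and data_mode.
 */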
template <typename DataType, SharedType sharedType, bool reset>
void batch_get_shared_edge(struct CUDA_Context_Common_Edges* ctx,
                           struct CUDA_Context_Field_Edges<DataType>* field,
                           unsigned from_id, uint8_t* send_buffer,
                           size_t* v_size, DataCommMode* data_mode,
                           DataType i = 0) {
  struct CUDA_Context_Shared_Edges* shared;
  if (sharedType == sharedMaster) {
    shared = &ctx->master;
  } else { // sharedMirror
    shared = &ctx->mirror;
  }
  DeviceOnly<DataType>* shared_data = &field->shared_data;
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);

  // ggc::Timer timer("timer"), timer1("timer1"), timer2("timer2"),
  // timer3("timer3"), timer4("timer 4"); timer.start();
  if (enforcedDataMode != onlyData) {
    // timer1.start();
    ctx->is_updated.cpu_rd_ptr()->resize(shared->num_edges[from_id]);
    ctx->is_updated.cpu_rd_ptr()->reset();
    batch_get_subset_bitset<<<blocks, threads>>>(
        shared->num_edges[from_id], shared->edges[from_id].device_ptr(),
        ctx->is_updated.gpu_rd_ptr(), field->is_updated.gpu_rd_ptr());
    check_cuda_kernel;
    // timer1.stop();
    // timer2.start();
    get_offsets_from_bitset(shared->num_edges[from_id],
                            ctx->offsets.device_ptr(),
                            ctx->is_updated.gpu_rd_ptr(), v_size);
    // timer2.stop();
  }
  *data_mode = get_data_mode<DataType>(*v_size, shared->num_edges[from_id]);
  // timer3.start();
  if ((*data_mode) == onlyData) {
    *v_size = shared->num_edges[from_id];
    if (reset) {
      batch_get_reset_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->edges[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(), i);
    } else {
      batch_get_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->edges[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_rd_ptr());
    }
  } else { // bitsetData || offsetsData
    if (reset) {
      batch_get_reset_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->edges[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), i);
    } else {
      batch_get_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->edges[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_rd_ptr());
    }
  }
  check_cuda_kernel;
  // timer3.stop();
  // timer4.start();
  serializeMessage(ctx, *data_mode, *v_size, shared->num_edges[from_id],
                   shared_data, send_buffer);
  // timer4.stop();
  // timer.stop();
  // fprintf(stderr, "Get %u->%u: %d mode %u bitset %u indices. Time (ms): %llu
  // + %llu + %llu + %llu = %llu\n",
  //         ctx->id, from_id, *data_mode,
  //         ctx->is_updated.cpu_rd_ptr()->alloc_size(), sizeof(unsigned int) *
  //         (*v_size), timer1.duration_ms(), timer2.duration_ms(),
  //         timer3.duration_ms(), timer4.duration_ms(), timer.duration_ms());
}

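/**
 * Deserializes a message produced by serializeMessage from recv_buffer,
 * copying any offsets or bitset metadata and the data vector to the device.
 * gidsData messages are not supported on the GPU receive path.
 */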
template <typename DataType>
void deserializeMessage(struct CUDA_Context_Common_Edges* ctx,
                        DataCommMode data_mode, size_t& bit_set_count,
                        size_t num_shared, DeviceOnly<DataType>* shared_data,
                        uint8_t* recv_buffer) {
  size_t offset = 0; // data_mode is already deserialized

  if (data_mode != onlyData) {
    // deserialize bit_set_count
    memcpy(&bit_set_count, recv_buffer + offset, sizeof(bit_set_count));
    offset += sizeof(bit_set_count);
  } else {
    bit_set_count = num_shared;
  }

  assert(data_mode != gidsData); // not supported for deserialization on GPUs
  if (data_mode == offsetsData) {
    // deserialize offsets vector
    offset += sizeof(bit_set_count);
    ctx->offsets.copy_to_gpu((unsigned int*)(recv_buffer + offset),
                             bit_set_count);
    offset += bit_set_count * sizeof(unsigned int);
  } else if ((data_mode == bitsetData)) {
    // deserialize bitset
    ctx->is_updated.cpu_rd_ptr()->resize(num_shared);
    offset += sizeof(num_shared);
    size_t vec_size = ctx->is_updated.cpu_rd_ptr()->vec_size();
    offset += sizeof(vec_size);
    ctx->is_updated.cpu_rd_ptr()->copy_to_gpu(
        (uint64_t*)(recv_buffer + offset));
    offset += vec_size * sizeof(uint64_t);
    // get offsets
    size_t v_size;
    get_offsets_from_bitset(num_shared, ctx->offsets.device_ptr(),
                            ctx->is_updated.gpu_rd_ptr(), &v_size);

    assert(bit_set_count == v_size);
  }

  // deserialize data vector
  offset += sizeof(bit_set_count);
  shared_data->copy_to_gpu((DataType*)(recv_buffer + offset), bit_set_count);
  // offset += bit_set_count * sizeof(DataType);
}

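/**
 * Applies a received synchronization message to the local edge field: the
 * payload is deserialized and then scattered with set, add, or min semantics
 * according to op, marking field->is_updated for non-mirror destinations.
 */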
template <typename DataType, SharedType sharedType, UpdateOp op>
void batch_set_shared_edge(struct CUDA_Context_Common_Edges* ctx,
                           struct CUDA_Context_Field_Edges<DataType>* field,
                           unsigned from_id, uint8_t* recv_buffer,
                           DataCommMode data_mode) {
  assert(data_mode != noData);
  struct CUDA_Context_Shared_Edges* shared;
  if (sharedType == sharedMaster) {
    shared = &ctx->master;
  } else { // sharedMirror
    shared = &ctx->mirror;
  }
  DeviceOnly<DataType>* shared_data = &field->shared_data;
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);
  size_t v_size;

  // ggc::Timer timer("timer"), timer1("timer1"), timer2("timer2");
  // timer.start();
  // timer1.start();
  deserializeMessage(ctx, data_mode, v_size, shared->num_edges[from_id],
                     shared_data, recv_buffer);
  // timer1.stop();
  // timer2.start();
  if (data_mode == onlyData) {
    if (op == setOp) {
      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->edges[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(),
          field->is_updated.gpu_wr_ptr());
    } else if (op == addOp) {
      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->edges[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(),
          field->is_updated.gpu_wr_ptr());
    } else if (op == minOp) {
      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->edges[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(),
          field->is_updated.gpu_wr_ptr());
    }
  } else if (data_mode == gidsData) {
    if (op == setOp) {
      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == addOp) {
      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == minOp) {
      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    }
  } else { // bitsetData || offsetsData
    if (op == setOp) {
      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->edges[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == addOp) {
      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->edges[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == minOp) {
      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->edges[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    }
  }
  check_cuda_kernel;
  // timer2.stop();
  // timer.stop();
  // fprintf(stderr, "Set %u<-%u: %d mode Time (ms): %llu + %llu = %llu\n",
  //         ctx->id, from_id, data_mode,
  //         timer1.duration_ms(), timer2.duration_ms(),
  //         timer.duration_ms());
}
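
// Illustrative sketch (not part of the original header): a hypothetical
// host-side reduce of an edge field, where the sending host gathers and
// resets its mirror copies and the receiving host adds them into its masters.
// `ctx`, `field`, `peer_id`, and the buffers are assumed to be set up by the
// surrounding Gluon communication code.
//
//   size_t v_size;
//   DataCommMode mode;
//   batch_get_shared_edge<uint32_t, sharedMirror, true>(
//       ctx, field, peer_id, send_buffer, &v_size, &mode);
//   // ... transport send_buffer to the peer host ...
//   batch_set_shared_edge<uint32_t, sharedMaster, addOp>(
//       ctx, field, peer_id, recv_buffer, mode);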