DeviceSync.h
/*
 * This file belongs to the Galois project, a C++ library for exploiting
 * parallelism. The code is being released under the terms of the 3-Clause BSD
 * License (a copy is located in LICENSE.txt at the top-level directory).
 *
 * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.
 * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS
 * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF
 * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF
 * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH
 * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances
 * shall University be liable for incidental, special, indirect, direct or
 * consequential damages or loss of profits, interruption of business, or
 * related expenses which may arise from use of Software or Documentation,
 * including but not limited to those resulting from defects in Software and/or
 * Documentation, or loss or inaccuracy of data of any kind.
 */

#pragma once

#include "galois/cuda/Context.h"
#include "galois/cuda/DynamicBitset.h"   // DynamicBitset and its iterators (assumed include path)
#include "galois/runtime/DataCommMode.h" // DataCommMode enumeration (assumed include path)
#include "cub/util_allocator.cuh"

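// check_cuda_kernel: after a kernel launch, either synchronize and surface any
// launch/execution error (when GALOIS_CUDA_CHECK_ERROR is defined) or just
// poll the last asynchronous error.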
#ifdef GALOIS_CUDA_CHECK_ERROR
#define check_cuda_kernel                                                      \
  check_cuda(cudaDeviceSynchronize());                                         \
  check_cuda(cudaGetLastError());
#else
#define check_cuda_kernel check_cuda(cudaGetLastError());
#endif

// Which copy of a shared node set a field is synchronized over, and the
// update operator applied on the receiving side.
enum SharedType { sharedMaster, sharedMirror };
enum UpdateOp { setOp, addOp, minOp };

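// 1-D launch configuration used by every kernel launch in this file:
// 256-thread blocks, 8 blocks per streaming multiprocessor.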
void kernel_sizing(dim3& blocks, dim3& threads) {
  threads.x = 256;
  threads.y = threads.z = 1;
  blocks.x = ggc_get_nSM() * 8;
  blocks.y = blocks.z = 1;
}

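// Gather kernels: pack array[indices[i]] into the contiguous buffer
// subset[i]. The overload taking an OffsetIteratorType indexes `indices`
// indirectly through `offsets`, so only a selected subset of the shared
// nodes is gathered.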
template <typename DataType>
__global__ void batch_get_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 DataType* __restrict__ subset,
                                 const DataType* __restrict__ array) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    subset[src]    = array[index];
  }
}

template <typename DataType, typename OffsetIteratorType>
__global__ void batch_get_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 DataType* __restrict__ subset,
                                 const DataType* __restrict__ array) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    subset[src]    = array[index];
  }
}

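// Gather-and-reset kernels: same as batch_get_subset, but each gathered
// element of `array` is overwritten with `reset_value` after being copied out
// (used by the `reset` variants of batch_get_shared_field below).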
template <typename DataType>
__global__ void batch_get_reset_subset(index_type subset_size,
                                       const unsigned int* __restrict__ indices,
                                       DataType* __restrict__ subset,
                                       DataType* __restrict__ array,
                                       DataType reset_value) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    subset[src]    = array[index];
    array[index]   = reset_value;
  }
}

template <typename DataType, typename OffsetIteratorType>
__global__ void batch_get_reset_subset(index_type subset_size,
                                       const unsigned int* __restrict__ indices,
                                       const OffsetIteratorType offsets,
                                       DataType* __restrict__ subset,
                                       DataType* __restrict__ array,
                                       DataType reset_value) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    subset[src]    = array[index];
    array[index]   = reset_value;
  }
}

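// Scatter kernels: write subset[i] into array[indices[i]]. Unless the
// destinations are mirror copies (sharedType == sharedMirror), the written
// index is also marked in is_array_updated so later synchronizations pick up
// the change.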
template <typename DataType, SharedType sharedType>
__global__ void batch_set_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    array[index]   = subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}

template <typename DataType, SharedType sharedType, typename OffsetIteratorType>
__global__ void batch_set_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    array[index]   = subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}

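// Reduction kernels (addition): array[indices[i]] += subset[i], marking
// is_array_updated in the same way as batch_set_subset.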
template <typename DataType, SharedType sharedType>
__global__ void batch_add_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    array[index] += subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}

template <typename DataType, SharedType sharedType, typename OffsetIteratorType>
__global__ void batch_add_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    array[index] += subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}

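// Reduction kernels (minimum): array[indices[i]] = min(array[indices[i]],
// subset[i]); the update bitset is marked only when the value actually
// decreases.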
template <typename DataType, SharedType sharedType>
__global__ void batch_min_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    if (array[index] > subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}

template <typename DataType, SharedType sharedType, typename OffsetIteratorType>
__global__ void batch_min_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    if (array[index] > subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}

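// Reduction kernels (maximum): symmetric to batch_min_subset, keeping the
// larger value and marking the bitset only on an actual change.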
template <typename DataType, SharedType sharedType>
__global__ void batch_max_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    if (array[index] < subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}

template <typename DataType, SharedType sharedType, typename OffsetIteratorType>
__global__ void batch_max_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    if (array[index] < subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}

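// Fill array[begin, end) with val using a grid-stride loop.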
template <typename DataType>
__global__ void batch_reset(DataType* __restrict__ array, index_type begin,
                            index_type end, DataType val) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = end;
  for (index_type src = begin + tid; src < src_end; src += nthreads) {
    array[src] = val;
  }
}

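// For every shared node i, copy the dirty bit of its local index
// (indices[i]) from is_array_updated into position i of is_subset_updated.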
__global__ void
batch_get_subset_bitset(index_type subset_size,
                        const unsigned int* __restrict__ indices,
                        DynamicBitset* __restrict__ is_subset_updated,
                        DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    if (is_array_updated->test(index)) {
      is_subset_updated->set(src);
    }
  }
}

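// bitset_reset_range: clears the 64-bit words [vec_begin, vec_end) of the
// bitset; the optional (test, bit_index, mask) pairs let thread 0 AND the
// boundary words with a mask so bits outside the requested range survive.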
// inclusive range
__global__ void bitset_reset_range(DynamicBitset* __restrict__ bitset,
                                   size_t vec_begin, size_t vec_end, bool test1,
                                   size_t bit_index1, uint64_t mask1,
                                   bool test2, size_t bit_index2,
                                   uint64_t mask2) {
  unsigned tid      = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;

  for (size_t src = vec_begin + tid; src < vec_end; src += nthreads) {
    bitset->batch_reset(src);
  }

  if (tid == 0) {
    if (test1) {
      bitset->batch_bitwise_and(bit_index1, mask1);
    }
    if (test2) {
      bitset->batch_bitwise_and(bit_index2, mask2);
    }
  }
}

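// Host-side helper: clear bits [begin, end] (inclusive) of field->is_updated.
// Whole 64-bit words inside the range are zeroed by the kernel; partially
// covered boundary words are handled via the test/bit_index/mask arguments.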
template <typename DataType>
void reset_bitset_field(struct CUDA_Context_Field<DataType>* field,
                        size_t begin, size_t end) {
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);
  const DynamicBitset* bitset_cpu = field->is_updated.cpu_rd_ptr();
  assert(begin <= (bitset_cpu->size() - 1));
  assert(end <= (bitset_cpu->size() - 1));

  size_t vec_begin = (begin + 63) / 64;
  size_t vec_end;

  if (end == (bitset_cpu->size() - 1))
    vec_end = bitset_cpu->vec_size();
  else
    vec_end = (end + 1) / 64; // floor

  size_t begin2 = vec_begin * 64;
  size_t end2   = vec_end * 64;

  bool test1;
  size_t bit_index1;
  uint64_t mask1;

  bool test2;
  size_t bit_index2;
  uint64_t mask2;

  if (begin2 > end2) {
    test2 = false;

    if (begin < begin2) {
      test1       = true;
      bit_index1  = begin / 64;
      size_t diff = begin2 - begin;
      assert(diff < 64);
      mask1 = ((uint64_t)1 << (64 - diff)) - 1;

      // create or mask
      size_t diff2 = end - end2 + 1;
      assert(diff2 < 64);
      mask2 = ~(((uint64_t)1 << diff2) - 1);
      mask1 |= ~mask2;
    } else {
      test1 = false;
    }
  } else {
    if (begin < begin2) {
      test1       = true;
      bit_index1  = begin / 64;
      size_t diff = begin2 - begin;
      assert(diff < 64);
      mask1 = ((uint64_t)1 << (64 - diff)) - 1;
    } else {
      test1 = false;
    }

    if (end >= end2) {
      test2       = true;
      bit_index2  = end / 64;
      size_t diff = end - end2 + 1;
      assert(diff < 64);
      mask2 = ~(((uint64_t)1 << diff) - 1);
    } else {
      test2 = false;
    }
  }

  bitset_reset_range<<<blocks, threads>>>(field->is_updated.gpu_rd_ptr(),
                                          vec_begin, vec_end, test1, bit_index1,
                                          mask1, test2, bit_index2, mask2);
}

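// Host-side helper: set field->data[begin, end) to val on the device.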
template <typename DataType>
void reset_data_field(struct CUDA_Context_Field<DataType>* field, size_t begin,
                      size_t end, DataType val) {
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);

  batch_reset<DataType><<<blocks, threads>>>(
      field->data.gpu_wr_ptr(), (index_type)begin, (index_type)end, val);
}

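// Stream-compact the positions of set bits in `bitset` into `offsets` using
// cub::DeviceSelect::Flagged (called once to size the temporary storage and
// once to perform the selection); the number of set bits is returned through
// num_set_bits.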
void get_offsets_from_bitset(index_type bitset_size,
                             unsigned int* __restrict__ offsets,
                             DynamicBitset* __restrict__ bitset,
                             size_t* __restrict__ num_set_bits) {
  cub::CachingDeviceAllocator g_allocator(
      true); // Caching allocator for device memory
  DynamicBitsetIterator flag_iterator(bitset);
  IdentityIterator offset_iterator;
  Shared<size_t> num_set_bits_ptr;
  num_set_bits_ptr.alloc(1);
  void* d_temp_storage      = NULL;
  size_t temp_storage_bytes = 0;
  cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
                             offset_iterator, flag_iterator, offsets,
                             num_set_bits_ptr.gpu_wr_ptr(true), bitset_size);
  check_cuda_kernel;
  CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
  // CUDA_SAFE_CALL(cudaMalloc(&d_temp_storage, temp_storage_bytes));
  cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
                             offset_iterator, flag_iterator, offsets,
                             num_set_bits_ptr.gpu_wr_ptr(true), bitset_size);
  check_cuda_kernel;
  // CUDA_SAFE_CALL(cudaFree(d_temp_storage));
  if (d_temp_storage)
    CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
  *num_set_bits = *num_set_bits_ptr.cpu_rd_ptr();
}

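// Extract the values of all nodes shared with host from_id into the field's
// staging buffer and copy them into send_buffer in onlyData format
// (data mode + count + raw values). If `reset` is true, the extracted entries
// of the local field are reset to `i`.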
template <typename DataType, SharedType sharedType, bool reset>
void batch_get_shared_field(struct CUDA_Context_Common* ctx,
                            struct CUDA_Context_Field<DataType>* field,
                            unsigned from_id, uint8_t* send_buffer,
                            DataType i = 0) {
  struct CUDA_Context_Shared* shared;
  if (sharedType == sharedMaster) {
    shared = &ctx->master;
  } else { // sharedMirror
    shared = &ctx->mirror;
  }
  DeviceOnly<DataType>* shared_data = &field->shared_data;
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);

  // ggc::Timer timer("timer"), timer1("timer1"), timer2("timer2");
  // timer.start();
  // timer1.start();
  size_t v_size = shared->num_nodes[from_id];
  if (reset) {
    batch_get_reset_subset<DataType><<<blocks, threads>>>(
        v_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(),
        field->data.gpu_wr_ptr(), i);
  } else {
    batch_get_subset<DataType><<<blocks, threads>>>(
        v_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(),
        field->data.gpu_rd_ptr());
  }
  check_cuda_kernel;
  // timer1.stop();
  // timer2.start();
  DataCommMode data_mode = onlyData;
  memcpy(send_buffer, &data_mode, sizeof(data_mode));
  memcpy(send_buffer + sizeof(data_mode), &v_size, sizeof(v_size));
  shared_data->copy_to_cpu(
      (DataType*)(send_buffer + sizeof(data_mode) + sizeof(v_size)), v_size);
  // timer2.stop();
  // timer.stop();
  // fprintf(stderr, "Get %u->%u: Time (ms): %llu + %llu = %llu\n",
  //         ctx->id, from_id,
  //         timer1.duration_ms(), timer2.duration_ms(),
  //         timer.duration_ms());
}

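// Serialize a sync message into send_buffer. The layout depends on data_mode:
// always the mode itself; for anything but onlyData also the element count;
// for gidsData/offsetsData the offsets vector; for bitsetData the dirty-node
// bitset; and finally the packed data values.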
template <typename DataType>
void serializeMessage(struct CUDA_Context_Common* ctx, DataCommMode data_mode,
                      size_t bit_set_count, size_t num_shared,
                      DeviceOnly<DataType>* shared_data, uint8_t* send_buffer) {
  if (data_mode == noData) {
    // do nothing
    return;
  }

  size_t offset = 0;

  // serialize data_mode
  memcpy(send_buffer, &data_mode, sizeof(data_mode));
  offset += sizeof(data_mode);

  if (data_mode != onlyData) {
    // serialize bit_set_count
    memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));
    offset += sizeof(bit_set_count);
  }

  if ((data_mode == gidsData) || (data_mode == offsetsData)) {
    // serialize offsets vector
    memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));
    offset += sizeof(bit_set_count);
    ctx->offsets.copy_to_cpu((unsigned int*)(send_buffer + offset),
                             bit_set_count);
    offset += bit_set_count * sizeof(unsigned int);
  } else if ((data_mode == bitsetData)) {
    // serialize bitset
    memcpy(send_buffer + offset, &num_shared, sizeof(num_shared));
    offset += sizeof(num_shared);
    size_t vec_size = ctx->is_updated.cpu_rd_ptr()->vec_size();
    memcpy(send_buffer + offset, &vec_size, sizeof(vec_size));
    offset += sizeof(vec_size);
    ctx->is_updated.cpu_rd_ptr()->copy_to_cpu(
        (uint64_t*)(send_buffer + offset));
    offset += vec_size * sizeof(uint64_t);
  }

  // serialize data vector
  memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));
  offset += sizeof(bit_set_count);
  shared_data->copy_to_cpu((DataType*)(send_buffer + offset), bit_set_count);
  // offset += bit_set_count * sizeof(DataType);
}

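// As above, but chooses a data mode based on how many shared nodes are
// actually dirty: it builds a per-message bitset from field->is_updated,
// compacts it into ctx->offsets, extracts only the selected values, and
// serializes everything with serializeMessage.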
template <typename DataType, SharedType sharedType, bool reset>
void batch_get_shared_field(struct CUDA_Context_Common* ctx,
                            struct CUDA_Context_Field<DataType>* field,
                            unsigned from_id, uint8_t* send_buffer,
                            size_t* v_size, DataCommMode* data_mode,
                            DataType i = 0) {
  struct CUDA_Context_Shared* shared;
  if (sharedType == sharedMaster) {
    shared = &ctx->master;
  } else { // sharedMirror
    shared = &ctx->mirror;
  }
  DeviceOnly<DataType>* shared_data = &field->shared_data;
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);

  // ggc::Timer timer("timer"), timer1("timer1"), timer2("timer2"),
  // timer3("timer3"), timer4("timer 4"); timer.start();
  if (enforcedDataMode != onlyData) {
    // timer1.start();
    ctx->is_updated.cpu_rd_ptr()->resize(shared->num_nodes[from_id]);
    ctx->is_updated.cpu_rd_ptr()->reset();
    batch_get_subset_bitset<<<blocks, threads>>>(
        shared->num_nodes[from_id], shared->nodes[from_id].device_ptr(),
        ctx->is_updated.gpu_rd_ptr(), field->is_updated.gpu_rd_ptr());
    check_cuda_kernel;
    // timer1.stop();
    // timer2.start();
    get_offsets_from_bitset(shared->num_nodes[from_id],
                            ctx->offsets.device_ptr(),
                            ctx->is_updated.gpu_rd_ptr(), v_size);
    // timer2.stop();
  }
  *data_mode = get_data_mode<DataType>(*v_size, shared->num_nodes[from_id]);
  // timer3.start();
  if ((*data_mode) == onlyData) {
    *v_size = shared->num_nodes[from_id];
    if (reset) {
      batch_get_reset_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->nodes[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(), i);
    } else {
      batch_get_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->nodes[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_rd_ptr());
    }
  } else { // bitsetData || offsetsData
    if (reset) {
      batch_get_reset_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->nodes[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), i);
    } else {
      batch_get_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->nodes[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_rd_ptr());
    }
  }
  check_cuda_kernel;
  // timer3.stop();
  // timer4.start();
  serializeMessage(ctx, *data_mode, *v_size, shared->num_nodes[from_id],
                   shared_data, send_buffer);
  // timer4.stop();
  // timer.stop();
  // fprintf(stderr, "Get %u->%u: %d mode %u bitset %u indices. Time (ms): %llu
  // + %llu + %llu + %llu = %llu\n",
  //         ctx->id, from_id, *data_mode,
  //         ctx->is_updated.cpu_rd_ptr()->alloc_size(), sizeof(unsigned int) *
  //         (*v_size), timer1.duration_ms(), timer2.duration_ms(),
  //         timer3.duration_ms(), timer4.duration_ms(), timer.duration_ms());
}

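// Inverse of serializeMessage: read the counts, offsets or bitset, and data
// values out of recv_buffer (data_mode itself has already been read by the
// caller). gidsData messages are not supported on the GPU side.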
template <typename DataType>
void deserializeMessage(struct CUDA_Context_Common* ctx, DataCommMode data_mode,
                        size_t& bit_set_count, size_t num_shared,
                        DeviceOnly<DataType>* shared_data,
                        uint8_t* recv_buffer) {
  size_t offset = 0; // data_mode is already deserialized

  if (data_mode != onlyData) {
    // deserialize bit_set_count
    memcpy(&bit_set_count, recv_buffer + offset, sizeof(bit_set_count));
    offset += sizeof(bit_set_count);
  } else {
    bit_set_count = num_shared;
  }

  assert(data_mode != gidsData); // not supported for deserialization on GPUs
  if (data_mode == offsetsData) {
    // deserialize offsets vector
    offset += sizeof(bit_set_count);
    ctx->offsets.copy_to_gpu((unsigned int*)(recv_buffer + offset),
                             bit_set_count);
    offset += bit_set_count * sizeof(unsigned int);
  } else if ((data_mode == bitsetData)) {
    // deserialize bitset
    ctx->is_updated.cpu_rd_ptr()->resize(num_shared);
    offset += sizeof(num_shared);
    size_t vec_size = ctx->is_updated.cpu_rd_ptr()->vec_size();
    offset += sizeof(vec_size);
    ctx->is_updated.cpu_rd_ptr()->copy_to_gpu(
        (uint64_t*)(recv_buffer + offset));
    offset += vec_size * sizeof(uint64_t);
    // get offsets
    size_t v_size;
    get_offsets_from_bitset(num_shared, ctx->offsets.device_ptr(),
                            ctx->is_updated.gpu_rd_ptr(), &v_size);

    assert(bit_set_count == v_size);
  }

  // deserialize data vector
  offset += sizeof(bit_set_count);
  shared_data->copy_to_gpu((DataType*)(recv_buffer + offset), bit_set_count);
  // offset += bit_set_count * sizeof(DataType);
}

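// Receive side of a field sync: deserialize the message from host from_id and
// apply it to the local field with the requested update operator (setOp,
// addOp, or minOp), picking the scatter/reduction kernel that matches the
// data mode.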
template <typename DataType, SharedType sharedType, UpdateOp op>
void batch_set_shared_field(struct CUDA_Context_Common* ctx,
                            struct CUDA_Context_Field<DataType>* field,
                            unsigned from_id, uint8_t* recv_buffer,
                            DataCommMode data_mode) {
  assert(data_mode != noData);
  struct CUDA_Context_Shared* shared;
  if (sharedType == sharedMaster) {
    shared = &ctx->master;
  } else { // sharedMirror
    shared = &ctx->mirror;
  }
  DeviceOnly<DataType>* shared_data = &field->shared_data;
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);
  size_t v_size;

  // ggc::Timer timer("timer"), timer1("timer1"), timer2("timer2");
  // timer.start();
  // timer1.start();
  deserializeMessage(ctx, data_mode, v_size, shared->num_nodes[from_id],
                     shared_data, recv_buffer);
  // timer1.stop();
  // timer2.start();
  if (data_mode == onlyData) {
    if (op == setOp) {
      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->nodes[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(),
          field->is_updated.gpu_wr_ptr());
    } else if (op == addOp) {
      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->nodes[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(),
          field->is_updated.gpu_wr_ptr());
    } else if (op == minOp) {
      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->nodes[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(),
          field->is_updated.gpu_wr_ptr());
    }
  } else if (data_mode == gidsData) {
    if (op == setOp) {
      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == addOp) {
      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == minOp) {
      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    }
  } else { // bitsetData || offsetsData
    if (op == setOp) {
      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->nodes[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == addOp) {
      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->nodes[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == minOp) {
      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->nodes[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    }
  }
  check_cuda_kernel;
  // timer2.stop();
  // timer.stop();
  // fprintf(stderr, "Set %u<-%u: %d mode Time (ms): %llu + %llu = %llu\n",
  //         ctx->id, from_id, data_mode,
  //         timer1.duration_ms(), timer2.duration_ms(),
  //         timer.duration_ms());
}