#include "cub/util_allocator.cuh"
#include "cub/device/device_select.cuh"

#ifdef GALOIS_CUDA_CHECK_ERROR
#define check_cuda_kernel                                                      \
  check_cuda(cudaDeviceSynchronize());                                         \
  check_cuda(cudaGetLastError());
#else
#define check_cuda_kernel check_cuda(cudaGetLastError());
#endif

enum SharedType { sharedMaster, sharedMirror };
enum UpdateOp { setOp, addOp, minOp };
void kernel_sizing(dim3& blocks, dim3& threads) {
  threads.x = 256;
  threads.y = threads.z = 1;
  blocks.x  = ggc_get_nSM() * 8;
  blocks.y = blocks.z = 1;
}
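// Usage sketch (illustrative, not part of the original header): every host
// wrapper below sizes its launches this way and checks the result with the
// check_cuda_kernel macro, e.g.
//   dim3 blocks, threads;
//   kernel_sizing(blocks, threads);
//   batch_get_subset<float><<<blocks, threads>>>(n, indices, subset, array);
//   check_cuda_kernel;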
// Gather array[indices[src]] into the packed buffer subset using a
// grid-stride loop.
template <typename DataType>
__global__ void batch_get_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 DataType* __restrict__ subset,
                                 const DataType* __restrict__ array) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    subset[src]    = array[index];
  }
}
template <typename DataType, typename OffsetIteratorType>
__global__ void batch_get_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 DataType* __restrict__ subset,
                                 const DataType* __restrict__ array) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    subset[src]    = array[index];
  }
}
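// The offsets overloads (here and below) add one level of indirection:
// offsets[src] selects which entries of indices to touch, so only the
// updated subset identified by the bitset/offsets communication modes is
// gathered or scattered.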
template <typename DataType>
__global__ void batch_get_reset_subset(index_type subset_size,
                                       const unsigned int* __restrict__ indices,
                                       DataType* __restrict__ subset,
                                       DataType* __restrict__ array,
                                       DataType reset_value) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    subset[src]    = array[index];
    array[index]   = reset_value;
  }
}
template <typename DataType, typename OffsetIteratorType>
__global__ void batch_get_reset_subset(index_type subset_size,
                                       const unsigned int* __restrict__ indices,
                                       const OffsetIteratorType offsets,
                                       DataType* __restrict__ subset,
                                       DataType* __restrict__ array,
                                       DataType reset_value) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    subset[src]    = array[index];
    array[index]   = reset_value;
  }
}
template <typename DataType, SharedType sharedType>
__global__ void batch_set_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    array[index]   = subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}
template <typename DataType, SharedType sharedType,
          typename OffsetIteratorType>
__global__ void batch_set_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    array[index]   = subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}
template <typename DataType, SharedType sharedType>
__global__ void batch_add_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    array[index] += subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}
template <typename DataType, SharedType sharedType,
          typename OffsetIteratorType>
__global__ void batch_add_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    array[index] += subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}
template <typename DataType, SharedType sharedType>
__global__ void batch_min_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    if (array[index] > subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}
template <typename DataType, SharedType sharedType,
          typename OffsetIteratorType>
__global__ void batch_min_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    if (array[index] > subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}
template <typename DataType, SharedType sharedType>
__global__ void batch_max_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    if (array[index] < subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}
template <typename DataType, SharedType sharedType,
          typename OffsetIteratorType>
__global__ void batch_max_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    if (array[index] < subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}
template <typename DataType>
__global__ void batch_reset(DataType* __restrict__ array, index_type begin,
                            index_type end, DataType val) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = end;
  for (index_type src = begin + tid; src < src_end; src += nthreads) {
    array[src] = val;
  }
}
// Mark position src in the packed subset bitset whenever the corresponding
// shared node is flagged as updated in the field's bitset.
__global__ void
batch_get_subset_bitset(index_type subset_size,
                        const unsigned int* __restrict__ indices,
                        DynamicBitset* __restrict__ is_subset_updated,
                        DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid       = TID_1D;
  unsigned nthreads  = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    if (is_array_updated->test(index)) {
      is_subset_updated->set(src);
    }
  }
}
// Resets the (inclusive) bit range prepared by reset_bitset_field below:
// whole 64-bit words in [vec_begin, vec_end) are cleared, and the two
// optional masks clear the partial words at either end of the range.
__global__ void bitset_reset_range(DynamicBitset* __restrict__ bitset,
                                   size_t vec_begin, size_t vec_end, bool test1,
                                   size_t bit_index1, uint64_t mask1,
                                   bool test2, size_t bit_index2,
                                   uint64_t mask2) {
  unsigned tid      = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;

  for (size_t src = vec_begin + tid; src < vec_end; src += nthreads) {
    bitset->batch_reset(src);
  }

  if (tid == 0) {
    if (test1) {
      bitset->batch_bitwise_and(bit_index1, mask1);
    }
    if (test2) {
      bitset->batch_bitwise_and(bit_index2, mask2);
    }
  }
}
template <typename DataType>
void reset_bitset_field(struct CUDA_Context_Field_Edges<DataType>* field,
                        size_t begin, size_t end) {
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);
  const DynamicBitset* bitset_cpu = field->is_updated.cpu_rd_ptr();
  assert(begin <= (bitset_cpu->size() - 1));
  assert(end <= (bitset_cpu->size() - 1));

  // 64-bit words that lie entirely inside [begin, end]
  size_t vec_begin = (begin + 63) / 64;
  size_t vec_end;
  if (end == (bitset_cpu->size() - 1))
    vec_end = bitset_cpu->vec_size();
  else
    vec_end = (end + 1) / 64; // floor

  size_t begin2 = vec_begin * 64;
  size_t end2   = vec_end * 64;

  // masks that clear the partial words at either end of the range
  bool test1 = false, test2 = false;
  size_t bit_index1 = 0, bit_index2 = 0;
  uint64_t mask1 = 0, mask2 = 0;

  if (begin2 > end2) {
    // the whole range lies inside a single word: combine both masks
    if (begin < begin2) {
      test1       = true;
      bit_index1  = begin / 64;
      size_t diff = begin2 - begin;
      assert(diff < 64);
      mask1 = ((uint64_t)1 << (64 - diff)) - 1;
      size_t diff2 = end - end2 + 1;
      assert(diff2 < 64);
      mask2 = ~(((uint64_t)1 << diff2) - 1);
      mask1 &= mask2;
    }
  } else {
    if (begin < begin2) {
      test1       = true;
      bit_index1  = begin / 64;
      size_t diff = begin2 - begin;
      assert(diff < 64);
      mask1 = ((uint64_t)1 << (64 - diff)) - 1;
    }
    if (end >= end2) {
      test2       = true;
      bit_index2  = end / 64;
      size_t diff = end - end2 + 1;
      assert(diff < 64);
      mask2 = ~(((uint64_t)1 << diff) - 1);
    }
  }

  bitset_reset_range<<<blocks, threads>>>(field->is_updated.gpu_rd_ptr(),
                                          vec_begin, vec_end, test1, bit_index1,
                                          mask1, test2, bit_index2, mask2);
}
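// Worked example (illustrative): begin = 5, end = 130 on a larger bitset.
// vec_begin = (5 + 63) / 64 = 1 and vec_end = 131 / 64 = 2, so word 1
// (bits 64..127) is cleared wholesale. begin2 = 64, end2 = 128. Since
// begin < begin2: bit_index1 = 0, diff = 59, mask1 = 0x1F, which keeps bits
// 0..4 and clears bits 5..63. Since end >= end2: bit_index2 = 2, diff = 3,
// mask2 = ~0x7, which clears bits 128..130 and keeps the rest of word 2.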
template <typename DataType>
void reset_data_field(struct CUDA_Context_Field_Edges<DataType>* field,
                      size_t begin, size_t end, DataType val) {
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);

  batch_reset<DataType><<<blocks, threads>>>(
      field->data.gpu_wr_ptr(), (index_type)begin, (index_type)end, val);
}
void get_offsets_from_bitset(index_type bitset_size,
                             unsigned int* __restrict__ offsets,
                             DynamicBitset* __restrict__ bitset,
                             size_t* __restrict__ num_set_bits) {
  cub::CachingDeviceAllocator g_allocator(true); // caching device allocator
  DynamicBitsetIterator flag_iterator(bitset);
  IdentityIterator offset_iterator;
  Shared<size_t> num_set_bits_ptr;
  num_set_bits_ptr.alloc(1);
  void* d_temp_storage      = NULL;
  size_t temp_storage_bytes = 0;
  // standard CUB two-pass pattern: the first call, with d_temp_storage == NULL,
  // only computes the required temporary storage size
  cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
                             offset_iterator, flag_iterator, offsets,
                             num_set_bits_ptr.gpu_wr_ptr(true), bitset_size);
  check_cuda_kernel;
  CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
  // second call compacts the positions of the set bits into offsets
  cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
                             offset_iterator, flag_iterator, offsets,
                             num_set_bits_ptr.gpu_wr_ptr(true), bitset_size);
  check_cuda_kernel;
  CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
  *num_set_bits = *num_set_bits_ptr.cpu_rd_ptr();
}
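// Usage sketch (illustrative): the senders below use this to turn the
// per-node "updated" bitset into a compact offsets array, e.g.
//   size_t count;
//   get_offsets_from_bitset(num_shared, ctx->offsets.device_ptr(),
//                           ctx->is_updated.gpu_rd_ptr(), &count);
//   // ctx->offsets now holds the positions of the set bits, count their number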
template <typename DataType, SharedType sharedType, bool reset>
void batch_get_shared_field(struct CUDA_Context_Common_Edges* ctx,
                            struct CUDA_Context_Field_Edges<DataType>* field,
                            unsigned from_id, uint8_t* send_buffer,
                            DataType i = 0) {
  struct CUDA_Context_Shared* shared;
  if (sharedType == sharedMaster) {
    shared = &ctx->master;
  } else { // sharedMirror
    shared = &ctx->mirror;
  }
  DeviceOnly<DataType>* shared_data = &field->shared_data;
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);

  size_t v_size = shared->num_nodes[from_id];
  if (reset) {
    batch_get_reset_subset<DataType><<<blocks, threads>>>(
        v_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(),
        field->data.gpu_wr_ptr(), i);
  } else {
    batch_get_subset<DataType><<<blocks, threads>>>(
        v_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(),
        field->data.gpu_rd_ptr());
  }
  check_cuda_kernel;

  // message layout for this variant: [data_mode][v_size][payload]
  DataCommMode data_mode = onlyData;
  memcpy(send_buffer, &data_mode, sizeof(data_mode));
  memcpy(send_buffer + sizeof(data_mode), &v_size, sizeof(v_size));
  shared_data->copy_to_cpu(
      (DataType*)(send_buffer + sizeof(data_mode) + sizeof(v_size)), v_size);
}
template <typename DataType>
void serializeMessage(struct CUDA_Context_Common_Edges* ctx,
                      DataCommMode data_mode, size_t bit_set_count,
                      size_t num_shared, DeviceOnly<DataType>* shared_data,
                      uint8_t* send_buffer) {
  if (data_mode == noData) {
    // nothing to send
    return;
  }

  size_t offset = 0;

  // serialize data_mode
  memcpy(send_buffer, &data_mode, sizeof(data_mode));
  offset += sizeof(data_mode);

  if (data_mode != onlyData) {
    // serialize bit_set_count
    memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));
    offset += sizeof(bit_set_count);
  }

  if ((data_mode == gidsData) || (data_mode == offsetsData)) {
    // serialize offsets vector
    memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));
    offset += sizeof(bit_set_count);
    ctx->offsets.copy_to_cpu((unsigned int*)(send_buffer + offset),
                             bit_set_count);
    offset += bit_set_count * sizeof(unsigned int);
  } else if (data_mode == bitsetData) {
    // serialize bitset
    memcpy(send_buffer + offset, &num_shared, sizeof(num_shared));
    offset += sizeof(num_shared);
    size_t vec_size = ctx->is_updated.cpu_rd_ptr()->vec_size();
    memcpy(send_buffer + offset, &vec_size, sizeof(vec_size));
    offset += sizeof(vec_size);
    ctx->is_updated.cpu_rd_ptr()->copy_to_cpu(
        (uint64_t*)(send_buffer + offset));
    offset += vec_size * sizeof(uint64_t);
  }

  // serialize data vector
  memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));
  offset += sizeof(bit_set_count);
  shared_data->copy_to_cpu((DataType*)(send_buffer + offset), bit_set_count);
}
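// Resulting buffer layouts, as assembled above (illustrative summary):
//   onlyData:               [data_mode][bit_set_count][data...]
//   gidsData / offsetsData: [data_mode][bit_set_count][bit_set_count]
//                           [offsets...][bit_set_count][data...]
//   bitsetData:             [data_mode][bit_set_count][num_shared][vec_size]
//                           [bitset words...][bit_set_count][data...]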
template <typename DataType, SharedType sharedType, bool reset>
void batch_get_shared_field(struct CUDA_Context_Common_Edges* ctx,
                            struct CUDA_Context_Field_Edges<DataType>* field,
                            unsigned from_id, uint8_t* send_buffer,
                            size_t* v_size, DataCommMode* data_mode,
                            DataType i = 0) {
  struct CUDA_Context_Shared* shared;
  if (sharedType == sharedMaster) {
    shared = &ctx->master;
  } else { // sharedMirror
    shared = &ctx->mirror;
  }
  DeviceOnly<DataType>* shared_data = &field->shared_data;
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);

  if (enforcedDataMode != onlyData) {
    // compute the subset of shared nodes whose values actually changed
    ctx->is_updated.cpu_rd_ptr()->resize(shared->num_nodes[from_id]);
    ctx->is_updated.cpu_rd_ptr()->reset();
    batch_get_subset_bitset<<<blocks, threads>>>(
        shared->num_nodes[from_id], shared->nodes[from_id].device_ptr(),
        ctx->is_updated.gpu_rd_ptr(), field->is_updated.gpu_rd_ptr());
    check_cuda_kernel;
    get_offsets_from_bitset(shared->num_nodes[from_id],
                            ctx->offsets.device_ptr(),
                            ctx->is_updated.gpu_rd_ptr(), v_size);
  }
  *data_mode = get_data_mode<DataType>(*v_size, shared->num_nodes[from_id]);

  if ((*data_mode) == onlyData) {
    *v_size = shared->num_nodes[from_id];
    if (reset) {
      batch_get_reset_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->nodes[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(), i);
    } else {
      batch_get_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->nodes[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_rd_ptr());
    }
  } else { // bitsetData || offsetsData
    if (reset) {
      batch_get_reset_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->nodes[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), i);
    } else {
      batch_get_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->nodes[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_rd_ptr());
    }
  }
  check_cuda_kernel;

  serializeMessage(ctx, *data_mode, *v_size, shared->num_nodes[from_id],
                   shared_data, send_buffer);
}
template <typename DataType>
void deserializeMessage(struct CUDA_Context_Common_Edges* ctx,
                        DataCommMode data_mode, size_t& bit_set_count,
                        size_t num_shared,
                        DeviceOnly<DataType>* shared_data,
                        uint8_t* recv_buffer) {
  size_t offset = 0; // data_mode has already been read by the caller

  if (data_mode != onlyData) {
    // deserialize bit_set_count
    memcpy(&bit_set_count, recv_buffer + offset, sizeof(bit_set_count));
    offset += sizeof(bit_set_count);
  } else {
    bit_set_count = num_shared;
  }

  if ((data_mode == gidsData) || (data_mode == offsetsData)) {
    // deserialize offsets vector
    offset += sizeof(bit_set_count);
    ctx->offsets.copy_to_gpu((unsigned int*)(recv_buffer + offset),
                             bit_set_count);
    offset += bit_set_count * sizeof(unsigned int);
  } else if (data_mode == bitsetData) {
    // deserialize bitset and recover the offsets from it on the device
    ctx->is_updated.cpu_rd_ptr()->resize(num_shared);
    offset += sizeof(num_shared);
    size_t vec_size = ctx->is_updated.cpu_rd_ptr()->vec_size();
    offset += sizeof(vec_size);
    ctx->is_updated.cpu_rd_ptr()->copy_to_gpu(
        (uint64_t*)(recv_buffer + offset));
    offset += vec_size * sizeof(uint64_t);
    size_t v_size;
    get_offsets_from_bitset(num_shared, ctx->offsets.device_ptr(),
                            ctx->is_updated.gpu_rd_ptr(), &v_size);
    assert(bit_set_count == v_size);
  }

  // deserialize data vector
  offset += sizeof(bit_set_count);
  shared_data->copy_to_gpu((DataType*)(recv_buffer + offset), bit_set_count);
}
template <typename DataType, SharedType sharedType, UpdateOp op>
void batch_set_shared_field(struct CUDA_Context_Common_Edges* ctx,
                            struct CUDA_Context_Field_Edges<DataType>* field,
                            unsigned from_id, uint8_t* recv_buffer,
                            DataCommMode data_mode) {
  assert(data_mode != noData);
  struct CUDA_Context_Shared* shared;
  if (sharedType == sharedMaster) {
    shared = &ctx->master;
  } else { // sharedMirror
    shared = &ctx->mirror;
  }
  DeviceOnly<DataType>* shared_data = &field->shared_data;
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);
  size_t v_size;

  deserializeMessage(ctx, data_mode, v_size, shared->num_nodes[from_id],
                     shared_data, recv_buffer);

  if (data_mode == onlyData) {
    if (op == setOp) {
      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->nodes[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(),
          field->is_updated.gpu_wr_ptr());
    } else if (op == addOp) {
      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->nodes[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(),
          field->is_updated.gpu_wr_ptr());
    } else if (op == minOp) {
      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->nodes[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(),
          field->is_updated.gpu_wr_ptr());
    }
  } else if (data_mode == gidsData) {
    if (op == setOp) {
      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == addOp) {
      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == minOp) {
      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    }
  } else { // bitsetData || offsetsData
    if (op == setOp) {
      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->nodes[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == addOp) {
      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->nodes[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == minOp) {
      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->nodes[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    }
  }
  check_cuda_kernel;
}
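// Illustrative instantiation (not part of the original header; the element
// type and the calling wrapper are assumptions): the Gluon substrate invokes
// these per peer host, roughly as
//   batch_get_shared_field<float, sharedMirror, false>(ctx, &field, host,
//                                                      send_buf, &v_size,
//                                                      &mode);
//   batch_set_shared_field<float, sharedMaster, addOp>(ctx, &field, host,
//                                                      recv_buf, mode);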