#include "cub/util_allocator.cuh"
#ifdef GALOIS_CUDA_CHECK_ERROR
#define check_cuda_kernel                                                      \
  check_cuda(cudaDeviceSynchronize());                                         \
  check_cuda(cudaGetLastError());
#else
#define check_cuda_kernel check_cuda(cudaGetLastError());
#endif
enum SharedType { sharedMaster, sharedMirror };
enum UpdateOp { setOp, addOp, minOp };

void kernel_sizing(dim3& blocks, dim3& threads) {
  threads.x = 256;
  threads.y = threads.z = 1;
  blocks.x = ggc_get_nSM() * 8;
  blocks.y = blocks.z = 1;
}
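// Example launch (sketch, not part of the original header): every kernel
// below is sized the same way. n, d_indices, d_subset, and d_array are
// placeholder device pointers/sizes owned by the caller.
//
//   dim3 blocks, threads;
//   kernel_sizing(blocks, threads);
//   batch_get_subset<float><<<blocks, threads>>>(n, d_indices, d_subset,
//                                                d_array);
//   check_cuda_kernel;
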
// Gather: subset[i] = array[indices[i]].
template <typename DataType>
__global__ void batch_get_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 DataType* __restrict__ subset,
                                 const DataType* __restrict__ array) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    subset[src] = array[index];
  }
}
// Gather through an extra indirection: subset[i] = array[indices[offsets[i]]].
template <typename DataType, typename OffsetIteratorType>
__global__ void batch_get_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 DataType* __restrict__ subset,
                                 const DataType* __restrict__ array) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    subset[src] = array[index];
  }
}
// Gather and reset: copy the element out, then overwrite it with reset_value.
template <typename DataType>
__global__ void batch_get_reset_subset(index_type subset_size,
                                       const unsigned int* __restrict__ indices,
                                       DataType* __restrict__ subset,
                                       DataType* __restrict__ array,
                                       DataType reset_value) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    subset[src] = array[index];
    array[index] = reset_value;
  }
}
// Gather-and-reset variant that goes through the offsets indirection.
template <typename DataType, typename OffsetIteratorType>
__global__ void batch_get_reset_subset(index_type subset_size,
                                       const unsigned int* __restrict__ indices,
                                       const OffsetIteratorType offsets,
                                       DataType* __restrict__ subset,
                                       DataType* __restrict__ array,
                                       DataType reset_value) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    subset[src] = array[index];
    array[index] = reset_value;
  }
}
// Scatter: array[indices[i]] = subset[i]; mark the element as updated unless
// this copy is a mirror.
template <typename DataType, SharedType sharedType>
__global__ void batch_set_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    array[index] = subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}
// Scatter variant that goes through the offsets indirection.
template <typename DataType, SharedType sharedType,
          typename OffsetIteratorType>
__global__ void batch_set_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    array[index] = subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}
// Accumulate: array[indices[i]] += subset[i].
template <typename DataType, SharedType sharedType>
__global__ void batch_add_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    array[index] += subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}
// Accumulate variant that goes through the offsets indirection.
template <typename DataType, SharedType sharedType,
          typename OffsetIteratorType>
__global__ void batch_add_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    array[index] += subset[src];
    if (sharedType != sharedMirror) {
      is_array_updated->set(index);
    }
  }
}
// Minimum reduction: only write (and mark updated) when the incoming value
// is smaller.
template <typename DataType, SharedType sharedType>
__global__ void batch_min_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    if (array[index] > subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}
// Minimum-reduction variant that goes through the offsets indirection.
template <typename DataType, SharedType sharedType,
          typename OffsetIteratorType>
__global__ void batch_min_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    if (array[index] > subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}
// Maximum reduction: only write (and mark updated) when the incoming value
// is larger.
template <typename DataType, SharedType sharedType>
__global__ void batch_max_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    if (array[index] < subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}
// Maximum-reduction variant that goes through the offsets indirection.
template <typename DataType, SharedType sharedType,
          typename OffsetIteratorType>
__global__ void batch_max_subset(index_type subset_size,
                                 const unsigned int* __restrict__ indices,
                                 const OffsetIteratorType offsets,
                                 const DataType* __restrict__ subset,
                                 DataType* __restrict__ array,
                                 DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[offsets[src]];
    if (array[index] < subset[src]) {
      array[index] = subset[src];
      if (sharedType != sharedMirror) {
        is_array_updated->set(index);
      }
    }
  }
}
// Fill array[begin, end) with val.
template <typename DataType>
__global__ void batch_reset(DataType* __restrict__ array, index_type begin,
                            index_type end, DataType val) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = end;
  for (index_type src = begin + tid; src < src_end; src += nthreads) {
    array[src] = val;
  }
}
// Project the array's dirty bits onto the shared subset: if the backing
// element was updated, mark the corresponding subset position.
__global__ void
batch_get_subset_bitset(index_type subset_size,
                        const unsigned int* __restrict__ indices,
                        DynamicBitset* __restrict__ is_subset_updated,
                        DynamicBitset* __restrict__ is_array_updated) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;
  index_type src_end = subset_size;
  for (index_type src = 0 + tid; src < src_end; src += nthreads) {
    unsigned index = indices[src];
    if (is_array_updated->test(index)) {
      is_subset_updated->set(src);
    }
  }
}
// Clear the fully covered words [vec_begin, vec_end) wholesale; the partial
// boundary words are cleared by AND-ing them with the precomputed masks.
__global__ void bitset_reset_range(DynamicBitset* __restrict__ bitset,
                                   size_t vec_begin, size_t vec_end, bool test1,
                                   size_t bit_index1, uint64_t mask1,
                                   bool test2, size_t bit_index2,
                                   uint64_t mask2) {
  unsigned tid = TID_1D;
  unsigned nthreads = TOTAL_THREADS_1D;

  for (size_t src = vec_begin + tid; src < vec_end; src += nthreads) {
    bitset->batch_reset(src);
  }

  if (tid == 0) {
    if (test1) {
      bitset->batch_bitwise_and(bit_index1, mask1);
    }
    if (test2) {
      bitset->batch_bitwise_and(bit_index2, mask2);
    }
  }
}
// Reset the field's is_updated bitset over the bit range [begin, end].
template <typename DataType>
void reset_bitset_field(struct CUDA_Context_Field_Edges<DataType>* field,
                        size_t begin, size_t end) {
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);
  const DynamicBitset* bitset_cpu = field->is_updated.cpu_rd_ptr();
  assert(begin <= (bitset_cpu->size() - 1));
  assert(end <= (bitset_cpu->size() - 1));

  // [vec_begin, vec_end) are the 64-bit words fully covered by [begin, end]
  size_t vec_begin = (begin + 63) / 64;
  size_t vec_end;
  if (end == (bitset_cpu->size() - 1))
    vec_end = bitset_cpu->vec_size();
  else
    vec_end = (end + 1) / 64; // floor

  size_t begin2 = vec_begin * 64;
  size_t end2 = vec_end * 64;

  bool test1 = false;
  size_t bit_index1 = 0;
  uint64_t mask1 = 0;
  bool test2 = false;
  size_t bit_index2 = 0;
  uint64_t mask2 = 0;

  if (begin2 > end2) {
    // begin and end fall inside the same word: build one combined mask
    if (begin < begin2) {
      test1 = true;
      bit_index1 = begin / 64;
      size_t diff = begin2 - begin;
      assert(diff < 64);
      mask1 = ((uint64_t)1 << (64 - diff)) - 1; // keeps the bits below begin

      size_t diff2 = end - end2 + 1;
      assert(diff2 < 64);
      mask2 = ~(((uint64_t)1 << diff2) - 1); // keeps the bits above end
      mask1 |= mask2;
    }
  } else {
    // partial word before the fully covered range
    if (begin < begin2) {
      test1 = true;
      bit_index1 = begin / 64;
      size_t diff = begin2 - begin;
      assert(diff < 64);
      mask1 = ((uint64_t)1 << (64 - diff)) - 1; // keeps the bits below begin
    }
    // partial word after the fully covered range
    if (end >= end2) {
      test2 = true;
      bit_index2 = end / 64;
      size_t diff = end - end2 + 1;
      assert(diff < 64);
      mask2 = ~(((uint64_t)1 << diff) - 1); // keeps the bits above end
    }
  }

  bitset_reset_range<<<blocks, threads>>>(field->is_updated.gpu_rd_ptr(),
                                          vec_begin, vec_end, test1, bit_index1,
                                          mask1, test2, bit_index2, mask2);
  check_cuda_kernel;
}
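// Worked example for the mask computation above (bitset size 256, begin = 70,
// end = 197): vec_begin = 2 and vec_end = 3, so word 2 (bits 128..191) is
// cleared wholesale by bitset_reset_range. Since begin (70) < begin2 (128),
// mask1 = (1 << 6) - 1 = 0x3f keeps bits 64..69 of word 1 and clears 70..127;
// since end (197) >= end2 (192), mask2 = ~((1 << 6) - 1) keeps bits 198..255
// of word 3 and clears 192..197. Together exactly [70, 197] is reset.
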
// Reset the field's data over [begin, end) to val.
template <typename DataType>
void reset_data_field(struct CUDA_Context_Field_Edges<DataType>* field,
                      size_t begin, size_t end, DataType val) {
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);

  batch_reset<DataType><<<blocks, threads>>>(
      field->data.gpu_wr_ptr(), (index_type)begin, (index_type)end, val);
  check_cuda_kernel;
}
// Compact the positions of the set bits of `bitset` into `offsets` with
// cub::DeviceSelect::Flagged: the first call only sizes the scratch buffer,
// the second performs the actual selection.
void get_offsets_from_bitset(index_type bitset_size,
                             unsigned int* __restrict__ offsets,
                             DynamicBitset* __restrict__ bitset,
                             size_t* __restrict__ num_set_bits) {
  cub::CachingDeviceAllocator g_allocator(
      true); // caching allocator for device memory
  DynamicBitsetIterator flag_iterator(bitset);
  IdentityIterator offset_iterator;
  Shared<size_t> num_set_bits_ptr;
  num_set_bits_ptr.alloc(1);
  void* d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
                             offset_iterator, flag_iterator, offsets,
                             num_set_bits_ptr.gpu_wr_ptr(true), bitset_size);
  check_cuda_kernel;
  CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
  cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
                             offset_iterator, flag_iterator, offsets,
                             num_set_bits_ptr.gpu_wr_ptr(true), bitset_size);
  check_cuda_kernel;
  CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
  *num_set_bits = *num_set_bits_ptr.cpu_rd_ptr();
}
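// The same two-pass CUB idiom with plain device buffers, for reference
// (sketch only; d_in, d_flags, d_out, d_num_out, and num_items are
// hypothetical caller-owned allocations, not part of this header):
//
//   void* d_temp_storage = NULL;
//   size_t temp_storage_bytes = 0;
//   // pass 1: with a NULL scratch pointer, CUB only reports the scratch size
//   cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in,
//                              d_flags, d_out, d_num_out, num_items);
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//   // pass 2: the actual stream compaction
//   cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in,
//                              d_flags, d_out, d_num_out, num_items);
//   cudaFree(d_temp_storage);
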
// Gather the edge values shared with host from_id and pack them into
// send_buffer using the onlyData layout.
template <typename DataType, SharedType sharedType, bool reset>
void batch_get_shared_edge(struct CUDA_Context_Common_Edges* ctx,
                           struct CUDA_Context_Field_Edges<DataType>* field,
                           unsigned from_id, uint8_t* send_buffer,
                           DataType i = 0) {
  struct CUDA_Context_Shared_Edges* shared;
  if (sharedType == sharedMaster) {
    shared = &ctx->master;
  } else { // sharedMirror
    shared = &ctx->mirror;
  }
  DeviceOnly<DataType>* shared_data = &field->shared_data;
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);

  size_t v_size = shared->num_edges[from_id];
  if (reset) {
    batch_get_reset_subset<DataType><<<blocks, threads>>>(
        v_size, shared->edges[from_id].device_ptr(), shared_data->device_ptr(),
        field->data.gpu_wr_ptr(), i);
  } else {
    batch_get_subset<DataType><<<blocks, threads>>>(
        v_size, shared->edges[from_id].device_ptr(), shared_data->device_ptr(),
        field->data.gpu_rd_ptr());
  }
  check_cuda_kernel;

  DataCommMode data_mode = onlyData;
  memcpy(send_buffer, &data_mode, sizeof(data_mode));
  memcpy(send_buffer + sizeof(data_mode), &v_size, sizeof(v_size));
  shared_data->copy_to_cpu(
      (DataType*)(send_buffer + sizeof(data_mode) + sizeof(v_size)), v_size);
}
// Pack the metadata and the staged values into send_buffer according to
// data_mode.
template <typename DataType>
void serializeMessage(struct CUDA_Context_Common_Edges* ctx,
                      DataCommMode data_mode, size_t bit_set_count,
                      size_t num_shared, DeviceOnly<DataType>* shared_data,
                      uint8_t* send_buffer) {
  if (data_mode == noData) {
    // do nothing
    return;
  }

  size_t offset = 0;

  // serialize data_mode
  memcpy(send_buffer, &data_mode, sizeof(data_mode));
  offset += sizeof(data_mode);

  if (data_mode != onlyData) {
    // serialize bit_set_count
    memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));
    offset += sizeof(bit_set_count);
  }

  if ((data_mode == gidsData) || (data_mode == offsetsData)) {
    // serialize offsets vector
    memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));
    offset += sizeof(bit_set_count);
    ctx->offsets.copy_to_cpu((unsigned int*)(send_buffer + offset),
                             bit_set_count);
    offset += bit_set_count * sizeof(unsigned int);
  } else if (data_mode == bitsetData) {
    // serialize bitset
    memcpy(send_buffer + offset, &num_shared, sizeof(num_shared));
    offset += sizeof(num_shared);
    size_t vec_size = ctx->is_updated.cpu_rd_ptr()->vec_size();
    memcpy(send_buffer + offset, &vec_size, sizeof(vec_size));
    offset += sizeof(vec_size);
    ctx->is_updated.cpu_rd_ptr()->copy_to_cpu(
        (uint64_t*)(send_buffer + offset));
    offset += vec_size * sizeof(uint64_t);
  }

  // serialize data vector
  memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));
  offset += sizeof(bit_set_count);
  shared_data->copy_to_cpu((DataType*)(send_buffer + offset), bit_set_count);
}
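// Buffer layout written by serializeMessage, as implied by the branches
// above (deserializeMessage below walks the same layout; data_mode itself is
// consumed by the caller on the receive side):
//
//   onlyData:               [data_mode][count][values...]
//   gidsData / offsetsData: [data_mode][count][count][offsets...][count][values...]
//   bitsetData:             [data_mode][count][num_shared][vec_size][bitset words...][count][values...]
//   noData:                 nothing is written
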
// Gather only the updated shared edge values for host from_id, choose a
// communication mode, and serialize everything into send_buffer.
template <typename DataType, SharedType sharedType, bool reset>
void batch_get_shared_edge(struct CUDA_Context_Common_Edges* ctx,
                           struct CUDA_Context_Field_Edges<DataType>* field,
                           unsigned from_id, uint8_t* send_buffer,
                           size_t* v_size, DataCommMode* data_mode,
                           DataType i = 0) {
  struct CUDA_Context_Shared_Edges* shared;
  if (sharedType == sharedMaster) {
    shared = &ctx->master;
  } else { // sharedMirror
    shared = &ctx->mirror;
  }
  DeviceOnly<DataType>* shared_data = &field->shared_data;
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);

  if (enforcedDataMode != onlyData) {
    // compute the dirty subset and its compacted offsets
    ctx->is_updated.cpu_rd_ptr()->resize(shared->num_edges[from_id]);
    ctx->is_updated.cpu_rd_ptr()->reset();
    batch_get_subset_bitset<<<blocks, threads>>>(
        shared->num_edges[from_id], shared->edges[from_id].device_ptr(),
        ctx->is_updated.gpu_rd_ptr(), field->is_updated.gpu_rd_ptr());
    check_cuda_kernel;
    get_offsets_from_bitset(shared->num_edges[from_id],
                            ctx->offsets.device_ptr(),
                            ctx->is_updated.gpu_rd_ptr(), v_size);
  }
  *data_mode = get_data_mode<DataType>(*v_size, shared->num_edges[from_id]);

  if ((*data_mode) == onlyData) {
    *v_size = shared->num_edges[from_id];
    if (reset) {
      batch_get_reset_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->edges[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(), i);
    } else {
      batch_get_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->edges[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_rd_ptr());
    }
  } else { // bitsetData or offsetsData
    if (reset) {
      batch_get_reset_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->edges[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), i);
    } else {
      batch_get_subset<DataType><<<blocks, threads>>>(
          *v_size, shared->edges[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_rd_ptr());
    }
  }
  check_cuda_kernel;

  serializeMessage(ctx, *data_mode, *v_size, shared->num_edges[from_id],
                   shared_data, send_buffer);
}
// Unpack recv_buffer into the staging buffer and the context's offsets /
// bitset, mirroring the layout produced by serializeMessage.
template <typename DataType>
void deserializeMessage(struct CUDA_Context_Common_Edges* ctx,
                        DataCommMode data_mode, size_t& bit_set_count,
                        size_t num_shared, DeviceOnly<DataType>* shared_data,
                        uint8_t* recv_buffer) {
  size_t offset = 0; // data_mode is already deserialized by the caller

  if (data_mode != onlyData) {
    // deserialize bit_set_count
    memcpy(&bit_set_count, recv_buffer + offset, sizeof(bit_set_count));
    offset += sizeof(bit_set_count);
  } else {
    bit_set_count = num_shared;
  }

  if ((data_mode == gidsData) || (data_mode == offsetsData)) {
    // deserialize offsets vector (skip the duplicated count)
    offset += sizeof(bit_set_count);
    ctx->offsets.copy_to_gpu((unsigned int*)(recv_buffer + offset),
                             bit_set_count);
    offset += bit_set_count * sizeof(unsigned int);
  } else if (data_mode == bitsetData) {
    // deserialize bitset
    ctx->is_updated.cpu_rd_ptr()->resize(num_shared);
    offset += sizeof(num_shared);
    size_t vec_size = ctx->is_updated.cpu_rd_ptr()->vec_size();
    offset += sizeof(vec_size);
    ctx->is_updated.cpu_rd_ptr()->copy_to_gpu(
        (uint64_t*)(recv_buffer + offset));
    offset += vec_size * sizeof(uint64_t);
    // recover the offsets of the received set bits
    size_t v_size;
    get_offsets_from_bitset(num_shared, ctx->offsets.device_ptr(),
                            ctx->is_updated.gpu_rd_ptr(), &v_size);
    assert(bit_set_count == v_size);
  }

  // deserialize data vector (skip the duplicated count)
  offset += sizeof(bit_set_count);
  shared_data->copy_to_gpu((DataType*)(recv_buffer + offset), bit_set_count);
}
// Apply a received message to the local shared edges of host from_id using
// the update operation selected by op.
template <typename DataType, SharedType sharedType, UpdateOp op>
void batch_set_shared_edge(struct CUDA_Context_Common_Edges* ctx,
                           struct CUDA_Context_Field_Edges<DataType>* field,
                           unsigned from_id, uint8_t* recv_buffer,
                           DataCommMode data_mode) {
  assert(data_mode != noData);
  struct CUDA_Context_Shared_Edges* shared;
  if (sharedType == sharedMaster) {
    shared = &ctx->master;
  } else { // sharedMirror
    shared = &ctx->mirror;
  }
  DeviceOnly<DataType>* shared_data = &field->shared_data;
  dim3 blocks;
  dim3 threads;
  kernel_sizing(blocks, threads);
  size_t v_size;

  deserializeMessage(ctx, data_mode, v_size, shared->num_edges[from_id],
                     shared_data, recv_buffer);

  if (data_mode == onlyData) {
    if (op == setOp) {
      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->edges[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(),
          field->is_updated.gpu_wr_ptr());
    } else if (op == addOp) {
      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->edges[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(),
          field->is_updated.gpu_wr_ptr());
    } else if (op == minOp) {
      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->edges[from_id].device_ptr(),
          shared_data->device_ptr(), field->data.gpu_wr_ptr(),
          field->is_updated.gpu_wr_ptr());
    }
  } else if (data_mode == gidsData) {
    if (op == setOp) {
      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == addOp) {
      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == minOp) {
      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    }
  } else { // bitsetData or offsetsData
    if (op == setOp) {
      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->edges[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == addOp) {
      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->edges[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    } else if (op == minOp) {
      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(
          v_size, shared->edges[from_id].device_ptr(),
          ctx->offsets.device_ptr(), shared_data->device_ptr(),
          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());
    }
  }
  check_cuda_kernel;
}
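// End-to-end usage sketch (illustrative only; ctx/field construction, the
// host id h, the transport of the buffers, and the float value type are
// placeholders, not part of this header):
//
//   size_t v_size;
//   DataCommMode mode;
//   // sending side: gather the edges recorded in ctx->master for host h
//   batch_get_shared_edge<float, sharedMaster, false>(ctx, field, h, send_buf,
//                                                     &v_size, &mode);
//   // ... ship send_buf (and mode) to host h ...
//   // receiving side: scatter the payload into its ctx->mirror edges
//   batch_set_shared_edge<float, sharedMirror, setOp>(ctx, field, h, recv_buf,
//                                                     mode);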