forked from rapidsai/cudf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjson_tree.cu
743 lines (682 loc) · 33.6 KB
/
json_tree.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nested_json.hpp"
#include <hash/hash_allocator.cuh>
#include <hash/helper_functions.cuh>
#include <io/utilities/hostdevice_vector.hpp>
#include <cudf/detail/hashing.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/scatter.cuh>
#include <cudf/detail/utilities/hash_functions.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>
#include <cuco/static_map.cuh>
#include <cub/device/device_radix_sort.cuh>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/mr/device/polymorphic_allocator.hpp>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/fill.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/remove.h>
#include <thrust/scan.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/tabulate.h>
#include <thrust/transform.h>
#include <limits>
namespace cudf::io::json {
namespace detail {
// Functor mapping a node-producing token to the category of the tree node it creates.
struct token_to_node {
  __device__ auto operator()(PdaTokenT const token) -> NodeT
  {
    if (token == token_t::StructBegin) { return NC_STRUCT; }
    if (token == token_t::ListBegin) { return NC_LIST; }
    // ValueBegin is deliberately mapped to NC_STR instead of NC_VAL:
    // NC_VAL is removed because type inference and
    // reduce_to_column_tree category collapsing takes care of this.
    if (token == token_t::StringBegin || token == token_t::ValueBegin) { return NC_STR; }
    if (token == token_t::FieldNameBegin) { return NC_FN; }
    // Any other token is reported as an error node.
    return NC_ERR;
  }
};
// Functor converting a token position into the character range of the node that token begins.
// `tokens` and `token_indices` are parallel sequences; `include_quote_char` selects whether the
// quotes around string values are part of the reported range.
struct node_ranges {
  device_span<PdaTokenT const> tokens;
  device_span<SymbolOffsetT const> token_indices;
  bool include_quote_char;

  __device__ auto operator()(size_type i) -> thrust::tuple<SymbolOffsetT, SymbolOffsetT>
  {
    // True for tokens that expect to be followed by their respective end-of-* partner token
    auto const opens_section = [] __device__(PdaTokenT const tok) -> bool {
      return tok == token_t::StringBegin || tok == token_t::ValueBegin ||
             tok == token_t::FieldNameBegin;
    };

    // The end-of-* partner token for a given beginning-of-* token
    auto const closing_token_of = [] __device__(PdaTokenT const tok) {
      if (tok == token_t::StringBegin) { return token_t::StringEnd; }
      if (tok == token_t::ValueBegin) { return token_t::ValueEnd; }
      if (tok == token_t::FieldNameBegin) { return token_t::FieldNameEnd; }
      return token_t::ErrorBegin;
    };

    // Shifts a token's offset so that quote characters are kept or dropped as configured:
    //  - StringBegin: skip the opening quote unless quotes are kept
    //  - StringEnd: step past the closing quote only when quotes are kept
    //  - FieldNameBegin: always skip the opening quote
    auto const adjusted_offset = [keep_quotes = include_quote_char] __device__(
                                   PdaTokenT const tok,
                                   SymbolOffsetT const offset) -> SymbolOffsetT {
      constexpr SymbolOffsetT quote_char_size = 1;
      if (tok == token_t::StringBegin) { return keep_quotes ? offset : offset + quote_char_size; }
      if (tok == token_t::StringEnd) { return keep_quotes ? offset + quote_char_size : offset; }
      if (tok == token_t::FieldNameBegin) { return offset + quote_char_size; }
      return offset;
    };

    PdaTokenT const token = tokens[i];

    // The section from the original JSON input that this token demarcates
    SymbolOffsetT range_begin = adjusted_offset(token, token_indices[i]);
    // Non-leaf, non-field nodes ignore this value.
    SymbolOffsetT range_end = range_begin + 1;

    // Only a begin token that is immediately followed by its matching end token gets a real end.
    bool const followed_by_partner = opens_section(token) && (i + 1) < tokens.size() &&
                                     closing_token_of(token) == tokens[i + 1];
    if (followed_by_partner) {
      range_end = adjusted_offset(tokens[i + 1], token_indices[i + 1]);
    }
    return thrust::make_tuple(range_begin, range_end);
  }
};
/**
 * @brief Stable-sorts the keys and returns them together with the permutation that sorts them
 *
 * Sorting is done with cub's radix sort on (key, index) pairs. The index sequence is generated
 * internally, so the caller does not have to provide or copy one. Double buffers are used for
 * both keys and indices so that whichever buffer holds the final result can be handed back
 * directly, without an extra copy into a caller-provided output.
 *
 * @tparam IndexType Type of the returned sorted order
 * @tparam KeyType Type of the keys
 * @param keys Keys to sort
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @return Pair of (sorted keys, indices producing that sorted order)
 */
template <typename IndexType = size_t, typename KeyType>
std::pair<rmm::device_uvector<KeyType>, rmm::device_uvector<IndexType>> stable_sorted_key_order(
  cudf::device_span<KeyType const> keys, rmm::cuda_stream_view stream)
{
  CUDF_FUNC_RANGE();

  // Two buffers each for keys and indices; cub decides which of the two ends up "current".
  rmm::device_uvector<KeyType> keys_ping(keys.size(), stream);
  rmm::device_uvector<KeyType> keys_pong(keys.size(), stream);
  rmm::device_uvector<IndexType> order_ping(keys.size(), stream);
  rmm::device_uvector<IndexType> order_pong(keys.size(), stream);
  cub::DoubleBuffer<KeyType> key_buffers(keys_ping.data(), keys_pong.data());
  cub::DoubleBuffer<IndexType> order_buffers(order_ping.data(), order_pong.data());

  // Determine temporary device storage requirements (null storage pointer => size query only)
  size_t scratch_bytes = 0;
  cub::DeviceRadixSort::SortPairs(nullptr, scratch_bytes, key_buffers, order_buffers, keys.size());
  rmm::device_buffer scratch(scratch_bytes, stream);

  // Seed the first buffers: a copy of the input keys and the identity permutation 0, 1, 2, ...
  thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_ping.begin());
  thrust::sequence(rmm::exec_policy(stream), order_ping.begin(), order_ping.end());

  // Sort over every bit of the key type
  cub::DeviceRadixSort::SortPairs(scratch.data(),
                                  scratch_bytes,
                                  key_buffers,
                                  order_buffers,
                                  keys.size(),
                                  0,
                                  sizeof(KeyType) * 8,
                                  stream.value());

  // Hand back whichever buffer of each pair holds the sorted result.
  auto sorted_keys =
    (key_buffers.Current() == keys_ping.data()) ? std::move(keys_ping) : std::move(keys_pong);
  auto sorted_order =
    (order_buffers.Current() == order_ping.data()) ? std::move(order_ping) : std::move(order_pong);
  return std::pair{std::move(sorted_keys), std::move(sorted_order)};
}
/**
 * @brief Copies the parent node id of each first sibling to the siblings that follow it.
 *
 * Nodes are visited in stable level-sorted order and a running maximum of the parent ids is
 * taken within each run of equal levels; the result is written back in place.
 *
 * @param node_levels Node levels of each node
 * @param parent_node_ids Parent node ids; set for the first child of each push node, while the
 * other siblings are initialized to -1. Updated in place.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 */
void propagate_parent_to_siblings(cudf::device_span<TreeDepthT const> node_levels,
                                  cudf::device_span<NodeIndexT> parent_node_ids,
                                  rmm::cuda_stream_view stream)
{
  CUDF_FUNC_RANGE();
  auto sorted         = stable_sorted_key_order<size_type>(node_levels, stream);
  auto& sorted_levels = sorted.first;
  auto& level_order   = sorted.second;

  // instead of gather, using permutation_iterator, which is ~17% faster
  auto const parents_in_level_order =
    thrust::make_permutation_iterator(parent_node_ids.begin(), level_order.begin());

  // The same permuted view serves as both input and output, so the scan is in place.
  thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
                                sorted_levels.begin(),
                                sorted_levels.end(),
                                parents_in_level_order,
                                parents_in_level_order,
                                thrust::equal_to<TreeDepthT>{},
                                thrust::maximum<NodeIndexT>{});
}
/**
 * @brief Generates a tree representation of the given tokens, token_indices.
 *
 * Steps:
 *  1. Fail if the token stream contains an ErrorBegin token, reporting its input location.
 *  2. Node levels: running (push - pop) count over all tokens, kept only for node tokens.
 *  3. Parent node ids: the first child of each push token gets its parent directly; the
 *     remaining siblings receive it through propagate_parent_to_siblings.
 *  4. Node categories: token_to_node applied to every node token.
 *  5. Node ranges: node_ranges applied to every node token.
 *
 * @param tokens Token stream
 * @param token_indices Offset of each token; used for node ranges and for the error location
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned tree's device memory
 * @return Tree built from {node categories, parent node ids, node levels, node range begin,
 * node range end}, each with one entry per node
 */
tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
                                    device_span<SymbolOffsetT const> token_indices,
                                    rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  // Whether a token does represent a node in the tree representation
  auto const is_node = [] __device__(PdaTokenT const token) -> bool {
    switch (token) {
      case token_t::StructBegin:
      case token_t::ListBegin:
      case token_t::StringBegin:
      case token_t::ValueBegin:
      case token_t::FieldNameBegin:
      case token_t::ErrorBegin: return true;
      default: return false;
    };
  };

  // Whether the token pops from the parent node stack
  auto const does_pop = [] __device__(PdaTokenT const token) -> bool {
    switch (token) {
      case token_t::StructMemberEnd:
      case token_t::StructEnd:
      case token_t::ListEnd: return true;
      default: return false;
    };
  };

  // Whether the token pushes onto the parent node stack
  auto const does_push = [] __device__(PdaTokenT const token) -> bool {
    switch (token) {
      case token_t::FieldNameBegin:
      case token_t::StructBegin:
      case token_t::ListBegin: return true;
      default: return false;
    };
  };

  // Look for ErrorBegin and report the point of error.
  // A single find is enough: it yields tokens.end() when there is no error token.
  if (auto const error_location =
        thrust::find(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin);
      error_location != tokens.end()) {
    CUDF_FAIL("JSON Parser encountered an invalid format at location " +
              std::to_string(thrust::device_pointer_cast(
                token_indices.data())[error_location - tokens.begin()]));
  }

  auto const num_tokens = tokens.size();
  auto const num_nodes =
    thrust::count_if(rmm::exec_policy(stream), tokens.begin(), tokens.end(), is_node);

  // Node levels: transform_exclusive_scan, copy_if.
  rmm::device_uvector<TreeDepthT> node_levels(num_nodes, stream, mr);
  {
    rmm::device_uvector<TreeDepthT> token_levels(num_tokens, stream);
    // +1 for a push token, -1 for a pop token, 0 otherwise
    auto const push_pop_it = thrust::make_transform_iterator(
      tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type {
        return does_push(token) - does_pop(token);
      });
    thrust::exclusive_scan(
      rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin());

    auto const node_levels_end = thrust::copy_if(rmm::exec_policy(stream),
                                                 token_levels.begin(),
                                                 token_levels.end(),
                                                 tokens.begin(),
                                                 node_levels.begin(),
                                                 is_node);
    CUDF_EXPECTS(thrust::distance(node_levels.begin(), node_levels_end) == num_nodes,
                 "node level count mismatch");
  }

  // Node parent ids:
  // previous push node_id transform, stable sort by level, segmented scan with Max, reorder.
  rmm::device_uvector<NodeIndexT> parent_node_ids(num_nodes, stream, mr);
  // This block of code is generalized logical stack algorithm. TODO: make this a separate function.
  {
    // Token index of every node, in node order
    rmm::device_uvector<NodeIndexT> node_token_ids(num_nodes, stream);
    thrust::copy_if(rmm::exec_policy(stream),
                    thrust::make_counting_iterator<NodeIndexT>(0),
                    thrust::make_counting_iterator<NodeIndexT>(0) + num_tokens,
                    tokens.begin(),
                    node_token_ids.begin(),
                    is_node);

    // previous push node_id
    // if previous node is a push, then i-1
    // if previous node is FE, then i-2 (returns FB's index)
    // if previous node is SMB and its previous node is a push, then i-2
    // eg. `{ SMB FB FE VB VE SME` -> `{` index as FB's parent.
    // else -1
    auto const first_childs_parent_token_id = [tokens_gpu =
                                                 tokens.begin()] __device__(auto i) -> NodeIndexT {
      if (i <= 0) { return -1; }
      if (tokens_gpu[i - 1] == token_t::StructBegin or tokens_gpu[i - 1] == token_t::ListBegin) {
        return i - 1;
      } else if (tokens_gpu[i - 1] == token_t::FieldNameEnd) {
        return i - 2;
      } else if (i >= 2 and tokens_gpu[i - 1] == token_t::StructMemberBegin and
                 (tokens_gpu[i - 2] == token_t::StructBegin ||
                  tokens_gpu[i - 2] == token_t::ListBegin)) {
        // The `i >= 2` guard keeps tokens_gpu[i - 2] in bounds when the stream starts with SMB.
        return i - 2;
      } else {
        return -1;
      }
    };

    thrust::transform(
      rmm::exec_policy(stream),
      node_token_ids.begin(),
      node_token_ids.end(),
      parent_node_ids.begin(),
      [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id] __device__(
        NodeIndexT const tid) -> NodeIndexT {
        auto const pid = first_childs_parent_token_id(tid);
        // Translate the parent's token index into its node index by binary search.
        return pid < 0
                 ? parent_node_sentinel
                 : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) -
                     node_ids_gpu;
        // parent_node_sentinel is -1, useful for segmented max operation below
      });
  }

  // Propagate parent node to siblings from first sibling - inplace.
  propagate_parent_to_siblings(
    cudf::device_span<TreeDepthT const>{node_levels.data(), node_levels.size()},
    parent_node_ids,
    stream);

  // Node categories: copy_if with transform.
  rmm::device_uvector<NodeT> node_categories(num_nodes, stream, mr);
  auto const node_categories_it =
    thrust::make_transform_output_iterator(node_categories.begin(), token_to_node{});
  auto const node_categories_end = thrust::copy_if(
    rmm::exec_policy(stream), tokens.begin(), tokens.end(), node_categories_it, is_node);
  CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes,
               "node category count mismatch");

  // Node ranges: copy_if with transform.
  rmm::device_uvector<SymbolOffsetT> node_range_begin(num_nodes, stream, mr);
  rmm::device_uvector<SymbolOffsetT> node_range_end(num_nodes, stream, mr);
  auto const node_range_tuple_it =
    thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin());
  // Whether the tokenizer stage should keep quote characters for string values
  // If the tokenizer keeps the quote characters, they may be stripped during type casting
  constexpr bool include_quote_char = true;
  auto const node_range_out_it      = thrust::make_transform_output_iterator(
    node_range_tuple_it, node_ranges{tokens, token_indices, include_quote_char});

  auto const node_range_out_end =
    thrust::copy_if(rmm::exec_policy(stream),
                    thrust::make_counting_iterator<size_type>(0),
                    thrust::make_counting_iterator<size_type>(0) + num_tokens,
                    node_range_out_it,
                    [is_node, tokens_gpu = tokens.begin()] __device__(size_type i) -> bool {
                      return is_node(tokens_gpu[i]);
                    });
  CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch");

  return {std::move(node_categories),
          std::move(parent_node_ids),
          std::move(node_levels),
          std::move(node_range_begin),
          std::move(node_range_end)};
}
/**
 * @brief Generates a node_type id for each node.
 *
 * Field nodes with the same name are assigned the same node_type id: the value stored in a
 * hash map keyed on the field name, shifted by NUM_NODE_CLASSES. All other nodes are assigned
 * their category value as node_type id.
 *
 * All inputs and outputs are in node_id order.
 * @param d_input JSON string in device memory
 * @param d_tree Tree representation of the JSON
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @return Vector of node_type ids
 */
rmm::device_uvector<size_type> hash_node_type_with_field_name(device_span<SymbolT const> d_input,
                                                              tree_meta_t const& d_tree,
                                                              rmm::cuda_stream_view stream)
{
  CUDF_FUNC_RANGE();
  using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;
  using hash_map_type =
    cuco::static_map<size_type, size_type, cuda::thread_scope_device, hash_table_allocator_type>;

  auto const num_nodes  = d_tree.node_categories.size();
  auto const num_fields = thrust::count(rmm::exec_policy(stream),
                                        d_tree.node_categories.begin(),
                                        d_tree.node_categories.end(),
                                        node_t::NC_FN);

  constexpr size_type empty_node_index_sentinel = -1;
  hash_map_type key_map{compute_hash_table_size(num_fields, 40),  // 40% occupancy in hash map
                        cuco::sentinel::empty_key{empty_node_index_sentinel},
                        cuco::sentinel::empty_value{empty_node_index_sentinel},
                        hash_table_allocator_type{default_allocator<char>{}, stream},
                        stream.value()};

  // View of the input characters covered by a node's range
  auto const field_name_of = [d_input          = d_input.data(),
                              node_range_begin = d_tree.node_range_begin.data(),
                              node_range_end =
                                d_tree.node_range_end.data()] __device__(auto node_id) {
    return cudf::string_view(d_input + node_range_begin[node_id],
                             node_range_end[node_id] - node_range_begin[node_id]);
  };
  // Map keys are node ids, but hashing and equality look at the field name each id refers to.
  auto const d_hasher = [field_name_of] __device__(auto node_id) {
    return cudf::detail::default_hash<cudf::string_view>{}(field_name_of(node_id));
  };
  auto const d_equal = [field_name_of] __device__(auto node_id1, auto node_id2) {
    return field_name_of(node_id1) == field_name_of(node_id2);
  };

  // key-value pairs: uses node_id itself as node_type. (unique node_id for a field name due to
  // hashing)
  auto const node_id_pairs = cudf::detail::make_counting_transform_iterator(
    0, [] __device__(size_type i) { return cuco::make_pair(i, i); });

  auto const is_field_name_node = [node_categories =
                                     d_tree.node_categories.data()] __device__(auto node_id) {
    return node_categories[node_id] == node_t::NC_FN;
  };

  // Only field name nodes are inserted.
  key_map.insert_if(node_id_pairs,
                    node_id_pairs + num_nodes,
                    thrust::counting_iterator<size_type>(0),  // stencil
                    is_field_name_node,
                    d_hasher,
                    d_equal,
                    stream.value());

  // Value stored for the node's field name, or 0 when the lookup finds nothing
  auto const get_hash_value =
    [key_map = key_map.get_device_view(), d_hasher, d_equal] __device__(auto node_id) -> size_type {
    auto const found = key_map.find(node_id, d_hasher, d_equal);
    if (found == key_map.end()) { return size_type{0}; }
    return found->second.load(cuda::std::memory_order_relaxed);
  };

  // convert field nodes to node indices, and other nodes to enum value.
  rmm::device_uvector<size_type> node_type(num_nodes, stream);
  thrust::tabulate(rmm::exec_policy(stream),
                   node_type.begin(),
                   node_type.end(),
                   [node_categories = d_tree.node_categories.data(),
                    is_field_name_node,
                    get_hash_value] __device__(auto node_id) -> size_type {
                     return is_field_name_node(node_id)
                              ? static_cast<size_type>(NUM_NODE_CLASSES) + get_hash_value(node_id)
                              : static_cast<size_type>(node_categories[node_id]);
                   });
  return node_type;
}
// Two level hashing algorithm
// 1. Convert node_category+fieldname to node_type. (passed as argument)
//   a. Create a hashmap to hash field name and assign unique node id as values.
//   b. Convert the node categories to node types.
//      Node type is defined as node category enum value if it is not a field node,
//      otherwise it is the unique node id assigned by the hashmap (value shifted by #NUM_CATEGORY).
// 2. Set operation on entire path of each node
//   a. Create a hash map with hash of {node_level, node_type} of its node and the entire parent
//      until root.
//   b. While creating hashmap, transform node id to unique node ids that are inserted into the
//      hash map. This mimics set operation with hash map. These unique node ids are set ids.
//   c. Return this converted set ids, which are the hash map keys/values, and unique set ids.
//
// Returns {col_id, unique_keys}:
//   col_id[node_id] is the key stored in the map for that node's path (one representative node
//   id per distinct path); unique_keys holds the keys retrieved from the map, one per insertion.
// col_id is allocated with `mr`; unique_keys is allocated on `stream` without `mr`.
std::pair<rmm::device_uvector<size_type>, rmm::device_uvector<size_type>> hash_node_path(
device_span<TreeDepthT const> node_levels,
device_span<size_type const> node_type,
device_span<NodeIndexT const> parent_node_ids,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
auto const num_nodes = parent_node_ids.size();
rmm::device_uvector<size_type> col_id(num_nodes, stream, mr);
using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;
using hash_map_type =
cuco::static_map<size_type, size_type, cuda::thread_scope_device, hash_table_allocator_type>;
// -1 marks empty key/value slots; -2 is registered as the erased-key sentinel.
constexpr size_type empty_node_index_sentinel = -1;
hash_map_type key_map{compute_hash_table_size(num_nodes), // TODO reduce oversubscription
cuco::sentinel::empty_key{empty_node_index_sentinel},
cuco::sentinel::empty_value{empty_node_index_sentinel},
cuco::sentinel::erased_key{-2},
hash_table_allocator_type{default_allocator<char>{}, stream},
stream.value()};
// path compression is not used since extra writes make all map operations slow.
// Hash of a node's whole path: combines {level, type} of the node itself, then of every
// ancestor in turn while following parent_node_ids until parent_node_sentinel is reached.
auto const d_hasher = [node_level = node_levels.begin(),
node_type = node_type.begin(),
parent_node_ids = parent_node_ids.begin()] __device__(auto node_id) {
auto hash =
cudf::detail::hash_combine(cudf::detail::default_hash<TreeDepthT>{}(node_level[node_id]),
cudf::detail::default_hash<size_type>{}(node_type[node_id]));
node_id = parent_node_ids[node_id];
while (node_id != parent_node_sentinel) {
hash = cudf::detail::hash_combine(
hash, cudf::detail::default_hash<TreeDepthT>{}(node_level[node_id]));
hash = cudf::detail::hash_combine(
hash, cudf::detail::default_hash<size_type>{}(node_type[node_id]));
node_id = parent_node_ids[node_id];
}
return hash;
};
// Compute every node's path hash once up front; the map operations below read this cache
// instead of re-walking the path each time a hash is needed.
rmm::device_uvector<hash_value_type> node_hash(num_nodes, stream);
thrust::tabulate(rmm::exec_policy(stream), node_hash.begin(), node_hash.end(), d_hasher);
auto const d_hashed_cache = [node_hash = node_hash.begin()] __device__(auto node_id) {
return node_hash[node_id];
};
// Two node ids are equal keys when their paths match: same {level, type} at every step while
// walking both towards the root in lockstep.
auto const d_equal = [node_level = node_levels.begin(),
node_type = node_type.begin(),
parent_node_ids = parent_node_ids.begin(),
d_hashed_cache] __device__(auto node_id1, auto node_id2) {
// Same node: trivially equal.
if (node_id1 == node_id2) return true;
// Different cached path hashes: the paths differ, no need to walk them.
if (d_hashed_cache(node_id1) != d_hashed_cache(node_id2)) return false;
auto const is_equal_level = [node_level, node_type](auto node_id1, auto node_id2) {
if (node_id1 == node_id2) return true;
return node_level[node_id1] == node_level[node_id2] and
node_type[node_id1] == node_type[node_id2];
};
// if both nodes have same node types at all levels, it will check until it has common parent
// or root.
while (node_id1 != parent_node_sentinel and node_id2 != parent_node_sentinel and
node_id1 != node_id2 and is_equal_level(node_id1, node_id2)) {
node_id1 = parent_node_ids[node_id1];
node_id2 = parent_node_ids[node_id2];
}
// Equal only if both walks ended on the same id (a common ancestor, or the sentinel for both).
return node_id1 == node_id2;
};
// insert and convert node ids to unique set ids
// count_if doubles as the insertion loop: the predicate inserts the node, records the key
// found in the map for it, and returns whether this call performed a new insertion.
auto const num_inserted = thrust::count_if(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(num_nodes),
[d_hashed_cache,
d_equal,
view = key_map.get_device_mutable_view(),
uq_node_id = col_id.begin()] __device__(auto node_id) mutable {
// typename hash_map_type::value_type const insert_pair{};
auto it = view.insert_and_find(cuco::make_pair(node_id, node_id), d_hashed_cache, d_equal);
uq_node_id[node_id] = (it.first)->first; // first.load(cuda::std::memory_order_relaxed);
return it.second;
});
auto const num_columns = num_inserted; // key_map.get_size() is not updated.
// Only the keys are needed; the values are discarded.
rmm::device_uvector<size_type> unique_keys(num_columns, stream);
key_map.retrieve_all(unique_keys.begin(), thrust::make_discard_iterator(), stream.value());
return {std::move(col_id), std::move(unique_keys)};
}
/**
 * @brief Generates column id and parent column id for each node
 *
 * Generate col_id:
 *   a. Set operation on entire path of each node, translate each node id to set id.
 *      (two level hashing)
 *   b. Gather unique set ids.
 *   c. Sort the unique set ids and binary search each node's set id in them; the position found
 *      is the node's column id.
 *   d. Translate parent node ids to parent column ids.
 *
 * All inputs and outputs are in node_id order.
 * @param d_input JSON string in device memory
 * @param d_tree Tree representation of the JSON
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return column_id, parent_column_id
 */
std::pair<rmm::device_uvector<NodeIndexT>, rmm::device_uvector<NodeIndexT>> generate_column_id(
  device_span<SymbolT const> d_input,
  tree_meta_t const& d_tree,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const num_nodes = d_tree.node_categories.size();

  // Two level hashing:
  // one for field names -> node_type and,
  // another for {node_level, node_category} + field hash for the entire path
  // which is {node_level, node_type} recursively using parent_node_id
  auto path_sets = [&]() {
    // Convert node_category + field_name to node_type.
    // node_type is scoped to this lambda so it is released once the path hashing is done.
    rmm::device_uvector<size_type> node_type =
      hash_node_type_with_field_name(d_input, d_tree, stream);
    // hash entire path from node to root.
    return hash_node_path(d_tree.node_levels, node_type, d_tree.parent_node_ids, stream, mr);
  }();
  auto& col_id      = path_sets.first;
  auto& unique_keys = path_sets.second;

  // Replace each node's set id by its rank among the sorted unique set ids (in place).
  thrust::sort(rmm::exec_policy(stream), unique_keys.begin(), unique_keys.end());
  thrust::lower_bound(rmm::exec_policy(stream),
                      unique_keys.begin(),
                      unique_keys.end(),
                      col_id.begin(),
                      col_id.end(),
                      col_id.begin());

  // A node's parent column id is the column id of its parent node; nodes without a parent keep
  // the sentinel.
  rmm::device_uvector<size_type> parent_col_id(num_nodes, stream, mr);
  thrust::transform(rmm::exec_policy(stream),
                    d_tree.parent_node_ids.begin(),
                    d_tree.parent_node_ids.end(),
                    parent_col_id.begin(),
                    [col_ids = col_id.begin()] __device__(auto parent_id) {
                      return parent_id >= 0 ? col_ids[parent_id] : parent_node_sentinel;
                    });

  return {std::move(col_id), std::move(parent_col_id)};
}
/**
 * @brief Computes row indices of each node in the hierarchy.
 *
 * Generate row_offset:
 *   a. Extract only list children (nodes whose parent is a list, or that have no parent)
 *   b. stable_sort by parent_col_id.
 *   c. scan_by_key {parent_col_id} (done only on nodes whose parent is a list)
 *   d. propagate to non-list leaves from parent list node by recursion
 *
 * pre-condition:
 * d_tree.node_categories, d_tree.parent_node_ids, parent_col_id are in order of node_id.
 * post-condition: row_offsets is in order of node_id.
 * parent_col_id is moved and reused inside this function: its buffer is overwritten and becomes
 * the returned row_offsets.
 * @param parent_col_id parent node's column id
 * @param d_tree Tree representation of the JSON string
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned column's device memory.
 * @return row_offsets
 */
rmm::device_uvector<size_type> compute_row_offsets(rmm::device_uvector<NodeIndexT>&& parent_col_id,
tree_meta_t const& d_tree,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
auto const num_nodes = d_tree.node_categories.size();
// scatter_indices starts as 0..num_nodes-1 so each surviving entry remembers its node id.
rmm::device_uvector<size_type> scatter_indices(num_nodes, stream);
thrust::sequence(rmm::exec_policy(stream), scatter_indices.begin(), scatter_indices.end());
// Extract only list children. (nodes whose parent is a list/root)
// The (parent_col_id, node id) pairs are compacted in place; parent_node_ids is the stencil,
// and a pair is removed when its parent is neither the sentinel nor an NC_LIST node.
auto const list_parent_end =
thrust::remove_if(rmm::exec_policy(stream),
thrust::make_zip_iterator(parent_col_id.begin(), scatter_indices.begin()),
thrust::make_zip_iterator(parent_col_id.end(), scatter_indices.end()),
d_tree.parent_node_ids.begin(),
[node_categories = d_tree.node_categories.begin()] __device__(auto pnid) {
return !(pnid == parent_node_sentinel || node_categories[pnid] == NC_LIST);
});
// Number of pairs kept by the compaction above
auto const num_list_parent = thrust::distance(
thrust::make_zip_iterator(parent_col_id.begin(), scatter_indices.begin()), list_parent_end);
// Group the kept nodes by parent column id; the sort is stable, so nodes sharing a parent
// column keep their relative order.
thrust::stable_sort_by_key(rmm::exec_policy(stream),
parent_col_id.begin(),
parent_col_id.begin() + num_list_parent,
scatter_indices.begin());
rmm::device_uvector<size_type> row_offsets(num_nodes, stream, mr);
// TODO is it possible to generate list child_offsets too here?
// write only 1st child offset to parent node id child_offsets?
// Exclusive scan of ones within each parent-column group: the 0-based position of each node
// among the nodes that share its parent column id.
thrust::exclusive_scan_by_key(rmm::exec_policy(stream),
parent_col_id.begin(),
parent_col_id.begin() + num_list_parent,
thrust::make_constant_iterator<size_type>(1),
row_offsets.begin());
// Using scatter instead of sort.
// Each computed offset is written to the slot of the node id it belongs to.
auto& temp_storage = parent_col_id; // reuse parent_col_id as temp storage
thrust::scatter(rmm::exec_policy(stream),
row_offsets.begin(),
row_offsets.begin() + num_list_parent,
scatter_indices.begin(),
temp_storage.begin());
// From here on row_offsets owns the buffer that came in as parent_col_id, now in node_id order.
// Slots of nodes that were filtered out above have not been given an offset yet; the
// transform_if below fills them in.
row_offsets = std::move(temp_storage);
// Propagate row offsets to non-list leaves from list's immediate children node by recursion
// For each selected node, walk up until reaching an ancestor whose parent is a list or the
// sentinel, and copy that ancestor's row offset. Such ancestors are never selected by the
// predicate, so the slots that are read are not written by this call.
thrust::transform_if(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(num_nodes),
row_offsets.begin(),
[node_categories = d_tree.node_categories.data(),
parent_node_ids = d_tree.parent_node_ids.begin(),
row_offsets = row_offsets.begin()] __device__(size_type node_id) {
auto parent_node_id = parent_node_ids[node_id];
while (parent_node_id != parent_node_sentinel and
node_categories[parent_node_id] != node_t::NC_LIST) {
node_id = parent_node_id;
parent_node_id = parent_node_ids[parent_node_id];
}
return row_offsets[node_id];
},
// Select only nodes that have a parent and whose parent is not a list.
[node_categories = d_tree.node_categories.data(),
parent_node_ids = d_tree.parent_node_ids.begin()] __device__(size_type node_id) {
auto const parent_node_id = parent_node_ids[node_id];
return parent_node_id != parent_node_sentinel and
!(node_categories[parent_node_id] == node_t::NC_LIST);
});
return row_offsets;
}
// Assigns a column id to each node in the tree, together with the node's row offset, i.e. the
// row index of the node within that column.
// Algorithm:
// 1. Generate col_id:
//   a. Set operation on entire path of each node, translate each node id to set id.
//   b. Gather unique set ids.
//   c. Sort and use binary search to generate column ids.
//   d. Translate parent node ids to parent column ids.
// 2. Generate row_offset:
//   a. Filter only list children.
//   b. stable_sort by parent_col_id.
//   c. scan_by_key {parent_col_id} (done only on nodes whose parent is a list)
//   d. Propagate to non-list leaves from parent list node by recursion.
// Returns {column ids, row offsets}, both in node_id order.
std::tuple<rmm::device_uvector<NodeIndexT>, rmm::device_uvector<size_type>>
records_orient_tree_traversal(device_span<SymbolT const> d_input,
                              tree_meta_t const& d_tree,
                              rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  // first: column id of every node, second: column id of every node's parent
  auto column_ids = generate_column_id(d_input, d_tree, stream, mr);
  // The parent column ids are handed over; their buffer is consumed by compute_row_offsets.
  auto row_offsets = compute_row_offsets(std::move(column_ids.second), d_tree, stream, mr);
  return std::tuple{std::move(column_ids.first), std::move(row_offsets)};
}
} // namespace detail
} // namespace cudf::io::json