diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 03a25a6f891583..e9f5792028cded 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -428,6 +428,11 @@ DEFINE_Int32(index_page_cache_percentage, "10"); DEFINE_mBool(disable_storage_page_cache, "false"); // whether to disable row cache feature in storage DEFINE_mBool(disable_storage_row_cache, "true"); +// Parquet page cache: threshold ratio for caching decompressed vs compressed pages +// If uncompressed_size / compressed_size <= threshold, cache decompressed; otherwise cache compressed +DEFINE_Double(parquet_page_cache_decompress_threshold, "1.5"); +// Parquet page cache: whether to enable caching compressed pages (when ratio exceeds threshold) +DEFINE_Bool(enable_parquet_cache_compressed_pages, "false"); // whether to disable pk page cache feature in storage DEFINE_Bool(disable_pk_storage_page_cache, "false"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 7b19e8e4e4da85..09e397f63238c4 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -444,6 +444,10 @@ DECLARE_Int32(index_page_cache_percentage); DECLARE_Bool(disable_storage_page_cache); // whether to disable row cache feature in storage DECLARE_mBool(disable_storage_row_cache); +// Parquet page cache: threshold ratio for caching decompressed vs compressed pages +DECLARE_Double(parquet_page_cache_decompress_threshold); +// Parquet page cache: whether to enable caching compressed pages +DECLARE_Bool(enable_parquet_cache_compressed_pages); // whether to disable pk page cache feature in storage DECLARE_Bool(disable_pk_storage_page_cache); diff --git a/be/src/io/cache/cached_remote_file_reader.h b/be/src/io/cache/cached_remote_file_reader.h index 939471b62ea41d..20c1a47ce881c3 100644 --- a/be/src/io/cache/cached_remote_file_reader.h +++ b/be/src/io/cache/cached_remote_file_reader.h @@ -55,6 +55,8 @@ class CachedRemoteFileReader final : public FileReader { static std::pair s_align_size(size_t 
offset, size_t size, size_t length); + int64_t mtime() const override { return _remote_file_reader->mtime(); } + protected: Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, const IOContext* io_ctx) override; diff --git a/be/src/io/file_factory.cpp b/be/src/io/file_factory.cpp index bd08bc20461016..074849356438dc 100644 --- a/be/src/io/file_factory.cpp +++ b/be/src/io/file_factory.cpp @@ -203,6 +203,21 @@ Result FileFactory::create_file_reader( const io::FileSystemProperties& system_properties, const io::FileDescription& file_description, const io::FileReaderOptions& reader_options, RuntimeProfile* profile) { + auto reader_res = _create_file_reader_internal(system_properties, file_description, + reader_options, profile); + if (!reader_res.has_value()) { + return unexpected(std::move(reader_res).error()); + } + auto file_reader = std::move(reader_res).value(); + LOG_INFO("create file reader for path={}, size={}, mtime={}", file_description.path, + file_description.file_size, file_description.mtime); + return file_reader; +} + +Result FileFactory::_create_file_reader_internal( + const io::FileSystemProperties& system_properties, + const io::FileDescription& file_description, const io::FileReaderOptions& reader_options, + RuntimeProfile* profile) { TFileType::type type = system_properties.system_type; switch (type) { case TFileType::FILE_LOCAL: { diff --git a/be/src/io/file_factory.h b/be/src/io/file_factory.h index 0ba791bd0a3dc9..61e322ca0af02c 100644 --- a/be/src/io/file_factory.h +++ b/be/src/io/file_factory.h @@ -126,6 +126,12 @@ class FileFactory { private: static std::string _get_fs_name(const io::FileDescription& file_description); + + /// Create FileReader without FS + static Result _create_file_reader_internal( + const io::FileSystemProperties& system_properties, + const io::FileDescription& file_description, + const io::FileReaderOptions& reader_options, RuntimeProfile* profile = nullptr); }; } // namespace doris diff --git 
a/be/src/io/fs/broker_file_reader.cpp b/be/src/io/fs/broker_file_reader.cpp index 102ea3e247778a..41b2992f70008a 100644 --- a/be/src/io/fs/broker_file_reader.cpp +++ b/be/src/io/fs/broker_file_reader.cpp @@ -39,12 +39,14 @@ struct IOContext; BrokerFileReader::BrokerFileReader(const TNetworkAddress& broker_addr, Path path, size_t file_size, TBrokerFD fd, - std::shared_ptr connection) + std::shared_ptr connection, + int64_t mtime) : _path(std::move(path)), _file_size(file_size), _broker_addr(broker_addr), _fd(fd), - _connection(std::move(connection)) { + _connection(std::move(connection)), + _mtime(mtime) { DorisMetrics::instance()->broker_file_open_reading->increment(1); DorisMetrics::instance()->broker_file_reader_total->increment(1); } diff --git a/be/src/io/fs/broker_file_reader.h b/be/src/io/fs/broker_file_reader.h index 7d19edb32c0dea..2f6bd94b652bcb 100644 --- a/be/src/io/fs/broker_file_reader.h +++ b/be/src/io/fs/broker_file_reader.h @@ -38,7 +38,7 @@ struct IOContext; class BrokerFileReader final : public FileReader { public: BrokerFileReader(const TNetworkAddress& broker_addr, Path path, size_t file_size, TBrokerFD fd, - std::shared_ptr connection); + std::shared_ptr connection, int64_t mtime = 0); ~BrokerFileReader() override; @@ -50,6 +50,8 @@ class BrokerFileReader final : public FileReader { bool closed() const override { return _closed.load(std::memory_order_acquire); } + int64_t mtime() const override { return _mtime; } + protected: Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, const IOContext* io_ctx) override; @@ -62,6 +64,7 @@ class BrokerFileReader final : public FileReader { TBrokerFD _fd; std::shared_ptr _connection; + int64_t _mtime; std::atomic _closed = false; }; } // namespace doris::io diff --git a/be/src/io/fs/broker_file_system.cpp b/be/src/io/fs/broker_file_system.cpp index 8b0d5db23e2116..b0dc89dc277ad1 100644 --- a/be/src/io/fs/broker_file_system.cpp +++ b/be/src/io/fs/broker_file_system.cpp @@ -139,7 +139,7 @@ 
Status BrokerFileSystem::open_file_internal(const Path& file, FileReaderSPtr* re error_msg(response->opStatus.message)); } *reader = std::make_shared(_broker_addr, file, fsize, response->fd, - _connection); + _connection, opts.mtime); return Status::OK(); } diff --git a/be/src/io/fs/buffered_reader.cpp b/be/src/io/fs/buffered_reader.cpp index cae08d284179c4..6277416055550c 100644 --- a/be/src/io/fs/buffered_reader.cpp +++ b/be/src/io/fs/buffered_reader.cpp @@ -819,12 +819,10 @@ Status BufferedFileStreamReader::read_bytes(const uint8_t** buf, uint64_t offset int64_t buf_remaining = _buf_end_offset - _buf_start_offset; int64_t to_read = std::min(_buf_size - buf_remaining, _file_end_offset - _buf_end_offset); int64_t has_read = 0; - SCOPED_RAW_TIMER(&_statistics.read_time); while (has_read < to_read) { size_t loop_read = 0; Slice result(_buf.get() + buf_remaining + has_read, to_read - has_read); RETURN_IF_ERROR(_file->read_at(_buf_end_offset + has_read, result, &loop_read, io_ctx)); - _statistics.read_calls++; if (loop_read == 0) { break; } @@ -833,7 +831,6 @@ Status BufferedFileStreamReader::read_bytes(const uint8_t** buf, uint64_t offset if (has_read != to_read) { return Status::Corruption("Try to read {} bytes, but received {} bytes", to_read, has_read); } - _statistics.read_bytes += to_read; _buf_end_offset += to_read; *buf = _buf.get(); return Status::OK(); diff --git a/be/src/io/fs/buffered_reader.h b/be/src/io/fs/buffered_reader.h index 6bcf634aef35ea..6ddcca02067ddb 100644 --- a/be/src/io/fs/buffered_reader.h +++ b/be/src/io/fs/buffered_reader.h @@ -160,6 +160,8 @@ class RangeCacheFileReader : public io::FileReader { bool closed() const override { return _closed; } + int64_t mtime() const override { return _inner_reader->mtime(); } + protected: Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, const IOContext* io_ctx) override; @@ -225,7 +227,6 @@ class MergeRangeFileReader : public io::FileReader { int64_t merged_io = 0; int64_t 
request_bytes = 0; int64_t merged_bytes = 0; - int64_t apply_bytes = 0; }; struct RangeCachedData { @@ -299,9 +300,6 @@ class MergeRangeFileReader : public io::FileReader { _merged_read_slice_size = READ_SLICE_SIZE; } - for (const PrefetchRange& range : _random_access_ranges) { - _statistics.apply_bytes += range.end_offset - range.start_offset; - } if (_profile != nullptr) { const char* random_profile = "MergedSmallIO"; ADD_TIMER_WITH_LEVEL(_profile, random_profile, 1); @@ -315,8 +313,6 @@ class MergeRangeFileReader : public io::FileReader { random_profile, 1); _merged_bytes = ADD_CHILD_COUNTER_WITH_LEVEL(_profile, "MergedBytes", TUnit::BYTES, random_profile, 1); - _apply_bytes = ADD_CHILD_COUNTER_WITH_LEVEL(_profile, "ApplyBytes", TUnit::BYTES, - random_profile, 1); } } @@ -335,6 +331,8 @@ class MergeRangeFileReader : public io::FileReader { bool closed() const override { return _closed; } + int64_t mtime() const override { return _reader->mtime(); } + // for test only size_t buffer_remaining() const { return _remaining; } @@ -359,7 +357,6 @@ class MergeRangeFileReader : public io::FileReader { COUNTER_UPDATE(_merged_io, _statistics.merged_io); COUNTER_UPDATE(_request_bytes, _statistics.request_bytes); COUNTER_UPDATE(_merged_bytes, _statistics.merged_bytes); - COUNTER_UPDATE(_apply_bytes, _statistics.apply_bytes); if (_reader != nullptr) { _reader->collect_profile_before_close(); } @@ -373,7 +370,6 @@ class MergeRangeFileReader : public io::FileReader { RuntimeProfile::Counter* _merged_io = nullptr; RuntimeProfile::Counter* _request_bytes = nullptr; RuntimeProfile::Counter* _merged_bytes = nullptr; - RuntimeProfile::Counter* _apply_bytes = nullptr; int _search_read_range(size_t start_offset, size_t end_offset); void _clean_cached_data(RangeCachedData& cached_data); @@ -540,6 +536,8 @@ class PrefetchBufferedReader final : public io::FileReader { bool closed() const override { return _closed; } + int64_t mtime() const override { return _reader->mtime(); } + void 
set_random_access_ranges(const std::vector* random_access_ranges) { _random_access_ranges = random_access_ranges; for (auto& _pre_buffer : _pre_buffers) { @@ -600,6 +598,8 @@ class InMemoryFileReader final : public io::FileReader { bool closed() const override { return _closed; } + int64_t mtime() const override { return _reader->mtime(); } + protected: Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, const IOContext* io_ctx) override; @@ -619,12 +619,6 @@ class InMemoryFileReader final : public io::FileReader { */ class BufferedStreamReader { public: - struct Statistics { - int64_t read_time = 0; - int64_t read_calls = 0; - int64_t read_bytes = 0; - }; - /** * Return the address of underlying buffer that locates the start of data between [offset, offset + bytes_to_read) * @param buf the buffer address to save the start address of data @@ -637,13 +631,11 @@ class BufferedStreamReader { * Save the data address to slice.data, and the slice.size is the bytes to read. */ virtual Status read_bytes(Slice& slice, uint64_t offset, const IOContext* io_ctx) = 0; - Statistics& statistics() { return _statistics; } virtual ~BufferedStreamReader() = default; // return the file path virtual std::string path() = 0; -protected: - Statistics _statistics; + virtual int64_t mtime() const = 0; }; class BufferedFileStreamReader : public BufferedStreamReader, public ProfileCollector { @@ -657,6 +649,8 @@ class BufferedFileStreamReader : public BufferedStreamReader, public ProfileColl Status read_bytes(Slice& slice, uint64_t offset, const IOContext* io_ctx) override; std::string path() override { return _file->path(); } + int64_t mtime() const override { return _file->mtime(); } + protected: void _collect_profile_before_close() override { if (_file != nullptr) { diff --git a/be/src/io/fs/file_reader.h b/be/src/io/fs/file_reader.h index e6d8527e831906..3df912cbad4af9 100644 --- a/be/src/io/fs/file_reader.h +++ b/be/src/io/fs/file_reader.h @@ -90,6 +90,9 @@ class 
FileReader : public doris::ProfileCollector { virtual const std::string& get_data_dir_path() { return VIRTUAL_REMOTE_DATA_DIR; } + // File modification time (seconds since epoch). Default to 0 meaning unknown. + virtual int64_t mtime() const = 0; + protected: virtual Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, const IOContext* io_ctx) = 0; diff --git a/be/src/io/fs/hdfs_file_reader.cpp b/be/src/io/fs/hdfs_file_reader.cpp index 0e278dff0c8847..b1d65a63ba0529 100644 --- a/be/src/io/fs/hdfs_file_reader.cpp +++ b/be/src/io/fs/hdfs_file_reader.cpp @@ -66,16 +66,17 @@ Result HdfsFileReader::create(Path full_path, const hdfsFS& fs, auto path = convert_path(full_path, fs_name); return get_file(fs, path, opts.mtime, opts.file_size).transform([&](auto&& accessor) { return std::make_shared(std::move(path), std::move(fs_name), - std::move(accessor), profile); + std::move(accessor), profile, opts.mtime); }); } HdfsFileReader::HdfsFileReader(Path path, std::string fs_name, FileHandleCache::Accessor accessor, - RuntimeProfile* profile) + RuntimeProfile* profile, int64_t mtime) : _path(std::move(path)), _fs_name(std::move(fs_name)), _accessor(std::move(accessor)), - _profile(profile) { + _profile(profile), + _mtime(mtime) { _handle = _accessor.get(); DorisMetrics::instance()->hdfs_file_open_reading->increment(1); diff --git a/be/src/io/fs/hdfs_file_reader.h b/be/src/io/fs/hdfs_file_reader.h index 8556eea0de6ac5..08f98bca29af0c 100644 --- a/be/src/io/fs/hdfs_file_reader.h +++ b/be/src/io/fs/hdfs_file_reader.h @@ -45,7 +45,7 @@ class HdfsFileReader final : public FileReader { const FileReaderOptions& opts, RuntimeProfile* profile); HdfsFileReader(Path path, std::string fs_name, FileHandleCache::Accessor accessor, - RuntimeProfile* profile); + RuntimeProfile* profile, int64_t mtime = 0); ~HdfsFileReader() override; @@ -57,6 +57,8 @@ class HdfsFileReader final : public FileReader { bool closed() const override { return _closed.load(std::memory_order_acquire); 
} + int64_t mtime() const override { return _mtime; } + protected: Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, const IOContext* io_ctx) override; @@ -86,6 +88,7 @@ class HdfsFileReader final : public FileReader { CachedHdfsFileHandle* _handle = nullptr; // owned by _cached_file_handle std::atomic _closed = false; RuntimeProfile* _profile = nullptr; + int64_t _mtime; #ifdef USE_HADOOP_HDFS HDFSProfile _hdfs_profile; #endif diff --git a/be/src/io/fs/http_file_reader.cpp b/be/src/io/fs/http_file_reader.cpp index fb243179baf557..5ad984039fc475 100644 --- a/be/src/io/fs/http_file_reader.cpp +++ b/be/src/io/fs/http_file_reader.cpp @@ -34,7 +34,7 @@ Result HttpFileReader::create(const std::string& url, ofi.path = Path(url); ofi.extend_info = props; - auto reader = std::make_shared(ofi, url); + auto reader = std::make_shared(ofi, url, opts.mtime); // Open the file to detect Range support and validate configuration RETURN_IF_ERROR_RESULT(reader->open(opts)); @@ -42,11 +42,12 @@ Result HttpFileReader::create(const std::string& url, return reader; } -HttpFileReader::HttpFileReader(const OpenFileInfo& fileInfo, std::string url) +HttpFileReader::HttpFileReader(const OpenFileInfo& fileInfo, std::string url, int64_t mtime) : _extend_kv(fileInfo.extend_info), _path(fileInfo.path), _url(std::move(url)), - _client(std::make_unique()) { + _client(std::make_unique()), + _mtime(mtime) { auto etag_iter = _extend_kv.find("etag"); if (etag_iter != _extend_kv.end()) { _etag = etag_iter->second; diff --git a/be/src/io/fs/http_file_reader.h b/be/src/io/fs/http_file_reader.h index 607eedf3d1a50b..982e65905aa691 100644 --- a/be/src/io/fs/http_file_reader.h +++ b/be/src/io/fs/http_file_reader.h @@ -41,7 +41,7 @@ class HttpFileReader final : public FileReader { const std::map& props, const FileReaderOptions& opts, RuntimeProfile* profile); - explicit HttpFileReader(const OpenFileInfo& fileInfo, std::string url); + explicit HttpFileReader(const OpenFileInfo& fileInfo, 
std::string url, int64_t mtime); ~HttpFileReader() override; Status open(const FileReaderOptions& opts); @@ -52,6 +52,8 @@ class HttpFileReader final : public FileReader { bool closed() const override { return _closed.load(std::memory_order_acquire); } size_t size() const override { return _file_size; } + int64_t mtime() const override { return _mtime; } + private: // Prepare and initialize the HTTP client for a new request Status prepare_client(bool set_fail_on_error = true); @@ -78,6 +80,7 @@ class HttpFileReader final : public FileReader { int64_t _last_modified = 0; std::atomic _closed = false; std::unique_ptr _client; + int64_t _mtime; // Configuration for non-Range request handling bool _enable_range_request = true; // Whether Range request is required diff --git a/be/src/io/fs/http_file_system.cpp b/be/src/io/fs/http_file_system.cpp index 92e175ca774041..b1e8de354ad9b3 100644 --- a/be/src/io/fs/http_file_system.cpp +++ b/be/src/io/fs/http_file_system.cpp @@ -56,7 +56,7 @@ Status HttpFileSystem::open_file_internal(const Path& path, FileReaderSPtr* read // Pass properties (including HTTP headers) to the file reader file_info.extend_info = _properties; - auto http_reader = std::make_shared(file_info, path.native()); + auto http_reader = std::make_shared(file_info, path.native(), opts.mtime); RETURN_IF_ERROR(http_reader->open(opts)); *reader = http_reader; return Status::OK(); diff --git a/be/src/io/fs/local_file_reader.h b/be/src/io/fs/local_file_reader.h index 0ffd6ccde9e029..97e071226a7d7b 100644 --- a/be/src/io/fs/local_file_reader.h +++ b/be/src/io/fs/local_file_reader.h @@ -62,6 +62,8 @@ class LocalFileReader final : public FileReader { const std::string& get_data_dir_path() override { return _data_dir_path; } + int64_t mtime() const override { return 0; } + private: Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, const IOContext* io_ctx) override; diff --git a/be/src/io/fs/s3_file_reader.h b/be/src/io/fs/s3_file_reader.h index 
58294ec1891cb8..40e3ac61d3aca6 100644 --- a/be/src/io/fs/s3_file_reader.h +++ b/be/src/io/fs/s3_file_reader.h @@ -53,6 +53,8 @@ class S3FileReader final : public FileReader { bool closed() const override { return _closed.load(std::memory_order_acquire); } + int64_t mtime() const override { return 0; } + protected: Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, const IOContext* io_ctx) override; diff --git a/be/src/io/fs/stream_load_pipe.h b/be/src/io/fs/stream_load_pipe.h index cedab0b6c17a7b..df137a9267cb29 100644 --- a/be/src/io/fs/stream_load_pipe.h +++ b/be/src/io/fs/stream_load_pipe.h @@ -57,6 +57,8 @@ class StreamLoadPipe : public MessageBodySink, public FileReader { size_t size() const override { return 0; } + int64_t mtime() const override { return 0; } + // called when consumer finished Status close() override { if (!(_finished || _cancelled)) { diff --git a/be/src/io/fs/tracing_file_reader.h b/be/src/io/fs/tracing_file_reader.h index 39b70dfbb63bef..7a6651afd21a2a 100644 --- a/be/src/io/fs/tracing_file_reader.h +++ b/be/src/io/fs/tracing_file_reader.h @@ -47,6 +47,8 @@ class TracingFileReader : public FileReader { void _collect_profile_at_runtime() override { return _inner->collect_profile_at_runtime(); } void _collect_profile_before_close() override { return _inner->collect_profile_before_close(); } + int64_t mtime() const override { return _inner->mtime(); } + FileReaderStats* stats() const { return _stats; } doris::io::FileReaderSPtr inner_reader() { return _inner; } diff --git a/be/src/olap/rowset/segment_v2/row_ranges.h b/be/src/olap/rowset/segment_v2/row_ranges.h index b9f340823b1bb8..2f990faa762d70 100644 --- a/be/src/olap/rowset/segment_v2/row_ranges.h +++ b/be/src/olap/rowset/segment_v2/row_ranges.h @@ -232,15 +232,15 @@ class RowRanges { return _ranges[_ranges.size() - 1].to(); } - size_t range_size() { return _ranges.size(); } + size_t range_size() const { return _ranges.size(); } - RowRange get_range(size_t index) { 
return _ranges[index]; } + RowRange get_range(size_t index) const { return _ranges[index]; } - int64_t get_range_from(size_t range_index) { return _ranges[range_index].from(); } + int64_t get_range_from(size_t range_index) const { return _ranges[range_index].from(); } - int64_t get_range_to(size_t range_index) { return _ranges[range_index].to(); } + int64_t get_range_to(size_t range_index) const { return _ranges[range_index].to(); } - size_t get_range_count(size_t range_index) { return _ranges[range_index].count(); } + size_t get_range_count(size_t range_index) const { return _ranges[range_index].count(); } std::string to_string() { std::string result; diff --git a/be/src/vec/exec/format/orc/orc_file_reader.h b/be/src/vec/exec/format/orc/orc_file_reader.h index 503777e67c2946..15aeed332428b7 100644 --- a/be/src/vec/exec/format/orc/orc_file_reader.h +++ b/be/src/vec/exec/format/orc/orc_file_reader.h @@ -54,6 +54,8 @@ class OrcMergeRangeFileReader : public io::FileReader { bool closed() const override { return _closed; } + int64_t mtime() const override { return _inner_reader->mtime(); } + // for test only const Statistics& statistics() const { return _statistics; } diff --git a/be/src/vec/exec/format/parquet/level_decoder.cpp b/be/src/vec/exec/format/parquet/level_decoder.cpp index 79c18c7af08449..033ca5cbdd1d84 100644 --- a/be/src/vec/exec/format/parquet/level_decoder.cpp +++ b/be/src/vec/exec/format/parquet/level_decoder.cpp @@ -87,6 +87,7 @@ doris::Status doris::vectorized::LevelDecoder::init_v2(const doris::Slice& level } size_t doris::vectorized::LevelDecoder::get_levels(doris::vectorized::level_t* levels, size_t n) { + // TODO: templatize.
if (_encoding == tparquet::Encoding::RLE) { n = std::min((size_t)_num_levels, n); auto num_decoded = _rle_decoder.get_values(levels, n); diff --git a/be/src/vec/exec/format/parquet/parquet_common.cpp b/be/src/vec/exec/format/parquet/parquet_common.cpp index 4a25102e4fe041..95920ef2a16006 100644 --- a/be/src/vec/exec/format/parquet/parquet_common.cpp +++ b/be/src/vec/exec/format/parquet/parquet_common.cpp @@ -118,6 +118,107 @@ Status FilterMap::generate_nested_filter_map(const std::vector& rep_lev return Status::OK(); } +Status ColumnSelectVector::init(const std::vector& run_length_null_map, size_t num_values, + NullMap* null_map, FilterMap* filter_map, size_t filter_map_index, + const std::unordered_set* skipped_indices) { + _num_values = num_values; + _num_nulls = 0; + _read_index = 0; + size_t map_index = 0; + bool is_null = false; + _has_filter = filter_map->has_filter(); + + if (filter_map->has_filter()) { + // No run length null map is generated when _filter_all = true + // DCHECK(!filter_map->filter_all()); + _data_map.resize(num_values); + for (auto& run_length : run_length_null_map) { + if (is_null) { + _num_nulls += run_length; + for (int i = 0; i < run_length; ++i) { + _data_map[map_index++] = FILTERED_NULL; + } + } else { + for (int i = 0; i < run_length; ++i) { + _data_map[map_index++] = FILTERED_CONTENT; + } + } + is_null = !is_null; + } + + size_t num_read = 0; + size_t i = 0; + size_t valid_count = 0; + + while (valid_count < num_values) { + DCHECK_LT(filter_map_index + i, filter_map->filter_map_size()); + + if (skipped_indices != nullptr && skipped_indices->count(filter_map_index + i) > 0) { + ++i; + continue; + } + + if (filter_map->filter_map_data()[filter_map_index + i]) { + _data_map[valid_count] = + _data_map[valid_count] == FILTERED_NULL ? 
NULL_DATA : CONTENT; + num_read++; + } + ++valid_count; + ++i; + } + + _num_filtered = num_values - num_read; + + if (null_map != nullptr && num_read > 0) { + NullMap& map_data_column = *null_map; + auto null_map_index = map_data_column.size(); + map_data_column.resize(null_map_index + num_read); + + if (_num_nulls == 0) { + memset(map_data_column.data() + null_map_index, 0, num_read); + } else if (_num_nulls == num_values) { + memset(map_data_column.data() + null_map_index, 1, num_read); + } else { + for (i = 0; i < num_values; ++i) { + if (_data_map[i] == CONTENT) { + map_data_column[null_map_index++] = (UInt8) false; + } else if (_data_map[i] == NULL_DATA) { + map_data_column[null_map_index++] = (UInt8) true; + } + } + } + } + } else { + _num_filtered = 0; + _run_length_null_map = &run_length_null_map; + if (null_map != nullptr) { + NullMap& map_data_column = *null_map; + auto null_map_index = map_data_column.size(); + map_data_column.resize(null_map_index + num_values); + + for (auto& run_length : run_length_null_map) { + if (is_null) { + memset(map_data_column.data() + null_map_index, 1, run_length); + null_map_index += run_length; + _num_nulls += run_length; + } else { + memset(map_data_column.data() + null_map_index, 0, run_length); + null_map_index += run_length; + } + is_null = !is_null; + } + } else { + for (auto& run_length : run_length_null_map) { + if (is_null) { + _num_nulls += run_length; + } + is_null = !is_null; + } + } + } + return Status::OK(); +} + ParsedVersion::ParsedVersion(std::string application, std::optional version, std::optional app_build_hash) : _application(std::move(application)), diff --git a/be/src/vec/exec/format/parquet/parquet_common.h b/be/src/vec/exec/format/parquet/parquet_common.h index 293b5879108a98..a57ce9db7eac13 100644 --- a/be/src/vec/exec/format/parquet/parquet_common.h +++ b/be/src/vec/exec/format/parquet/parquet_common.h @@ -92,105 +92,7 @@ class ColumnSelectVector { Status init(const std::vector& 
run_length_null_map, size_t num_values, NullMap* null_map, FilterMap* filter_map, size_t filter_map_index, - const std::unordered_set* skipped_indices = nullptr) { - _num_values = num_values; - _num_nulls = 0; - _read_index = 0; - size_t map_index = 0; - bool is_null = false; - _has_filter = filter_map->has_filter(); - - if (filter_map->has_filter()) { - // No run length null map is generated when _filter_all = true - DCHECK(!filter_map->filter_all()); - _data_map.resize(num_values); - for (auto& run_length : run_length_null_map) { - if (is_null) { - _num_nulls += run_length; - for (int i = 0; i < run_length; ++i) { - _data_map[map_index++] = FILTERED_NULL; - } - } else { - for (int i = 0; i < run_length; ++i) { - _data_map[map_index++] = FILTERED_CONTENT; - } - } - is_null = !is_null; - } - - size_t num_read = 0; - size_t i = 0; - size_t valid_count = 0; - - while (valid_count < num_values) { - DCHECK_LT(filter_map_index + i, filter_map->filter_map_size()); - - if (skipped_indices != nullptr && - skipped_indices->count(filter_map_index + i) > 0) { - ++i; - continue; - } - - if (filter_map->filter_map_data()[filter_map_index + i]) { - _data_map[valid_count] = - _data_map[valid_count] == FILTERED_NULL ? 
NULL_DATA : CONTENT; - num_read++; - } - ++valid_count; - ++i; - } - - _num_filtered = num_values - num_read; - - if (null_map != nullptr && num_read > 0) { - NullMap& map_data_column = *null_map; - auto null_map_index = map_data_column.size(); - map_data_column.resize(null_map_index + num_read); - - if (_num_nulls == 0) { - memset(map_data_column.data() + null_map_index, 0, num_read); - } else if (_num_nulls == num_values) { - memset(map_data_column.data() + null_map_index, 1, num_read); - } else { - for (i = 0; i < num_values; ++i) { - if (_data_map[i] == CONTENT) { - map_data_column[null_map_index++] = (UInt8) false; - } else if (_data_map[i] == NULL_DATA) { - map_data_column[null_map_index++] = (UInt8) true; - } - } - } - } - } else { - _num_filtered = 0; - _run_length_null_map = &run_length_null_map; - if (null_map != nullptr) { - NullMap& map_data_column = *null_map; - auto null_map_index = map_data_column.size(); - map_data_column.resize(null_map_index + num_values); - - for (auto& run_length : run_length_null_map) { - if (is_null) { - memset(map_data_column.data() + null_map_index, 1, run_length); - null_map_index += run_length; - _num_nulls += run_length; - } else { - memset(map_data_column.data() + null_map_index, 0, run_length); - null_map_index += run_length; - } - is_null = !is_null; - } - } else { - for (auto& run_length : run_length_null_map) { - if (is_null) { - _num_nulls += run_length; - } - is_null = !is_null; - } - } - } - return Status::OK(); - } + const std::unordered_set* skipped_indices = nullptr); size_t num_values() const { return _num_values; } diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp index 4484e2142ee2f2..807975def2ff9e 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp @@ -25,6 +25,8 @@ #include #include "common/compiler_util.h" // IWYU 
pragma: keep +#include "io/fs/buffered_reader.h" +#include "olap/page_cache.h" #include "util/bit_util.h" #include "util/block_compression.h" #include "util/runtime_profile.h" @@ -47,77 +49,110 @@ struct IOContext; namespace doris::vectorized { #include "common/compile_check_begin.h" -ColumnChunkReader::ColumnChunkReader(io::BufferedStreamReader* reader, - tparquet::ColumnChunk* column_chunk, FieldSchema* field_schema, - const tparquet::OffsetIndex* offset_index, - const cctz::time_zone* ctz, io::IOContext* io_ctx) +template +ColumnChunkReader::ColumnChunkReader( + io::BufferedStreamReader* reader, tparquet::ColumnChunk* column_chunk, + FieldSchema* field_schema, const tparquet::OffsetIndex* offset_index, size_t total_rows, + io::IOContext* io_ctx, const ParquetPageReadContext& ctx) : _field_schema(field_schema), _max_rep_level(field_schema->repetition_level), _max_def_level(field_schema->definition_level), _stream_reader(reader), _metadata(column_chunk->meta_data), _offset_index(offset_index), - // _ctz(ctz), - _io_ctx(io_ctx) {} + _total_rows(total_rows), + _io_ctx(io_ctx), + _ctx(ctx) {} -Status ColumnChunkReader::init() { +template +Status ColumnChunkReader::init() { size_t start_offset = has_dict_page(_metadata) ? 
_metadata.dictionary_page_offset : _metadata.data_page_offset; size_t chunk_size = _metadata.total_compressed_size; // create page reader - _page_reader = create_page_reader(_stream_reader, _io_ctx, start_offset, chunk_size, - _metadata.num_values, _offset_index); + _page_reader = create_page_reader( + _stream_reader, _io_ctx, start_offset, chunk_size, _total_rows, _metadata, + _offset_index, _ctx); // get the block compression codec RETURN_IF_ERROR(get_block_compression_codec(_metadata.codec, &_block_compress_codec)); - if (has_dict_page(_metadata)) { - // seek to the directory page - _page_reader->seek_to_page(_metadata.dictionary_page_offset); - // Parse dictionary data when reading - // RETURN_IF_ERROR(_page_reader->next_page_header()); - // RETURN_IF_ERROR(_decode_dict_page()); - } else { - // seek to the first data page - _page_reader->seek_to_page(_metadata.data_page_offset); - } _state = INITIALIZED; + RETURN_IF_ERROR(_parse_first_page_header()); return Status::OK(); } -Status ColumnChunkReader::next_page() { - if (_state == HEADER_PARSED) { - return Status::OK(); - } - if (UNLIKELY(_state == NOT_INIT)) { - return Status::Corruption("Should initialize chunk reader"); - } - if (UNLIKELY(_remaining_num_values != 0)) { - return Status::Corruption("Should skip current page"); +template +Status ColumnChunkReader::skip_nested_values( + const std::vector& def_levels) { + size_t no_value_cnt = 0; + size_t value_cnt = 0; + + for (size_t idx = 0; idx < def_levels.size(); idx++) { + level_t def_level = def_levels[idx]; + if (IN_COLLECTION && def_level < _field_schema->repeated_parent_def_level) { + no_value_cnt++; + } else if (def_level < _field_schema->definition_level) { + no_value_cnt++; + } else { + value_cnt++; + } } - RETURN_IF_ERROR(_page_reader->next_page_header()); - - if (!_dict_checked) { - _dict_checked = true; - const tparquet::PageHeader* header; - RETURN_IF_ERROR(_page_reader->get_page_header(header)); - if (header->type == 
tparquet::PageType::DICTIONARY_PAGE) { - // the first page maybe directory page even if _metadata.__isset.dictionary_page_offset == false, - // so we should parse the directory page in next_page() - RETURN_IF_ERROR(_decode_dict_page()); - // parse the real first data page - return next_page(); - } + RETURN_IF_ERROR(skip_values(value_cnt, true)); + RETURN_IF_ERROR(skip_values(no_value_cnt, false)); + return Status::OK(); +} + +template +Status ColumnChunkReader::_parse_first_page_header() { + RETURN_IF_ERROR(parse_page_header()); + + const tparquet::PageHeader* header = nullptr; + RETURN_IF_ERROR(_page_reader->get_page_header(&header)); + if (header->type == tparquet::PageType::DICTIONARY_PAGE) { + // the first page may be a dictionary page even if _metadata.__isset.dictionary_page_offset == false, + // so parse the dictionary page here + RETURN_IF_ERROR(_decode_dict_page()); + // parse the real first data page + RETURN_IF_ERROR(_page_reader->dict_next_page()); + _state = INITIALIZED; } - RETURN_IF_ERROR(_page_reader->get_num_values(_remaining_num_values)); - _chunk_parsed_values += _remaining_num_values; + return Status::OK(); +} + +template +Status ColumnChunkReader::parse_page_header() { + if (_state == HEADER_PARSED || _state == DATA_LOADED) { + return Status::OK(); + } + RETURN_IF_ERROR(_page_reader->parse_page_header()); + + const tparquet::PageHeader* header = nullptr; + RETURN_IF_ERROR(_page_reader->get_page_header(&header)); + int32_t page_num_values = _page_reader->is_header_v2() ? header->data_page_header_v2.num_values + : header->data_page_header.num_values; + _remaining_rep_nums = page_num_values; + _remaining_def_nums = page_num_values; + _remaining_num_values = page_num_values; + + // without an offset index, all page headers will be parsed.
+ if constexpr (OFFSET_INDEX == false) { + _chunk_parsed_values += _remaining_num_values; + } _state = HEADER_PARSED; + return Status::OK(); +} +template +Status ColumnChunkReader::next_page() { + _state = INITIALIZED; + RETURN_IF_ERROR(_page_reader->next_page()); return Status::OK(); } -void ColumnChunkReader::_get_uncompressed_levels(const tparquet::DataPageHeaderV2& page_v2, - Slice& page_data) { +template +void ColumnChunkReader::_get_uncompressed_levels( + const tparquet::DataPageHeaderV2& page_v2, Slice& page_data) { int32_t rl = page_v2.repetition_levels_byte_length; int32_t dl = page_v2.definition_levels_byte_length; _v2_rep_levels = Slice(page_data.data, rl); @@ -126,67 +161,185 @@ void ColumnChunkReader::_get_uncompressed_levels(const tparquet::DataPageHeaderV page_data.size -= dl + rl; } -Status ColumnChunkReader::load_page_data() { - // TODO: remove checking HEADER_PARSED or change name +template +Status ColumnChunkReader::load_page_data() { + if (_state == DATA_LOADED) { + return Status::OK(); + } if (UNLIKELY(_state != HEADER_PARSED)) { return Status::Corruption("Should parse page header"); } - const tparquet::PageHeader* header; - RETURN_IF_ERROR(_page_reader->get_page_header(header)); - int32_t uncompressed_size = header->uncompressed_page_size; - if (_block_compress_codec != nullptr) { - Slice compressed_data; - RETURN_IF_ERROR(_page_reader->get_page_data(compressed_data)); - if (header->__isset.data_page_header_v2) { - const tparquet::DataPageHeaderV2& header_v2 = header->data_page_header_v2; - // uncompressed_size = rl + dl + uncompressed_data_size - // compressed_size = rl + dl + compressed_data_size - uncompressed_size -= header_v2.repetition_levels_byte_length + - header_v2.definition_levels_byte_length; - _get_uncompressed_levels(header_v2, compressed_data); + const tparquet::PageHeader* header = nullptr; + RETURN_IF_ERROR(_page_reader->get_page_header(&header)); + int32_t uncompressed_size = header->uncompressed_page_size; + bool page_loaded 
= false; + + // First, try to reuse a cache handle previously discovered by PageReader + // (header-only lookup) to avoid a second lookup here. If no handle is + // attached, fall back to a StoragePageCache lookup for a decompressed page. + if (_ctx.enable_parquet_file_page_cache && !config::disable_storage_page_cache && + StoragePageCache::instance() != nullptr) { + if (_page_reader->has_page_cache_handle()) { + const PageCacheHandle& handle = _page_reader->page_cache_handle(); + Slice cached = handle.data(); + size_t header_size = _page_reader->header_bytes().size(); + //size_t levels_size = 0; + size_t levels_size = 0; + if (header->__isset.data_page_header_v2) { + const tparquet::DataPageHeaderV2& header_v2 = header->data_page_header_v2; + size_t rl = header_v2.repetition_levels_byte_length; + size_t dl = header_v2.definition_levels_byte_length; + levels_size = rl + dl; + _v2_rep_levels = + Slice(reinterpret_cast(cached.data) + header_size, rl); + _v2_def_levels = + Slice(reinterpret_cast(cached.data) + header_size + rl, dl); + } + // payload_slice points to the bytes after header and levels + Slice payload_slice(cached.data + header_size + levels_size, + cached.size - header_size - levels_size); + + bool cache_payload_is_decompressed = _page_reader->is_cache_payload_decompressed(); + + if (cache_payload_is_decompressed) { + // Cached payload is already uncompressed + _page_data = payload_slice; + } else { + CHECK(_block_compress_codec); + // Decompress cached payload into _decompress_buf for decoding + size_t uncompressed_payload_size = + header->__isset.data_page_header_v2 + ? 
static_cast(header->uncompressed_page_size) - levels_size + : static_cast(header->uncompressed_page_size); + _reserve_decompress_buf(uncompressed_payload_size); + _page_data = Slice(_decompress_buf.get(), uncompressed_payload_size); + SCOPED_RAW_TIMER(&_chunk_statistics.decompress_time); + _chunk_statistics.decompress_cnt++; + RETURN_IF_ERROR(_block_compress_codec->decompress(payload_slice, &_page_data)); + } + // page cache counters were incremented when PageReader did the header-only + // cache lookup. Do not increment again to avoid double-counting. + page_loaded = true; } - bool is_v2_compressed = - header->__isset.data_page_header_v2 && header->data_page_header_v2.is_compressed; - if (header->__isset.data_page_header || is_v2_compressed) { - // check decompressed buffer size - _reserve_decompress_buf(uncompressed_size); - _page_data = Slice(_decompress_buf.get(), uncompressed_size); - SCOPED_RAW_TIMER(&_statistics.decompress_time); - _statistics.decompress_cnt++; - RETURN_IF_ERROR(_block_compress_codec->decompress(compressed_data, &_page_data)); + } + + if (!page_loaded) { + if (_block_compress_codec != nullptr) { + Slice compressed_data; + RETURN_IF_ERROR(_page_reader->get_page_data(compressed_data)); + std::vector level_bytes; + if (header->__isset.data_page_header_v2) { + const tparquet::DataPageHeaderV2& header_v2 = header->data_page_header_v2; + // uncompressed_size = rl + dl + uncompressed_data_size + // compressed_size = rl + dl + compressed_data_size + uncompressed_size -= header_v2.repetition_levels_byte_length + + header_v2.definition_levels_byte_length; + // copy level bytes (rl + dl) so that we can cache header + levels + uncompressed payload + size_t rl = header_v2.repetition_levels_byte_length; + size_t dl = header_v2.definition_levels_byte_length; + size_t level_sz = rl + dl; + if (level_sz > 0) { + level_bytes.resize(level_sz); + memcpy(level_bytes.data(), compressed_data.data, level_sz); + } + // now remove levels from compressed_data for 
decompression + _get_uncompressed_levels(header_v2, compressed_data); + } + bool is_v2_compressed = header->__isset.data_page_header_v2 && + header->data_page_header_v2.is_compressed; + bool page_has_compression = header->__isset.data_page_header || is_v2_compressed; + + if (page_has_compression) { + // Decompress payload for immediate decoding + _reserve_decompress_buf(uncompressed_size); + _page_data = Slice(_decompress_buf.get(), uncompressed_size); + SCOPED_RAW_TIMER(&_chunk_statistics.decompress_time); + _chunk_statistics.decompress_cnt++; + RETURN_IF_ERROR(_block_compress_codec->decompress(compressed_data, &_page_data)); + + // Decide whether to cache decompressed payload or compressed payload based on threshold + bool should_cache_decompressed = false; + if (header->compressed_page_size > 0) { + should_cache_decompressed = + (_metadata.codec == tparquet::CompressionCodec::UNCOMPRESSED) || + (static_cast(header->uncompressed_page_size) <= + static_cast(config::parquet_page_cache_decompress_threshold) * + static_cast(header->compressed_page_size)); + } + + if (_ctx.enable_parquet_file_page_cache && !config::disable_storage_page_cache && + StoragePageCache::instance() != nullptr && + !_page_reader->header_bytes().empty()) { + if (should_cache_decompressed) { + _insert_page_into_cache(level_bytes, _page_data); + _chunk_statistics.page_cache_decompressed_write_counter += 1; + } else { + if (config::enable_parquet_cache_compressed_pages) { + // cache the compressed payload as-is (header | levels | compressed_payload) + _insert_page_into_cache( + level_bytes, Slice(compressed_data.data, compressed_data.size)); + _chunk_statistics.page_cache_compressed_write_counter += 1; + } + } + } + } else { + // no compression on this page, use the data directly + _page_data = Slice(compressed_data.data, compressed_data.size); + if (_ctx.enable_parquet_file_page_cache && !config::disable_storage_page_cache && + StoragePageCache::instance() != nullptr) { + 
_insert_page_into_cache(level_bytes, _page_data); + _chunk_statistics.page_cache_decompressed_write_counter += 1; + } + } } else { - // Don't need decompress - _page_data = Slice(compressed_data.data, compressed_data.size); - } - } else { - RETURN_IF_ERROR(_page_reader->get_page_data(_page_data)); - if (header->__isset.data_page_header_v2) { - _get_uncompressed_levels(header->data_page_header_v2, _page_data); + // For uncompressed page, we may still need to extract v2 levels + std::vector level_bytes; + Slice uncompressed_data; + RETURN_IF_ERROR(_page_reader->get_page_data(uncompressed_data)); + if (header->__isset.data_page_header_v2) { + const tparquet::DataPageHeaderV2& header_v2 = header->data_page_header_v2; + size_t rl = header_v2.repetition_levels_byte_length; + size_t dl = header_v2.definition_levels_byte_length; + size_t level_sz = rl + dl; + if (level_sz > 0) { + level_bytes.resize(level_sz); + memcpy(level_bytes.data(), uncompressed_data.data, level_sz); + } + _get_uncompressed_levels(header_v2, uncompressed_data); + } + // copy page data out + _page_data = Slice(uncompressed_data.data, uncompressed_data.size); + // Optionally cache uncompressed data for uncompressed pages + if (_ctx.enable_parquet_file_page_cache && !config::disable_storage_page_cache && + StoragePageCache::instance() != nullptr) { + _insert_page_into_cache(level_bytes, _page_data); + _chunk_statistics.page_cache_decompressed_write_counter += 1; + } } } // Initialize repetition level and definition level. Skip when level = 0, which means required field. 
if (_max_rep_level > 0) { - SCOPED_RAW_TIMER(&_statistics.decode_level_time); + SCOPED_RAW_TIMER(&_chunk_statistics.decode_level_time); if (header->__isset.data_page_header_v2) { RETURN_IF_ERROR(_rep_level_decoder.init_v2(_v2_rep_levels, _max_rep_level, - _remaining_num_values)); + _remaining_rep_nums)); } else { RETURN_IF_ERROR(_rep_level_decoder.init( &_page_data, header->data_page_header.repetition_level_encoding, _max_rep_level, - _remaining_num_values)); + _remaining_rep_nums)); } } if (_max_def_level > 0) { - SCOPED_RAW_TIMER(&_statistics.decode_level_time); + SCOPED_RAW_TIMER(&_chunk_statistics.decode_level_time); if (header->__isset.data_page_header_v2) { RETURN_IF_ERROR(_def_level_decoder.init_v2(_v2_def_levels, _max_def_level, - _remaining_num_values)); + _remaining_def_nums)); } else { RETURN_IF_ERROR(_def_level_decoder.init( &_page_data, header->data_page_header.definition_level_encoding, _max_def_level, - _remaining_num_values)); + _remaining_def_nums)); } } auto encoding = header->__isset.data_page_header_v2 ? header->data_page_header_v2.encoding @@ -204,23 +357,30 @@ Status ColumnChunkReader::load_page_data() { RETURN_IF_ERROR(Decoder::get_decoder(_metadata.type, encoding, page_decoder)); // Set type length page_decoder->set_type_length(_get_type_length()); - // Initialize the time convert context - // page_decoder->init(_field_schema, _ctz); _decoders[static_cast(encoding)] = std::move(page_decoder); _page_decoder = _decoders[static_cast(encoding)].get(); } - // Reset page data for each page + // Reset page data for each page. + // If this is a v2 data page, _page_data currently contains rl+dl followed by payload. + // The decoder expects payload-only, so strip the level bytes into a temporary Slice + // that points into the same cached memory (so ownership remains with the cache handle). 
+ // Slice payload_slice = _page_data; + // if (header->__isset.data_page_header_v2) { + // const tparquet::DataPageHeaderV2& header_v2 = header->data_page_header_v2; + // _get_uncompressed_levels(header_v2, payload_slice); + // } RETURN_IF_ERROR(_page_decoder->set_data(&_page_data)); _state = DATA_LOADED; return Status::OK(); } -Status ColumnChunkReader::_decode_dict_page() { - const tparquet::PageHeader* header; - RETURN_IF_ERROR(_page_reader->get_page_header(header)); +template +Status ColumnChunkReader::_decode_dict_page() { + const tparquet::PageHeader* header = nullptr; + RETURN_IF_ERROR(_page_reader->get_page_header(&header)); DCHECK_EQ(tparquet::PageType::DICTIONARY_PAGE, header->type); - SCOPED_RAW_TIMER(&_statistics.decode_dict_time); + SCOPED_RAW_TIMER(&_chunk_statistics.decode_dict_time); // Using the PLAIN_DICTIONARY enum value is deprecated in the Parquet 2.0 specification. // Prefer using RLE_DICTIONARY in a data page and PLAIN in a dictionary page for Parquet 2.0+ files. 
@@ -235,16 +395,88 @@ Status ColumnChunkReader::_decode_dict_page() { // Prepare dictionary data int32_t uncompressed_size = header->uncompressed_page_size; auto dict_data = make_unique_buffer(uncompressed_size); - if (_block_compress_codec != nullptr) { - Slice compressed_data; - RETURN_IF_ERROR(_page_reader->get_page_data(compressed_data)); - Slice dict_slice(dict_data.get(), uncompressed_size); - RETURN_IF_ERROR(_block_compress_codec->decompress(compressed_data, &dict_slice)); - } else { - Slice dict_slice; - RETURN_IF_ERROR(_page_reader->get_page_data(dict_slice)); - // The data is stored by BufferedStreamReader, we should copy it out - memcpy(dict_data.get(), dict_slice.data, dict_slice.size); + bool dict_loaded = false; + + // Try to load dictionary page from cache + if (_ctx.enable_parquet_file_page_cache && !config::disable_storage_page_cache && + StoragePageCache::instance() != nullptr) { + if (_page_reader->has_page_cache_handle()) { + const PageCacheHandle& handle = _page_reader->page_cache_handle(); + Slice cached = handle.data(); + size_t header_size = _page_reader->header_bytes().size(); + // Dictionary page layout in cache: header | payload (compressed or uncompressed) + Slice payload_slice(cached.data + header_size, cached.size - header_size); + + bool cache_payload_is_decompressed = _page_reader->is_cache_payload_decompressed(); + + if (cache_payload_is_decompressed) { + // Use cached decompressed dictionary data + memcpy(dict_data.get(), payload_slice.data, payload_slice.size); + dict_loaded = true; + } else { + CHECK(_block_compress_codec); + // Decompress cached compressed dictionary data + Slice dict_slice(dict_data.get(), uncompressed_size); + RETURN_IF_ERROR(_block_compress_codec->decompress(payload_slice, &dict_slice)); + dict_loaded = true; + } + + // When dictionary page is loaded from cache, we need to skip the page data + // to update the offset correctly (similar to calling get_page_data()) + if (dict_loaded) { + 
_page_reader->skip_page_data(); + } + } + } + + if (!dict_loaded) { + // Load and decompress dictionary page from file + if (_block_compress_codec != nullptr) { + Slice compressed_data; + RETURN_IF_ERROR(_page_reader->get_page_data(compressed_data)); + Slice dict_slice(dict_data.get(), uncompressed_size); + RETURN_IF_ERROR(_block_compress_codec->decompress(compressed_data, &dict_slice)); + + // Decide whether to cache decompressed or compressed dictionary based on threshold + bool should_cache_decompressed = false; + if (header->compressed_page_size > 0) { + should_cache_decompressed = + (static_cast(header->uncompressed_page_size) <= + static_cast(config::parquet_page_cache_decompress_threshold) * + static_cast(header->compressed_page_size)); + } + + if (_ctx.enable_parquet_file_page_cache && !config::disable_storage_page_cache && + StoragePageCache::instance() != nullptr && !_page_reader->header_bytes().empty()) { + std::vector empty_levels; // Dictionary pages don't have levels + if (should_cache_decompressed) { + // Cache the decompressed dictionary page + _insert_page_into_cache(empty_levels, dict_slice); + _chunk_statistics.page_cache_decompressed_write_counter += 1; + } else { + if (config::enable_parquet_cache_compressed_pages) { + // Cache the compressed dictionary page + _insert_page_into_cache(empty_levels, + Slice(compressed_data.data, compressed_data.size)); + _chunk_statistics.page_cache_compressed_write_counter += 1; + } + } + } + } else { + Slice dict_slice; + RETURN_IF_ERROR(_page_reader->get_page_data(dict_slice)); + // The data is stored by BufferedStreamReader, we should copy it out + memcpy(dict_data.get(), dict_slice.data, dict_slice.size); + + // Cache the uncompressed dictionary page + if (_ctx.enable_parquet_file_page_cache && !config::disable_storage_page_cache && + StoragePageCache::instance() != nullptr && !_page_reader->header_bytes().empty()) { + std::vector empty_levels; + Slice payload(dict_data.get(), uncompressed_size); + 
_insert_page_into_cache(empty_levels, payload); + _chunk_statistics.page_cache_decompressed_write_counter += 1; + } + } } // Cache page decoder @@ -253,8 +485,6 @@ Status ColumnChunkReader::_decode_dict_page() { Decoder::get_decoder(_metadata.type, tparquet::Encoding::RLE_DICTIONARY, page_decoder)); // Set type length page_decoder->set_type_length(_get_type_length()); - // Initialize the time convert context - // page_decoder->init(_field_schema, _ctz); // Set the dictionary data RETURN_IF_ERROR(page_decoder->set_dict(dict_data, uncompressed_size, header->dictionary_page_header.num_values)); @@ -264,49 +494,64 @@ Status ColumnChunkReader::_decode_dict_page() { return Status::OK(); } -void ColumnChunkReader::_reserve_decompress_buf(size_t size) { +template +void ColumnChunkReader::_reserve_decompress_buf(size_t size) { if (size > _decompress_buf_size) { _decompress_buf_size = BitUtil::next_power_of_two(size); _decompress_buf = make_unique_buffer(_decompress_buf_size); } } -Status ColumnChunkReader::skip_values(size_t num_values, bool skip_data) { +template +void ColumnChunkReader::_insert_page_into_cache( + const std::vector& level_bytes, const Slice& payload) { + StoragePageCache::CacheKey key( + fmt::format("{}::{}", _stream_reader->path(), _stream_reader->mtime()), + _page_reader->file_end_offset(), _page_reader->header_start_offset()); + const std::vector& header_bytes = _page_reader->header_bytes(); + size_t total = header_bytes.size() + level_bytes.size() + payload.size; + auto* page = new DataPage(total, true, segment_v2::DATA_PAGE); + size_t pos = 0; + memcpy(page->data() + pos, header_bytes.data(), header_bytes.size()); + pos += header_bytes.size(); + if (!level_bytes.empty()) { + memcpy(page->data() + pos, level_bytes.data(), level_bytes.size()); + pos += level_bytes.size(); + } + if (payload.size > 0) { + memcpy(page->data() + pos, payload.data, payload.size); + pos += payload.size; + } + page->reset_size(total); + PageCacheHandle handle; + 
StoragePageCache::instance()->insert(key, page, &handle, segment_v2::DATA_PAGE); + _chunk_statistics.page_cache_write_counter += 1; +} + +template +Status ColumnChunkReader::skip_values(size_t num_values, + bool skip_data) { if (UNLIKELY(_remaining_num_values < num_values)) { return Status::IOError("Skip too many values in current page. {} vs. {}", _remaining_num_values, num_values); } _remaining_num_values -= num_values; if (skip_data) { - SCOPED_RAW_TIMER(&_statistics.decode_value_time); + SCOPED_RAW_TIMER(&_chunk_statistics.decode_value_time); return _page_decoder->skip_values(num_values); } else { return Status::OK(); } } -void ColumnChunkReader::insert_null_values(MutableColumnPtr& doris_column, size_t num_values) { - SCOPED_RAW_TIMER(&_statistics.decode_value_time); - doris_column->insert_many_defaults(num_values); - _remaining_num_values -= num_values; -} - -size_t ColumnChunkReader::get_rep_levels(level_t* levels, size_t n) { - DCHECK_GT(_max_rep_level, 0); - return _rep_level_decoder.get_levels(levels, n); -} - -size_t ColumnChunkReader::get_def_levels(level_t* levels, size_t n) { - DCHECK_GT(_max_def_level, 0); - return _def_level_decoder.get_levels(levels, n); -} - -Status ColumnChunkReader::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) { +template +Status ColumnChunkReader::decode_values( + MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, + bool is_dict_filter) { if (select_vector.num_values() == 0) { return Status::OK(); } - SCOPED_RAW_TIMER(&_statistics.decode_value_time); + SCOPED_RAW_TIMER(&_chunk_statistics.decode_value_time); if (UNLIKELY((doris_column->is_column_dictionary() || is_dict_filter) && !_has_dict)) { return Status::IOError("Not dictionary coded"); } @@ -317,7 +562,159 @@ Status ColumnChunkReader::decode_values(MutableColumnPtr& doris_column, DataType return _page_decoder->decode_values(doris_column, data_type, 
select_vector, is_dict_filter); } -int32_t ColumnChunkReader::_get_type_length() { +template +Status ColumnChunkReader::seek_to_nested_row(size_t left_row) { + if constexpr (OFFSET_INDEX) { + while (true) { + if (_page_reader->start_row() <= left_row && left_row < _page_reader->end_row()) { + break; + } else if (has_next_page()) { + RETURN_IF_ERROR(next_page()); + _current_row = _page_reader->start_row(); + } else [[unlikely]] { + return Status::InternalError("no match seek row {}, current row {}", left_row, + _current_row); + } + }; + + RETURN_IF_ERROR(parse_page_header()); + RETURN_IF_ERROR(load_page_data()); + RETURN_IF_ERROR(_skip_nested_rows_in_page(left_row - _current_row)); + _current_row = left_row; + } else { + while (true) { + RETURN_IF_ERROR(parse_page_header()); + if (_page_reader->is_header_v2()) { + if (_page_reader->start_row() <= left_row && left_row < _page_reader->end_row()) { + RETURN_IF_ERROR(load_page_data()); + // this page contain this row. + RETURN_IF_ERROR(_skip_nested_rows_in_page(left_row - _current_row)); + _current_row = left_row; + break; + } + + _current_row = _page_reader->end_row(); + if (has_next_page()) [[likely]] { + RETURN_IF_ERROR(next_page()); + } else { + return Status::InternalError("no match seek row {}, current row {}", left_row, + _current_row); + } + } else { + RETURN_IF_ERROR(load_page_data()); + std::vector rep_levels; + std::vector def_levels; + bool cross_page = false; + + size_t result_rows = 0; + RETURN_IF_ERROR(load_page_nested_rows(rep_levels, left_row - _current_row, + &result_rows, &cross_page)); + RETURN_IF_ERROR(fill_def(def_levels)); + RETURN_IF_ERROR(skip_nested_values(def_levels)); + bool need_load_next_page = true; + while (cross_page) { + need_load_next_page = false; + rep_levels.clear(); + def_levels.clear(); + RETURN_IF_ERROR(load_cross_page_nested_row(rep_levels, &cross_page)); + RETURN_IF_ERROR(fill_def(def_levels)); + RETURN_IF_ERROR(skip_nested_values(def_levels)); + } + if (left_row == 
_current_row) { + break; + } + if (need_load_next_page) { + if (has_next_page()) [[likely]] { + RETURN_IF_ERROR(next_page()); + } else { + return Status::InternalError("no match seek row {}, current row {}", + left_row, _current_row); + } + } + } + }; + } + + return Status::OK(); +} + +template +Status ColumnChunkReader::_skip_nested_rows_in_page(size_t num_rows) { + if (num_rows == 0) { + return Status::OK(); + } + + std::vector rep_levels; + std::vector def_levels; + + bool cross_page = false; + size_t result_rows = 0; + RETURN_IF_ERROR(load_page_nested_rows(rep_levels, num_rows, &result_rows, &cross_page)); + RETURN_IF_ERROR(fill_def(def_levels)); + RETURN_IF_ERROR(skip_nested_values(def_levels)); + DCHECK(cross_page == false); + if (num_rows != result_rows) [[unlikely]] { + return Status::InternalError("no match skip rows, expect {} vs. real {}", num_rows, + result_rows); + } + return Status::OK(); +} + +template +Status ColumnChunkReader::load_page_nested_rows( + std::vector& rep_levels, size_t max_rows, size_t* result_rows, bool* cross_page) { + if (_state != DATA_LOADED) [[unlikely]] { + return Status::IOError("Should load page data first to load nested rows"); + } + *cross_page = false; + *result_rows = 0; + rep_levels.reserve(rep_levels.size() + _remaining_rep_nums); + while (_remaining_rep_nums) { + level_t rep_level = _rep_level_decoder.get_next(); + if (rep_level == 0) { // rep_level 0 indicates start of new row + if (*result_rows == max_rows) { // this page contain max_rows, page no end. 
+ _current_row += max_rows; + _rep_level_decoder.rewind_one(); + return Status::OK(); + } + (*result_rows)++; + } + _remaining_rep_nums--; + rep_levels.emplace_back(rep_level); + } + _current_row += *result_rows; + + auto need_check_cross_page = [&]() -> bool { + return !OFFSET_INDEX && _remaining_rep_nums == 0 && !_page_reader->is_header_v2() && + has_next_page(); + }; + *cross_page = need_check_cross_page(); + return Status::OK(); +}; + +template +Status ColumnChunkReader::load_cross_page_nested_row( + std::vector& rep_levels, bool* cross_page) { + RETURN_IF_ERROR(next_page()); + RETURN_IF_ERROR(parse_page_header()); + RETURN_IF_ERROR(load_page_data()); + + *cross_page = has_next_page(); + while (_remaining_rep_nums) { + level_t rep_level = _rep_level_decoder.get_next(); + if (rep_level == 0) { // rep_level 0 indicates start of new row + *cross_page = false; + _rep_level_decoder.rewind_one(); + break; + } + _remaining_rep_nums--; + rep_levels.emplace_back(rep_level); + } + return Status::OK(); +} + +template +int32_t ColumnChunkReader::_get_type_length() { switch (_field_schema->physical_type) { case tparquet::Type::INT32: [[fallthrough]]; @@ -355,5 +752,10 @@ bool has_dict_page(const tparquet::ColumnMetaData& column) { column.dictionary_page_offset < column.data_page_offset; } +template class ColumnChunkReader; +template class ColumnChunkReader; +template class ColumnChunkReader; +template class ColumnChunkReader; + #include "common/compile_check_end.h" } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h index f8e182d658387a..c380e99e087a81 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h @@ -34,9 +34,6 @@ #include "vec/exec/format/parquet/parquet_common.h" #include "vparquet_page_reader.h" -namespace cctz { -class time_zone; -} // namespace cctz 
namespace doris { class BlockCompressionCodec; @@ -54,9 +51,53 @@ template class ColumnStr; using ColumnString = ColumnStr; +struct ColumnChunkReaderStatistics { + int64_t decompress_time = 0; + int64_t decompress_cnt = 0; + int64_t decode_header_time = 0; + int64_t decode_value_time = 0; + int64_t decode_dict_time = 0; + int64_t decode_level_time = 0; + int64_t skip_page_header_num = 0; + int64_t parse_page_header_num = 0; + int64_t read_page_header_time = 0; + // page cache metrics + // total pages read (from cache or file) + int64_t page_read_counter = 0; + int64_t page_cache_write_counter = 0; + int64_t page_cache_compressed_write_counter = 0; + int64_t page_cache_decompressed_write_counter = 0; + // number of cache hits (either compressed or decompressed) + int64_t page_cache_hit_counter = 0; + int64_t page_cache_missing_counter = 0; + // per-hit breakdown + int64_t page_cache_compressed_hit_counter = 0; + int64_t page_cache_decompressed_hit_counter = 0; +}; + /** * Read and decode parquet column data into doris block column. - *

Usage:

+ *

Usage:

struct ColumnChunkReaderStatistics { + int64_t decompress_time = 0; + int64_t decompress_cnt = 0; + int64_t decode_header_time = 0; + int64_t decode_value_time = 0; + int64_t decode_dict_time = 0; + int64_t decode_level_time = 0; + int64_t skip_page_header_num = 0; + int64_t parse_page_header_num = 0; + // page cache metrics + // total pages read (from cache or file) + int64_t page_read_counter = 0; + int64_t page_cache_write_counter = 0; + int64_t page_cache_compressed_write_counter = 0; + int64_t page_cache_decompressed_write_counter = 0; + // number of cache hits (either compressed or decompressed) + int64_t page_cache_hit_counter = 0; + // per-hit breakdown + int64_t page_cache_compressed_hit_counter = 0; + int64_t page_cache_decompressed_hit_counter = 0; + }; * // Create chunk reader * ColumnChunkReader chunk_reader(BufferedStreamReader* reader, * tparquet::ColumnChunk* column_chunk, @@ -74,6 +115,7 @@ using ColumnString = ColumnStr; * chunk_reader.decode_values(slice, num_values); * } */ +template class ColumnChunkReader { public: struct Statistics { @@ -90,43 +132,23 @@ class ColumnChunkReader { ColumnChunkReader(io::BufferedStreamReader* reader, tparquet::ColumnChunk* column_chunk, FieldSchema* field_schema, const tparquet::OffsetIndex* offset_index, - const cctz::time_zone* ctz, io::IOContext* io_ctx); + size_t total_row, io::IOContext* io_ctx, + const ParquetPageReadContext& ctx = ParquetPageReadContext()); ~ColumnChunkReader() = default; // Initialize chunk reader, will generate the decoder and codec. Status init(); // Whether the chunk reader has a more page to read. - bool has_next_page() const { return _chunk_parsed_values < _metadata.num_values; } - - // Deprecated - // Seek to the specific page, page_header_offset must be the start offset of the page header. - // _end_offset may exceed the actual data area, so we can only use the number of parsed values - // to determine whether there are remaining pages to read. 
That's to say we can't use the - // PageLocation in parquet metadata to seek to the specified page. We should call next_page() - // and skip_page() to skip pages one by one. - // todo: change this interface to seek_to_page(int64_t page_header_offset, size_t num_parsed_values) - // and set _chunk_parsed_values = num_parsed_values - // [[deprecated]] - void seek_to_page(int64_t page_header_offset) { - _remaining_num_values = 0; - _page_reader->seek_to_page(page_header_offset); - _state = INITIALIZED; - } - - // Seek to next page. Only read and parse the page header. - Status next_page(); - - // Skip current page(will not read and parse) if the page is filtered by predicates. - Status skip_page() { - Status res = Status::OK(); - _remaining_num_values = 0; - if (_state == HEADER_PARSED) { - res = _page_reader->skip_page(); + bool has_next_page() const { + if constexpr (OFFSET_INDEX) { + return _page_reader->has_next_page(); + } else { + // no offset need parse all page header. + return _chunk_parsed_values < _metadata.num_values; } - _state = PAGE_SKIPPED; - return res; } + // Skip some values(will not read and parse) in current page if the values are filtered by predicates. // when skip_data = false, the underlying decoder will not skip data, // only used when maintaining the consistency of _remaining_num_values. @@ -143,16 +165,6 @@ class ColumnChunkReader { } // The remaining number of values in current page(including null values). Decreased when reading or skipping. uint32_t remaining_num_values() const { return _remaining_num_values; } - // null values are generated from definition levels - // the caller should maintain the consistency after analyzing null values from definition levels. - void insert_null_values(MutableColumnPtr& doris_column, size_t num_values); - // Get the raw data of current page. 
- Slice& get_page_data() { return _page_data; } - - // Get the repetition levels - size_t get_rep_levels(level_t* levels, size_t n); - // Get the definition levels - size_t get_def_levels(level_t* levels, size_t n); // Decode values in current page into doris column. Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, @@ -171,12 +183,30 @@ class ColumnChunkReader { // Get page decoder Decoder* get_page_decoder() { return _page_decoder; } - Statistics& statistics() { - _statistics.decode_header_time = _page_reader->statistics().decode_header_time; - _statistics.skip_page_header_num = _page_reader->statistics().skip_page_header_num; - _statistics.parse_page_header_num = _page_reader->statistics().parse_page_header_num; - _statistics.read_page_header_time = _page_reader->statistics().read_page_header_time; - return _statistics; + ColumnChunkReaderStatistics& chunk_statistics() { + _chunk_statistics.decode_header_time = _page_reader->page_statistics().decode_header_time; + _chunk_statistics.skip_page_header_num = + _page_reader->page_statistics().skip_page_header_num; + _chunk_statistics.parse_page_header_num = + _page_reader->page_statistics().parse_page_header_num; + _chunk_statistics.read_page_header_time = + _page_reader->page_statistics().read_page_header_time; + _chunk_statistics.page_read_counter += _page_reader->page_statistics().page_read_counter; + _chunk_statistics.page_cache_write_counter += + _page_reader->page_statistics().page_cache_write_counter; + _chunk_statistics.page_cache_compressed_write_counter += + _page_reader->page_statistics().page_cache_compressed_write_counter; + _chunk_statistics.page_cache_decompressed_write_counter += + _page_reader->page_statistics().page_cache_decompressed_write_counter; + _chunk_statistics.page_cache_hit_counter += + _page_reader->page_statistics().page_cache_hit_counter; + _chunk_statistics.page_cache_missing_counter += + _page_reader->page_statistics().page_cache_missing_counter; + 
_chunk_statistics.page_cache_compressed_hit_counter += + _page_reader->page_statistics().page_cache_compressed_hit_counter; + _chunk_statistics.page_cache_decompressed_hit_counter += + _page_reader->page_statistics().page_cache_decompressed_hit_counter; + return _chunk_statistics; } Status read_dict_values_to_column(MutableColumnPtr& doris_column) { @@ -189,33 +219,80 @@ class ColumnChunkReader { ->convert_dict_column_to_string_column(dict_column); } + size_t page_start_row() const { return _page_reader->start_row(); } + + size_t page_end_row() const { return _page_reader->end_row(); } + + Status parse_page_header(); + Status next_page(); + + Status seek_to_nested_row(size_t left_row); + Status skip_nested_values(const std::vector& def_levels); + Status fill_def(std::vector& def_values) { + auto before_sz = def_values.size(); + auto append_sz = _remaining_def_nums - _remaining_rep_nums; + def_values.resize(before_sz + append_sz, 0); + if (max_def_level() != 0) { + auto ptr = def_values.data() + before_sz; + _def_level_decoder.get_levels(ptr, append_sz); + } + _remaining_def_nums -= append_sz; + return Status::OK(); + } + + Status load_page_nested_rows(std::vector& rep_levels, size_t max_rows, + size_t* result_rows, bool* cross_page); + Status load_cross_page_nested_row(std::vector& rep_levels, bool* cross_page); + + // Test helpers / accessors + Slice get_page_data() const { return _page_data; } + const Slice& v2_rep_levels() const { return _v2_rep_levels; } + const Slice& v2_def_levels() const { return _v2_def_levels; } + ColumnChunkReaderStatistics& statistics() { return chunk_statistics(); } + private: enum ColumnChunkReaderState { NOT_INIT, INITIALIZED, HEADER_PARSED, DATA_LOADED, PAGE_SKIPPED }; + // for check dict page. 
+ Status _parse_first_page_header(); Status _decode_dict_page(); + void _reserve_decompress_buf(size_t size); int32_t _get_type_length(); + void _insert_page_into_cache(const std::vector& level_bytes, const Slice& payload); + void _get_uncompressed_levels(const tparquet::DataPageHeaderV2& page_v2, Slice& page_data); + Status _skip_nested_rows_in_page(size_t num_rows); ColumnChunkReaderState _state = NOT_INIT; FieldSchema* _field_schema = nullptr; - level_t _max_rep_level; - level_t _max_def_level; - tparquet::LogicalType _parquet_logical_type; + const level_t _max_rep_level; + const level_t _max_def_level; io::BufferedStreamReader* _stream_reader = nullptr; tparquet::ColumnMetaData _metadata; - const tparquet::OffsetIndex* _offset_index; - // cctz::time_zone* _ctz; + const tparquet::OffsetIndex* _offset_index = nullptr; + size_t _current_row = 0; + size_t _total_rows = 0; io::IOContext* _io_ctx = nullptr; - std::unique_ptr _page_reader; + std::unique_ptr> _page_reader; BlockCompressionCodec* _block_compress_codec = nullptr; + // Session-level parquet page cache options + ParquetPageReadContext _ctx; + LevelDecoder _rep_level_decoder; LevelDecoder _def_level_decoder; size_t _chunk_parsed_values = 0; + // this page remaining rep/def nums + // if max_rep_level = 0 / max_def_level = 0, this value retail hava value. + uint32_t _remaining_rep_nums = 0; + uint32_t _remaining_def_nums = 0; + // this page remaining values to be processed (for read/skip). + // need parse this page header. uint32_t _remaining_num_values = 0; + Slice _page_data; DorisUniqueBufferPtr _decompress_buf; size_t _decompress_buf_size = 0; @@ -227,7 +304,7 @@ class ColumnChunkReader { // Map: encoding -> Decoder // Plain or Dictionary encoding. 
If the dictionary grows too big, the encoding will fall back to the plain encoding std::unordered_map> _decoders; - Statistics _statistics; + ColumnChunkReaderStatistics _chunk_statistics; }; #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index 71bfdce85c9828..12c6573ea74d88 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -108,16 +108,19 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field, const tparquet::RowGroup& row_group, const RowRanges& row_ranges, const cctz::time_zone* ctz, io::IOContext* io_ctx, std::unique_ptr& reader, - size_t max_buf_size, const tparquet::OffsetIndex* offset_index, - const std::set& column_ids, - const std::set& filter_column_ids) { + size_t max_buf_size, + std::unordered_map& col_offsets, + bool in_collection, const std::set& column_ids, + const std::set& filter_column_ids, RuntimeState* state, + const std::string& created_by) { + size_t total_rows = row_group.num_rows; if (field->data_type->get_primitive_type() == TYPE_ARRAY) { std::unique_ptr element_reader; RETURN_IF_ERROR(create(file, &field->children[0], row_group, row_ranges, ctz, io_ctx, - element_reader, max_buf_size, nullptr, column_ids, - filter_column_ids)); - element_reader->set_nested_column(); - auto array_reader = ArrayColumnReader::create_unique(row_ranges, ctz, io_ctx); + element_reader, max_buf_size, col_offsets, true, column_ids, + filter_column_ids, state, created_by)); + // element_reader->set_nested_column(); + auto array_reader = ArrayColumnReader::create_unique(row_ranges, total_rows, ctz, io_ctx); RETURN_IF_ERROR(array_reader->init(std::move(element_reader), field)); array_reader->_filter_column_ids = filter_column_ids; reader.reset(array_reader.release()); @@ -129,12 +132,12 @@ Status 
ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field, column_ids.find(field->children[0].get_column_id()) != column_ids.end()) { // Create key reader RETURN_IF_ERROR(create(file, &field->children[0], row_group, row_ranges, ctz, io_ctx, - key_reader, max_buf_size, nullptr, column_ids, - filter_column_ids)); - key_reader->set_nested_column(); + key_reader, max_buf_size, col_offsets, true, column_ids, + filter_column_ids, state, created_by)); + // key_reader->set_nested_column(); } else { - auto skip_reader = std::make_unique(row_ranges, ctz, io_ctx, - &field->children[0]); + auto skip_reader = std::make_unique(row_ranges, total_rows, ctz, + io_ctx, &field->children[0]); key_reader = std::move(skip_reader); } @@ -142,16 +145,16 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field, column_ids.find(field->children[1].get_column_id()) != column_ids.end()) { // Create value reader RETURN_IF_ERROR(create(file, &field->children[1], row_group, row_ranges, ctz, io_ctx, - value_reader, max_buf_size, nullptr, column_ids, - filter_column_ids)); - value_reader->set_nested_column(); + value_reader, max_buf_size, col_offsets, true, column_ids, + filter_column_ids, state, created_by)); + // value_reader->set_nested_column(); } else { - auto skip_reader = std::make_unique(row_ranges, ctz, io_ctx, - &field->children[0]); + auto skip_reader = std::make_unique(row_ranges, total_rows, ctz, + io_ctx, &field->children[0]); value_reader = std::move(skip_reader); } - auto map_reader = MapColumnReader::create_unique(row_ranges, ctz, io_ctx); + auto map_reader = MapColumnReader::create_unique(row_ranges, total_rows, ctz, io_ctx); RETURN_IF_ERROR(map_reader->init(std::move(key_reader), std::move(value_reader), field)); map_reader->_filter_column_ids = filter_column_ids; reader.reset(map_reader.release()); @@ -164,17 +167,17 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field, std::unique_ptr child_reader; if 
(column_ids.empty() || column_ids.find(child.get_column_id()) != column_ids.end()) { RETURN_IF_ERROR(create(file, &child, row_group, row_ranges, ctz, io_ctx, - child_reader, max_buf_size, nullptr, column_ids, - filter_column_ids)); - child_reader->set_nested_column(); + child_reader, max_buf_size, col_offsets, in_collection, + column_ids, filter_column_ids)); + // child_reader->set_nested_column(); child_readers[child.name] = std::move(child_reader); // Record the first non-SkippingReader if (non_skip_reader_idx == -1) { non_skip_reader_idx = i; } } else { - auto skip_reader = - std::make_unique(row_ranges, ctz, io_ctx, &child); + auto skip_reader = std::make_unique(row_ranges, total_rows, ctz, + io_ctx, &child); skip_reader->_filter_column_ids = filter_column_ids; child_readers[child.name] = std::move(skip_reader); } @@ -183,22 +186,57 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field, if (non_skip_reader_idx == -1) { std::unique_ptr child_reader; RETURN_IF_ERROR(create(file, &field->children[0], row_group, row_ranges, ctz, io_ctx, - child_reader, max_buf_size, nullptr, column_ids, - filter_column_ids)); - child_reader->set_nested_column(); + child_reader, max_buf_size, col_offsets, in_collection, + column_ids, filter_column_ids)); + // child_reader->set_nested_column(); child_readers[field->children[0].name] = std::move(child_reader); } - auto struct_reader = StructColumnReader::create_unique(row_ranges, ctz, io_ctx); + auto struct_reader = StructColumnReader::create_unique(row_ranges, total_rows, ctz, io_ctx); RETURN_IF_ERROR(struct_reader->init(std::move(child_readers), field)); struct_reader->_filter_column_ids = filter_column_ids; reader.reset(struct_reader.release()); } else { - const tparquet::ColumnChunk& chunk = row_group.columns[field->physical_column_index]; - auto scalar_reader = - ScalarColumnReader::create_unique(row_ranges, chunk, offset_index, ctz, io_ctx); - RETURN_IF_ERROR(scalar_reader->init(file, field, 
max_buf_size)); - scalar_reader->_filter_column_ids = filter_column_ids; - reader.reset(scalar_reader.release()); + auto physical_index = field->physical_column_index; + const tparquet::OffsetIndex* offset_index = + col_offsets.find(physical_index) != col_offsets.end() ? &col_offsets[physical_index] + : nullptr; + + const tparquet::ColumnChunk& chunk = row_group.columns[physical_index]; + + // ScalarColumnReader::create_unique(row_ranges, total_rows, chunk, offset_index, ctz, io_ctx); + if (in_collection) { + if (offset_index == nullptr) { + auto scalar_reader = ScalarColumnReader::create_unique( + row_ranges, total_rows, chunk, offset_index, ctz, io_ctx); + + RETURN_IF_ERROR(scalar_reader->init(file, field, max_buf_size, state, created_by)); + scalar_reader->_filter_column_ids = filter_column_ids; + reader.reset(scalar_reader.release()); + } else { + auto scalar_reader = ScalarColumnReader::create_unique( + row_ranges, total_rows, chunk, offset_index, ctz, io_ctx); + + RETURN_IF_ERROR(scalar_reader->init(file, field, max_buf_size, state, created_by)); + scalar_reader->_filter_column_ids = filter_column_ids; + reader.reset(scalar_reader.release()); + } + } else { + if (offset_index == nullptr) { + auto scalar_reader = ScalarColumnReader::create_unique( + row_ranges, total_rows, chunk, offset_index, ctz, io_ctx); + + RETURN_IF_ERROR(scalar_reader->init(file, field, max_buf_size, state, created_by)); + scalar_reader->_filter_column_ids = filter_column_ids; + reader.reset(scalar_reader.release()); + } else { + auto scalar_reader = ScalarColumnReader::create_unique( + row_ranges, total_rows, chunk, offset_index, ctz, io_ctx); + + RETURN_IF_ERROR(scalar_reader->init(file, field, max_buf_size, state, created_by)); + scalar_reader->_filter_column_ids = filter_column_ids; + reader.reset(scalar_reader.release()); + } + } } return Status::OK(); } @@ -206,13 +244,15 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field, void 
ParquetColumnReader::_generate_read_ranges(RowRange page_row_range, RowRanges* result_ranges) const { result_ranges->add(page_row_range); - if (_nested_column) { - return; - } RowRanges::ranges_intersection(*result_ranges, _row_ranges, result_ranges); } -Status ScalarColumnReader::init(io::FileReaderSPtr file, FieldSchema* field, size_t max_buf_size) { +template +Status ScalarColumnReader::init(io::FileReaderSPtr file, + FieldSchema* field, + size_t max_buf_size, + RuntimeState* state, + const std::string& created_by) { _field_schema = field; auto& chunk_meta = _chunk_meta.meta_data; int64_t chunk_start = has_dict_page(chunk_meta) ? chunk_meta.dictionary_page_offset @@ -228,13 +268,19 @@ Status ScalarColumnReader::init(io::FileReaderSPtr file, FieldSchema* field, siz } _stream_reader = std::make_unique(file, chunk_start, chunk_len, prefetch_buffer_size); - _chunk_reader = std::make_unique(_stream_reader.get(), &_chunk_meta, field, - _offset_index, _ctz, _io_ctx); + // Build Parquet page read context: enable_parquet_file_page_cache from session, others from BE config + ParquetPageReadContext ctx( + (state == nullptr) ? 
true : state->query_options().enable_parquet_file_page_cache, + created_by); + + _chunk_reader = std::make_unique>( + _stream_reader.get(), &_chunk_meta, field, _offset_index, _total_rows, _io_ctx, ctx); RETURN_IF_ERROR(_chunk_reader->init()); return Status::OK(); } -Status ScalarColumnReader::_skip_values(size_t num_values) { +template +Status ScalarColumnReader::_skip_values(size_t num_values) { if (num_values == 0) { return Status::OK(); } @@ -257,7 +303,7 @@ Status ScalarColumnReader::_skip_values(size_t num_values) { LOG(WARNING) << ss.str(); return Status::InternalError("Failed to decode definition level."); } - if (def_level == 0) { + if (def_level < _field_schema->definition_level) { null_size += loop_skip; } else { nonnull_size += loop_skip; @@ -276,9 +322,12 @@ Status ScalarColumnReader::_skip_values(size_t num_values) { return Status::OK(); } -Status ScalarColumnReader::_read_values(size_t num_values, ColumnPtr& doris_column, - DataTypePtr& type, FilterMap& filter_map, - bool is_dict_filter) { +template +Status ScalarColumnReader::_read_values(size_t num_values, + ColumnPtr& doris_column, + DataTypePtr& type, + FilterMap& filter_map, + bool is_dict_filter) { if (num_values == 0) { return Status::OK(); } @@ -312,7 +361,11 @@ Status ScalarColumnReader::_read_values(size_t num_values, ColumnPtr& doris_colu LOG(WARNING) << ss.str(); return Status::InternalError("Failed to decode definition level."); } - bool is_null = def_level == 0; + + for (int i = 0; i < loop_read; i++) { + _def_levels.emplace_back(def_level); + } + bool is_null = def_level < _field_schema->definition_level; if (!(prev_is_null ^ is_null)) { null_map.emplace_back(0); } @@ -326,11 +379,14 @@ Status ScalarColumnReader::_read_values(size_t num_values, ColumnPtr& doris_colu prev_is_null = is_null; has_read += loop_read; } + } else { + _def_levels.resize(_def_levels.size() + num_values, 0); } } else { if (_chunk_reader->max_def_level() > 0) { return Status::Corruption("Not nullable column has 
null values in parquet file"); } + _def_levels.resize(_def_levels.size() + num_values, 0); data_column = doris_column->assume_mutable(); } if (null_map.size() == 0) { @@ -357,93 +413,15 @@ Status ScalarColumnReader::_read_values(size_t num_values, ColumnPtr& doris_colu * A row of complex type may be stored across two(or more) pages, and the parameter `align_rows` indicates that * whether the reader should read the remaining value of the last row in previous page. */ -Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataTypePtr& type, - FilterMap& filter_map, size_t batch_size, - size_t* read_rows, bool* eof, bool is_dict_filter, - bool align_rows) { - std::unique_ptr nested_filter_map; - - FilterMap* current_filter_map = &filter_map; - size_t origin_size = 0; - if (align_rows) { - origin_size = _rep_levels.size(); - // just read the remaining values of the last row in previous page, - // so there's no a new row should be read. - batch_size = 0; - /* - * Since the function is repeatedly called to fetch data for the batch size, - * it causes `_rep_levels.resize(0); _def_levels.resize(0);`, resulting in the - * definition and repetition levels of the reader only containing the latter - * part of the batch (i.e., missing some parts). Therefore, when using the - * definition and repetition levels to fill the null_map for structs and maps, - * the function should not be called multiple times before filling. - * todo: - * We may need to consider reading the entire batch of data at once, as this approach - * would be more user-friendly in terms of function usage. However, we must consider that if the - * data spans multiple pages, memory usage may increase significantly. 
- */ - } else { - _rep_levels.resize(0); - _def_levels.resize(0); - if (_nested_filter_map_data) { - _nested_filter_map_data->resize(0); - } - } - size_t parsed_rows = 0; - size_t remaining_values = _chunk_reader->remaining_num_values(); - bool has_rep_level = _chunk_reader->max_rep_level() > 0; - bool has_def_level = _chunk_reader->max_def_level() > 0; - - // Handle repetition levels (indicates nesting structure) - if (has_rep_level) { - LevelDecoder& rep_decoder = _chunk_reader->rep_level_decoder(); - // Read repetition levels until batch is full or no more values - while (parsed_rows <= batch_size && remaining_values > 0) { - level_t rep_level = rep_decoder.get_next(); - if (rep_level == 0) { // rep_level 0 indicates start of new row - if (parsed_rows == batch_size) { - rep_decoder.rewind_one(); - break; - } - parsed_rows++; - } - _rep_levels.emplace_back(rep_level); - remaining_values--; - } - - // Generate nested filter map - if (filter_map.has_filter() && (!filter_map.filter_all())) { - if (_nested_filter_map_data == nullptr) { - _nested_filter_map_data.reset(new std::vector()); - } - RETURN_IF_ERROR(filter_map.generate_nested_filter_map( - _rep_levels, *_nested_filter_map_data, &nested_filter_map, - &_orig_filter_map_index, origin_size)); - // Update current_filter_map to nested_filter_map - current_filter_map = nested_filter_map.get(); - } - } else if (!align_rows) { - // case : required child columns in struct type - parsed_rows = std::min(remaining_values, batch_size); - remaining_values -= parsed_rows; - _rep_levels.resize(parsed_rows, 0); - } - - // Process definition levels (indicates null values) - size_t parsed_values = _chunk_reader->remaining_num_values() - remaining_values; - _def_levels.resize(origin_size + parsed_values); - if (has_def_level) { - // if parsed_values is 0, we don't need to decode levels - if (parsed_values != 0) { - _chunk_reader->def_level_decoder().get_levels(&_def_levels[origin_size], parsed_values); - } - } else { - 
std::fill(_def_levels.begin() + origin_size, _def_levels.end(), 0); - } +template +Status ScalarColumnReader::_read_nested_column( + ColumnPtr& doris_column, DataTypePtr& type, FilterMap& filter_map, size_t batch_size, + size_t* read_rows, bool* eof, bool is_dict_filter) { + _rep_levels.clear(); + _def_levels.clear(); // Handle nullable columns MutableColumnPtr data_column; - std::vector null_map; NullMap* map_data_column = nullptr; if (doris_column->is_nullable()) { SCOPED_RAW_TIMER(&_decode_null_map_time); @@ -460,138 +438,86 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType data_column = doris_column->assume_mutable(); } - // Process definition levels to build null map - size_t has_read = origin_size; - size_t ancestor_nulls = 0; - size_t null_size = 0; - size_t nonnull_size = 0; - null_map.emplace_back(0); - bool prev_is_null = false; + std::vector null_map; std::unordered_set ancestor_null_indices; + std::vector nested_filter_map_data; - while (has_read < origin_size + parsed_values) { - level_t def_level = _def_levels[has_read++]; - size_t loop_read = 1; - while (has_read < origin_size + parsed_values && _def_levels[has_read] == def_level) { - has_read++; - loop_read++; + auto read_and_fill_data = [&](size_t before_rep_level_sz, size_t filter_map_index) { + RETURN_IF_ERROR(_chunk_reader->fill_def(_def_levels)); + std::unique_ptr nested_filter_map = std::make_unique(); + if (filter_map.has_filter()) { + RETURN_IF_ERROR(gen_filter_map(filter_map, filter_map_index, before_rep_level_sz, + _rep_levels.size(), nested_filter_map_data, + &nested_filter_map)); } - if (def_level < _field_schema->repeated_parent_def_level) { - for (size_t i = 0; i < loop_read; i++) { - ancestor_null_indices.insert(has_read - loop_read + i); - } - ancestor_nulls += loop_read; - continue; - } - - bool is_null = def_level < _field_schema->definition_level; - if (is_null) { - null_size += loop_read; - } else { - nonnull_size += loop_read; - } + 
null_map.clear(); + ancestor_null_indices.clear(); + RETURN_IF_ERROR(gen_nested_null_map(before_rep_level_sz, _rep_levels.size(), null_map, + ancestor_null_indices)); - if (prev_is_null == is_null && (USHRT_MAX - null_map.back() >= loop_read)) { - null_map.back() += loop_read; - } else { - if (!(prev_is_null ^ is_null)) { - null_map.emplace_back(0); - } - size_t remaining = loop_read; - while (remaining > USHRT_MAX) { - null_map.emplace_back(USHRT_MAX); - null_map.emplace_back(0); - remaining -= USHRT_MAX; - } - null_map.emplace_back((u_short)remaining); - prev_is_null = is_null; - } - } - - size_t num_values = parsed_values - ancestor_nulls; - - // Handle filtered values - if (current_filter_map->filter_all()) { - // Skip all values if everything is filtered - if (null_size > 0) { - RETURN_IF_ERROR(_chunk_reader->skip_values(null_size, false)); - } - if (nonnull_size > 0) { - RETURN_IF_ERROR(_chunk_reader->skip_values(nonnull_size, true)); - } - if (ancestor_nulls != 0) { - RETURN_IF_ERROR(_chunk_reader->skip_values(ancestor_nulls, false)); - } - } else { ColumnSelectVector select_vector; { SCOPED_RAW_TIMER(&_decode_null_map_time); - RETURN_IF_ERROR( - select_vector.init(null_map, num_values, map_data_column, current_filter_map, - _nested_filter_map_data ? 
origin_size : _filter_map_index, - &ancestor_null_indices)); + RETURN_IF_ERROR(select_vector.init( + null_map, + _rep_levels.size() - before_rep_level_sz - ancestor_null_indices.size(), + map_data_column, nested_filter_map.get(), 0, &ancestor_null_indices)); } RETURN_IF_ERROR( _chunk_reader->decode_values(data_column, type, select_vector, is_dict_filter)); - if (ancestor_nulls != 0) { - RETURN_IF_ERROR(_chunk_reader->skip_values(ancestor_nulls, false)); - } - } - *read_rows += parsed_rows; - _filter_map_index += parsed_values; - - // Handle cross-page reading - if (_chunk_reader->remaining_num_values() == 0) { - if (_chunk_reader->has_next_page()) { - RETURN_IF_ERROR(_chunk_reader->next_page()); - RETURN_IF_ERROR(_chunk_reader->load_page_data()); - return _read_nested_column(doris_column, type, filter_map, 0, read_rows, eof, - is_dict_filter, true); - } else { - *eof = true; - } - } - - // Apply filtering to repetition and definition levels - if (current_filter_map->has_filter()) { - if (current_filter_map->filter_all()) { - _rep_levels.resize(0); - _def_levels.resize(0); - } else { - std::vector filtered_rep_levels; - std::vector filtered_def_levels; - filtered_rep_levels.reserve(_rep_levels.size()); - filtered_def_levels.reserve(_def_levels.size()); - - const uint8_t* filter_map_data = current_filter_map->filter_map_data(); - - for (size_t i = 0; i < _rep_levels.size(); i++) { - if (filter_map_data[i]) { - filtered_rep_levels.push_back(_rep_levels[i]); - filtered_def_levels.push_back(_def_levels[i]); + if (ancestor_null_indices.size() != 0) { + RETURN_IF_ERROR(_chunk_reader->skip_values(ancestor_null_indices.size(), false)); + } + if (filter_map.has_filter()) { + auto new_rep_sz = before_rep_level_sz; + for (size_t idx = before_rep_level_sz; idx < _rep_levels.size(); idx++) { + if (nested_filter_map_data[idx - before_rep_level_sz]) { + _rep_levels[new_rep_sz] = _rep_levels[idx]; + _def_levels[new_rep_sz] = _def_levels[idx]; + new_rep_sz++; } } - - _rep_levels = 
std::move(filtered_rep_levels); - _def_levels = std::move(filtered_def_levels); + _rep_levels.resize(new_rep_sz); + _def_levels.resize(new_rep_sz); + } + return Status::OK(); + }; + + while (_current_range_idx < _row_ranges.range_size()) { + size_t left_row = + std::max(_current_row_index, _row_ranges.get_range_from(_current_range_idx)); + size_t right_row = std::min(left_row + batch_size - *read_rows, + (size_t)_row_ranges.get_range_to(_current_range_idx)); + _current_row_index = left_row; + RETURN_IF_ERROR(_chunk_reader->seek_to_nested_row(left_row)); + size_t load_rows = 0; + bool cross_page = false; + size_t before_rep_level_sz = _rep_levels.size(); + RETURN_IF_ERROR(_chunk_reader->load_page_nested_rows(_rep_levels, right_row - left_row, + &load_rows, &cross_page)); + RETURN_IF_ERROR(read_and_fill_data(before_rep_level_sz, _filter_map_index)); + _filter_map_index += load_rows; + while (cross_page) { + before_rep_level_sz = _rep_levels.size(); + RETURN_IF_ERROR(_chunk_reader->load_cross_page_nested_row(_rep_levels, &cross_page)); + RETURN_IF_ERROR(read_and_fill_data(before_rep_level_sz, _filter_map_index - 1)); + } + *read_rows += load_rows; + _current_row_index += load_rows; + _current_range_idx += (_current_row_index == _row_ranges.get_range_to(_current_range_idx)); + if (*read_rows == batch_size) { + break; } } - - // Prepare for next row - ++_orig_filter_map_index; - - if (_rep_levels.size() > 0) { - // make sure the rows of complex type are aligned correctly, - // so the repetition level of first element should be 0, meaning a new row is started. 
- DCHECK_EQ(_rep_levels[0], 0); - } + *eof = _current_range_idx == _row_ranges.range_size(); return Status::OK(); } -Status ScalarColumnReader::read_dict_values_to_column(MutableColumnPtr& doris_column, - bool* has_dict) { +template +Status ScalarColumnReader::read_dict_values_to_column( + MutableColumnPtr& doris_column, bool* has_dict) { bool loaded; RETURN_IF_ERROR(_try_load_dict_page(&loaded, has_dict)); if (loaded && *has_dict) { @@ -599,28 +525,24 @@ Status ScalarColumnReader::read_dict_values_to_column(MutableColumnPtr& doris_co } return Status::OK(); } - -MutableColumnPtr ScalarColumnReader::convert_dict_column_to_string_column( +template +MutableColumnPtr +ScalarColumnReader::convert_dict_column_to_string_column( const ColumnInt32* dict_column) { return _chunk_reader->convert_dict_column_to_string_column(dict_column); } -Status ScalarColumnReader::_try_load_dict_page(bool* loaded, bool* has_dict) { - *loaded = false; - *has_dict = false; - if (_chunk_reader->remaining_num_values() == 0) { - if (!_chunk_reader->has_next_page()) { - *loaded = false; - return Status::OK(); - } - RETURN_IF_ERROR(_chunk_reader->next_page()); - *loaded = true; - *has_dict = _chunk_reader->has_dict(); - } +template +Status ScalarColumnReader::_try_load_dict_page(bool* loaded, + bool* has_dict) { + // _chunk_reader init will load first page header to check whether has dict page + *loaded = true; + *has_dict = _chunk_reader->has_dict(); return Status::OK(); } -Status ScalarColumnReader::read_column_data( +template +Status ScalarColumnReader::read_column_data( ColumnPtr& doris_column, const DataTypePtr& type, const std::shared_ptr& root_node, FilterMap& filter_map, size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter, @@ -642,33 +564,33 @@ Status ScalarColumnReader::read_column_data( doris_column, type, is_dict_filter); DataTypePtr& resolved_type = _converter->get_physical_type(); - do { - if (_chunk_reader->remaining_num_values() == 0) { - if 
(!_chunk_reader->has_next_page()) { - *eof = true; - *read_rows = 0; - return Status::OK(); - } - RETURN_IF_ERROR(_chunk_reader->next_page()); - } - if (_nested_column) { - RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); - RETURN_IF_ERROR(_read_nested_column(resolved_column, resolved_type, filter_map, - batch_size, read_rows, eof, is_dict_filter, false)); - break; - } + _def_levels.clear(); + _rep_levels.clear(); + *read_rows = 0; + + if constexpr (IN_COLLECTION) { + RETURN_IF_ERROR(_read_nested_column(resolved_column, resolved_type, filter_map, batch_size, + read_rows, eof, is_dict_filter)); + return _converter->convert(resolved_column, _field_schema->data_type, type, doris_column, + is_dict_filter); + } + int64_t right_row = 0; + if constexpr (OFFSET_INDEX == false) { + RETURN_IF_ERROR(_chunk_reader->parse_page_header()); + right_row = _chunk_reader->page_end_row(); + } else { + right_row = _chunk_reader->page_end_row(); + } + auto before_filter_map_index = _filter_map_index; + + do { // generate the row ranges that should be read RowRanges read_ranges; - _generate_read_ranges(RowRange {_current_row_index, - _current_row_index + _chunk_reader->remaining_num_values()}, - &read_ranges); - + _generate_read_ranges(RowRange {_current_row_index, right_row}, &read_ranges); if (read_ranges.count() == 0) { // skip the whole page - _current_row_index += _chunk_reader->remaining_num_values(); - RETURN_IF_ERROR(_chunk_reader->skip_page()); - *read_rows = 0; + _current_row_index = right_row; } else { bool skip_whole_batch = false; // Determining whether to skip page or batch will increase the calculation time. 
@@ -680,12 +602,8 @@ Status ScalarColumnReader::read_column_data( filter_map.can_filter_all(remaining_num_values, _filter_map_index)) { // We can skip the whole page if the remaining values are filtered by predicate columns _filter_map_index += remaining_num_values; - _current_row_index += _chunk_reader->remaining_num_values(); - RETURN_IF_ERROR(_chunk_reader->skip_page()); + _current_row_index = right_row; *read_rows = remaining_num_values; - if (!_chunk_reader->has_next_page()) { - *eof = true; - } break; } skip_whole_batch = batch_size <= remaining_num_values && @@ -695,6 +613,7 @@ Status ScalarColumnReader::read_column_data( } } // load page data to decode or skip values + RETURN_IF_ERROR(_chunk_reader->parse_page_header()); RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); size_t has_read = 0; for (size_t idx = 0; idx < read_ranges.range_size(); idx++) { @@ -713,18 +632,34 @@ Status ScalarColumnReader::read_column_data( filter_map, is_dict_filter)); } has_read += read_values; + *read_rows += read_values; _current_row_index += read_values; if (has_read == batch_size) { break; } } - *read_rows = has_read; } + } while (false); - if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) { + if (right_row == _current_row_index) { + if (!_chunk_reader->has_next_page()) { *eof = true; + } else { + RETURN_IF_ERROR(_chunk_reader->next_page()); } - } while (false); + } + + if (filter_map.has_filter()) { + size_t new_rep_sz = 0; + for (size_t idx = before_filter_map_index; idx < _filter_map_index; idx++) { + if (filter_map.filter_map_data()[idx]) { + _def_levels[new_rep_sz] = _def_levels[idx - before_filter_map_index]; + new_rep_sz++; + } + } + _def_levels.resize(new_rep_sz); + } + _rep_levels.resize(_def_levels.size(), 0); return _converter->convert(resolved_column, _field_schema->data_type, type, doris_column, is_dict_filter); @@ -955,7 +890,7 @@ Status StructColumnReader::read_column_data( field_rows += loop_rows; } 
DCHECK_EQ(*read_rows, field_rows); - DCHECK_EQ(*eof, field_eof); + // DCHECK_EQ(*eof, field_eof); } } @@ -1076,6 +1011,12 @@ Status StructColumnReader::read_column_data( } return Status::OK(); } + +template class ScalarColumnReader; +template class ScalarColumnReader; +template class ScalarColumnReader; +template class ScalarColumnReader; + #include "common/compile_check_end.h" }; // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_reader.h index c318d559b8ae29..8bcd9712716962 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.h @@ -53,12 +53,9 @@ using ColumnString = ColumnStr; class ParquetColumnReader { public: - struct Statistics { - Statistics() - : read_time(0), - read_calls(0), - page_index_read_calls(0), - read_bytes(0), + struct ColumnStatistics { + ColumnStatistics() + : page_index_read_calls(0), decompress_time(0), decompress_cnt(0), decode_header_time(0), @@ -68,14 +65,18 @@ class ParquetColumnReader { decode_null_map_time(0), skip_page_header_num(0), parse_page_header_num(0), - read_page_header_time(0) {} - - Statistics(io::BufferedStreamReader::Statistics& fs, ColumnChunkReader::Statistics& cs, - int64_t null_map_time) - : read_time(fs.read_time), - read_calls(fs.read_calls), - page_index_read_calls(0), - read_bytes(fs.read_bytes), + read_page_header_time(0), + page_read_counter(0), + page_cache_write_counter(0), + page_cache_compressed_write_counter(0), + page_cache_decompressed_write_counter(0), + page_cache_hit_counter(0), + page_cache_missing_counter(0), + page_cache_compressed_hit_counter(0), + page_cache_decompressed_hit_counter(0) {} + + ColumnStatistics(ColumnChunkReaderStatistics& cs, int64_t null_map_time) + : page_index_read_calls(0), decompress_time(cs.decompress_time), decompress_cnt(cs.decompress_cnt), decode_header_time(cs.decode_header_time), @@ -85,12 +86,17 @@ 
class ParquetColumnReader { decode_null_map_time(null_map_time), skip_page_header_num(cs.skip_page_header_num), parse_page_header_num(cs.parse_page_header_num), - read_page_header_time(cs.read_page_header_time) {} + read_page_header_time(cs.read_page_header_time), + page_read_counter(cs.page_read_counter), + page_cache_write_counter(cs.page_cache_write_counter), + page_cache_compressed_write_counter(cs.page_cache_compressed_write_counter), + page_cache_decompressed_write_counter(cs.page_cache_decompressed_write_counter), + page_cache_hit_counter(cs.page_cache_hit_counter), + page_cache_missing_counter(cs.page_cache_missing_counter), + page_cache_compressed_hit_counter(cs.page_cache_compressed_hit_counter), + page_cache_decompressed_hit_counter(cs.page_cache_decompressed_hit_counter) {} - int64_t read_time; - int64_t read_calls; int64_t page_index_read_calls; - int64_t read_bytes; int64_t decompress_time; int64_t decompress_cnt; int64_t decode_header_time; @@ -101,28 +107,44 @@ class ParquetColumnReader { int64_t skip_page_header_num; int64_t parse_page_header_num; int64_t read_page_header_time; - - void merge(Statistics& statistics) { - read_time += statistics.read_time; - read_calls += statistics.read_calls; - read_bytes += statistics.read_bytes; - page_index_read_calls += statistics.page_index_read_calls; - decompress_time += statistics.decompress_time; - decompress_cnt += statistics.decompress_cnt; - decode_header_time += statistics.decode_header_time; - decode_value_time += statistics.decode_value_time; - decode_dict_time += statistics.decode_dict_time; - decode_level_time += statistics.decode_level_time; - decode_null_map_time += statistics.decode_null_map_time; - skip_page_header_num += statistics.skip_page_header_num; - parse_page_header_num += statistics.parse_page_header_num; - read_page_header_time += statistics.read_page_header_time; + int64_t page_read_counter; + int64_t page_cache_write_counter; + int64_t page_cache_compressed_write_counter; + int64_t 
page_cache_decompressed_write_counter; + int64_t page_cache_hit_counter; + int64_t page_cache_missing_counter; + int64_t page_cache_compressed_hit_counter; + int64_t page_cache_decompressed_hit_counter; + + void merge(ColumnStatistics& col_statistics) { + page_index_read_calls += col_statistics.page_index_read_calls; + decompress_time += col_statistics.decompress_time; + decompress_cnt += col_statistics.decompress_cnt; + decode_header_time += col_statistics.decode_header_time; + decode_value_time += col_statistics.decode_value_time; + decode_dict_time += col_statistics.decode_dict_time; + decode_level_time += col_statistics.decode_level_time; + decode_null_map_time += col_statistics.decode_null_map_time; + skip_page_header_num += col_statistics.skip_page_header_num; + parse_page_header_num += col_statistics.parse_page_header_num; + read_page_header_time += col_statistics.read_page_header_time; + page_read_counter += col_statistics.page_read_counter; + page_cache_write_counter += col_statistics.page_cache_write_counter; + page_cache_compressed_write_counter += + col_statistics.page_cache_compressed_write_counter; + page_cache_decompressed_write_counter += + col_statistics.page_cache_decompressed_write_counter; + page_cache_hit_counter += col_statistics.page_cache_hit_counter; + page_cache_missing_counter += col_statistics.page_cache_missing_counter; + page_cache_compressed_hit_counter += col_statistics.page_cache_compressed_hit_counter; + page_cache_decompressed_hit_counter += + col_statistics.page_cache_decompressed_hit_counter; } }; - ParquetColumnReader(const RowRanges& row_ranges, const cctz::time_zone* ctz, + ParquetColumnReader(const RowRanges& row_ranges, size_t total_rows, const cctz::time_zone* ctz, io::IOContext* io_ctx) - : _row_ranges(row_ranges), _ctz(ctz), _io_ctx(io_ctx) {} + : _row_ranges(row_ranges), _total_rows(total_rows), _ctz(ctz), _io_ctx(io_ctx) {} virtual ~ParquetColumnReader() = default; virtual Status read_column_data(ColumnPtr& 
doris_column, const DataTypePtr& type, const std::shared_ptr& root_node, @@ -143,13 +165,13 @@ class ParquetColumnReader { const tparquet::RowGroup& row_group, const RowRanges& row_ranges, const cctz::time_zone* ctz, io::IOContext* io_ctx, std::unique_ptr& reader, size_t max_buf_size, - const tparquet::OffsetIndex* offset_index = nullptr, - const std::set& column_ids = {}, - const std::set& filter_column_ids = {}); - void set_nested_column() { _nested_column = true; } + std::unordered_map& col_offsets, + bool in_collection = false, const std::set& column_ids = {}, + const std::set& filter_column_ids = {}, + RuntimeState* state = nullptr, const std::string& created_by = ""); virtual const std::vector& get_rep_level() const = 0; virtual const std::vector& get_def_level() const = 0; - virtual Statistics statistics() = 0; + virtual ColumnStatistics column_statistics() = 0; virtual void close() = 0; virtual void reset_filter_map_index() = 0; @@ -160,9 +182,8 @@ class ParquetColumnReader { void _generate_read_ranges(RowRange page_row_range, RowRanges* result_ranges) const; FieldSchema* _field_schema = nullptr; - // When scalar column is the child of nested column, we should turn off the filtering by page index and lazy read. 
- bool _nested_column = false; const RowRanges& _row_ranges; + size_t _total_rows = 0; const cctz::time_zone* _ctz = nullptr; io::IOContext* _io_ctx = nullptr; int64_t _current_row_index = 0; @@ -172,17 +193,20 @@ class ParquetColumnReader { std::set _filter_column_ids; }; +template class ScalarColumnReader : public ParquetColumnReader { ENABLE_FACTORY_CREATOR(ScalarColumnReader) public: - ScalarColumnReader(const RowRanges& row_ranges, const tparquet::ColumnChunk& chunk_meta, + ScalarColumnReader(const RowRanges& row_ranges, size_t total_rows, + const tparquet::ColumnChunk& chunk_meta, const tparquet::OffsetIndex* offset_index, const cctz::time_zone* ctz, io::IOContext* io_ctx) - : ParquetColumnReader(row_ranges, ctz, io_ctx), + : ParquetColumnReader(row_ranges, total_rows, ctz, io_ctx), _chunk_meta(chunk_meta), _offset_index(offset_index) {} ~ScalarColumnReader() override { close(); } - Status init(io::FileReaderSPtr file, FieldSchema* field, size_t max_buf_size); + Status init(io::FileReaderSPtr file, FieldSchema* field, size_t max_buf_size, + RuntimeState* state, const std::string& created_by = ""); Status read_column_data(ColumnPtr& doris_column, const DataTypePtr& type, const std::shared_ptr& root_node, FilterMap& filter_map, size_t batch_size, size_t* read_rows, bool* eof, @@ -191,9 +215,8 @@ class ScalarColumnReader : public ParquetColumnReader { MutableColumnPtr convert_dict_column_to_string_column(const ColumnInt32* dict_column) override; const std::vector& get_rep_level() const override { return _rep_levels; } const std::vector& get_def_level() const override { return _def_levels; } - Statistics statistics() override { - return Statistics(_stream_reader->statistics(), _chunk_reader->statistics(), - _decode_null_map_time); + ColumnStatistics column_statistics() override { + return ColumnStatistics(_chunk_reader->chunk_statistics(), _decode_null_map_time); } void close() override {} @@ -204,11 +227,78 @@ class ScalarColumnReader : public 
ParquetColumnReader { private: tparquet::ColumnChunk _chunk_meta; - const tparquet::OffsetIndex* _offset_index; + const tparquet::OffsetIndex* _offset_index = nullptr; std::unique_ptr _stream_reader; - std::unique_ptr _chunk_reader; + std::unique_ptr> _chunk_reader; + // rep def levels buffer. std::vector _rep_levels; std::vector _def_levels; + + size_t _current_range_idx = 0; + + Status gen_nested_null_map(size_t level_start_idx, size_t level_end_idx, + std::vector& null_map, + std::unordered_set& ancestor_null_indices) { + size_t has_read = level_start_idx; + null_map.emplace_back(0); + bool prev_is_null = false; + + while (has_read < level_end_idx) { + level_t def_level = _def_levels[has_read++]; + size_t loop_read = 1; + while (has_read < _def_levels.size() && _def_levels[has_read] == def_level) { + has_read++; + loop_read++; + } + + if (def_level < _field_schema->repeated_parent_def_level) { + for (size_t i = 0; i < loop_read; i++) { + ancestor_null_indices.insert(has_read - level_start_idx - loop_read + i); + } + continue; + } + + bool is_null = def_level < _field_schema->definition_level; + + if (prev_is_null == is_null && (USHRT_MAX - null_map.back() >= loop_read)) { + null_map.back() += loop_read; + } else { + if (!(prev_is_null ^ is_null)) { + null_map.emplace_back(0); + } + size_t remaining = loop_read; + while (remaining > USHRT_MAX) { + null_map.emplace_back(USHRT_MAX); + null_map.emplace_back(0); + remaining -= USHRT_MAX; + } + null_map.emplace_back((u_short)remaining); + prev_is_null = is_null; + } + } + return Status::OK(); + } + + Status gen_filter_map(FilterMap& filter_map, size_t filter_loc, size_t level_start_idx, + size_t level_end_idx, std::vector& nested_filter_map_data, + std::unique_ptr* nested_filter_map) { + nested_filter_map_data.resize(level_end_idx - level_start_idx); + for (size_t idx = level_start_idx; idx < level_end_idx; idx++) { + if (idx != level_start_idx && _rep_levels[idx] == 0) { + filter_loc++; + } + 
nested_filter_map_data[idx - level_start_idx] = + filter_map.filter_map_data()[filter_loc]; + } + + auto new_filter = std::make_unique(); + RETURN_IF_ERROR(new_filter->init(nested_filter_map_data.data(), + nested_filter_map_data.size(), false)); + *nested_filter_map = std::move(new_filter); + + return Status::OK(); + } + std::unique_ptr _converter = nullptr; std::unique_ptr> _nested_filter_map_data = nullptr; size_t _orig_filter_map_index = 0; @@ -217,17 +307,17 @@ class ScalarColumnReader : public ParquetColumnReader { Status _read_values(size_t num_values, ColumnPtr& doris_column, DataTypePtr& type, FilterMap& filter_map, bool is_dict_filter); Status _read_nested_column(ColumnPtr& doris_column, DataTypePtr& type, FilterMap& filter_map, - size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter, - bool align_rows); + size_t batch_size, size_t* read_rows, bool* eof, + bool is_dict_filter); Status _try_load_dict_page(bool* loaded, bool* has_dict); }; class ArrayColumnReader : public ParquetColumnReader { ENABLE_FACTORY_CREATOR(ArrayColumnReader) public: - ArrayColumnReader(const RowRanges& row_ranges, const cctz::time_zone* ctz, + ArrayColumnReader(const RowRanges& row_ranges, size_t total_rows, const cctz::time_zone* ctz, io::IOContext* io_ctx) - : ParquetColumnReader(row_ranges, ctz, io_ctx) {} + : ParquetColumnReader(row_ranges, total_rows, ctz, io_ctx) {} ~ArrayColumnReader() override { close(); } Status init(std::unique_ptr element_reader, FieldSchema* field); Status read_column_data(ColumnPtr& doris_column, const DataTypePtr& type, @@ -240,7 +330,7 @@ class ArrayColumnReader : public ParquetColumnReader { const std::vector& get_def_level() const override { return _element_reader->get_def_level(); } - Statistics statistics() override { return _element_reader->statistics(); } + ColumnStatistics column_statistics() override { return _element_reader->column_statistics(); } void close() override {} void reset_filter_map_index() override { 
_element_reader->reset_filter_map_index(); } @@ -252,8 +342,9 @@ class ArrayColumnReader : public ParquetColumnReader { class MapColumnReader : public ParquetColumnReader { ENABLE_FACTORY_CREATOR(MapColumnReader) public: - MapColumnReader(const RowRanges& row_ranges, const cctz::time_zone* ctz, io::IOContext* io_ctx) - : ParquetColumnReader(row_ranges, ctz, io_ctx) {} + MapColumnReader(const RowRanges& row_ranges, size_t total_rows, const cctz::time_zone* ctz, + io::IOContext* io_ctx) + : ParquetColumnReader(row_ranges, total_rows, ctz, io_ctx) {} ~MapColumnReader() override { close(); } Status init(std::unique_ptr key_reader, @@ -270,9 +361,9 @@ class MapColumnReader : public ParquetColumnReader { return _key_reader->get_def_level(); } - Statistics statistics() override { - Statistics kst = _key_reader->statistics(); - Statistics vst = _value_reader->statistics(); + ColumnStatistics column_statistics() override { + ColumnStatistics kst = _key_reader->column_statistics(); + ColumnStatistics vst = _value_reader->column_statistics(); kst.merge(vst); return kst; } @@ -292,9 +383,9 @@ class MapColumnReader : public ParquetColumnReader { class StructColumnReader : public ParquetColumnReader { ENABLE_FACTORY_CREATOR(StructColumnReader) public: - StructColumnReader(const RowRanges& row_ranges, const cctz::time_zone* ctz, + StructColumnReader(const RowRanges& row_ranges, size_t total_rows, const cctz::time_zone* ctz, io::IOContext* io_ctx) - : ParquetColumnReader(row_ranges, ctz, io_ctx) {} + : ParquetColumnReader(row_ranges, total_rows, ctz, io_ctx) {} ~StructColumnReader() override { close(); } Status init( @@ -327,12 +418,12 @@ class StructColumnReader : public ParquetColumnReader { return _child_readers.begin()->second->get_def_level(); } - Statistics statistics() override { - Statistics st; + ColumnStatistics column_statistics() override { + ColumnStatistics st; for (const auto& column_name : _read_column_names) { auto reader = _child_readers.find(column_name); if 
(reader != _child_readers.end()) { - Statistics cst = reader->second->statistics(); + ColumnStatistics cst = reader->second->column_statistics(); st.merge(cst); } } @@ -357,9 +448,9 @@ class StructColumnReader : public ParquetColumnReader { // This is used when a column is not needed but its structure is required (e.g., for map keys) class SkipReadingReader : public ParquetColumnReader { public: - SkipReadingReader(const RowRanges& row_ranges, const cctz::time_zone* ctz, + SkipReadingReader(const RowRanges& row_ranges, size_t total_rows, const cctz::time_zone* ctz, io::IOContext* io_ctx, FieldSchema* field_schema) - : ParquetColumnReader(row_ranges, ctz, io_ctx) { + : ParquetColumnReader(row_ranges, total_rows, ctz, io_ctx) { _field_schema = field_schema; // Use inherited member from base class VLOG_DEBUG << "[ParquetReader] Created SkipReadingReader for field: " << _field_schema->name; @@ -399,10 +490,11 @@ class SkipReadingReader : public ParquetColumnReader { } static std::unique_ptr create_unique(const RowRanges& row_ranges, - cctz::time_zone* ctz, + size_t total_rows, cctz::time_zone* ctz, io::IOContext* io_ctx, FieldSchema* field_schema) { - return std::make_unique(row_ranges, ctz, io_ctx, field_schema); + return std::make_unique(row_ranges, total_rows, ctz, io_ctx, + field_schema); } // These methods should not be called for SkipReadingReader @@ -424,8 +516,8 @@ class SkipReadingReader : public ParquetColumnReader { } // Implement required pure virtual methods from base class - Statistics statistics() override { - return Statistics(); // Return empty statistics + ColumnStatistics column_statistics() override { + return ColumnStatistics(); // Return empty statistics } void close() override { diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 58f1c337555452..d57264c1987427 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ 
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -85,7 +85,8 @@ RowGroupReader::RowGroupReader(io::FileReaderSPtr file_reader, const PositionDeleteContext& position_delete_ctx, const LazyReadContext& lazy_read_ctx, RuntimeState* state, const std::set& column_ids, - const std::set& filter_column_ids) + const std::set& filter_column_ids, + const std::string& created_by) : _file_reader(file_reader), _read_table_columns(read_columns), _row_group_id(row_group_id), @@ -98,7 +99,8 @@ RowGroupReader::RowGroupReader(io::FileReaderSPtr file_reader, _state(state), _obj_pool(new ObjectPool()), _column_ids(column_ids), - _filter_column_ids(filter_column_ids) {} + _filter_column_ids(filter_column_ids), + _created_by(created_by) {} RowGroupReader::~RowGroupReader() { _column_readers.clear(); @@ -129,17 +131,12 @@ Status RowGroupReader::init( std::min(MAX_COLUMN_BUF_SIZE, MAX_GROUP_BUF_SIZE / _read_table_columns.size()); for (const auto& read_table_col : _read_table_columns) { auto read_file_col = _table_info_node_ptr->children_file_column_name(read_table_col); - auto* field = schema.get_column(read_file_col); - auto physical_index = field->physical_column_index; std::unique_ptr reader; - // TODO : support rested column types - const tparquet::OffsetIndex* offset_index = - col_offsets.find(physical_index) != col_offsets.end() ? 
&col_offsets[physical_index] - : nullptr; - RETURN_IF_ERROR(ParquetColumnReader::create( - _file_reader, field, _row_group_meta, _read_ranges, _ctz, _io_ctx, reader, - max_buf_size, offset_index, _column_ids, _filter_column_ids)); + RETURN_IF_ERROR(ParquetColumnReader::create(_file_reader, field, _row_group_meta, + _read_ranges, _ctz, _io_ctx, reader, + max_buf_size, col_offsets, false, _column_ids, + _filter_column_ids, _state, _created_by)); if (reader == nullptr) { VLOG_DEBUG << "Init row group(" << _row_group_id << ") reader failed"; return Status::Corruption("Init row group reader failed"); @@ -1134,10 +1131,10 @@ void RowGroupReader::_convert_dict_cols_to_string_cols(Block* block) { } } -ParquetColumnReader::Statistics RowGroupReader::statistics() { - ParquetColumnReader::Statistics st; +ParquetColumnReader::ColumnStatistics RowGroupReader::merged_column_statistics() { + ParquetColumnReader::ColumnStatistics st; for (auto& reader : _column_readers) { - auto ost = reader.second->statistics(); + auto ost = reader.second->column_statistics(); st.merge(ost); } return st; diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.h b/be/src/vec/exec/format/parquet/vparquet_group_reader.h index f81d660734931c..f504292c035975 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.h @@ -159,7 +159,7 @@ class RowGroupReader : public ProfileCollector { const PositionDeleteContext& position_delete_ctx, const LazyReadContext& lazy_read_ctx, RuntimeState* state, const std::set& column_ids, - const std::set& filter_column_ids); + const std::set& filter_column_ids, const std::string& created_by = ""); ~RowGroupReader(); Status init(const FieldDescriptor& schema, RowRanges& row_ranges, @@ -173,7 +173,7 @@ class RowGroupReader : public ProfileCollector { int64_t predicate_filter_time() const { return _predicate_filter_time; } int64_t dict_filter_rewrite_time() const { return 
_dict_filter_rewrite_time; } - ParquetColumnReader::Statistics statistics(); + ParquetColumnReader::ColumnStatistics merged_column_statistics(); void set_remaining_rows(int64_t rows) { _remaining_rows = rows; } int64_t get_remaining_rows() { return _remaining_rows; } @@ -261,6 +261,7 @@ class RowGroupReader : public ProfileCollector { std::shared_ptr _obj_pool; const std::set& _column_ids; const std::set& _filter_column_ids; + std::string _created_by; // Parquet file's created_by field bool _is_row_group_filtered = false; RowGroupIndex _current_row_group_idx {0, 0, 0}; diff --git a/be/src/vec/exec/format/parquet/vparquet_page_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_page_reader.cpp index 7d221e7d970f48..38478ab2863f52 100644 --- a/be/src/vec/exec/format/parquet/vparquet_page_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_page_reader.cpp @@ -26,6 +26,8 @@ #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "io/fs/buffered_reader.h" +#include "olap/page_cache.h" +#include "parquet_common.h" #include "util/runtime_profile.h" #include "util/slice.h" #include "util/thrift_util.h" @@ -40,23 +42,72 @@ namespace doris::vectorized { #include "common/compile_check_begin.h" static constexpr size_t INIT_PAGE_HEADER_SIZE = 128; -std::unique_ptr create_page_reader(io::BufferedStreamReader* reader, - io::IOContext* io_ctx, uint64_t offset, - uint64_t length, int64_t num_values, - const tparquet::OffsetIndex* offset_index) { - if (offset_index) { - return std::make_unique(reader, io_ctx, offset, length, - num_values, offset_index); - } else { - return std::make_unique(reader, io_ctx, offset, length); +// // Check if the file was created by a version that always marks pages as compressed +// // regardless of the actual compression state (parquet-cpp < 2.0.0) +// static bool _is_always_compressed(const ParquetPageReadContext& ctx) { +// if (ctx.created_by.empty()) { +// return false; +// } + +// // Parse the version string 
+// std::unique_ptr parsed_version; +// Status status = VersionParser::parse(ctx.created_by, &parsed_version); +// if (!status.ok()) { +// return false; +// } + +// // Check if it's parquet-cpp +// if (parsed_version->application() != "parquet-cpp") { +// return false; +// } + +// // Check if version < 2.0.0 +// if (!parsed_version->version().has_value()) { +// return false; +// } + +// std::unique_ptr semantic_version; +// status = SemanticVersion::parse(parsed_version->version().value(), &semantic_version); +// if (!status.ok()) { +// return false; +// } + +// // parquet-cpp versions < 2.0.0 always report compressed +// static const SemanticVersion PARQUET_CPP_FIXED_VERSION(2, 0, 0); +// return semantic_version->compare_to(PARQUET_CPP_FIXED_VERSION) < 0; +// } + +template +PageReader::PageReader(io::BufferedStreamReader* reader, + io::IOContext* io_ctx, uint64_t offset, + uint64_t length, size_t total_rows, + const tparquet::ColumnMetaData& metadata, + const tparquet::OffsetIndex* offset_index, + const ParquetPageReadContext& ctx) + : _reader(reader), + _io_ctx(io_ctx), + _offset(offset), + _start_offset(offset), + _end_offset(offset + length), + _total_rows(total_rows), + _metadata(metadata), + _offset_index(offset_index), + _ctx(ctx) { + _next_header_offset = _offset; + _state = INITIALIZED; + + if constexpr (OFFSET_INDEX) { + _end_row = _offset_index->page_locations.size() >= 2 + ? 
_offset_index->page_locations[1].first_row_index + : _total_rows; } } -PageReader::PageReader(io::BufferedStreamReader* reader, io::IOContext* io_ctx, uint64_t offset, - uint64_t length) - : _reader(reader), _io_ctx(io_ctx), _start_offset(offset), _end_offset(offset + length) {} - -Status PageReader::_parse_page_header() { +template +Status PageReader::parse_page_header() { + if (_state == HEADER_PARSED) { + return Status::OK(); + } if (UNLIKELY(_offset < _start_offset || _offset >= _end_offset)) { return Status::IOError("Out-of-bounds Access"); } @@ -67,22 +118,115 @@ Status PageReader::_parse_page_header() { return Status::IOError("Should skip or load current page to get next page"); } + _page_statistics.page_read_counter += 1; + + // Parse page header from file; header bytes are saved for possible cache insertion const uint8_t* page_header_buf = nullptr; size_t max_size = _end_offset - _offset; size_t header_size = std::min(INIT_PAGE_HEADER_SIZE, max_size); const size_t MAX_PAGE_HEADER_SIZE = config::parquet_header_max_size_mb << 20; uint32_t real_header_size = 0; + + // Try a header-only lookup in the page cache. Cached pages store + // header + optional v2 levels + uncompressed payload, so we can + // parse the page header directly from the cached bytes and avoid + // a file read for the header. 
+ if (_ctx.enable_parquet_file_page_cache && !config::disable_storage_page_cache && + StoragePageCache::instance() != nullptr) { + PageCacheHandle handle; + StoragePageCache::CacheKey key(fmt::format("{}::{}", _reader->path(), _reader->mtime()), + _end_offset, _offset); + if (StoragePageCache::instance()->lookup(key, &handle, segment_v2::DATA_PAGE)) { + // Parse header directly from cached data + _page_cache_handle = std::move(handle); + Slice s = _page_cache_handle.data(); + real_header_size = cast_set(s.size); + SCOPED_RAW_TIMER(&_page_statistics.decode_header_time); + auto st = deserialize_thrift_msg(reinterpret_cast(s.data), + &real_header_size, true, &_cur_page_header); + if (!st.ok()) return st; + // Increment page cache counters for a true cache hit on header+payload + _page_statistics.page_cache_hit_counter += 1; + // Detect whether the cached payload is compressed or decompressed and record + // the appropriate counter. Cached layout is: header | optional levels | payload + + // Determine if payload is compressed using V2 is_compressed field if available + // bool payload_is_compressed = true; + // if (_cur_page_header.type == tparquet::PageType::DATA_PAGE_V2) { + // const auto& page_header_v2 = _cur_page_header.data_page_header_v2; + // if (page_header_v2.__isset.is_compressed) { + // payload_is_compressed = page_header_v2.is_compressed; + // } + // } + + // // ARROW-17100: [C++][Parquet] Fix backwards compatibility for ParquetV2 data pages written prior to 3.0.0 per ARROW-10353 #13665 + // // https://github.com/apache/arrow/pull/13665/files + // // Prior to parquet-cpp version 2.0.0, is_compressed was always set to false in column headers, + // // even if compression was used. See ARROW-17100. 
+ // bool always_compressed = _is_always_compressed(_ctx); + // payload_is_compressed |= always_compressed; + + // // Apply codec check: if codec is UNCOMPRESSED, payload cannot be compressed + // payload_is_compressed = payload_is_compressed && + // (_metadata.codec != tparquet::CompressionCodec::UNCOMPRESSED); + + // // Save the computed result for use by ColumnChunkReader + // _payload_is_compressed = payload_is_compressed; + + bool is_cache_payload_decompressed = true; + if (_cur_page_header.compressed_page_size > 0) { + is_cache_payload_decompressed = + (_metadata.codec == tparquet::CompressionCodec::UNCOMPRESSED) || + (static_cast(_cur_page_header.uncompressed_page_size) <= + static_cast(config::parquet_page_cache_decompress_threshold) * + static_cast(_cur_page_header.compressed_page_size)); + } + + if (is_cache_payload_decompressed) { + _page_statistics.page_cache_decompressed_hit_counter += 1; + } else { + _page_statistics.page_cache_compressed_hit_counter += 1; + } + + _is_cache_payload_decompressed = is_cache_payload_decompressed; + + if constexpr (OFFSET_INDEX == false) { + if (is_header_v2()) { + _end_row = _start_row + _cur_page_header.data_page_header_v2.num_rows; + } else if constexpr (!IN_COLLECTION) { + _end_row = _start_row + _cur_page_header.data_page_header.num_values; + } + } + + // Save header bytes for later use (e.g., to insert updated cache entries) + _header_buf.assign(s.data, s.data + real_header_size); + _last_header_size = real_header_size; + _page_statistics.parse_page_header_num++; + _offset += real_header_size; + _next_header_offset = _offset + _cur_page_header.compressed_page_size; + _state = HEADER_PARSED; + return Status::OK(); + } else { + _page_statistics.page_cache_missing_counter += 1; + // Clear any existing cache handle on miss to avoid holding stale handle + _page_cache_handle = PageCacheHandle(); + } + } + // NOTE: page cache lookup for *decompressed* page data is handled in + // ColumnChunkReader::load_page_data(). 
PageReader should only be + // responsible for parsing the header bytes from the file and saving + // them in `_header_buf` for possible later insertion into the cache. while (true) { if (UNLIKELY(_io_ctx && _io_ctx->should_stop)) { return Status::EndOfFile("stop"); } header_size = std::min(header_size, max_size); { - SCOPED_RAW_TIMER(&_statistics.read_page_header_time); + SCOPED_RAW_TIMER(&_page_statistics.read_page_header_time); RETURN_IF_ERROR(_reader->read_bytes(&page_header_buf, _offset, header_size, _io_ctx)); } real_header_size = cast_set(header_size); - SCOPED_RAW_TIMER(&_statistics.decode_header_time); + SCOPED_RAW_TIMER(&_page_statistics.decode_header_time); auto st = deserialize_thrift_msg(page_header_buf, &real_header_size, true, &_cur_page_header); if (st.ok()) { @@ -97,23 +241,26 @@ Status PageReader::_parse_page_header() { header_size <<= 2; } - _statistics.parse_page_header_num++; + if constexpr (OFFSET_INDEX == false) { + if (is_header_v2()) { + _end_row = _start_row + _cur_page_header.data_page_header_v2.num_rows; + } else if constexpr (!IN_COLLECTION) { + _end_row = _start_row + _cur_page_header.data_page_header.num_values; + } + } + + // Save header bytes for possible cache insertion later + _header_buf.assign(page_header_buf, page_header_buf + real_header_size); + _last_header_size = real_header_size; + _page_statistics.parse_page_header_num++; _offset += real_header_size; _next_header_offset = _offset + _cur_page_header.compressed_page_size; _state = HEADER_PARSED; return Status::OK(); } -Status PageReader::skip_page() { - if (UNLIKELY(_state != HEADER_PARSED)) { - return Status::IOError("Should generate page header first to skip current page"); - } - _offset = _next_header_offset; - _state = INITIALIZED; - return Status::OK(); -} - -Status PageReader::get_page_data(Slice& slice) { +template +Status PageReader::get_page_data(Slice& slice) { if (UNLIKELY(_state != HEADER_PARSED)) { return Status::IOError("Should generate page header first to 
load current page data"); } @@ -123,9 +270,14 @@ Status PageReader::get_page_data(Slice& slice) { slice.size = _cur_page_header.compressed_page_size; RETURN_IF_ERROR(_reader->read_bytes(slice, _offset, _io_ctx)); _offset += slice.size; - _state = INITIALIZED; + _state = DATA_LOADED; return Status::OK(); } + +template class PageReader; +template class PageReader; +template class PageReader; +template class PageReader; #include "common/compile_check_end.h" } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/vparquet_page_reader.h b/be/src/vec/exec/format/parquet/vparquet_page_reader.h index 1782716364151b..a722507c7423a3 100644 --- a/be/src/vec/exec/format/parquet/vparquet_page_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_page_reader.h @@ -23,7 +23,20 @@ #include #include "common/cast_set.h" +#include "common/config.h" #include "common/status.h" +#include "olap/page_cache.h" +#include "util/block_compression.h" +#include "vec/exec/format/parquet/parquet_common.h" +namespace doris { +class BlockCompressionCodec; + +namespace io { +class BufferedStreamReader; +struct IOContext; +} // namespace io + +} // namespace doris namespace doris { namespace io { @@ -38,156 +51,202 @@ namespace doris::vectorized { /** * Use to deserialize parquet page header, and get the page data in iterator interface. */ + +// Session-level options for parquet page reading/caching. 
+struct ParquetPageReadContext { + bool enable_parquet_file_page_cache = true; + std::string created_by; // Parquet file's created_by field for version checking + ParquetPageReadContext() = default; + ParquetPageReadContext(bool enable, const std::string& created_by_str = "") + : enable_parquet_file_page_cache(enable), created_by(created_by_str) {} +}; + +template class PageReader { public: - struct Statistics { + struct PageStatistics { int64_t decode_header_time = 0; int64_t skip_page_header_num = 0; int64_t parse_page_header_num = 0; int64_t read_page_header_time = 0; + // page cache metrics + // page_cache_hit_counter: number of times a cached page was used + int64_t page_cache_hit_counter = 0; + // page_cache_missing_counter: number of times a cached page was not found + int64_t page_cache_missing_counter = 0; + // page_cache_compressed_hit_counter: number of cache hits where the cached payload is compressed + int64_t page_cache_compressed_hit_counter = 0; + // page_cache_decompressed_hit_counter: number of cache hits where the cached payload is decompressed + int64_t page_cache_decompressed_hit_counter = 0; + // page_cache_write_counter: number of times this reader wrote an entry into the cache + int64_t page_cache_write_counter = 0; + // page_cache_compressed_write_counter: number of times this reader wrote compressed page into the cache + int64_t page_cache_compressed_write_counter = 0; + // page_cache_decompressed_write_counter: number of times this reader wrote decompressed page into the cache + int64_t page_cache_decompressed_write_counter = 0; + // page_read_counter: total pages read by this PageReader (includes cache hits and file reads) + int64_t page_read_counter = 0; }; PageReader(io::BufferedStreamReader* reader, io::IOContext* io_ctx, uint64_t offset, - uint64_t length); - virtual ~PageReader() = default; - - // Deprecated - // Parquet file may not be standardized, - // _end_offset may exceed the actual data area. 
- // ColumnChunkReader::has_next_page() use the number of parsed values for judgment - // [[deprecated]] - bool has_next_page() const { return _offset < _end_offset; } - - virtual Status next_page_header() { return _parse_page_header(); } - - virtual Status get_page_header(const tparquet::PageHeader*& page_header) { - if (UNLIKELY(_state != HEADER_PARSED)) { - return Status::InternalError("Page header not parsed"); - } - page_header = &_cur_page_header; - return Status::OK(); - } - - virtual Status get_num_values(uint32_t& num_values) { - if (_state != HEADER_PARSED) { - return Status::InternalError("Page header not parsed"); - } - if (_cur_page_header.type == tparquet::PageType::DATA_PAGE_V2) { - num_values = _cur_page_header.data_page_header_v2.num_values; + uint64_t length, size_t total_rows, const tparquet::ColumnMetaData& metadata, + const tparquet::OffsetIndex* offset_index = nullptr, + const ParquetPageReadContext& ctx = ParquetPageReadContext()); + ~PageReader() = default; + + bool has_next_page() const { + if constexpr (OFFSET_INDEX) { + return _page_index + 1 != _offset_index->page_locations.size(); } else { - num_values = _cur_page_header.data_page_header.num_values; + // Deprecated + // Parquet file may not be standardized, + // _end_offset may exceed the actual data area. 
+ // ColumnChunkReader::has_next_page() use the number of parsed values for judgment + // ref:https://github.com/duckdb/duckdb/issues/10829 + // [[deprecated]] + LOG(FATAL) << "has_next_page should not be called when no offset index"; + return _offset < _end_offset; } - return Status::OK(); - } - - virtual Status skip_page(); - - virtual Status get_page_data(Slice& slice); - - Statistics& statistics() { return _statistics; } - - void seek_to_page(int64_t page_header_offset) { - _offset = page_header_offset; - _next_header_offset = page_header_offset; - _state = INITIALIZED; } -protected: - enum PageReaderState { INITIALIZED, HEADER_PARSED }; - PageReaderState _state = INITIALIZED; - tparquet::PageHeader _cur_page_header; - Statistics _statistics; - - Status _parse_page_header(); + Status parse_page_header(); -private: - io::BufferedStreamReader* _reader = nullptr; - io::IOContext* _io_ctx = nullptr; - uint64_t _offset = 0; - uint64_t _next_header_offset = 0; - uint64_t _start_offset = 0; - uint64_t _end_offset = 0; -}; + Status next_page() { + _page_statistics.skip_page_header_num += _state == INITIALIZED; + if constexpr (OFFSET_INDEX) { + _page_index++; + _start_row = _offset_index->page_locations[_page_index].first_row_index; + if (_page_index + 1 < _offset_index->page_locations.size()) { + _end_row = _offset_index->page_locations[_page_index + 1].first_row_index; + } else { + _end_row = _total_rows; + } + int64_t next_page_offset = _offset_index->page_locations[_page_index].offset; + _offset = next_page_offset; + _next_header_offset = next_page_offset; + _state = INITIALIZED; + } else { + if (UNLIKELY(_offset == _start_offset)) { + return Status::Corruption("should parse first page."); + } + + if (is_header_v2()) { + _start_row += _cur_page_header.data_page_header_v2.num_rows; + } else if constexpr (!IN_COLLECTION) { + _start_row += _cur_page_header.data_page_header.num_values; + } + + _offset = _next_header_offset; + _state = INITIALIZED; + } -class 
PageReaderWithOffsetIndex : public PageReader { -public: - PageReaderWithOffsetIndex(io::BufferedStreamReader* reader, io::IOContext* io_ctx, - uint64_t offset, uint64_t length, int64_t num_values, - const tparquet::OffsetIndex* offset_index) - : PageReader(reader, io_ctx, offset, length), - _num_values(num_values), - _offset_index(offset_index) {} - - Status next_page_header() override { - // lazy to parse page header in get_page_header return Status::OK(); } - Status get_page_header(const tparquet::PageHeader*& page_header) override { - if (_state != HEADER_PARSED) { - RETURN_IF_ERROR(_parse_page_header()); + Status dict_next_page() { + if constexpr (OFFSET_INDEX) { + _state = INITIALIZED; + return Status::OK(); + } else { + return next_page(); } - page_header = &_cur_page_header; - return Status::OK(); } - Status get_num_values(uint32_t& num_values) override { - if (UNLIKELY(_page_index >= _offset_index->page_locations.size())) { - return Status::IOError("End of page"); - } - - if (_page_index < _offset_index->page_locations.size() - 1) { - num_values = cast_set( - _offset_index->page_locations[_page_index + 1].first_row_index - - _offset_index->page_locations[_page_index].first_row_index); - } else { - num_values = cast_set( - _num_values - _offset_index->page_locations[_page_index].first_row_index); + Status get_page_header(const tparquet::PageHeader** page_header) { + if (UNLIKELY(_state != HEADER_PARSED)) { + return Status::InternalError("Page header not parsed"); } + *page_header = &_cur_page_header; return Status::OK(); } - Status skip_page() override { - if (UNLIKELY(_page_index >= _offset_index->page_locations.size())) { - return Status::IOError("End of page"); - } + Status get_page_data(Slice& slice); - if (_state != HEADER_PARSED) { - _statistics.skip_page_header_num++; + // Skip page data and update offset (used when data is loaded from cache) + void skip_page_data() { + if (_state == HEADER_PARSED) { + _offset += 
_cur_page_header.compressed_page_size; + _state = DATA_LOADED; } + } - seek_to_page(_offset_index->page_locations[_page_index].offset + - _offset_index->page_locations[_page_index].compressed_page_size); - _page_index++; - return Status::OK(); + // Expose header bytes info for cache insertion + uint32_t last_header_size() const { return _last_header_size; } + const std::vector& header_bytes() const { return _header_buf; } + // header start offset for current page + int64_t header_start_offset() const { + return static_cast(_next_header_offset) - static_cast(_last_header_size) - + static_cast(_cur_page_header.compressed_page_size); + } + uint64_t file_end_offset() const { return _end_offset; } + bool cached_decompressed() const { + return static_cast(_cur_page_header.uncompressed_page_size) <= + static_cast(config::parquet_page_cache_decompress_threshold) * + static_cast(_cur_page_header.compressed_page_size); } - Status get_page_data(Slice& slice) override { - if (_page_index >= _offset_index->page_locations.size()) { - return Status::IOError("End of page"); - } - if (_state != HEADER_PARSED) { - RETURN_IF_ERROR(_parse_page_header()); - } + PageStatistics& page_statistics() { return _page_statistics; } - // dirctionary page is not in page location - if (LIKELY(_cur_page_header.type != tparquet::PageType::DICTIONARY_PAGE)) { - _page_index++; - } + bool is_header_v2() { return _cur_page_header.__isset.data_page_header_v2; } - return PageReader::get_page_data(slice); - } + // Returns whether the current page's cache payload is decompressed + bool is_cache_payload_decompressed() const { return _is_cache_payload_decompressed; } + + size_t start_row() const { return _start_row; } + + size_t end_row() const { return _end_row; } + + // Accessors for cache handle + bool has_page_cache_handle() const { return _page_cache_handle.cache() != nullptr; } + const doris::PageCacheHandle& page_cache_handle() const { return _page_cache_handle; } + + // Page cache members + 
doris::PageCacheHandle _page_cache_handle; + // stored header bytes when cache miss so we can insert header+payload into cache + std::vector _header_buf; + // last parsed header size in bytes + uint32_t _last_header_size = 0; private: + enum PageReaderState { INITIALIZED, HEADER_PARSED, DATA_LOADED }; + PageReaderState _state = INITIALIZED; + PageStatistics _page_statistics; + + io::BufferedStreamReader* _reader = nullptr; + io::IOContext* _io_ctx = nullptr; + // current reader offset in file location. + uint64_t _offset = 0; + // this page offset in file location. + uint64_t _start_offset = 0; + uint64_t _end_offset = 0; + uint64_t _next_header_offset = 0; + // current page row range + size_t _start_row = 0; + size_t _end_row = 0; + // total rows in this column chunk + size_t _total_rows = 0; + // Column metadata for this column chunk + const tparquet::ColumnMetaData& _metadata; + // for page index size_t _page_index = 0; - int64_t _num_values = 0; const tparquet::OffsetIndex* _offset_index; + + // Session-level parquet page cache options + ParquetPageReadContext _ctx; + + tparquet::PageHeader _cur_page_header; + bool _is_cache_payload_decompressed = true; }; -std::unique_ptr create_page_reader(io::BufferedStreamReader* reader, - io::IOContext* io_ctx, uint64_t offset, - uint64_t length, int64_t num_values = 0, - const tparquet::OffsetIndex* offset_index = nullptr); +template +std::unique_ptr> create_page_reader( + io::BufferedStreamReader* reader, io::IOContext* io_ctx, uint64_t offset, uint64_t length, + size_t total_rows, const tparquet::ColumnMetaData& metadata, + const tparquet::OffsetIndex* offset_index = nullptr, + const ParquetPageReadContext& ctx = ParquetPageReadContext()) { + return std::make_unique>( + reader, io_ctx, offset, length, total_rows, metadata, offset_index, ctx); +} #include "common/compile_check_end.h" } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp 
b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index d4688621ca19cf..62dde51f009746 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -140,9 +140,11 @@ void ParquetReader::_init_profile() { ADD_TIMER_WITH_LEVEL(_profile, parquet_profile, 1); _parquet_profile.filtered_row_groups = ADD_CHILD_COUNTER_WITH_LEVEL( - _profile, "FilteredGroups", TUnit::UNIT, parquet_profile, 1); + _profile, "RowGroupsFiltered", TUnit::UNIT, parquet_profile, 1); _parquet_profile.to_read_row_groups = ADD_CHILD_COUNTER_WITH_LEVEL( - _profile, "ReadGroups", TUnit::UNIT, parquet_profile, 1); + _profile, "RowGroupsReadNum", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.total_row_groups = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "RowGroupsTotalNum", TUnit::UNIT, parquet_profile, 1); _parquet_profile.filtered_group_rows = ADD_CHILD_COUNTER_WITH_LEVEL( _profile, "FilteredRowsByGroup", TUnit::UNIT, parquet_profile, 1); _parquet_profile.filtered_page_rows = ADD_CHILD_COUNTER_WITH_LEVEL( @@ -153,16 +155,14 @@ void ParquetReader::_init_profile() { _profile, "FilteredBytes", TUnit::BYTES, parquet_profile, 1); _parquet_profile.raw_rows_read = ADD_CHILD_COUNTER_WITH_LEVEL( _profile, "RawRowsRead", TUnit::UNIT, parquet_profile, 1); - _parquet_profile.to_read_bytes = ADD_CHILD_COUNTER_WITH_LEVEL( - _profile, "ReadBytes", TUnit::BYTES, parquet_profile, 1); _parquet_profile.column_read_time = ADD_CHILD_TIMER_WITH_LEVEL(_profile, "ColumnReadTime", parquet_profile, 1); _parquet_profile.parse_meta_time = ADD_CHILD_TIMER_WITH_LEVEL(_profile, "ParseMetaTime", parquet_profile, 1); _parquet_profile.parse_footer_time = ADD_CHILD_TIMER_WITH_LEVEL(_profile, "ParseFooterTime", parquet_profile, 1); - _parquet_profile.open_file_time = - ADD_CHILD_TIMER_WITH_LEVEL(_profile, "FileOpenTime", parquet_profile, 1); + _parquet_profile.file_reader_create_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "FileReaderCreateTime", parquet_profile, 
1); _parquet_profile.open_file_num = ADD_CHILD_COUNTER_WITH_LEVEL(_profile, "FileNum", TUnit::UNIT, parquet_profile, 1); _parquet_profile.page_index_read_calls = @@ -183,8 +183,24 @@ void ParquetReader::_init_profile() { ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DecompressTime", parquet_profile, 1); _parquet_profile.decompress_cnt = ADD_CHILD_COUNTER_WITH_LEVEL( _profile, "DecompressCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_read_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageReadCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_write_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheWriteCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_compressed_write_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheCompressedWriteCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_decompressed_write_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheDecompressedWriteCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_hit_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheHitCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_missing_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheMissingCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_compressed_hit_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheCompressedHitCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_decompressed_hit_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheDecompressedHitCount", TUnit::UNIT, parquet_profile, 1); _parquet_profile.decode_header_time = - ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DecodeHeaderTime", parquet_profile, 1); + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageHeaderDecodeTime", parquet_profile, 1); _parquet_profile.read_page_header_time = ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageHeaderReadTime", parquet_profile, 1); 
_parquet_profile.decode_value_time = @@ -222,8 +238,8 @@ Status ParquetReader::_open_file() { return Status::EndOfFile("stop"); } if (_file_reader == nullptr) { - SCOPED_RAW_TIMER(&_statistics.open_file_time); - ++_statistics.open_file_num; + SCOPED_RAW_TIMER(&_reader_statistics.file_reader_create_time); + ++_reader_statistics.open_file_num; _file_description.mtime = _scan_range.__isset.modification_time ? _scan_range.modification_time : 0; io::FileReaderOptions reader_options = @@ -237,7 +253,7 @@ Status ParquetReader::_open_file() { } if (_file_metadata == nullptr) { - SCOPED_RAW_TIMER(&_statistics.parse_footer_time); + SCOPED_RAW_TIMER(&_reader_statistics.parse_footer_time); if (_tracing_file_reader->size() <= sizeof(PARQUET_VERSION_NUMBER)) { // Some system may generate parquet file with only 4 bytes: PAR1 // Should consider it as empty file. @@ -251,9 +267,8 @@ Status ParquetReader::_open_file() { &meta_size, _io_ctx)); _file_metadata = _file_metadata_ptr.get(); - _column_statistics.read_bytes += meta_size; // parse magic number & parse meta data - _statistics.file_footer_read_calls += 1; + _reader_statistics.file_footer_read_calls += 1; } else { const auto& file_meta_cache_key = FileMetaCache::get_key(_tracing_file_reader, _file_description); @@ -264,10 +279,9 @@ Status ParquetReader::_open_file() { _meta_cache->insert(file_meta_cache_key, _file_metadata_ptr.release(), &_meta_cache_handle); _file_metadata = _meta_cache_handle.data(); - _column_statistics.read_bytes += meta_size; - _statistics.file_footer_read_calls += 1; + _reader_statistics.file_footer_read_calls += 1; } else { - _statistics.file_footer_hit_cache++; + _reader_statistics.file_footer_hit_cache++; } _file_metadata = _meta_cache_handle.data(); } @@ -276,9 +290,6 @@ Status ParquetReader::_open_file() { return Status::InternalError("failed to get file meta data: {}", _file_description.path); } - _column_statistics.read_bytes += meta_size; - // parse magic number & parse meta data - 
_column_statistics.read_calls += 1; } return Status::OK(); } @@ -337,7 +348,7 @@ Status ParquetReader::init_reader( return Status::InternalError("failed to init parquet reader, please open reader first"); } - SCOPED_RAW_TIMER(&_statistics.parse_meta_time); + SCOPED_RAW_TIMER(&_reader_statistics.parse_meta_time); _total_groups = _t_metadata->row_groups.size(); if (_total_groups == 0) { return Status::EndOfFile("init reader failed, empty parquet file: " + _scan_range.path); @@ -629,7 +640,7 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) return Status::OK(); } - SCOPED_RAW_TIMER(&_statistics.column_read_time); + SCOPED_RAW_TIMER(&_reader_statistics.column_read_time); Status batch_st = _current_group_reader->next_batch(block, _batch_size, read_rows, &_row_group_eof); if (batch_st.is()) { @@ -646,11 +657,13 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) } if (_row_group_eof) { - auto column_st = _current_group_reader->statistics(); + auto column_st = _current_group_reader->merged_column_statistics(); _column_statistics.merge(column_st); - _statistics.lazy_read_filtered_rows += _current_group_reader->lazy_read_filtered_rows(); - _statistics.predicate_filter_time += _current_group_reader->predicate_filter_time(); - _statistics.dict_filter_rewrite_time += _current_group_reader->dict_filter_rewrite_time(); + _reader_statistics.lazy_read_filtered_rows += + _current_group_reader->lazy_read_filtered_rows(); + _reader_statistics.predicate_filter_time += _current_group_reader->predicate_filter_time(); + _reader_statistics.dict_filter_rewrite_time += + _current_group_reader->dict_filter_rewrite_time(); if (_current_row_group_index.row_group_id + 1 == _total_groups) { *eof = true; } else { @@ -742,23 +755,22 @@ Status ParquetReader::_next_row_group_reader() { group_size += column_compressed_size(field); } - _statistics.read_rows += candidate_row_ranges.count(); + _reader_statistics.read_rows += 
candidate_row_ranges.count(); if (_io_ctx) { _io_ctx->file_reader_stats->read_rows += candidate_row_ranges.count(); } if (candidate_row_ranges.count() != 0) { // need read this row group. - _statistics.read_row_groups++; - _statistics.read_bytes += group_size; - - _statistics.filtered_page_rows += row_group.num_rows - candidate_row_ranges.count(); + _reader_statistics.read_row_groups++; + _reader_statistics.filtered_page_rows += + row_group.num_rows - candidate_row_ranges.count(); break; } else { // this row group be filtered. - _statistics.filtered_row_groups++; - _statistics.filtered_bytes += group_size; - _statistics.filtered_group_rows += row_group.num_rows; + _reader_statistics.filtered_row_groups++; + _reader_statistics.filtered_bytes += group_size; + _reader_statistics.filtered_group_rows += row_group.num_rows; } } @@ -793,12 +805,13 @@ Status ParquetReader::_next_row_group_reader() { _profile, _file_reader, io_ranges, merged_read_slice_size) : _file_reader; } - _current_group_reader.reset(new RowGroupReader( - _io_ctx ? std::make_shared(group_file_reader, - _io_ctx->file_reader_stats) - : group_file_reader, - _read_table_columns, _current_row_group_index.row_group_id, row_group, _ctz, _io_ctx, - position_delete_ctx, _lazy_read_ctx, _state, _column_ids, _filter_column_ids)); + _current_group_reader.reset( + new RowGroupReader(_io_ctx ? 
std::make_shared( + group_file_reader, _io_ctx->file_reader_stats) + : group_file_reader, + _read_table_columns, _current_row_group_index.row_group_id, + row_group, _ctz, _io_ctx, position_delete_ctx, _lazy_read_ctx, + _state, _column_ids, _filter_column_ids, _t_metadata->created_by)); _row_group_eof = false; _current_group_reader->set_current_row_group_idx(_current_row_group_index); @@ -889,38 +902,60 @@ Status ParquetReader::_process_page_index_filter( return Status::OK(); } + std::vector parquet_col_ids; + for (size_t idx = 0; idx < _read_table_columns.size(); idx++) { + const auto& read_table_col = _read_table_columns[idx]; + const auto& read_file_col = _read_file_columns[idx]; + if (!_colname_to_slot_id->contains(read_table_col)) { + continue; + } + auto* field = _file_metadata->schema().get_column(read_file_col); + + std::function f = [&](FieldSchema* field) { + if (!_column_ids.empty() && + _column_ids.find(field->get_column_id()) == _column_ids.end()) { + return; + } + + if (field->data_type->get_primitive_type() == TYPE_ARRAY) { + f(&field->children[0]); + } else if (field->data_type->get_primitive_type() == TYPE_MAP) { + f(&field->children[0]); + f(&field->children[1]); + } else if (field->data_type->get_primitive_type() == TYPE_STRUCT) { + for (int i = 0; i < field->children.size(); ++i) { + f(&field->children[i]); + } + } else { + int parquet_col_id = field->physical_column_index; + if (parquet_col_id >= 0) { + parquet_col_ids.push_back(parquet_col_id); + } + } + }; + + f(field); + } + auto parse_offset_index = [&]() -> Status { std::vector off_index_buff(page_index._offset_index_size); Slice res(off_index_buff.data(), page_index._offset_index_size); size_t bytes_read = 0; { - SCOPED_RAW_TIMER(&_statistics.read_page_index_time); + SCOPED_RAW_TIMER(&_reader_statistics.read_page_index_time); RETURN_IF_ERROR(_tracing_file_reader->read_at(page_index._offset_index_start, res, &bytes_read, _io_ctx)); } - _column_statistics.read_bytes += bytes_read; 
_column_statistics.page_index_read_calls++; _col_offsets.clear(); - for (size_t idx = 0; idx < _read_table_columns.size(); idx++) { - const auto& read_table_col = _read_table_columns[idx]; - const auto& read_file_col = _read_file_columns[idx]; - if (!_colname_to_slot_id->contains(read_table_col)) { - // equal delete may add column to read_table_col, but this column no slot_id. - continue; - } - int parquet_col_id = - _file_metadata->schema().get_column(read_file_col)->physical_column_index; - if (parquet_col_id < 0) { - // complex type, not support page index yet. - continue; - } + for (auto parquet_col_id : parquet_col_ids) { auto& chunk = row_group.columns[parquet_col_id]; if (chunk.offset_index_length == 0) [[unlikely]] { continue; } tparquet::OffsetIndex offset_index; - SCOPED_RAW_TIMER(&_statistics.parse_page_index_time); + SCOPED_RAW_TIMER(&_reader_statistics.parse_page_index_time); RETURN_IF_ERROR( page_index.parse_offset_index(chunk, off_index_buff.data(), &offset_index)); _col_offsets[parquet_col_id] = offset_index; @@ -932,7 +967,7 @@ Status ParquetReader::_process_page_index_filter( RETURN_IF_ERROR(parse_offset_index()); // Check if page index is needed for min-max filter. 
- if (!_enable_filter_by_min_max || _lazy_read_ctx.has_complex_type || push_down_pred.empty()) { + if (!_enable_filter_by_min_max || push_down_pred.empty()) { read_whole_row_group(); return Status::OK(); } @@ -942,14 +977,13 @@ Status ParquetReader::_process_page_index_filter( size_t bytes_read = 0; Slice result(col_index_buff.data(), page_index._column_index_size); { - SCOPED_RAW_TIMER(&_statistics.read_page_index_time); + SCOPED_RAW_TIMER(&_reader_statistics.read_page_index_time); RETURN_IF_ERROR(_tracing_file_reader->read_at(page_index._column_index_start, result, &bytes_read, _io_ctx)); } - _column_statistics.read_bytes += bytes_read; _column_statistics.page_index_read_calls++; - SCOPED_RAW_TIMER(&_statistics.page_index_filter_time); + SCOPED_RAW_TIMER(&_reader_statistics.page_index_filter_time); // Construct a cacheable page index structure to avoid repeatedly reading the page index of the same column. ParquetPredicate::CachedPageIndexStat cached_page_index; @@ -986,7 +1020,7 @@ Status ParquetReader::_process_page_index_filter( tparquet::ColumnIndex column_index; { - SCOPED_RAW_TIMER(&_statistics.parse_page_index_time); + SCOPED_RAW_TIMER(&_reader_statistics.parse_page_index_time); RETURN_IF_ERROR(page_index.parse_column_index(column_chunk, col_index_buff.data(), &column_index)); } @@ -1049,7 +1083,7 @@ Status ParquetReader::_process_min_max_bloom_filter( const RowGroupReader::RowGroupIndex& row_group_index, const tparquet::RowGroup& row_group, const std::vector>& push_down_pred, RowRanges* row_ranges) { - SCOPED_RAW_TIMER(&_statistics.row_group_filter_time); + SCOPED_RAW_TIMER(&_reader_statistics.row_group_filter_time); if (!_filter_groups) { // No row group filtering is needed; // for example, Iceberg reads position delete files. 
@@ -1154,35 +1188,59 @@ void ParquetReader::_collect_profile() { if (_current_group_reader != nullptr) { _current_group_reader->collect_profile_before_close(); } - COUNTER_UPDATE(_parquet_profile.filtered_row_groups, _statistics.filtered_row_groups); - COUNTER_UPDATE(_parquet_profile.to_read_row_groups, _statistics.read_row_groups); - COUNTER_UPDATE(_parquet_profile.filtered_group_rows, _statistics.filtered_group_rows); - COUNTER_UPDATE(_parquet_profile.filtered_page_rows, _statistics.filtered_page_rows); - COUNTER_UPDATE(_parquet_profile.lazy_read_filtered_rows, _statistics.lazy_read_filtered_rows); - COUNTER_UPDATE(_parquet_profile.filtered_bytes, _statistics.filtered_bytes); - COUNTER_UPDATE(_parquet_profile.raw_rows_read, _statistics.read_rows); - COUNTER_UPDATE(_parquet_profile.to_read_bytes, _statistics.read_bytes); - COUNTER_UPDATE(_parquet_profile.column_read_time, _statistics.column_read_time); - COUNTER_UPDATE(_parquet_profile.parse_meta_time, _statistics.parse_meta_time); - COUNTER_UPDATE(_parquet_profile.parse_footer_time, _statistics.parse_footer_time); - COUNTER_UPDATE(_parquet_profile.open_file_time, _statistics.open_file_time); - COUNTER_UPDATE(_parquet_profile.open_file_num, _statistics.open_file_num); - COUNTER_UPDATE(_parquet_profile.page_index_filter_time, _statistics.page_index_filter_time); - COUNTER_UPDATE(_parquet_profile.read_page_index_time, _statistics.read_page_index_time); - COUNTER_UPDATE(_parquet_profile.parse_page_index_time, _statistics.parse_page_index_time); - COUNTER_UPDATE(_parquet_profile.row_group_filter_time, _statistics.row_group_filter_time); - COUNTER_UPDATE(_parquet_profile.file_footer_read_calls, _statistics.file_footer_read_calls); - COUNTER_UPDATE(_parquet_profile.file_footer_hit_cache, _statistics.file_footer_hit_cache); + COUNTER_UPDATE(_parquet_profile.filtered_row_groups, _reader_statistics.filtered_row_groups); + COUNTER_UPDATE(_parquet_profile.to_read_row_groups, _reader_statistics.read_row_groups); + 
COUNTER_UPDATE(_parquet_profile.total_row_groups, _total_groups); + COUNTER_UPDATE(_parquet_profile.filtered_group_rows, _reader_statistics.filtered_group_rows); + COUNTER_UPDATE(_parquet_profile.filtered_page_rows, _reader_statistics.filtered_page_rows); + COUNTER_UPDATE(_parquet_profile.lazy_read_filtered_rows, + _reader_statistics.lazy_read_filtered_rows); + COUNTER_UPDATE(_parquet_profile.filtered_bytes, _reader_statistics.filtered_bytes); + COUNTER_UPDATE(_parquet_profile.raw_rows_read, _reader_statistics.read_rows); + COUNTER_UPDATE(_parquet_profile.column_read_time, _reader_statistics.column_read_time); + COUNTER_UPDATE(_parquet_profile.parse_meta_time, _reader_statistics.parse_meta_time); + COUNTER_UPDATE(_parquet_profile.parse_footer_time, _reader_statistics.parse_footer_time); + COUNTER_UPDATE(_parquet_profile.file_reader_create_time, + _reader_statistics.file_reader_create_time); + COUNTER_UPDATE(_parquet_profile.open_file_num, _reader_statistics.open_file_num); + COUNTER_UPDATE(_parquet_profile.page_index_filter_time, + _reader_statistics.page_index_filter_time); + COUNTER_UPDATE(_parquet_profile.read_page_index_time, _reader_statistics.read_page_index_time); + COUNTER_UPDATE(_parquet_profile.parse_page_index_time, + _reader_statistics.parse_page_index_time); + COUNTER_UPDATE(_parquet_profile.row_group_filter_time, + _reader_statistics.row_group_filter_time); + COUNTER_UPDATE(_parquet_profile.file_footer_read_calls, + _reader_statistics.file_footer_read_calls); + COUNTER_UPDATE(_parquet_profile.file_footer_hit_cache, + _reader_statistics.file_footer_hit_cache); COUNTER_UPDATE(_parquet_profile.skip_page_header_num, _column_statistics.skip_page_header_num); COUNTER_UPDATE(_parquet_profile.parse_page_header_num, _column_statistics.parse_page_header_num); - COUNTER_UPDATE(_parquet_profile.predicate_filter_time, _statistics.predicate_filter_time); - COUNTER_UPDATE(_parquet_profile.dict_filter_rewrite_time, _statistics.dict_filter_rewrite_time); + 
COUNTER_UPDATE(_parquet_profile.predicate_filter_time, + _reader_statistics.predicate_filter_time); + COUNTER_UPDATE(_parquet_profile.dict_filter_rewrite_time, + _reader_statistics.dict_filter_rewrite_time); COUNTER_UPDATE(_parquet_profile.page_index_read_calls, _column_statistics.page_index_read_calls); COUNTER_UPDATE(_parquet_profile.decompress_time, _column_statistics.decompress_time); COUNTER_UPDATE(_parquet_profile.decompress_cnt, _column_statistics.decompress_cnt); + COUNTER_UPDATE(_parquet_profile.page_read_counter, _column_statistics.page_read_counter); + COUNTER_UPDATE(_parquet_profile.page_cache_write_counter, + _column_statistics.page_cache_write_counter); + COUNTER_UPDATE(_parquet_profile.page_cache_compressed_write_counter, + _column_statistics.page_cache_compressed_write_counter); + COUNTER_UPDATE(_parquet_profile.page_cache_decompressed_write_counter, + _column_statistics.page_cache_decompressed_write_counter); + COUNTER_UPDATE(_parquet_profile.page_cache_hit_counter, + _column_statistics.page_cache_hit_counter); + COUNTER_UPDATE(_parquet_profile.page_cache_missing_counter, + _column_statistics.page_cache_missing_counter); + COUNTER_UPDATE(_parquet_profile.page_cache_compressed_hit_counter, + _column_statistics.page_cache_compressed_hit_counter); + COUNTER_UPDATE(_parquet_profile.page_cache_decompressed_hit_counter, + _column_statistics.page_cache_decompressed_hit_counter); COUNTER_UPDATE(_parquet_profile.decode_header_time, _column_statistics.decode_header_time); COUNTER_UPDATE(_parquet_profile.read_page_header_time, _column_statistics.read_page_header_time); diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h index 3d08632102c6b0..a7a9084632171c 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_reader.h @@ -74,7 +74,7 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { ENABLE_FACTORY_CREATOR(ParquetReader); 
public: - struct Statistics { + struct ReaderStatistics { int32_t filtered_row_groups = 0; int32_t read_row_groups = 0; int64_t filtered_group_rows = 0; @@ -82,13 +82,12 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { int64_t lazy_read_filtered_rows = 0; int64_t read_rows = 0; int64_t filtered_bytes = 0; - int64_t read_bytes = 0; int64_t column_read_time = 0; int64_t parse_meta_time = 0; int64_t parse_footer_time = 0; int64_t file_footer_read_calls = 0; int64_t file_footer_hit_cache = 0; - int64_t open_file_time = 0; + int64_t file_reader_create_time = 0; int64_t open_file_num = 0; int64_t row_group_filter_time = 0; int64_t page_index_filter_time = 0; @@ -141,7 +140,7 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { Status get_parsed_schema(std::vector* col_names, std::vector* col_types) override; - Statistics& statistics() { return _statistics; } + ReaderStatistics& reader_statistics() { return _reader_statistics; } const tparquet::FileMetaData* get_meta_data() const { return _t_metadata; } @@ -171,16 +170,16 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { struct ParquetProfile { RuntimeProfile::Counter* filtered_row_groups = nullptr; RuntimeProfile::Counter* to_read_row_groups = nullptr; + RuntimeProfile::Counter* total_row_groups = nullptr; RuntimeProfile::Counter* filtered_group_rows = nullptr; RuntimeProfile::Counter* filtered_page_rows = nullptr; RuntimeProfile::Counter* lazy_read_filtered_rows = nullptr; RuntimeProfile::Counter* filtered_bytes = nullptr; RuntimeProfile::Counter* raw_rows_read = nullptr; - RuntimeProfile::Counter* to_read_bytes = nullptr; RuntimeProfile::Counter* column_read_time = nullptr; RuntimeProfile::Counter* parse_meta_time = nullptr; RuntimeProfile::Counter* parse_footer_time = nullptr; - RuntimeProfile::Counter* open_file_time = nullptr; + RuntimeProfile::Counter* file_reader_create_time = nullptr; RuntimeProfile::Counter* open_file_num = nullptr; 
RuntimeProfile::Counter* row_group_filter_time = nullptr; RuntimeProfile::Counter* page_index_read_calls = nullptr; @@ -191,6 +190,14 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { RuntimeProfile::Counter* file_footer_hit_cache = nullptr; RuntimeProfile::Counter* decompress_time = nullptr; RuntimeProfile::Counter* decompress_cnt = nullptr; + RuntimeProfile::Counter* page_read_counter = nullptr; + RuntimeProfile::Counter* page_cache_write_counter = nullptr; + RuntimeProfile::Counter* page_cache_compressed_write_counter = nullptr; + RuntimeProfile::Counter* page_cache_decompressed_write_counter = nullptr; + RuntimeProfile::Counter* page_cache_hit_counter = nullptr; + RuntimeProfile::Counter* page_cache_missing_counter = nullptr; + RuntimeProfile::Counter* page_cache_compressed_hit_counter = nullptr; + RuntimeProfile::Counter* page_cache_decompressed_hit_counter = nullptr; RuntimeProfile::Counter* decode_header_time = nullptr; RuntimeProfile::Counter* read_page_header_time = nullptr; RuntimeProfile::Counter* decode_value_time = nullptr; @@ -318,8 +325,8 @@ class ParquetReader : public GenericReader, public ExprPushDownHelper { // _table_column_names = _missing_cols + _read_table_columns const std::vector* _table_column_names = nullptr; - Statistics _statistics; - ParquetColumnReader::Statistics _column_statistics; + ReaderStatistics _reader_statistics; + ParquetColumnReader::ColumnStatistics _column_statistics; ParquetProfile _parquet_profile; bool _closed = false; io::IOContext* _io_ctx = nullptr; diff --git a/be/src/vec/exec/scan/file_scanner.cpp b/be/src/vec/exec/scan/file_scanner.cpp index a2e74b3d54773d..3cc4ed9d7e5ee8 100644 --- a/be/src/vec/exec/scan/file_scanner.cpp +++ b/be/src/vec/exec/scan/file_scanner.cpp @@ -1757,7 +1757,6 @@ void FileScanner::update_realtime_counters() { _file_cache_statistics->bytes_read_from_local - _last_bytes_read_from_local; int64_t delta_bytes_read_from_remote = 
_file_cache_statistics->bytes_read_from_remote - _last_bytes_read_from_remote; - if (_file_cache_statistics->bytes_read_from_local == 0 && _file_cache_statistics->bytes_read_from_remote == 0) { _state->get_query_ctx() diff --git a/be/test/io/fs/buffered_reader_test.cpp b/be/test/io/fs/buffered_reader_test.cpp index 3874b06c68c592..bc92d22b178d05 100644 --- a/be/test/io/fs/buffered_reader_test.cpp +++ b/be/test/io/fs/buffered_reader_test.cpp @@ -68,6 +68,8 @@ class SyncLocalFileReader : public io::FileReader { bool closed() const override { return _reader->closed(); } + int64_t mtime() const override { return _reader->mtime(); } + private: Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, const io::IOContext* io_ctx) override { @@ -96,6 +98,8 @@ class MockOffsetFileReader : public io::FileReader { bool closed() const override { return _closed; } + int64_t mtime() const override { return 0; } + protected: Status read_at_impl(size_t offset, Slice result, size_t* bytes_read, const io::IOContext* io_ctx) override { @@ -130,6 +134,8 @@ class TestingRangeCacheFileReader : public io::FileReader { bool closed() const override { return _delegate->closed(); } + int64_t mtime() const override { return _delegate->mtime(); } + const io::PrefetchRange& last_read_range() const { return *_last_read_range; } protected: diff --git a/be/test/vec/exec/format/file_reader/file_meta_cache_test.cpp b/be/test/vec/exec/format/file_reader/file_meta_cache_test.cpp index 3aef8db8459f39..5ea6335d2b9d18 100644 --- a/be/test/vec/exec/format/file_reader/file_meta_cache_test.cpp +++ b/be/test/vec/exec/format/file_reader/file_meta_cache_test.cpp @@ -38,6 +38,8 @@ class MockFileReader : public io::FileReader { bool closed() const override { return _closed; } + int64_t mtime() const override { return 0; } + Status close() override { _closed = true; return Status::OK(); diff --git a/be/test/vec/exec/format/parquet/parquet_page_cache_test.cpp 
b/be/test/vec/exec/format/parquet/parquet_page_cache_test.cpp new file mode 100644 index 00000000000000..21f2fa4e82b0c8 --- /dev/null +++ b/be/test/vec/exec/format/parquet/parquet_page_cache_test.cpp @@ -0,0 +1,859 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "common/config.h" +#include "io/fs/buffered_reader.h" +#include "olap/page_cache.h" +#include "runtime/exec_env.h" +#include "runtime/memory/cache_manager.h" +#include "util/block_compression.h" +#include "util/faststring.h" +#include "util/thrift_util.h" +#include "vec/exec/format/parquet/schema_desc.h" +#include "vec/exec/format/parquet/vparquet_column_chunk_reader.h" +#include "vec/exec/format/parquet/vparquet_page_reader.h" + +using namespace doris; +using namespace doris::vectorized; + +class FakeBufferedReader : public io::BufferedStreamReader { +public: + FakeBufferedReader(std::string path, std::vector data) + : _path(std::move(path)), _data(std::move(data)) {} + Status read_bytes(const uint8_t** buf, uint64_t offset, const size_t bytes_to_read, + const doris::io::IOContext* io_ctx) override { + if (offset + bytes_to_read > _data.size()) return Status::IOError("Out of bounds"); + *buf = _data.data() + offset; + return Status::OK(); + } + Status read_bytes(Slice& slice, uint64_t offset, const doris::io::IOContext* io_ctx) override { + if (offset + slice.size > _data.size()) return Status::IOError("Out of bounds"); + slice.data = reinterpret_cast(_data.data() + offset); + return Status::OK(); + } + std::string path() override { return _path; } + + int64_t mtime() const override { return 0; } + +private: + std::string _path; + std::vector _data; +}; + +TEST(ParquetPageCacheTest, CacheHitReturnsDecompressedPayload) { + // setup storage page cache + // ExecEnv::GetInstance()->set_storage_page_cache(StoragePageCache::create_global_cache(1 << 20, 10, 0)); + + ParquetPageReadContext ctx; + ctx.enable_parquet_file_page_cache = true; + + // construct thrift PageHeader (uncompressed payload) and payload + tparquet::PageHeader header; + header.type = tparquet::PageType::DATA_PAGE; + header.__set_compressed_page_size(4); + header.__set_uncompressed_page_size(4); + header.__isset.data_page_header = true; + 
header.data_page_header.__set_num_values(1); + + std::vector header_bytes; + ThriftSerializer ts(/*compact*/ true, /*initial*/ 256); + ASSERT_TRUE(ts.serialize(&header, &header_bytes).ok()); + + std::vector payload = {0x11, 0x22, 0x33, 0x44}; + std::vector cached_data; + cached_data.insert(cached_data.end(), header_bytes.begin(), header_bytes.end()); + cached_data.insert(cached_data.end(), payload.begin(), payload.end()); + + std::string path = "test_parquet_cache_file"; + int64_t header_offset = 128; + // make file_end_offset consistent with reader/page reader end offset used in test + int64_t file_end_offset = header_offset + static_cast(cached_data.size()); + + // insert into cache + int64_t mtime = 0; + StoragePageCache::CacheKey key(fmt::format("{}::{}", path, mtime), + static_cast(file_end_offset), header_offset); + size_t total = cached_data.size(); + auto* page = new DataPage(total, true, segment_v2::DATA_PAGE); + memcpy(page->data(), cached_data.data(), total); + page->reset_size(total); + PageCacheHandle handle; + StoragePageCache::instance()->insert(key, page, &handle, segment_v2::DATA_PAGE); + + // create fake reader and a ColumnChunkReader to verify cache hit + // ensure the reader contains the same header+payload at the header offset so header parsing succeeds + std::vector backing(256, 0); + memcpy(backing.data() + header_offset, cached_data.data(), total); + FakeBufferedReader reader(path, backing); + // prepare column chunk metadata so ColumnChunkReader uses same offsets + tparquet::ColumnChunk cc; + cc.meta_data.__set_data_page_offset(header_offset); + cc.meta_data.__set_total_compressed_size(total); + cc.meta_data.__set_num_values(1); + cc.meta_data.__set_codec(tparquet::CompressionCodec::UNCOMPRESSED); + + FieldSchema field_schema; + field_schema.repetition_level = 0; + field_schema.definition_level = 0; + + ColumnChunkReader ccr(&reader, &cc, &field_schema, nullptr, 0, nullptr, ctx); + ASSERT_TRUE(ccr.init().ok()); + // 
ASSERT_TRUE(ccr.next_page().ok()); + // load_page_data should hit the cache and return decompressed payload + ASSERT_TRUE(ccr.load_page_data().ok()); + Slice s = ccr.get_page_data(); + ASSERT_EQ(s.size, payload.size()); + ASSERT_EQ(0, memcmp(s.data, payload.data(), payload.size())); + // stats: ensure there was a page read and at least one hit recorded + auto& statistics = ccr.statistics(); + EXPECT_EQ(statistics.page_read_counter, 1); + EXPECT_EQ(statistics.page_cache_hit_counter, 1); + EXPECT_EQ(statistics.page_cache_decompressed_hit_counter, 1); + // now safe to cleanup cache + // { + // StoragePageCache* _cache_ptr = StoragePageCache::instance(); + // // Ensure any outstanding PageCacheHandle is released before destroying cache + // handle = PageCacheHandle(); + // ExecEnv::GetInstance()->set_storage_page_cache(nullptr); + // delete _cache_ptr; + // // Clear CacheManager registration so tests do not leave global state + // // behind when unregister_cache is a no-op under BE_TEST. + // CacheManager::instance()->clear_for_test(); + // } +} + +TEST(ParquetPageCacheTest, DecompressedPageInsertedByColumnChunkReader) { + // ExecEnv::GetInstance()->set_storage_page_cache(StoragePageCache::create_global_cache(1 << 20, 10, 0)); + + ParquetPageReadContext ctx; + ctx.enable_parquet_file_page_cache = true; + // ensure decompressed pages are cached via BE config + double old_thresh = config::parquet_page_cache_decompress_threshold; + bool old_enable_compressed = config::enable_parquet_cache_compressed_pages; + config::parquet_page_cache_decompress_threshold = 100.0; + config::enable_parquet_cache_compressed_pages = false; + + // construct uncompressed header + payload in file buffer + tparquet::PageHeader header; + header.type = tparquet::PageType::DATA_PAGE; + header.__set_compressed_page_size(4); + header.__set_uncompressed_page_size(4); + header.__isset.data_page_header = true; + header.data_page_header.__set_num_values(1); + + std::vector header_bytes; + 
ThriftSerializer ts(/*compact*/ true, /*initial*/ 256); + ASSERT_TRUE(ts.serialize(&header, &header_bytes).ok()); + + std::vector payload = {0x55, 0x66, 0x77, 0x88}; + std::vector file_data; + file_data.insert(file_data.end(), header_bytes.begin(), header_bytes.end()); + file_data.insert(file_data.end(), payload.begin(), payload.end()); + + std::string path = "test_parquet_insert_file"; + int64_t header_offset = 0; + + FakeBufferedReader reader(path, file_data); + + // prepare column chunk metadata + tparquet::ColumnChunk cc; + cc.meta_data.__set_data_page_offset(header_offset); + cc.meta_data.__set_total_compressed_size(file_data.size()); + cc.meta_data.__set_num_values(1); + cc.meta_data.__set_codec(tparquet::CompressionCodec::UNCOMPRESSED); + + { + FieldSchema field_schema; + field_schema.repetition_level = 0; + field_schema.definition_level = 0; + ColumnChunkReader ccr(&reader, &cc, &field_schema, nullptr, 0, nullptr, ctx); + ASSERT_TRUE(ccr.init().ok()); + // ASSERT_TRUE(ccr.next_page().ok()); + ASSERT_TRUE(ccr.load_page_data().ok()); + + // Now cache should have an entry; verify by creating a fresh ColumnChunkReader and hitting cache + ColumnChunkReader ccr_check(&reader, &cc, &field_schema, nullptr, 0, nullptr, + ctx); + ASSERT_TRUE(ccr_check.init().ok()); + // ASSERT_TRUE(ccr_check.next_page().ok()); + ASSERT_TRUE(ccr_check.load_page_data().ok()); + Slice s = ccr_check.get_page_data(); + ASSERT_EQ(s.size, payload.size()); + EXPECT_EQ(0, memcmp(s.data, payload.data(), payload.size())); + EXPECT_EQ(ccr_check.statistics().page_cache_hit_counter, 1); + } + // cleanup cache after readers go out of scope + // { + // StoragePageCache* _cache_ptr = StoragePageCache::instance(); + // ExecEnv::GetInstance()->set_storage_page_cache(nullptr); + // delete _cache_ptr; + // CacheManager::instance()->clear_for_test(); + // } + // restore config + config::parquet_page_cache_decompress_threshold = old_thresh; + config::enable_parquet_cache_compressed_pages = 
old_enable_compressed; +} + +TEST(ParquetPageCacheTest, V2LevelsPreservedInCache) { + // ExecEnv::GetInstance()->set_storage_page_cache(StoragePageCache::create_global_cache(1 << 20, 10, 0)); + + ParquetPageReadContext ctx; + ctx.enable_parquet_file_page_cache = true; + // ensure decompressed pages are cached via BE config + double old_thresh = config::parquet_page_cache_decompress_threshold; + bool old_enable_compressed = config::enable_parquet_cache_compressed_pages; + config::parquet_page_cache_decompress_threshold = 100.0; + config::enable_parquet_cache_compressed_pages = false; + + // construct v2 header + levels + payload in file buffer (uncompressed) + tparquet::PageHeader header; + header.type = tparquet::PageType::DATA_PAGE_V2; + int rl = 2; + int dl = 1; + int payload_sz = 2; + header.__set_compressed_page_size(rl + dl + payload_sz); + header.__set_uncompressed_page_size(rl + dl + payload_sz); + header.__isset.data_page_header_v2 = true; + header.data_page_header_v2.__set_repetition_levels_byte_length(rl); + header.data_page_header_v2.__set_definition_levels_byte_length(dl); + header.data_page_header_v2.__set_is_compressed(false); + header.data_page_header_v2.__set_num_values(1); + + std::vector header_bytes; + ThriftSerializer ts(/*compact*/ true, /*initial*/ 256); + ASSERT_TRUE(ts.serialize(&header, &header_bytes).ok()); + + std::vector level_bytes = {0x11, 0x22, 0x33}; + std::vector payload = {0xAA, 0xBB}; + std::vector file_data; + file_data.insert(file_data.end(), header_bytes.begin(), header_bytes.end()); + file_data.insert(file_data.end(), level_bytes.begin(), level_bytes.end()); + file_data.insert(file_data.end(), payload.begin(), payload.end()); + + std::string path = "test_v2_levels_file"; + FakeBufferedReader reader(path, file_data); + + // prepare column chunk metadata + tparquet::ColumnChunk cc; + cc.meta_data.__set_data_page_offset(0); + cc.meta_data.__set_total_compressed_size(file_data.size()); + cc.meta_data.__set_num_values(1); + 
cc.meta_data.__set_codec(tparquet::CompressionCodec::UNCOMPRESSED); + + FieldSchema field_schema; + field_schema.repetition_level = 0; + field_schema.definition_level = 0; + { + ColumnChunkReader ccr(&reader, &cc, &field_schema, nullptr, 0, nullptr, ctx); + ASSERT_TRUE(ccr.init().ok()); + // ASSERT_TRUE(ccr.next_page().ok()); + ASSERT_TRUE(ccr.load_page_data().ok()); + + // Now cache should have entry; verify by creating a ColumnChunkReader and hitting cache + ColumnChunkReader ccr_check(&reader, &cc, &field_schema, nullptr, 0, nullptr, + ctx); + ASSERT_TRUE(ccr_check.init().ok()); + // ASSERT_TRUE(ccr_check.next_page().ok()); + ASSERT_TRUE(ccr_check.load_page_data().ok()); + Slice s = ccr_check.get_page_data(); + ASSERT_EQ(s.size, payload.size()); + EXPECT_EQ(0, memcmp(s.data, payload.data(), payload.size())); + } + + // Verify that a fresh ColumnChunkReader reusing cache gets level bytes preserved + FieldSchema field_schema2; + field_schema2.repetition_level = 2; // v2 levels present + field_schema2.definition_level = 1; + ColumnChunkReader ccr2(&reader, &cc, &field_schema2, nullptr, 0, nullptr); + ASSERT_TRUE(ccr2.init().ok()); + // ASSERT_TRUE(ccr2.next_page().ok()); + ASSERT_TRUE(ccr2.load_page_data().ok()); + // Level slices should equal the original level bytes + const Slice& rep = ccr2.v2_rep_levels(); + const Slice& def = ccr2.v2_def_levels(); + auto& statistics = ccr2.statistics(); + EXPECT_GT(statistics.page_cache_hit_counter, 0); + // because threshold is set to cache decompressed, we should see decompressed hits + EXPECT_GT(statistics.page_cache_decompressed_hit_counter, 0); + ASSERT_EQ(def.size, dl); + EXPECT_EQ(0, memcmp(rep.data, level_bytes.data(), rl)); + EXPECT_EQ(0, memcmp(def.data, level_bytes.data() + rl, dl)); + // // cleanup cache after readers have been destroyed + // { + // StoragePageCache* _cache_ptr = StoragePageCache::instance(); + // ExecEnv::GetInstance()->set_storage_page_cache(nullptr); + // delete _cache_ptr; + // 
CacheManager::instance()->clear_for_test(); + // } + // restore config + config::parquet_page_cache_decompress_threshold = old_thresh; + config::enable_parquet_cache_compressed_pages = old_enable_compressed; +} + +TEST(ParquetPageCacheTest, CompressedV1PageCachedAndHit) { + ParquetPageReadContext ctx; + ctx.enable_parquet_file_page_cache = true; + // Note: parquet_page_cache_decompress_threshold and enable_parquet_cache_compressed_pages + // are now BE config variables, not context fields + + // construct compressed v1 header + compressed payload in file buffer + tparquet::PageHeader header; + header.type = tparquet::PageType::DATA_PAGE; + header.__isset.data_page_header = true; + header.data_page_header.__set_num_values(1); + + std::vector payload = {0x01, 0x02, 0x03, 0x04}; + + // compress payload using a block codec + BlockCompressionCodec* codec = nullptr; + ASSERT_TRUE(get_block_compression_codec(segment_v2::CompressionTypePB::SNAPPY, &codec).ok()); + faststring compressed_fast; + std::vector inputs; + inputs.emplace_back(payload.data(), payload.size()); + ASSERT_TRUE(codec->compress(inputs, payload.size(), &compressed_fast).ok()); + + header.__set_compressed_page_size(static_cast(compressed_fast.size())); + header.__set_uncompressed_page_size(static_cast(payload.size())); + + std::vector header_bytes; + ThriftSerializer ts(/*compact*/ true, /*initial*/ 256); + ASSERT_TRUE(ts.serialize(&header, &header_bytes).ok()); + + std::vector file_data; + file_data.insert(file_data.end(), header_bytes.begin(), header_bytes.end()); + file_data.insert(file_data.end(), compressed_fast.data(), + compressed_fast.data() + compressed_fast.size()); + + std::string path = "test_compressed_v1_file"; + FakeBufferedReader reader(path, file_data); + + tparquet::ColumnChunk cc; + cc.meta_data.__set_data_page_offset(0); + cc.meta_data.__set_total_compressed_size(file_data.size()); + cc.meta_data.__set_num_values(1); + cc.meta_data.__set_codec(tparquet::CompressionCodec::SNAPPY); + + 
FieldSchema field_schema; + field_schema.repetition_level = 0; + field_schema.definition_level = 0; + + // Load page to trigger decompression + cache insert + ColumnChunkReader ccr(&reader, &cc, &field_schema, nullptr, 0, nullptr, ctx); + ASSERT_TRUE(ccr.init().ok()); + // ASSERT_TRUE(ccr.next_page().ok()); + ASSERT_TRUE(ccr.load_page_data().ok()); + EXPECT_EQ(ccr.statistics().page_cache_write_counter, 1); + + // Now verify a fresh reader hits the cache and returns payload + ColumnChunkReader ccr_check(&reader, &cc, &field_schema, nullptr, 0, nullptr, + ctx); + ASSERT_TRUE(ccr_check.init().ok()); + // ASSERT_TRUE(ccr_check.next_page().ok()); + ASSERT_TRUE(ccr_check.load_page_data().ok()); + Slice s = ccr_check.get_page_data(); + ASSERT_EQ(s.size, payload.size()); + EXPECT_EQ(0, memcmp(s.data, payload.data(), payload.size())); + EXPECT_EQ(ccr_check.statistics().page_cache_hit_counter, 1); +} + +TEST(ParquetPageCacheTest, CompressedV2LevelsPreservedInCache) { + ParquetPageReadContext ctx; + ctx.enable_parquet_file_page_cache = true; + // Note: parquet_page_cache_decompress_threshold and enable_parquet_cache_compressed_pages + // are now BE config variables, not context fields + + // construct v2 header + levels + compressed payload in file buffer + tparquet::PageHeader header; + header.type = tparquet::PageType::DATA_PAGE_V2; + int rl = 2; + int dl = 1; + //int payload_sz = 2; + header.__isset.data_page_header_v2 = true; + header.data_page_header_v2.__set_repetition_levels_byte_length(rl); + header.data_page_header_v2.__set_definition_levels_byte_length(dl); + header.data_page_header_v2.__set_is_compressed(true); + header.data_page_header_v2.__set_num_values(1); + + std::vector level_bytes = {0x11, 0x22, 0x33}; + std::vector payload = {0xAA, 0xBB}; + + // compress payload + BlockCompressionCodec* codec = nullptr; + ASSERT_TRUE(get_block_compression_codec(segment_v2::CompressionTypePB::SNAPPY, &codec).ok()); + faststring compressed_fast; + std::vector inputs; + 
inputs.emplace_back(payload.data(), payload.size()); + ASSERT_TRUE(codec->compress(inputs, payload.size(), &compressed_fast).ok()); + + // compressed page: levels (uncompressed) followed by compressed payload + std::vector compressed_page; + compressed_page.insert(compressed_page.end(), level_bytes.begin(), level_bytes.end()); + compressed_page.insert(compressed_page.end(), compressed_fast.data(), + compressed_fast.data() + compressed_fast.size()); + + header.__set_compressed_page_size(static_cast(compressed_page.size())); + header.__set_uncompressed_page_size(static_cast(level_bytes.size() + payload.size())); + + std::vector header_bytes; + ThriftSerializer ts(/*compact*/ true, /*initial*/ 256); + ASSERT_TRUE(ts.serialize(&header, &header_bytes).ok()); + + std::vector file_data; + file_data.insert(file_data.end(), header_bytes.begin(), header_bytes.end()); + file_data.insert(file_data.end(), compressed_page.begin(), compressed_page.end()); + + std::string path = "test_compressed_v2_file"; + FakeBufferedReader reader(path, file_data); + + tparquet::ColumnChunk cc; + cc.meta_data.__set_data_page_offset(0); + cc.meta_data.__set_total_compressed_size(file_data.size()); + cc.meta_data.__set_num_values(1); + cc.meta_data.__set_codec(tparquet::CompressionCodec::SNAPPY); + + FieldSchema field_schema; + field_schema.repetition_level = 0; + field_schema.definition_level = 0; + + // Load page to trigger decompression + cache insert + ColumnChunkReader ccr(&reader, &cc, &field_schema, nullptr, 0, nullptr, ctx); + ASSERT_TRUE(ccr.init().ok()); + // ASSERT_TRUE(ccr.next_page().ok()); + ASSERT_TRUE(ccr.load_page_data().ok()); + EXPECT_EQ(ccr.statistics().page_cache_write_counter, 1); + + // Now verify a fresh reader hits the cache and v2 levels are preserved + FieldSchema field_schema2; + field_schema2.repetition_level = rl; + field_schema2.definition_level = dl; + ColumnChunkReader ccr_check(&reader, &cc, &field_schema2, nullptr, 0, nullptr, + ctx); + 
ASSERT_TRUE(ccr_check.init().ok()); + // ASSERT_TRUE(ccr_check.next_page().ok()); + ASSERT_TRUE(ccr_check.load_page_data().ok()); + Slice s = ccr_check.get_page_data(); + ASSERT_EQ(s.size, payload.size()); + EXPECT_EQ(0, memcmp(s.data, payload.data(), payload.size())); + const Slice& rep = ccr_check.v2_rep_levels(); + const Slice& def = ccr_check.v2_def_levels(); + ASSERT_EQ(rep.size, rl); + ASSERT_EQ(def.size, dl); + // cached v2 page is stored decompressed (threshold=100), make sure counter reflects it + EXPECT_GT(ccr_check.statistics().page_cache_decompressed_hit_counter, 0); + EXPECT_EQ(0, memcmp(rep.data, level_bytes.data(), rl)); + EXPECT_EQ(0, memcmp(def.data, level_bytes.data() + rl, dl)); +} + +TEST(ParquetPageCacheTest, MultiPagesMixedV1V2CacheHit) { + ParquetPageReadContext ctx; + ctx.enable_parquet_file_page_cache = true; + // Note: parquet_page_cache_decompress_threshold and enable_parquet_cache_compressed_pages + // are now BE config variables, not context fields + + // Prepare a v1 uncompressed page and a v2 uncompressed page and insert both into cache + std::string path = "test_multi_pages_file"; + + // v1 page + tparquet::PageHeader hdr1; + hdr1.type = tparquet::PageType::DATA_PAGE; + hdr1.__set_compressed_page_size(4); + hdr1.__set_uncompressed_page_size(4); + hdr1.__isset.data_page_header = true; + hdr1.data_page_header.__set_num_values(1); + std::vector header1_bytes; + ThriftSerializer ts(/*compact*/ true, /*initial*/ 256); + ASSERT_TRUE(ts.serialize(&hdr1, &header1_bytes).ok()); + std::vector payload1 = {0x10, 0x20, 0x30, 0x40}; + std::vector cached1; + cached1.insert(cached1.end(), header1_bytes.begin(), header1_bytes.end()); + cached1.insert(cached1.end(), payload1.begin(), payload1.end()); + + // v2 page + tparquet::PageHeader hdr2; + hdr2.type = tparquet::PageType::DATA_PAGE_V2; + int rl = 2; + int dl = 1; + int payload2_sz = 2; + hdr2.__set_compressed_page_size(rl + dl + payload2_sz); + hdr2.__set_uncompressed_page_size(rl + dl + 
payload2_sz); + hdr2.__isset.data_page_header_v2 = true; + hdr2.data_page_header_v2.__set_repetition_levels_byte_length(rl); + hdr2.data_page_header_v2.__set_definition_levels_byte_length(dl); + hdr2.data_page_header_v2.__set_is_compressed(false); + hdr2.data_page_header_v2.__set_num_values(1); + std::vector header2_bytes; + ASSERT_TRUE(ts.serialize(&hdr2, &header2_bytes).ok()); + std::vector level_bytes = {0x11, 0x22, 0x33}; + std::vector payload2 = {0xAA, 0xBB}; + std::vector cached2; + cached2.insert(cached2.end(), header2_bytes.begin(), header2_bytes.end()); + cached2.insert(cached2.end(), level_bytes.begin(), level_bytes.end()); + cached2.insert(cached2.end(), payload2.begin(), payload2.end()); + + // Insert both pages into cache under different header offsets + size_t total1 = cached1.size(); + auto* page1 = new DataPage(total1, true, segment_v2::DATA_PAGE); + memcpy(page1->data(), cached1.data(), total1); + page1->reset_size(total1); + PageCacheHandle h1; + size_t header1_start = 128; + int64_t mtime = 0; + StoragePageCache::CacheKey key1(fmt::format("{}::{}", path, mtime), + static_cast(header1_start + total1), header1_start); + StoragePageCache::instance()->insert(key1, page1, &h1, segment_v2::DATA_PAGE); + + size_t total2 = cached2.size(); + auto* page2 = new DataPage(total2, true, segment_v2::DATA_PAGE); + memcpy(page2->data(), cached2.data(), total2); + page2->reset_size(total2); + PageCacheHandle h2; + size_t header2_start = 256; + StoragePageCache::CacheKey key2(fmt::format("{}::{}", path, mtime), + static_cast(header2_start + total2), header2_start); + StoragePageCache::instance()->insert(key2, page2, &h2, segment_v2::DATA_PAGE); + + // Now create readers that would lookup those cache keys + // Reader1 must expose header+page bytes at offset header1_start + std::vector reader_backing1(3000, 0); + memcpy(reader_backing1.data() + header1_start, cached1.data(), total1); + FakeBufferedReader reader1(path, reader_backing1); + tparquet::ColumnChunk cc1; + 
cc1.meta_data.__set_data_page_offset(128); + cc1.meta_data.__set_total_compressed_size(total1); + cc1.meta_data.__set_num_values(1); + cc1.meta_data.__set_codec(tparquet::CompressionCodec::UNCOMPRESSED); + FieldSchema field_schema1; + field_schema1.repetition_level = 0; + field_schema1.definition_level = 0; + ColumnChunkReader ccr1(&reader1, &cc1, &field_schema1, nullptr, 0, nullptr, ctx); + ASSERT_TRUE(ccr1.init().ok()); + // ASSERT_TRUE(ccr1.next_page().ok()); + ASSERT_TRUE(ccr1.load_page_data().ok()); + Slice s1 = ccr1.get_page_data(); + ASSERT_EQ(s1.size, payload1.size()); + EXPECT_EQ(0, memcmp(s1.data, payload1.data(), payload1.size())); + + std::vector reader_backing2(3000, 0); + memcpy(reader_backing2.data() + header2_start, cached2.data(), total2); + FakeBufferedReader reader2(path, reader_backing2); + tparquet::ColumnChunk cc2; + cc2.meta_data.__set_data_page_offset(256); + cc2.meta_data.__set_total_compressed_size(total2); + cc2.meta_data.__set_num_values(1); + cc2.meta_data.__set_codec(tparquet::CompressionCodec::UNCOMPRESSED); + FieldSchema field_schema2; + field_schema2.repetition_level = rl; + field_schema2.definition_level = dl; + ColumnChunkReader ccr2(&reader2, &cc2, &field_schema2, nullptr, 0, nullptr, ctx); + ASSERT_TRUE(ccr2.init().ok()); + // ASSERT_TRUE(ccr2.next_page().ok()); + ASSERT_TRUE(ccr2.load_page_data().ok()); + Slice s2 = ccr2.get_page_data(); + ASSERT_EQ(s2.size, payload2.size()); + EXPECT_EQ(0, memcmp(s2.data, payload2.data(), payload2.size())); + const Slice& rep = ccr2.v2_rep_levels(); + const Slice& def = ccr2.v2_def_levels(); + ASSERT_EQ(rep.size, rl); + ASSERT_EQ(def.size, dl); + EXPECT_EQ(0, memcmp(rep.data, level_bytes.data(), rl)); + EXPECT_EQ(0, memcmp(def.data, level_bytes.data() + rl, dl)); +} + +TEST(ParquetPageCacheTest, CacheMissThenHit) { + ParquetPageReadContext ctx; + ctx.enable_parquet_file_page_cache = true; + // Note: parquet_page_cache_decompress_threshold and enable_parquet_cache_compressed_pages + // are now 
BE config variables, not context fields + + // uncompressed v1 page + tparquet::PageHeader header; + header.type = tparquet::PageType::DATA_PAGE; + header.__set_compressed_page_size(4); + header.__set_uncompressed_page_size(4); + header.__isset.data_page_header = true; + header.data_page_header.__set_num_values(1); + std::vector header_bytes; + ThriftSerializer ts(/*compact*/ true, /*initial*/ 256); + ASSERT_TRUE(ts.serialize(&header, &header_bytes).ok()); + std::vector payload = {0xDE, 0xAD, 0xBE, 0xEF}; + std::vector backing(256, 0); + std::vector cached; + cached.insert(cached.end(), header_bytes.begin(), header_bytes.end()); + cached.insert(cached.end(), payload.begin(), payload.end()); + int64_t header_offset = 64; + memcpy(backing.data() + header_offset, cached.data(), cached.size()); + + std::string path = "test_miss_then_hit"; + FakeBufferedReader reader(path, backing); + + tparquet::ColumnChunk cc; + cc.meta_data.__set_data_page_offset(header_offset); + cc.meta_data.__set_total_compressed_size(cached.size()); + cc.meta_data.__set_num_values(1); + cc.meta_data.__set_codec(tparquet::CompressionCodec::UNCOMPRESSED); + + FieldSchema fs; + fs.repetition_level = 0; + fs.definition_level = 0; + + // First reader: should not hit cache, but should write cache + ColumnChunkReader ccr(&reader, &cc, &fs, nullptr, 0, nullptr, ctx); + ASSERT_TRUE(ccr.init().ok()); + // ASSERT_TRUE(ccr.next_page().ok()); + ASSERT_TRUE(ccr.load_page_data().ok()); + auto& statistics = ccr.statistics(); + EXPECT_EQ(statistics.page_cache_hit_counter, 0); + EXPECT_EQ(statistics.page_cache_write_counter, 1); + + // Second reader: should hit cache + ColumnChunkReader ccr2(&reader, &cc, &fs, nullptr, 0, nullptr, ctx); + ASSERT_TRUE(ccr2.init().ok()); + // ASSERT_TRUE(ccr2.next_page().ok()); + ASSERT_TRUE(ccr2.load_page_data().ok()); + auto& statistics2 = ccr2.statistics(); + EXPECT_EQ(statistics2.page_cache_hit_counter, 1); + EXPECT_EQ(statistics2.page_cache_decompressed_hit_counter, 1); +} + 
+TEST(ParquetPageCacheTest, DecompressThresholdCachesCompressed) { + ParquetPageReadContext ctx; + ctx.enable_parquet_file_page_cache = true; + // Note: enable_parquet_cache_compressed_pages is now a BE config variable, not a context field + + // prepare a compressible payload (lots of zeros) + std::vector payload(1024, 0); + + // compress payload using snappy + BlockCompressionCodec* codec = nullptr; + ASSERT_TRUE(get_block_compression_codec(segment_v2::CompressionTypePB::SNAPPY, &codec).ok()); + faststring compressed_fast; + std::vector inputs; + inputs.emplace_back(payload.data(), payload.size()); + ASSERT_TRUE(codec->compress(inputs, payload.size(), &compressed_fast).ok()); + + tparquet::PageHeader header; + header.type = tparquet::PageType::DATA_PAGE; + header.__set_compressed_page_size(static_cast(compressed_fast.size())); + header.__set_uncompressed_page_size(static_cast(payload.size())); + header.__isset.data_page_header = true; + header.data_page_header.__set_num_values(1); + + std::vector header_bytes; + ThriftSerializer ts(/*compact*/ true, /*initial*/ 256); + ASSERT_TRUE(ts.serialize(&header, &header_bytes).ok()); + + std::vector file_data; + file_data.insert(file_data.end(), header_bytes.begin(), header_bytes.end()); + file_data.insert(file_data.end(), compressed_fast.data(), + compressed_fast.data() + compressed_fast.size()); + + std::string path = "test_threshold_file_compressed"; + FakeBufferedReader reader(path, file_data); + + tparquet::ColumnChunk cc; + cc.meta_data.__set_data_page_offset(0); + cc.meta_data.__set_total_compressed_size(file_data.size()); + cc.meta_data.__set_num_values(1); + cc.meta_data.__set_codec(tparquet::CompressionCodec::SNAPPY); + + FieldSchema fs; + fs.repetition_level = 0; + fs.definition_level = 0; + + // Case: very small threshold -> cache the compressed payload (smaller footprint) + double old_thresh = config::parquet_page_cache_decompress_threshold; + bool old_enable_compressed = 
config::enable_parquet_cache_compressed_pages; + config::parquet_page_cache_decompress_threshold = 0.1; + config::enable_parquet_cache_compressed_pages = true; + ColumnChunkReader ccr_small_thresh(&reader, &cc, &fs, nullptr, 0, nullptr, ctx); + ASSERT_TRUE(ccr_small_thresh.init().ok()); + // ASSERT_TRUE(ccr_small_thresh.next_page().ok()); + ASSERT_TRUE(ccr_small_thresh.load_page_data().ok()); + EXPECT_EQ(ccr_small_thresh.statistics().page_cache_write_counter, 1); + + // Inspect cache entry: payload stored should be compressed size + PageCacheHandle handle_small; + size_t file_end = header_bytes.size() + compressed_fast.size(); + int64_t mtime = 0; + StoragePageCache::CacheKey key_small(fmt::format("{}::{}", path, mtime), + /*file_end_offset*/ file_end, /*header_start*/ 0); + bool found_small = + StoragePageCache::instance()->lookup(key_small, &handle_small, segment_v2::DATA_PAGE); + ASSERT_TRUE(found_small); + Slice cached_small = handle_small.data(); + size_t header_size = header_bytes.size(); + size_t payload_in_cache_size = cached_small.size - header_size; // no levels here + ASSERT_EQ(payload_in_cache_size, compressed_fast.size()); + + // restore config + config::parquet_page_cache_decompress_threshold = old_thresh; + config::enable_parquet_cache_compressed_pages = old_enable_compressed; +} + +TEST(ParquetPageCacheTest, DecompressThresholdCachesDecompressed) { + ParquetPageReadContext ctx; + ctx.enable_parquet_file_page_cache = true; + // Note: enable_parquet_cache_compressed_pages is now a BE config variable, not a context field + + // prepare a compressible payload (lots of zeros) + std::vector payload(1024, 0); + + // compress payload using snappy + BlockCompressionCodec* codec = nullptr; + ASSERT_TRUE(get_block_compression_codec(segment_v2::CompressionTypePB::SNAPPY, &codec).ok()); + faststring compressed_fast; + std::vector inputs; + inputs.emplace_back(payload.data(), payload.size()); + ASSERT_TRUE(codec->compress(inputs, payload.size(), 
&compressed_fast).ok()); + + tparquet::PageHeader header; + header.type = tparquet::PageType::DATA_PAGE; + header.__set_compressed_page_size(static_cast(compressed_fast.size())); + header.__set_uncompressed_page_size(static_cast(payload.size())); + header.__isset.data_page_header = true; + header.data_page_header.__set_num_values(1); + + std::vector header_bytes; + ThriftSerializer ts(/*compact*/ true, /*initial*/ 256); + ASSERT_TRUE(ts.serialize(&header, &header_bytes).ok()); + + std::vector file_data; + file_data.insert(file_data.end(), header_bytes.begin(), header_bytes.end()); + file_data.insert(file_data.end(), compressed_fast.data(), + compressed_fast.data() + compressed_fast.size()); + + std::string path = "test_threshold_file_decompressed"; + FakeBufferedReader reader(path, file_data); + + tparquet::ColumnChunk cc; + cc.meta_data.__set_data_page_offset(0); + cc.meta_data.__set_total_compressed_size(file_data.size()); + cc.meta_data.__set_num_values(1); + cc.meta_data.__set_codec(tparquet::CompressionCodec::SNAPPY); + + FieldSchema fs; + fs.repetition_level = 0; + fs.definition_level = 0; + + // Case: very large threshold -> cache decompressed payload + double old_thresh = config::parquet_page_cache_decompress_threshold; + bool old_enable_compressed = config::enable_parquet_cache_compressed_pages; + config::parquet_page_cache_decompress_threshold = 100.0; + config::enable_parquet_cache_compressed_pages = false; + ColumnChunkReader ccr_large_thresh(&reader, &cc, &fs, nullptr, 0, nullptr, ctx); + ASSERT_TRUE(ccr_large_thresh.init().ok()); + // ASSERT_TRUE(ccr_large_thresh.next_page().ok()); + ASSERT_TRUE(ccr_large_thresh.load_page_data().ok()); + EXPECT_EQ(ccr_large_thresh.statistics().page_cache_write_counter, 1); + + // Inspect cache entry for large threshold: payload stored should be uncompressed size + PageCacheHandle handle_large; + size_t file_end = header_bytes.size() + compressed_fast.size(); + int64_t mtime = 0; + StoragePageCache::CacheKey 
key_large(fmt::format("{}::{}", path, mtime), + /*file_end_offset*/ file_end, /*header_start*/ 0); + bool found_large = + StoragePageCache::instance()->lookup(key_large, &handle_large, segment_v2::DATA_PAGE); + ASSERT_TRUE(found_large); + Slice cached_large = handle_large.data(); + size_t payload_in_cache_size_large = cached_large.size - header_bytes.size(); + ASSERT_EQ(payload_in_cache_size_large, payload.size()); + + // Verify cache hit for a new reader (should hit the decompressed entry we just created) + ColumnChunkReader ccr_check(&reader, &cc, &fs, nullptr, 0, nullptr, ctx); + ASSERT_TRUE(ccr_check.init().ok()); + // ASSERT_TRUE(ccr_check.next_page().ok()); + ASSERT_TRUE(ccr_check.load_page_data().ok()); + EXPECT_EQ(ccr_check.statistics().page_cache_hit_counter, 1); + // restore config + config::parquet_page_cache_decompress_threshold = old_thresh; + config::enable_parquet_cache_compressed_pages = old_enable_compressed; +} + +TEST(ParquetPageCacheTest, MultipleReadersShareCachedEntry) { + ParquetPageReadContext ctx; + ctx.enable_parquet_file_page_cache = true; + double old_thresh = config::parquet_page_cache_decompress_threshold; + bool old_enable_compressed = config::enable_parquet_cache_compressed_pages; + config::parquet_page_cache_decompress_threshold = 100.0; + config::enable_parquet_cache_compressed_pages = false; + + // Create a v2 cached page and then instantiate multiple readers that hit the cache + std::string path = "test_shared_handles"; + tparquet::PageHeader hdr; + hdr.type = tparquet::PageType::DATA_PAGE_V2; + int rl = 2; + int dl = 1; + hdr.__isset.data_page_header_v2 = true; + hdr.data_page_header_v2.__set_repetition_levels_byte_length(rl); + hdr.data_page_header_v2.__set_definition_levels_byte_length(dl); + hdr.data_page_header_v2.__set_is_compressed(false); + hdr.data_page_header_v2.__set_num_values(1); + std::vector header_bytes; + ThriftSerializer ts(/*compact*/ true, /*initial*/ 256); + ASSERT_TRUE(ts.serialize(&hdr, 
&header_bytes).ok()); + std::vector level_bytes = {0x11, 0x22, 0x33}; + std::vector payload = {0x0A, 0x0B}; + std::vector cached; + cached.insert(cached.end(), header_bytes.begin(), header_bytes.end()); + cached.insert(cached.end(), level_bytes.begin(), level_bytes.end()); + cached.insert(cached.end(), payload.begin(), payload.end()); + + size_t total = cached.size(); + auto* page = new DataPage(total, true, segment_v2::DATA_PAGE); + memcpy(page->data(), cached.data(), total); + page->reset_size(total); + PageCacheHandle handle; + size_t header_start = 512; + int64_t mtime = 0; + StoragePageCache::CacheKey key(fmt::format("{}::{}", path, mtime), + static_cast(header_start + total), header_start); + StoragePageCache::instance()->insert(key, page, &handle, segment_v2::DATA_PAGE); + + // Create multiple readers that will hit cache + const int N = 4; + for (int i = 0; i < N; ++i) { + std::vector reader_backing(5000, 0); + memcpy(reader_backing.data() + header_start, cached.data(), total); + FakeBufferedReader reader(path, reader_backing); + tparquet::ColumnChunk cc; + cc.meta_data.__set_data_page_offset(512); + cc.meta_data.__set_total_compressed_size(total); + cc.meta_data.__set_num_values(1); + cc.meta_data.__set_codec(tparquet::CompressionCodec::UNCOMPRESSED); + FieldSchema fs; + fs.repetition_level = rl; + fs.definition_level = dl; + ColumnChunkReader ccr(&reader, &cc, &fs, nullptr, 0, nullptr, ctx); + ASSERT_TRUE(ccr.init().ok()); + // ASSERT_TRUE(ccr.next_page().ok()); + ASSERT_TRUE(ccr.load_page_data().ok()); + Slice s = ccr.get_page_data(); + ASSERT_EQ(s.size, payload.size()); + EXPECT_EQ(0, memcmp(s.data, payload.data(), payload.size())); + const Slice& rep = ccr.v2_rep_levels(); + const Slice& def = ccr.v2_def_levels(); + ASSERT_EQ(rep.size, rl); + ASSERT_EQ(def.size, dl); + } + // restore config + config::parquet_page_cache_decompress_threshold = old_thresh; + config::enable_parquet_cache_compressed_pages = old_enable_compressed; +} diff --git 
a/be/test/vec/exec/format/parquet/parquet_thrift_test.cpp b/be/test/vec/exec/format/parquet/parquet_thrift_test.cpp index 4f4202e1143ccb..f6da2a3f65b2b4 100644 --- a/be/test/vec/exec/format/parquet/parquet_thrift_test.cpp +++ b/be/test/vec/exec/format/parquet/parquet_thrift_test.cpp @@ -177,7 +177,7 @@ static int fill_nullable_column(ColumnPtr& doris_column, level_t* definitions, s static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::ColumnChunk* column_chunk, FieldSchema* field_schema, ColumnPtr& doris_column, - DataTypePtr& data_type, level_t* definitions) { + DataTypePtr& data_type, level_t* definitions, size_t total_rows) { tparquet::ColumnMetaData chunk_meta = column_chunk->meta_data; size_t start_offset = has_dict_page(chunk_meta) ? chunk_meta.dictionary_page_offset : chunk_meta.data_page_offset; @@ -197,12 +197,12 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column io::BufferedFileStreamReader stream_reader(file_reader, start_offset, chunk_size, 1024); - ColumnChunkReader chunk_reader(&stream_reader, column_chunk, field_schema, nullptr, &ctz, - nullptr); + ColumnChunkReader chunk_reader(&stream_reader, column_chunk, field_schema, + nullptr, total_rows, nullptr); // initialize chunk reader static_cast(chunk_reader.init()); // seek to next page header - static_cast(chunk_reader.next_page()); + static_cast(chunk_reader.parse_page_header()); // load page data into underlying container static_cast(chunk_reader.load_page_data()); int rows = chunk_reader.remaining_num_values(); @@ -210,7 +210,7 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column if (field_schema->definition_level == 0) { // required field std::fill(definitions, definitions + rows, 1); } else { - chunk_reader.get_def_levels(definitions, rows); + chunk_reader._def_level_decoder.get_levels(definitions, rows); } MutableColumnPtr data_column; if (src_column->is_nullable()) { @@ -240,10 +240,11 @@ static Status 
get_column_values(io::FileReaderSPtr file_reader, tparquet::Column if (definitions[i] != level_type) { if (level_type == 0) { // null values - chunk_reader.insert_null_values(data_column, num_values); + data_column->insert_many_defaults(num_values); } else { std::vector null_map = {(u_short)num_values}; - RETURN_IF_ERROR(run_length_map.init(null_map, rows, nullptr, &filter_map, 0)); + RETURN_IF_ERROR( + run_length_map.init(null_map, num_values, nullptr, &filter_map, 0)); RETURN_IF_ERROR(chunk_reader.decode_values(data_column, resolved_type, run_length_map, false)); } @@ -255,10 +256,10 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column } if (level_type == 0) { // null values - chunk_reader.insert_null_values(data_column, num_values); + data_column->insert_many_defaults(num_values); } else { std::vector null_map = {(u_short)num_values}; - RETURN_IF_ERROR(run_length_map.init(null_map, rows, nullptr, &filter_map, 0)); + RETURN_IF_ERROR(run_length_map.init(null_map, num_values, nullptr, &filter_map, 0)); RETURN_IF_ERROR( chunk_reader.decode_values(data_column, resolved_type, run_length_map, false)); } @@ -414,7 +415,7 @@ static void read_parquet_data_and_check(const std::string& parquet_file, static_cast( get_column_values(reader, &t_metadata.row_groups[0].columns[c], const_cast(schema_descriptor.get_column(c)), - data_column, data_type, defs.data())); + data_column, data_type, defs.data(), rows)); } // `date_v2_col` date, // 14 - 13, DATEV2 { @@ -424,7 +425,7 @@ static void read_parquet_data_and_check(const std::string& parquet_file, static_cast( get_column_values(reader, &t_metadata.row_groups[0].columns[13], const_cast(schema_descriptor.get_column(13)), - data_column, data_type, defs.data())); + data_column, data_type, defs.data(), rows)); } // `timestamp_v2_col` timestamp, // 15 - 9, DATETIMEV2 { @@ -434,7 +435,7 @@ static void read_parquet_data_and_check(const std::string& parquet_file, static_cast( get_column_values(reader, 
&t_metadata.row_groups[0].columns[9], const_cast(schema_descriptor.get_column(9)), - data_column, data_type, defs.data())); + data_column, data_type, defs.data(), rows)); } io::FileReaderSPtr result; diff --git a/be/test/vec/exec/orc/orc_file_reader_test.cpp b/be/test/vec/exec/orc/orc_file_reader_test.cpp index 9e1003c397f07f..4c71129cdbbc3d 100644 --- a/be/test/vec/exec/orc/orc_file_reader_test.cpp +++ b/be/test/vec/exec/orc/orc_file_reader_test.cpp @@ -41,6 +41,8 @@ class MockFileReader : public io::FileReader { bool closed() const override { return _closed; } + int64_t mtime() const override { return 0; } + void set_data(const std::string& data) { _data = data; } protected: diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run80.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run80.hql index 7f374dacfc0128..21f381cfa70eef 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run80.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run80.hql @@ -21,3 +21,24 @@ CREATE TABLE `parquet_topn_lazy_mat_table`( msck repair table orc_topn_lazy_mat_table; msck repair table parquet_topn_lazy_mat_table; + + + +CREATE TABLE `parquet_topn_lazy_complex_table`( + id INT, + col1 STRING, + col2 STRUCT>, + col3 MAP> +) STORED AS PARQUET LOCATION + '/user/doris/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table/'; + +CREATE TABLE `parquet_topn_lazy_complex_table_multi_pages`( + id INT, + col1 STRING, + col2 STRUCT>, + col3 MAP> +) STORED AS PARQUET LOCATION + '/user/doris/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table_multi_pages/'; + +msck repair table parquet_topn_lazy_complex_table; +msck repair table parquet_topn_lazy_complex_table_multi_pages; \ No newline at end of file diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table/data_part_1.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table/data_part_1.parquet new file mode 100644 index 00000000000000..7bac48aa2c123f Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table/data_part_1.parquet differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table/data_part_2.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table/data_part_2.parquet new file mode 100644 index 00000000000000..fd8d8852078a7a Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table/data_part_2.parquet differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table/data_part_3.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table/data_part_3.parquet new file mode 100644 index 00000000000000..881fc9e7304e5b Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table/data_part_3.parquet differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table/data_part_4.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table/data_part_4.parquet new file mode 100644 index 00000000000000..ddde09317ba38b Binary files /dev/null and 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table/data_part_4.parquet differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table_multi_pages/data_part_1.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table_multi_pages/data_part_1.parquet new file mode 100644 index 00000000000000..1d9a3707dc2d5e Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table_multi_pages/data_part_1.parquet differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table_multi_pages/data_part_2.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table_multi_pages/data_part_2.parquet new file mode 100644 index 00000000000000..307da5f60b0d22 Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table_multi_pages/data_part_2.parquet differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table_multi_pages/data_part_3.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table_multi_pages/data_part_3.parquet new file mode 100644 index 00000000000000..1ea7d9e84619e2 Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table_multi_pages/data_part_3.parquet differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table_multi_pages/data_part_4.parquet 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table_multi_pages/data_part_4.parquet new file mode 100644 index 00000000000000..4edcc8841ba438 Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_topn_lazy_complex_table_multi_pages/data_part_4.parquet differ diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 361fd904ea81fc..b33deddb695af4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -496,6 +496,9 @@ public class SessionVariable implements Serializable, Writable { public static final String SHOW_USER_DEFAULT_ROLE = "show_user_default_role"; public static final String ENABLE_PAGE_CACHE = "enable_page_cache"; + public static final String ENABLE_PARQUET_FILE_PAGE_CACHE = "enable_parquet_file_page_cache"; + // public static final String PARQUET_PAGE_CACHE_DECOMPRESS_THRESHOLD = "parquet_page_cache_decompress_threshold"; + // public static final String ENABLE_PARQUET_CACHE_COMPRESSED_PAGES = "enable_parquet_cache_compressed_pages"; public static final String MINIDUMP_PATH = "minidump_path"; @@ -2154,6 +2157,27 @@ public boolean isEnableHboNonStrictMatchingMode() { needForward = true) public boolean enablePageCache = true; + @VariableMgr.VarAttr( + name = ENABLE_PARQUET_FILE_PAGE_CACHE, + description = {"控制是否启用 Parquet file page cache。默认为 true。", + "Controls whether to use Parquet file page cache. 
The default is true."}, + needForward = true) + public boolean enableParquetFilePageCache = true; + + // @VariableMgr.VarAttr( + // name = PARQUET_PAGE_CACHE_DECOMPRESS_THRESHOLD, + // description = {"决定是否缓存解压后 page 的阈值,默认 1.5。", + // "Threshold ratio to decide caching decompressed parquet page, default 1.5."}, + // needForward = true) + // public double parquetPageCacheDecompressThreshold = 1.5; + + // @VariableMgr.VarAttr( + // name = ENABLE_PARQUET_CACHE_COMPRESSED_PAGES, + // description = {"控制是否缓存压缩的 Parquet 页面,默认为 false", + // "Controls whether to cache compressed parquet pages. Default false."}, + // needForward = true) + // public boolean enableParquetCacheCompressedPages = false; + @VariableMgr.VarAttr(name = ENABLE_FOLD_NONDETERMINISTIC_FN) public boolean enableFoldNondeterministicFn = false; @@ -4876,6 +4900,8 @@ public TQueryOptions toThrift() { tResult.setEnablePageCache(enablePageCache); + tResult.setEnableParquetFilePageCache(enableParquetFilePageCache); + tResult.setFileCacheBasePath(fileCacheBasePath); tResult.setEnableInvertedIndexQuery(enableInvertedIndexQuery); @@ -4889,6 +4915,7 @@ public TQueryOptions toThrift() { tResult.setEnableParquetLazyMat(enableParquetLazyMat); tResult.setEnableOrcLazyMat(enableOrcLazyMat); tResult.setEnableParquetFilterByMinMax(enableParquetFilterByMinMax); + tResult.setEnableParquetFilePageCache(enableParquetFilePageCache); tResult.setEnableOrcFilterByMinMax(enableOrcFilterByMinMax); tResult.setCheckOrcInitSargsSuccess(checkOrcInitSargsSuccess); diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index 88f063988c9ed9..67c64195b340c9 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -418,6 +418,10 @@ struct TQueryOptions { 180: optional i32 max_file_scanners_concurrency = 0; 181: optional i32 min_file_scanners_concurrency = 0; + // Parquet page cache session options + // Whether to enable parquet file page cache on BE for 
this query + 184: optional bool enable_parquet_file_page_cache = true; + // For cloud, to control if the content would be written into file cache // In write path, to control if the content would be written into file cache. // In read path, read from file cache or remote storage when execute query. diff --git a/regression-test/data/external_table_p0/hive/test_hive_topn_lazy_mat.out b/regression-test/data/external_table_p0/hive/test_hive_topn_lazy_mat.out index 248bf4b90f794a..0d4e28fd9f097c 100644 --- a/regression-test/data/external_table_p0/hive/test_hive_topn_lazy_mat.out +++ b/regression-test/data/external_table_p0/hive/test_hive_topn_lazy_mat.out @@ -1240,6 +1240,720 @@ 1 user1 1.0 false 0.5 1 8 user8 8.0 true 4.0 1 1 user1 1.0 false 0.5 1 9 user9 9.0 false 4.5 1 +-- !complex_1 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} + +-- !complex_2 -- +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} + +-- !complex_3 -- +text_24 {24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +text_23 {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +text_22 {22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} + +-- !complex_4 -- +{9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +{8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +{7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} + +-- !complex_5 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_6 -- +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, 
"b":[12, 13]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} + +-- !complex_7 -- +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} + +-- !complex_8 -- +3 text_3 +7 text_7 +12 text_12 + +-- !complex_9 -- +8 {8:["c", null], 9:["d1", "d2"], 10:null} text_8 + +-- !complex_10 -- +{"a":1, "b":[1, 2]} 1 +{"a":10, "b":null} 10 +{"a":11, "b":[11, 12]} 11 + +-- !complex_11 -- +2 text_2 +10 text_10 +20 text_20 + +-- !complex_12 -- +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} + +-- !complex_13 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} + +-- !complex_1 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} + +-- !complex_2 -- +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +6 text_6 {"a":6, "b":null} {6:["z1", 
null], 7:["t1", "t2"], 8:["c1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} + +-- !complex_3 -- +text_24 {24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +text_23 {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +text_22 {22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +text_21 {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +text_20 {20:["y1", null], 21:null, 22:["qq", "rr"]} {"a":null, "b":[2, 3]} +text_19 {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +text_18 {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +text_17 {17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} + +-- !complex_4 -- +{9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +{8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +{7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} +{6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} {"a":6, "b":null} +{5:["m", "n"], 6:null, 7:["x1", "x2"]} {"a":null, "b":[5, 6]} +{4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} {"a":4, "b":[4, 5]} +{3:["p", "q"], 4:null, 5:["z1", "z2"]} {"a":3, "b":null} +{24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} + +-- !complex_5 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_6 -- +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +{14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +{15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, 
"b":null} +{16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +{17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +{18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} + +-- !complex_7 -- +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} + +-- !complex_8 -- +3 text_3 +7 text_7 +12 text_12 + +-- !complex_9 -- +8 {8:["c", null], 9:["d1", "d2"], 10:null} text_8 + +-- !complex_10 -- +{"a":1, "b":[1, 2]} 1 +{"a":10, "b":null} 10 +{"a":11, "b":[11, 12]} 11 +{"a":null, "b":[12, 13]} 12 +{"a":13, "b":null} 13 +{"a":14, "b":[14, 15]} 14 +{"a":null, "b":null} 15 +{"a":16, "b":[16, 17]} 16 + +-- !complex_11 -- +2 text_2 +10 text_10 +20 text_20 + +-- !complex_12 -- +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} + +-- !complex_13 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", 
null], 2:["x", "y"], 3:["k1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} + +-- !complex_1 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} +17 text_17 {"a":null, "b":[17, 18]} {17:["t", null], 18:["u1", "u2"], 19:null} +18 text_18 {"a":18, "b":null} {18:["v", null], 19:["w1", "w2"], 
20:["aa", null]} +19 text_19 {"a":19, "b":[1, null]} {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} +20 text_20 {"a":null, "b":[2, 3]} {20:["y1", null], 21:null, 22:["qq", "rr"]} +21 text_21 {"a":21, "b":null} {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} +22 text_22 {"a":22, "b":[4, 5]} {22:["a", null], 23:["b", "c"], 24:["dd", null]} +23 text_23 {"a":null, "b":[6, 7]} {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} + +-- !complex_2 -- +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} +23 text_23 {"a":null, "b":[6, 7]} {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} +22 text_22 {"a":22, "b":[4, 5]} {22:["a", null], 23:["b", "c"], 24:["dd", null]} +21 text_21 {"a":21, "b":null} {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} +20 text_20 {"a":null, "b":[2, 3]} {20:["y1", null], 21:null, 22:["qq", "rr"]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +19 text_19 {"a":19, "b":[1, null]} {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} +18 text_18 {"a":18, "b":null} {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} +17 text_17 {"a":null, "b":[17, 18]} {17:["t", null], 18:["u1", "u2"], 19:null} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", 
null], 15:["g1", "g2"], 16:["h1", null]} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_3 -- +text_24 {24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +text_23 {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +text_22 {22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +text_21 {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +text_20 {20:["y1", null], 21:null, 22:["qq", "rr"]} {"a":null, "b":[2, 3]} +text_19 {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +text_18 {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +text_17 {17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +text_16 {16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +text_15 {15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +text_14 {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +text_13 {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +text_12 {12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +text_11 {11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +text_10 {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} {"a":10, "b":null} +text_9 {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +text_8 {8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +text_7 {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} +text_6 {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} {"a":6, "b":null} +text_5 {5:["m", "n"], 6:null, 7:["x1", 
"x2"]} {"a":null, "b":[5, 6]} +text_4 {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} {"a":4, "b":[4, 5]} +text_3 {3:["p", "q"], 4:null, 5:["z1", "z2"]} {"a":3, "b":null} +text_2 {2:["s1", null], 3:["v1", "v2"], 4:null} {"a":null, "b":[2, 3]} +text_1 {1:["a", null], 2:["x", "y"], 3:["k1", null]} {"a":1, "b":[1, 2]} + +-- !complex_4 -- +{9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +{8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +{7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} +{6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} {"a":6, "b":null} +{5:["m", "n"], 6:null, 7:["x1", "x2"]} {"a":null, "b":[5, 6]} +{4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} {"a":4, "b":[4, 5]} +{3:["p", "q"], 4:null, 5:["z1", "z2"]} {"a":3, "b":null} +{24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +{23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +{22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +{21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +{20:["y1", null], 21:null, 22:["qq", "rr"]} {"a":null, "b":[2, 3]} +{2:["s1", null], 3:["v1", "v2"], 4:null} {"a":null, "b":[2, 3]} +{19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +{18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +{17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +{16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +{15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +{14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{10:["q1", null], 11:["w1", "w2"], 12:["x", null]} {"a":10, "b":null} +{1:["a", null], 2:["x", "y"], 3:["k1", null]} {"a":1, "b":[1, 
2]} + +-- !complex_5 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_6 -- +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +{14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +{15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +{16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +{17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +{18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +{19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +{20:["y1", null], 21:null, 22:["qq", "rr"]} {"a":null, "b":[2, 3]} +{21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +{22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +{23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +{24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} + +-- !complex_7 -- +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 
17:null} + +-- !complex_8 -- +3 text_3 +7 text_7 +12 text_12 + +-- !complex_9 -- +8 {8:["c", null], 9:["d1", "d2"], 10:null} text_8 + +-- !complex_10 -- +{"a":1, "b":[1, 2]} 1 +{"a":10, "b":null} 10 +{"a":11, "b":[11, 12]} 11 +{"a":null, "b":[12, 13]} 12 +{"a":13, "b":null} 13 +{"a":14, "b":[14, 15]} 14 +{"a":null, "b":null} 15 +{"a":16, "b":[16, 17]} 16 +{"a":null, "b":[17, 18]} 17 +{"a":18, "b":null} 18 +{"a":19, "b":[1, null]} 19 + +-- !complex_11 -- +2 text_2 +10 text_10 +20 text_20 + +-- !complex_12 -- +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} +18 text_18 {"a":18, "b":null} {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} +20 text_20 {"a":null, "b":[2, 3]} {20:["y1", null], 21:null, 22:["qq", "rr"]} +22 text_22 {"a":22, "b":[4, 5]} {22:["a", null], 23:["b", "c"], 24:["dd", null]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} + +-- !complex_13 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +13 text_13 {"a":13, "b":null} {13:["d", null], 
14:["e1", "e2"], 15:["f1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} +17 text_17 {"a":null, "b":[17, 18]} {17:["t", null], 18:["u1", "u2"], 19:null} +19 text_19 {"a":19, "b":[1, null]} {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} +21 text_21 {"a":21, "b":null} {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} +23 text_23 {"a":null, "b":[6, 7]} {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} + +-- !complex_1 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} + +-- !complex_2 -- +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} + +-- !complex_3 -- +text_24 {24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +text_23 {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +text_22 {22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} + +-- !complex_4 -- +{9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +{8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +{7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} + +-- !complex_5 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_6 -- +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} + +-- !complex_7 -- +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} + 
+-- !complex_8 -- +3 text_3 +7 text_7 +12 text_12 + +-- !complex_9 -- +8 {8:["c", null], 9:["d1", "d2"], 10:null} text_8 + +-- !complex_10 -- +{"a":1, "b":[1, 2]} 1 +{"a":10, "b":null} 10 +{"a":11, "b":[11, 12]} 11 + +-- !complex_11 -- +2 text_2 +10 text_10 +20 text_20 + +-- !complex_12 -- +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} + +-- !complex_13 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} + +-- !complex_1 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} + +-- !complex_2 -- +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} + 
+-- !complex_3 -- +text_24 {24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +text_23 {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +text_22 {22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +text_21 {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +text_20 {20:["y1", null], 21:null, 22:["qq", "rr"]} {"a":null, "b":[2, 3]} +text_19 {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +text_18 {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +text_17 {17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} + +-- !complex_4 -- +{9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +{8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +{7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} +{6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} {"a":6, "b":null} +{5:["m", "n"], 6:null, 7:["x1", "x2"]} {"a":null, "b":[5, 6]} +{4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} {"a":4, "b":[4, 5]} +{3:["p", "q"], 4:null, 5:["z1", "z2"]} {"a":3, "b":null} +{24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} + +-- !complex_5 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_6 -- +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +{14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +{15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +{16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +{17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +{18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} + +-- !complex_7 -- +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, 
"b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} + +-- !complex_8 -- +3 text_3 +7 text_7 +12 text_12 + +-- !complex_9 -- +8 {8:["c", null], 9:["d1", "d2"], 10:null} text_8 + +-- !complex_10 -- +{"a":1, "b":[1, 2]} 1 +{"a":10, "b":null} 10 +{"a":11, "b":[11, 12]} 11 +{"a":null, "b":[12, 13]} 12 +{"a":13, "b":null} 13 +{"a":14, "b":[14, 15]} 14 +{"a":null, "b":null} 15 +{"a":16, "b":[16, 17]} 16 + +-- !complex_11 -- +2 text_2 +10 text_10 +20 text_20 + +-- !complex_12 -- +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} + +-- !complex_13 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} 
+11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} + +-- !complex_1 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} +17 text_17 {"a":null, "b":[17, 18]} {17:["t", null], 18:["u1", "u2"], 19:null} +18 text_18 {"a":18, "b":null} {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} +19 text_19 {"a":19, "b":[1, null]} {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} +20 text_20 {"a":null, "b":[2, 3]} {20:["y1", null], 21:null, 22:["qq", "rr"]} +21 text_21 {"a":21, "b":null} {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} +22 text_22 {"a":22, "b":[4, 5]} {22:["a", null], 23:["b", "c"], 
24:["dd", null]} +23 text_23 {"a":null, "b":[6, 7]} {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} + +-- !complex_2 -- +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} +23 text_23 {"a":null, "b":[6, 7]} {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} +22 text_22 {"a":22, "b":[4, 5]} {22:["a", null], 23:["b", "c"], 24:["dd", null]} +21 text_21 {"a":21, "b":null} {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} +20 text_20 {"a":null, "b":[2, 3]} {20:["y1", null], 21:null, 22:["qq", "rr"]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +19 text_19 {"a":19, "b":[1, null]} {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} +18 text_18 {"a":18, "b":null} {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} +17 text_17 {"a":null, "b":[17, 18]} {17:["t", null], 18:["u1", "u2"], 19:null} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 
11:["w1", "w2"], 12:["x", null]} +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_3 -- +text_24 {24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +text_23 {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +text_22 {22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +text_21 {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +text_20 {20:["y1", null], 21:null, 22:["qq", "rr"]} {"a":null, "b":[2, 3]} +text_19 {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +text_18 {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +text_17 {17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +text_16 {16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +text_15 {15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +text_14 {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +text_13 {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +text_12 {12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +text_11 {11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +text_10 {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} {"a":10, "b":null} +text_9 {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +text_8 {8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +text_7 {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} +text_6 {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} {"a":6, "b":null} +text_5 {5:["m", "n"], 6:null, 7:["x1", "x2"]} {"a":null, "b":[5, 6]} +text_4 {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} {"a":4, "b":[4, 5]} +text_3 {3:["p", "q"], 4:null, 5:["z1", "z2"]} {"a":3, "b":null} +text_2 {2:["s1", null], 3:["v1", "v2"], 4:null} {"a":null, "b":[2, 3]} +text_1 {1:["a", null], 2:["x", "y"], 3:["k1", null]} {"a":1, "b":[1, 2]} + +-- !complex_4 -- 
+{9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +{8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +{7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} +{6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} {"a":6, "b":null} +{5:["m", "n"], 6:null, 7:["x1", "x2"]} {"a":null, "b":[5, 6]} +{4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} {"a":4, "b":[4, 5]} +{3:["p", "q"], 4:null, 5:["z1", "z2"]} {"a":3, "b":null} +{24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +{23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +{22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +{21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +{20:["y1", null], 21:null, 22:["qq", "rr"]} {"a":null, "b":[2, 3]} +{2:["s1", null], 3:["v1", "v2"], 4:null} {"a":null, "b":[2, 3]} +{19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +{18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +{17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +{16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +{15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +{14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{10:["q1", null], 11:["w1", "w2"], 12:["x", null]} {"a":10, "b":null} +{1:["a", null], 2:["x", "y"], 3:["k1", null]} {"a":1, "b":[1, 2]} + +-- !complex_5 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_6 -- +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} 
+{14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +{15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +{16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +{17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +{18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +{19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +{20:["y1", null], 21:null, 22:["qq", "rr"]} {"a":null, "b":[2, 3]} +{21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +{22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +{23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +{24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} + +-- !complex_7 -- +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} + +-- !complex_8 -- +3 text_3 +7 text_7 +12 text_12 + +-- !complex_9 -- +8 {8:["c", null], 9:["d1", "d2"], 10:null} text_8 + +-- !complex_10 -- +{"a":1, "b":[1, 2]} 1 +{"a":10, "b":null} 10 +{"a":11, "b":[11, 12]} 11 +{"a":null, "b":[12, 13]} 12 +{"a":13, "b":null} 13 +{"a":14, "b":[14, 15]} 14 +{"a":null, "b":null} 15 
+{"a":16, "b":[16, 17]} 16 +{"a":null, "b":[17, 18]} 17 +{"a":18, "b":null} 18 +{"a":19, "b":[1, null]} 19 + +-- !complex_11 -- +2 text_2 +10 text_10 +20 text_20 + +-- !complex_12 -- +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} +18 text_18 {"a":18, "b":null} {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} +20 text_20 {"a":null, "b":[2, 3]} {20:["y1", null], 21:null, 22:["qq", "rr"]} +22 text_22 {"a":22, "b":[4, 5]} {22:["a", null], 23:["b", "c"], 24:["dd", null]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} + +-- !complex_13 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} +17 text_17 {"a":null, "b":[17, 18]} {17:["t", null], 18:["u1", "u2"], 19:null} +19 text_19 {"a":19, "b":[1, null]} {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} +21 text_21 {"a":21, "b":null} {21:["z1", "z2"], 
22:["m1", null], 23:["k1", "k2"]} +23 text_23 {"a":null, "b":[6, 7]} {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} + -- !1 -- 1 user1 1.0 false 0.5 1 2 user2 2.0 true 1.0 1 @@ -2481,3 +3195,716 @@ 1 user1 1.0 false 0.5 1 8 user8 8.0 true 4.0 1 1 user1 1.0 false 0.5 1 9 user9 9.0 false 4.5 1 +-- !complex_1 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} + +-- !complex_2 -- +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} + +-- !complex_3 -- +text_24 {24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +text_23 {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +text_22 {22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} + +-- !complex_4 -- +{9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +{8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +{7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} + +-- !complex_5 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_6 -- +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} + +-- !complex_7 -- +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} + +-- !complex_8 -- +3 text_3 +7 text_7 +12 text_12 + +-- !complex_9 -- +8 {8:["c", null], 9:["d1", "d2"], 10:null} text_8 + +-- !complex_10 -- +{"a":1, 
"b":[1, 2]} 1 +{"a":10, "b":null} 10 +{"a":11, "b":[11, 12]} 11 + +-- !complex_11 -- +2 text_2 +10 text_10 +20 text_20 + +-- !complex_12 -- +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} + +-- !complex_13 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} + +-- !complex_1 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} + +-- !complex_2 -- +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} + +-- !complex_3 -- +text_24 {24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +text_23 {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} 
{"a":null, "b":[6, 7]} +text_22 {22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +text_21 {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +text_20 {20:["y1", null], 21:null, 22:["qq", "rr"]} {"a":null, "b":[2, 3]} +text_19 {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +text_18 {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +text_17 {17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} + +-- !complex_4 -- +{9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +{8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +{7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} +{6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} {"a":6, "b":null} +{5:["m", "n"], 6:null, 7:["x1", "x2"]} {"a":null, "b":[5, 6]} +{4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} {"a":4, "b":[4, 5]} +{3:["p", "q"], 4:null, 5:["z1", "z2"]} {"a":3, "b":null} +{24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} + +-- !complex_5 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_6 -- +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +{14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +{15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +{16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +{17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +{18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} + +-- !complex_7 -- +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 
{"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} + +-- !complex_8 -- +3 text_3 +7 text_7 +12 text_12 + +-- !complex_9 -- +8 {8:["c", null], 9:["d1", "d2"], 10:null} text_8 + +-- !complex_10 -- +{"a":1, "b":[1, 2]} 1 +{"a":10, "b":null} 10 +{"a":11, "b":[11, 12]} 11 +{"a":null, "b":[12, 13]} 12 +{"a":13, "b":null} 13 +{"a":14, "b":[14, 15]} 14 +{"a":null, "b":null} 15 +{"a":16, "b":[16, 17]} 16 + +-- !complex_11 -- +2 text_2 +10 text_10 +20 text_20 + +-- !complex_12 -- +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} + +-- !complex_13 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 
15:["f1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} + +-- !complex_1 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} +17 text_17 {"a":null, "b":[17, 18]} {17:["t", null], 18:["u1", "u2"], 19:null} +18 text_18 {"a":18, "b":null} {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} +19 text_19 {"a":19, "b":[1, null]} {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} +20 text_20 {"a":null, "b":[2, 3]} {20:["y1", null], 21:null, 22:["qq", "rr"]} +21 text_21 {"a":21, "b":null} {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} +22 text_22 {"a":22, "b":[4, 5]} {22:["a", null], 23:["b", "c"], 24:["dd", null]} +23 text_23 {"a":null, "b":[6, 7]} {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 
25:["u2", "u3"], 26:null} + +-- !complex_2 -- +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} +23 text_23 {"a":null, "b":[6, 7]} {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} +22 text_22 {"a":22, "b":[4, 5]} {22:["a", null], 23:["b", "c"], 24:["dd", null]} +21 text_21 {"a":21, "b":null} {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} +20 text_20 {"a":null, "b":[2, 3]} {20:["y1", null], 21:null, 22:["qq", "rr"]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +19 text_19 {"a":19, "b":[1, null]} {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} +18 text_18 {"a":18, "b":null} {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} +17 text_17 {"a":null, "b":[17, 18]} {17:["t", null], 18:["u1", "u2"], 19:null} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_3 -- +text_24 {24:["u1", 
null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +text_23 {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +text_22 {22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +text_21 {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +text_20 {20:["y1", null], 21:null, 22:["qq", "rr"]} {"a":null, "b":[2, 3]} +text_19 {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +text_18 {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +text_17 {17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +text_16 {16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +text_15 {15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +text_14 {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +text_13 {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +text_12 {12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +text_11 {11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +text_10 {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} {"a":10, "b":null} +text_9 {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +text_8 {8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +text_7 {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} +text_6 {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} {"a":6, "b":null} +text_5 {5:["m", "n"], 6:null, 7:["x1", "x2"]} {"a":null, "b":[5, 6]} +text_4 {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} {"a":4, "b":[4, 5]} +text_3 {3:["p", "q"], 4:null, 5:["z1", "z2"]} {"a":3, "b":null} +text_2 {2:["s1", null], 3:["v1", "v2"], 4:null} {"a":null, "b":[2, 3]} +text_1 {1:["a", null], 2:["x", "y"], 3:["k1", null]} {"a":1, "b":[1, 2]} + +-- !complex_4 -- +{9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +{8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +{7:["h", 
null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} +{6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} {"a":6, "b":null} +{5:["m", "n"], 6:null, 7:["x1", "x2"]} {"a":null, "b":[5, 6]} +{4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} {"a":4, "b":[4, 5]} +{3:["p", "q"], 4:null, 5:["z1", "z2"]} {"a":3, "b":null} +{24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +{23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +{22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +{21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +{20:["y1", null], 21:null, 22:["qq", "rr"]} {"a":null, "b":[2, 3]} +{2:["s1", null], 3:["v1", "v2"], 4:null} {"a":null, "b":[2, 3]} +{19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +{18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +{17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +{16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +{15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +{14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{10:["q1", null], 11:["w1", "w2"], 12:["x", null]} {"a":10, "b":null} +{1:["a", null], 2:["x", "y"], 3:["k1", null]} {"a":1, "b":[1, 2]} + +-- !complex_5 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_6 -- +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +{14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +{15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +{16:["i", 
null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +{17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +{18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +{19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +{20:["y1", null], 21:null, 22:["qq", "rr"]} {"a":null, "b":[2, 3]} +{21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +{22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +{23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +{24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} + +-- !complex_7 -- +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} + +-- !complex_8 -- +3 text_3 +7 text_7 +12 text_12 + +-- !complex_9 -- +8 {8:["c", null], 9:["d1", "d2"], 10:null} text_8 + +-- !complex_10 -- +{"a":1, "b":[1, 2]} 1 +{"a":10, "b":null} 10 +{"a":11, "b":[11, 12]} 11 +{"a":null, "b":[12, 13]} 12 +{"a":13, "b":null} 13 +{"a":14, "b":[14, 15]} 14 +{"a":null, "b":null} 15 +{"a":16, "b":[16, 17]} 16 +{"a":null, "b":[17, 18]} 17 +{"a":18, "b":null} 18 +{"a":19, "b":[1, null]} 19 + +-- !complex_11 -- +2 text_2 +10 text_10 +20 
text_20 + +-- !complex_12 -- +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} +18 text_18 {"a":18, "b":null} {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} +20 text_20 {"a":null, "b":[2, 3]} {20:["y1", null], 21:null, 22:["qq", "rr"]} +22 text_22 {"a":22, "b":[4, 5]} {22:["a", null], 23:["b", "c"], 24:["dd", null]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} + +-- !complex_13 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} +17 text_17 {"a":null, "b":[17, 18]} {17:["t", null], 18:["u1", "u2"], 19:null} +19 text_19 {"a":19, "b":[1, null]} {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} +21 text_21 {"a":21, "b":null} {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} +23 text_23 {"a":null, "b":[6, 7]} {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} + +-- !complex_1 -- +1 text_1 {"a":1, 
"b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} + +-- !complex_2 -- +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} + +-- !complex_3 -- +text_24 {24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +text_23 {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +text_22 {22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} + +-- !complex_4 -- +{9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +{8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +{7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} + +-- !complex_5 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_6 -- +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} + +-- !complex_7 -- +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} + +-- !complex_8 -- +3 text_3 +7 text_7 +12 text_12 + +-- !complex_9 -- +8 {8:["c", null], 9:["d1", "d2"], 10:null} text_8 + +-- !complex_10 -- +{"a":1, "b":[1, 2]} 1 +{"a":10, "b":null} 10 +{"a":11, "b":[11, 12]} 11 + +-- !complex_11 -- +2 text_2 +10 text_10 +20 text_20 + +-- !complex_12 -- +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +6 text_6 {"a":6, "b":null} {6:["z1", 
null], 7:["t1", "t2"], 8:["c1", null]} + +-- !complex_13 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} + +-- !complex_1 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} + +-- !complex_2 -- +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} + +-- !complex_3 -- +text_24 {24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +text_23 {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +text_22 {22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +text_21 {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +text_20 {20:["y1", null], 21:null, 22:["qq", "rr"]} {"a":null, "b":[2, 3]} +text_19 {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, 
"b":[1, null]} +text_18 {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +text_17 {17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} + +-- !complex_4 -- +{9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +{8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +{7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} +{6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} {"a":6, "b":null} +{5:["m", "n"], 6:null, 7:["x1", "x2"]} {"a":null, "b":[5, 6]} +{4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} {"a":4, "b":[4, 5]} +{3:["p", "q"], 4:null, 5:["z1", "z2"]} {"a":3, "b":null} +{24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} + +-- !complex_5 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_6 -- +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +{14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +{15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +{16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +{17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +{18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} + +-- !complex_7 -- +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +12 text_12 {"a":null, 
"b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} + +-- !complex_8 -- +3 text_3 +7 text_7 +12 text_12 + +-- !complex_9 -- +8 {8:["c", null], 9:["d1", "d2"], 10:null} text_8 + +-- !complex_10 -- +{"a":1, "b":[1, 2]} 1 +{"a":10, "b":null} 10 +{"a":11, "b":[11, 12]} 11 +{"a":null, "b":[12, 13]} 12 +{"a":13, "b":null} 13 +{"a":14, "b":[14, 15]} 14 +{"a":null, "b":null} 15 +{"a":16, "b":[16, 17]} 16 + +-- !complex_11 -- +2 text_2 +10 text_10 +20 text_20 + +-- !complex_12 -- +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} + +-- !complex_13 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} + +-- !complex_1 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +4 
text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} +17 text_17 {"a":null, "b":[17, 18]} {17:["t", null], 18:["u1", "u2"], 19:null} +18 text_18 {"a":18, "b":null} {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} +19 text_19 {"a":19, "b":[1, null]} {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} +20 text_20 {"a":null, "b":[2, 3]} {20:["y1", null], 21:null, 22:["qq", "rr"]} +21 text_21 {"a":21, "b":null} {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} +22 text_22 {"a":22, "b":[4, 5]} {22:["a", null], 23:["b", "c"], 24:["dd", null]} +23 text_23 {"a":null, "b":[6, 7]} {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} + +-- !complex_2 -- +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", 
"t2"], 8:["c1", null]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} +23 text_23 {"a":null, "b":[6, 7]} {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} +22 text_22 {"a":22, "b":[4, 5]} {22:["a", null], 23:["b", "c"], 24:["dd", null]} +21 text_21 {"a":21, "b":null} {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} +20 text_20 {"a":null, "b":[2, 3]} {20:["y1", null], 21:null, 22:["qq", "rr"]} +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +19 text_19 {"a":19, "b":[1, null]} {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} +18 text_18 {"a":18, "b":null} {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} +17 text_17 {"a":null, "b":[17, 18]} {17:["t", null], 18:["u1", "u2"], 19:null} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_3 -- +text_24 {24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +text_23 {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +text_22 {22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +text_21 {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +text_20 {20:["y1", null], 21:null, 
22:["qq", "rr"]} {"a":null, "b":[2, 3]} +text_19 {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +text_18 {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +text_17 {17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +text_16 {16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +text_15 {15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +text_14 {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +text_13 {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +text_12 {12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +text_11 {11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +text_10 {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} {"a":10, "b":null} +text_9 {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +text_8 {8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +text_7 {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} +text_6 {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} {"a":6, "b":null} +text_5 {5:["m", "n"], 6:null, 7:["x1", "x2"]} {"a":null, "b":[5, 6]} +text_4 {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} {"a":4, "b":[4, 5]} +text_3 {3:["p", "q"], 4:null, 5:["z1", "z2"]} {"a":3, "b":null} +text_2 {2:["s1", null], 3:["v1", "v2"], 4:null} {"a":null, "b":[2, 3]} +text_1 {1:["a", null], 2:["x", "y"], 3:["k1", null]} {"a":1, "b":[1, 2]} + +-- !complex_4 -- +{9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} {"a":9, "b":[9, 10]} +{8:["c", null], 9:["d1", "d2"], 10:null} {"a":null, "b":[8, 9]} +{7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} {"a":7, "b":[7, 8]} +{6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} {"a":6, "b":null} +{5:["m", "n"], 6:null, 7:["x1", "x2"]} {"a":null, "b":[5, 6]} +{4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} {"a":4, "b":[4, 5]} +{3:["p", "q"], 4:null, 5:["z1", "z2"]} {"a":3, "b":null} +{24:["u1", 
null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} +{23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +{22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +{21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +{20:["y1", null], 21:null, 22:["qq", "rr"]} {"a":null, "b":[2, 3]} +{2:["s1", null], 3:["v1", "v2"], 4:null} {"a":null, "b":[2, 3]} +{19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +{18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +{17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +{16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +{15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +{14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{10:["q1", null], 11:["w1", "w2"], 12:["x", null]} {"a":10, "b":null} +{1:["a", null], 2:["x", "y"], 3:["k1", null]} {"a":1, "b":[1, 2]} + +-- !complex_5 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} + +-- !complex_6 -- +{11:["x", null], 12:["u1", "u2"], 13:["l", null]} {"a":11, "b":[11, 12]} +{12:["p", null], 13:["k1", "k2"], 14:null} {"a":null, "b":[12, 13]} +{13:["d", null], 14:["e1", "e2"], 15:["f1", null]} {"a":13, "b":null} +{14:["f", null], 15:["g1", "g2"], 16:["h1", null]} {"a":14, "b":[14, 15]} +{15:["z", null], 16:["y1", "y2"], 17:null} {"a":null, "b":null} +{16:["i", null], 17:["j1", "j2"], 18:["k", null]} {"a":16, "b":[16, 17]} +{17:["t", null], 18:["u1", "u2"], 19:null} {"a":null, "b":[17, 18]} +{18:["v", null], 19:["w1", "w2"], 20:["aa", null]} {"a":18, "b":null} +{19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} {"a":19, "b":[1, null]} +{20:["y1", null], 21:null, 22:["qq", "rr"]} 
{"a":null, "b":[2, 3]} +{21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} {"a":21, "b":null} +{22:["a", null], 23:["b", "c"], 24:["dd", null]} {"a":22, "b":[4, 5]} +{23:["k", null], 24:["q1", "q2"], 25:["tt", null]} {"a":null, "b":[6, 7]} +{24:["u1", null], 25:["u2", "u3"], 26:null} {"a":24, "b":null} + +-- !complex_7 -- +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 10:null} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} + +-- !complex_8 -- +3 text_3 +7 text_7 +12 text_12 + +-- !complex_9 -- +8 {8:["c", null], 9:["d1", "d2"], 10:null} text_8 + +-- !complex_10 -- +{"a":1, "b":[1, 2]} 1 +{"a":10, "b":null} 10 +{"a":11, "b":[11, 12]} 11 +{"a":null, "b":[12, 13]} 12 +{"a":13, "b":null} 13 +{"a":14, "b":[14, 15]} 14 +{"a":null, "b":null} 15 +{"a":16, "b":[16, 17]} 16 +{"a":null, "b":[17, 18]} 17 +{"a":18, "b":null} 18 +{"a":19, "b":[1, null]} 19 + +-- !complex_11 -- +2 text_2 +10 text_10 +20 text_20 + +-- !complex_12 -- +2 text_2 {"a":null, "b":[2, 3]} {2:["s1", null], 3:["v1", "v2"], 4:null} +4 text_4 {"a":4, "b":[4, 5]} {4:["k1", null], 5:["a1", "a2"], 6:["t1", null]} +6 text_6 {"a":6, "b":null} {6:["z1", null], 7:["t1", "t2"], 8:["c1", null]} +8 text_8 {"a":null, "b":[8, 9]} {8:["c", null], 9:["d1", "d2"], 
10:null} +10 text_10 {"a":10, "b":null} {10:["q1", null], 11:["w1", "w2"], 12:["x", null]} +12 text_12 {"a":null, "b":[12, 13]} {12:["p", null], 13:["k1", "k2"], 14:null} +14 text_14 {"a":14, "b":[14, 15]} {14:["f", null], 15:["g1", "g2"], 16:["h1", null]} +16 text_16 {"a":16, "b":[16, 17]} {16:["i", null], 17:["j1", "j2"], 18:["k", null]} +18 text_18 {"a":18, "b":null} {18:["v", null], 19:["w1", "w2"], 20:["aa", null]} +20 text_20 {"a":null, "b":[2, 3]} {20:["y1", null], 21:null, 22:["qq", "rr"]} +22 text_22 {"a":22, "b":[4, 5]} {22:["a", null], 23:["b", "c"], 24:["dd", null]} +24 text_24 {"a":24, "b":null} {24:["u1", null], 25:["u2", "u3"], 26:null} + +-- !complex_13 -- +1 text_1 {"a":1, "b":[1, 2]} {1:["a", null], 2:["x", "y"], 3:["k1", null]} +3 text_3 {"a":3, "b":null} {3:["p", "q"], 4:null, 5:["z1", "z2"]} +5 text_5 {"a":null, "b":[5, 6]} {5:["m", "n"], 6:null, 7:["x1", "x2"]} +7 text_7 {"a":7, "b":[7, 8]} {7:["h", null], 8:["b1", "b2"], 9:["aa", "bb"]} +9 text_9 {"a":9, "b":[9, 10]} {9:["aa", null], 10:["bb", "cc"], 11:["mm", null]} +11 text_11 {"a":11, "b":[11, 12]} {11:["x", null], 12:["u1", "u2"], 13:["l", null]} +13 text_13 {"a":13, "b":null} {13:["d", null], 14:["e1", "e2"], 15:["f1", null]} +15 text_15 {"a":null, "b":null} {15:["z", null], 16:["y1", "y2"], 17:null} +17 text_17 {"a":null, "b":[17, 18]} {17:["t", null], 18:["u1", "u2"], 19:null} +19 text_19 {"a":19, "b":[1, null]} {19:["x1", null], 20:["x2", "x3"], 21:["pp", null]} +21 text_21 {"a":21, "b":null} {21:["z1", "z2"], 22:["m1", null], 23:["k1", "k2"]} +23 text_23 {"a":null, "b":[6, 7]} {23:["k", null], 24:["q1", "q2"], 25:["tt", null]} diff --git a/regression-test/suites/external_table_p0/hive/test_hive_topn_lazy_mat.groovy b/regression-test/suites/external_table_p0/hive/test_hive_topn_lazy_mat.groovy index 6ec5438a518892..7d6a3181ceff39 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_topn_lazy_mat.groovy +++ 
b/regression-test/suites/external_table_p0/hive/test_hive_topn_lazy_mat.groovy @@ -156,9 +156,28 @@ suite("test_hive_topn_lazy_mat", "p0,external,hive,external_docker,external_dock order by a.id,b.id limit 100; """ } - } - + for (String table : ["parquet_topn_lazy_complex_table", "parquet_topn_lazy_complex_table_multi_pages"]) { + for (int limit : limitValues) { + qt_complex_1 """ select * from ${table} order by id asc limit ${limit}; """ + qt_complex_2 """ select * from ${table} order by col1 desc limit ${limit}; """ + qt_complex_3 """ select col1,col3,col2 from ${table} order by id desc limit ${limit}; """ + qt_complex_4 """ select col3,col2 from ${table} order by col1 desc limit ${limit}; """ + + qt_complex_5 """ select * from ${table} where id = 1 order by id limit ${limit}; """ + qt_complex_6 """ select col3,col2 from ${table} where id > 10 order by id limit ${limit}; """ + qt_complex_7 """ select * from ${table} where id between 5 and 15 order by id limit ${limit}; """ + qt_complex_8 """ select id, col1 from ${table} where id in (3, 7, 12) order by id limit ${limit}; """ + + qt_complex_9 """ select id,col3,col1 from ${table} where col1 = 'text_8' order by id limit ${limit}; """ + qt_complex_10 """ select col2,id from ${table} where col1 like 'text_1%' order by id limit ${limit}; """ + qt_complex_11 """ select id, col1 from ${table} where col1 in ('text_2', 'text_10', 'text_20') order by id limit ${limit}; """ + + qt_complex_12 """ select * from ${table} where id%2 = 0 order by id limit ${limit}; """ + qt_complex_13 """ select * from ${table} where id%2 = 1 order by id limit ${limit}; """ + } + } + } for (String hivePrefix : ["hive2"]) { String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")