diff --git a/be/src/exec/decompressor.cpp b/be/src/exec/decompressor.cpp index af69a896a2b8e4..964654132a3a29 100644 --- a/be/src/exec/decompressor.cpp +++ b/be/src/exec/decompressor.cpp @@ -42,6 +42,12 @@ Status Decompressor::create_decompressor(CompressType type, Decompressor** decom case CompressType::LZ4FRAME: *decompressor = new Lz4FrameDecompressor(); break; + case CompressType::LZ4BLOCK: + *decompressor = new Lz4BlockDecompressor(); + break; + case CompressType::SNAPPYBLOCK: + *decompressor = new SnappyBlockDecompressor(); + break; #ifdef DORIS_WITH_LZO case CompressType::LZOP: *decompressor = new LzopDecompressor(); @@ -59,6 +65,10 @@ Status Decompressor::create_decompressor(CompressType type, Decompressor** decom return st; } +uint32_t Decompressor::_read_int32(uint8_t* buf) { + return (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]; +} + std::string Decompressor::debug_info() { return "Decompressor"; } @@ -239,7 +249,7 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t size_t* decompressed_len, bool* stream_end, size_t* more_input_bytes, size_t* more_output_bytes) { uint8_t* src = input; - size_t src_size = input_len; + size_t remaining_input_size = input_len; size_t ret = 1; *input_bytes_read = 0; @@ -257,7 +267,7 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t } LZ4F_frameInfo_t info; - ret = LZ4F_getFrameInfo(_dctx, &info, (void*)src, &src_size); + ret = LZ4F_getFrameInfo(_dctx, &info, (void*)src, &remaining_input_size); if (LZ4F_isError(ret)) { return Status::InternalError("LZ4F_getFrameInfo error: {}", std::string(LZ4F_getErrorName(ret))); @@ -270,17 +280,17 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t std::string(LZ4F_getErrorName(ret))); } - *input_bytes_read = src_size; + *input_bytes_read = remaining_input_size; - src += src_size; - src_size = input_len - src_size; + src += remaining_input_size; + remaining_input_size = input_len - remaining_input_size; LOG(INFO) << "lz4 block size: " << _expect_dec_buf_size; } // decompress size_t output_len = output_max_len; - ret = LZ4F_decompress(_dctx, (void*)output, &output_len, (void*)src, &src_size, + ret = LZ4F_decompress(_dctx, (void*)output, &output_len, (void*)src, &remaining_input_size, /* LZ4F_decompressOptions_t */ nullptr); if (LZ4F_isError(ret)) { return Status::InternalError("Decompression error: {}", @@ -288,7 +298,7 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t } // update - *input_bytes_read += src_size; + *input_bytes_read += remaining_input_size; *decompressed_len = output_len; if (ret == 0) { *stream_end = true; @@ -324,4 +334,165 @@ size_t Lz4FrameDecompressor::get_block_size(const LZ4F_frameInfo_t* info) { } } +/// Lz4BlockDecompressor +Status Lz4BlockDecompressor::init() { + return Status::OK(); +} + +Status Lz4BlockDecompressor::decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, + uint8_t* output, size_t output_max_len, + size_t* decompressed_len, bool* stream_end, + size_t* more_input_bytes, size_t* more_output_bytes) { + uint8_t* src = input; + size_t remaining_input_size = input_len; + int64_t uncompressed_total_len = 0; + *input_bytes_read = 0; + + // The hadoop lz4 codec is as: + // <4 byte big endian uncompressed size> + // <4 byte big endian compressed size> + // + // .... + // <4 byte big endian uncompressed size> + // <4 byte big endian compressed size> + // + // + // See: + // https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/Lz4Codec.cc + while (remaining_input_size > 0) { + // Read uncompressed size + uint32_t uncompressed_block_len = Decompressor::_read_int32(src); + int64_t remaining_output_size = output_max_len - uncompressed_total_len; + if (remaining_output_size < uncompressed_block_len) { + // Need more output buffer + *more_output_bytes = uncompressed_block_len - remaining_output_size; + break; + } + + // Read compressed size + size_t tmp_src_size = remaining_input_size - sizeof(uint32_t); + size_t compressed_len = Decompressor::_read_int32(src + sizeof(uint32_t)); + if (compressed_len == 0 || compressed_len > tmp_src_size) { + // Need more input data + *more_input_bytes = compressed_len - tmp_src_size; + break; + } + + src += 2 * sizeof(uint32_t); + remaining_input_size -= 2 * sizeof(uint32_t); + + // Decompress + int uncompressed_len = LZ4_decompress_safe(reinterpret_cast(src), + reinterpret_cast(output), compressed_len, + remaining_output_size); + if (uncompressed_len < 0 || uncompressed_len != uncompressed_block_len) { + return Status::InternalError( + "lz4 block decompress failed. uncompressed_len: {}, expected: {}", + uncompressed_len, uncompressed_block_len); + } + + output += uncompressed_len; + src += compressed_len; + remaining_input_size -= compressed_len; + uncompressed_total_len += uncompressed_len; + } + + *input_bytes_read += (input_len - remaining_input_size); + *decompressed_len = uncompressed_total_len; + // If no more input and output need, means this is the end of a compressed block + *stream_end = (*more_input_bytes == 0 && *more_output_bytes == 0); + + return Status::OK(); +} + +std::string Lz4BlockDecompressor::debug_info() { + std::stringstream ss; + ss << "Lz4BlockDecompressor."; + return ss.str(); +} + +/// SnappyBlockDecompressor +Status SnappyBlockDecompressor::init() { + return Status::OK(); +} + +Status SnappyBlockDecompressor::decompress(uint8_t* input, size_t input_len, + size_t* input_bytes_read, uint8_t* output, + size_t output_max_len, size_t* decompressed_len, + bool* stream_end, size_t* more_input_bytes, + size_t* more_output_bytes) { + uint8_t* src = input; + size_t remaining_input_size = input_len; + int64_t uncompressed_total_len = 0; + *input_bytes_read = 0; + + // The hadoop snappy codec is as: + // <4 byte big endian uncompressed size> + // <4 byte big endian compressed size> + // + // .... + // <4 byte big endian uncompressed size> + // <4 byte big endian compressed size> + // + // + // See: + // https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/SnappyCodec.cc + while (remaining_input_size > 0) { + // Read uncompressed size + uint32_t uncompressed_block_len = Decompressor::_read_int32(src); + int64_t remaining_output_size = output_max_len - uncompressed_total_len; + if (remaining_output_size < uncompressed_block_len) { + // Need more output buffer + *more_output_bytes = uncompressed_block_len - remaining_output_size; + break; + } + + // Read compressed size + size_t tmp_src_size = remaining_input_size - sizeof(uint32_t); + size_t compressed_len = _read_int32(src + sizeof(uint32_t)); + if (compressed_len == 0 || compressed_len > tmp_src_size) { + // Need more input data + *more_input_bytes = compressed_len - tmp_src_size; + break; + } + + src += 2 * sizeof(uint32_t); + remaining_input_size -= 2 * sizeof(uint32_t); + + // ATTN: the uncompressed len from GetUncompressedLength() is same as + // uncompressed_block_len, so I think it is unnecessary to get it again. + // Get uncompressed len from snappy + // size_t uncompressed_len; + // if (!snappy::GetUncompressedLength(reinterpret_cast(src), + // compressed_len, &uncompressed_len)) { + // return Status::InternalError("snappy block decompress failed to get uncompressed len"); + // } + + // Decompress + if (!snappy::RawUncompress(reinterpret_cast(src), compressed_len, + reinterpret_cast(output))) { + return Status::InternalError("snappy block decompress failed. uncompressed_len: {}", + uncompressed_block_len); + } + + output += uncompressed_block_len; + src += compressed_len; + remaining_input_size -= compressed_len; + uncompressed_total_len += uncompressed_block_len; + } + + *input_bytes_read += (input_len - remaining_input_size); + *decompressed_len = uncompressed_total_len; + // If no more input and output need, means this is the end of a compressed block + *stream_end = (*more_input_bytes == 0 && *more_output_bytes == 0); + + return Status::OK(); +} + +std::string SnappyBlockDecompressor::debug_info() { + std::stringstream ss; + ss << "SnappyBlockDecompressor."; + return ss.str(); +} + } // namespace doris diff --git a/be/src/exec/decompressor.h b/be/src/exec/decompressor.h index af37335f1f7145..2b07e71139fb83 100644 --- a/be/src/exec/decompressor.h +++ b/be/src/exec/decompressor.h @@ -18,7 +18,10 @@ #pragma once #include +#include #include +#include +#include #include #include #include @@ -34,7 +37,7 @@ namespace doris { -enum CompressType { UNCOMPRESSED, GZIP, DEFLATE, BZIP2, LZ4FRAME, LZOP }; +enum CompressType { UNCOMPRESSED, GZIP, DEFLATE, BZIP2, LZ4FRAME, LZOP, LZ4BLOCK, SNAPPYBLOCK }; class Decompressor { public: @@ -68,6 +71,8 @@ class Decompressor { protected: virtual Status init() = 0; + static uint32_t _read_int32(uint8_t* buf); + Decompressor(CompressType ctype) : _ctype(ctype) {} CompressType _ctype; @@ -140,6 +145,38 @@ class Lz4FrameDecompressor : public Decompressor { const static unsigned DORIS_LZ4F_VERSION; }; +class Lz4BlockDecompressor : public Decompressor { +public: + ~Lz4BlockDecompressor() override {} + + Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, uint8_t* output, + size_t output_max_len, size_t* decompressed_len, bool* stream_end, + size_t* more_input_bytes, size_t* more_output_bytes) override; + + std::string debug_info() override; + +private: + friend class Decompressor; + Lz4BlockDecompressor() : Decompressor(CompressType::LZ4FRAME) {} + Status init() override; +}; + +class SnappyBlockDecompressor : public Decompressor { +public: + ~SnappyBlockDecompressor() override {} + + Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, uint8_t* output, + size_t output_max_len, size_t* decompressed_len, bool* stream_end, + size_t* more_input_bytes, size_t* more_output_bytes) override; + + std::string debug_info() override; + +private: + friend class Decompressor; + SnappyBlockDecompressor() : Decompressor(CompressType::SNAPPYBLOCK) {} + Status init() override; +}; + #ifdef DORIS_WITH_LZO class LzopDecompressor : public Decompressor { public: diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 3838668577ac38..b458af6336317e 100644 --- a/be/src/service/internal_service.cpp +++ b/be/src/service/internal_service.cpp @@ -632,6 +632,8 @@ void PInternalServiceImpl::fetch_table_schema(google::protobuf::RpcController* c case TFileFormatType::FORMAT_CSV_GZ: case TFileFormatType::FORMAT_CSV_BZ2: case TFileFormatType::FORMAT_CSV_LZ4FRAME: + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: + case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: case TFileFormatType::FORMAT_CSV_LZOP: case TFileFormatType::FORMAT_CSV_DEFLATE: { // file_slots is no use diff --git a/be/src/util/load_util.cpp b/be/src/util/load_util.cpp index 8736561db4f3e1..1277132378b85a 100644 --- a/be/src/util/load_util.cpp +++ b/be/src/util/load_util.cpp @@ -46,9 +46,15 @@ void LoadUtil::parse_format(const std::string& format_str, const std::string& co } else if (iequal(compress_type_str, "LZ4")) { *format_type = TFileFormatType::FORMAT_CSV_LZ4FRAME; *compress_type = TFileCompressType::LZ4FRAME; + } else if (iequal(compress_type_str, "LZ4_BLOCK")) { + *format_type = TFileFormatType::FORMAT_CSV_LZ4BLOCK; + *compress_type = TFileCompressType::LZ4BLOCK; } else if (iequal(compress_type_str, "LZOP")) { *format_type = TFileFormatType::FORMAT_CSV_LZOP; *compress_type = TFileCompressType::LZO; + } else if (iequal(compress_type_str, "SNAPPY_BLOCK")) { + *format_type = TFileFormatType::FORMAT_CSV_SNAPPYBLOCK; + *compress_type = TFileCompressType::SNAPPYBLOCK; } else if (iequal(compress_type_str, "DEFLATE")) { *format_type = TFileFormatType::FORMAT_CSV_DEFLATE; *compress_type = TFileCompressType::DEFLATE; @@ -72,6 +78,7 @@ bool LoadUtil::is_format_support_streaming(TFileFormatType::type format) { case TFileFormatType::FORMAT_CSV_DEFLATE: case TFileFormatType::FORMAT_CSV_GZ: case TFileFormatType::FORMAT_CSV_LZ4FRAME: + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: case TFileFormatType::FORMAT_CSV_LZO: case TFileFormatType::FORMAT_CSV_LZOP: case TFileFormatType::FORMAT_JSON: @@ -81,4 +88,4 @@ bool LoadUtil::is_format_support_streaming(TFileFormatType::type format) { } return false; } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index ba5d69cb73708a..64749e935cf072 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -340,8 +340,12 @@ Status CsvReader::init_reader(bool is_load) { [[fallthrough]]; case TFileFormatType::FORMAT_CSV_LZ4FRAME: [[fallthrough]]; + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: + [[fallthrough]]; case TFileFormatType::FORMAT_CSV_LZOP: [[fallthrough]]; + case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: + [[fallthrough]]; case TFileFormatType::FORMAT_CSV_DEFLATE: _line_reader = NewPlainTextLineReader::create_unique(_profile, _file_reader, _decompressor.get(), @@ -397,21 +401,51 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { const int batch_size = std::max(_state->batch_size(), (int)_MIN_BATCH_SIZE); size_t rows = 0; - auto columns = block->mutate_columns(); - while (rows < batch_size && !_line_reader_eof) { - const uint8_t* ptr = nullptr; - size_t size = 0; - RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); - if (_skip_lines > 0) { - _skip_lines--; - continue; + + bool success = false; + if (_push_down_agg_type == TPushAggOp::type::COUNT) { + while (rows < batch_size && !_line_reader_eof) { + const uint8_t* ptr = nullptr; + size_t size = 0; + RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); + if (_skip_lines > 0) { + _skip_lines--; + continue; + } + if (size == 0) { + // Read empty row, just continue + continue; + } + + RETURN_IF_ERROR(_validate_line(Slice(ptr, size), &success)); + ++rows; } - if (size == 0) { - // Read empty row, just continue - continue; + + for (auto& col : block->mutate_columns()) { + col->resize(rows); } - RETURN_IF_ERROR(_fill_dest_columns(Slice(ptr, size), block, columns, &rows)); + } else { + auto columns = block->mutate_columns(); + while (rows < batch_size && !_line_reader_eof) { + const uint8_t* ptr = nullptr; + size_t size = 0; + RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); + if (_skip_lines > 0) { + _skip_lines--; + continue; + } + if (size == 0) { + // Read empty row, just continue + continue; + } + + RETURN_IF_ERROR(_validate_line(Slice(ptr, size), &success)); + if (!success) { + continue; + } + RETURN_IF_ERROR(_fill_dest_columns(Slice(ptr, size), block, columns, &rows)); + } } *eof = (rows == 0); @@ -473,9 +507,15 @@ Status CsvReader::_create_decompressor() { case TFileCompressType::LZ4FRAME: compress_type = CompressType::LZ4FRAME; break; + case TFileCompressType::LZ4BLOCK: + compress_type = CompressType::LZ4BLOCK; + break; case TFileCompressType::DEFLATE: compress_type = CompressType::DEFLATE; break; + case TFileCompressType::SNAPPYBLOCK: + compress_type = CompressType::SNAPPYBLOCK; + break; default: return Status::InternalError("unknown compress type: {}", _file_compress_type); } @@ -495,12 +535,18 @@ Status CsvReader::_create_decompressor() { case TFileFormatType::FORMAT_CSV_LZ4FRAME: compress_type = CompressType::LZ4FRAME; break; + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: + compress_type = CompressType::LZ4BLOCK; + break; case TFileFormatType::FORMAT_CSV_LZOP: compress_type = CompressType::LZOP; break; case TFileFormatType::FORMAT_CSV_DEFLATE: compress_type = CompressType::DEFLATE; break; + case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: + compress_type = CompressType::SNAPPYBLOCK; + break; default: return Status::InternalError("unknown format type: {}", _file_format_type); } @@ -554,7 +600,7 @@ Status CsvReader::_fill_dest_columns(const Slice& line, Block* block, return Status::OK(); } -Status CsvReader::_line_split_to_values(const Slice& line, bool* success) { +Status CsvReader::_validate_line(const Slice& line, bool* success) { if (!_is_proto_format && !validate_utf8(line.data, line.size)) { if (!_is_load) { return Status::InternalError("Only support csv data in utf8 codec"); @@ -572,7 +618,11 @@ Status CsvReader::_line_split_to_values(const Slice& line, bool* success) { return Status::OK(); } } + *success = true; + return Status::OK(); +} +Status CsvReader::_line_split_to_values(const Slice& line, bool* success) { _split_line(line); if (_is_load) { diff --git a/be/src/vec/exec/format/csv/csv_reader.h b/be/src/vec/exec/format/csv/csv_reader.h index 5721bbd9291a38..2659703f8dce6e 100644 --- a/be/src/vec/exec/format/csv/csv_reader.h +++ b/be/src/vec/exec/format/csv/csv_reader.h @@ -221,6 +221,12 @@ class CsvReader : public GenericReader { // TODO(ftw): parse type Status _parse_col_types(size_t col_nums, std::vector* col_types); + // check the utf8 encoding of a line. + // return error status to stop processing. + // If return Status::OK but "success" is false, which means this is load request + // and the line is skipped as unqualified row, and the process should continue. + Status _validate_line(const Slice& line, bool* success); + RuntimeState* _state; RuntimeProfile* _profile; ScannerCounter* _counter; diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp index b59bbef1f1c085..c27aba354f6e1f 100644 --- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp +++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp @@ -201,7 +201,6 @@ NewPlainTextLineReader::NewPlainTextLineReader(RuntimeProfile* profile, _output_buf_limit(0), _file_eof(false), _eof(false), - _stream_end(true), _more_input_bytes(0), _more_output_bytes(0), _current_offset(current_offset), @@ -324,6 +323,7 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool _line_reader_ctx->refresh(); int found_line_delimiter = 0; size_t offset = 0; + bool stream_end = true; while (!done()) { // find line delimiter in current decompressed data uint8_t* cur_ptr = _output_buf + _output_buf_pos; @@ -379,7 +379,7 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool COUNTER_UPDATE(_bytes_read_counter, read_len); } if (_file_eof || read_len == 0) { - if (!_stream_end) { + if (!stream_end) { return Status::InternalError( "Compressed file has been truncated, which is not allowed"); } else { @@ -392,7 +392,7 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool if (_decompressor == nullptr) { _output_buf_limit += read_len; - _stream_end = true; + stream_end = true; } else { // only update input limit. // input pos is set at MARK step @@ -418,10 +418,10 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool _input_buf_limit - _input_buf_pos, /* input_len */ &input_read_bytes, _output_buf + _output_buf_limit, /* output */ _output_buf_size - _output_buf_limit, /* output_max_len */ - &decompressed_len, &_stream_end, &_more_input_bytes, &_more_output_bytes)); + &decompressed_len, &stream_end, &_more_input_bytes, &_more_output_bytes)); // LOG(INFO) << "after decompress:" - // << " stream_end: " << _stream_end + // << " stream_end: " << stream_end // << " input_read_bytes: " << input_read_bytes // << " decompressed_len: " << decompressed_len // << " more_input_bytes: " << _more_input_bytes diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h index 7326812b92bcb3..9947259300dae4 100644 --- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h +++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h @@ -235,7 +235,6 @@ class NewPlainTextLineReader : public LineReader { bool _file_eof; bool _eof; - bool _stream_end; size_t _more_input_bytes; size_t _more_output_bytes; diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 055ec224a3e6c7..505b6807b439ca 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -261,22 +261,26 @@ Status VFileScanner::_get_block_impl(RuntimeState* state, Block* block, bool* eo // use read_rows instead of _src_block_ptr->rows(), because the first column of _src_block_ptr // may not be filled after calling `get_next_block()`, so _src_block_ptr->rows() may return wrong result. if (read_rows > 0) { - // Convert the src block columns type to string in-place. - RETURN_IF_ERROR(_cast_to_input_block(block)); - // FileReader can fill partition and missing columns itself - if (!_cur_reader->fill_all_columns()) { - // Fill rows in src block with partition columns from path. (e.g. Hive partition columns) - RETURN_IF_ERROR(_fill_columns_from_path(read_rows)); - // Fill columns not exist in file with null or default value - RETURN_IF_ERROR(_fill_missing_columns(read_rows)); + // If the push_down_agg_type is COUNT, no need to do the rest, + // because we only save a number in block. + if (_parent->get_push_down_agg_type() != TPushAggOp::type::COUNT) { + // Convert the src block columns type to string in-place. + RETURN_IF_ERROR(_cast_to_input_block(block)); + // FileReader can fill partition and missing columns itself + if (!_cur_reader->fill_all_columns()) { + // Fill rows in src block with partition columns from path. (e.g. Hive partition columns) + RETURN_IF_ERROR(_fill_columns_from_path(read_rows)); + // Fill columns not exist in file with null or default value + RETURN_IF_ERROR(_fill_missing_columns(read_rows)); + } + // Apply _pre_conjunct_ctxs to filter src block. + RETURN_IF_ERROR(_pre_filter_src_block()); + // Convert src block to output block (dest block), string to dest data type and apply filters. + RETURN_IF_ERROR(_convert_to_output_block(block)); + // Truncate char columns or varchar columns if size is smaller than file columns + // or not found in the file column schema. + RETURN_IF_ERROR(_truncate_char_or_varchar_columns(block)); } - // Apply _pre_conjunct_ctxs to filter src block. - RETURN_IF_ERROR(_pre_filter_src_block()); - // Convert src block to output block (dest block), string to dest data type and apply filters. - RETURN_IF_ERROR(_convert_to_output_block(block)); - // Truncate char columns or varchar columns if size is smaller than file columns - // or not found in the file column schema. - RETURN_IF_ERROR(_truncate_char_or_varchar_columns(block)); break; } } while (true); @@ -755,8 +759,10 @@ Status VFileScanner::_get_next_reader() { case TFileFormatType::FORMAT_CSV_GZ: case TFileFormatType::FORMAT_CSV_BZ2: case TFileFormatType::FORMAT_CSV_LZ4FRAME: + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: case TFileFormatType::FORMAT_CSV_LZOP: case TFileFormatType::FORMAT_CSV_DEFLATE: + case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: case TFileFormatType::FORMAT_PROTO: { _cur_reader = CsvReader::create_unique(_state, _profile, &_counter, *_params, range, _file_slot_descs, _io_ctx.get()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/Util.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/Util.java index 8d9c2c6b0c07aa..0dff4b6caaba08 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/Util.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/Util.java @@ -550,6 +550,8 @@ public static TFileFormatType getFileFormatType(String path) { return TFileFormatType.FORMAT_CSV_LZO; } else if (lowerCasePath.endsWith(".deflate")) { return TFileFormatType.FORMAT_CSV_DEFLATE; + } else if (lowerCasePath.endsWith(".snappy")) { + return TFileFormatType.FORMAT_CSV_SNAPPYBLOCK; } else { return TFileFormatType.FORMAT_CSV_PLAIN; } @@ -575,6 +577,8 @@ public static TFileCompressType inferFileCompressTypeByPath(String path) { return TFileCompressType.LZO; } else if (lowerCasePath.endsWith(".deflate")) { return TFileCompressType.DEFLATE; + } else if (lowerCasePath.endsWith(".snappy")) { + return TFileCompressType.SNAPPYBLOCK; } else { return TFileCompressType.PLAIN; } @@ -599,6 +603,8 @@ public static boolean isCsvFormat(TFileFormatType fileFormatType) { || fileFormatType == TFileFormatType.FORMAT_CSV_DEFLATE || fileFormatType == TFileFormatType.FORMAT_CSV_GZ || fileFormatType == TFileFormatType.FORMAT_CSV_LZ4FRAME + || fileFormatType == TFileFormatType.FORMAT_CSV_LZ4BLOCK + || fileFormatType == TFileFormatType.FORMAT_CSV_SNAPPYBLOCK || fileFormatType == TFileFormatType.FORMAT_CSV_LZO || fileFormatType == TFileFormatType.FORMAT_CSV_LZOP || fileFormatType == TFileFormatType.FORMAT_CSV_PLAIN; diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java index e7e621948e24ab..45b84dfb174d41 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java @@ -1000,6 +1000,7 @@ public static class FileCacheValue { // File Cache for self splitter. private final List files = Lists.newArrayList(); // File split cache for old splitter. This is a temp variable. + @Deprecated private final List splits = Lists.newArrayList(); private boolean isSplittable; // The values of partitions. @@ -1021,6 +1022,7 @@ public void addFile(RemoteFile file) { } } + @Deprecated public void addSplit(FileSplit split) { if (isFileVisible(split.getPath())) { splits.add(split); diff --git a/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java b/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java index 704d0fadf84fee..e1baea3652cd89 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java @@ -190,7 +190,8 @@ public static boolean isSplittable(InputFormat inputFormat, Path path, Job return true; } - // use reflection to get isSplittable method on FileInputFormat + // use reflection to get isSplitable method on FileInputFormat + // ATTN: the method name is actually "isSplitable", but the right spell is "isSplittable" Method method = null; for (Class clazz = inputFormat.getClass(); clazz != null; clazz = clazz.getSuperclass()) { try { diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileScanNode.java index 662aa939ee4b16..8e2c8ed3a4521e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileScanNode.java @@ -25,6 +25,7 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.TableIf; import org.apache.doris.common.UserException; +import org.apache.doris.common.util.Util; import org.apache.doris.planner.PlanNodeId; import org.apache.doris.planner.external.FileSplit.FileSplitCreator; import org.apache.doris.qe.ConnectContext; @@ -32,6 +33,7 @@ import org.apache.doris.statistics.StatisticalType; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TExpr; +import org.apache.doris.thrift.TFileCompressType; import org.apache.doris.thrift.TFileRangeDesc; import org.apache.doris.thrift.TFileScanNode; import org.apache.doris.thrift.TFileScanRangeParams; @@ -221,19 +223,20 @@ protected List splitFile(Path path, long blockSize, BlockLocation[] block if (blockLocations == null) { blockLocations = new BlockLocation[0]; } - long splitSize = ConnectContext.get().getSessionVariable().getFileSplitSize(); - if (splitSize <= 0) { - splitSize = blockSize; - } - // Min split size is DEFAULT_SPLIT_SIZE(128MB). - splitSize = Math.max(splitSize, DEFAULT_SPLIT_SIZE); List result = Lists.newArrayList(); - if (!splittable) { + TFileCompressType compressType = Util.inferFileCompressTypeByPath(path.toString()); + if (!splittable || compressType != TFileCompressType.PLAIN) { LOG.debug("Path {} is not splittable.", path); String[] hosts = blockLocations.length == 0 ? null : blockLocations[0].getHosts(); result.add(splitCreator.create(path, 0, length, length, modificationTime, hosts, partitionValues)); return result; } + long splitSize = ConnectContext.get().getSessionVariable().getFileSplitSize(); + if (splitSize <= 0) { + splitSize = blockSize; + } + // Min split size is DEFAULT_SPLIT_SIZE(128MB). + splitSize = Math.max(splitSize, DEFAULT_SPLIT_SIZE); long bytesRemaining; for (bytesRemaining = length; (double) bytesRemaining / (double) splitSize > 1.1D; bytesRemaining -= splitSize) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java index 7178a585ff1739..5d0033c90ea806 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java @@ -51,6 +51,7 @@ import org.apache.doris.spi.Split; import org.apache.doris.statistics.StatisticalType; import org.apache.doris.thrift.TFileAttributes; +import org.apache.doris.thrift.TFileCompressType; import org.apache.doris.thrift.TFileFormatType; import org.apache.doris.thrift.TFileTextScanRangeParams; import org.apache.doris.thrift.TFileType; @@ -386,4 +387,14 @@ public boolean pushDownAggNoGrouping(FunctionCallExpr aggExpr) { public boolean pushDownAggNoGroupingCheckCol(FunctionCallExpr aggExpr, Column col) { return !col.isAllowNull(); } + + @Override + protected TFileCompressType getFileCompressType(FileSplit fileSplit) throws UserException { + TFileCompressType compressType = super.getFileCompressType(fileSplit); + // hadoop use lz4 blocked codec + if (compressType == TFileCompressType.LZ4FRAME) { + compressType = TFileCompressType.LZ4BLOCK; + } + return compressType; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveSplit.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveSplit.java index 0f230c85f43943..0bc8442760710e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveSplit.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveSplit.java @@ -32,11 +32,6 @@ public HiveSplit(Path path, long start, long length, long fileLength, this.acidInfo = acidInfo; } - public HiveSplit(Path path, long start, long length, long fileLength, String[] hosts, AcidInfo acidInfo) { - super(path, start, length, fileLength, hosts, null); - this.acidInfo = acidInfo; - } - @Override public Object getInfo() { return acidInfo; diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index 0717fc498dfc20..cc3a3bf2c30d20 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -116,6 +116,8 @@ enum TFileFormatType { FORMAT_PROTO, FORMAT_JNI, FORMAT_AVRO, + FORMAT_CSV_LZ4BLOCK, + FORMAT_CSV_SNAPPYBLOCK, } // In previous versions, the data compression format and file format were stored together, as TFileFormatType, @@ -132,6 +134,8 @@ enum TFileCompressType { LZ4FRAME, DEFLATE, LZOP, + LZ4BLOCK, + SNAPPYBLOCK } struct THdfsConf { diff --git a/regression-test/data/external_table_p2/hive/test_compress_type.out b/regression-test/data/external_table_p2/hive/test_compress_type.out new file mode 100644 index 00000000000000..a95bf1f0dd3c02 --- /dev/null +++ b/regression-test/data/external_table_p2/hive/test_compress_type.out @@ -0,0 +1,47 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !q21 -- +600005 + +-- !q22 -- +1510010 + +-- !q23 -- +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 2023-08-21 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 bzip2 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 bzip2 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 deflate +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 deflate +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 gzip +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 gzip +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 lz4 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 plain +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 plain +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 snappy + +-- !q31 -- +600005 + +-- !q32 -- +1510010 + +-- !q33 -- +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 2023-08-21 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 bzip2 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 bzip2 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 deflate +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 deflate +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 gzip +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 gzip +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 lz4 +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 plain +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 plain +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 snappy + diff --git a/regression-test/suites/external_table_p2/hive/test_compress_type.groovy b/regression-test/suites/external_table_p2/hive/test_compress_type.groovy new file mode 100644 index 00000000000000..d02ff3fbd0c47d --- /dev/null +++ b/regression-test/suites/external_table_p2/hive/test_compress_type.groovy @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_compress_type", "p2,external,hive,external_remote,external_remote_hive") { + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort") + String catalog_name = "test_compress_type" + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' + ); + """ + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + + sql """ use multi_catalog """ + + // table test_compress_partitioned has 6 partitions with different compressed file: plain, gzip, bzip2, deflate + sql """set file_split_size=0""" + explain { + sql("select count(*) from test_compress_partitioned") + contains "inputSplitNum=16, totalFileSize=734675596, scanRanges=16" + contains "partition=8/8" + } + qt_q21 """select count(*) from test_compress_partitioned where dt="gzip" or dt="mix"""" + qt_q22 """select count(*) from test_compress_partitioned""" + order_qt_q23 """select * from test_compress_partitioned where watchid=4611870011201662970""" + + sql """set file_split_size=8388608""" + explain { + sql("select count(*) from test_compress_partitioned") + contains "inputSplitNum=82, totalFileSize=734675596, scanRanges=82" + contains "partition=8/8" + } + + qt_q31 """select count(*) from test_compress_partitioned where dt="gzip" or dt="mix"""" + qt_q32 """select count(*) from test_compress_partitioned""" + order_qt_q33 """select * from test_compress_partitioned where watchid=4611870011201662970""" + sql """set file_split_size=0""" + } +}