diff --git a/be/src/io/fs/file_meta_cache.cpp b/be/src/io/fs/file_meta_cache.cpp index 226fb663f5b152..f97b2f80cd6ee6 100644 --- a/be/src/io/fs/file_meta_cache.cpp +++ b/be/src/io/fs/file_meta_cache.cpp @@ -17,26 +17,27 @@ #include "io/fs/file_meta_cache.h" -#include "vec/exec/format/parquet/parquet_thrift_util.h" - namespace doris { -Status FileMetaCache::get_parquet_footer(io::FileReaderSPtr file_reader, io::IOContext* io_ctx, - int64_t mtime, size_t* meta_size, - ObjLRUCache::CacheHandle* handle) { - ObjLRUCache::CacheHandle cache_handle; - std::string cache_key = file_reader->path().native() + std::to_string(mtime); - auto hit_cache = _cache.lookup({cache_key}, &cache_handle); - if (hit_cache) { - *handle = std::move(cache_handle); - *meta_size = 0; +std::string FileMetaCache::get_key(const std::string& file_name, int64_t modification_time, + int64_t file_size) { + std::string meta_cache_key; + meta_cache_key.resize(file_name.size() + sizeof(int64_t)); + + memcpy(meta_cache_key.data(), file_name.data(), file_name.size()); + if (modification_time != 0) { + memcpy(meta_cache_key.data() + file_name.size(), &modification_time, sizeof(int64_t)); } else { - vectorized::FileMetaData* meta = nullptr; - RETURN_IF_ERROR(vectorized::parse_thrift_footer(file_reader, &meta, meta_size, io_ctx)); - _cache.insert({cache_key}, meta, handle); + memcpy(meta_cache_key.data() + file_name.size(), &file_size, sizeof(int64_t)); } + return meta_cache_key; +} - return Status::OK(); +std::string FileMetaCache::get_key(io::FileReaderSPtr file_reader, + const io::FileDescription& _file_description) { + return FileMetaCache::get_key( + file_reader->path().native(), _file_description.mtime, + _file_description.file_size == -1 ? file_reader->size() : _file_description.file_size); } } // namespace doris diff --git a/be/src/io/fs/file_meta_cache.h b/be/src/io/fs/file_meta_cache.h index 5d3384677eea39..0c62c963ce122d 100644 --- a/be/src/io/fs/file_meta_cache.h +++ b/be/src/io/fs/file_meta_cache.h @@ -17,6 +17,7 @@ #pragma once +#include "io/file_factory.h" #include "io/fs/file_reader_writer_fwd.h" #include "util/obj_lru_cache.h" @@ -34,14 +35,23 @@ class FileMetaCache { ObjLRUCache& cache() { return _cache; } - Status get_parquet_footer(io::FileReaderSPtr file_reader, io::IOContext* io_ctx, int64_t mtime, - size_t* meta_size, ObjLRUCache::CacheHandle* handle); + static std::string get_key(const std::string& file_name, int64_t modification_time, + int64_t file_size); - Status get_orc_footer() { - // TODO: implement - return Status::OK(); + static std::string get_key(io::FileReaderSPtr file_reader, + const io::FileDescription& _file_description); + + bool lookup(const std::string& key, ObjLRUCache::CacheHandle* handle) { + return _cache.lookup({key}, handle); + } + + template <typename T> + void insert(const std::string& key, T* value, ObjLRUCache::CacheHandle* handle) { + _cache.insert({key}, value, handle); } + bool enabled() const { return _cache.enabled(); } + private: ObjLRUCache _cache; }; diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp index 051eac147faaee..e97639b7d95045 100644 --- a/be/src/olap/push_handler.cpp +++ b/be/src/olap/push_handler.cpp @@ -40,6 +40,7 @@ #include "common/config.h" #include "common/logging.h" #include "common/status.h" +#include "io/fs/file_meta_cache.h" #include "io/hdfs_builder.h" #include "olap/cumulative_compaction_time_series_policy.h" #include "olap/delete_handler.h" diff --git a/be/src/util/obj_lru_cache.cpp b/be/src/util/obj_lru_cache.cpp index
600ffdb647ce44..fb03f1907d3ac0 100644 --- a/be/src/util/obj_lru_cache.cpp +++ b/be/src/util/obj_lru_cache.cpp @@ -22,9 +22,8 @@ namespace doris { ObjLRUCache::ObjLRUCache(int64_t capacity, uint32_t num_shards) : LRUCachePolicy(CachePolicy::CacheType::COMMON_OBJ_LRU_CACHE, capacity, LRUCacheType::NUMBER, config::common_obj_lru_cache_stale_sweep_time_sec, - num_shards) { - _enabled = (capacity > 0); -} + num_shards), + _enabled(capacity > 0) {} bool ObjLRUCache::lookup(const ObjKey& key, CacheHandle* handle) { if (!_enabled) { diff --git a/be/src/util/obj_lru_cache.h b/be/src/util/obj_lru_cache.h index 680a32e79bc991..a62378a14bf53f 100644 --- a/be/src/util/obj_lru_cache.h +++ b/be/src/util/obj_lru_cache.h @@ -72,9 +72,10 @@ class ObjLRUCache : public LRUCachePolicy { bool valid() { return _cache != nullptr && _handle != nullptr; } LRUCachePolicy* cache() const { return _cache; } + template <typename T> - void* data() const { - return (void*)((ObjValue*)_cache->value(_handle))->value; + const T* data() const { + return ((ObjValue<T>*)_cache->value(_handle))->value; } private: @@ -98,7 +99,8 @@ CachePriority::NORMAL); *cache_handle = CacheHandle {this, handle}; } else { - cache_handle = nullptr; + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "ObjLRUCache is disabled, cannot insert."); } } @@ -106,8 +108,10 @@ bool exceed_prune_limit() override; + bool enabled() const { return _enabled; } + private: - bool _enabled; + const bool _enabled; }; } // namespace doris diff --git a/be/src/vec/exec/format/generic_reader.h b/be/src/vec/exec/format/generic_reader.h index c3efc321e2fba6..d12107a4b889d5 100644 --- a/be/src/vec/exec/format/generic_reader.h +++ b/be/src/vec/exec/format/generic_reader.h @@ -20,6 +20,7 @@ #include #include "common/status.h" +#include "io/fs/file_meta_cache.h" #include "runtime/descriptors.h" #include "runtime/types.h" #include "util/profile_collector.h" @@ -85,6 +86,10 @@ class GenericReader : public ProfileCollector { /// Whether the underlying FileReader has filled the partition&missing columns bool _fill_all_columns = false; TPushAggOp::type _push_down_agg_type {}; + + // Cache for common metadata such as the file footer. + // May be null if not used. + FileMetaCache* _meta_cache = nullptr; }; #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 741399ff7dc6fc..2e60dcfb6c589d 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -165,7 +165,7 @@ void StripeStreamInputStream::read(void* buf, uint64_t length, uint64_t offset) OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, size_t batch_size, const std::string& ctz, io::IOContext* io_ctx, - bool enable_lazy_mat) + FileMetaCache* meta_cache, bool enable_lazy_mat) : _profile(profile), _state(state), _scan_params(params), _scan_range(range), @@ -183,13 +183,15 @@ OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* state, VecDateTimeValue t; t.from_unixtime(0, ctz); _offset_days = t.day() == 31 ? -1 : 0; // If 1969-12-31, then returns -1.
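// ---------------------------------------------------------------------------
// Editor's note: a minimal usage sketch (not part of this diff) of the new
// FileMetaCache API introduced above. `cache` and `footer` are hypothetical
// names; per the insert() path shown, the cache takes ownership of the raw
// pointer it receives.
//
//     FileMetaCache& cache = ...; // e.g. the process-wide instance
//     std::string key = FileMetaCache::get_key("/path/to/file",
//                                              /*modification_time=*/123,
//                                              /*file_size=*/456);
//     ObjLRUCache::CacheHandle handle;
//     if (!cache.lookup(key, &handle)) {
//         auto* footer = new std::string("serialized footer");
//         cache.insert(key, footer, &handle); // ownership moves to the cache
//     }
//     const std::string* cached = handle.data<std::string>();
// ---------------------------------------------------------------------------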
+ _meta_cache = meta_cache; _init_profile(); _init_system_properties(); _init_file_description(); } OrcReader::OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& range, - const std::string& ctz, io::IOContext* io_ctx, bool enable_lazy_mat) + const std::string& ctz, io::IOContext* io_ctx, FileMetaCache* meta_cache, + bool enable_lazy_mat) : _profile(nullptr), _scan_params(params), _scan_range(range), @@ -199,6 +201,7 @@ OrcReader::OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& r _enable_lazy_mat(enable_lazy_mat), _enable_filter_by_min_max(true), _dict_cols_has_converted(false) { + _meta_cache = meta_cache; _init_system_properties(); _init_file_description(); } @@ -221,7 +224,8 @@ void OrcReader::_collect_profile_before_close() { COUNTER_UPDATE(_orc_profile.predicate_filter_time, _statistics.predicate_filter_time); COUNTER_UPDATE(_orc_profile.dict_filter_rewrite_time, _statistics.dict_filter_rewrite_time); COUNTER_UPDATE(_orc_profile.lazy_read_filtered_rows, _statistics.lazy_read_filtered_rows); - + COUNTER_UPDATE(_orc_profile.file_footer_read_calls, _statistics.file_footer_read_calls); + COUNTER_UPDATE(_orc_profile.file_footer_hit_cache, _statistics.file_footer_hit_cache); if (_file_input_stream != nullptr) { _file_input_stream->collect_profile_before_close(); } @@ -260,10 +264,15 @@ void OrcReader::_init_profile() { ADD_COUNTER_WITH_LEVEL(_profile, "SelectedRowGroupCount", TUnit::UNIT, 1); _orc_profile.evaluated_row_group_count = ADD_COUNTER_WITH_LEVEL(_profile, "EvaluatedRowGroupCount", TUnit::UNIT, 1); + _orc_profile.file_footer_read_calls = + ADD_COUNTER_WITH_LEVEL(_profile, "FileFooterReadCalls", TUnit::UNIT, 1); + _orc_profile.file_footer_hit_cache = + ADD_COUNTER_WITH_LEVEL(_profile, "FileFooterHitCache", TUnit::UNIT, 1); } } Status OrcReader::_create_file_reader() { + SCOPED_RAW_TIMER(&_statistics.create_reader_time); if (_reader != nullptr) { return Status::OK(); } @@ -283,27 +292,56 @@ Status OrcReader::_create_file_reader() { if (_file_input_stream->getLength() == 0) { return Status::EndOfFile("empty orc file: " + _scan_range.path); } + // create orc reader - try { - orc::ReaderOptions options; - options.setMemoryPool(*ExecEnv::GetInstance()->orc_memory_pool()); - options.setReaderMetrics(&_reader_metrics); - _reader = orc::createReader( - std::unique_ptr<ORCFileInputStream>(_file_input_stream.release()), options); - } catch (std::exception& e) { - // invoker maybe just skip Status.NotFound and continue - // so we need distinguish between it and other kinds of errors - std::string _err_msg = e.what(); - if (_io_ctx && _io_ctx->should_stop && _err_msg == "stop") { - return Status::EndOfFile("stop"); + orc::ReaderOptions options; + options.setMemoryPool(*ExecEnv::GetInstance()->orc_memory_pool()); + options.setReaderMetrics(&_reader_metrics); + + auto create_orc_reader = [&]() { + try { + _reader = orc::createReader( + std::unique_ptr<ORCFileInputStream>(_file_input_stream.release()), options); + } catch (std::exception& e) { + // The invoker may just skip Status::NotFound and continue, + // so we need to distinguish it from other kinds of errors. + std::string _err_msg = e.what(); + if (_io_ctx && _io_ctx->should_stop && _err_msg == "stop") { + return Status::EndOfFile("stop"); + } + // One message is for local fs, the other is for OSS. + if (_err_msg.find("No such file or directory") != std::string::npos || + _err_msg.find("NoSuchKey") != std::string::npos) { + return Status::NotFound(_err_msg); + } + return Status::InternalError("Init OrcReader failed. 
reason = {}", _err_msg); } - // one for fs, the other is for oss. - if (_err_msg.find("No such file or directory") != std::string::npos || - _err_msg.find("NoSuchKey") != std::string::npos) { - return Status::NotFound(_err_msg); + return Status::OK(); + }; + + if (_meta_cache == nullptr) { + _statistics.file_footer_read_calls++; + RETURN_IF_ERROR(create_orc_reader()); + } else { + auto inner_file_reader = _file_input_stream->get_inner_reader(); + const auto& file_meta_cache_key = + FileMetaCache::get_key(inner_file_reader, _file_description); + + // Local variables can be required because setSerializedFileTail is an assignment operation, not a reference. + ObjLRUCache::CacheHandle _meta_cache_handle; + if (_meta_cache->lookup(file_meta_cache_key, &_meta_cache_handle)) { + const std::string* footer_ptr = _meta_cache_handle.data(); + options.setSerializedFileTail(*footer_ptr); + RETURN_IF_ERROR(create_orc_reader()); + _statistics.file_footer_hit_cache++; + } else { + _statistics.file_footer_read_calls++; + RETURN_IF_ERROR(create_orc_reader()); + std::string* footer_ptr = new std::string {_reader->getSerializedFileTail()}; + _meta_cache->insert(file_meta_cache_key, footer_ptr, &_meta_cache_handle); } - return Status::InternalError("Init OrcReader failed. reason = {}", _err_msg); } + return Status::OK(); } @@ -337,14 +375,8 @@ Status OrcReader::init_reader( _orc_max_merge_distance_bytes = _state->query_options().orc_max_merge_distance_bytes; } - { - SCOPED_RAW_TIMER(&_statistics.create_reader_time); - RETURN_IF_ERROR(_create_file_reader()); - } - { - SCOPED_RAW_TIMER(&_statistics.init_column_time); - RETURN_IF_ERROR(_init_read_columns()); - } + RETURN_IF_ERROR(_create_file_reader()); + RETURN_IF_ERROR(_init_read_columns()); return Status::OK(); } @@ -364,6 +396,7 @@ Status OrcReader::get_parsed_schema(std::vector* col_names, } Status OrcReader::_init_read_columns() { + SCOPED_RAW_TIMER(&_statistics.init_column_time); const auto& root_type = _reader->getType(); if (_is_acid) { for (uint64_t i = 0; i < root_type.getSubtypeCount(); ++i) { diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h index febb72028572f2..82f8693cc31cad 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.h +++ b/be/src/vec/exec/format/orc/vorc_reader.h @@ -36,6 +36,7 @@ #include "exec/olap_common.h" #include "io/file_factory.h" #include "io/fs/buffered_reader.h" +#include "io/fs/file_meta_cache.h" #include "io/fs/file_reader.h" #include "io/fs/file_reader_writer_fwd.h" #include "io/fs/tracing_file_reader.h" @@ -132,14 +133,18 @@ class OrcReader : public GenericReader { int64_t predicate_filter_time = 0; int64_t dict_filter_rewrite_time = 0; int64_t lazy_read_filtered_rows = 0; + int64_t file_footer_read_calls = 0; + int64_t file_footer_hit_cache = 0; }; OrcReader(RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, size_t batch_size, const std::string& ctz, - io::IOContext* io_ctx, bool enable_lazy_mat = true); + io::IOContext* io_ctx, FileMetaCache* meta_cache = nullptr, + bool enable_lazy_mat = true); OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& range, - const std::string& ctz, io::IOContext* io_ctx, bool enable_lazy_mat = true); + const std::string& ctz, io::IOContext* io_ctx, FileMetaCache* meta_cache = nullptr, + bool enable_lazy_mat = true); ~OrcReader() override; //If you want to read the file by index instead of column name, set hive_use_column_names to false. 
@@ -240,6 +245,8 @@ class OrcReader : public GenericReader { RuntimeProfile::Counter* lazy_read_filtered_rows = nullptr; RuntimeProfile::Counter* selected_row_group_count = nullptr; RuntimeProfile::Counter* evaluated_row_group_count = nullptr; + RuntimeProfile::Counter* file_footer_read_calls = nullptr; + RuntimeProfile::Counter* file_footer_hit_cache = nullptr; }; class ORCFilterImpl : public orc::ORCFilter { diff --git a/be/src/vec/exec/format/parquet/parquet_thrift_util.h b/be/src/vec/exec/format/parquet/parquet_thrift_util.h index 15927fe4f65166..8f4b106c9df0bb 100644 --- a/be/src/vec/exec/format/parquet/parquet_thrift_util.h +++ b/be/src/vec/exec/format/parquet/parquet_thrift_util.h @@ -36,8 +36,9 @@ constexpr uint8_t PARQUET_VERSION_NUMBER[4] = {'P', 'A', 'R', '1'}; constexpr uint32_t PARQUET_FOOTER_SIZE = 8; constexpr size_t INIT_META_SIZE = 48 * 1024; // 48k -static Status parse_thrift_footer(io::FileReaderSPtr file, FileMetaData** file_metadata, - size_t* meta_size, io::IOContext* io_ctx) { +static Status parse_thrift_footer(io::FileReaderSPtr file, + std::unique_ptr<FileMetaData>* file_metadata, size_t* meta_size, + io::IOContext* io_ctx) { size_t file_size = file->size(); size_t bytes_read = std::min(file_size, INIT_META_SIZE); std::vector<uint8_t> footer(bytes_read); @@ -75,7 +76,7 @@ tparquet::FileMetaData t_metadata; // deserialize footer RETURN_IF_ERROR(deserialize_thrift_msg(meta_ptr, &metadata_size, true, &t_metadata)); - *file_metadata = new FileMetaData(t_metadata, metadata_size); + *file_metadata = std::make_unique<FileMetaData>(t_metadata, metadata_size); RETURN_IF_ERROR((*file_metadata)->init_schema()); *meta_size = PARQUET_FOOTER_SIZE + metadata_size; return Status::OK(); diff --git a/be/src/vec/exec/format/parquet/schema_desc.cpp b/be/src/vec/exec/format/parquet/schema_desc.cpp index 1f72a1fda5b479..706dfee4e7c22b 100644 --- a/be/src/vec/exec/format/parquet/schema_desc.cpp +++ b/be/src/vec/exec/format/parquet/schema_desc.cpp @@ -247,72 +247,6 @@ std::pair FieldDescriptor::get_doris_type( return ans; } -// Copy from org.apache.iceberg.avro.AvroSchemaUtil#validAvroName -static bool is_valid_avro_name(const std::string& name) { - int length = name.length(); - char first = name[0]; - if (!isalpha(first) && first != '_') { - return false; - } - - for (int i = 1; i < length; i++) { - char character = name[i]; - if (!isalpha(character) && !isdigit(character) && character != '_') { - return false; - } - } - return true; -} - -// Copy from org.apache.iceberg.avro.AvroSchemaUtil#sanitize -static void sanitize_avro_name(std::ostringstream& buf, char character) { - if (isdigit(character)) { - buf << '_' << character; - } else { - std::stringstream ss; - ss << std::hex << (int)character; - std::string hex_str = ss.str(); - buf << "_x" << doris::to_lower(hex_str); - } -} - -// Copy from org.apache.iceberg.avro.AvroSchemaUtil#sanitize -static std::string sanitize_avro_name(const std::string& name) { - std::ostringstream buf; - int length = name.length(); - char first = name[0]; - if (!isalpha(first) && first != '_') { - sanitize_avro_name(buf, first); - } else { - buf << first; - } - - for (int i = 1; i < length; i++) { - char character = name[i]; - if (!isalpha(character) && !isdigit(character) && character != '_') { - sanitize_avro_name(buf, character); - } else { - buf << character; - } - } - return buf.str(); -} - -void FieldDescriptor::iceberg_sanitize(const std::vector<std::string>& read_columns) { - for (const std::string& col : read_columns) 
{ - if (!is_valid_avro_name(col)) { - std::string sanitize_name = sanitize_avro_name(col); - auto it = _name_to_field.find(sanitize_name); - if (it != _name_to_field.end()) { - FieldSchema* schema = const_cast<FieldSchema*>(it->second); - schema->name = col; - _name_to_field.emplace(col, schema); - _name_to_field.erase(sanitize_name); - } - } - } -} - std::pair<PrimitiveType, bool> FieldDescriptor::convert_to_doris_type( tparquet::LogicalType logicalType) { std::pair<PrimitiveType, bool> ans = {INVALID_TYPE, false}; diff --git a/be/src/vec/exec/format/parquet/schema_desc.h b/be/src/vec/exec/format/parquet/schema_desc.h index 408a45eae4bc85..16d3c1cc16ce83 100644 --- a/be/src/vec/exec/format/parquet/schema_desc.h +++ b/be/src/vec/exec/format/parquet/schema_desc.h @@ -100,10 +100,6 @@ class FieldDescriptor { public: std::pair get_doris_type(const tparquet::SchemaElement& physical_schema); - // org.apache.iceberg.avro.AvroSchemaUtil#sanitize will encode special characters, - // we have to decode these characters - void iceberg_sanitize(const std::vector<std::string>& read_columns); - FieldDescriptor() = default; ~FieldDescriptor() = default; diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_reader.h index 1d2c2afe5637ad..619f87cad970d6 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.h @@ -58,7 +58,7 @@ class ParquetColumnReader { Statistics() : read_time(0), read_calls(0), - meta_read_calls(0), + page_index_read_calls(0), read_bytes(0), decompress_time(0), decompress_cnt(0), @@ -74,7 +74,7 @@ int64_t null_map_time) : read_time(fs.read_time), read_calls(fs.read_calls), - meta_read_calls(0), + page_index_read_calls(0), read_bytes(fs.read_bytes), decompress_time(cs.decompress_time), decompress_cnt(cs.decompress_cnt), @@ -88,7 +88,7 @@ int64_t read_time; int64_t read_calls; - int64_t meta_read_calls; + int64_t page_index_read_calls; int64_t read_bytes; int64_t decompress_time; int64_t decompress_cnt; @@ -104,7 +104,7 @@ read_time += statistics.read_time; read_calls += statistics.read_calls; read_bytes += statistics.read_bytes; - meta_read_calls += statistics.meta_read_calls; + page_index_read_calls += statistics.page_index_read_calls; decompress_time += statistics.decompress_time; decompress_cnt += statistics.decompress_cnt; decode_header_time += statistics.decode_header_time; diff --git a/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp b/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp index 98de497e320b1b..fea7d9d545ba66 100644 --- a/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp @@ -44,7 +44,7 @@ Status FileMetaData::init_schema() { return _schema.parse_from_thrift(_metadata.schema); } -const tparquet::FileMetaData& FileMetaData::to_thrift() { +const tparquet::FileMetaData& FileMetaData::to_thrift() const { return _metadata; } diff --git a/be/src/vec/exec/format/parquet/vparquet_file_metadata.h b/be/src/vec/exec/format/parquet/vparquet_file_metadata.h index d1ebb06957daa3..9dfdaf97bebe6d 100644 --- a/be/src/vec/exec/format/parquet/vparquet_file_metadata.h +++ b/be/src/vec/exec/format/parquet/vparquet_file_metadata.h @@ -31,10 +31,7 @@ class FileMetaData { ~FileMetaData(); Status init_schema(); const FieldDescriptor& schema() const { return _schema; } - const tparquet::FileMetaData& to_thrift(); - void iceberg_sanitize(const std::vector<std::string>& 
read_columns) { - _schema.iceberg_sanitize(read_columns); - } + const tparquet::FileMetaData& to_thrift() const; std::string debug_string() const; size_t get_mem_size() const { return _mem_size; } diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index 331e96962509c4..56d88430e16ecc 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -90,18 +90,19 @@ ParquetReader::ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams _ctz(ctz), _io_ctx(io_ctx), _state(state), - _meta_cache(meta_cache), _enable_lazy_mat(enable_lazy_mat), _enable_filter_by_min_max( state == nullptr ? true : state->query_options().enable_parquet_filter_by_min_max) { + _meta_cache = meta_cache; _init_profile(); _init_system_properties(); _init_file_description(); } ParquetReader::ParquetReader(const TFileScanRangeParams& params, const TFileRangeDesc& range, - io::IOContext* io_ctx, RuntimeState* state, bool enable_lazy_mat) + io::IOContext* io_ctx, RuntimeState* state, FileMetaCache* meta_cache, + bool enable_lazy_mat) : _profile(nullptr), _scan_params(params), _scan_range(range), @@ -111,6 +112,7 @@ ParquetReader::ParquetReader(const TFileScanRangeParams& params, const TFileRang _enable_filter_by_min_max( state == nullptr ? true : state->query_options().enable_parquet_filter_by_min_max) { + _meta_cache = meta_cache; _init_system_properties(); _init_file_description(); } @@ -156,6 +158,8 @@ void ParquetReader::_init_profile() { ADD_CHILD_TIMER_WITH_LEVEL(_profile, "FileOpenTime", parquet_profile, 1); _parquet_profile.open_file_num = ADD_CHILD_COUNTER_WITH_LEVEL(_profile, "FileNum", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_index_read_calls = + ADD_COUNTER_WITH_LEVEL(_profile, "PageIndexReadCalls", TUnit::UNIT, 1); _parquet_profile.page_index_filter_time = ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageIndexFilterTime", parquet_profile, 1); _parquet_profile.read_page_index_time = @@ -164,8 +168,10 @@ ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageIndexParseTime", parquet_profile, 1); _parquet_profile.row_group_filter_time = ADD_CHILD_TIMER_WITH_LEVEL(_profile, "RowGroupFilterTime", parquet_profile, 1); - _parquet_profile.file_meta_read_calls = - ADD_COUNTER_WITH_LEVEL(_profile, "FileMetaReadCalls", TUnit::UNIT, 1); + _parquet_profile.file_footer_read_calls = + ADD_COUNTER_WITH_LEVEL(_profile, "FileFooterReadCalls", TUnit::UNIT, 1); + _parquet_profile.file_footer_hit_cache = + ADD_COUNTER_WITH_LEVEL(_profile, "FileFooterHitCache", TUnit::UNIT, 1); _parquet_profile.decompress_time = ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DecompressTime", parquet_profile, 1); _parquet_profile.decompress_cnt = ADD_CHILD_COUNTER_WITH_LEVEL( @@ -220,6 +226,7 @@ Status ParquetReader::_open_file() { _file_reader, _io_ctx->file_reader_stats) : _file_reader; } + if (_file_metadata == nullptr) { SCOPED_RAW_TIMER(&_statistics.parse_footer_time); if (_tracing_file_reader->size() <= sizeof(PARQUET_VERSION_NUMBER)) { return Status::Corruption("Invalid parquet file {}", _scan_range.path); } size_t meta_size = 0; if (_meta_cache == nullptr) { - auto st = - parse_thrift_footer(_tracing_file_reader, &_file_metadata, &meta_size, _io_ctx); - // wrap it with unique ptr, so that it can be released finally. - _file_metadata_ptr.reset(_file_metadata); - RETURN_IF_ERROR(st); + // Parse the footer directly into _file_metadata_ptr so it is released automatically. 
+ RETURN_IF_ERROR(parse_thrift_footer(_tracing_file_reader, &_file_metadata_ptr, + &meta_size, _io_ctx)); + _file_metadata = _file_metadata_ptr.get(); _column_statistics.read_bytes += meta_size; // parse magic number & parse meta data - _column_statistics.meta_read_calls += 1; + _statistics.file_footer_read_calls += 1; } else { - RETURN_IF_ERROR(_meta_cache->get_parquet_footer(_tracing_file_reader, _io_ctx, - _file_description.mtime, &meta_size, - &_meta_cache_handle)); - _column_statistics.read_bytes += meta_size; - if (meta_size > 0) { - _column_statistics.meta_read_calls += 1; + const auto& file_meta_cache_key = + FileMetaCache::get_key(_tracing_file_reader, _file_description); + if (!_meta_cache->lookup(file_meta_cache_key, &_meta_cache_handle)) { + RETURN_IF_ERROR(parse_thrift_footer(_file_reader, &_file_metadata_ptr, &meta_size, + _io_ctx)); + // _file_metadata_ptr.release(): ownership of the metadata is transferred to the cache, tracked by _meta_cache_handle. + _meta_cache->insert(file_meta_cache_key, _file_metadata_ptr.release(), + &_meta_cache_handle); + _file_metadata = _meta_cache_handle.data<FileMetaData>(); + _column_statistics.read_bytes += meta_size; + _statistics.file_footer_read_calls += 1; + } else { + _statistics.file_footer_hit_cache++; } - - _file_metadata = (FileMetaData*)_meta_cache_handle.data(); + _file_metadata = _meta_cache_handle.data<FileMetaData>(); } if (_file_metadata == nullptr) { @@ -292,12 +304,6 @@ void ParquetReader::_init_file_description() { } } -void ParquetReader::iceberg_sanitize(const std::vector<std::string>& read_columns) { - if (_file_metadata != nullptr) { - _file_metadata->iceberg_sanitize(read_columns); - } -} - Status ParquetReader::init_reader( const std::vector& all_column_names, const std::unordered_map* colname_to_value_range, @@ -792,7 +798,7 @@ Status ParquetReader::_process_page_index(const tparquet::RowGroup& row_group, } _column_statistics.read_bytes += bytes_read; // read twice: parse column index & parse offset index - _column_statistics.meta_read_calls += 2; + _column_statistics.page_index_read_calls += 2; SCOPED_RAW_TIMER(&_statistics.parse_page_index_time); for (size_t idx = 0; idx < _read_table_columns.size(); idx++) { @@ -1014,13 +1020,16 @@ void ParquetReader::_collect_profile() { COUNTER_UPDATE(_parquet_profile.read_page_index_time, _statistics.read_page_index_time); COUNTER_UPDATE(_parquet_profile.parse_page_index_time, _statistics.parse_page_index_time); COUNTER_UPDATE(_parquet_profile.row_group_filter_time, _statistics.row_group_filter_time); + COUNTER_UPDATE(_parquet_profile.file_footer_read_calls, _statistics.file_footer_read_calls); + COUNTER_UPDATE(_parquet_profile.file_footer_hit_cache, _statistics.file_footer_hit_cache); COUNTER_UPDATE(_parquet_profile.skip_page_header_num, _column_statistics.skip_page_header_num); COUNTER_UPDATE(_parquet_profile.parse_page_header_num, _column_statistics.parse_page_header_num); COUNTER_UPDATE(_parquet_profile.predicate_filter_time, _statistics.predicate_filter_time); COUNTER_UPDATE(_parquet_profile.dict_filter_rewrite_time, _statistics.dict_filter_rewrite_time); - COUNTER_UPDATE(_parquet_profile.file_meta_read_calls, _column_statistics.meta_read_calls); + COUNTER_UPDATE(_parquet_profile.page_index_read_calls, + _column_statistics.page_index_read_calls); COUNTER_UPDATE(_parquet_profile.decompress_time, _column_statistics.decompress_time); COUNTER_UPDATE(_parquet_profile.decompress_cnt, _column_statistics.decompress_cnt); COUNTER_UPDATE(_parquet_profile.decode_header_time, _column_statistics.decode_header_time); diff --git 
a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h index c560b1c3800ed5..b6329397be6b09 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_reader.h @@ -86,6 +86,8 @@ class ParquetReader : public GenericReader { int64_t column_read_time = 0; int64_t parse_meta_time = 0; int64_t parse_footer_time = 0; + int64_t file_footer_read_calls = 0; + int64_t file_footer_hit_cache = 0; int64_t open_file_time = 0; int64_t open_file_num = 0; int64_t row_group_filter_time = 0; @@ -102,7 +104,8 @@ bool enable_lazy_mat = true); ParquetReader(const TFileScanRangeParams& params, const TFileRangeDesc& range, - io::IOContext* io_ctx, RuntimeState* state, bool enable_lazy_mat = true); + io::IOContext* io_ctx, RuntimeState* state, FileMetaCache* meta_cache = nullptr, + bool enable_lazy_mat = true); ~ParquetReader() override; // for unit test @@ -143,9 +146,6 @@ const tparquet::FileMetaData* get_meta_data() const { return _t_metadata; } - // Only for iceberg reader to sanitize invalid column names - void iceberg_sanitize(const std::vector<std::string>& read_columns); - Status set_fill_columns( const std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>>& partition_columns, @@ -174,11 +174,12 @@ RuntimeProfile::Counter* open_file_time = nullptr; RuntimeProfile::Counter* open_file_num = nullptr; RuntimeProfile::Counter* row_group_filter_time = nullptr; + RuntimeProfile::Counter* page_index_read_calls = nullptr; RuntimeProfile::Counter* page_index_filter_time = nullptr; RuntimeProfile::Counter* read_page_index_time = nullptr; RuntimeProfile::Counter* parse_page_index_time = nullptr; - - RuntimeProfile::Counter* file_meta_read_calls = nullptr; + RuntimeProfile::Counter* file_footer_read_calls = nullptr; + RuntimeProfile::Counter* file_footer_hit_cache = nullptr; RuntimeProfile::Counter* decompress_time = nullptr; RuntimeProfile::Counter* decompress_cnt = nullptr; RuntimeProfile::Counter* decode_header_time = nullptr; @@ -240,7 +241,7 @@ // after _file_reader. Otherwise, there may be heap-use-after-free bug. ObjLRUCache::CacheHandle _meta_cache_handle; std::unique_ptr<FileMetaData> _file_metadata_ptr; - FileMetaData* _file_metadata = nullptr; + const FileMetaData* _file_metadata = nullptr; const tparquet::FileMetaData* _t_metadata = nullptr; // _tracing_file_reader wraps _file_reader. @@ -290,9 +291,6 @@ bool _closed = false; io::IOContext* _io_ctx = nullptr; RuntimeState* _state = nullptr; - // Cache to save some common part such as file footer. - // Maybe null if not used - FileMetaCache* _meta_cache = nullptr; bool _enable_lazy_mat = true; bool _enable_filter_by_min_max = true; const TupleDescriptor* _tuple_descriptor = nullptr; diff --git a/be/src/vec/exec/format/table/hive_reader.cpp b/be/src/vec/exec/format/table/hive_reader.cpp index ee7f805b076a3c..9fb084f1f91217 100644 --- a/be/src/vec/exec/format/table/hive_reader.cpp +++ b/be/src/vec/exec/format/table/hive_reader.cpp @@ -46,7 +46,7 @@ Status HiveOrcReader::init_reader( if (_state->query_options().hive_orc_use_column_names && !is_hive_col_name) { // Directly use the table column name to match the file column name, but pay attention to the case issue. 
RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(tuple_descriptor, orc_type_ptr, - table_info_node_ptr)); + table_info_node_ptr, _is_file_slot)); } else { // hive1 / use index std::map slot_map; // table_name to slot @@ -70,6 +70,10 @@ table_info_node_ptr->add_children( table_column_name, orc_type_ptr->getFieldName(file_index), field_node); } + slot_map.erase(table_column_name); + } + for (const auto& [partition_col_name, _] : slot_map) { + table_info_node_ptr->add_not_exist_children(partition_col_name); } } @@ -95,7 +99,7 @@ Status HiveParquetReader::init_reader( if (_state->query_options().hive_parquet_use_column_names) { // Directly use the table column name to match the file column name, but pay attention to the case issue. RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(tuple_descriptor, *field_desc, - table_info_node_ptr)); + table_info_node_ptr, _is_file_slot)); } else { // use idx std::map slot_map; //table_name to slot for (const auto& slot : tuple_descriptor->slots()) { @@ -109,8 +113,10 @@ auto file_index = _params.column_idxs[idx]; if (file_index >= parquet_fields_schema.size()) { + // Non-partition column that is missing from the file; it may have been added to the table later. table_info_node_ptr->add_not_exist_children(table_column_name); } else { + // Non-partition column that exists in both the table and the file. auto field_node = std::make_shared(); // for sub-columns, still use name to match columns. RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name( ... table_info_node_ptr->add_children( table_column_name, parquet_fields_schema[file_index].name, field_node); } + slot_map.erase(table_column_name); + } + /* + * `_params.column_idxs` only covers slots where `isIsFileSlot()` is true, so we need to add + * the partition slots ourselves. + * e.g.: + * Table : A, B, C, D (D: partition column) + * Parquet file : A, B + * Column C was added later via ADD COLUMN. 
+ * + * sql : select * from table; + * slot : A, B, C, D + * _params.column_idxs: 0, 1, 2 (there is no 3, because column D is the partition column) + * + */ + for (const auto& [partition_col_name, _] : slot_map) { + table_info_node_ptr->add_not_exist_children(partition_col_name); } } diff --git a/be/src/vec/exec/format/table/hive_reader.h b/be/src/vec/exec/format/table/hive_reader.h index 2f2c1151799466..6acd344e4176f3 100644 --- a/be/src/vec/exec/format/table/hive_reader.h +++ b/be/src/vec/exec/format/table/hive_reader.h @@ -30,15 +30,21 @@ class HiveReader : public TableFormatReader, public TableSchemaChangeHelper { public: HiveReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, - io::IOContext* io_ctx) + io::IOContext* io_ctx, const std::set<SlotId>* is_file_slot, + FileMetaCache* meta_cache) : TableFormatReader(std::move(file_format_reader), state, profile, params, range, - io_ctx) {}; + io_ctx, meta_cache), + _is_file_slot(is_file_slot) {}; ~HiveReader() override = default; Status get_next_block_inner(Block* block, size_t* read_rows, bool* eof) final; Status init_row_filters() final { return Status::OK(); }; + +protected: + // https://github.com/apache/doris/pull/23369 + const std::set<SlotId>* _is_file_slot = nullptr; }; class HiveOrcReader final : public HiveReader { @@ -46,8 +52,10 @@ ENABLE_FACTORY_CREATOR(HiveOrcReader); HiveOrcReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx) - : HiveReader(std::move(file_format_reader), profile, state, params, range, io_ctx) {}; + const TFileRangeDesc& range, io::IOContext* io_ctx, + const std::set<SlotId>* is_file_slot, FileMetaCache* meta_cache) + : HiveReader(std::move(file_format_reader), profile, state, params, range, io_ctx, + is_file_slot, meta_cache) {}; ~HiveOrcReader() final = default; Status init_reader( @@ -65,8 +73,10 @@ class HiveParquetReader final : public HiveReader { ENABLE_FACTORY_CREATOR(HiveParquetReader); HiveParquetReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx) - : HiveReader(std::move(file_format_reader), profile, state, params, range, io_ctx) {}; + const TFileRangeDesc& range, io::IOContext* io_ctx, + const std::set<SlotId>* is_file_slot, FileMetaCache* meta_cache) + : HiveReader(std::move(file_format_reader), profile, state, params, range, io_ctx, + is_file_slot, meta_cache) {}; ~HiveParquetReader() final = default; Status init_reader( diff --git a/be/src/vec/exec/format/table/hudi_reader.h b/be/src/vec/exec/format/table/hudi_reader.h index 751094018c942a..50fc0e1b495e57 100644 --- a/be/src/vec/exec/format/table/hudi_reader.h +++ b/be/src/vec/exec/format/table/hudi_reader.h @@ -27,9 +27,9 @@ class HudiReader : public TableFormatReader, public TableSchemaChangeHelper { public: HudiReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, - io::IOContext* io_ctx) + io::IOContext* io_ctx, FileMetaCache* meta_cache) : TableFormatReader(std::move(file_format_reader), state, profile, params, range, - io_ctx) {}; + io_ctx, meta_cache) {}; ~HudiReader() override = default; @@ -43,8 +43,9 @@ class HudiParquetReader final : public HudiReader { 
ENABLE_FACTORY_CREATOR(HudiParquetReader); HudiParquetReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx) - : HudiReader(std::move(file_format_reader), profile, state, params, range, io_ctx) {}; + const TFileRangeDesc& range, io::IOContext* io_ctx, FileMetaCache* meta_cache) + : HudiReader(std::move(file_format_reader), profile, state, params, range, io_ctx, + meta_cache) {}; ~HudiParquetReader() final = default; Status init_reader( @@ -63,8 +64,9 @@ class HudiOrcReader final : public HudiReader { ENABLE_FACTORY_CREATOR(HudiOrcReader); HudiOrcReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx) - : HudiReader(std::move(file_format_reader), profile, state, params, range, io_ctx) {}; + const TFileRangeDesc& range, io::IOContext* io_ctx, FileMetaCache* meta_cache) + : HudiReader(std::move(file_format_reader), profile, state, params, range, io_ctx, + meta_cache) {}; ~HudiOrcReader() final = default; Status init_reader( diff --git a/be/src/vec/exec/format/table/iceberg_reader.cpp b/be/src/vec/exec/format/table/iceberg_reader.cpp index d6471a49efb596..d777bf2dca0fc9 100644 --- a/be/src/vec/exec/format/table/iceberg_reader.cpp +++ b/be/src/vec/exec/format/table/iceberg_reader.cpp @@ -78,8 +78,9 @@ IcebergTableReader::IcebergTableReader(std::unique_ptr<GenericReader> file_forma RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, ShardedKVCache* kv_cache, - io::IOContext* io_ctx) - : TableFormatReader(std::move(file_format_reader), state, profile, params, range, io_ctx), + io::IOContext* io_ctx, FileMetaCache* meta_cache) - : TableFormatReader(std::move(file_format_reader), state, profile, params, range, io_ctx, + meta_cache), _kv_cache(kv_cache) { static const char* iceberg_profile = "IcebergProfile"; ADD_TIMER(_profile, iceberg_profile); @@ -457,7 +458,7 @@ Status IcebergParquetReader::_read_position_delete_file(const TFileRangeDesc* d DeleteFile* position_delete) { ParquetReader parquet_delete_reader( _profile, _params, *delete_range, READ_DELETE_FILE_BATCH_SIZE, - const_cast<cctz::time_zone*>(&_state->timezone_obj()), _io_ctx, _state); + const_cast<cctz::time_zone*>(&_state->timezone_obj()), _io_ctx, _state, _meta_cache); RETURN_IF_ERROR(parquet_delete_reader.init_reader( delete_file_col_names, nullptr, {}, nullptr, nullptr, nullptr, nullptr, nullptr, TableSchemaChangeHelper::ConstNode::get_instance(), false)); @@ -537,7 +538,8 @@ Status IcebergOrcReader::init_reader( Status IcebergOrcReader::_read_position_delete_file(const TFileRangeDesc* delete_range, DeleteFile* position_delete) { OrcReader orc_delete_reader(_profile, _state, _params, *delete_range, - READ_DELETE_FILE_BATCH_SIZE, _state->timezone(), _io_ctx); + READ_DELETE_FILE_BATCH_SIZE, _state->timezone(), _io_ctx, + _meta_cache); std::unordered_map<std::string, ColumnValueRangeType> colname_to_value_range; RETURN_IF_ERROR(orc_delete_reader.init_reader(&delete_file_col_names, &colname_to_value_range, {}, false, {}, {}, nullptr, nullptr)); diff --git a/be/src/vec/exec/format/table/iceberg_reader.h b/be/src/vec/exec/format/table/iceberg_reader.h index 67a1c1cc66b5b1..cc88325acf9cfd 100644 --- a/be/src/vec/exec/format/table/iceberg_reader.h +++ b/be/src/vec/exec/format/table/iceberg_reader.h @@ -74,8 +74,8 @@ class IcebergTableReader : public TableFormatReader, public TableSchemaChangeHel 
IcebergTableReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, ShardedKVCache* kv_cache, - io::IOContext* io_ctx); + const TFileRangeDesc& range, ShardedKVCache* kv_cache, io::IOContext* io_ctx, + FileMetaCache* meta_cache); ~IcebergTableReader() override = default; Status init_row_filters() final; @@ -164,9 +164,9 @@ class IcebergParquetReader final : public IcebergTableReader { IcebergParquetReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, ShardedKVCache* kv_cache, - io::IOContext* io_ctx) + io::IOContext* io_ctx, FileMetaCache* meta_cache) : IcebergTableReader(std::move(file_format_reader), profile, state, params, range, - kv_cache, io_ctx) {} + kv_cache, io_ctx, meta_cache) {} Status init_reader( const std::vector<std::string>& file_col_names, const std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range, @@ -187,9 +187,10 @@ protected: std::unique_ptr<GenericReader> _create_equality_reader( const TFileRangeDesc& delete_desc) final { - return ParquetReader::create_unique( - _profile, _params, delete_desc, READ_DELETE_FILE_BATCH_SIZE, - const_cast<cctz::time_zone*>(&_state->timezone_obj()), _io_ctx, _state); + return ParquetReader::create_unique(_profile, _params, delete_desc, + READ_DELETE_FILE_BATCH_SIZE, + const_cast<cctz::time_zone*>(&_state->timezone_obj()), + _io_ctx, _state, _meta_cache); } }; class IcebergOrcReader final : public IcebergTableReader { @@ -201,9 +202,10 @@ IcebergOrcReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, ShardedKVCache* kv_cache, io::IOContext* io_ctx) + const TFileRangeDesc& range, ShardedKVCache* kv_cache, io::IOContext* io_ctx, + FileMetaCache* meta_cache) : IcebergTableReader(std::move(file_format_reader), profile, state, params, range, - kv_cache, io_ctx) {} + kv_cache, io_ctx, meta_cache) {} void set_delete_rows() final { auto* orc_reader = (OrcReader*)_file_format_reader.get(); @@ -223,7 +225,8 @@ std::unique_ptr<GenericReader> _create_equality_reader( const TFileRangeDesc& delete_desc) override { return OrcReader::create_unique(_profile, _state, _params, delete_desc, - READ_DELETE_FILE_BATCH_SIZE, _state->timezone(), _io_ctx); + READ_DELETE_FILE_BATCH_SIZE, _state->timezone(), _io_ctx, + _meta_cache); } private: diff --git a/be/src/vec/exec/format/table/paimon_reader.cpp b/be/src/vec/exec/format/table/paimon_reader.cpp index 5a84a22863c32d..d5bb048ebf7e58 100644 --- a/be/src/vec/exec/format/table/paimon_reader.cpp +++ b/be/src/vec/exec/format/table/paimon_reader.cpp @@ -28,8 +28,9 @@ namespace doris::vectorized { PaimonReader::PaimonReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, - io::IOContext* io_ctx) - : TableFormatReader(std::move(file_format_reader), state, profile, params, range, io_ctx) { + io::IOContext* io_ctx, FileMetaCache* meta_cache) + : TableFormatReader(std::move(file_format_reader), state, profile, params, range, io_ctx, + meta_cache) { static const char* paimon_profile = "PaimonProfile"; ADD_TIMER(_profile, paimon_profile); _paimon_profile.num_delete_rows = diff --git a/be/src/vec/exec/format/table/paimon_reader.h 
b/be/src/vec/exec/format/table/paimon_reader.h index eb6d909bac5c21..2bb8e105be50cc 100644 --- a/be/src/vec/exec/format/table/paimon_reader.h +++ b/be/src/vec/exec/format/table/paimon_reader.h @@ -30,7 +30,7 @@ class PaimonReader : public TableFormatReader, public TableSchemaChangeHelper { public: PaimonReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx); + const TFileRangeDesc& range, io::IOContext* io_ctx, FileMetaCache* meta_cache); ~PaimonReader() override = default; @@ -54,8 +54,9 @@ class PaimonOrcReader final : public PaimonReader { ENABLE_FACTORY_CREATOR(PaimonOrcReader); PaimonOrcReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx) - : PaimonReader(std::move(file_format_reader), profile, state, params, range, io_ctx) {}; + const TFileRangeDesc& range, io::IOContext* io_ctx, FileMetaCache* meta_cache) + : PaimonReader(std::move(file_format_reader), profile, state, params, range, io_ctx, + meta_cache) {}; ~PaimonOrcReader() final = default; void set_delete_rows() final { @@ -90,8 +91,10 @@ class PaimonParquetReader final : public PaimonReader { ENABLE_FACTORY_CREATOR(PaimonParquetReader); PaimonParquetReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx) - : PaimonReader(std::move(file_format_reader), profile, state, params, range, io_ctx) {}; + const TFileRangeDesc& range, io::IOContext* io_ctx, + FileMetaCache* meta_cache) + : PaimonReader(std::move(file_format_reader), profile, state, params, range, io_ctx, + meta_cache) {}; ~PaimonParquetReader() final = default; void set_delete_rows() final { diff --git a/be/src/vec/exec/format/table/table_format_reader.cpp b/be/src/vec/exec/format/table/table_format_reader.cpp index 3bb2957981bcaf..6ecad595f7b8ad 100644 --- a/be/src/vec/exec/format/table/table_format_reader.cpp +++ b/be/src/vec/exec/format/table/table_format_reader.cpp @@ -38,7 +38,8 @@ const Status TableSchemaChangeHelper::BuildTableInfoUtil::SCHEMA_ERROR = Status: Status TableSchemaChangeHelper::BuildTableInfoUtil::by_parquet_name( const TupleDescriptor* table_tuple_descriptor, const FieldDescriptor& parquet_field_desc, - std::shared_ptr<Node>& node) { + std::shared_ptr<Node>& node, + const std::set<SlotId>* is_file_slot) { auto struct_node = std::make_shared<StructNode>(); auto parquet_fields_schema = parquet_field_desc.get_fields_schema(); std::map file_column_name_idx_map; @@ -48,8 +49,9 @@ for (const auto& slot : table_tuple_descriptor->slots()) { const auto& table_column_name = slot->col_name(); - - if (file_column_name_idx_map.contains(table_column_name)) { + // https://github.com/apache/doris/pull/23369/files + if ((is_file_slot == nullptr || is_file_slot->contains(slot->id())) && + file_column_name_idx_map.contains(table_column_name)) { auto file_column_idx = file_column_name_idx_map[table_column_name]; std::shared_ptr<Node> field_node = nullptr; RETURN_IF_ERROR(by_parquet_name(slot->get_data_type_ptr(), @@ -159,7 +161,8 @@ Status TableSchemaChangeHelper::BuildTableInfoUtil::by_orc_name( const TupleDescriptor* table_tuple_descriptor, const orc::Type* orc_type_ptr, 
std::shared_ptr<Node>& node, + const std::set<SlotId>* is_file_slot) { auto struct_node = std::make_shared<StructNode>(); std::map file_column_name_idx_map; @@ -170,7 +173,8 @@ for (const auto& slot : table_tuple_descriptor->slots()) { const auto& table_column_name = slot->col_name(); - if (file_column_name_idx_map.contains(table_column_name)) { + if ((is_file_slot == nullptr || is_file_slot->contains(slot->id())) && + file_column_name_idx_map.contains(table_column_name)) { auto file_column_idx = file_column_name_idx_map[table_column_name]; std::shared_ptr<Node> field_node = nullptr; RETURN_IF_ERROR(by_orc_name(slot->get_data_type_ptr(), diff --git a/be/src/vec/exec/format/table/table_format_reader.h b/be/src/vec/exec/format/table/table_format_reader.h index 1f79de54ed4002..86f30ecb1697fb 100644 --- a/be/src/vec/exec/format/table/table_format_reader.h +++ b/be/src/vec/exec/format/table/table_format_reader.h @@ -47,13 +47,14 @@ class TableFormatReader : public GenericReader { public: TableFormatReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeState* state, RuntimeProfile* profile, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx) + const TFileRangeDesc& range, io::IOContext* io_ctx, FileMetaCache* meta_cache) : _file_format_reader(std::move(file_format_reader)), _state(state), _profile(profile), _params(params), _range(range), _io_ctx(io_ctx) { + _meta_cache = meta_cache; if (range.table_format_params.__isset.table_level_row_count) { _table_level_row_count = range.table_format_params.table_level_row_count; } else { _table_level_row_count = -1; } @@ -330,7 +331,8 @@ class TableSchemaChangeHelper { // for hive parquet : The table column names passed from fe are lowercase, so use lowercase file column names to match table column names. static Status by_parquet_name(const TupleDescriptor* table_tuple_descriptor, const FieldDescriptor& parquet_field_desc, - std::shared_ptr<Node>& node); + std::shared_ptr<Node>& node, + const std::set<SlotId>* is_file_slot = nullptr); // for hive parquet static Status by_parquet_name(const DataTypePtr& table_data_type, const tparquet::SchemaElement& file_schema, std::shared_ptr<Node>& node); // for hive orc: The table column names passed from fe are lowercase, so use lowercase file column names to match table column names. 
static Status by_orc_name(const TupleDescriptor* table_tuple_descriptor, const orc::Type* orc_type_ptr, - std::shared_ptr<Node>& node); + std::shared_ptr<Node>& node, + const std::set<SlotId>* is_file_slot = nullptr); // for hive orc static Status by_orc_name(const DataTypePtr& table_data_type, const orc::Type* orc_root, std::shared_ptr<Node>& node); diff --git a/be/src/vec/exec/format/table/transactional_hive_reader.cpp b/be/src/vec/exec/format/table/transactional_hive_reader.cpp index 79e611fe066c8a..87b3f9c2dfbd71 100644 --- a/be/src/vec/exec/format/table/transactional_hive_reader.cpp +++ b/be/src/vec/exec/format/table/transactional_hive_reader.cpp @@ -38,8 +38,10 @@ namespace doris::vectorized { TransactionalHiveReader::TransactionalHiveReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, - const TFileRangeDesc& range, io::IOContext* io_ctx) - : TableFormatReader(std::move(file_format_reader), state, profile, params, range, io_ctx) { + const TFileRangeDesc& range, io::IOContext* io_ctx, + FileMetaCache* meta_cache) + : TableFormatReader(std::move(file_format_reader), state, profile, params, range, io_ctx, + meta_cache) { static const char* transactional_hive_profile = "TransactionalHiveProfile"; ADD_TIMER(_profile, transactional_hive_profile); _transactional_orc_profile.num_delete_files = @@ -163,7 +165,7 @@ Status TransactionalHiveReader::init_row_filters() { delete_range.file_size = -1; OrcReader delete_reader(_profile, _state, _params, delete_range, _MIN_BATCH_SIZE, - _state->timezone(), _io_ctx, false); + _state->timezone(), _io_ctx, _meta_cache, false); auto acid_info_node = std::make_shared(); for (auto idx = 0; idx < TransactionalHive::DELETE_ROW_COLUMN_NAMES_LOWER_CASE.size(); diff --git a/be/src/vec/exec/format/table/transactional_hive_reader.h b/be/src/vec/exec/format/table/transactional_hive_reader.h index 60114bbb29c4f0..e8f432103965c5 100644 --- a/be/src/vec/exec/format/table/transactional_hive_reader.h +++ b/be/src/vec/exec/format/table/transactional_hive_reader.h @@ -83,7 +83,7 @@ class TransactionalHiveReader : public TableFormatReader, public TableSchemaChan TransactionalHiveReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, - io::IOContext* io_ctx); + io::IOContext* io_ctx, FileMetaCache* meta_cache); ~TransactionalHiveReader() override = default; Status init_row_filters() final; diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 6e197ccee46b16..3665e52270f002 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -1006,12 +1006,14 @@ Status VFileScanner::_get_next_reader() { break; } case TFileFormatType::FORMAT_PARQUET: { + auto file_meta_cache_ptr = _should_enable_file_meta_cache() ? ExecEnv::GetInstance()->file_meta_cache() : nullptr; std::unique_ptr<ParquetReader> parquet_reader = ParquetReader::create_unique( _profile, *_params, range, _state->query_options().batch_size, const_cast<cctz::time_zone*>(&_state->timezone_obj()), _io_ctx.get(), _state, - _should_enable_file_meta_cache() ? ExecEnv::GetInstance()->file_meta_cache() : nullptr, - _state->query_options().enable_parquet_lazy_mat); + file_meta_cache_ptr, _state->query_options().enable_parquet_lazy_mat); + // ATTN: the push down agg type may be set back to NONE, // see IcebergTableReader::init_row_filters for example. 
parquet_reader->set_push_down_agg_type(_get_push_down_agg_type()); @@ -1023,7 +1025,7 @@ std::unique_ptr<IcebergParquetReader> iceberg_reader = IcebergParquetReader::create_unique(std::move(parquet_reader), _profile, _state, *_params, range, _kv_cache, - _io_ctx.get()); + _io_ctx.get(), file_meta_cache_ptr); init_status = iceberg_reader->init_reader( _file_col_names, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, @@ -1033,7 +1035,8 @@ range.table_format_params.table_format_type == "paimon") { std::unique_ptr<PaimonParquetReader> paimon_reader = PaimonParquetReader::create_unique(std::move(parquet_reader), _profile, - _state, *_params, range, _io_ctx.get()); + _state, *_params, range, _io_ctx.get(), + file_meta_cache_ptr); init_status = paimon_reader->init_reader( _file_col_names, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, @@ -1042,18 +1045,19 @@ _cur_reader = std::move(paimon_reader); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "hudi") { - std::unique_ptr<HudiParquetReader> hudi_reader = - HudiParquetReader::create_unique(std::move(parquet_reader), _profile, - _state, *_params, range, _io_ctx.get()); + std::unique_ptr<HudiParquetReader> hudi_reader = HudiParquetReader::create_unique( + std::move(parquet_reader), _profile, _state, *_params, range, _io_ctx.get(), + file_meta_cache_ptr); init_status = hudi_reader->init_reader( _file_col_names, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); _cur_reader = std::move(hudi_reader); } else if (range.table_format_params.table_format_type == "hive") { - auto hive_reader = - HiveParquetReader::create_unique(std::move(parquet_reader), _profile, - _state, *_params, range, _io_ctx.get()); + auto hive_reader = HiveParquetReader::create_unique( + std::move(parquet_reader), _profile, _state, *_params, range, _io_ctx.get(), + &_is_file_slot, file_meta_cache_ptr); + init_status = hive_reader->init_reader( _file_col_names, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, @@ -1112,9 +1116,14 @@ break; } case TFileFormatType::FORMAT_ORC: { + auto file_meta_cache_ptr = _should_enable_file_meta_cache() + ? 
ExecEnv::GetInstance()->file_meta_cache() : nullptr; std::unique_ptr<OrcReader> orc_reader = OrcReader::create_unique( _profile, _state, *_params, range, _state->query_options().batch_size, - _state->timezone(), _io_ctx.get(), _state->query_options().enable_orc_lazy_mat); + _state->timezone(), _io_ctx.get(), file_meta_cache_ptr, + _state->query_options().enable_orc_lazy_mat); + orc_reader->set_push_down_agg_type(_get_push_down_agg_type()); if (push_down_predicates) { RETURN_IF_ERROR(_process_late_arrival_conjuncts()); } @@ -1124,7 +1133,7 @@ std::unique_ptr<TransactionalHiveReader> tran_orc_reader = TransactionalHiveReader::create_unique(std::move(orc_reader), _profile, _state, *_params, range, - _io_ctx.get()); + _io_ctx.get(), file_meta_cache_ptr); init_status = tran_orc_reader->init_reader( _file_col_names, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); _cur_reader = std::move(tran_orc_reader); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "iceberg") { - std::unique_ptr<IcebergOrcReader> iceberg_reader = - IcebergOrcReader::create_unique(std::move(orc_reader), _profile, _state, - *_params, range, _kv_cache, _io_ctx.get()); + std::unique_ptr<IcebergOrcReader> iceberg_reader = IcebergOrcReader::create_unique( + std::move(orc_reader), _profile, _state, *_params, range, _kv_cache, + _io_ctx.get(), file_meta_cache_ptr); init_status = iceberg_reader->init_reader( _file_col_names, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); @@ -1145,7 +1154,8 @@ } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "paimon") { std::unique_ptr<PaimonOrcReader> paimon_reader = PaimonOrcReader::create_unique( - std::move(orc_reader), _profile, _state, *_params, range, _io_ctx.get()); + std::move(orc_reader), _profile, _state, *_params, range, _io_ctx.get(), + file_meta_cache_ptr); init_status = paimon_reader->init_reader( _file_col_names, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); _cur_reader = std::move(paimon_reader); } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "hudi") { std::unique_ptr<HudiOrcReader> hudi_reader = HudiOrcReader::create_unique( - std::move(orc_reader), _profile, _state, *_params, range, _io_ctx.get()); + std::move(orc_reader), _profile, _state, *_params, range, _io_ctx.get(), + file_meta_cache_ptr); init_status = hudi_reader->init_reader( _file_col_names, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); @@ -1166,8 +1177,8 @@ } else if (range.__isset.table_format_params && range.table_format_params.table_format_type == "hive") { std::unique_ptr<HiveOrcReader> hive_reader = HiveOrcReader::create_unique( - std::move(orc_reader), _profile, _state, *_params, range, _io_ctx.get()); - + std::move(orc_reader), _profile, _state, *_params, range, _io_ctx.get(), + &_is_file_slot, file_meta_cache_ptr); init_status = hive_reader->init_reader( _file_col_names, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), @@ -1428,6 +1439,7 @@ Status VFileScanner::_init_expr_ctxes() { fmt::format("Unknown source slot descriptor, slot_id={}", slot_id)); } if (slot_info.is_file_slot) { + _is_file_slot.emplace(slot_id); _file_slot_descs.emplace_back(it->second); _file_col_names.push_back(it->second->col_name()); } diff --git a/be/src/vec/exec/scan/vfile_scanner.h 
diff --git a/be/src/vec/exec/scan/vfile_scanner.h b/be/src/vec/exec/scan/vfile_scanner.h
index 7e58d258d4634a..437a87a68167f9 100644
--- a/be/src/vec/exec/scan/vfile_scanner.h
+++ b/be/src/vec/exec/scan/vfile_scanner.h
@@ -30,6 +30,7 @@
 #include "common/global_types.h"
 #include "common/status.h"
 #include "exec/olap_common.h"
+#include "io/fs/file_meta_cache.h"
 #include "io/io_common.h"
 #include "pipeline/exec/file_scan_operator.h"
 #include "runtime/descriptors.h"
@@ -151,6 +152,7 @@ class VFileScanner : public VScanner {
     // owned by scan node
     ShardedKVCache* _kv_cache = nullptr;
+    std::set<SlotId> _is_file_slot;
     bool _scanner_eof = false;
     int _rows = 0;
     int _num_of_columns_from_file;
@@ -247,7 +249,7 @@ class VFileScanner : public VScanner {
     // 2. the file number is less than 1/3 of cache's capacity
     // Otherwise, the cache miss rate will be high
     bool _should_enable_file_meta_cache() {
-        return config::max_external_file_meta_cache_num > 0 &&
+        return ExecEnv::GetInstance()->file_meta_cache()->enabled() &&
                _split_source->num_scan_ranges() < config::max_external_file_meta_cache_num / 3;
     }
 };
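
The rewritten _should_enable_file_meta_cache() gate deserves a concrete reading: the
cache itself must be enabled (capacity > 0), and the scan must touch fewer files than
one third of the cache's entry capacity, since a wider scan would mostly evict its own
entries. A worked example of the same predicate with illustrative numbers, not defaults
taken from this patch:

    // capacity = 3000 cached footers
    // scan A: 900 ranges  -> 900  < 3000 / 3, the cache is passed to the readers
    // scan B: 5000 ranges -> 5000 >= 3000 / 3, the readers receive nullptr instead
    bool should_cache(int64_t num_ranges, int64_t capacity) {
        return capacity > 0 && num_ranges < capacity / 3;
    }
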
diff --git a/be/test/vec/exec/format/file_reader/file_meta_cache_test.cpp b/be/test/vec/exec/format/file_reader/file_meta_cache_test.cpp
new file mode 100644
index 00000000000000..3aef8db8459f39
--- /dev/null
+++ b/be/test/vec/exec/format/file_reader/file_meta_cache_test.cpp
@@ -0,0 +1,154 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "io/fs/file_meta_cache.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "io/fs/file_reader.h"
+
+namespace doris {
+
+class MockFileReader : public io::FileReader {
+public:
+    MockFileReader(const std::string& file_name, size_t size)
+            : _path(file_name), _size(size), _closed(false) {}
+    ~MockFileReader() override = default;
+
+    const io::Path& path() const override { return _path; }
+
+    size_t size() const override { return _size; }
+
+    bool closed() const override { return _closed; }
+
+    Status close() override {
+        _closed = true;
+        return Status::OK();
+    }
+
+protected:
+    Status read_at_impl(size_t offset, Slice result, size_t* bytes_read,
+                        const io::IOContext* io_ctx) override {
+        *bytes_read = 0;
+        return Status::OK();
+    }
+
+private:
+    // Keep the path as a member rather than a function-local static, so two
+    // mock readers with different names cannot share one path object.
+    io::Path _path;
+    size_t _size;
+    bool _closed;
+};
+
+TEST(FileMetaCacheTest, KeyGenerationFromParams) {
+    std::string file_name = "/path/to/file";
+    int64_t mtime = 123456789;
+    int64_t file_size = 987654321;
+
+    std::string key1 = FileMetaCache::get_key(file_name, mtime, file_size);
+    std::string key2 = FileMetaCache::get_key(file_name, mtime, file_size);
+    EXPECT_EQ(key1, key2) << "Same parameters should produce same key";
+
+    // Different mtime should produce different key
+    std::string key3 = FileMetaCache::get_key(file_name, mtime + 1, file_size);
+    EXPECT_NE(key1, key3);
+
+    // mtime == 0, use file_size
+    std::string key4 = FileMetaCache::get_key(file_name, 0, file_size);
+    std::string key5 = FileMetaCache::get_key(file_name, 0, file_size);
+    EXPECT_EQ(key4, key5);
+    EXPECT_NE(key1, key4);
+
+    // mtime == 0, different file_size
+    std::string key6 = FileMetaCache::get_key(file_name, 0, file_size + 1);
+    EXPECT_NE(key4, key6);
+}
+
+TEST(FileMetaCacheTest, KeyGenerationFromFileReader) {
+    std::string file_name = "/path/to/file";
+    int64_t mtime = 123456789;
+    int64_t file_size = 100;
+
+    // file_description.file_size != -1, use it as file size
+    io::FileDescription desc1;
+    desc1.mtime = mtime;
+    desc1.file_size = file_size;
+    auto reader1 = std::make_shared<MockFileReader>(file_name, 200);
+
+    std::string key1 = FileMetaCache::get_key(reader1, desc1);
+    std::string expected_key1 = FileMetaCache::get_key(file_name, mtime, file_size);
+    EXPECT_EQ(key1, expected_key1);
+
+    // file_description.file_size == -1, use reader->size()
+    io::FileDescription desc2;
+    desc2.mtime = 0;
+    desc2.file_size = -1;
+    auto reader2 = std::make_shared<MockFileReader>(file_name, 300);
+
+    std::string key2 = FileMetaCache::get_key(reader2, desc2);
+    std::string expected_key2 = FileMetaCache::get_key(file_name, 0, 300);
+    EXPECT_EQ(key2, expected_key2);
+}
+
+TEST(FileMetaCacheTest, KeyContentVerification) {
+    std::string file_name = "/path/to/file";
+    int64_t mtime = 0x0102030405060708;
+    int64_t file_size = 0x1112131415161718;
+
+    std::string key_with_mtime = FileMetaCache::get_key(file_name, mtime, file_size);
+
+    ASSERT_EQ(key_with_mtime.size(), file_name.size() + sizeof(int64_t));
+
+    EXPECT_EQ(memcmp(key_with_mtime.data(), file_name.data(), file_name.size()), 0);
+
+    int64_t extracted_mtime = 0;
+    memcpy(&extracted_mtime, key_with_mtime.data() + file_name.size(), sizeof(int64_t));
+    EXPECT_EQ(extracted_mtime, mtime);
+
+    std::string key_with_filesize = FileMetaCache::get_key(file_name, 0, file_size);
+    ASSERT_EQ(key_with_filesize.size(), file_name.size() + sizeof(int64_t));
+    EXPECT_EQ(memcmp(key_with_filesize.data(), file_name.data(), file_name.size()), 0);
+    int64_t extracted_filesize = 0;
+    memcpy(&extracted_filesize, key_with_filesize.data() + file_name.size(), sizeof(int64_t));
+    EXPECT_EQ(extracted_filesize, file_size);
+}
+
+TEST(FileMetaCacheTest, InsertAndLookupWithIntValue) {
+    FileMetaCache cache(1024 * 1024);
+
+    int* value = new int(12345);
+    ObjLRUCache::CacheHandle handle;
+
+    cache.insert("key_int", value, &handle);
+    ASSERT_TRUE(handle.valid());
+
+    const int* cached_val = handle.data<int>();
+    ASSERT_NE(cached_val, nullptr);
+    EXPECT_EQ(*cached_val, 12345);
+
+    ObjLRUCache::CacheHandle handle2;
+    ASSERT_TRUE(cache.lookup("key_int", &handle2));
+
+    const int* cached_val2 = handle2.data<int>();
+    ASSERT_NE(cached_val2, nullptr);
+    EXPECT_EQ(*cached_val2, 12345);
+}
+
+} // namespace doris
\ No newline at end of file
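
Taken together, these tests pin down the intended calling sequence: derive a key, probe
the cache, and only parse and insert on a miss; the returned handle then keeps the entry
alive while it is read. A minimal sketch of that flow; parse_footer and Footer are
hypothetical stand-ins for a concrete format's parser and metadata type:

    // Lookup-or-parse-or-insert, as exercised by the tests above.
    ObjLRUCache::CacheHandle handle;
    std::string key = FileMetaCache::get_key(file_reader, file_description);
    if (!cache.lookup(key, &handle)) {
        Footer* parsed = parse_footer(file_reader); // heap-allocated; the cache takes ownership
        cache.insert(key, parsed, &handle);
    }
    const Footer* footer = handle.data<Footer>(); // valid for as long as the handle is held
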
diff --git a/be/test/vec/exec/format/parquet/parquet_reader_test.cpp b/be/test/vec/exec/format/parquet/parquet_reader_test.cpp
index 9a0c9c22c35c4f..feb922c8f0b312 100644
--- a/be/test/vec/exec/format/parquet/parquet_reader_test.cpp
+++ b/be/test/vec/exec/format/parquet/parquet_reader_test.cpp
@@ -34,6 +34,7 @@
 #include "common/object_pool.h"
 #include "exec/olap_common.h"
 #include "gtest/gtest_pred_impl.h"
+#include "io/fs/file_meta_cache.h"
 #include "io/fs/file_reader_writer_fwd.h"
 #include "io/fs/file_system.h"
 #include "io/fs/local_file_system.h"
@@ -53,7 +54,9 @@ class VExprContext;
 
 class ParquetReaderTest : public testing::Test {
 public:
-    ParquetReaderTest() {}
+    ParquetReaderTest() : cache(1024) {}
+
+    FileMetaCache cache;
 };
 
 static void create_table_desc(TDescriptorTable& t_desc_table, TTableDescriptor& t_table_desc,
@@ -142,8 +145,8 @@ TEST_F(ParquetReaderTest, normal) {
         scan_range.start_offset = 0;
         scan_range.size = 1000;
     }
-    auto p_reader =
-            new ParquetReader(nullptr, scan_params, scan_range, 992, &ctz, nullptr, nullptr);
+    auto p_reader = new ParquetReader(nullptr, scan_params, scan_range, 992, &ctz, nullptr, nullptr,
+                                      &cache);
     p_reader->set_file_reader(reader);
     RuntimeState runtime_state((TQueryGlobals()));
     runtime_state.set_desc_tbl(desc_tbl);
diff --git a/be/test/vec/exec/format/parquet/parquet_thrift_test.cpp b/be/test/vec/exec/format/parquet/parquet_thrift_test.cpp
index 66b13dd74453fb..08c541fca5ef78 100644
--- a/be/test/vec/exec/format/parquet/parquet_thrift_test.cpp
+++ b/be/test/vec/exec/format/parquet/parquet_thrift_test.cpp
@@ -76,7 +76,7 @@ TEST_F(ParquetThriftReaderTest, normal) {
                                        &reader);
     EXPECT_TRUE(st.ok());
 
-    FileMetaData* meta_data;
+    std::unique_ptr<FileMetaData> meta_data;
     size_t meta_size;
     static_cast<void>(parse_thrift_footer(reader, &meta_data, &meta_size, nullptr));
     tparquet::FileMetaData t_metadata = meta_data->to_thrift();
@@ -92,7 +92,6 @@ TEST_F(ParquetThriftReaderTest, normal) {
         LOG(WARNING) << "schema column repetition_type: " << value.repetition_type;
         LOG(WARNING) << "schema column num children: " << value.num_children;
     }
-    delete meta_data;
 }
 
 TEST_F(ParquetThriftReaderTest, complex_nested_file) {
@@ -110,7 +109,7 @@ TEST_F(ParquetThriftReaderTest, complex_nested_file) {
                                        &reader);
     EXPECT_TRUE(st.ok());
 
-    FileMetaData* metadata;
+    std::unique_ptr<FileMetaData> metadata;
     size_t meta_size;
     static_cast<void>(parse_thrift_footer(reader, &metadata, &meta_size, nullptr));
     tparquet::FileMetaData t_metadata = metadata->to_thrift();
@@ -157,7 +156,6 @@ TEST_F(ParquetThriftReaderTest, complex_nested_file) {
 
     ASSERT_EQ(schemaDescriptor.get_column_index("friend"), 3);
     ASSERT_EQ(schemaDescriptor.get_column_index("mark"), 4);
-    delete metadata;
 }
 
 static int fill_nullable_column(ColumnPtr& doris_column, level_t* definitions, size_t num_values) {
@@ -399,7 +397,8 @@ static void read_parquet_data_and_check(const std::string& parquet_file,
     std::unique_ptr<Block> block;
     create_block(block);
-    FileMetaData* metadata;
+
+    std::unique_ptr<FileMetaData> metadata;
     size_t meta_size;
     static_cast<void>(parse_thrift_footer(reader, &metadata, &meta_size, nullptr));
     tparquet::FileMetaData t_metadata = metadata->to_thrift();
@@ -446,7 +445,6 @@ static void read_parquet_data_and_check(const std::string& parquet_file,
     Slice res(result_buf.data(), result->size());
     static_cast<void>(result->read_at(0, res, &bytes_read));
     ASSERT_STREQ(block->dump_data(0, rows).c_str(), reinterpret_cast<char*>(result_buf.data()));
-    delete metadata;
 }
 
 TEST_F(ParquetThriftReaderTest, type_decoder) {
diff --git a/be/test/vec/exec/orc/orc_convert_dict_test.cpp b/be/test/vec/exec/orc/orc_convert_dict_test.cpp
index bce08cc63dbaae..0aea8438048bab 100644
--- a/be/test/vec/exec/orc/orc_convert_dict_test.cpp
+++ b/be/test/vec/exec/orc/orc_convert_dict_test.cpp
@@ -82,7 +82,7 @@ TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnBasic) {
 
     TFileScanRangeParams params;
     TFileRangeDesc range;
-    auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+    auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true);
 
     // Execute conversion
     auto result_column = reader->_convert_dict_column_to_string_column(
@@ -119,7 +119,7 @@ TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnWithNulls) {
 
     TFileScanRangeParams params;
     TFileRangeDesc range;
-    auto _reader = OrcReader::create_unique(params, range, "", nullptr, true);
+    auto _reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true);
 
     // Execute conversion
     auto result_column = _reader->_convert_dict_column_to_string_column(
@@ -151,7 +151,7 @@ TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnChar) {
     auto orc_type_ptr = createPrimitiveType(orc::TypeKind::CHAR);
     TFileScanRangeParams params;
     TFileRangeDesc range;
-    auto _reader = OrcReader::create_unique(params, range, "", nullptr, true);
+    auto _reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true);
 
     // Execute conversion
     auto result_column = _reader->_convert_dict_column_to_string_column(
@@ -182,7 +182,7 @@ TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnEmpty) {
     auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING);
     TFileScanRangeParams params;
     TFileRangeDesc range;
-    auto _reader = OrcReader::create_unique(params, range, "", nullptr, true);
+    auto _reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true);
     // Execute conversion
     auto result_column = _reader->_convert_dict_column_to_string_column(
             dict_column.get(), nullptr, string_batch.get(), orc_type_ptr.get());
@@ -214,7 +214,7 @@ TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnMixed) {
     auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING);
     TFileScanRangeParams params;
     TFileRangeDesc range;
-    auto _reader = OrcReader::create_unique(params, range, "", nullptr, true);
+    auto _reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true);
     // Execute conversion
     auto result_column = _reader->_convert_dict_column_to_string_column(
             dict_column.get(), &null_map, string_batch.get(), orc_type_ptr.get());
diff --git a/be/test/vec/exec/orc/orc_reader_fill_data_test.cpp b/be/test/vec/exec/orc/orc_reader_fill_data_test.cpp
index dcdc07349b3b4d..0c9e5a29eae154 100644
--- a/be/test/vec/exec/orc/orc_reader_fill_data_test.cpp
+++ b/be/test/vec/exec/orc/orc_reader_fill_data_test.cpp
@@ -81,7 +81,7 @@
TEST_F(OrcReaderFillDataTest, TestFillLongColumn) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); MutableColumnPtr xx = column->assume_mutable(); @@ -107,7 +107,7 @@ TEST_F(OrcReaderFillDataTest, TestFillLongColumnWithNull) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); MutableColumnPtr xx = column->assume_mutable(); @@ -161,7 +161,7 @@ TEST_F(OrcReaderFillDataTest, ComplexTypeConversionTest) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); auto doris_struct_type = std::make_shared( std::vector { @@ -247,7 +247,7 @@ TEST_F(OrcReaderFillDataTest, ComplexTypeConversionTest) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); auto doris_struct_type = std::make_shared( std::vector {std::make_shared(), @@ -333,7 +333,7 @@ TEST_F(OrcReaderFillDataTest, ComplexTypeConversionTest) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); auto doris_struct_type = std::make_shared( std::vector {std::make_shared>(18, 5)}, @@ -447,7 +447,7 @@ TEST_F(OrcReaderFillDataTest, ComplexTypeConversionTest) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); auto doris_struct_type = std::make_shared(std::make_shared(), std::make_shared()); diff --git a/be/test/vec/exec/orc/orc_reader_init_column_test.cpp b/be/test/vec/exec/orc/orc_reader_init_column_test.cpp index 841b7fb813e5ef..c4c2ca51a38db7 100644 --- a/be/test/vec/exec/orc/orc_reader_init_column_test.cpp +++ b/be/test/vec/exec/orc/orc_reader_init_column_test.cpp @@ -54,7 +54,7 @@ TEST_F(OrcReaderInitColumnTest, InitReadColumn) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); reader->_reader = std::move(orc_reader); std::vector tmp; tmp.emplace_back("col1"); @@ -73,7 +73,7 @@ TEST_F(OrcReaderInitColumnTest, CheckAcidSchemaTest) { using namespace orc; TFileScanRangeParams params; TFileRangeDesc range; - auto _reader = OrcReader::create_unique(params, range, "", nullptr, true); + auto _reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); // 1. Test standard ACID schema { // Create standard ACID structure @@ -140,7 +140,7 @@ TEST_F(OrcReaderInitColumnTest, RemoveAcidTest) { using namespace orc; TFileScanRangeParams params; TFileRangeDesc range; - auto _reader = OrcReader::create_unique(params, range, "", nullptr, true); + auto _reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); // 1. 
Test removing ACID info from ACID schema { // Create ACID schema diff --git a/be/test/vec/exec/orc_reader_test.cpp b/be/test/vec/exec/orc_reader_test.cpp index ff7452ae625428..45b15cd414ecc3 100644 --- a/be/test/vec/exec/orc_reader_test.cpp +++ b/be/test/vec/exec/orc_reader_test.cpp @@ -23,6 +23,7 @@ #include #include +#include "io/fs/file_meta_cache.h" #include "orc/sargs/SearchArgument.hh" #include "runtime/define_primitive_type.h" #include "runtime/exec_env.h" @@ -36,9 +37,11 @@ namespace doris::vectorized { class OrcReaderTest : public testing::Test { public: - OrcReaderTest() = default; + OrcReaderTest() : cache(1024) {} ~OrcReaderTest() override = default; + FileMetaCache cache; + private: static constexpr const char* CANNOT_PUSH_DOWN_ERROR = "can't push down"; std::string build_search_argument(const std::string& expr) { @@ -65,7 +68,7 @@ class OrcReaderTest : public testing::Test { range.path = "./be/test/exec/test_data/orc_scanner/orders.orc"; range.start_offset = 0; range.size = 1293; - auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + auto reader = OrcReader::create_unique(params, range, "", nullptr, &cache, true); auto status = reader->init_reader(&column_names, nullptr, {}, false, tuple_desc, &row_desc, nullptr, nullptr); EXPECT_TRUE(status.ok()); diff --git a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run19.sql b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run19.sql index 8bd75b2809f55d..dbc0b857f44dcc 100644 --- a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run19.sql +++ b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run19.sql @@ -362,4 +362,44 @@ VALUES (1, 'z', 0.00), (7, 'n1', -1.23), (8, 'n20', -20.00), (9, 'big', 9999999.99), - (10, 'null', NULL); \ No newline at end of file + (10, 'null', NULL); + + +create database if not exists demo.test_db; +use demo.test_db; + + +CREATE TABLE test_invalid_avro_name_parquet ( + id INT, + `TEST:A1B2.RAW.ABC-GG-1-A` STRING +) +USING iceberg +TBLPROPERTIES( + 'write.format.default' = 'parquet' +); + + +CREATE TABLE test_invalid_avro_name_orc ( + id INT, + `TEST:A1B2.RAW.ABC-GG-1-A` STRING +) +USING iceberg +TBLPROPERTIES( + 'write.format.default' = 'orc' +); + +INSERT INTO test_invalid_avro_name_parquet VALUES + (1, 'row1'), + (2, 'row2'), + (3, 'row3'), + (4, 'row4'), + (5, 'row5'); + +INSERT INTO test_invalid_avro_name_orc VALUES + (1, 'row1'), + (2, 'row2'), + (3, 'row3'), + (4, 'row4'), + (5, 'row5'); + + diff --git a/regression-test/data/external_table_p0/hive/test_external_catalog_hive.out b/regression-test/data/external_table_p0/hive/test_external_catalog_hive.out index a55a5bfee1a086..ba84b42aa43feb 100644 --- a/regression-test/data/external_table_p0/hive/test_external_catalog_hive.out +++ b/regression-test/data/external_table_p0/hive/test_external_catalog_hive.out @@ -107,6 +107,42 @@ a126 15 -- !par_fields_in_file_parquet5 -- +-- !par_fields_in_file_orc1 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet1 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc2 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet2 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc3 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet3 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc4 -- +1 Alice 100.0 
2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet4 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc5 -- + +-- !par_fields_in_file_parquet5 -- + -- !parquet_adjusted_utc -- 1997-09-21 1999-01-12T15:12:31.235784 1998-01-12 1993-06-11T11:33:12.356500 @@ -225,6 +261,42 @@ a126 15 -- !par_fields_in_file_parquet5 -- +-- !par_fields_in_file_orc1 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet1 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc2 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet2 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc3 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet3 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc4 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet4 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc5 -- + +-- !par_fields_in_file_parquet5 -- + -- !parquet_adjusted_utc -- 1997-09-21 1999-01-12T15:12:31.235784 1998-01-12 1993-06-11T11:33:12.356500 diff --git a/regression-test/data/external_table_p0/hive/test_file_meta_cache.out b/regression-test/data/external_table_p0/hive/test_file_meta_cache.out new file mode 100644 index 00000000000000..0c00303595648f --- /dev/null +++ b/regression-test/data/external_table_p0/hive/test_file_meta_cache.out @@ -0,0 +1,65 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !1 -- +1 a +2 b + +-- !2 -- +3 c +4 d + +-- !3 -- +5 e +6 f + +-- !4 -- +7 g +8 h + +-- !1 -- +1 a +2 b + +-- !2 -- +3 c +4 d + +-- !3 -- +5 e +6 f + +-- !4 -- +7 g +8 h + +-- !1 -- +1 a +2 b + +-- !2 -- +3 c +4 d + +-- !3 -- +5 e +6 f + +-- !4 -- +7 g +8 h + +-- !1 -- +1 a +2 b + +-- !2 -- +3 c +4 d + +-- !3 -- +5 e +6 f + +-- !4 -- +7 g +8 h + diff --git a/regression-test/data/external_table_p0/iceberg/test_iceberg_invaild_avro_name.out b/regression-test/data/external_table_p0/iceberg/test_iceberg_invaild_avro_name.out new file mode 100644 index 00000000000000..1b8af743c6a0f2 --- /dev/null +++ b/regression-test/data/external_table_p0/iceberg/test_iceberg_invaild_avro_name.out @@ -0,0 +1,57 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !desc -- +id int Yes true \N +test:a1b2.raw.abc-gg-1-a text Yes true \N + +-- !q_1 -- +1 row1 +2 row2 +3 row3 +4 row4 +5 row5 + +-- !q_2 -- +3 row3 + +-- !q_3 -- +1 +2 +3 +4 +5 + +-- !q_4 -- +5 row5 +4 row4 +3 row3 +2 row2 +1 row1 + +-- !desc -- +id int Yes true \N +test:a1b2.raw.abc-gg-1-a text Yes true \N + +-- !q_1 -- +1 row1 +2 row2 +3 row3 +4 row4 +5 row5 + +-- !q_2 -- +3 row3 + +-- !q_3 -- +1 +2 +3 +4 +5 + +-- !q_4 -- +5 row5 +4 row4 +3 row3 +2 row2 +1 row1 + diff --git a/regression-test/suites/external_table_p0/hive/test_external_catalog_hive.groovy b/regression-test/suites/external_table_p0/hive/test_external_catalog_hive.groovy index 16c8c1f6707d04..0f30fd0e504e64 100644 --- a/regression-test/suites/external_table_p0/hive/test_external_catalog_hive.groovy +++ b/regression-test/suites/external_table_p0/hive/test_external_catalog_hive.groovy @@ -108,6 +108,8 @@ suite("test_external_catalog_hive", "p0,external,hive,external_docker,external_d //qt_null_expr_dict_filter_orc """ select count(*), count(distinct user_no) from multi_catalog.dict_fitler_test_orc WHERE `partitions` in ('2023-08-21') and actual_intf_type = 'type1' and (REUSE_FLAG<> 'y' or REUSE_FLAG is null); """ //qt_null_expr_dict_filter_parquet """ select count(*), count(distinct user_no) from multi_catalog.dict_fitler_test_parquet WHERE `partitions` in ('2023-08-21') and actual_intf_type = 'type1' and (REUSE_FLAG<> 'y' or REUSE_FLAG is null); """ + sql """set hive_orc_use_column_names = true """ + sql """set hive_parquet_use_column_names = true """ // test par fields in file qt_par_fields_in_file_orc1 """ select * from multi_catalog.par_fields_in_file_orc where year = 2023 and month = 8 order by id; """ qt_par_fields_in_file_parquet1 """ select * from multi_catalog.par_fields_in_file_parquet where year = 2023 and month = 8 order by id; """ @@ -120,6 +122,22 @@ suite("test_external_catalog_hive", "p0,external,hive,external_docker,external_d qt_par_fields_in_file_orc5 """ select * from multi_catalog.par_fields_in_file_orc where month = 8 and year = 2022 order by id; """ qt_par_fields_in_file_parquet5 """ select * from multi_catalog.par_fields_in_file_parquet where month = 8 and year = 2022 order by id; """ + sql """set hive_orc_use_column_names = false; """ + sql """set hive_parquet_use_column_names = false""" + qt_par_fields_in_file_orc1 """ select * from multi_catalog.par_fields_in_file_orc where year = 2023 and month = 8 order by id; """ + qt_par_fields_in_file_parquet1 """ select * from multi_catalog.par_fields_in_file_parquet where year = 2023 and month = 8 order by id; """ + qt_par_fields_in_file_orc2 """ select * from multi_catalog.par_fields_in_file_orc where year = 2023 order by id; """ + qt_par_fields_in_file_parquet2 """ select * from multi_catalog.par_fields_in_file_parquet where year = 2023 order by id; """ + qt_par_fields_in_file_orc3 """ select * from multi_catalog.par_fields_in_file_orc where month = 8 order by id; """ + qt_par_fields_in_file_parquet3 """ select * from multi_catalog.par_fields_in_file_parquet where month = 8 order by id; """ + qt_par_fields_in_file_orc4 """ select * from multi_catalog.par_fields_in_file_orc where month = 8 and year >= 2022 order by id; """ + qt_par_fields_in_file_parquet4 """ select * from multi_catalog.par_fields_in_file_parquet where month = 8 and year >= 2022 order by id; """ + qt_par_fields_in_file_orc5 """ select * from multi_catalog.par_fields_in_file_orc where month = 8 and year = 2022 order by id; """ + 
qt_par_fields_in_file_parquet5 """ select * from multi_catalog.par_fields_in_file_parquet where month = 8 and year = 2022 order by id; """
+
+
+    sql """set hive_orc_use_column_names = true """
+    sql """set hive_parquet_use_column_names = true """
 
     // timestamp with isAdjustedToUTC=true
     qt_parquet_adjusted_utc """select * from multi_catalog.timestamp_with_time_zone order by date_col;"""
diff --git a/regression-test/suites/external_table_p0/hive/test_file_meta_cache.groovy b/regression-test/suites/external_table_p0/hive/test_file_meta_cache.groovy
new file mode 100644
index 00000000000000..6fc88e63d54721
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_file_meta_cache.groovy
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_file_meta_cache", "p0,external,hive,external_docker,external_docker_hive") {
+
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("disable Hive test.")
+        return;
+    }
+
+    for (String fileFormat : ["PARQUET", "ORC"]) {
+        for (String hivePrefix : ["hive2", "hive3"]) {
+            setHivePrefix(hivePrefix)
+            try {
+                String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+                String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
+
+                sql """ drop catalog if exists test_file_meta_cache """
+                sql """CREATE CATALOG test_file_meta_cache PROPERTIES (
+                    'type'='hms',
+                    'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+                );"""
+
+                hive_docker """show databases;"""
+                hive_docker """drop table if exists default.test_file_meta_cache; """
+                hive_docker """
+                    create table default.test_file_meta_cache (col1 int, col2 string) STORED AS ${fileFormat};
+                """
+                hive_docker """insert into default.test_file_meta_cache values (1, "a"),(2, "b"); """
+
+                sql """ refresh catalog test_file_meta_cache """
+                qt_1 """ select * from test_file_meta_cache.`default`.test_file_meta_cache order by col1 ; """
+
+                hive_docker """ TRUNCATE TABLE test_file_meta_cache """
+                hive_docker """insert into default.test_file_meta_cache values (3, "c"), (4, "d"); """
+
+                sql """ refresh catalog test_file_meta_cache """
+                qt_2 """ select * from test_file_meta_cache.`default`.test_file_meta_cache order by col1 ; """
+
+                hive_docker """ drop TABLE test_file_meta_cache """
+                hive_docker """
+                    create table default.test_file_meta_cache (col1 int, col2 string) STORED AS PARQUET;
+                """
+                hive_docker """insert into default.test_file_meta_cache values (5, "e"), (6, "f"); """
+
+                sql """ refresh catalog test_file_meta_cache """
+                qt_3 """ select * from test_file_meta_cache.`default`.test_file_meta_cache order by col1 ; """
+
+                hive_docker """ INSERT OVERWRITE TABLE test_file_meta_cache values
(7,'g'), (8, 'h'); """ + + sql """ refresh catalog test_file_meta_cache """ + qt_4 """ select * from test_file_meta_cache.`default`.test_file_meta_cache order by col1 ; """ + + + } finally { + } + } + } + +} + diff --git a/regression-test/suites/external_table_p0/iceberg/test_iceberg_invaild_avro_name.groovy b/regression-test/suites/external_table_p0/iceberg/test_iceberg_invaild_avro_name.groovy new file mode 100644 index 00000000000000..cf79d6d1fef725 --- /dev/null +++ b/regression-test/suites/external_table_p0/iceberg/test_iceberg_invaild_avro_name.groovy @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_iceberg_invaild_avro_name", "p0,external,doris,external_docker,external_docker_doris") { + + String enabled = context.config.otherConfigs.get("enableIcebergTest") + if (enabled == null || !enabled.equalsIgnoreCase("true")) { + logger.info("disable iceberg test.") + return + } + + String catalog_name = "test_iceberg_invaild_avro_name" + String rest_port = context.config.otherConfigs.get("iceberg_rest_uri_port") + String minio_port = context.config.otherConfigs.get("iceberg_minio_port") + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + + sql """drop catalog if exists ${catalog_name}""" + sql """ + CREATE CATALOG ${catalog_name} PROPERTIES ( + 'type'='iceberg', + 'iceberg.catalog.type'='rest', + 'uri' = 'http://${externalEnvIp}:${rest_port}', + "s3.access_key" = "admin", + "s3.secret_key" = "password", + "s3.endpoint" = "http://${externalEnvIp}:${minio_port}", + "s3.region" = "us-east-1" + );""" + + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + sql """ use test_db;""" + + def tables = ["test_invalid_avro_name_parquet", "test_invalid_avro_name_orc"] + + + for (String table: tables) { + qt_desc """ desc ${table} """ + qt_q_1 """ SELECT * FROM ${table} order by id;""" + qt_q_2 """ SELECT * FROM ${table} WHERE `TEST:A1B2.RAW.ABC-GG-1-A` = 'row3' order by id;""" + qt_q_3 """ SELECT id FROM ${table} WHERE `TEST:A1B2.RAW.ABC-GG-1-A` LIKE 'row%' order by id;""" + qt_q_4 """ SELECT * FROM ${table} ORDER BY `TEST:A1B2.RAW.ABC-GG-1-A` DESC;""" + + } +} + + + + + + + +
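
A closing note on the Hive regression test above: it never flushes the BE-side cache
between steps, and it does not need to. The cache key embeds the file's mtime (or its
size when mtime is 0), so TRUNCATE and INSERT OVERWRITE produce files whose keys cannot
collide with the stale entries, which simply age out of the LRU unused. A sketch of that
property with illustrative values:

    // Rewriting a file changes its mtime, so the old entry is never hit again.
    std::string k_old = FileMetaCache::get_key("/warehouse/t/f0.parquet", /*mtime*/ 1700000000, /*size*/ 4096);
    std::string k_new = FileMetaCache::get_key("/warehouse/t/f0.parquet", /*mtime*/ 1700000500, /*size*/ 4096);
    assert(k_old != k_new);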