1717
1818#include " collection_statistics.h"
1919
20+ #include < sstream>
21+
2022#include " common/exception.h"
2123#include " olap/rowset/rowset.h"
2224#include " olap/rowset/rowset_reader.h"
@@ -35,21 +37,22 @@ namespace doris {
3537Status CollectionStatistics::collect (
3638 RuntimeState* state, const std::vector<RowSetSplits>& rs_splits,
3739 const TabletSchemaSPtr& tablet_schema,
38- const vectorized::VExprContextSPtrs& common_expr_ctxs_push_down) {
40+ const vectorized::VExprContextSPtrs& common_expr_ctxs_push_down, io::IOContext* io_ctx ) {
3941 std::unordered_map<std::wstring, CollectInfo> collect_infos;
4042 RETURN_IF_ERROR (
4143 extract_collect_info (state, common_expr_ctxs_push_down, tablet_schema, &collect_infos));
44+ if (collect_infos.empty ()) {
45+ LOG (WARNING) << " Index statistics collection: no collect info extracted." ;
46+ return Status::OK ();
47+ }
4248
4349 for (const auto & rs_split : rs_splits) {
4450 const auto & rs_reader = rs_split.rs_reader ;
4551 auto rowset = rs_reader->rowset ();
46- auto rowset_meta = rowset->rowset_meta ();
47-
4852 auto num_segments = rowset->num_segments ();
4953 for (int32_t seg_id = 0 ; seg_id < num_segments; ++seg_id) {
50- auto seg_path = DORIS_TRY (rowset->segment_path (seg_id));
51- auto status = process_segment (seg_path, rowset_meta->fs (), tablet_schema.get (),
52- collect_infos);
54+ auto status =
55+ process_segment (rowset, seg_id, tablet_schema.get (), collect_infos, io_ctx);
5356 if (!status.ok ()) {
5457 if (status.code () == ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND ||
5558 status.code () == ErrorCode::INVERTED_INDEX_BYPASS) {
@@ -62,15 +65,29 @@ Status CollectionStatistics::collect(
6265 }
6366
6467#ifndef NDEBUG
65- LOG (INFO) << " term_num_docs: " << _total_num_docs;
68+ std::stringstream ss;
69+ ss << " term_num_docs: " << _total_num_docs;
6670 for (const auto & [ws_field_name, num_tokens] : _total_num_tokens) {
67- LOG (INFO) << " field_name: " << StringHelper::to_string (ws_field_name)
68- << " , num_tokens: " << num_tokens;
69- for (const auto & [term, doc_freq] : _term_doc_freqs.at (ws_field_name)) {
70- LOG (INFO) << " term: " << StringHelper::to_string (term) << " , doc_freq: " << doc_freq;
71+ ss << " , [field_name: " << StringHelper::to_string (ws_field_name)
72+ << " , num_tokens: " << num_tokens;
73+ auto it = _term_doc_freqs.find (ws_field_name);
74+ if (it != _term_doc_freqs.end ()) {
75+ ss << " , terms: {" ;
76+ bool first = true ;
77+ for (const auto & [term, doc_freq] : it->second ) {
78+ if (!first) {
79+ ss << " , " ;
80+ }
81+ ss << StringHelper::to_string (term) << " : " << doc_freq;
82+ first = false ;
83+ }
84+ ss << " }" ;
85+ } else {
86+ ss << " , (no term stats)" ;
7187 }
88+ ss << " ]" ;
7289 }
73- LOG (INFO) << " -------------------------------- " ;
90+ LOG (INFO) << " CollectionStatistics: " << ss. str () ;
7491#endif
7592
7693 return Status::OK ();
@@ -136,6 +153,11 @@ Status handle_match_pred(RuntimeState* state, const TabletSchemaSPtr& tablet_sch
136153
137154 auto term_infos = InvertedIndexAnalyzer::get_analyse_result (
138155 right_literal->value (format_options), index_meta->properties ());
156+ if (term_infos.empty ()) {
157+ LOG (WARNING) << " Index statistics collection: no terms extracted from literal value, "
158+ << " col_unique_id=" << index_meta->col_unique_ids ()[0 ];
159+ continue ;
160+ }
139161
140162 std::string field_name = std::to_string (index_meta->col_unique_ids ()[0 ]);
141163 if (!column.suffix_path ().empty ()) {
@@ -188,18 +210,22 @@ Status CollectionStatistics::extract_collect_info(
188210}
189211
190212Status CollectionStatistics::process_segment (
191- const std::string& seg_path, const io::FileSystemSPtr& fs,
192- const TabletSchema* tablet_schema,
193- const std::unordered_map<std::wstring, CollectInfo>& collect_infos) {
213+ const RowsetSharedPtr& rowset, int32_t seg_id, const TabletSchema* tablet_schema,
214+ const std::unordered_map<std::wstring, CollectInfo>& collect_infos, io::IOContext* io_ctx) {
215+ auto seg_path = DORIS_TRY (rowset->segment_path (seg_id));
216+ auto rowset_meta = rowset->rowset_meta ();
217+
194218 auto idx_file_reader = std::make_unique<IndexFileReader>(
195- fs, std::string {InvertedIndexDescriptor::get_index_file_path_prefix (seg_path)},
196- tablet_schema->get_inverted_index_storage_format ());
197- RETURN_IF_ERROR (idx_file_reader->init ());
219+ rowset_meta->fs (),
220+ std::string {InvertedIndexDescriptor::get_index_file_path_prefix (seg_path)},
221+ tablet_schema->get_inverted_index_storage_format (),
222+ rowset_meta->inverted_index_file_info (seg_id));
223+ RETURN_IF_ERROR (idx_file_reader->init (config::inverted_index_read_buffer_size, io_ctx));
198224
199225 int32_t total_seg_num_docs = 0 ;
200226 for (const auto & [ws_field_name, collect_info] : collect_infos) {
201227#ifdef BE_TEST
202- auto compound_reader = DORIS_TRY (idx_file_reader->open (collect_info.index_meta , nullptr ));
228+ auto compound_reader = DORIS_TRY (idx_file_reader->open (collect_info.index_meta , io_ctx ));
203229 auto * reader = lucene::index::IndexReader::open (compound_reader.get ());
204230 auto index_searcher = std::make_shared<lucene::search::IndexSearcher>(reader, true );
205231
@@ -211,7 +237,7 @@ Status CollectionStatistics::process_segment(
211237 if (!InvertedIndexSearcherCache::instance ()->lookup (searcher_cache_key,
212238 &inverted_index_cache_handle)) {
213239 auto compound_reader =
214- DORIS_TRY (idx_file_reader->open (collect_info.index_meta , nullptr ));
240+ DORIS_TRY (idx_file_reader->open (collect_info.index_meta , io_ctx ));
215241 auto * reader = lucene::index::IndexReader::open (compound_reader.get ());
216242 size_t reader_size = reader->getTermInfosRAMUsed ();
217243 auto index_searcher = std::make_shared<lucene::search::IndexSearcher>(reader, true );
@@ -231,7 +257,7 @@ Status CollectionStatistics::process_segment(
231257 index_reader->sumTotalTermFreq (ws_field_name.c_str ()).value_or (0 );
232258
233259 for (const auto & term_info : collect_info.term_infos ) {
234- auto iter = TermIterator::create (nullptr , false , index_reader, ws_field_name,
260+ auto iter = TermIterator::create (io_ctx , false , index_reader, ws_field_name,
235261 term_info.get_single_term ());
236262 _term_doc_freqs[ws_field_name][iter->term ()] += iter->doc_freq ();
237263 }
0 commit comments