Add 'source_code_default' tokenizer with preset filters. (#3655)
* Add 'source_code_default' tokenizer with preset filters.

* Fix compile.

* Remove http source.

* Fix tokenizer tests.
fmassot authored Jul 18, 2023
1 parent e45bf58 commit 24010ff
Showing 2 changed files with 9 additions and 6 deletions.
@@ -20,7 +20,7 @@
 use anyhow::Context;
 #[cfg(feature = "multilang")]
 use quickwit_query::MultiLangTokenizer;
-use quickwit_query::DEFAULT_REMOVE_TOKEN_LENGTH;
+use quickwit_query::{CodeTokenizer, DEFAULT_REMOVE_TOKEN_LENGTH};
 use serde::{Deserialize, Serialize};
 use tantivy::tokenizer::{
     AsciiFoldingFilter, LowerCaser, NgramTokenizer, RegexTokenizer, RemoveLongFilter,
@@ -55,6 +55,7 @@ impl TokenizerConfig {
             TokenizerType::Multilang => {
                 TextAnalyzer::builder(MultiLangTokenizer::default()).dynamic()
             }
+            TokenizerType::SourceCode => TextAnalyzer::builder(CodeTokenizer::default()).dynamic(),
             TokenizerType::Ngram(options) => {
                 let tokenizer =
                     NgramTokenizer::new(options.min_gram, options.max_gram, options.prefix_only)
@@ -126,11 +127,12 @@ impl TokenFilterType {
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize, utoipa::ToSchema)]
 #[serde(tag = "type", rename_all = "snake_case")]
 pub enum TokenizerType {
-    Simple,
     #[cfg(feature = "multilang")]
     Multilang,
     Ngram(NgramTokenizerOption),
     Regex(RegexTokenizerOption),
+    Simple,
+    SourceCode,
 }

 #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)]
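Because the enum is internally tagged with snake_case renaming, the new variant is selected by "type": "source_code" in a tokenizer config. A minimal, self-contained sketch of that serde behavior (not the actual Quickwit code; the option-carrying variants are omitted here for brevity):

use serde::Deserialize;

// Trimmed-down mirror of the TokenizerType enum above, just to show how
// the internally tagged, snake_case serde representation selects variants.
#[derive(Debug, Deserialize, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
enum TokenizerType {
    Simple,
    SourceCode,
}

fn main() {
    let parsed: TokenizerType =
        serde_json::from_str(r#"{ "type": "source_code" }"#).unwrap();
    assert_eq!(parsed, TokenizerType::SourceCode);
}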
9 changes: 5 additions & 4 deletions quickwit/quickwit-query/src/tokenizers/mod.rs
@@ -24,7 +24,7 @@ mod multilang;

 use once_cell::sync::Lazy;
 use tantivy::tokenizer::{
-    LowerCaser, RawTokenizer, RemoveLongFilter, TextAnalyzer, TokenizerManager,
+    AsciiFoldingFilter, LowerCaser, RawTokenizer, RemoveLongFilter, TextAnalyzer, TokenizerManager,
 };

 use self::chinese_compatible::ChineseTokenizer;
@@ -48,10 +48,11 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
         .build();
     tokenizer_manager.register("chinese_compatible", chinese_tokenizer);
     tokenizer_manager.register(
-        "source_code",
+        "source_code_default",
         TextAnalyzer::builder(CodeTokenizer::default())
             .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
             .filter(LowerCaser)
+            .filter(AsciiFoldingFilter)
             .build(),
     );
     #[cfg(feature = "multilang")]
@@ -121,13 +122,13 @@ mod tests {
     #[test]
     fn test_code_tokenizer_in_tokenizer_manager() {
         let mut code_tokenizer = super::create_default_quickwit_tokenizer_manager()
-            .get("source_code")
+            .get("source_code_default")
             .unwrap();
         let mut token_stream = code_tokenizer.token_stream("PigCaféFactory2");
         let mut tokens = Vec::new();
         while let Some(token) = token_stream.next() {
             tokens.push(token.text.to_string());
         }
-        assert_eq!(tokens, vec!["pig", "café", "factory", "2"])
+        assert_eq!(tokens, vec!["pig", "cafe", "factory", "2"])
     }
 }
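Since the registered analyzer now ends with AsciiFoldingFilter, accents are folded to ASCII before indexing, which is why the expected token for "Café" changes from "café" to "cafe" in the test above. A rough sketch of the same preset filter chain built from stock tantivy components only — SimpleTokenizer stands in for Quickwit's CodeTokenizer, and 255 for DEFAULT_REMOVE_TOKEN_LENGTH; both substitutions are assumptions made to keep the example self-contained:

use tantivy::tokenizer::{
    AsciiFoldingFilter, LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer,
};

fn main() {
    // Same filter order as the "source_code_default" registration above:
    // drop overlong tokens, lowercase, then fold accents to ASCII.
    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(255)) // stand-in for DEFAULT_REMOVE_TOKEN_LENGTH
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
        .build();

    let mut stream = analyzer.token_stream("Café PARSER");
    while let Some(token) = stream.next() {
        println!("{}", token.text); // prints "cafe", then "parser"
    }
}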
