diff --git a/bindings.h b/bindings.h
index bb33643..47ec75c 100644
--- a/bindings.h
+++ b/bindings.h
@@ -49,6 +49,11 @@ void context_register_text_analyzer_simple(struct TantivyContext *context_ptr,
                                            const char *lang_str_ptr,
                                            char **error_buffer);
 
+void context_register_jieba_tokenizer(struct TantivyContext *context_ptr,
+                                      const char *tokenizer_name_ptr,
+                                      uintptr_t text_limit,
+                                      char **error_buffer);
+
 void context_register_text_analyzer_raw(struct TantivyContext *context_ptr,
                                         const char *tokenizer_name_ptr,
                                         char **error_buffer);
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index d68190b..8beaac8 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -20,4 +20,5 @@ serde_json = "1.0.117"
 serde = { version = "1.0.203", features = ["derive"] }
 unicode-segmentation = "1.11.0"
 logcall = "0.1"
-lazy_static = "1.5.0"
\ No newline at end of file
+lazy_static = "1.5.0"
+tantivy-jieba = "0.11.0"
\ No newline at end of file
diff --git a/rust/src/c_util/util.rs b/rust/src/c_util/util.rs
index 1f3ed3e..e6a9754 100644
--- a/rust/src/c_util/util.rs
+++ b/rust/src/c_util/util.rs
@@ -52,7 +52,7 @@ fn process_c_str<'a>(str_ptr: *const c_char, error_buffer: *mut *mut c_char) ->
     }
 }
 
-pub fn assert_string<'a>(str_ptr: *const c_char, error_buffer: *mut *mut c_char) -> Option<String> {
+pub fn assert_string(str_ptr: *const c_char, error_buffer: *mut *mut c_char) -> Option<String> {
     match process_c_str(str_ptr, error_buffer) {
         Ok(valid_str) => Some(valid_str.to_owned()),
         Err(_) => None,
@@ -261,8 +261,14 @@ pub fn add_and_consume_documents(
         return;
     }
 
-    if writer.commit().is_err() {
-        rollback(error_buffer, writer, "Failed to commit the document");
-    }
+    commit(writer, "Failed to commit the document", error_buffer)
+}
+
+fn commit(writer: &mut IndexWriter, message: &str, error_buffer: *mut *mut c_char) {
+    let result = writer.commit();
+
+    if result.is_err() {
+        rollback(error_buffer, writer, format!("{}: {}", message, result.unwrap_err()).as_str());
+    }
 }
 
@@ -297,9 +303,7 @@ pub fn delete_docs(
         return;
     }
 
-    if context.writer.commit().is_err() {
-        rollback(error_buffer, &mut context.writer, "Failed to commit removing");
-    }
+    commit(&mut context.writer, "Failed to commit removing", error_buffer);
 }
 
 fn rollback(
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index 8605865..f962359 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -5,7 +5,7 @@ use logcall::logcall;
 use tantivy::{schema::*};
 
 use crate::c_util::{add_and_consume_documents, add_field, assert_pointer, assert_str, assert_string, box_from, convert_document_as_json, create_context_with_schema, delete_docs, drop_any, get_doc, search, set_error, start_lib_init};
-use crate::tantivy_util::{add_text_field, Document, register_edge_ngram_tokenizer, register_ngram_tokenizer, register_raw_tokenizer, register_simple_tokenizer, SearchResult, TantivyContext};
+use crate::tantivy_util::{add_text_field, Document, register_edge_ngram_tokenizer, register_ngram_tokenizer, register_raw_tokenizer, register_simple_tokenizer, register_jieba_tokenizer, SearchResult, TantivyContext};
 
 mod tantivy_util;
 mod c_util;
@@ -186,6 +186,27 @@ pub extern "C" fn context_register_text_analyzer_simple(
     register_simple_tokenizer(text_limit, &context.index, tokenizer_name.as_str(), lang);
 }
 
+#[logcall]
+#[no_mangle]
+pub extern "C" fn context_register_jieba_tokenizer(
+    context_ptr: *mut TantivyContext,
+    tokenizer_name_ptr: *const c_char,
+    text_limit: usize,
+    error_buffer: *mut *mut c_char,
+) {
+    let context = match assert_pointer(context_ptr, error_buffer) {
+        Some(value) => value,
+        None => return
+    };
+
+    let tokenizer_name = match assert_string(tokenizer_name_ptr, error_buffer) {
+        Some(value) => value,
+        None => return
+    };
+
+    register_jieba_tokenizer(text_limit, &context.index, tokenizer_name.as_str());
+}
+
 #[logcall]
 #[no_mangle]
 pub extern "C" fn context_register_text_analyzer_raw(
diff --git a/rust/src/tantivy_util/mod.rs b/rust/src/tantivy_util/mod.rs
index 47a5876..7789e3a 100644
--- a/rust/src/tantivy_util/mod.rs
+++ b/rust/src/tantivy_util/mod.rs
@@ -20,6 +20,7 @@ pub use self::scheme_builder::add_text_field;
 pub use self::scheme::get_string_field_entry;
 pub use self::tokenizer::register_edge_ngram_tokenizer;
 pub use self::tokenizer::register_simple_tokenizer;
+pub use self::tokenizer::register_jieba_tokenizer;
 pub use self::tokenizer::register_raw_tokenizer;
 pub use self::tokenizer::register_ngram_tokenizer;
 pub use self::util::extract_text_from_owned_value;
diff --git a/rust/src/tantivy_util/tokenizer.rs b/rust/src/tantivy_util/tokenizer.rs
index 3858894..363bfa2 100644
--- a/rust/src/tantivy_util/tokenizer.rs
+++ b/rust/src/tantivy_util/tokenizer.rs
@@ -1,5 +1,5 @@
 use tantivy::{Index, TantivyError};
-use tantivy::tokenizer::{AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};
+use tantivy::tokenizer::{AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter, SimpleTokenizer, Stemmer, TextAnalyzer};
 use crate::tantivy_util::{EdgeNgramTokenizer};
 use crate::tantivy_util::stemmer::create_stemmer;
 
@@ -12,13 +12,13 @@ pub fn register_edge_ngram_tokenizer(
     max_gram: usize,
     limit: usize,
     index: &Index,
-    tokenizer_name: &str
+    tokenizer_name: &str,
 ) {
     let text_analyzer = TextAnalyzer::builder(
         EdgeNgramTokenizer::new(
             min_gram,
             max_gram,
-            limit
+            limit,
         ))
         .filter(LowerCaser)
         .filter(AsciiFoldingFilter)
@@ -31,7 +31,7 @@ pub fn register_simple_tokenizer(
     text_limit: usize,
     index: &Index,
     tokenizer_name: &str,
-    lang: &str
+    lang: &str,
 ) {
     let text_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
         .filter(RemoveLongFilter::limit(text_limit))
@@ -43,6 +43,20 @@ pub fn register_simple_tokenizer(
     register_tokenizer(index, tokenizer_name, text_analyzer);
 }
 
+pub fn register_jieba_tokenizer(
+    text_limit: usize,
+    index: &Index,
+    tokenizer_name: &str,
+) {
+    let text_analyzer = TextAnalyzer::builder(tantivy_jieba::JiebaTokenizer {})
+        .filter(RemoveLongFilter::limit(text_limit))
+        .filter(LowerCaser)
+        .filter(Stemmer::default())
+        .build();
+
+    register_tokenizer(index, tokenizer_name, text_analyzer);
+}
+
 pub fn register_raw_tokenizer(index: &Index, tokenizer_name: &str) {
     let text_analyzer = TextAnalyzer::builder(RawTokenizer::default()).build();
     register_tokenizer(index, tokenizer_name, text_analyzer);
@@ -53,9 +67,8 @@ pub fn register_ngram_tokenizer(
     max_gram: usize,
     prefix_only: bool,
     index: &Index,
-    tokenizer_name: &str
+    tokenizer_name: &str,
 ) -> Result<(), TantivyError> {
-
     let tokenizer = NgramTokenizer::new(
         min_gram,
         max_gram,
diff --git a/tantivy.go b/tantivy.go
index 69fd835..ee53b90 100644
--- a/tantivy.go
+++ b/tantivy.go
@@ -22,6 +22,7 @@ import (
 const TokenizerSimple = "simple"
 const TokenizerNgram = "ngram"
+const TokenizerJieba = "jieba"
 const TokenizerEdgeNgram = "edge_ngram"
 const TokenizerRaw = "raw"
 
diff --git a/tantivy_test.go b/tantivy_test.go
index e707823..f88192c 100644
--- a/tantivy_test.go
+++ b/tantivy_test.go
@@ -15,6 +15,8 @@ import (
 const NameBody = "body"
 const NameId = "id"
 const NameTitle = "title"
+const NameBodyCh = "bodyCh"
+const NameTitleCh = "titleCh"
 
 const limit = 40
 const minGram = 2
@@ -325,12 +327,38 @@ func Test(t *testing.T) {
        docs, err = tc.NumDocs()
        require.Equal(t, uint64(0), docs)
    })
+
+   t.Run("docs search - when jieba", func(t *testing.T) {
+       _, tc := fx(t, limit, 1, false)
+
+       defer tc.Free()
+
+       doc, err := addDoc(t, "", "张华考上了北京大学;李萍进了中等技术学校;我在百货公司当售货员:我们都有光明的前途", "1", tc)
+       require.NoError(t, err)
+
+       doc2, err := addDoc(t, "张华考上了北京大学;李萍进了中等技术学校;我在百货公司当售货员:我们都有光明的前途", "", "2", tc)
+       require.NoError(t, err)
+
+       err = tc.AddAndConsumeDocuments(doc, doc2)
+       require.NoError(t, err)
+
+       docs, err := tc.NumDocs()
+       require.NoError(t, err)
+       require.Equal(t, uint64(2), docs)
+
+       result, err := tc.Search("售货员", 100, true, NameBodyCh, NameTitleCh)
+       require.NoError(t, err)
+
+       size, err := result.GetSize()
+       defer result.Free()
+       require.Equal(t, 2, int(size))
+   })
 }
 
 func addDoc(
    t *testing.T,
    title string,
-   name string,
+   body string,
    id string,
    tc *tantivy_go.TantivyContext,
 ) (*tantivy_go.Document, error) {
@@ -339,10 +367,16 @@
    err := doc.AddField(NameTitle, title, tc)
    require.NoError(t, err)
 
+   err = doc.AddField(NameTitleCh, title, tc)
+   require.NoError(t, err)
+
    err = doc.AddField(NameId, id, tc)
    require.NoError(t, err)
 
-   err = doc.AddField(NameBody, name, tc)
+   err = doc.AddField(NameBody, body, tc)
+   require.NoError(t, err)
+
+   err = doc.AddField(NameBodyCh, body, tc)
    return doc, err
 }
 
@@ -367,6 +401,16 @@ func fx(
    )
    require.NoError(t, err)
+   err = builder.AddTextField(
+       NameTitleCh,
+       true,
+       true,
+       false,
+       tantivy_go.IndexRecordOptionWithFreqsAndPositions,
+       tantivy_go.TokenizerJieba,
+   )
+   require.NoError(t, err)
+
    err = builder.AddTextField(
        NameId,
        true,
        false,
@@ -387,6 +431,16 @@
    )
    require.NoError(t, err)
 
+   err = builder.AddTextField(
+       NameBodyCh,
+       true,
+       true,
+       false,
+       tantivy_go.IndexRecordOptionWithFreqsAndPositions,
+       tantivy_go.TokenizerJieba,
+   )
+   require.NoError(t, err)
+
    schema, err := builder.BuildSchema()
    require.NoError(t, err)
 
@@ -397,6 +451,9 @@ func fx(
    err = tc.RegisterTextAnalyzerSimple(tantivy_go.TokenizerSimple, limit, tantivy_go.English)
    require.NoError(t, err)
 
+   err = tc.RegisterTextAnalyzerJieba(tantivy_go.TokenizerJieba, limit)
+   require.NoError(t, err)
+
    err = tc.RegisterTextAnalyzerEdgeNgram(tantivy_go.TokenizerEdgeNgram, minGram, 4, 100)
    require.NoError(t, err)
 
diff --git a/tantivycontext.go b/tantivycontext.go
index 34bbbee..1ebdef4 100644
--- a/tantivycontext.go
+++ b/tantivycontext.go
@@ -213,6 +213,23 @@ func (tc *TantivyContext) RegisterTextAnalyzerSimple(tokenizerName string, textL
    return tryExtractError(errBuffer)
 }
 
+// RegisterTextAnalyzerJieba registers a jieba text analyzer with the index.
+//
+// Parameters:
+// - tokenizerName (string): The name of the tokenizer to be used.
+// - textLimit (uintptr): The limit on the length of the text to be analyzed.
+//
+// Returns:
+// - error: An error if the registration fails.
+func (tc *TantivyContext) RegisterTextAnalyzerJieba(tokenizerName string, textLimit uintptr) error {
+   cTokenizerName := C.CString(tokenizerName)
+   defer C.string_free(cTokenizerName)
+   var errBuffer *C.char
+   C.context_register_jieba_tokenizer(tc.ptr, cTokenizerName, C.uintptr_t(textLimit), &errBuffer)
+
+   return tryExtractError(errBuffer)
+}
+
 // RegisterTextAnalyzerRaw registers a raw text analyzer with the index.
 //
 // Parameters: