Commit: Try jieba

fat-fellow committed Oct 25, 2024
1 parent 66c50cf commit c56da9d

Showing 9 changed files with 136 additions and 16 deletions.
5 changes: 5 additions & 0 deletions bindings.h
@@ -49,6 +49,11 @@ void context_register_text_analyzer_simple(struct TantivyContext *context_ptr,
const char *lang_str_ptr,
char **error_buffer);

void context_register_jieba_tokenizer(struct TantivyContext *context_ptr,
const char *tokenizer_name_ptr,
uintptr_t text_limit,
char **error_buffer);

void context_register_text_analyzer_raw(struct TantivyContext *context_ptr,
const char *tokenizer_name_ptr,
char **error_buffer);
3 changes: 2 additions & 1 deletion rust/Cargo.toml
@@ -20,4 +20,5 @@ serde_json = "1.0.117"
serde = { version = "1.0.203", features = ["derive"] }
unicode-segmentation = "1.11.0"
logcall = "0.1"
lazy_static = "1.5.0"
lazy_static = "1.5.0"
tantivy-jieba = "0.11.0"
16 changes: 10 additions & 6 deletions rust/src/c_util/util.rs
@@ -52,7 +52,7 @@ fn process_c_str<'a>(str_ptr: *const c_char, error_buffer: *mut *mut c_char) ->
}
}

pub fn assert_string(str_ptr: *const c_char, error_buffer: *mut *mut c_char) -> Option<String> {
pub fn assert_string(str_ptr: *const c_char, error_buffer: *mut *mut c_char) -> Option<String> {
match process_c_str(str_ptr, error_buffer) {
Ok(valid_str) => Some(valid_str.to_owned()),
Err(_) => None,
@@ -261,8 +261,14 @@ pub fn add_and_consume_documents(
return;
}

if writer.commit().is_err() {
rollback(error_buffer, writer, "Failed to commit the document");
commit(writer, "Failed to commit the document", error_buffer)
}

fn commit(writer: &mut IndexWriter, message: &str, error_buffer: *mut *mut c_char) {
let result = writer.commit();

if result.is_err() {
rollback(error_buffer, writer, format!("{}: {}", message, result.unwrap_err()).as_str());
}
}

@@ -297,9 +303,7 @@ pub fn delete_docs(
return;
}

if context.writer.commit().is_err() {
rollback(error_buffer, &mut context.writer, "Failed to commit removing");
}
commit(&mut context.writer, "Failed to commit removing", error_buffer);
}

fn rollback(
23 changes: 22 additions & 1 deletion rust/src/lib.rs
@@ -5,7 +5,7 @@ use logcall::logcall;
use tantivy::{schema::*};

use crate::c_util::{add_and_consume_documents, add_field, assert_pointer, assert_str, assert_string, box_from, convert_document_as_json, create_context_with_schema, delete_docs, drop_any, get_doc, search, set_error, start_lib_init};
use crate::tantivy_util::{add_text_field, Document, register_edge_ngram_tokenizer, register_ngram_tokenizer, register_raw_tokenizer, register_simple_tokenizer, SearchResult, TantivyContext};
use crate::tantivy_util::{add_text_field, Document, register_edge_ngram_tokenizer, register_ngram_tokenizer, register_raw_tokenizer, register_simple_tokenizer, register_jieba_tokenizer, SearchResult, TantivyContext};

mod tantivy_util;
mod c_util;
@@ -186,6 +186,27 @@ pub extern "C" fn context_register_text_analyzer_simple(
register_simple_tokenizer(text_limit, &context.index, tokenizer_name.as_str(), lang);
}

#[logcall]
#[no_mangle]
pub extern "C" fn context_register_jieba_tokenizer(
context_ptr: *mut TantivyContext,
tokenizer_name_ptr: *const c_char,
text_limit: usize,
error_buffer: *mut *mut c_char,
) {
let context = match assert_pointer(context_ptr, error_buffer) {
Some(value) => value,
None => return
};

let tokenizer_name = match assert_string(tokenizer_name_ptr, error_buffer) {
Some(value) => value,
None => return
};

register_jieba_tokenizer(text_limit, &context.index, tokenizer_name.as_str());
}

#[logcall]
#[no_mangle]
pub extern "C" fn context_register_text_analyzer_raw(
1 change: 1 addition & 0 deletions rust/src/tantivy_util/mod.rs
@@ -20,6 +20,7 @@ pub use self::scheme_builder::add_text_field;
pub use self::scheme::get_string_field_entry;
pub use self::tokenizer::register_edge_ngram_tokenizer;
pub use self::tokenizer::register_simple_tokenizer;
pub use self::tokenizer::register_jieba_tokenizer;
pub use self::tokenizer::register_raw_tokenizer;
pub use self::tokenizer::register_ngram_tokenizer;
pub use self::util::extract_text_from_owned_value;
25 changes: 19 additions & 6 deletions rust/src/tantivy_util/tokenizer.rs
@@ -1,5 +1,5 @@
use tantivy::{Index, TantivyError};
use tantivy::tokenizer::{AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};
use tantivy::tokenizer::{AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter, SimpleTokenizer, Stemmer, TextAnalyzer};
use crate::tantivy_util::{EdgeNgramTokenizer};
use crate::tantivy_util::stemmer::create_stemmer;

@@ -12,13 +12,13 @@ pub fn register_edge_ngram_tokenizer(
max_gram: usize,
limit: usize,
index: &Index,
tokenizer_name: &str
tokenizer_name: &str,
) {
let text_analyzer = TextAnalyzer::builder(
EdgeNgramTokenizer::new(
min_gram,
max_gram,
limit
limit,
))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
@@ -31,7 +31,7 @@ pub fn register_simple_tokenizer(
text_limit: usize,
index: &Index,
tokenizer_name: &str,
lang: &str
lang: &str,
) {
let text_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(text_limit))
@@ -43,6 +43,20 @@ pub fn register_simple_tokenizer(
register_tokenizer(index, tokenizer_name, text_analyzer);
}

pub fn register_jieba_tokenizer(
text_limit: usize,
index: &Index,
tokenizer_name: &str,
) {
let text_analyzer = TextAnalyzer::builder(tantivy_jieba::JiebaTokenizer {})
.filter(RemoveLongFilter::limit(text_limit))
.filter(LowerCaser)
.filter(Stemmer::default())
.build();

register_tokenizer(index, tokenizer_name, text_analyzer);
}

pub fn register_raw_tokenizer(index: &Index, tokenizer_name: &str) {
let text_analyzer = TextAnalyzer::builder(RawTokenizer::default()).build();
register_tokenizer(index, tokenizer_name, text_analyzer);
@@ -53,9 +67,8 @@ pub fn register_ngram_tokenizer(
max_gram: usize,
prefix_only: bool,
index: &Index,
tokenizer_name: &str
tokenizer_name: &str,
) -> Result<(), TantivyError> {

let tokenizer = NgramTokenizer::new(
min_gram,
max_gram,
1 change: 1 addition & 0 deletions tantivy.go
@@ -22,6 +22,7 @@ import (

const TokenizerSimple = "simple"
const TokenizerNgram = "ngram"
const TokenizerJieba = "jieba"
const TokenizerEdgeNgram = "edge_ngram"
const TokenizerRaw = "raw"

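The new constant is simply the name under which the jieba analyzer gets registered; a schema field opts into it by passing that name to AddTextField. A minimal sketch, reusing only the builder call that appears in tantivy_test.go below — "bodyCh" is an arbitrary field name, the flag values mirror the existing test fixture, and builder construction is elided:

// Sketch: declare a Chinese-text field wired to the "jieba" tokenizer name.
err := builder.AddTextField(
    "bodyCh",
    true,
    true,
    false,
    tantivy_go.IndexRecordOptionWithFreqsAndPositions,
    tantivy_go.TokenizerJieba,
)
if err != nil {
    // handle schema error
}
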
61 changes: 59 additions & 2 deletions tantivy_test.go
@@ -15,6 +15,8 @@ import (
const NameBody = "body"
const NameId = "id"
const NameTitle = "title"
const NameBodyCh = "bodyCh"
const NameTitleCh = "titleCh"

const limit = 40
const minGram = 2
@@ -325,12 +327,38 @@ func Test(t *testing.T) {
docs, err = tc.NumDocs()
require.Equal(t, uint64(0), docs)
})

t.Run("docs search - when jieba", func(t *testing.T) {
_, tc := fx(t, limit, 1, false)

defer tc.Free()

doc, err := addDoc(t, "", "张华考上了北京大学;李萍进了中等技术学校;我在百货公司当售货员:我们都有光明的前途", "1", tc)
require.NoError(t, err)

doc2, err := addDoc(t, "张华考上了北京大学;李萍进了中等技术学校;我在百货公司当售货员:我们都有光明的前途", "", "2", tc)
require.NoError(t, err)

err = tc.AddAndConsumeDocuments(doc, doc2)
require.NoError(t, err)

docs, err := tc.NumDocs()
require.NoError(t, err)
require.Equal(t, uint64(2), docs)

result, err := tc.Search("售货员", 100, true, NameBodyCh, NameTitleCh)
require.NoError(t, err)

size, err := result.GetSize()
defer result.Free()
require.Equal(t, 2, int(size))
})
}

func addDoc(
t *testing.T,
title string,
name string,
body string,
id string,
tc *tantivy_go.TantivyContext,
) (*tantivy_go.Document, error) {
@@ -339,10 +367,16 @@
err := doc.AddField(NameTitle, title, tc)
require.NoError(t, err)

err = doc.AddField(NameTitleCh, title, tc)
require.NoError(t, err)

err = doc.AddField(NameId, id, tc)
require.NoError(t, err)

err = doc.AddField(NameBody, name, tc)
err = doc.AddField(NameBody, body, tc)
require.NoError(t, err)

err = doc.AddField(NameBodyCh, body, tc)
return doc, err
}

@@ -367,6 +401,16 @@ func fx(
)
require.NoError(t, err)

err = builder.AddTextField(
NameTitleCh,
true,
true,
false,
tantivy_go.IndexRecordOptionWithFreqsAndPositions,
tantivy_go.TokenizerJieba,
)
require.NoError(t, err)

err = builder.AddTextField(
NameId,
true,
@@ -387,6 +431,16 @@
)
require.NoError(t, err)

err = builder.AddTextField(
NameBodyCh,
true,
true,
false,
tantivy_go.IndexRecordOptionWithFreqsAndPositions,
tantivy_go.TokenizerJieba,
)
require.NoError(t, err)

schema, err := builder.BuildSchema()
require.NoError(t, err)

@@ -397,6 +451,9 @@
err = tc.RegisterTextAnalyzerSimple(tantivy_go.TokenizerSimple, limit, tantivy_go.English)
require.NoError(t, err)

err = tc.RegisterTextAnalyzerJieba(tantivy_go.TokenizerJieba, limit)
require.NoError(t, err)

err = tc.RegisterTextAnalyzerEdgeNgram(tantivy_go.TokenizerEdgeNgram, minGram, 4, 100)
require.NoError(t, err)

17 changes: 17 additions & 0 deletions tantivycontext.go
@@ -213,6 +213,23 @@ func (tc *TantivyContext) RegisterTextAnalyzerSimple(tokenizerName string, textL
return tryExtractError(errBuffer)
}

// RegisterTextAnalyzerJieba registers a Jieba (Chinese word-segmentation) text analyzer with the index.
//
// Parameters:
// - tokenizerName (string): The name of the tokenizer to be used.
// - textLimit (uintptr): The maximum token length; longer tokens are dropped by the analyzer's RemoveLongFilter.
//
// Returns:
// - error: An error if the registration fails.
func (tc *TantivyContext) RegisterTextAnalyzerJieba(tokenizerName string, textLimit uintptr) error {
cTokenizerName := C.CString(tokenizerName)
defer C.string_free(cTokenizerName)
var errBuffer *C.char
C.context_register_jieba_tokenizer(tc.ptr, cTokenizerName, C.uintptr_t(textLimit), &errBuffer)

return tryExtractError(errBuffer)
}

// RegisterTextAnalyzerRaw registers a raw text analyzer with the index.
//
// Parameters:
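Taken together with the schema change above, enabling jieba from Go is a two-step affair: declare the field with the jieba tokenizer name, then register the analyzer under that same name on the context — the names must match for tantivy to find the tokenizer at indexing time. A minimal sketch, assuming tc is a *TantivyContext built from such a schema; only calls that appear in this commit are used, and the field names and limits come from the test fixture:

// Register the jieba analyzer under the name the schema fields reference.
// 40 is the token-length limit used by the test fixture.
if err := tc.RegisterTextAnalyzerJieba(tantivy_go.TokenizerJieba, 40); err != nil {
    return err
}

// Chinese text indexed into jieba-analyzed fields can now be matched on
// segmented words ("售货员") rather than only on the full string.
result, err := tc.Search("售货员", 100, true, "bodyCh", "titleCh")
if err != nil {
    return err
}
defer result.Free()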
