From e41dfd5ede94c399cd6f9b0c0df198ece0e508b2 Mon Sep 17 00:00:00 2001 From: Gusted Date: Mon, 15 Apr 2024 23:54:47 +0200 Subject: [PATCH] Add `\t` as recognized separator Currently `\t` isn't seen as a recognized separator. This was causing issues for meilisearch: when it was trying to search on a keyword (fuzzy or exact match) and the keyword was present in the document but the character before the keyword was a `\t`, charabia would create a token that was `\t`, which in turn led to meilisearch returning the document as part of the search but not returning the positions of matches (`_matchesPosition` field). The actual reproducer for this bug was code files of the Linux kernel (such as `fs/ext4/readpage.c`), which use tabs for indentation, so a keyword like `while` would usually be 'prefixed' by a tab, causing the described issue. Making `\t` a separator fixed this issue. --- charabia/src/separators.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charabia/src/separators.rs b/charabia/src/separators.rs index 3d4a849..60a0048 100644 --- a/charabia/src/separators.rs +++ b/charabia/src/separators.rs @@ -59,7 +59,7 @@ pub const DEFAULT_SEPARATORS: &[&str] = &[ "đ‘Ș ", "đ‘ȘĄ", "đ‘Șą", "𑱁", "𑱂", "đ‘±ƒ", "𑱄", "𑱅", "𑱰", "𑱱", "đ‘»·", "𑻞", "𑿿", "𒑰", "đ’‘±", "đ’‘Č", "𒑳", "𒑮", "đ–©ź", "đ–©Ż", "đ–«”", "đ–Ź·", "𖬾", "đ–Źč", "đ–Źș", "đ–Ź»", "𖭄", "đ–ș—", "đ–ș˜", "đ–ș™", "đ–șš", "𖿱", "đ›ČŸ", "đȘ‡", "đȘˆ", "đȘ‰", "đȘŠ", "đȘ‹", "đž„ž", "đž„Ÿ", "\n", "\r", "\u{2029}", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", - " ", " ", "`" + " ", " ", "`", "\t" ]; #[rustfmt::skip]