-
Notifications
You must be signed in to change notification settings - Fork 482
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix bug 1689268 (Fulltext search can not find word which contains punctuation marks)
Full text search is unable to find words with various punctuation characters in boolean search mode, although those characters are indexed with the ngram parser. Introduce this ability by a new server variable ft_query_extra_word_chars, to stay backward and drop-in compatible by default. When it's enabled, all the non-whitespace symbols are considered to be word symbols by the FTS query parser, except for the boolean search syntax symbols. The latter ones are also considered to be word symbols inside double quotes. This only applies to the query tokenizer, and the indexing tokenizer is not changed in any way. Because of this, the double quote symbol itself is never considered a word symbol, as no existing indexing tokenizer does so, thus searching for it would never return documents. - Plugin API: introduced new function thd_get_ft_query_extra_word_chars to get the variable value for the current thread. As the variable itself is a core server one, not a plugin one, THDVAR macros from the plugin API do not work for it. - Make InnoDB FTS tokenizer consider the new variable when parsing queries, but never when indexing documents. - Make ngram and mecab full text search plugins consider the variable value for query parsing. - Add tests for InnoDB default and ngram parser, and MyISAM ngram parser (which is shared with InnoDB).
- Loading branch information
1 parent
d46c592
commit b7cb587
Showing
23 changed files
with
981 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# | ||
# Helper include file to test ft_query_extra_word_chars functionality | ||
# | ||
# required arg: | ||
# - $ft_query_extra_word_chars = setup: to create words table and run_queries SP | ||
# - $ft_query_extra_word_chars = cleanup: to drop them | ||
# | ||
|
||
if ($ft_query_extra_word_chars == 'setup') | ||
{ | ||
--echo # | ||
--echo # Setup ft_query_extra_word_chars testing | ||
--echo # | ||
|
||
# The string literals below contain non-ASCII letters, so declare the | ||
# connection character set before sending them. | ||
SET NAMES utf8; | ||
|
||
# Source table for the test data: a binary-collated TEXT column; no FULLTEXT | ||
# index is defined on this table. | ||
CREATE TABLE words(id INT PRIMARY KEY AUTO_INCREMENT, | ||
a TEXT COLLATE utf8mb4_bin); | ||
# A plain ASCII word, an accented variant, and the accented variant split by | ||
# '_', '!', ',' and a space. | ||
INSERT INTO words (a) VALUES ('abcdef'); | ||
INSERT INTO words (a) VALUES ('ąbčdėf'); | ||
INSERT INTO words (a) VALUES ('ąbč_dėf'); | ||
INSERT INTO words (a) VALUES ('ąbč!dėf'); | ||
INSERT INTO words (a) VALUES ('ąbč,dėf'); | ||
INSERT INTO words (a) VALUES ('ąbč dėf'); | ||
# Words containing default FTS boolean search syntax characters | ||
INSERT INTO words (a) VALUES ('+bčę'); | ||
INSERT INTO words (a) VALUES ('>ąbč <dėf'); | ||
INSERT INTO words (a) VALUES ('-(ąbč xyz)'); | ||
INSERT INTO words (a) VALUES ('ąbč* ~dėf'); | ||
INSERT INTO words (a) VALUES ('single"'); | ||
INSERT INTO words (a) VALUES ('-ųtū'); | ||
|
||
# run_queries(table_name, query_type) | ||
# Builds "SELECT a AS matches FROM <table_name> WHERE MATCH(a) AGAINST (? IN <mode> MODE)", | ||
# where <mode> is NATURAL LANGUAGE when query_type = 'nl' and BOOLEAN otherwise, | ||
# prepares it once, then for each of five fixed search strings selects the string | ||
# (so it shows up in the output) and executes the statement with it bound to '?'. | ||
# table_name is concatenated into the statement text as-is. | ||
# NOTE(review): the DECLAREd locals q and fts_query are never referenced; the body | ||
# uses the user variables @q and @fts_query instead. | ||
delimiter |; | ||
CREATE PROCEDURE run_queries(table_name VARCHAR(20), query_type ENUM('nl', 'bool')) | ||
BEGIN | ||
DECLARE q, fts_query VARCHAR(100); | ||
SET @q = CONCAT("SELECT a AS matches FROM ", | ||
table_name, | ||
" WHERE MATCH(a) AGAINST (? IN "); | ||
IF query_type = 'nl' | ||
THEN SET @q = CONCAT(@q, "NATURAL LANGUAGE"); | ||
ELSE SET @q = CONCAT(@q, "BOOLEAN"); | ||
END IF; | ||
SET @q = CONCAT(@q, " MODE)"); | ||
|
||
PREPARE query_stmt FROM @q; | ||
SET @fts_query = 'č,d'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query; | ||
SET @fts_query = '+bc'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query; | ||
SET @fts_query = 'single'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query; | ||
# Surprising result with default parser, NL query, extra chars ON: '-' becomes a part | ||
# of the query token, result is empty set | ||
SET @fts_query = '-ųtū'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query; | ||
SET @fts_query = '"-ųtū"'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query; | ||
DEALLOCATE PREPARE query_stmt; | ||
END| | ||
delimiter ;| | ||
} | ||
|
||
if ($ft_query_extra_word_chars == 'cleanup') | ||
{ | ||
# Drop the procedure and the table that the 'setup' branch creates. | ||
DROP PROCEDURE run_queries; | ||
DROP TABLE words; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,233 @@ | ||
# | ||
# Setup ft_query_extra_word_chars testing | ||
# | ||
SET NAMES utf8; | ||
CREATE TABLE words(id INT PRIMARY KEY AUTO_INCREMENT, | ||
a TEXT COLLATE utf8mb4_bin); | ||
INSERT INTO words (a) VALUES ('abcdef'); | ||
INSERT INTO words (a) VALUES ('ąbčdėf'); | ||
INSERT INTO words (a) VALUES ('ąbč_dėf'); | ||
INSERT INTO words (a) VALUES ('ąbč!dėf'); | ||
INSERT INTO words (a) VALUES ('ąbč,dėf'); | ||
INSERT INTO words (a) VALUES ('ąbč dėf'); | ||
INSERT INTO words (a) VALUES ('+bčę'); | ||
INSERT INTO words (a) VALUES ('>ąbč <dėf'); | ||
INSERT INTO words (a) VALUES ('-(ąbč xyz)'); | ||
INSERT INTO words (a) VALUES ('ąbč* ~dėf'); | ||
INSERT INTO words (a) VALUES ('single"'); | ||
INSERT INTO words (a) VALUES ('-ųtū'); | ||
CREATE PROCEDURE run_queries(table_name VARCHAR(20), query_type ENUM('nl', 'bool')) | ||
BEGIN | ||
DECLARE q, fts_query VARCHAR(100); | ||
SET @q = CONCAT("SELECT a AS matches FROM ", | ||
table_name, | ||
" WHERE MATCH(a) AGAINST (? IN "); | ||
IF query_type = 'nl' | ||
THEN SET @q = CONCAT(@q, "NATURAL LANGUAGE"); | ||
ELSE SET @q = CONCAT(@q, "BOOLEAN"); | ||
END IF; | ||
SET @q = CONCAT(@q, " MODE)"); | ||
PREPARE query_stmt FROM @q; | ||
SET @fts_query = 'č,d'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query; | ||
SET @fts_query = '+bc'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query; | ||
SET @fts_query = 'single'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query; | ||
# Surprising result with default parser, NL query, extra chars ON: '-' becomes a part | ||
# of the query token, result is empty set | ||
SET @fts_query = '-ųtū'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query; | ||
SET @fts_query = '"-ųtū"'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query; | ||
DEALLOCATE PREPARE query_stmt; | ||
END| | ||
# | ||
# Prepare test tables for MyISAM using ngram parser | ||
# | ||
CREATE TABLE fts_ngram (id INT PRIMARY KEY AUTO_INCREMENT, | ||
a TEXT COLLATE utf8mb4_bin, FULLTEXT KEY a(a) WITH PARSER ngram) | ||
ENGINE=MyISAM; | ||
INSERT INTO fts_ngram SELECT * FROM words; | ||
SET SESSION ft_query_extra_word_chars = ON; | ||
CREATE TABLE fts_ngram_2 LIKE fts_ngram; | ||
INSERT INTO fts_ngram_2 SELECT * FROM words; | ||
# Test querying MyISAM in natural language mode, ngram parser, extra chars off | ||
SET SESSION ft_query_extra_word_chars = OFF; | ||
CALL run_queries('fts_ngram', 'nl'); | ||
@fts_query | ||
č,d | ||
matches | ||
ąbč,dėf | ||
@fts_query | ||
+bc | ||
matches | ||
+bčę | ||
abcdef | ||
@fts_query | ||
single | ||
matches | ||
single" | ||
@fts_query | ||
-ųtū | ||
matches | ||
-ųtū | ||
@fts_query | ||
"-ųtū" | ||
matches | ||
-ųtū | ||
# Test that querying the other MyISAM table gives identical results | ||
CALL run_queries('fts_ngram_2', 'nl'); | ||
@fts_query | ||
č,d | ||
matches | ||
ąbč,dėf | ||
@fts_query | ||
+bc | ||
matches | ||
+bčę | ||
abcdef | ||
@fts_query | ||
single | ||
matches | ||
single" | ||
@fts_query | ||
-ųtū | ||
matches | ||
-ųtū | ||
@fts_query | ||
"-ųtū" | ||
matches | ||
-ųtū | ||
# Test querying MyISAM in natural language mode, ngram parser, extra chars on | ||
SET SESSION ft_query_extra_word_chars = ON; | ||
CALL run_queries('fts_ngram', 'nl'); | ||
@fts_query | ||
č,d | ||
matches | ||
ąbč,dėf | ||
@fts_query | ||
+bc | ||
matches | ||
+bčę | ||
abcdef | ||
@fts_query | ||
single | ||
matches | ||
single" | ||
@fts_query | ||
-ųtū | ||
matches | ||
-ųtū | ||
@fts_query | ||
"-ųtū" | ||
matches | ||
-ųtū | ||
# Test that querying the other MyISAM table gives identical results | ||
CALL run_queries('fts_ngram_2', 'nl'); | ||
@fts_query | ||
č,d | ||
matches | ||
ąbč,dėf | ||
@fts_query | ||
+bc | ||
matches | ||
+bčę | ||
abcdef | ||
@fts_query | ||
single | ||
matches | ||
single" | ||
@fts_query | ||
-ųtū | ||
matches | ||
-ųtū | ||
@fts_query | ||
"-ųtū" | ||
matches | ||
-ųtū | ||
# Test querying MyISAM in boolean mode, ngram parser, extra chars off | ||
SET SESSION ft_query_extra_word_chars = OFF; | ||
CALL run_queries('fts_ngram', 'bool'); | ||
@fts_query | ||
č,d | ||
matches | ||
@fts_query | ||
+bc | ||
matches | ||
abcdef | ||
@fts_query | ||
single | ||
matches | ||
single" | ||
@fts_query | ||
-ųtū | ||
matches | ||
@fts_query | ||
"-ųtū" | ||
matches | ||
-ųtū | ||
# Test that querying the other MyISAM table gives identical results | ||
CALL run_queries('fts_ngram_2', 'bool'); | ||
@fts_query | ||
č,d | ||
matches | ||
@fts_query | ||
+bc | ||
matches | ||
abcdef | ||
@fts_query | ||
single | ||
matches | ||
single" | ||
@fts_query | ||
-ųtū | ||
matches | ||
@fts_query | ||
"-ųtū" | ||
matches | ||
-ųtū | ||
# Test querying MyISAM in boolean mode, ngram parser, extra chars on | ||
SET SESSION ft_query_extra_word_chars = ON; | ||
CALL run_queries('fts_ngram', 'bool'); | ||
@fts_query | ||
č,d | ||
matches | ||
ąbč,dėf | ||
@fts_query | ||
+bc | ||
matches | ||
abcdef | ||
@fts_query | ||
single | ||
matches | ||
single" | ||
@fts_query | ||
-ųtū | ||
matches | ||
@fts_query | ||
"-ųtū" | ||
matches | ||
-ųtū | ||
# Test that querying the other MyISAM table gives identical results | ||
CALL run_queries('fts_ngram_2', 'bool'); | ||
@fts_query | ||
č,d | ||
matches | ||
ąbč,dėf | ||
@fts_query | ||
+bc | ||
matches | ||
abcdef | ||
@fts_query | ||
single | ||
matches | ||
single" | ||
@fts_query | ||
-ųtū | ||
matches | ||
@fts_query | ||
"-ųtū" | ||
matches | ||
-ųtū | ||
# | ||
# Cleanup | ||
# | ||
DROP TABLE fts_ngram, fts_ngram_2; | ||
DROP PROCEDURE run_queries; | ||
DROP TABLE words; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.