Skip to content

Commit

Permalink
Fix bug 1689268 (Fulltext search can not find word which contains pun…
Browse files Browse the repository at this point in the history
…ctuation marks)

Full text search is unable to find words containing various punctuation
characters in boolean search mode, although those characters are
indexed by the ngram parser. Introduce this ability behind a new server
variable, ft_query_extra_word_chars, which is OFF by default so that
existing behavior stays backward- and drop-in compatible.

When it's enabled, the FTS query parser treats all non-whitespace symbols
as word symbols, except for the boolean search syntax symbols. The latter
are also treated as word symbols inside double quotes. This applies only
to the query tokenizer; the indexing tokenizer is not changed in any way.
Because of this, the double quote symbol itself is never considered a
word symbol: no existing indexing tokenizer treats it as one, so
searching for it would never return documents.

- Plugin API: introduced new function
  thd_get_ft_query_extra_word_chars to get the variable value for the
  current thread. As the variable itself is a core server variable, not a
  plugin one, the THDVAR macros from the plugin API do not work for it.
- Make InnoDB FTS tokenizer consider the new variable when parsing
  queries, but never when indexing documents.
- Make ngram and mecab full text search plugins consider the variable
  value for query parsing.
- Add tests for InnoDB default and ngram parser, and MyISAM ngram
  parser (which is shared with InnoDB).
  • Loading branch information
laurynas-biveinis committed Jan 23, 2018
1 parent d46c592 commit b7cb587
Show file tree
Hide file tree
Showing 23 changed files with 981 additions and 23 deletions.
8 changes: 8 additions & 0 deletions include/mysql/plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -778,6 +778,14 @@ int thd_command(const MYSQL_THD thd);
long long thd_start_time(const MYSQL_THD thd);
void thd_kill(unsigned long id);

/**
Get the value of the ft_query_extra_word_chars server variable for the
current session.

The function takes no arguments; in particular, no THD handle is passed in.

@return ft_query_extra_word_chars value (presumably non-zero when the
        variable is ON and zero when it is OFF -- TODO confirm against the
        implementation, which is not part of this header)
*/
int thd_get_ft_query_extra_word_chars(void);

#ifdef __cplusplus
}
#endif
Expand Down
63 changes: 63 additions & 0 deletions mysql-test/include/percona_ft_query_extra_word_chars.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#
# Helper include file to test ft_query_extra_word_chars functionality
#
# required arg:
# - $ft_query_extra_word_chars = setup: to create words table and run_queries SP
# - $ft_query_extra_word_chars = cleanup: to drop them
#

# Setup branch: creates the `words` fixture table and the `run_queries`
# stored procedure. Both are removed again by the 'cleanup' branch of this
# include file.
if ($ft_query_extra_word_chars == 'setup')
{
--echo #
--echo # Setup ft_query_extra_word_chars testing
--echo #

SET NAMES utf8;

# Fixture rows. `words` itself is created without a FULLTEXT index; it only
# holds the sample strings. The first rows are variations of the same
# letters: plain ASCII, accented, and joined by '_', '!', ',' or a space.
CREATE TABLE words(id INT PRIMARY KEY AUTO_INCREMENT,
a TEXT COLLATE utf8mb4_bin);
INSERT INTO words (a) VALUES ('abcdef');
INSERT INTO words (a) VALUES ('ąbčdėf');
INSERT INTO words (a) VALUES ('ąbč_dėf');
INSERT INTO words (a) VALUES ('ąbč!dėf');
INSERT INTO words (a) VALUES ('ąbč,dėf');
INSERT INTO words (a) VALUES ('ąbč dėf');
# Words containing default FTS boolean search syntax characters
INSERT INTO words (a) VALUES ('+bčę');
INSERT INTO words (a) VALUES ('>ąbč <dėf');
INSERT INTO words (a) VALUES ('-(ąbč xyz)');
INSERT INTO words (a) VALUES ('ąbč* ~dėf');
INSERT INTO words (a) VALUES ('single"');
INSERT INTO words (a) VALUES ('-ųtū');

# run_queries(table_name, query_type)
#
# Builds the statement
#   SELECT a AS matches FROM <table_name>
#   WHERE MATCH(a) AGAINST (? IN <mode> MODE)
# where <mode> is NATURAL LANGUAGE when query_type = 'nl' and BOOLEAN for any
# other value. The statement is prepared once and executed for five fixed
# search strings; each search string is printed with SELECT @fts_query
# before its result set, and the prepared statement is deallocated at the end.
#
# Parameters:
#   table_name - name of the table to search; it must have a column `a`
#                usable in MATCH(). The name is concatenated into the
#                statement text as-is, so pass only fixed test table names.
#   query_type - 'nl' or 'bool'.
#
# NOTE(review): the body works entirely through the user variables @q and
# @fts_query; the locals introduced by DECLARE q, fts_query are never
# referenced and look unused. The procedure text is recorded verbatim in the
# .result file of any test that sources this include, so removing the DECLARE
# means re-recording those results.
# NOTE(review): the generated SELECT has no ORDER BY, so the order of
# multi-row results is whatever the engine returns.
# Do not add comments inside the CREATE PROCEDURE statement below without
# re-recording results: they become part of the recorded procedure text.
delimiter |;
CREATE PROCEDURE run_queries(table_name VARCHAR(20), query_type ENUM('nl', 'bool'))
BEGIN
DECLARE q, fts_query VARCHAR(100);
SET @q = CONCAT("SELECT a AS matches FROM ",
table_name,
" WHERE MATCH(a) AGAINST (? IN ");
IF query_type = 'nl'
THEN SET @q = CONCAT(@q, "NATURAL LANGUAGE");
ELSE SET @q = CONCAT(@q, "BOOLEAN");
END IF;
SET @q = CONCAT(@q, " MODE)");

PREPARE query_stmt FROM @q;
SET @fts_query = 'č,d'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
SET @fts_query = '+bc'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
SET @fts_query = 'single'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
# Surprising result with default parser, NL query, extra chars ON: '-' becomes a part
# of the query token, result is empty set
SET @fts_query = '-ųtū'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
SET @fts_query = '"-ųtū"'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
DEALLOCATE PREPARE query_stmt;
END|
delimiter ;|
}

# Cleanup branch: drops the run_queries procedure and the words table.
# Neither DROP uses IF EXISTS, so this branch expects both objects to exist,
# i.e. that the 'setup' branch of this file was run earlier in the same test.
if ($ft_query_extra_word_chars == 'cleanup')
{
DROP PROCEDURE run_queries;
DROP TABLE words;
}
1 change: 1 addition & 0 deletions mysql-test/r/fulltext_var.result
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ ft_boolean_syntax + -><()~*:""&|
ft_max_word_len 84
ft_min_word_len 4
ft_query_expansion_limit 20
ft_query_extra_word_chars OFF
ft_stopword_file (built-in)
create table t1 (b text not null);
insert t1 values ('aaaaaa bbbbbb cccccc');
Expand Down
233 changes: 233 additions & 0 deletions mysql-test/r/percona_ft_query_extra_word_chars.result
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
#
# Setup ft_query_extra_word_chars testing
#
SET NAMES utf8;
CREATE TABLE words(id INT PRIMARY KEY AUTO_INCREMENT,
a TEXT COLLATE utf8mb4_bin);
INSERT INTO words (a) VALUES ('abcdef');
INSERT INTO words (a) VALUES ('ąbčdėf');
INSERT INTO words (a) VALUES ('ąbč_dėf');
INSERT INTO words (a) VALUES ('ąbč!dėf');
INSERT INTO words (a) VALUES ('ąbč,dėf');
INSERT INTO words (a) VALUES ('ąbč dėf');
INSERT INTO words (a) VALUES ('+bčę');
INSERT INTO words (a) VALUES ('>ąbč <dėf');
INSERT INTO words (a) VALUES ('-(ąbč xyz)');
INSERT INTO words (a) VALUES ('ąbč* ~dėf');
INSERT INTO words (a) VALUES ('single"');
INSERT INTO words (a) VALUES ('-ųtū');
CREATE PROCEDURE run_queries(table_name VARCHAR(20), query_type ENUM('nl', 'bool'))
BEGIN
DECLARE q, fts_query VARCHAR(100);
SET @q = CONCAT("SELECT a AS matches FROM ",
table_name,
" WHERE MATCH(a) AGAINST (? IN ");
IF query_type = 'nl'
THEN SET @q = CONCAT(@q, "NATURAL LANGUAGE");
ELSE SET @q = CONCAT(@q, "BOOLEAN");
END IF;
SET @q = CONCAT(@q, " MODE)");
PREPARE query_stmt FROM @q;
SET @fts_query = 'č,d'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
SET @fts_query = '+bc'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
SET @fts_query = 'single'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
# Surprising result with default parser, NL query, extra chars ON: '-' becomes a part
# of the query token, result is empty set
SET @fts_query = '-ųtū'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
SET @fts_query = '"-ųtū"'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
DEALLOCATE PREPARE query_stmt;
END|
#
# Prepare test tables for MyISAM using ngram parser
#
CREATE TABLE fts_ngram (id INT PRIMARY KEY AUTO_INCREMENT,
a TEXT COLLATE utf8mb4_bin, FULLTEXT KEY a(a) WITH PARSER ngram)
ENGINE=MyISAM;
INSERT INTO fts_ngram SELECT * FROM words;
SET SESSION ft_query_extra_word_chars = ON;
CREATE TABLE fts_ngram_2 LIKE fts_ngram;
INSERT INTO fts_ngram_2 SELECT * FROM words;
# Test querying MyISAM in natural language mode, ngram parser, extra chars off
SET SESSION ft_query_extra_word_chars = OFF;
CALL run_queries('fts_ngram', 'nl');
@fts_query
č,d
matches
ąbč,dėf
@fts_query
+bc
matches
+bčę
abcdef
@fts_query
single
matches
single"
@fts_query
-ųtū
matches
-ųtū
@fts_query
"-ųtū"
matches
-ųtū
# Test that querying the other MyISAM table gives identical results
CALL run_queries('fts_ngram_2', 'nl');
@fts_query
č,d
matches
ąbč,dėf
@fts_query
+bc
matches
+bčę
abcdef
@fts_query
single
matches
single"
@fts_query
-ųtū
matches
-ųtū
@fts_query
"-ųtū"
matches
-ųtū
# Test querying MyISAM in natural language mode, ngram parser, extra chars on
SET SESSION ft_query_extra_word_chars = ON;
CALL run_queries('fts_ngram', 'nl');
@fts_query
č,d
matches
ąbč,dėf
@fts_query
+bc
matches
+bčę
abcdef
@fts_query
single
matches
single"
@fts_query
-ųtū
matches
-ųtū
@fts_query
"-ųtū"
matches
-ųtū
# Test that querying the other MyISAM table gives identical results
CALL run_queries('fts_ngram_2', 'nl');
@fts_query
č,d
matches
ąbč,dėf
@fts_query
+bc
matches
+bčę
abcdef
@fts_query
single
matches
single"
@fts_query
-ųtū
matches
-ųtū
@fts_query
"-ųtū"
matches
-ųtū
# Test querying MyISAM in boolean mode, ngram parser, extra chars off
SET SESSION ft_query_extra_word_chars = OFF;
CALL run_queries('fts_ngram', 'bool');
@fts_query
č,d
matches
@fts_query
+bc
matches
abcdef
@fts_query
single
matches
single"
@fts_query
-ųtū
matches
@fts_query
"-ųtū"
matches
-ųtū
# Test that querying the other MyISAM table gives identical results
CALL run_queries('fts_ngram_2', 'bool');
@fts_query
č,d
matches
@fts_query
+bc
matches
abcdef
@fts_query
single
matches
single"
@fts_query
-ųtū
matches
@fts_query
"-ųtū"
matches
-ųtū
# Test querying MyISAM in boolean mode, ngram parser, extra chars on
SET SESSION ft_query_extra_word_chars = ON;
CALL run_queries('fts_ngram', 'bool');
@fts_query
č,d
matches
ąbč,dėf
@fts_query
+bc
matches
abcdef
@fts_query
single
matches
single"
@fts_query
-ųtū
matches
@fts_query
"-ųtū"
matches
-ųtū
# Test that querying the other MyISAM table gives identical results
CALL run_queries('fts_ngram_2', 'bool');
@fts_query
č,d
matches
ąbč,dėf
@fts_query
+bc
matches
abcdef
@fts_query
single
matches
single"
@fts_query
-ųtū
matches
@fts_query
"-ųtū"
matches
-ųtū
#
# Cleanup
#
DROP TABLE fts_ngram, fts_ngram_2;
DROP PROCEDURE run_queries;
DROP TABLE words;
1 change: 1 addition & 0 deletions mysql-test/suite/innodb_fts/r/fulltext_var.result
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ ft_boolean_syntax + -><()~*:""&|
ft_max_word_len 84
ft_min_word_len 4
ft_query_expansion_limit 20
ft_query_extra_word_chars OFF
ft_stopword_file (built-in)
create table t1 (b text not null, fulltext(b)) engine = innodb;
insert t1 values ('aaaaaa bbbbbb cccccc');
Expand Down
Loading

0 comments on commit b7cb587

Please sign in to comment.