Fix bug 1689268 (Fulltext search can not find word which contains punctuation marks)

Full text search is unable to find words with various punctuation characters in boolean search mode, although those characters are indexed with the ngram parser. Introduce this ability with a new server variable, ft_query_extra_word_chars, which is disabled by default to stay backward and drop-in compatible. When it is enabled, all non-whitespace symbols are considered to be word symbols by the FTS query parser, except for the boolean search syntax symbols. The latter are also considered to be word symbols inside double quotes. This only applies to the query tokenizer; the indexing tokenizer is not changed in any way. Because of this, the double quote symbol itself is never considered a word symbol: no existing indexing tokenizer treats it as one, so searching for it would never return documents. - Plugin API: introduced new function thd_get_ft_query_extra_word_chars to get the variable value for the current thread. As the variable itself is a core server one, not a plugin one, THDVAR macros from the plugin API do not work. - Make InnoDB FTS tokenizer consider the new variable when parsing queries, but never when indexing documents. - Make ngram and mecab full text search plugins consider the variable value for query parsing. - Add tests for InnoDB default and ngram parser, and MyISAM ngram parser (which is shared with InnoDB).
percona · Jan 23, 2018 · b7cb587 · b7cb587
1 parent d46c592
commit b7cb587
Show file tree

Hide file tree

Showing 23 changed files with 981 additions and 23 deletions.
diff --git a/include/mysql/plugin.h b/include/mysql/plugin.h
@@ -778,6 +778,14 @@ int thd_command(const MYSQL_THD thd);
 long long thd_start_time(const MYSQL_THD thd);
 void thd_kill(unsigned long id);
 
+/**
+  Get the value of the ft_query_extra_word_chars server variable for the
+  current session (plugins cannot read this core variable via THDVAR).
+
+  @return ft_query_extra_word_chars value (presumably non-zero when ON)
+*/
+int thd_get_ft_query_extra_word_chars(void);
+
 #ifdef __cplusplus
 }
 #endif

diff --git a/mysql-test/include/percona_ft_query_extra_word_chars.inc b/mysql-test/include/percona_ft_query_extra_word_chars.inc
@@ -0,0 +1,63 @@
+#
+# Helper include file for testing the ft_query_extra_word_chars functionality
+#
+# Required argument (set before sourcing this file):
+# - $ft_query_extra_word_chars = setup: create the words table and the run_queries SP
+# - $ft_query_extra_word_chars = cleanup: drop them
+#
+
+if ($ft_query_extra_word_chars == 'setup')
+{
+  --echo #
+  --echo # Setup ft_query_extra_word_chars testing
+  --echo #
+
+  SET NAMES utf8;
+  # utf8mb4_bin is a binary collation: 'č' and 'c' are distinct characters here
+  CREATE TABLE words(id INT PRIMARY KEY AUTO_INCREMENT,
+                     a TEXT COLLATE utf8mb4_bin);
+  INSERT INTO words (a) VALUES ('abcdef');
+  INSERT INTO words (a) VALUES ('ąbčdėf');
+  INSERT INTO words (a) VALUES ('ąbč_dėf');
+  INSERT INTO words (a) VALUES ('ąbč!dėf');
+  INSERT INTO words (a) VALUES ('ąbč,dėf');
+  INSERT INTO words (a) VALUES ('ąbč dėf');
+  # Words containing default ft_boolean_syntax characters: + - > < ( ) ~ * "
+  INSERT INTO words (a) VALUES ('+bčę');
+  INSERT INTO words (a) VALUES ('>ąbč <dėf');
+  INSERT INTO words (a) VALUES ('-(ąbč xyz)');
+  INSERT INTO words (a) VALUES ('ąbč* ~dėf');
+  INSERT INTO words (a) VALUES ('single"');
+  INSERT INTO words (a) VALUES ('-ųtū');
+  # run_queries: echo and run five fixed FTS queries on table_name ('nl' = natural language, else boolean mode)
+  delimiter |;
+  CREATE PROCEDURE run_queries(table_name VARCHAR(20), query_type ENUM('nl', 'bool'))
+  BEGIN
+          DECLARE q, fts_query VARCHAR(100);
+          SET @q = CONCAT("SELECT a AS matches FROM ",
+                          table_name,
+                          " WHERE MATCH(a) AGAINST (? IN ");
+          IF query_type = 'nl'
+             THEN SET @q = CONCAT(@q, "NATURAL LANGUAGE");
+             ELSE SET @q = CONCAT(@q, "BOOLEAN");
+          END IF;
+          SET @q = CONCAT(@q, " MODE)");
+
+          PREPARE query_stmt FROM @q;
+          SET @fts_query = 'č,d'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
+          SET @fts_query = '+bc'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
+          SET @fts_query = 'single'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
+          # Surprising result with default parser, NL query, extra chars ON: '-' becomes a part
+          # of the query token, result is empty set
+          SET @fts_query = '-ųtū'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
+          SET @fts_query = '"-ųtū"'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
+          DEALLOCATE PREPARE query_stmt;
+  END|
+  delimiter ;|
+}
+# Cleanup: drop the procedure and table created by the 'setup' branch
+if ($ft_query_extra_word_chars == 'cleanup')
+{
+  DROP PROCEDURE run_queries;
+  DROP TABLE words;
+}
diff --git a/mysql-test/r/fulltext_var.result b/mysql-test/r/fulltext_var.result
@@ -5,6 +5,7 @@ ft_boolean_syntax	+ -><()~*:""&|
 ft_max_word_len	84
 ft_min_word_len	4
 ft_query_expansion_limit	20
+ft_query_extra_word_chars	OFF
 ft_stopword_file	(built-in)
 create table t1 (b text not null);
 insert t1 values ('aaaaaa bbbbbb cccccc');

diff --git a/mysql-test/r/percona_ft_query_extra_word_chars.result b/mysql-test/r/percona_ft_query_extra_word_chars.result
@@ -0,0 +1,233 @@
+#
+# Setup ft_query_extra_word_chars testing
+#
+SET NAMES utf8;
+CREATE TABLE words(id INT PRIMARY KEY AUTO_INCREMENT,
+a TEXT COLLATE utf8mb4_bin);
+INSERT INTO words (a) VALUES ('abcdef');
+INSERT INTO words (a) VALUES ('ąbčdėf');
+INSERT INTO words (a) VALUES ('ąbč_dėf');
+INSERT INTO words (a) VALUES ('ąbč!dėf');
+INSERT INTO words (a) VALUES ('ąbč,dėf');
+INSERT INTO words (a) VALUES ('ąbč dėf');
+INSERT INTO words (a) VALUES ('+bčę');
+INSERT INTO words (a) VALUES ('>ąbč <dėf');
+INSERT INTO words (a) VALUES ('-(ąbč xyz)');
+INSERT INTO words (a) VALUES ('ąbč* ~dėf');
+INSERT INTO words (a) VALUES ('single"');
+INSERT INTO words (a) VALUES ('-ųtū');
+CREATE PROCEDURE run_queries(table_name VARCHAR(20), query_type ENUM('nl', 'bool'))
+BEGIN
+DECLARE q, fts_query VARCHAR(100);
+SET @q = CONCAT("SELECT a AS matches FROM ",
+table_name,
+" WHERE MATCH(a) AGAINST (? IN ");
+IF query_type = 'nl'
+             THEN SET @q = CONCAT(@q, "NATURAL LANGUAGE");
+ELSE SET @q = CONCAT(@q, "BOOLEAN");
+END IF;
+SET @q = CONCAT(@q, " MODE)");
+PREPARE query_stmt FROM @q;
+SET @fts_query = 'č,d'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
+SET @fts_query = '+bc'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
+SET @fts_query = 'single'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
+# Surprising result with default parser, NL query, extra chars ON: '-' becomes a part
+# of the query token, result is empty set
+SET @fts_query = '-ųtū'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
+SET @fts_query = '"-ųtū"'; SELECT @fts_query; EXECUTE query_stmt USING @fts_query;
+DEALLOCATE PREPARE query_stmt;
+END|
+#
+# Prepare test tables for MyISAM using ngram parser
+#
+CREATE TABLE fts_ngram (id INT PRIMARY KEY AUTO_INCREMENT,
+a TEXT COLLATE utf8mb4_bin, FULLTEXT KEY a(a) WITH PARSER ngram)
+ENGINE=MyISAM;
+INSERT INTO fts_ngram SELECT * FROM words;
+SET SESSION ft_query_extra_word_chars = ON;
+CREATE TABLE fts_ngram_2 LIKE fts_ngram;
+INSERT INTO fts_ngram_2 SELECT * FROM words;
+# Test querying MyISAM in natural language mode, ngram parser, extra chars off
+SET SESSION ft_query_extra_word_chars = OFF;
+CALL run_queries('fts_ngram', 'nl');
+@fts_query
+č,d
+matches
+ąbč,dėf
+@fts_query
++bc
+matches
++bčę
+abcdef
+@fts_query
+single
+matches
+single"
+@fts_query
+-ųtū
+matches
+-ųtū
+@fts_query
+"-ųtū"
+matches
+-ųtū
+# Test that querying the other MyISAM table gives identical results
+CALL run_queries('fts_ngram_2', 'nl');
+@fts_query
+č,d
+matches
+ąbč,dėf
+@fts_query
++bc
+matches
++bčę
+abcdef
+@fts_query
+single
+matches
+single"
+@fts_query
+-ųtū
+matches
+-ųtū
+@fts_query
+"-ųtū"
+matches
+-ųtū
+# Test querying MyISAM in natural language mode, ngram parser, extra chars on
+SET SESSION ft_query_extra_word_chars = ON;
+CALL run_queries('fts_ngram', 'nl');
+@fts_query
+č,d
+matches
+ąbč,dėf
+@fts_query
++bc
+matches
++bčę
+abcdef
+@fts_query
+single
+matches
+single"
+@fts_query
+-ųtū
+matches
+-ųtū
+@fts_query
+"-ųtū"
+matches
+-ųtū
+# Test that querying the other MyISAM table gives identical results
+CALL run_queries('fts_ngram_2', 'nl');
+@fts_query
+č,d
+matches
+ąbč,dėf
+@fts_query
++bc
+matches
++bčę
+abcdef
+@fts_query
+single
+matches
+single"
+@fts_query
+-ųtū
+matches
+-ųtū
+@fts_query
+"-ųtū"
+matches
+-ųtū
+# Test querying MyISAM in boolean mode, ngram parser, extra chars off
+SET SESSION ft_query_extra_word_chars = OFF;
+CALL run_queries('fts_ngram', 'bool');
+@fts_query
+č,d
+matches
+@fts_query
++bc
+matches
+abcdef
+@fts_query
+single
+matches
+single"
+@fts_query
+-ųtū
+matches
+@fts_query
+"-ųtū"
+matches
+-ųtū
+# Test that querying the other MyISAM table gives identical results
+CALL run_queries('fts_ngram_2', 'bool');
+@fts_query
+č,d
+matches
+@fts_query
++bc
+matches
+abcdef
+@fts_query
+single
+matches
+single"
+@fts_query
+-ųtū
+matches
+@fts_query
+"-ųtū"
+matches
+-ųtū
+# Test querying MyISAM in boolean mode, ngram parser, extra chars on
+SET SESSION ft_query_extra_word_chars = ON;
+CALL run_queries('fts_ngram', 'bool');
+@fts_query
+č,d
+matches
+ąbč,dėf
+@fts_query
++bc
+matches
+abcdef
+@fts_query
+single
+matches
+single"
+@fts_query
+-ųtū
+matches
+@fts_query
+"-ųtū"
+matches
+-ųtū
+# Test that querying the other MyISAM table gives identical results
+CALL run_queries('fts_ngram_2', 'bool');
+@fts_query
+č,d
+matches
+ąbč,dėf
+@fts_query
++bc
+matches
+abcdef
+@fts_query
+single
+matches
+single"
+@fts_query
+-ųtū
+matches
+@fts_query
+"-ųtū"
+matches
+-ųtū
+#
+# Cleanup
+#
+DROP TABLE fts_ngram, fts_ngram_2;
+DROP PROCEDURE run_queries;
+DROP TABLE words;
diff --git a/mysql-test/suite/innodb_fts/r/fulltext_var.result b/mysql-test/suite/innodb_fts/r/fulltext_var.result
@@ -5,6 +5,7 @@ ft_boolean_syntax	+ -><()~*:""&|
 ft_max_word_len	84
 ft_min_word_len	4
 ft_query_expansion_limit	20
+ft_query_extra_word_chars	OFF
 ft_stopword_file	(built-in)
 create table t1 (b text not null, fulltext(b)) engine = innodb;
 insert t1 values ('aaaaaa bbbbbb cccccc');