From d49af471ca6c7825b49ff4fae46aa53356b3d8be Mon Sep 17 00:00:00 2001 From: James Salter Date: Wed, 22 Dec 2021 12:42:11 +1100 Subject: [PATCH 01/53] use websearch_to_tsquery --- synapse/storage/databases/main/search.py | 53 ++++++++++++------------ tests/storage/test_room_search.py | 39 +++++++++++++++++ 2 files changed, 66 insertions(+), 26 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index f87acfb86604..c2bffe149cf2 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -26,7 +26,7 @@ LoggingTransaction, ) from synapse.storage.databases.main.events_worker import EventRedactBehaviour -from synapse.storage.engines import PostgresEngine, Sqlite3Engine +from synapse.storage.engines import PostgresEngine, Sqlite3Engine, BaseDatabaseEngine if TYPE_CHECKING: from synapse.server import HomeServer @@ -388,9 +388,7 @@ async def search_msgs(self, room_ids, search_term, keys): list of dicts """ clauses = [] - - search_query = _parse_query(self.database_engine, search_term) - + args = [] # Make sure we don't explode because the person is in too many rooms. @@ -411,21 +409,25 @@ async def search_msgs(self, room_ids, search_term, keys): count_args = args count_clauses = clauses - if isinstance(self.database_engine, PostgresEngine): + if isinstance(self.database_engine, PostgresEngine): + search_query = _parse_query_for_pgsql(search_term) + print(search_query) sql = ( - "SELECT ts_rank_cd(vector, to_tsquery('english', ?)) AS rank," + "SELECT ts_rank_cd(vector, websearch_to_tsquery('english', ?)) AS rank," " room_id, event_id" " FROM event_search" - " WHERE vector @@ to_tsquery('english', ?)" + " WHERE vector @@ websearch_to_tsquery('english', ?)" ) args = [search_query, search_query] + args count_sql = ( "SELECT room_id, count(*) as count FROM event_search" - " WHERE vector @@ to_tsquery('english', ?)" + " WHERE vector @@ websearch_to_tsquery('english', ?)" ) count_args = [search_query] + count_args elif isinstance(self.database_engine, Sqlite3Engine): + search_query = _parse_query_for_sqlite(search_term) + sql = ( "SELECT rank(matchinfo(event_search)) as rank, room_id, event_id" " FROM event_search" @@ -510,9 +512,7 @@ async def search_rooms( Each match as a dictionary. """ clauses = [] - - search_query = _parse_query(self.database_engine, search_term) - + args = [] # Make sure we don't explode because the person is in too many rooms. @@ -550,20 +550,22 @@ async def search_rooms( args.extend([origin_server_ts, origin_server_ts, stream]) if isinstance(self.database_engine, PostgresEngine): + search_query = _parse_query_for_pgsql(search_term) sql = ( - "SELECT ts_rank_cd(vector, to_tsquery('english', ?)) as rank," + "SELECT ts_rank_cd(vector, websearch_to_tsquery('english', ?)) as rank," " origin_server_ts, stream_ordering, room_id, event_id" " FROM event_search" - " WHERE vector @@ to_tsquery('english', ?) AND " + " WHERE vector @@ websearch_to_tsquery('english', ?) AND " ) args = [search_query, search_query] + args count_sql = ( "SELECT room_id, count(*) as count FROM event_search" - " WHERE vector @@ to_tsquery('english', ?) AND " + " WHERE vector @@ websearch_to_tsquery('english', ?) AND " ) count_args = [search_query] + count_args elif isinstance(self.database_engine, Sqlite3Engine): + # We use CROSS JOIN here to ensure we use the right indexes. # https://sqlite.org/optoverview.html#crossjoin # @@ -582,6 +584,7 @@ async def search_rooms( " CROSS JOIN events USING (event_id)" " WHERE " ) + search_query = _parse_query_for_sqlite(search_term) args = [search_query] + args count_sql = ( @@ -696,7 +699,7 @@ def f(txn): while stop_sel in value: stop_sel += ">" - query = "SELECT ts_headline(?, to_tsquery('english', ?), %s)" % ( + query = "SELECT ts_headline(?, websearch_to_tsquery('english', ?), %s)" % ( _to_postgres_options( { "StartSel": start_sel, @@ -727,20 +730,18 @@ def _to_postgres_options(options_dict): return "'%s'" % (",".join("%s=%s" % (k, v) for k, v in options_dict.items()),) -def _parse_query(database_engine, search_term): +def _parse_query_for_sqlite(search_term: str) -> str: """Takes a plain unicode string from the user and converts it into a form - that can be passed to database. - We use this so that we can add prefix matching, which isn't something - that is supported by default. + that can be passed to sqllite's matchinfo(). """ # Pull out the individual words, discarding any non-word characters. results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) + return " & ".join(results) + - if isinstance(database_engine, PostgresEngine): - return " & ".join(result + ":*" for result in results) - elif isinstance(database_engine, Sqlite3Engine): - return " & ".join(result + "*" for result in results) - else: - # This should be unreachable. - raise Exception("Unrecognized database engine") +def _parse_query_for_pgsql(search_term: str) -> str: + """Takes a plain unicode string from the user and converts it into a form + that can be passed to pgsql's websearch_to_tsquery. + """ + return search_term \ No newline at end of file diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 8971ecccbd6d..242557d86f46 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from unittest.case import SkipTest import synapse.rest.admin from synapse.rest.client import login, room from synapse.storage.engines import PostgresEngine @@ -72,3 +73,41 @@ def test_null_byte(self): ) if isinstance(store.database_engine, PostgresEngine): self.assertIn("alice", result.get("highlights")) + + def test_web_search_for_phrase(self): + """ + Test searching for phrases using typical web search syntax, as per pgsql's phraseto_tsquery + """ + + store = self.hs.get_datastore() + if isinstance(store.database_engine, PostgresEngine): + raise SkipTest("Test only applies when PGSQL is used as the database") + + phrase = "the quick brown fox jumped over the lazy dog" + cases = [ + ("brown", True), + ("quick brown", True), + ("brown quick", True), + ("\"brown quick\"", False), + ("\"quick brown\"", True), + ("\"quick fox\"", False), + ("furphy OR fox", True), + ("nope OR doublenope", False), + ("-fox", False), + ("-nope", True), + ] + + # Register a user and create a room, create some messages + self.register_user("alice", "password") + access_token = self.login("alice", "password") + room_id = self.helper.create_room_as("alice", tok=access_token) + + # Send the phrase as a message and check it was created + response = self.helper.send(room_id, phrase, tok=access_token) + self.assertIn("event_id", response) + + # Run all the test cases + for query, has_results in cases: + result = self.get_success(store.search_msgs([room_id], query, ["content.body"])) + self.assertEquals(result.get("count"), 1 if has_results else 0, query) + \ No newline at end of file From fea2848a3981206b47c4d58438aab8d78df94ba7 Mon Sep 17 00:00:00 2001 From: James Salter Date: Wed, 22 Dec 2021 14:43:18 +1100 Subject: [PATCH 02/53] Support fallback using plainto_tsquery --- synapse/storage/databases/main/search.py | 35 ++++++++------ synapse/storage/engines/postgres.py | 4 ++ tests/storage/test_room_search.py | 59 ++++++++++++++++++++++-- 3 files changed, 80 insertions(+), 18 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index c2bffe149cf2..650b15b82a4a 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -15,7 +15,7 @@ import logging import re from collections import namedtuple -from typing import TYPE_CHECKING, Collection, Iterable, List, Optional, Set +from typing import TYPE_CHECKING, Collection, Iterable, List, Optional, Set, Tuple from synapse.api.errors import SynapseError from synapse.events import EventBase @@ -410,21 +410,20 @@ async def search_msgs(self, room_ids, search_term, keys): count_clauses = clauses if isinstance(self.database_engine, PostgresEngine): - search_query = _parse_query_for_pgsql(search_term) - print(search_query) + search_query, tsquery_func = _parse_query_for_pgsql(search_term, self.database_engine) sql = ( - "SELECT ts_rank_cd(vector, websearch_to_tsquery('english', ?)) AS rank," + f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) AS rank," " room_id, event_id" " FROM event_search" - " WHERE vector @@ websearch_to_tsquery('english', ?)" + f" WHERE vector @@ {tsquery_func}('english', ?)" ) args = [search_query, search_query] + args count_sql = ( "SELECT room_id, count(*) as count FROM event_search" - " WHERE vector @@ websearch_to_tsquery('english', ?)" + f" WHERE vector @@ {tsquery_func}('english', ?)" ) - count_args = [search_query] + count_args + count_args = [search_query] + count_args elif isinstance(self.database_engine, Sqlite3Engine): search_query = _parse_query_for_sqlite(search_term) @@ -480,7 +479,7 @@ async def search_msgs(self, room_ids, search_term, keys): ) count = sum(row["count"] for row in count_results if row["room_id"] in room_ids) - + return { "results": [ {"event": event_map[r["event_id"]], "rank": r["rank"]} @@ -550,18 +549,18 @@ async def search_rooms( args.extend([origin_server_ts, origin_server_ts, stream]) if isinstance(self.database_engine, PostgresEngine): - search_query = _parse_query_for_pgsql(search_term) + search_query, tsquery_func = _parse_query_for_pgsql(search_term, self.database_engine) sql = ( - "SELECT ts_rank_cd(vector, websearch_to_tsquery('english', ?)) as rank," + f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) as rank," " origin_server_ts, stream_ordering, room_id, event_id" " FROM event_search" - " WHERE vector @@ websearch_to_tsquery('english', ?) AND " + f" WHERE vector @@ {tsquery_func}('english', ?) AND " ) args = [search_query, search_query] + args count_sql = ( "SELECT room_id, count(*) as count FROM event_search" - " WHERE vector @@ websearch_to_tsquery('english', ?) AND " + f" WHERE vector @@ {tsquery_func}('english', ?) AND " ) count_args = [search_query] + count_args elif isinstance(self.database_engine, Sqlite3Engine): @@ -740,8 +739,14 @@ def _parse_query_for_sqlite(search_term: str) -> str: return " & ".join(results) -def _parse_query_for_pgsql(search_term: str) -> str: +def _parse_query_for_pgsql(search_term: str, engine: PostgresEngine) -> Tuple[str, str]: """Takes a plain unicode string from the user and converts it into a form that can be passed to pgsql's websearch_to_tsquery. - """ - return search_term \ No newline at end of file + """ + return search_term, _get_tsquery_func(engine) + +def _get_tsquery_func(engine: PostgresEngine) -> str: + if engine.supports_websearch_to_tsquery: + return "websearch_to_tsquery" + else: + return "plainto_tsquery" \ No newline at end of file diff --git a/synapse/storage/engines/postgres.py b/synapse/storage/engines/postgres.py index 30f948a0f77d..b742733878c7 100644 --- a/synapse/storage/engines/postgres.py +++ b/synapse/storage/engines/postgres.py @@ -138,6 +138,10 @@ def supports_returning(self) -> bool: """Do we support the `RETURNING` clause in insert/update/delete?""" return True + @property + def supports_websearch_to_tsquery(self) -> bool: + return int(self.server_version.split(".")[0]) >= 11 + def is_deadlock(self, error): if isinstance(error, self.module.DatabaseError): # https://www.postgresql.org/docs/current/static/errcodes-appendix.html diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 242557d86f46..7bd8ac3e5336 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -13,6 +13,7 @@ # limitations under the License. from unittest.case import SkipTest +from unittest.mock import PropertyMock, patch import synapse.rest.admin from synapse.rest.client import login, room from synapse.storage.engines import PostgresEngine @@ -74,16 +75,28 @@ def test_null_byte(self): if isinstance(store.database_engine, PostgresEngine): self.assertIn("alice", result.get("highlights")) + +class PostgresMessageSearchTest(HomeserverTestCase): + servlets = [ + synapse.rest.admin.register_servlets_for_client_rest_resource, + login.register_servlets, + room.register_servlets, + ] + def test_web_search_for_phrase(self): """ - Test searching for phrases using typical web search syntax, as per pgsql's phraseto_tsquery + Test searching for phrases using typical web search syntax, as per pgsql's websearch_to_tsquery. + This test is skipped unless the PG instance supports websearch_to_tsquery. """ store = self.hs.get_datastore() - if isinstance(store.database_engine, PostgresEngine): + if not isinstance(store.database_engine, PostgresEngine): raise SkipTest("Test only applies when PGSQL is used as the database") + + if not store.database_engine.supports_websearch_to_tsquery: + raise SkipTest("Test only applies when PGSQL supporting websearch_to_tsquery is used as the database") - phrase = "the quick brown fox jumped over the lazy dog" + phrase = "the quick brown fox jumps over the lazy dog" cases = [ ("brown", True), ("quick brown", True), @@ -110,4 +123,44 @@ def test_web_search_for_phrase(self): for query, has_results in cases: result = self.get_success(store.search_msgs([room_id], query, ["content.body"])) self.assertEquals(result.get("count"), 1 if has_results else 0, query) + + def test_plain_search_for_phrase(self): + """ + Test searching for phrases using plainto_tsquery, which is used when websearch_to_tsquery isn't + supported by the PG version. + """ + + store = self.hs.get_datastore() + if not isinstance(store.database_engine, PostgresEngine): + raise SkipTest("Test only applies when PGSQL is used as the database") + + phrase = "the quick brown fox jumps over the lazy dog" + cases = [ + ("nope", False), + ("brown", True), + ("quick brown", True), + ("brown quick", True), + ("brown nope", False), + ("furphy OR fox", False), # syntax not supported + ("\"quick brown\"", True), # syntax not supported, but strips quotes + ("-nope", False), # syntax not supported + ] + + # Register a user and create a room, create some messages + self.register_user("alice", "password") + access_token = self.login("alice", "password") + room_id = self.helper.create_room_as("alice", tok=access_token) + + # Send the phrase as a message and check it was created + response = self.helper.send(room_id, phrase, tok=access_token) + self.assertIn("event_id", response) + + with patch("synapse.storage.engines.postgres.PostgresEngine.supports_websearch_to_tsquery", + new_callable=PropertyMock) as supports_websearch_to_tsquery: + supports_websearch_to_tsquery.return_value = False + + # Run all the test cases + for query, has_results in cases: + result = self.get_success(store.search_msgs([room_id], query, ["content.body"])) + self.assertEquals(result.get("count"), 1 if has_results else 0, query) \ No newline at end of file From ccb0d6cefe2ed1f10dd06b6aa664b929dda08624 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 23 Dec 2021 11:32:51 +1100 Subject: [PATCH 03/53] cleanup --- synapse/storage/databases/main/search.py | 19 ++++++++++++------- tests/storage/test_room_search.py | 24 +++++++++++++++--------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 650b15b82a4a..3c31865dc67a 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -740,13 +740,18 @@ def _parse_query_for_sqlite(search_term: str) -> str: def _parse_query_for_pgsql(search_term: str, engine: PostgresEngine) -> Tuple[str, str]: - """Takes a plain unicode string from the user and converts it into a form - that can be passed to pgsql's websearch_to_tsquery. - """ - return search_term, _get_tsquery_func(engine) + """ + Return a tuple of (parsed search_term, tsquery func to use). + + The parsed search_term will be transformed into a syntax suitable for passing as an + argument to the tsquery func. + """ -def _get_tsquery_func(engine: PostgresEngine) -> str: if engine.supports_websearch_to_tsquery: - return "websearch_to_tsquery" + return search_term, "websearch_to_tsquery" else: - return "plainto_tsquery" \ No newline at end of file + # Pull out the individual words, discarding any non-word characters. + results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) + # Join words with & (AND), using prefix matching for each word + parsed_query = " & ".join(result + ":*" for result in results) + return parsed_query, "to_tsquery" diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 7bd8ac3e5336..1206521ce146 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -85,16 +85,16 @@ class PostgresMessageSearchTest(HomeserverTestCase): def test_web_search_for_phrase(self): """ - Test searching for phrases using typical web search syntax, as per pgsql's websearch_to_tsquery. - This test is skipped unless the PG instance supports websearch_to_tsquery. + Test searching for phrases using typical web search syntax, as per postgres' websearch_to_tsquery. + This test is skipped unless the postgres instance supports websearch_to_tsquery. """ store = self.hs.get_datastore() if not isinstance(store.database_engine, PostgresEngine): - raise SkipTest("Test only applies when PGSQL is used as the database") + raise SkipTest("Test only applies when postgres is used as the database") if not store.database_engine.supports_websearch_to_tsquery: - raise SkipTest("Test only applies when PGSQL supporting websearch_to_tsquery is used as the database") + raise SkipTest("Test only applies when postgres supporting websearch_to_tsquery is used as the database") phrase = "the quick brown fox jumps over the lazy dog" cases = [ @@ -124,15 +124,15 @@ def test_web_search_for_phrase(self): result = self.get_success(store.search_msgs([room_id], query, ["content.body"])) self.assertEquals(result.get("count"), 1 if has_results else 0, query) - def test_plain_search_for_phrase(self): + def test_non_web_search_for_phrase(self): """ - Test searching for phrases using plainto_tsquery, which is used when websearch_to_tsquery isn't - supported by the PG version. + Test searching for phrases without using web search, which is used when websearch_to_tsquery isn't + supported by the current postgres version. """ store = self.hs.get_datastore() if not isinstance(store.database_engine, PostgresEngine): - raise SkipTest("Test only applies when PGSQL is used as the database") + raise SkipTest("Test only applies when postgres is used as the database") phrase = "the quick brown fox jumps over the lazy dog" cases = [ @@ -155,6 +155,7 @@ def test_plain_search_for_phrase(self): response = self.helper.send(room_id, phrase, tok=access_token) self.assertIn("event_id", response) + # Patch supports_websearch_to_tsquery to always return False to ensure we're testing the plainto_tsquery path. with patch("synapse.storage.engines.postgres.PostgresEngine.supports_websearch_to_tsquery", new_callable=PropertyMock) as supports_websearch_to_tsquery: supports_websearch_to_tsquery.return_value = False @@ -163,4 +164,9 @@ def test_plain_search_for_phrase(self): for query, has_results in cases: result = self.get_success(store.search_msgs([room_id], query, ["content.body"])) self.assertEquals(result.get("count"), 1 if has_results else 0, query) - \ No newline at end of file + +class PostgresRoomSearchTest(HomeserverTestCase): + # Register a user and create a room + self.register_user("alice", "password") + access_token = self.login("alice", "password") + room_id = self.helper.create_room_as("alice", tok=access_token \ No newline at end of file From 409afd6b94f4623369d4c5c22da759642a82bf54 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 23 Dec 2021 12:06:03 +1100 Subject: [PATCH 04/53] add tests for search_rooms --- synapse/storage/databases/main/search.py | 41 +++++++------ tests/storage/test_room_search.py | 74 ++++++++++++++---------- 2 files changed, 69 insertions(+), 46 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 3c31865dc67a..9862e26bbf04 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -388,7 +388,7 @@ async def search_msgs(self, room_ids, search_term, keys): list of dicts """ clauses = [] - + args = [] # Make sure we don't explode because the person is in too many rooms. @@ -409,8 +409,10 @@ async def search_msgs(self, room_ids, search_term, keys): count_args = args count_clauses = clauses - if isinstance(self.database_engine, PostgresEngine): - search_query, tsquery_func = _parse_query_for_pgsql(search_term, self.database_engine) + if isinstance(self.database_engine, PostgresEngine): + search_query, tsquery_func = _parse_query_for_pgsql( + search_term, self.database_engine + ) sql = ( f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) AS rank," " room_id, event_id" @@ -423,7 +425,7 @@ async def search_msgs(self, room_ids, search_term, keys): "SELECT room_id, count(*) as count FROM event_search" f" WHERE vector @@ {tsquery_func}('english', ?)" ) - count_args = [search_query] + count_args + count_args = [search_query] + count_args elif isinstance(self.database_engine, Sqlite3Engine): search_query = _parse_query_for_sqlite(search_term) @@ -479,7 +481,7 @@ async def search_msgs(self, room_ids, search_term, keys): ) count = sum(row["count"] for row in count_results if row["room_id"] in room_ids) - + return { "results": [ {"event": event_map[r["event_id"]], "rank": r["rank"]} @@ -511,7 +513,7 @@ async def search_rooms( Each match as a dictionary. """ clauses = [] - + args = [] # Make sure we don't explode because the person is in too many rooms. @@ -549,7 +551,9 @@ async def search_rooms( args.extend([origin_server_ts, origin_server_ts, stream]) if isinstance(self.database_engine, PostgresEngine): - search_query, tsquery_func = _parse_query_for_pgsql(search_term, self.database_engine) + search_query, tsquery_func = _parse_query_for_pgsql( + search_term, self.database_engine + ) sql = ( f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) as rank," " origin_server_ts, stream_ordering, room_id, event_id" @@ -564,7 +568,7 @@ async def search_rooms( ) count_args = [search_query] + count_args elif isinstance(self.database_engine, Sqlite3Engine): - + # We use CROSS JOIN here to ensure we use the right indexes. # https://sqlite.org/optoverview.html#crossjoin # @@ -698,13 +702,16 @@ def f(txn): while stop_sel in value: stop_sel += ">" - query = "SELECT ts_headline(?, websearch_to_tsquery('english', ?), %s)" % ( - _to_postgres_options( - { - "StartSel": start_sel, - "StopSel": stop_sel, - "MaxFragments": "50", - } + query = ( + "SELECT ts_headline(?, websearch_to_tsquery('english', ?), %s)" + % ( + _to_postgres_options( + { + "StartSel": start_sel, + "StopSel": stop_sel, + "MaxFragments": "50", + } + ) ) ) txn.execute(query, (value, search_query)) @@ -731,7 +738,7 @@ def _to_postgres_options(options_dict): def _parse_query_for_sqlite(search_term: str) -> str: """Takes a plain unicode string from the user and converts it into a form - that can be passed to sqllite's matchinfo(). + that can be passed to sqllite's matchinfo(). """ # Pull out the individual words, discarding any non-word characters. @@ -743,7 +750,7 @@ def _parse_query_for_pgsql(search_term: str, engine: PostgresEngine) -> Tuple[st """ Return a tuple of (parsed search_term, tsquery func to use). - The parsed search_term will be transformed into a syntax suitable for passing as an + The parsed search_term will be transformed into a syntax suitable for passing as an argument to the tsquery func. """ diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 1206521ce146..ffdcfc102d59 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -85,25 +85,27 @@ class PostgresMessageSearchTest(HomeserverTestCase): def test_web_search_for_phrase(self): """ - Test searching for phrases using typical web search syntax, as per postgres' websearch_to_tsquery. + Test searching for phrases using typical web search syntax, as per postgres' websearch_to_tsquery. This test is skipped unless the postgres instance supports websearch_to_tsquery. """ store = self.hs.get_datastore() if not isinstance(store.database_engine, PostgresEngine): raise SkipTest("Test only applies when postgres is used as the database") - + if not store.database_engine.supports_websearch_to_tsquery: - raise SkipTest("Test only applies when postgres supporting websearch_to_tsquery is used as the database") + raise SkipTest( + "Test only applies when postgres supporting websearch_to_tsquery is used as the database" + ) phrase = "the quick brown fox jumps over the lazy dog" cases = [ ("brown", True), - ("quick brown", True), + ("quick brown", True), ("brown quick", True), - ("\"brown quick\"", False), - ("\"quick brown\"", True), - ("\"quick fox\"", False), + ('"brown quick"', False), + ('"quick brown"', True), + ('"quick fox"', False), ("furphy OR fox", True), ("nope OR doublenope", False), ("-fox", False), @@ -114,26 +116,35 @@ def test_web_search_for_phrase(self): self.register_user("alice", "password") access_token = self.login("alice", "password") room_id = self.helper.create_room_as("alice", tok=access_token) - + # Send the phrase as a message and check it was created response = self.helper.send(room_id, phrase, tok=access_token) self.assertIn("event_id", response) - - # Run all the test cases + + # Run all the test cases versus search_msgs + for query, has_results in cases: + result = self.get_success( + store.search_msgs([room_id], query, ["content.body"]) + ) + self.assertEquals(result.get("count"), 1 if has_results else 0, query) + + # Run them again versus search_rooms for query, has_results in cases: - result = self.get_success(store.search_msgs([room_id], query, ["content.body"])) + result = self.get_success( + store.search_rooms([room_id], query, ["content.body"], 10) + ) self.assertEquals(result.get("count"), 1 if has_results else 0, query) def test_non_web_search_for_phrase(self): """ - Test searching for phrases without using web search, which is used when websearch_to_tsquery isn't - supported by the current postgres version. + Test searching for phrases without using web search, which is used when websearch_to_tsquery isn't + supported by the current postgres version. """ - + store = self.hs.get_datastore() if not isinstance(store.database_engine, PostgresEngine): raise SkipTest("Test only applies when postgres is used as the database") - + phrase = "the quick brown fox jumps over the lazy dog" cases = [ ("nope", False), @@ -141,32 +152,37 @@ def test_non_web_search_for_phrase(self): ("quick brown", True), ("brown quick", True), ("brown nope", False), - ("furphy OR fox", False), # syntax not supported - ("\"quick brown\"", True), # syntax not supported, but strips quotes - ("-nope", False), # syntax not supported + ("furphy OR fox", False), # syntax not supported + ('"quick brown"', True), # syntax not supported, but strips quotes + ("-nope", False), # syntax not supported ] # Register a user and create a room, create some messages self.register_user("alice", "password") access_token = self.login("alice", "password") room_id = self.helper.create_room_as("alice", tok=access_token) - + # Send the phrase as a message and check it was created response = self.helper.send(room_id, phrase, tok=access_token) self.assertIn("event_id", response) - + # Patch supports_websearch_to_tsquery to always return False to ensure we're testing the plainto_tsquery path. - with patch("synapse.storage.engines.postgres.PostgresEngine.supports_websearch_to_tsquery", - new_callable=PropertyMock) as supports_websearch_to_tsquery: + with patch( + "synapse.storage.engines.postgres.PostgresEngine.supports_websearch_to_tsquery", + new_callable=PropertyMock, + ) as supports_websearch_to_tsquery: supports_websearch_to_tsquery.return_value = False - # Run all the test cases + # Run all the test cases for query, has_results in cases: - result = self.get_success(store.search_msgs([room_id], query, ["content.body"])) + result = self.get_success( + store.search_msgs([room_id], query, ["content.body"]) + ) self.assertEquals(result.get("count"), 1 if has_results else 0, query) -class PostgresRoomSearchTest(HomeserverTestCase): - # Register a user and create a room - self.register_user("alice", "password") - access_token = self.login("alice", "password") - room_id = self.helper.create_room_as("alice", tok=access_token \ No newline at end of file + # Run them again versus search_rooms + for query, has_results in cases: + result = self.get_success( + store.search_rooms([room_id], query, ["content.body"], 10) + ) + self.assertEquals(result.get("count"), 1 if has_results else 0, query) From ac002cbf6cb16d1f09f05f5f2865628df6b04dc5 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 23 Dec 2021 12:07:13 +1100 Subject: [PATCH 05/53] deflake --- synapse/storage/databases/main/search.py | 2 +- tests/storage/test_room_search.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 9862e26bbf04..7f4e7e69c825 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -26,7 +26,7 @@ LoggingTransaction, ) from synapse.storage.databases.main.events_worker import EventRedactBehaviour -from synapse.storage.engines import PostgresEngine, Sqlite3Engine, BaseDatabaseEngine +from synapse.storage.engines import PostgresEngine, Sqlite3Engine if TYPE_CHECKING: from synapse.server import HomeServer diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index ffdcfc102d59..a3810f4c06c1 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -14,6 +14,7 @@ from unittest.case import SkipTest from unittest.mock import PropertyMock, patch + import synapse.rest.admin from synapse.rest.client import login, room from synapse.storage.engines import PostgresEngine From bbbb6517aca42b50e4572a2741c9f65499b44a3e Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 23 Dec 2021 12:09:50 +1100 Subject: [PATCH 06/53] improve docstring --- synapse/storage/databases/main/search.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 7f4e7e69c825..90671c8b5ff2 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -747,11 +747,14 @@ def _parse_query_for_sqlite(search_term: str) -> str: def _parse_query_for_pgsql(search_term: str, engine: PostgresEngine) -> Tuple[str, str]: - """ - Return a tuple of (parsed search_term, tsquery func to use). - - The parsed search_term will be transformed into a syntax suitable for passing as an - argument to the tsquery func. + """Selects a tsquery_* func to use and transforms the search_term into syntax appropriate for it. + + Args: + search_term: A user supplied search query. + engine: The database engine. + + Returns: + A tuple of (parsed search_term, tsquery func to use). """ if engine.supports_websearch_to_tsquery: From 02defb257ac5a40eb011fdb71e88c246ced8b7d9 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 23 Dec 2021 12:12:31 +1100 Subject: [PATCH 07/53] pass tsquery_func to _find_highlights_in_postgres --- synapse/storage/databases/main/search.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 90671c8b5ff2..589b836b7e80 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -472,7 +472,7 @@ async def search_msgs(self, room_ids, search_term, keys): highlights = None if isinstance(self.database_engine, PostgresEngine): - highlights = await self._find_highlights_in_postgres(search_query, events) + highlights = await self._find_highlights_in_postgres(search_query, events, tsquery_func) count_sql += " GROUP BY room_id" @@ -633,7 +633,7 @@ async def search_rooms( highlights = None if isinstance(self.database_engine, PostgresEngine): - highlights = await self._find_highlights_in_postgres(search_query, events) + highlights = await self._find_highlights_in_postgres(search_query, events, tsquery_func) count_sql += " GROUP BY room_id" @@ -659,7 +659,7 @@ async def search_rooms( } async def _find_highlights_in_postgres( - self, search_query: str, events: List[EventBase] + self, search_query: str, events: List[EventBase], tsquery_func: str ) -> Set[str]: """Given a list of events and a search term, return a list of words that match from the content of the event. @@ -670,6 +670,7 @@ async def _find_highlights_in_postgres( Args: search_query events: A list of events + tsquery_func: The tsquery_* function to use when making queries Returns: A set of strings. @@ -703,7 +704,7 @@ def f(txn): stop_sel += ">" query = ( - "SELECT ts_headline(?, websearch_to_tsquery('english', ?), %s)" + f"SELECT ts_headline(?, {tsquery_func}('english', ?), %s)" % ( _to_postgres_options( { From 2eef5edd1f40484b3b229beafccd13404a480564 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 23 Dec 2021 12:53:11 +1100 Subject: [PATCH 08/53] Add tests for sqlite --- synapse/storage/databases/main/search.py | 10 +- tests/storage/test_room_search.py | 114 ++++++++++++----------- 2 files changed, 67 insertions(+), 57 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 589b836b7e80..aede93fe7943 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -428,7 +428,7 @@ async def search_msgs(self, room_ids, search_term, keys): count_args = [search_query] + count_args elif isinstance(self.database_engine, Sqlite3Engine): search_query = _parse_query_for_sqlite(search_term) - + sql = ( "SELECT rank(matchinfo(event_search)) as rank, room_id, event_id" " FROM event_search" @@ -440,7 +440,7 @@ async def search_msgs(self, room_ids, search_term, keys): "SELECT room_id, count(*) as count FROM event_search" " WHERE value MATCH ?" ) - count_args = [search_term] + count_args + count_args = [search_query] + count_args else: # This should be unreachable. raise Exception("Unrecognized database engine") @@ -454,7 +454,7 @@ async def search_msgs(self, room_ids, search_term, keys): # We add an arbitrary limit here to ensure we don't try to pull the # entire table from the database. sql += " ORDER BY rank DESC LIMIT 500" - + results = await self.db_pool.execute( "search_msgs", self.db_pool.cursor_to_dict, sql, *args ) @@ -594,7 +594,7 @@ async def search_rooms( "SELECT room_id, count(*) as count FROM event_search" " WHERE value MATCH ? AND " ) - count_args = [search_term] + count_args + count_args = [search_query] + count_args else: # This should be unreachable. raise Exception("Unrecognized database engine") @@ -744,7 +744,7 @@ def _parse_query_for_sqlite(search_term: str) -> str: # Pull out the individual words, discarding any non-word characters. results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) - return " & ".join(results) + return " ".join(results) def _parse_query_for_pgsql(search_term: str, engine: PostgresEngine) -> Tuple[str, str]: diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index a3810f4c06c1..770cd66a325e 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Tuple from unittest.case import SkipTest from unittest.mock import PropertyMock, patch import synapse.rest.admin from synapse.rest.client import login, room +from synapse.storage.databases.main import DataStore from synapse.storage.engines import PostgresEngine +from synapse.storage.engines.sqlite import Sqlite3Engine from tests.unittest import HomeserverTestCase @@ -77,14 +80,43 @@ def test_null_byte(self): self.assertIn("alice", result.get("highlights")) -class PostgresMessageSearchTest(HomeserverTestCase): +class MessageSearchTest(HomeserverTestCase): servlets = [ synapse.rest.admin.register_servlets_for_client_rest_resource, login.register_servlets, room.register_servlets, ] - def test_web_search_for_phrase(self): + PHRASE = "the quick brown fox jumps over the lazy dog" + + def setUp(self): + super().setUp() + + # Register a user and create a room, create some messages + self.register_user("alice", "password") + self.access_token = self.login("alice", "password") + self.room_id = self.helper.create_room_as("alice", tok=self.access_token) + + # Send the phrase as a message and check it was created + response = self.helper.send(self.room_id, self.PHRASE, tok=self.access_token) + self.assertIn("event_id", response) + + def _check_test_cases(self, store: DataStore, cases: list[Tuple[str, bool]]) -> None: + # Run all the test cases versus search_msgs + for query, has_results in cases: + result = self.get_success( + store.search_msgs([self.room_id], query, ["content.body"]) + ) + self.assertEquals(result.get("count"), 1 if has_results else 0, query) + + # Run them again versus search_rooms + for query, has_results in cases: + result = self.get_success( + store.search_rooms([self.room_id], query, ["content.body"], 10) + ) + self.assertEquals(result.get("count"), 1 if has_results else 0, query) + + def test_postgres_web_search_for_phrase(self): """ Test searching for phrases using typical web search syntax, as per postgres' websearch_to_tsquery. This test is skipped unless the postgres instance supports websearch_to_tsquery. @@ -99,13 +131,12 @@ def test_web_search_for_phrase(self): "Test only applies when postgres supporting websearch_to_tsquery is used as the database" ) - phrase = "the quick brown fox jumps over the lazy dog" cases = [ ("brown", True), ("quick brown", True), ("brown quick", True), ('"brown quick"', False), - ('"quick brown"', True), + ('"jumps over"', True), ('"quick fox"', False), ("furphy OR fox", True), ("nope OR doublenope", False), @@ -113,32 +144,11 @@ def test_web_search_for_phrase(self): ("-nope", True), ] - # Register a user and create a room, create some messages - self.register_user("alice", "password") - access_token = self.login("alice", "password") - room_id = self.helper.create_room_as("alice", tok=access_token) - - # Send the phrase as a message and check it was created - response = self.helper.send(room_id, phrase, tok=access_token) - self.assertIn("event_id", response) + self._check_test_cases(store, cases) - # Run all the test cases versus search_msgs - for query, has_results in cases: - result = self.get_success( - store.search_msgs([room_id], query, ["content.body"]) - ) - self.assertEquals(result.get("count"), 1 if has_results else 0, query) - - # Run them again versus search_rooms - for query, has_results in cases: - result = self.get_success( - store.search_rooms([room_id], query, ["content.body"], 10) - ) - self.assertEquals(result.get("count"), 1 if has_results else 0, query) - - def test_non_web_search_for_phrase(self): + def test_postgres_non_web_search_for_phrase(self): """ - Test searching for phrases without using web search, which is used when websearch_to_tsquery isn't + Test postgres searching for phrases without using web search, which is used when websearch_to_tsquery isn't supported by the current postgres version. """ @@ -146,7 +156,6 @@ def test_non_web_search_for_phrase(self): if not isinstance(store.database_engine, PostgresEngine): raise SkipTest("Test only applies when postgres is used as the database") - phrase = "the quick brown fox jumps over the lazy dog" cases = [ ("nope", False), ("brown", True), @@ -154,36 +163,37 @@ def test_non_web_search_for_phrase(self): ("brown quick", True), ("brown nope", False), ("furphy OR fox", False), # syntax not supported - ('"quick brown"', True), # syntax not supported, but strips quotes + ('"jumps over"', True), # syntax not supported, but strips quotes ("-nope", False), # syntax not supported ] - # Register a user and create a room, create some messages - self.register_user("alice", "password") - access_token = self.login("alice", "password") - room_id = self.helper.create_room_as("alice", tok=access_token) - - # Send the phrase as a message and check it was created - response = self.helper.send(room_id, phrase, tok=access_token) - self.assertIn("event_id", response) - # Patch supports_websearch_to_tsquery to always return False to ensure we're testing the plainto_tsquery path. with patch( "synapse.storage.engines.postgres.PostgresEngine.supports_websearch_to_tsquery", new_callable=PropertyMock, ) as supports_websearch_to_tsquery: supports_websearch_to_tsquery.return_value = False + self._check_test_cases(store, cases) + + def test_sqlite_search(self): + """ + Test sqlite searching for phrases. + """ + store = self.hs.get_datastore() + if not isinstance(store.database_engine, Sqlite3Engine): + raise SkipTest("Test only applies when sqlite is used as the database") + + cases = [ + ("nope", False), + ("brown", True), + ("quick brown", True), + ("brown quick", True), + ("brown nope", False), + ("furphy OR fox", True), # sqllite supports OR + ('"jumps over"', True), + ('quick fox', True), # syntax supports quotes, but we strip them out + ('"quick fox"', True), # syntax supports quotes, but we strip them out + ("-nope", False), # sqllite supports -, but we strip them out + ] - # Run all the test cases - for query, has_results in cases: - result = self.get_success( - store.search_msgs([room_id], query, ["content.body"]) - ) - self.assertEquals(result.get("count"), 1 if has_results else 0, query) - - # Run them again versus search_rooms - for query, has_results in cases: - result = self.get_success( - store.search_rooms([room_id], query, ["content.body"], 10) - ) - self.assertEquals(result.get("count"), 1 if has_results else 0, query) + self._check_test_cases(store, cases) From 28dc6427c8337c35cb152c53ef22ce199012ead5 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 23 Dec 2021 12:56:37 +1100 Subject: [PATCH 09/53] Don't preprocess sqllite queries --- synapse/storage/databases/main/search.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index aede93fe7943..abe0c8afc137 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -741,10 +741,7 @@ def _parse_query_for_sqlite(search_term: str) -> str: """Takes a plain unicode string from the user and converts it into a form that can be passed to sqllite's matchinfo(). """ - - # Pull out the individual words, discarding any non-word characters. - results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) - return " ".join(results) + return search_term def _parse_query_for_pgsql(search_term: str, engine: PostgresEngine) -> Tuple[str, str]: From 8ac3ea128c5abe711d6f0d0d604946669969fc41 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 23 Dec 2021 13:02:02 +1100 Subject: [PATCH 10/53] Also test the size of "results", as its generated by a different query to count --- synapse/storage/databases/main/search.py | 2 +- tests/storage/test_room_search.py | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index abe0c8afc137..fa4f91bf0524 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -478,7 +478,7 @@ async def search_msgs(self, room_ids, search_term, keys): count_results = await self.db_pool.execute( "search_rooms_count", self.db_pool.cursor_to_dict, count_sql, *count_args - ) + ) count = sum(row["count"] for row in count_results if row["room_id"] in room_ids) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 770cd66a325e..c1cf7e849b42 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -107,14 +107,16 @@ def _check_test_cases(self, store: DataStore, cases: list[Tuple[str, bool]]) -> result = self.get_success( store.search_msgs([self.room_id], query, ["content.body"]) ) - self.assertEquals(result.get("count"), 1 if has_results else 0, query) + self.assertEquals(result["count"], 1 if has_results else 0, query) + self.assertEquals(len(result["results"]), 1 if has_results else 0, query) # Run them again versus search_rooms for query, has_results in cases: result = self.get_success( store.search_rooms([self.room_id], query, ["content.body"], 10) ) - self.assertEquals(result.get("count"), 1 if has_results else 0, query) + self.assertEquals(result["count"], 1 if has_results else 0, query) + self.assertEquals(len(result["results"]), 1 if has_results else 0, query) def test_postgres_web_search_for_phrase(self): """ @@ -162,9 +164,9 @@ def test_postgres_non_web_search_for_phrase(self): ("quick brown", True), ("brown quick", True), ("brown nope", False), - ("furphy OR fox", False), # syntax not supported + ("furphy OR fox", False), # syntax not supported, quotes will be ignored ('"jumps over"', True), # syntax not supported, but strips quotes - ("-nope", False), # syntax not supported + ("-nope", False), # syntax not supported, - will be ignored ] # Patch supports_websearch_to_tsquery to always return False to ensure we're testing the plainto_tsquery path. @@ -190,10 +192,9 @@ def test_sqlite_search(self): ("brown quick", True), ("brown nope", False), ("furphy OR fox", True), # sqllite supports OR - ('"jumps over"', True), - ('quick fox', True), # syntax supports quotes, but we strip them out - ('"quick fox"', True), # syntax supports quotes, but we strip them out - ("-nope", False), # sqllite supports -, but we strip them out + ('"jumps over"', True), + ('"quick fox"', False), # syntax supports quotes + ("fox NOT nope", True), # sqllite supports NOT ] self._check_test_cases(store, cases) From 5198cb2a1e549191d9a64f1bdd3a702865fd3d92 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 23 Dec 2021 13:02:52 +1100 Subject: [PATCH 11/53] fix comment --- tests/storage/test_room_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index c1cf7e849b42..f59cc0297401 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -164,8 +164,8 @@ def test_postgres_non_web_search_for_phrase(self): ("quick brown", True), ("brown quick", True), ("brown nope", False), - ("furphy OR fox", False), # syntax not supported, quotes will be ignored - ('"jumps over"', True), # syntax not supported, but strips quotes + ("furphy OR fox", False), # syntax not supported, OR will be ignored as it'll be between & + ('"jumps over"', True), # syntax not supported, we strip quotes ("-nope", False), # syntax not supported, - will be ignored ] From b3c7e0d13c9ad5311022861f429011f33e9102e0 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 23 Dec 2021 13:08:40 +1100 Subject: [PATCH 12/53] Use plainto_tsquery instead of crafting the query ourselves --- synapse/storage/databases/main/search.py | 7 ++----- tests/storage/test_room_search.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index fa4f91bf0524..1500c75d4f31 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -758,8 +758,5 @@ def _parse_query_for_pgsql(search_term: str, engine: PostgresEngine) -> Tuple[st if engine.supports_websearch_to_tsquery: return search_term, "websearch_to_tsquery" else: - # Pull out the individual words, discarding any non-word characters. - results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) - # Join words with & (AND), using prefix matching for each word - parsed_query = " & ".join(result + ":*" for result in results) - return parsed_query, "to_tsquery" + return search_term, "plainto_tsquery" + \ No newline at end of file diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index f59cc0297401..fe773d5be3e1 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -165,7 +165,7 @@ def test_postgres_non_web_search_for_phrase(self): ("brown quick", True), ("brown nope", False), ("furphy OR fox", False), # syntax not supported, OR will be ignored as it'll be between & - ('"jumps over"', True), # syntax not supported, we strip quotes + ('"jumps over"', True), # syntax not supported, quotes are ignored ("-nope", False), # syntax not supported, - will be ignored ] From 10f181a9c22b13057344bf697fa99ff33d9039fa Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 23 Dec 2021 13:12:28 +1100 Subject: [PATCH 13/53] black --- synapse/storage/databases/main/search.py | 34 ++++++++++++------------ tests/storage/test_room_search.py | 17 +++++++----- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 1500c75d4f31..b62e86808e1e 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -428,7 +428,7 @@ async def search_msgs(self, room_ids, search_term, keys): count_args = [search_query] + count_args elif isinstance(self.database_engine, Sqlite3Engine): search_query = _parse_query_for_sqlite(search_term) - + sql = ( "SELECT rank(matchinfo(event_search)) as rank, room_id, event_id" " FROM event_search" @@ -454,7 +454,7 @@ async def search_msgs(self, room_ids, search_term, keys): # We add an arbitrary limit here to ensure we don't try to pull the # entire table from the database. sql += " ORDER BY rank DESC LIMIT 500" - + results = await self.db_pool.execute( "search_msgs", self.db_pool.cursor_to_dict, sql, *args ) @@ -472,13 +472,15 @@ async def search_msgs(self, room_ids, search_term, keys): highlights = None if isinstance(self.database_engine, PostgresEngine): - highlights = await self._find_highlights_in_postgres(search_query, events, tsquery_func) + highlights = await self._find_highlights_in_postgres( + search_query, events, tsquery_func + ) count_sql += " GROUP BY room_id" count_results = await self.db_pool.execute( "search_rooms_count", self.db_pool.cursor_to_dict, count_sql, *count_args - ) + ) count = sum(row["count"] for row in count_results if row["room_id"] in room_ids) @@ -633,7 +635,9 @@ async def search_rooms( highlights = None if isinstance(self.database_engine, PostgresEngine): - highlights = await self._find_highlights_in_postgres(search_query, events, tsquery_func) + highlights = await self._find_highlights_in_postgres( + search_query, events, tsquery_func + ) count_sql += " GROUP BY room_id" @@ -703,16 +707,13 @@ def f(txn): while stop_sel in value: stop_sel += ">" - query = ( - f"SELECT ts_headline(?, {tsquery_func}('english', ?), %s)" - % ( - _to_postgres_options( - { - "StartSel": start_sel, - "StopSel": stop_sel, - "MaxFragments": "50", - } - ) + query = f"SELECT ts_headline(?, {tsquery_func}('english', ?), %s)" % ( + _to_postgres_options( + { + "StartSel": start_sel, + "StopSel": stop_sel, + "MaxFragments": "50", + } ) ) txn.execute(query, (value, search_query)) @@ -746,7 +747,7 @@ def _parse_query_for_sqlite(search_term: str) -> str: def _parse_query_for_pgsql(search_term: str, engine: PostgresEngine) -> Tuple[str, str]: """Selects a tsquery_* func to use and transforms the search_term into syntax appropriate for it. - + Args: search_term: A user supplied search query. engine: The database engine. @@ -759,4 +760,3 @@ def _parse_query_for_pgsql(search_term: str, engine: PostgresEngine) -> Tuple[st return search_term, "websearch_to_tsquery" else: return search_term, "plainto_tsquery" - \ No newline at end of file diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index fe773d5be3e1..cc4eea1d232e 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -89,7 +89,7 @@ class MessageSearchTest(HomeserverTestCase): PHRASE = "the quick brown fox jumps over the lazy dog" - def setUp(self): + def setUp(self): super().setUp() # Register a user and create a room, create some messages @@ -101,7 +101,9 @@ def setUp(self): response = self.helper.send(self.room_id, self.PHRASE, tok=self.access_token) self.assertIn("event_id", response) - def _check_test_cases(self, store: DataStore, cases: list[Tuple[str, bool]]) -> None: + def _check_test_cases( + self, store: DataStore, cases: list[Tuple[str, bool]] + ) -> None: # Run all the test cases versus search_msgs for query, has_results in cases: result = self.get_success( @@ -164,7 +166,10 @@ def test_postgres_non_web_search_for_phrase(self): ("quick brown", True), ("brown quick", True), ("brown nope", False), - ("furphy OR fox", False), # syntax not supported, OR will be ignored as it'll be between & + ( + "furphy OR fox", + False, + ), # syntax not supported, OR will be ignored as it'll be between & ('"jumps over"', True), # syntax not supported, quotes are ignored ("-nope", False), # syntax not supported, - will be ignored ] @@ -175,7 +180,7 @@ def test_postgres_non_web_search_for_phrase(self): new_callable=PropertyMock, ) as supports_websearch_to_tsquery: supports_websearch_to_tsquery.return_value = False - self._check_test_cases(store, cases) + self._check_test_cases(store, cases) def test_sqlite_search(self): """ @@ -192,8 +197,8 @@ def test_sqlite_search(self): ("brown quick", True), ("brown nope", False), ("furphy OR fox", True), # sqllite supports OR - ('"jumps over"', True), - ('"quick fox"', False), # syntax supports quotes + ('"jumps over"', True), + ('"quick fox"', False), # syntax supports quotes ("fox NOT nope", True), # sqllite supports NOT ] From 05a0cdd9adbe63d9afc7926d45fa943661dbf38f Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 23 Dec 2021 14:12:56 +1100 Subject: [PATCH 14/53] Add feature file --- changelog.d/11635.feature | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/11635.feature diff --git a/changelog.d/11635.feature b/changelog.d/11635.feature new file mode 100644 index 000000000000..94c8a83212d1 --- /dev/null +++ b/changelog.d/11635.feature @@ -0,0 +1 @@ +Allow use of postgres and sqllite full-text search operators in search queries. \ No newline at end of file From b0417e9ef5ac528b2d14b387a7b62fd9eda3c80e Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 23 Dec 2021 14:48:12 +1100 Subject: [PATCH 15/53] Use common cases for all tests --- synapse/storage/databases/main/search.py | 1 - tests/storage/test_room_search.py | 27 ++++++++++-------------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index b62e86808e1e..d9d8ea5cadbe 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -483,7 +483,6 @@ async def search_msgs(self, room_ids, search_term, keys): ) count = sum(row["count"] for row in count_results if row["room_id"] in room_ids) - return { "results": [ {"event": event_map[r["event_id"]], "rank": r["rank"]} diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index cc4eea1d232e..57722aa91c32 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -88,6 +88,14 @@ class MessageSearchTest(HomeserverTestCase): ] PHRASE = "the quick brown fox jumps over the lazy dog" + COMMON_CASES = [ + ("nope", False), + ("brown", True), + ("quick brown", True), + ("brown quick", True), + ("jump", True), # tests stemming + ("brown nope", False), + ] def setUp(self): super().setUp() @@ -135,10 +143,7 @@ def test_postgres_web_search_for_phrase(self): "Test only applies when postgres supporting websearch_to_tsquery is used as the database" ) - cases = [ - ("brown", True), - ("quick brown", True), - ("brown quick", True), + cases = self.COMMON_CASES + [ ('"brown quick"', False), ('"jumps over"', True), ('"quick fox"', False), @@ -160,12 +165,7 @@ def test_postgres_non_web_search_for_phrase(self): if not isinstance(store.database_engine, PostgresEngine): raise SkipTest("Test only applies when postgres is used as the database") - cases = [ - ("nope", False), - ("brown", True), - ("quick brown", True), - ("brown quick", True), - ("brown nope", False), + cases = self.COMMON_CASES + [ ( "furphy OR fox", False, @@ -190,12 +190,7 @@ def test_sqlite_search(self): if not isinstance(store.database_engine, Sqlite3Engine): raise SkipTest("Test only applies when sqlite is used as the database") - cases = [ - ("nope", False), - ("brown", True), - ("quick brown", True), - ("brown quick", True), - ("brown nope", False), + cases = self.COMMON_CASES + [ ("furphy OR fox", True), # sqllite supports OR ('"jumps over"', True), ('"quick fox"', False), # syntax supports quotes From d30a211a37f3f7701f82592eb7a8cfb6343a720f Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 26 May 2022 15:10:41 +0100 Subject: [PATCH 16/53] fix for removal of get_datastore() --- tests/storage/test_room_search.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index a9cf50f0ae9a..8e96ade9e42b 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -247,7 +247,7 @@ def test_postgres_web_search_for_phrase(self): This test is skipped unless the postgres instance supports websearch_to_tsquery. """ - store = self.hs.get_datastore() + store = self.hs.get_datastores().main if not isinstance(store.database_engine, PostgresEngine): raise SkipTest("Test only applies when postgres is used as the database") @@ -274,7 +274,7 @@ def test_postgres_non_web_search_for_phrase(self): supported by the current postgres version. """ - store = self.hs.get_datastore() + store = self.hs.get_datastores().main if not isinstance(store.database_engine, PostgresEngine): raise SkipTest("Test only applies when postgres is used as the database") @@ -299,7 +299,7 @@ def test_sqlite_search(self): """ Test sqlite searching for phrases. """ - store = self.hs.get_datastore() + store = self.hs.get_datastores().main if not isinstance(store.database_engine, Sqlite3Engine): raise SkipTest("Test only applies when sqlite is used as the database") From 68cae260b91344fbe206d7377b1dcc5bb1f70e35 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 26 May 2022 15:13:37 +0100 Subject: [PATCH 17/53] isort --- synapse/storage/databases/main/search.py | 1 - 1 file changed, 1 deletion(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 44eb16028305..dd4922550bb7 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -14,7 +14,6 @@ import logging import re - from typing import TYPE_CHECKING, Any, Collection, Iterable, List, Optional, Set, Tuple import attr From 5dbb2c04cb8e4e9258ab4a020b72ab4c4a9cec8c Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 26 May 2022 15:32:03 +0100 Subject: [PATCH 18/53] add migration --- .../65/12_update_sqlite_fts4_tokenizer.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py diff --git a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py new file mode 100644 index 000000000000..3215bc60a517 --- /dev/null +++ b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py @@ -0,0 +1,34 @@ +# Copyright 2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +from synapse.storage.engines import Sqlite3Engine + + +def update_event_search_to_use_porter_stemmer(cur, database_engine): + # Upgrade the event_search table to use the porter tokenizer + if isinstance(database_engine, Sqlite3Engine): + cur.execute("DROP TABLE event_search") + cur.execute("""CREATE VIRTUAL TABLE event_search + USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )""") + + # TODO: we just dropped the table .. do we need to do stuff to ensure its repopulated? + + +def run_create(cur, database_engine, *args, **kwargs): + update_event_search_to_use_porter_stemmer(cur, database_engine) + + +def run_upgrade(cur, database_engine, *args, **kwargs): + pass + From 64dd3572386828cb12faa0dafbb3d64070767b43 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 26 May 2022 15:32:17 +0100 Subject: [PATCH 19/53] comment --- tests/storage/test_room_search.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 8e96ade9e42b..155a9a3eb3a2 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -202,6 +202,7 @@ class MessageSearchTest(HomeserverTestCase): PHRASE = "the quick brown fox jumps over the lazy dog" COMMON_CASES = [ + # (query, whether PHRASE contains query) ("nope", False), ("brown", True), ("quick brown", True), From 20ab98ec9c9b364a673b7922566c33b55a927b69 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 26 May 2022 15:33:04 +0100 Subject: [PATCH 20/53] black --- .../main/delta/65/12_update_sqlite_fts4_tokenizer.py | 11 ++++++----- tests/storage/test_room_search.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py index 3215bc60a517..94a1491d053f 100644 --- a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py +++ b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py @@ -19,16 +19,17 @@ def update_event_search_to_use_porter_stemmer(cur, database_engine): # Upgrade the event_search table to use the porter tokenizer if isinstance(database_engine, Sqlite3Engine): cur.execute("DROP TABLE event_search") - cur.execute("""CREATE VIRTUAL TABLE event_search - USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )""") - + cur.execute( + """CREATE VIRTUAL TABLE event_search + USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )""" + ) + # TODO: we just dropped the table .. do we need to do stuff to ensure its repopulated? def run_create(cur, database_engine, *args, **kwargs): update_event_search_to_use_porter_stemmer(cur, database_engine) - + def run_upgrade(cur, database_engine, *args, **kwargs): pass - diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 155a9a3eb3a2..31b594dc38e5 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -311,4 +311,4 @@ def test_sqlite_search(self): ("fox NOT nope", True), # sqllite supports NOT ] - self._check_test_cases(store, cases) \ No newline at end of file + self._check_test_cases(store, cases) From e5aa9166cf4410c5a8480378eceb9bead316a3fd Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 26 May 2022 15:37:31 +0100 Subject: [PATCH 21/53] isort --- .../schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py index 94a1491d053f..4d58cfc6ddbe 100644 --- a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py +++ b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import json + from synapse.storage.engines import Sqlite3Engine From 7941d8a74606d5ce2854928f842b58697e833cd8 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 26 May 2022 15:55:10 +0100 Subject: [PATCH 22/53] flake8 --- synapse/storage/databases/main/search.py | 2 +- .../schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index dd4922550bb7..ced2335f075d 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -27,7 +27,7 @@ LoggingTransaction, ) from synapse.storage.databases.main.events_worker import EventRedactBehaviour -from synapse.storage.engines import BaseDatabaseEngine, PostgresEngine, Sqlite3Engine +from synapse.storage.engines import PostgresEngine, Sqlite3Engine from synapse.types import JsonDict if TYPE_CHECKING: diff --git a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py index 4d58cfc6ddbe..84548731cb05 100644 --- a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py +++ b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py @@ -11,8 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import json - from synapse.storage.engines import Sqlite3Engine From f7362f105ea56902026a488e8623258e49852429 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 26 May 2022 16:42:39 +0100 Subject: [PATCH 23/53] import List for python 3.7 --- tests/storage/test_room_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 31b594dc38e5..5fc549320dde 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Tuple +from typing import List, Tuple from unittest.case import SkipTest from unittest.mock import PropertyMock, patch @@ -224,7 +224,7 @@ def setUp(self): self.assertIn("event_id", response) def _check_test_cases( - self, store: DataStore, cases: list[Tuple[str, bool]] + self, store: DataStore, cases: List[Tuple[str, bool]] ) -> None: # Run all the test cases versus search_msgs for query, has_results in cases: From f1769f9a17320f6743438d6e6848f3a3c0c4d88b Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 26 May 2022 17:14:15 +0100 Subject: [PATCH 24/53] create a background job, and don't do anything if the tokenizer is already porter --- .../65/12_update_sqlite_fts4_tokenizer.py | 43 ++++++++++++++----- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py index 84548731cb05..1459576d53f4 100644 --- a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py +++ b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py @@ -14,20 +14,41 @@ from synapse.storage.engines import Sqlite3Engine -def update_event_search_to_use_porter_stemmer(cur, database_engine): - # Upgrade the event_search table to use the porter tokenizer - if isinstance(database_engine, Sqlite3Engine): - cur.execute("DROP TABLE event_search") - cur.execute( - """CREATE VIRTUAL TABLE event_search - USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )""" - ) +def run_create(cur, database_engine, *args, **kwargs): + # Upgrade the event_search table to use the porter tokenizer if it isn't already + if isinstance(database_engine, Sqlite3Engine): + cur.execute("SELECT sql FROM sqlite_master WHERE name='event_search'") + sql = cur.fetchone() + if sql is None: + raise Exception("The event_search table doesn't exist") + if "tokenize=porter" not in sql[0]: + cur.execute("DROP TABLE event_search") + cur.execute("""CREATE VIRTUAL TABLE event_search + USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )""") - # TODO: we just dropped the table .. do we need to do stuff to ensure its repopulated? + # Run a background job to re-populate the event_search table. + cur.execute("SELECT MIN(stream_ordering) FROM events") + rows = cur.fetchall() + min_stream_id = rows[0][0] + cur.execute("SELECT MAX(stream_ordering) FROM events") + rows = cur.fetchall() + max_stream_id = rows[0][0] -def run_create(cur, database_engine, *args, **kwargs): - update_event_search_to_use_porter_stemmer(cur, database_engine) + if min_stream_id is not None and max_stream_id is not None: + progress = { + "target_min_stream_id_inclusive": min_stream_id, + "max_stream_id_exclusive": max_stream_id + 1, + "rows_inserted": 0, + } + progress_json = json.dumps(progress) + + sql = ( + "INSERT into background_updates (update_name, progress_json)" + " VALUES (?, ?)" + ) + + cur.execute(sql, ("event_search", progress_json)) def run_upgrade(cur, database_engine, *args, **kwargs): From de07c83ec710c87b30b1c3980bd1be97a3e91392 Mon Sep 17 00:00:00 2001 From: James Salter Date: Thu, 26 May 2022 17:14:58 +0100 Subject: [PATCH 25/53] When creating a new db, create it with tokenize=porter in the first place --- synapse/storage/schema/main/delta/25/fts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/storage/schema/main/delta/25/fts.py b/synapse/storage/schema/main/delta/25/fts.py index 21f57825d4ed..7b7acea9e328 100644 --- a/synapse/storage/schema/main/delta/25/fts.py +++ b/synapse/storage/schema/main/delta/25/fts.py @@ -37,7 +37,7 @@ SQLITE_TABLE = ( "CREATE VIRTUAL TABLE event_search" - " USING fts4 ( event_id, room_id, sender, key, value )" + " USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )" ) From 1170e07bf682b47c93b8e971e15cbf5971b789cc Mon Sep 17 00:00:00 2001 From: James Salter Date: Fri, 27 May 2022 09:23:12 +0100 Subject: [PATCH 26/53] black --- .../main/delta/65/12_update_sqlite_fts4_tokenizer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py index 1459576d53f4..239b53cbad5a 100644 --- a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py +++ b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py @@ -16,15 +16,17 @@ def run_create(cur, database_engine, *args, **kwargs): # Upgrade the event_search table to use the porter tokenizer if it isn't already - if isinstance(database_engine, Sqlite3Engine): + if isinstance(database_engine, Sqlite3Engine): cur.execute("SELECT sql FROM sqlite_master WHERE name='event_search'") sql = cur.fetchone() if sql is None: raise Exception("The event_search table doesn't exist") if "tokenize=porter" not in sql[0]: cur.execute("DROP TABLE event_search") - cur.execute("""CREATE VIRTUAL TABLE event_search - USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )""") + cur.execute( + """CREATE VIRTUAL TABLE event_search + USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )""" + ) # Run a background job to re-populate the event_search table. cur.execute("SELECT MIN(stream_ordering) FROM events") From b987203bef0828428c3bd145a43281d895592b54 Mon Sep 17 00:00:00 2001 From: James Salter Date: Fri, 27 May 2022 09:28:23 +0100 Subject: [PATCH 27/53] Revert change to 25/fts.py This aims to optimise creation of a new site by avoiding creating and recreating the event_search table, but it has no effect without also updating full_schemas which I don't think we want to do --- synapse/storage/schema/main/delta/25/fts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/storage/schema/main/delta/25/fts.py b/synapse/storage/schema/main/delta/25/fts.py index 7b7acea9e328..21f57825d4ed 100644 --- a/synapse/storage/schema/main/delta/25/fts.py +++ b/synapse/storage/schema/main/delta/25/fts.py @@ -37,7 +37,7 @@ SQLITE_TABLE = ( "CREATE VIRTUAL TABLE event_search" - " USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )" + " USING fts4 ( event_id, room_id, sender, key, value )" ) From 6ef1ef89d550d10d748380d90e0fe352df8b6342 Mon Sep 17 00:00:00 2001 From: James Salter Date: Fri, 27 May 2022 09:35:39 +0100 Subject: [PATCH 28/53] fix json import --- .../schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py index 239b53cbad5a..bea34674ca43 100644 --- a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py +++ b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import json + from synapse.storage.engines import Sqlite3Engine From 63c4270d1bba01cb8fdde2735c3f44d62df30395 Mon Sep 17 00:00:00 2001 From: James Salter Date: Fri, 27 May 2022 09:36:53 +0100 Subject: [PATCH 29/53] slightly neater quote formatting --- .../schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py index bea34674ca43..3f5bafa1841d 100644 --- a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py +++ b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py @@ -26,8 +26,8 @@ def run_create(cur, database_engine, *args, **kwargs): if "tokenize=porter" not in sql[0]: cur.execute("DROP TABLE event_search") cur.execute( - """CREATE VIRTUAL TABLE event_search - USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )""" + "CREATE VIRTUAL TABLE event_search" + "USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )" ) # Run a background job to re-populate the event_search table. From 0ddf6b1c160ef67f377b0e7a77770c6f44ad27b1 Mon Sep 17 00:00:00 2001 From: James Salter Date: Mon, 30 May 2022 10:37:43 +0100 Subject: [PATCH 30/53] fix missing space --- .../schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py index 3f5bafa1841d..82b3fb310180 100644 --- a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py +++ b/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py @@ -27,7 +27,7 @@ def run_create(cur, database_engine, *args, **kwargs): cur.execute("DROP TABLE event_search") cur.execute( "CREATE VIRTUAL TABLE event_search" - "USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )" + " USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )" ) # Run a background job to re-populate the event_search table. From 9d77bc45c3f6f0127f77e3e45e4625c498dbbc04 Mon Sep 17 00:00:00 2001 From: James Salter Date: Mon, 30 May 2022 14:51:34 +0100 Subject: [PATCH 31/53] Add a parser to produce uniform results on all DBs --- synapse/storage/databases/main/search.py | 151 ++++++++++++++++++++++- tests/storage/test_room_search.py | 78 ++++++------ 2 files changed, 186 insertions(+), 43 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index ced2335f075d..0f768cf7436c 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -14,7 +14,19 @@ import logging import re -from typing import TYPE_CHECKING, Any, Collection, Iterable, List, Optional, Set, Tuple +from dataclasses import dataclass +from enum import Enum +from typing import ( + TYPE_CHECKING, + Any, + Collection, + Iterable, + List, + Optional, + Set, + Tuple, + Union, +) import attr @@ -779,11 +791,131 @@ def _to_postgres_options(options_dict: JsonDict) -> str: return "'%s'" % (",".join("%s=%s" % (k, v) for k, v in options_dict.items()),) -def _parse_query_for_sqlite(search_term: str) -> str: - """Takes a plain unicode string from the user and converts it into a form - that can be passed to sqllite's matchinfo(). +@dataclass +class Phrase: + phrase: List[str] + + +class Not: + pass + + +class Or: + pass + + +class And: + pass + + +class LParen: + pass + + +class RParen: + pass + + +Token = Union[str, Phrase, Not, Or, And, LParen, RParen] +TokenList = List[Token] + + +def _tokenize_query(query: str) -> TokenList: + tokens = [] + words = query.split(" ") + i = 0 + while i < len(words): + word = words[i] + i = i + 1 + if word[0] == '"': + phrase = [word[1:]] + while i < len(words): + word = words[i] + i = i + 1 + if word[-1] == '"': + phrase.append(word[:-1]) + break + else: + phrase.append(word) + tokens.append(Phrase(phrase)) + elif word[0] == "-": + tokens.append(Not()) + tokens.append(word[1:]) + elif word.lower() == "or": + tokens.append(Or()) + elif word.lower() == "and": + tokens.append(And()) + elif word == "(": + tokens.append(LParen()) + elif word == ")": + tokens.append(RParen()) + else: + tokens.append(word) + return tokens + + +def _tokens_to_tsquery(tokens: TokenList) -> str: """ - return search_term + Convert the list of tokens to a string suitable for passing to postgresql's to_tsquery + + Ref: https://www.postgresql.org/docs/current/textsearch-controls.html + """ + + tsquery = [] + for i, token in enumerate(tokens): + if isinstance(token, str): + tsquery.append(token) + elif isinstance(token, Phrase): + tsquery.append(" ( " + " <-> ".join(token.phrase) + " ) ") + elif isinstance(token, Not): + tsquery.append("!") + elif isinstance(token, Or): + tsquery.append(" | ") + elif isinstance(token, And): + tsquery.append(" & ") + elif isinstance(token, LParen): + tsquery.append(" ( ") + elif isinstance(token, RParen): + tsquery.append(" ) ") + else: + raise Exception("unknown token " + token) + if ( + i != len(tokens) - 1 + and isinstance(token, (str, Phrase)) + and not isinstance(tokens[i + 1], (Or, And, RParen)) + ): + tsquery.append(" & ") + return "".join(tsquery) + + +def _tokens_to_sqlite_match_query(tokens: TokenList) -> str: + """ + Convert the list of tokens to a string suitable for passing to sqlite's MATCH. + Assume sqlite was compiled with enhanced query syntax. + + Ref: https://www.sqlite.org/fts3.html#full_text_index_queries + """ + match_query = [] + for i, token in enumerate(tokens): + if isinstance(token, str): + match_query.append(token) + match_query.append(" ") + elif isinstance(token, Phrase): + match_query.append(' "' + " ".join(token.phrase) + '" ') + elif isinstance(token, Not): + match_query.append("NOT ") + elif isinstance(token, Or): + match_query.append("OR ") + elif isinstance(token, And): + match_query.append("AND ") + elif isinstance(token, LParen): + match_query.append("( ") + elif isinstance(token, RParen): + match_query.append(") ") + else: + raise Exception("unknown token " + str(token)) + + return "".join(match_query) def _parse_query_for_pgsql(search_term: str, engine: PostgresEngine) -> Tuple[str, str]: @@ -800,4 +932,11 @@ def _parse_query_for_pgsql(search_term: str, engine: PostgresEngine) -> Tuple[st if engine.supports_websearch_to_tsquery: return search_term, "websearch_to_tsquery" else: - return search_term, "plainto_tsquery" + return _tokens_to_tsquery(_tokenize_query(search_term)), "to_tsquery" + + +def _parse_query_for_sqlite(search_term: str) -> str: + """Takes a plain unicode string from the user and converts it into a form + that can be passed to sqllite's matchinfo(). + """ + return _tokens_to_sqlite_match_query(_tokenize_query(search_term)) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 5fc549320dde..f5ecf7603268 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -201,14 +201,24 @@ class MessageSearchTest(HomeserverTestCase): ] PHRASE = "the quick brown fox jumps over the lazy dog" + + # (query, whether PHRASE contains query) COMMON_CASES = [ - # (query, whether PHRASE contains query) ("nope", False), ("brown", True), ("quick brown", True), ("brown quick", True), - ("jump", True), # tests stemming + ("jump", True), ("brown nope", False), + ('"brown quick"', False), + ('"jumps over"', True), + ('"quick fox"', False), + ("nope OR doublenope", False), + ("furphy OR fox", True), + ("fox -nope", True), + ("fox -brown", False), + ("fox AND ( brown OR nope )", True), + ("fox AND ( nope OR doublenope )", False), ] def setUp(self): @@ -227,20 +237,40 @@ def _check_test_cases( self, store: DataStore, cases: List[Tuple[str, bool]] ) -> None: # Run all the test cases versus search_msgs - for query, has_results in cases: + for query, expect_to_contain in cases: result = self.get_success( store.search_msgs([self.room_id], query, ["content.body"]) ) - self.assertEquals(result["count"], 1 if has_results else 0, query) - self.assertEquals(len(result["results"]), 1 if has_results else 0, query) + self.assertEquals( + result["count"], + 1 if expect_to_contain else 0, + f"expected '{query}' to match '{self.PHRASE}'" + if expect_to_contain + else f"'{query}' unexpectedly matched '{self.PHRASE}'", + ) + self.assertEquals( + len(result["results"]), + 1 if expect_to_contain else 0, + "results array length should match count", + ) # Run them again versus search_rooms - for query, has_results in cases: + for query, expect_to_contain in cases: result = self.get_success( store.search_rooms([self.room_id], query, ["content.body"], 10) ) - self.assertEquals(result["count"], 1 if has_results else 0, query) - self.assertEquals(len(result["results"]), 1 if has_results else 0, query) + self.assertEquals( + result["count"], + 1 if expect_to_contain else 0, + f"expected '{query}' to match '{self.PHRASE}'" + if expect_to_contain + else f"'{query}' unexpectedly matched '{self.PHRASE}'", + ) + self.assertEquals( + len(result["results"]), + 1 if expect_to_contain else 0, + "results array length should match count", + ) def test_postgres_web_search_for_phrase(self): """ @@ -257,17 +287,7 @@ def test_postgres_web_search_for_phrase(self): "Test only applies when postgres supporting websearch_to_tsquery is used as the database" ) - cases = self.COMMON_CASES + [ - ('"brown quick"', False), - ('"jumps over"', True), - ('"quick fox"', False), - ("furphy OR fox", True), - ("nope OR doublenope", False), - ("-fox", False), - ("-nope", True), - ] - - self._check_test_cases(store, cases) + self._check_test_cases(store, self.COMMON_CASES) def test_postgres_non_web_search_for_phrase(self): """ @@ -279,22 +299,13 @@ def test_postgres_non_web_search_for_phrase(self): if not isinstance(store.database_engine, PostgresEngine): raise SkipTest("Test only applies when postgres is used as the database") - cases = self.COMMON_CASES + [ - ( - "furphy OR fox", - False, - ), # syntax not supported, OR will be ignored as it'll be between & - ('"jumps over"', True), # syntax not supported, quotes are ignored - ("-nope", False), # syntax not supported, - will be ignored - ] - # Patch supports_websearch_to_tsquery to always return False to ensure we're testing the plainto_tsquery path. with patch( "synapse.storage.engines.postgres.PostgresEngine.supports_websearch_to_tsquery", new_callable=PropertyMock, ) as supports_websearch_to_tsquery: supports_websearch_to_tsquery.return_value = False - self._check_test_cases(store, cases) + self._check_test_cases(store, self.COMMON_CASES) def test_sqlite_search(self): """ @@ -304,11 +315,4 @@ def test_sqlite_search(self): if not isinstance(store.database_engine, Sqlite3Engine): raise SkipTest("Test only applies when sqlite is used as the database") - cases = self.COMMON_CASES + [ - ("furphy OR fox", True), # sqllite supports OR - ('"jumps over"', True), - ('"quick fox"', False), # syntax supports quotes - ("fox NOT nope", True), # sqllite supports NOT - ] - - self._check_test_cases(store, cases) + self._check_test_cases(store, self.COMMON_CASES) From b8b2e280b7e0a69a16ad0af9b4eb80cb84356f70 Mon Sep 17 00:00:00 2001 From: James Salter Date: Mon, 30 May 2022 15:19:48 +0100 Subject: [PATCH 32/53] address flake8 --- synapse/storage/databases/main/search.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 0f768cf7436c..7b13a546e4b0 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -15,7 +15,6 @@ import logging import re from dataclasses import dataclass -from enum import Enum from typing import ( TYPE_CHECKING, Any, @@ -896,7 +895,7 @@ def _tokens_to_sqlite_match_query(tokens: TokenList) -> str: Ref: https://www.sqlite.org/fts3.html#full_text_index_queries """ match_query = [] - for i, token in enumerate(tokens): + for token in tokens: if isinstance(token, str): match_query.append(token) match_query.append(" ") From 8506b2123b158e90cd8c43254820d78c02dcb733 Mon Sep 17 00:00:00 2001 From: James Salter Date: Mon, 30 May 2022 15:25:53 +0100 Subject: [PATCH 33/53] give mypy a hint --- synapse/storage/databases/main/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 7b13a546e4b0..7b18aefe24a5 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -820,7 +820,7 @@ class RParen: def _tokenize_query(query: str) -> TokenList: - tokens = [] + tokens: TokenList = [] words = query.split(" ") i = 0 while i < len(words): From e279be58177c0148c3d26b39c2c024e53b895b1b Mon Sep 17 00:00:00 2001 From: James Salter Date: Mon, 30 May 2022 16:27:49 +0100 Subject: [PATCH 34/53] Fix phrase handling of "word" --- synapse/storage/databases/main/search.py | 19 +++++++++++-------- tests/storage/test_room_search.py | 3 +++ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 7b18aefe24a5..1d237e48c306 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -825,17 +825,19 @@ def _tokenize_query(query: str) -> TokenList: i = 0 while i < len(words): word = words[i] - i = i + 1 if word[0] == '"': - phrase = [word[1:]] - while i < len(words): - word = words[i] - i = i + 1 - if word[-1] == '"': - phrase.append(word[:-1]) + phrase_word = word[1:] + phrase = [] + while True: + if phrase_word[-1] == '"': + phrase.append(phrase_word[:-1]) break else: - phrase.append(word) + phrase.append(phrase_word) + i = i + 1 + if i == len(words): + break + phrase_word = words[i] tokens.append(Phrase(phrase)) elif word[0] == "-": tokens.append(Not()) @@ -850,6 +852,7 @@ def _tokenize_query(query: str) -> TokenList: tokens.append(RParen()) else: tokens.append(word) + i = i + 1 return tokens diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index f5ecf7603268..b689cf3df605 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -219,6 +219,9 @@ class MessageSearchTest(HomeserverTestCase): ("fox -brown", False), ("fox AND ( brown OR nope )", True), ("fox AND ( nope OR doublenope )", False), + ('"fox" quick', True), + ('"fox quick', False), + ('"quick brown', True), ] def setUp(self): From 6a7cf4975a6038639ec48e9d072e07f8a1e9df11 Mon Sep 17 00:00:00 2001 From: James Salter Date: Mon, 30 May 2022 17:20:34 +0100 Subject: [PATCH 35/53] Add comment --- synapse/storage/databases/main/search.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 1d237e48c306..bc8bd0f22e2d 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -820,6 +820,17 @@ class RParen: def _tokenize_query(query: str) -> TokenList: + """ + Convert the user-supplied `query` into a TokenList, which can be translated into + some DB-specific syntax. + + The following constructs are supported: + + - phrase queries using "double quotes" + - case-insensitive or and and operators, with parentheses to change operator precedence + - unary hyphen to denote NOT e.g. 'include -exclude' + + """ tokens: TokenList = [] words = query.split(" ") i = 0 @@ -881,6 +892,7 @@ def _tokens_to_tsquery(tokens: TokenList) -> str: tsquery.append(" ) ") else: raise Exception("unknown token " + token) + if ( i != len(tokens) - 1 and isinstance(token, (str, Phrase)) From a5f298bed8de50f4c05f3c4efbefccc5872f6a96 Mon Sep 17 00:00:00 2001 From: James Salter Date: Tue, 31 May 2022 11:39:09 +0100 Subject: [PATCH 36/53] document negation via - --- synapse/storage/databases/main/search.py | 1 + 1 file changed, 1 insertion(+) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index bc8bd0f22e2d..13aac9dda522 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -828,6 +828,7 @@ def _tokenize_query(query: str) -> TokenList: - phrase queries using "double quotes" - case-insensitive or and and operators, with parentheses to change operator precedence + - negation of a keyword via unary - - unary hyphen to denote NOT e.g. 'include -exclude' """ From c5444098045921aaf063d6d76c152ddd9445cac7 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 17 Oct 2022 14:04:41 -0400 Subject: [PATCH 37/53] Move the database schema to the updated directory. --- .../10_update_sqlite_fts4_tokenizer.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename synapse/storage/schema/main/delta/{65/12_update_sqlite_fts4_tokenizer.py => 73/10_update_sqlite_fts4_tokenizer.py} (97%) diff --git a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py b/synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py similarity index 97% rename from synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py rename to synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py index 82b3fb310180..cc206b713545 100644 --- a/synapse/storage/schema/main/delta/65/12_update_sqlite_fts4_tokenizer.py +++ b/synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py @@ -1,4 +1,4 @@ -# Copyright 2021 The Matrix.org Foundation C.I.C. +# Copyright 2022 The Matrix.org Foundation C.I.C. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From be742f1711f68cfe3d745dee7319d1d7c2b0f337 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 17 Oct 2022 14:28:01 -0400 Subject: [PATCH 38/53] Use a deque. --- synapse/storage/databases/main/search.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 62abf3210a7c..b4fca2f19459 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from collections import deque import logging import re from dataclasses import dataclass @@ -817,29 +818,31 @@ def _tokenize_query(query: str) -> TokenList: The following constructs are supported: - phrase queries using "double quotes" - - case-insensitive or and and operators, with parentheses to change operator precedence - - negation of a keyword via unary - + - case-insensitive `or` and `and` operators, with parentheses to change operator precedence + - negation of a keyword via unary `-` - unary hyphen to denote NOT e.g. 'include -exclude' """ tokens: TokenList = [] - words = query.split(" ") - i = 0 - while i < len(words): - word = words[i] + words = deque(query.split(" ")) + while words: + word = words.popleft() if word[0] == '"': phrase_word = word[1:] + # Continue to handle additional words until a word ends in a + # double quote. phrase = [] while True: + # XXX Will this break if the first word is just a double quote. if phrase_word[-1] == '"': phrase.append(phrase_word[:-1]) break else: phrase.append(phrase_word) - i = i + 1 - if i == len(words): + # Nothing left to process. + if not words: break - phrase_word = words[i] + phrase_word = words.popleft() tokens.append(Phrase(phrase)) elif word[0] == "-": tokens.append(Not()) @@ -854,7 +857,6 @@ def _tokenize_query(query: str) -> TokenList: tokens.append(RParen()) else: tokens.append(word) - i = i + 1 return tokens From 52700b05f6b1d521098e90893d018e16c4d8364e Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 17 Oct 2022 15:19:06 -0400 Subject: [PATCH 39/53] Add basic tests for _tokenize_query. --- synapse/storage/databases/main/search.py | 68 ++++++++++-------------- tests/storage/test_room_search.py | 42 +++++++++++++-- 2 files changed, 68 insertions(+), 42 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index b4fca2f19459..98f67ede6fe4 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from collections import deque +import enum import logging import re +from collections import deque from dataclasses import dataclass from typing import ( TYPE_CHECKING, @@ -29,6 +29,7 @@ ) import attr +from typing_extensions import Literal from synapse.api.errors import SynapseError from synapse.events import EventBase @@ -786,27 +787,15 @@ class Phrase: phrase: List[str] -class Not: - pass - - -class Or: - pass - - -class And: - pass - - -class LParen: - pass - - -class RParen: - pass +class SearchToken(enum.Enum): + Not = enum.auto() + Or = enum.auto() + And = enum.auto() + LParen = enum.auto() + RParen = enum.auto() -Token = Union[str, Phrase, Not, Or, And, LParen, RParen] +Token = Union[str, Phrase, SearchToken] TokenList = List[Token] @@ -845,16 +834,16 @@ def _tokenize_query(query: str) -> TokenList: phrase_word = words.popleft() tokens.append(Phrase(phrase)) elif word[0] == "-": - tokens.append(Not()) + tokens.append(SearchToken.Not) tokens.append(word[1:]) elif word.lower() == "or": - tokens.append(Or()) + tokens.append(SearchToken.Or) elif word.lower() == "and": - tokens.append(And()) + tokens.append(SearchToken.And) elif word == "(": - tokens.append(LParen()) + tokens.append(SearchToken.LParen) elif word == ")": - tokens.append(RParen()) + tokens.append(SearchToken.RParen) else: tokens.append(word) return tokens @@ -873,23 +862,24 @@ def _tokens_to_tsquery(tokens: TokenList) -> str: tsquery.append(token) elif isinstance(token, Phrase): tsquery.append(" ( " + " <-> ".join(token.phrase) + " ) ") - elif isinstance(token, Not): + elif token == SearchToken.Not: tsquery.append("!") - elif isinstance(token, Or): + elif token == SearchToken.Or: tsquery.append(" | ") - elif isinstance(token, And): + elif token == SearchToken.And: tsquery.append(" & ") - elif isinstance(token, LParen): + elif token == SearchToken.LParen: tsquery.append(" ( ") - elif isinstance(token, RParen): + elif token == SearchToken.RParen: tsquery.append(" ) ") else: - raise Exception("unknown token " + token) + raise ValueError(f"unknown token {token}") if ( i != len(tokens) - 1 and isinstance(token, (str, Phrase)) - and not isinstance(tokens[i + 1], (Or, And, RParen)) + and tokens[i + 1] + not in (SearchToken.Or, SearchToken.And, SearchToken.RParen) ): tsquery.append(" & ") return "".join(tsquery) @@ -909,18 +899,18 @@ def _tokens_to_sqlite_match_query(tokens: TokenList) -> str: match_query.append(" ") elif isinstance(token, Phrase): match_query.append(' "' + " ".join(token.phrase) + '" ') - elif isinstance(token, Not): + elif token == SearchToken.Not: match_query.append("NOT ") - elif isinstance(token, Or): + elif token == SearchToken.Or: match_query.append("OR ") - elif isinstance(token, And): + elif token == SearchToken.And: match_query.append("AND ") - elif isinstance(token, LParen): + elif token == SearchToken.LParen: match_query.append("( ") - elif isinstance(token, RParen): + elif token == SearchToken.RParen: match_query.append(") ") else: - raise Exception("unknown token " + str(token)) + raise ValueError(f"unknown token {token}") return "".join(match_query) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 18dadce74f48..9cf0b8295c07 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -16,13 +16,18 @@ from unittest.case import SkipTest from unittest.mock import PropertyMock, patch +from twisted.test.proto_helpers import MemoryReactor + import synapse.rest.admin from synapse.api.constants import EventTypes from synapse.api.errors import StoreError from synapse.rest.client import login, room +from synapse.server import HomeServer from synapse.storage.databases.main import DataStore +from synapse.storage.databases.main.search import Phrase, SearchToken, _tokenize_query from synapse.storage.engines import PostgresEngine from synapse.storage.engines.sqlite import Sqlite3Engine +from synapse.util import Clock from tests.unittest import HomeserverTestCase, skip_unless from tests.utils import USE_POSTGRES_FOR_TESTS @@ -226,9 +231,9 @@ class MessageSearchTest(HomeserverTestCase): ('"quick brown', True), ] - def setUp(self): - super().setUp() - + def prepare( + self, reactor: MemoryReactor, clock: Clock, homeserver: HomeServer + ) -> None: # Register a user and create a room, create some messages self.register_user("alice", "password") self.access_token = self.login("alice", "password") @@ -238,6 +243,37 @@ def setUp(self): response = self.helper.send(self.room_id, self.PHRASE, tok=self.access_token) self.assertIn("event_id", response) + def test_tokenize_query(self) -> None: + """Test the custom logic to token a user's query.""" + cases = ( + ("brown", ["brown"]), + ("quick brown", ["quick", "brown"]), + ('"brown quick"', [Phrase(["brown", "quick"])]), + ("furphy OR fox", ["furphy", SearchToken.Or, "fox"]), + ("fox -brown", ["fox", SearchToken.Not, "brown"]), + ( + "fox AND ( brown OR nope )", + [ + "fox", + SearchToken.And, + SearchToken.LParen, + "brown", + SearchToken.Or, + "nope", + SearchToken.RParen, + ], + ), + ('"fox" quick', [Phrase(["fox"]), "quick"]), + # No trailing double quoe. + ('"fox quick', [Phrase(["fox", "quick"])]), + ) + + for query, expected in cases: + tokenized = _tokenize_query(query) + self.assertEqual( + tokenized, expected, f"{tokenized} != {expected} for {query}" + ) + def _check_test_cases( self, store: DataStore, cases: List[Tuple[str, bool]] ) -> None: From 5d9d1831583370e2d444071e024de9ddf745896d Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 17 Oct 2022 16:10:47 -0400 Subject: [PATCH 40/53] Handle edge-cases. --- synapse/storage/databases/main/search.py | 18 ++++++++++++------ tests/storage/test_room_search.py | 17 +++++++++++++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 98f67ede6fe4..ae4547cf4f32 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -816,14 +816,14 @@ def _tokenize_query(query: str) -> TokenList: words = deque(query.split(" ")) while words: word = words.popleft() - if word[0] == '"': + if word.startswith('"'): phrase_word = word[1:] # Continue to handle additional words until a word ends in a # double quote. phrase = [] while True: - # XXX Will this break if the first word is just a double quote. - if phrase_word[-1] == '"': + # If the phrase word is blank, then a bare double quote was found. + if phrase_word and phrase_word[-1] == '"': phrase.append(phrase_word[:-1]) break else: @@ -833,16 +833,22 @@ def _tokenize_query(query: str) -> TokenList: break phrase_word = words.popleft() tokens.append(Phrase(phrase)) - elif word[0] == "-": + elif word.startswith("-"): tokens.append(SearchToken.Not) tokens.append(word[1:]) elif word.lower() == "or": tokens.append(SearchToken.Or) elif word.lower() == "and": tokens.append(SearchToken.And) - elif word == "(": + elif word.startswith("("): tokens.append(SearchToken.LParen) - elif word == ")": + word = word[1:] + if word: + tokens.append(word) + elif word.endswith(")"): + word = word[:-1] + if word: + tokens.append(word) tokens.append(SearchToken.RParen) else: tokens.append(word) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 9cf0b8295c07..ebffa3af5015 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -226,9 +226,13 @@ class MessageSearchTest(HomeserverTestCase): ("fox -brown", False), ("fox AND ( brown OR nope )", True), ("fox AND ( nope OR doublenope )", False), + ("fox AND (brown OR nope)", True), + ("fox AND (nope OR doublenope)", False), ('"fox" quick', True), ('"fox quick', False), ('"quick brown', True), + ('" quick"', True), + ('" nope"', False), ] def prepare( @@ -263,9 +267,22 @@ def test_tokenize_query(self) -> None: SearchToken.RParen, ], ), + ( + "fox AND (brown OR nope)", + [ + "fox", + SearchToken.And, + SearchToken.LParen, + "brown", + SearchToken.Or, + "nope", + SearchToken.RParen, + ], + ), ('"fox" quick', [Phrase(["fox"]), "quick"]), # No trailing double quoe. ('"fox quick', [Phrase(["fox", "quick"])]), + ('" quick"', [Phrase(["", "quick"])]), ) for query, expected in cases: From 219321b48c33608d12c1e364d966a6df12fabcef Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 18 Oct 2022 07:25:45 -0400 Subject: [PATCH 41/53] temp --- synapse/storage/databases/main/search.py | 9 ++++++--- tests/storage/test_room_search.py | 3 +++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index ae4547cf4f32..0bf9fa0306e1 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -823,7 +823,7 @@ def _tokenize_query(query: str) -> TokenList: phrase = [] while True: # If the phrase word is blank, then a bare double quote was found. - if phrase_word and phrase_word[-1] == '"': + if phrase_word.endswith('"'): phrase.append(phrase_word[:-1]) break else: @@ -834,8 +834,11 @@ def _tokenize_query(query: str) -> TokenList: phrase_word = words.popleft() tokens.append(Phrase(phrase)) elif word.startswith("-"): - tokens.append(SearchToken.Not) - tokens.append(word[1:]) + word = word[1:] + # If there's no word after the hyphen, ignore. + if word: + tokens.append(SearchToken.Not) + tokens.append(word) elif word.lower() == "or": tokens.append(SearchToken.Or) elif word.lower() == "and": diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index ebffa3af5015..b5dd83ce1fb4 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -224,6 +224,7 @@ class MessageSearchTest(HomeserverTestCase): ("furphy OR fox", True), ("fox -nope", True), ("fox -brown", False), + ("- fox", True), ("fox AND ( brown OR nope )", True), ("fox AND ( nope OR doublenope )", False), ("fox AND (brown OR nope)", True), @@ -233,6 +234,7 @@ class MessageSearchTest(HomeserverTestCase): ('"quick brown', True), ('" quick"', True), ('" nope"', False), + ("'nope", False), ] def prepare( @@ -255,6 +257,7 @@ def test_tokenize_query(self) -> None: ('"brown quick"', [Phrase(["brown", "quick"])]), ("furphy OR fox", ["furphy", SearchToken.Or, "fox"]), ("fox -brown", ["fox", SearchToken.Not, "brown"]), + ("- fox", ["fox"]), ( "fox AND ( brown OR nope )", [ From 916603556ecfcd5492a6956ec6ad1dbedfb8735e Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 18 Oct 2022 11:14:45 -0400 Subject: [PATCH 42/53] Fix phrase handling. --- synapse/storage/databases/main/search.py | 37 +++++++++++++++++++----- tests/storage/test_room_search.py | 12 +++++--- 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 0bf9fa0306e1..42333adfe21c 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -811,9 +811,15 @@ def _tokenize_query(query: str) -> TokenList: - negation of a keyword via unary `-` - unary hyphen to denote NOT e.g. 'include -exclude' + The following differs from websearch_to_tsquery: + + - Stop words are not removed. + - Unclosed phrases are treated differently. + """ tokens: TokenList = [] - words = deque(query.split(" ")) + # Split by runs on whitespace. + words = deque(query.split()) while words: word = words.popleft() if word.startswith('"'): @@ -822,17 +828,34 @@ def _tokenize_query(query: str) -> TokenList: # double quote. phrase = [] while True: - # If the phrase word is blank, then a bare double quote was found. + # If the word ends in a double quote, the phrase is over. Strip + # the double quote from the word. + found_end_quote = False if phrase_word.endswith('"'): - phrase.append(phrase_word[:-1]) - break - else: + phrase_word = phrase_word[:-1] + found_end_quote = True + + # If the phrase word is blank, then a bare double quote was found, + # do not add it to the phrase. + if phrase_word: phrase.append(phrase_word) - # Nothing left to process. + + # When a double quote is found, all done with the phrase. + if found_end_quote: + break + + # Nothing left to process, but no end double quote was found. if not words: + # Place each word in the phrase word back onto the deque. + # + # (Note that this word use the stripped double quote on the + # first word, this is correct behavior to match websearch_to_tsquery.) + words.extendleft(reversed(phrase)) break + phrase_word = words.popleft() - tokens.append(Phrase(phrase)) + if found_end_quote: + tokens.append(Phrase(phrase)) elif word.startswith("-"): word = word[1:] # If there's no word after the hyphen, ignore. diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index b5dd83ce1fb4..dceace0e6786 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -215,6 +215,7 @@ class MessageSearchTest(HomeserverTestCase): ("brown", True), ("quick brown", True), ("brown quick", True), + ("quick \t brown", True), ("jump", True), ("brown nope", False), ('"brown quick"', False), @@ -230,9 +231,10 @@ class MessageSearchTest(HomeserverTestCase): ("fox AND (brown OR nope)", True), ("fox AND (nope OR doublenope)", False), ('"fox" quick', True), - ('"fox quick', False), + ('"fox quick', True), + ('"-fox quick', False), ('"quick brown', True), - ('" quick"', True), + ('" quick "', True), ('" nope"', False), ("'nope", False), ] @@ -254,6 +256,7 @@ def test_tokenize_query(self) -> None: cases = ( ("brown", ["brown"]), ("quick brown", ["quick", "brown"]), + ("quick \t brown", ["quick", "brown"]), ('"brown quick"', [Phrase(["brown", "quick"])]), ("furphy OR fox", ["furphy", SearchToken.Or, "fox"]), ("fox -brown", ["fox", SearchToken.Not, "brown"]), @@ -284,8 +287,9 @@ def test_tokenize_query(self) -> None: ), ('"fox" quick', [Phrase(["fox"]), "quick"]), # No trailing double quoe. - ('"fox quick', [Phrase(["fox", "quick"])]), - ('" quick"', [Phrase(["", "quick"])]), + ('"fox quick', ["fox", "quick"]), + ('"-fox quick', [SearchToken.Not, "fox", "quick"]), + ('" quick "', [Phrase(["quick"])]), ) for query, expected in cases: From a7fd4f62f496d2a8e4e19e99688cefc162109104 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 18 Oct 2022 11:42:10 -0400 Subject: [PATCH 43/53] Handle not with a space after. --- synapse/storage/databases/main/search.py | 7 +++++-- tests/storage/test_room_search.py | 18 +++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 42333adfe21c..a1e26ae9ba40 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -857,10 +857,11 @@ def _tokenize_query(query: str) -> TokenList: if found_end_quote: tokens.append(Phrase(phrase)) elif word.startswith("-"): - word = word[1:] + tokens.append(SearchToken.Not) + # If there's no word after the hyphen, ignore. + word = word[1:] if word: - tokens.append(SearchToken.Not) tokens.append(word) elif word.lower() == "or": tokens.append(SearchToken.Or) @@ -932,6 +933,8 @@ def _tokens_to_sqlite_match_query(tokens: TokenList) -> str: elif isinstance(token, Phrase): match_query.append(' "' + " ".join(token.phrase) + '" ') elif token == SearchToken.Not: + # TODO: SQLite treats NOT as a *binary* operator. Hopefully a search + # term has already been added before this. match_query.append("NOT ") elif token == SearchToken.Or: match_query.append("OR ") diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index dceace0e6786..a2207db5a493 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -225,18 +225,22 @@ class MessageSearchTest(HomeserverTestCase): ("furphy OR fox", True), ("fox -nope", True), ("fox -brown", False), - ("- fox", True), ("fox AND ( brown OR nope )", True), ("fox AND ( nope OR doublenope )", False), ("fox AND (brown OR nope)", True), ("fox AND (nope OR doublenope)", False), ('"fox" quick', True), ('"fox quick', True), - ('"-fox quick', False), ('"quick brown', True), ('" quick "', True), ('" nope"', False), - ("'nope", False), + ] + + # Case that fail on sqlite. + POSTGRES_CASES = [ + ("- fox", False), + ("- nope", True), + ('"-fox quick', False), ] def prepare( @@ -252,7 +256,7 @@ def prepare( self.assertIn("event_id", response) def test_tokenize_query(self) -> None: - """Test the custom logic to token a user's query.""" + """Test the custom logic to tokenize a user's query.""" cases = ( ("brown", ["brown"]), ("quick brown", ["quick", "brown"]), @@ -260,7 +264,7 @@ def test_tokenize_query(self) -> None: ('"brown quick"', [Phrase(["brown", "quick"])]), ("furphy OR fox", ["furphy", SearchToken.Or, "fox"]), ("fox -brown", ["fox", SearchToken.Not, "brown"]), - ("- fox", ["fox"]), + ("- fox", [SearchToken.Not, "fox"]), ( "fox AND ( brown OR nope )", [ @@ -352,7 +356,7 @@ def test_postgres_web_search_for_phrase(self): "Test only applies when postgres supporting websearch_to_tsquery is used as the database" ) - self._check_test_cases(store, self.COMMON_CASES) + self._check_test_cases(store, self.COMMON_CASES + self.POSTGRES_CASES) def test_postgres_non_web_search_for_phrase(self): """ @@ -370,7 +374,7 @@ def test_postgres_non_web_search_for_phrase(self): new_callable=PropertyMock, ) as supports_websearch_to_tsquery: supports_websearch_to_tsquery.return_value = False - self._check_test_cases(store, self.COMMON_CASES) + self._check_test_cases(store, self.COMMON_CASES + self.POSTGRES_CASES) def test_sqlite_search(self): """ From 6751f5fa7a55a7e18cf68795c54b82df12867c8d Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 18 Oct 2022 11:54:02 -0400 Subject: [PATCH 44/53] Use the int version number to check if the feature is supported. --- synapse/storage/engines/postgres.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/synapse/storage/engines/postgres.py b/synapse/storage/engines/postgres.py index e2fbbe0af667..986b144ac25c 100644 --- a/synapse/storage/engines/postgres.py +++ b/synapse/storage/engines/postgres.py @@ -172,7 +172,8 @@ def supports_returning(self) -> bool: @property def supports_websearch_to_tsquery(self) -> bool: - return int(self.server_version.split(".")[0]) >= 11 + # Postgres 11 added support for websearch_to_tsquery. + return self._version >= 110000 def is_deadlock(self, error: Exception) -> bool: if isinstance(error, psycopg2.DatabaseError): From 6e6ebd9f8be60ed59bc3b78c4d04031b6dd4bf3a Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 18 Oct 2022 11:55:35 -0400 Subject: [PATCH 45/53] Lint --- synapse/storage/databases/main/search.py | 1 - 1 file changed, 1 deletion(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index a1e26ae9ba40..a95cf65ac156 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -29,7 +29,6 @@ ) import attr -from typing_extensions import Literal from synapse.api.errors import SynapseError from synapse.events import EventBase From 3272819f23901a0129b8fbbc5afc3244914e3be2 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 18 Oct 2022 12:16:21 -0400 Subject: [PATCH 46/53] Simplify schema delta. --- .../73/10_update_sqlite_fts4_tokenizer.py | 91 ++++++++++--------- 1 file changed, 47 insertions(+), 44 deletions(-) diff --git a/synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py b/synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py index cc206b713545..3de0a709eba7 100644 --- a/synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py +++ b/synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py @@ -13,47 +13,50 @@ # limitations under the License. import json -from synapse.storage.engines import Sqlite3Engine - - -def run_create(cur, database_engine, *args, **kwargs): - # Upgrade the event_search table to use the porter tokenizer if it isn't already - if isinstance(database_engine, Sqlite3Engine): - cur.execute("SELECT sql FROM sqlite_master WHERE name='event_search'") - sql = cur.fetchone() - if sql is None: - raise Exception("The event_search table doesn't exist") - if "tokenize=porter" not in sql[0]: - cur.execute("DROP TABLE event_search") - cur.execute( - "CREATE VIRTUAL TABLE event_search" - " USING fts4 (tokenize=porter, event_id, room_id, sender, key, value )" - ) - - # Run a background job to re-populate the event_search table. - cur.execute("SELECT MIN(stream_ordering) FROM events") - rows = cur.fetchall() - min_stream_id = rows[0][0] - - cur.execute("SELECT MAX(stream_ordering) FROM events") - rows = cur.fetchall() - max_stream_id = rows[0][0] - - if min_stream_id is not None and max_stream_id is not None: - progress = { - "target_min_stream_id_inclusive": min_stream_id, - "max_stream_id_exclusive": max_stream_id + 1, - "rows_inserted": 0, - } - progress_json = json.dumps(progress) - - sql = ( - "INSERT into background_updates (update_name, progress_json)" - " VALUES (?, ?)" - ) - - cur.execute(sql, ("event_search", progress_json)) - - -def run_upgrade(cur, database_engine, *args, **kwargs): - pass +from synapse.storage.engines import BaseDatabaseEngine, Sqlite3Engine +from synapse.storage.types import Cursor + + +def run_create(cur: Cursor, database_engine: BaseDatabaseEngine) -> None: + """ + Upgrade the event_search table to use the porter tokenizer if it isn't already + + Applies only for sqlite. + """ + if not isinstance(database_engine, Sqlite3Engine): + return + + # Rebuild the table event_search table with tokenize=porter configured. + cur.execute("DROP TABLE event_search") + cur.execute( + """ + CREATE VIRTUAL TABLE event_search + USING fts4 (tokenize=porter, event_id, room_id, sender, key, value ) + """ + ) + + # Re-run the background job to re-populate the event_search table. + cur.execute("SELECT MIN(stream_ordering) FROM events") + row = cur.fetchone() + min_stream_id = row[0] + + # If there are not any events, nothing to do. + if min_stream_id is None: + return + + cur.execute("SELECT MAX(stream_ordering) FROM events") + row = cur.fetchone() + max_stream_id = row[0] + + progress = { + "target_min_stream_id_inclusive": min_stream_id, + "max_stream_id_exclusive": max_stream_id + 1, + } + progress_json = json.dumps(progress) + + sql = """ + INSERT into background_updates (ordering, update_name, progress_json) + VALUES (?, ?, ?) + """ + + cur.execute(sql, (7310, "event_search", progress_json)) From 4993e1c4f1ae1882ed5f1de7c3d4288431a425ff Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 18 Oct 2022 13:39:31 -0400 Subject: [PATCH 47/53] Add docstring. --- tests/storage/test_room_search.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index a2207db5a493..258133430041 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -201,6 +201,16 @@ def test_sqlite_non_string_deletion_background_update(self): class MessageSearchTest(HomeserverTestCase): + """ + Check message search. + + A powerful way to check the behaviour is to run the following in Postgres >= 11: + + # SELECT websearch_to_tsquery('english', ); + + The result can be compared to the tokenized version for SQLite and Postgres < 11. + + """ servlets = [ synapse.rest.admin.register_servlets_for_client_rest_resource, login.register_servlets, From 9b7bb0814b88d683b93fa51085f8658af17259b0 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 24 Oct 2022 11:34:23 -0400 Subject: [PATCH 48/53] Remove support for parens. --- synapse/storage/databases/main/search.py | 25 ++------------------- tests/storage/test_room_search.py | 28 ------------------------ 2 files changed, 2 insertions(+), 51 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index a95cf65ac156..b8a93413c642 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -790,8 +790,6 @@ class SearchToken(enum.Enum): Not = enum.auto() Or = enum.auto() And = enum.auto() - LParen = enum.auto() - RParen = enum.auto() Token = Union[str, Phrase, SearchToken] @@ -806,7 +804,7 @@ def _tokenize_query(query: str) -> TokenList: The following constructs are supported: - phrase queries using "double quotes" - - case-insensitive `or` and `and` operators, with parentheses to change operator precedence + - case-insensitive `or` and `and` operators - negation of a keyword via unary `-` - unary hyphen to denote NOT e.g. 'include -exclude' @@ -866,16 +864,6 @@ def _tokenize_query(query: str) -> TokenList: tokens.append(SearchToken.Or) elif word.lower() == "and": tokens.append(SearchToken.And) - elif word.startswith("("): - tokens.append(SearchToken.LParen) - word = word[1:] - if word: - tokens.append(word) - elif word.endswith(")"): - word = word[:-1] - if word: - tokens.append(word) - tokens.append(SearchToken.RParen) else: tokens.append(word) return tokens @@ -900,18 +888,13 @@ def _tokens_to_tsquery(tokens: TokenList) -> str: tsquery.append(" | ") elif token == SearchToken.And: tsquery.append(" & ") - elif token == SearchToken.LParen: - tsquery.append(" ( ") - elif token == SearchToken.RParen: - tsquery.append(" ) ") else: raise ValueError(f"unknown token {token}") if ( i != len(tokens) - 1 and isinstance(token, (str, Phrase)) - and tokens[i + 1] - not in (SearchToken.Or, SearchToken.And, SearchToken.RParen) + and tokens[i + 1] not in (SearchToken.Or, SearchToken.And) ): tsquery.append(" & ") return "".join(tsquery) @@ -939,10 +922,6 @@ def _tokens_to_sqlite_match_query(tokens: TokenList) -> str: match_query.append("OR ") elif token == SearchToken.And: match_query.append("AND ") - elif token == SearchToken.LParen: - match_query.append("( ") - elif token == SearchToken.RParen: - match_query.append(") ") else: raise ValueError(f"unknown token {token}") diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 258133430041..46efbcf372ea 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -235,10 +235,6 @@ class MessageSearchTest(HomeserverTestCase): ("furphy OR fox", True), ("fox -nope", True), ("fox -brown", False), - ("fox AND ( brown OR nope )", True), - ("fox AND ( nope OR doublenope )", False), - ("fox AND (brown OR nope)", True), - ("fox AND (nope OR doublenope)", False), ('"fox" quick', True), ('"fox quick', True), ('"quick brown', True), @@ -275,30 +271,6 @@ def test_tokenize_query(self) -> None: ("furphy OR fox", ["furphy", SearchToken.Or, "fox"]), ("fox -brown", ["fox", SearchToken.Not, "brown"]), ("- fox", [SearchToken.Not, "fox"]), - ( - "fox AND ( brown OR nope )", - [ - "fox", - SearchToken.And, - SearchToken.LParen, - "brown", - SearchToken.Or, - "nope", - SearchToken.RParen, - ], - ), - ( - "fox AND (brown OR nope)", - [ - "fox", - SearchToken.And, - SearchToken.LParen, - "brown", - SearchToken.Or, - "nope", - SearchToken.RParen, - ], - ), ('"fox" quick', [Phrase(["fox"]), "quick"]), # No trailing double quoe. ('"fox quick', ["fox", "quick"]), From 13f530615c3d95e1bf9319c389fca4df833f499e Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 24 Oct 2022 14:33:51 -0400 Subject: [PATCH 49/53] Fix edge cases with double quotes. --- synapse/storage/databases/main/search.py | 148 ++++++++++++++--------- tests/storage/test_room_search.py | 24 ++-- 2 files changed, 106 insertions(+), 66 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index b8a93413c642..920c8ad6aa27 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -783,7 +783,12 @@ def _to_postgres_options(options_dict: JsonDict) -> str: @dataclass class Phrase: - phrase: List[str] + phrase: List[Union[str, "FollowedBy"]] + + +@dataclass +class FollowedBy: + proximity: int class SearchToken(enum.Enum): @@ -792,10 +797,16 @@ class SearchToken(enum.Enum): And = enum.auto() -Token = Union[str, Phrase, SearchToken] +Token = Union[str, Phrase, FollowedBy, SearchToken] TokenList = List[Token] +def _is_stop_word(word: str) -> bool: + # TODO Pull these out of the dictionary: + # https://github.com/postgres/postgres/blob/master/src/backend/snowball/stopwords/english.stop + return word in {"the", "a", "you", "me", "and", "but"} + + def _tokenize_query(query: str) -> TokenList: """ Convert the user-supplied `query` into a TokenList, which can be translated into @@ -815,57 +826,68 @@ def _tokenize_query(query: str) -> TokenList: """ tokens: TokenList = [] - # Split by runs on whitespace. - words = deque(query.split()) - while words: - word = words.popleft() - if word.startswith('"'): - phrase_word = word[1:] - # Continue to handle additional words until a word ends in a - # double quote. + + # Find phrases. + in_phrase = False + parts = deque(query.split('"')) + for i, part in enumerate(parts): + # The contents inside double quotes is treated as a phrase, a trailing + # double quote is not implied. + in_phrase = bool(i % 2) and i != (len(parts) - 1) + + # Pull out the individual words, discarding any non-word characters. + words = deque(re.findall(r"([\w\-]+)", part, re.UNICODE)) + + # Phrases have simplified handling of words. + if in_phrase: + proximity = 1 phrase = [] - while True: - # If the word ends in a double quote, the phrase is over. Strip - # the double quote from the word. - found_end_quote = False - if phrase_word.endswith('"'): - phrase_word = phrase_word[:-1] - found_end_quote = True - - # If the phrase word is blank, then a bare double quote was found, - # do not add it to the phrase. - if phrase_word: - phrase.append(phrase_word) - - # When a double quote is found, all done with the phrase. - if found_end_quote: - break - - # Nothing left to process, but no end double quote was found. - if not words: - # Place each word in the phrase word back onto the deque. - # - # (Note that this word use the stripped double quote on the - # first word, this is correct behavior to match websearch_to_tsquery.) - words.extendleft(reversed(phrase)) - break - - phrase_word = words.popleft() - if found_end_quote: - tokens.append(Phrase(phrase)) - elif word.startswith("-"): - tokens.append(SearchToken.Not) - - # If there's no word after the hyphen, ignore. - word = word[1:] - if word: + for word in words: + # Skip stop words, but track the number that are skipped. + if _is_stop_word(word): + proximity += 1 + continue + + # If words already exist in the phrase, add a followed-by operator. + if phrase: + phrase.append(FollowedBy(proximity)) + + phrase.append(word) + proximity = 1 + + # Consecutive words are implictly ANDed together. + if tokens and tokens[-1] not in (SearchToken.Not, SearchToken.Or): + tokens.append(SearchToken.And) + + # Add the phrase. + tokens.append(Phrase(phrase)) + continue + + # Otherwise, not in a phrase. + while words: + word = words.popleft() + + if word.startswith("-"): + tokens.append(SearchToken.Not) + + # If there's more word, put it back to be processed again. + word = word[1:] + if word: + words.appendleft(word) + elif word.lower() == "or": + tokens.append(SearchToken.Or) + else: + # Skip stop words. + if _is_stop_word(word): + continue + + # Consecutive words are implictly ANDed together. + if tokens and tokens[-1] not in (SearchToken.Not, SearchToken.Or): + tokens.append(SearchToken.And) + + # Add the search term. tokens.append(word) - elif word.lower() == "or": - tokens.append(SearchToken.Or) - elif word.lower() == "and": - tokens.append(SearchToken.And) - else: - tokens.append(word) + return tokens @@ -881,9 +903,16 @@ def _tokens_to_tsquery(tokens: TokenList) -> str: if isinstance(token, str): tsquery.append(token) elif isinstance(token, Phrase): - tsquery.append(" ( " + " <-> ".join(token.phrase) + " ) ") + tsquery.append( + " ( " + + "".join( + t if isinstance(t, str) else f" <{t.proximity}> " + for t in token.phrase + ) + + " ) " + ) elif token == SearchToken.Not: - tsquery.append("!") + tsquery.append(" !") elif token == SearchToken.Or: tsquery.append(" | ") elif token == SearchToken.And: @@ -911,17 +940,20 @@ def _tokens_to_sqlite_match_query(tokens: TokenList) -> str: for token in tokens: if isinstance(token, str): match_query.append(token) - match_query.append(" ") elif isinstance(token, Phrase): - match_query.append(' "' + " ".join(token.phrase) + '" ') + # SQLite doesn't support followed by operators. NEAR is close, but + # discards order. + match_query.append( + '"' + " ".join(t for t in token.phrase if isinstance(t, str)) + '"' + ) elif token == SearchToken.Not: # TODO: SQLite treats NOT as a *binary* operator. Hopefully a search # term has already been added before this. - match_query.append("NOT ") + match_query.append(" NOT ") elif token == SearchToken.Or: - match_query.append("OR ") + match_query.append(" OR ") elif token == SearchToken.And: - match_query.append("AND ") + match_query.append(" AND ") else: raise ValueError(f"unknown token {token}") diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 46efbcf372ea..b29dca75bae6 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -24,7 +24,7 @@ from synapse.rest.client import login, room from synapse.server import HomeServer from synapse.storage.databases.main import DataStore -from synapse.storage.databases.main.search import Phrase, SearchToken, _tokenize_query +from synapse.storage.databases.main.search import FollowedBy, Phrase, SearchToken, _tokenize_query from synapse.storage.engines import PostgresEngine from synapse.storage.engines.sqlite import Sqlite3Engine from synapse.util import Clock @@ -211,6 +211,7 @@ class MessageSearchTest(HomeserverTestCase): The result can be compared to the tokenized version for SQLite and Postgres < 11. """ + servlets = [ synapse.rest.admin.register_servlets_for_client_rest_resource, login.register_servlets, @@ -241,12 +242,17 @@ class MessageSearchTest(HomeserverTestCase): ('" quick "', True), ('" nope"', False), ] + # TODO Test non-ASCII cases. - # Case that fail on sqlite. + # Case that fail on SQLite. POSTGRES_CASES = [ + # SQLite treats NOT as a binary operator. ("- fox", False), ("- nope", True), ('"-fox quick', False), + # PostgreSQL skips stop words. + ('"the quick brown"', True), + ('"over lazy"', True), ] def prepare( @@ -265,17 +271,19 @@ def test_tokenize_query(self) -> None: """Test the custom logic to tokenize a user's query.""" cases = ( ("brown", ["brown"]), - ("quick brown", ["quick", "brown"]), - ("quick \t brown", ["quick", "brown"]), - ('"brown quick"', [Phrase(["brown", "quick"])]), + ("quick brown", ["quick", SearchToken.And, "brown"]), + ("quick \t brown", ["quick", SearchToken.And, "brown"]), + ('"brown quick"', [Phrase(["brown", FollowedBy(1), "quick"])]), ("furphy OR fox", ["furphy", SearchToken.Or, "fox"]), ("fox -brown", ["fox", SearchToken.Not, "brown"]), ("- fox", [SearchToken.Not, "fox"]), - ('"fox" quick', [Phrase(["fox"]), "quick"]), + ('"fox" quick', [Phrase(["fox"]), SearchToken.And, "quick"]), # No trailing double quoe. - ('"fox quick', ["fox", "quick"]), - ('"-fox quick', [SearchToken.Not, "fox", "quick"]), + ('"fox quick', ["fox", SearchToken.And, "quick"]), + ('"-fox quick', [SearchToken.Not, "fox", SearchToken.And, "quick"]), ('" quick "', [Phrase(["quick"])]), + ('q"uick brow"n', ["q", SearchToken.And, Phrase(["uick", FollowedBy(1), "brow"]), SearchToken.And, "n"]), + ('-"quick brown"', [SearchToken.Not, Phrase(["quick", FollowedBy(1), "brown"])]), ) for query, expected in cases: From 62f6f181b2fd5df1217eb803b4104ba2e07f7932 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 24 Oct 2022 14:46:34 -0400 Subject: [PATCH 50/53] Lint. --- synapse/storage/databases/main/search.py | 2 +- synapse/storage/engines/postgres.py | 1 + tests/storage/test_room_search.py | 23 ++++++++++++++++++++--- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 920c8ad6aa27..7949a769ea77 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -841,7 +841,7 @@ def _tokenize_query(query: str) -> TokenList: # Phrases have simplified handling of words. if in_phrase: proximity = 1 - phrase = [] + phrase: List[Union[str, FollowedBy]] = [] for word in words: # Skip stop words, but track the number that are skipped. if _is_stop_word(word): diff --git a/synapse/storage/engines/postgres.py b/synapse/storage/engines/postgres.py index 986b144ac25c..3f0082fc5819 100644 --- a/synapse/storage/engines/postgres.py +++ b/synapse/storage/engines/postgres.py @@ -173,6 +173,7 @@ def supports_returning(self) -> bool: @property def supports_websearch_to_tsquery(self) -> bool: # Postgres 11 added support for websearch_to_tsquery. + assert self._version is not None return self._version >= 110000 def is_deadlock(self, error: Exception) -> bool: diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index b29dca75bae6..1ce9c7e92edb 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -24,7 +24,12 @@ from synapse.rest.client import login, room from synapse.server import HomeServer from synapse.storage.databases.main import DataStore -from synapse.storage.databases.main.search import FollowedBy, Phrase, SearchToken, _tokenize_query +from synapse.storage.databases.main.search import ( + FollowedBy, + Phrase, + SearchToken, + _tokenize_query, +) from synapse.storage.engines import PostgresEngine from synapse.storage.engines.sqlite import Sqlite3Engine from synapse.util import Clock @@ -282,8 +287,20 @@ def test_tokenize_query(self) -> None: ('"fox quick', ["fox", SearchToken.And, "quick"]), ('"-fox quick', [SearchToken.Not, "fox", SearchToken.And, "quick"]), ('" quick "', [Phrase(["quick"])]), - ('q"uick brow"n', ["q", SearchToken.And, Phrase(["uick", FollowedBy(1), "brow"]), SearchToken.And, "n"]), - ('-"quick brown"', [SearchToken.Not, Phrase(["quick", FollowedBy(1), "brown"])]), + ( + 'q"uick brow"n', + [ + "q", + SearchToken.And, + Phrase(["uick", FollowedBy(1), "brow"]), + SearchToken.And, + "n", + ], + ), + ( + '-"quick brown"', + [SearchToken.Not, Phrase(["quick", FollowedBy(1), "brown"])], + ), ) for query, expected in cases: From b3755aea0675c4a9a3d45894798c92346f9a2377 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 25 Oct 2022 08:13:12 -0400 Subject: [PATCH 51/53] Remove backwards compat code for Postgres < 11 since (almost) EOL. --- synapse/storage/databases/main/search.py | 65 ++---------------------- synapse/storage/engines/postgres.py | 14 ++++- tests/storage/test_room_search.py | 51 ++++++++++++------- 3 files changed, 48 insertions(+), 82 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 7949a769ea77..016e432db86f 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -454,9 +454,8 @@ async def search_msgs( count_clauses = clauses if isinstance(self.database_engine, PostgresEngine): - search_query, tsquery_func = _parse_query_for_pgsql( - search_term, self.database_engine - ) + search_query = search_term + tsquery_func = self.database_engine.tsquery_func sql = ( f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) AS rank," " room_id, event_id" @@ -595,9 +594,8 @@ async def search_rooms( args.extend([origin_server_ts, origin_server_ts, stream]) if isinstance(self.database_engine, PostgresEngine): - search_query, tsquery_func = _parse_query_for_pgsql( - search_term, self.database_engine - ) + search_query = search_term + tsquery_func = self.database_engine.tsquery_func sql = ( f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) as rank," " origin_server_ts, stream_ordering, room_id, event_id" @@ -891,44 +889,6 @@ def _tokenize_query(query: str) -> TokenList: return tokens -def _tokens_to_tsquery(tokens: TokenList) -> str: - """ - Convert the list of tokens to a string suitable for passing to postgresql's to_tsquery - - Ref: https://www.postgresql.org/docs/current/textsearch-controls.html - """ - - tsquery = [] - for i, token in enumerate(tokens): - if isinstance(token, str): - tsquery.append(token) - elif isinstance(token, Phrase): - tsquery.append( - " ( " - + "".join( - t if isinstance(t, str) else f" <{t.proximity}> " - for t in token.phrase - ) - + " ) " - ) - elif token == SearchToken.Not: - tsquery.append(" !") - elif token == SearchToken.Or: - tsquery.append(" | ") - elif token == SearchToken.And: - tsquery.append(" & ") - else: - raise ValueError(f"unknown token {token}") - - if ( - i != len(tokens) - 1 - and isinstance(token, (str, Phrase)) - and tokens[i + 1] not in (SearchToken.Or, SearchToken.And) - ): - tsquery.append(" & ") - return "".join(tsquery) - - def _tokens_to_sqlite_match_query(tokens: TokenList) -> str: """ Convert the list of tokens to a string suitable for passing to sqlite's MATCH. @@ -960,23 +920,6 @@ def _tokens_to_sqlite_match_query(tokens: TokenList) -> str: return "".join(match_query) -def _parse_query_for_pgsql(search_term: str, engine: PostgresEngine) -> Tuple[str, str]: - """Selects a tsquery_* func to use and transforms the search_term into syntax appropriate for it. - - Args: - search_term: A user supplied search query. - engine: The database engine. - - Returns: - A tuple of (parsed search_term, tsquery func to use). - """ - - if engine.supports_websearch_to_tsquery: - return search_term, "websearch_to_tsquery" - else: - return _tokens_to_tsquery(_tokenize_query(search_term)), "to_tsquery" - - def _parse_query_for_sqlite(search_term: str) -> str: """Takes a plain unicode string from the user and converts it into a form that can be passed to sqllite's matchinfo(). diff --git a/synapse/storage/engines/postgres.py b/synapse/storage/engines/postgres.py index 3f0082fc5819..9bf74bbf5920 100644 --- a/synapse/storage/engines/postgres.py +++ b/synapse/storage/engines/postgres.py @@ -171,10 +171,20 @@ def supports_returning(self) -> bool: return True @property - def supports_websearch_to_tsquery(self) -> bool: + def tsquery_func(self) -> str: + """ + Selects a tsquery_* func to use. + + Ref: https://www.postgresql.org/docs/current/textsearch-controls.html + + Returns: + The function name. + """ # Postgres 11 added support for websearch_to_tsquery. assert self._version is not None - return self._version >= 110000 + if self._version >= 110000: + return "websearch_to_tsquery" + return "plainto_tsquery" def is_deadlock(self, error: Exception) -> bool: if isinstance(error, psycopg2.DatabaseError): diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 1ce9c7e92edb..e4ba2f3c0351 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple +from typing import List, Tuple, Union from unittest.case import SkipTest from unittest.mock import PropertyMock, patch @@ -225,8 +225,10 @@ class MessageSearchTest(HomeserverTestCase): PHRASE = "the quick brown fox jumps over the lazy dog" - # (query, whether PHRASE contains query) - COMMON_CASES = [ + # Each entry is a search query, followed by either a boolean of whether it is + # in the phrase OR a tuple of booleans: whether it matches using websearch + # and using plain search. + COMMON_CASES: List[Tuple[str, Union[bool, Tuple[bool, bool]]]] = [ ("nope", False), ("brown", True), ("quick brown", True), @@ -234,13 +236,13 @@ class MessageSearchTest(HomeserverTestCase): ("quick \t brown", True), ("jump", True), ("brown nope", False), - ('"brown quick"', False), + ('"brown quick"', (False, True)), ('"jumps over"', True), - ('"quick fox"', False), + ('"quick fox"', (False, True)), ("nope OR doublenope", False), - ("furphy OR fox", True), - ("fox -nope", True), - ("fox -brown", False), + ("furphy OR fox", (True, False)), + ("fox -nope", (True, False)), + ("fox -brown", (False, True)), ('"fox" quick', True), ('"fox quick', True), ('"quick brown', True), @@ -250,11 +252,11 @@ class MessageSearchTest(HomeserverTestCase): # TODO Test non-ASCII cases. # Case that fail on SQLite. - POSTGRES_CASES = [ + POSTGRES_CASES: List[Tuple[str, Union[bool, Tuple[bool, bool]]]] = [ # SQLite treats NOT as a binary operator. - ("- fox", False), - ("- nope", True), - ('"-fox quick', False), + ("- fox", (False, True)), + ("- nope", (True, False)), + ('"-fox quick', (False, True)), # PostgreSQL skips stop words. ('"the quick brown"', True), ('"over lazy"', True), @@ -310,10 +312,16 @@ def test_tokenize_query(self) -> None: ) def _check_test_cases( - self, store: DataStore, cases: List[Tuple[str, bool]] + self, + store: DataStore, + cases: List[Tuple[str, Union[bool, Tuple[bool, bool]]]], + index=0, ) -> None: # Run all the test cases versus search_msgs for query, expect_to_contain in cases: + if isinstance(expect_to_contain, tuple): + expect_to_contain = expect_to_contain[index] + result = self.get_success( store.search_msgs([self.room_id], query, ["content.body"]) ) @@ -332,6 +340,9 @@ def _check_test_cases( # Run them again versus search_rooms for query, expect_to_contain in cases: + if isinstance(expect_to_contain, tuple): + expect_to_contain = expect_to_contain[index] + result = self.get_success( store.search_rooms([self.room_id], query, ["content.body"], 10) ) @@ -358,12 +369,12 @@ def test_postgres_web_search_for_phrase(self): if not isinstance(store.database_engine, PostgresEngine): raise SkipTest("Test only applies when postgres is used as the database") - if not store.database_engine.supports_websearch_to_tsquery: + if store.database_engine.tsquery_func == "websearch_to_tsquery": raise SkipTest( "Test only applies when postgres supporting websearch_to_tsquery is used as the database" ) - self._check_test_cases(store, self.COMMON_CASES + self.POSTGRES_CASES) + self._check_test_cases(store, self.COMMON_CASES + self.POSTGRES_CASES, index=0) def test_postgres_non_web_search_for_phrase(self): """ @@ -377,11 +388,13 @@ def test_postgres_non_web_search_for_phrase(self): # Patch supports_websearch_to_tsquery to always return False to ensure we're testing the plainto_tsquery path. with patch( - "synapse.storage.engines.postgres.PostgresEngine.supports_websearch_to_tsquery", + "synapse.storage.engines.postgres.PostgresEngine.tsquery_func", new_callable=PropertyMock, ) as supports_websearch_to_tsquery: - supports_websearch_to_tsquery.return_value = False - self._check_test_cases(store, self.COMMON_CASES + self.POSTGRES_CASES) + supports_websearch_to_tsquery.return_value = "plainto_tsquery" + self._check_test_cases( + store, self.COMMON_CASES + self.POSTGRES_CASES, index=1 + ) def test_sqlite_search(self): """ @@ -391,4 +404,4 @@ def test_sqlite_search(self): if not isinstance(store.database_engine, Sqlite3Engine): raise SkipTest("Test only applies when sqlite is used as the database") - self._check_test_cases(store, self.COMMON_CASES) + self._check_test_cases(store, self.COMMON_CASES, index=0) From 5e5fc8deb9bb455304dc0e3824c5cb147dda4291 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 25 Oct 2022 08:23:55 -0400 Subject: [PATCH 52/53] Simplify phrase handling. --- synapse/storage/databases/main/search.py | 35 +++++------------------- tests/storage/test_room_search.py | 13 +++------ 2 files changed, 11 insertions(+), 37 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 016e432db86f..a89fc54c2cbe 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -781,12 +781,7 @@ def _to_postgres_options(options_dict: JsonDict) -> str: @dataclass class Phrase: - phrase: List[Union[str, "FollowedBy"]] - - -@dataclass -class FollowedBy: - proximity: int + phrase: List[str] class SearchToken(enum.Enum): @@ -795,7 +790,7 @@ class SearchToken(enum.Enum): And = enum.auto() -Token = Union[str, Phrase, FollowedBy, SearchToken] +Token = Union[str, Phrase, SearchToken] TokenList = List[Token] @@ -838,22 +833,10 @@ def _tokenize_query(query: str) -> TokenList: # Phrases have simplified handling of words. if in_phrase: - proximity = 1 - phrase: List[Union[str, FollowedBy]] = [] - for word in words: - # Skip stop words, but track the number that are skipped. - if _is_stop_word(word): - proximity += 1 - continue - - # If words already exist in the phrase, add a followed-by operator. - if phrase: - phrase.append(FollowedBy(proximity)) + # Skip stop words. + phrase = [word for word in words if not _is_stop_word(word)] - phrase.append(word) - proximity = 1 - - # Consecutive words are implictly ANDed together. + # Consecutive words are implicitly ANDed together. if tokens and tokens[-1] not in (SearchToken.Not, SearchToken.Or): tokens.append(SearchToken.And) @@ -879,7 +862,7 @@ def _tokenize_query(query: str) -> TokenList: if _is_stop_word(word): continue - # Consecutive words are implictly ANDed together. + # Consecutive words are implicitly ANDed together. if tokens and tokens[-1] not in (SearchToken.Not, SearchToken.Or): tokens.append(SearchToken.And) @@ -901,11 +884,7 @@ def _tokens_to_sqlite_match_query(tokens: TokenList) -> str: if isinstance(token, str): match_query.append(token) elif isinstance(token, Phrase): - # SQLite doesn't support followed by operators. NEAR is close, but - # discards order. - match_query.append( - '"' + " ".join(t for t in token.phrase if isinstance(t, str)) + '"' - ) + match_query.append('"' + " ".join(token.phrase) + '"') elif token == SearchToken.Not: # TODO: SQLite treats NOT as a *binary* operator. Hopefully a search # term has already been added before this. diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index e4ba2f3c0351..7e512355993d 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -24,12 +24,7 @@ from synapse.rest.client import login, room from synapse.server import HomeServer from synapse.storage.databases.main import DataStore -from synapse.storage.databases.main.search import ( - FollowedBy, - Phrase, - SearchToken, - _tokenize_query, -) +from synapse.storage.databases.main.search import Phrase, SearchToken, _tokenize_query from synapse.storage.engines import PostgresEngine from synapse.storage.engines.sqlite import Sqlite3Engine from synapse.util import Clock @@ -280,7 +275,7 @@ def test_tokenize_query(self) -> None: ("brown", ["brown"]), ("quick brown", ["quick", SearchToken.And, "brown"]), ("quick \t brown", ["quick", SearchToken.And, "brown"]), - ('"brown quick"', [Phrase(["brown", FollowedBy(1), "quick"])]), + ('"brown quick"', [Phrase(["brown", "quick"])]), ("furphy OR fox", ["furphy", SearchToken.Or, "fox"]), ("fox -brown", ["fox", SearchToken.Not, "brown"]), ("- fox", [SearchToken.Not, "fox"]), @@ -294,14 +289,14 @@ def test_tokenize_query(self) -> None: [ "q", SearchToken.And, - Phrase(["uick", FollowedBy(1), "brow"]), + Phrase(["uick", "brow"]), SearchToken.And, "n", ], ), ( '-"quick brown"', - [SearchToken.Not, Phrase(["quick", FollowedBy(1), "brown"])], + [SearchToken.Not, Phrase(["quick", "brown"])], ), ) From 223580a9f688378ea93453a7228bc8c7dc94539c Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 25 Oct 2022 13:19:41 -0400 Subject: [PATCH 53/53] Fix tests on postgres 10. --- tests/storage/test_room_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 7e512355993d..9ddc19900afc 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -364,7 +364,7 @@ def test_postgres_web_search_for_phrase(self): if not isinstance(store.database_engine, PostgresEngine): raise SkipTest("Test only applies when postgres is used as the database") - if store.database_engine.tsquery_func == "websearch_to_tsquery": + if store.database_engine.tsquery_func != "websearch_to_tsquery": raise SkipTest( "Test only applies when postgres supporting websearch_to_tsquery is used as the database" )