diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py
index 1c2565d1a..05ec7690c 100644
--- a/nominatim/api/search/icu_tokenizer.py
+++ b/nominatim/api/search/icu_tokenizer.py
@@ -97,6 +97,7 @@ def from_db_row(row: SaRow) -> 'ICUToken':
         """ Create a ICUToken from the row of the word table.
         """
         count = 1 if row.info is None else row.info.get('count', 1)
+        addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
 
         penalty = 0.0
         if row.type == 'w':
@@ -123,7 +124,8 @@ def from_db_row(row: SaRow) -> 'ICUToken':
 
         return ICUToken(penalty=penalty, token=row.word_id, count=count,
                         lookup_word=lookup_word, is_indexed=True,
-                        word_token=row.word_token, info=row.info)
+                        word_token=row.word_token, info=row.info,
+                        addr_count=addr_count)
 
 
@@ -257,7 +259,7 @@ def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
                 if len(part.token) <= 4 and part[0].isdigit()\
                    and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                     query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
-                                    ICUToken(0.5, 0, 1, part.token, True, part.token, None))
+                                    ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))
 
 
     def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
diff --git a/nominatim/api/search/legacy_tokenizer.py b/nominatim/api/search/legacy_tokenizer.py
index 86d42a543..bd17706e5 100644
--- a/nominatim/api/search/legacy_tokenizer.py
+++ b/nominatim/api/search/legacy_tokenizer.py
@@ -210,6 +210,7 @@ def make_token(self, row: SaRow) -> Tuple[LegacyToken, qmod.TokenType]:
 
         return LegacyToken(penalty=penalty, token=row.word_id,
                            count=row.search_name_count or 1,
+                           addr_count=1, # not supported
                            lookup_word=lookup_word,
                            word_token=row.word_token.strip(),
                            category=(rowclass, row.type) if rowclass is not None else None,
@@ -226,7 +227,7 @@ def add_extra_tokens(self, query: qmod.QueryStruct, parts: List[str]) -> None:
                 if len(part) <= 4 and part.isdigit()\
                    and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                     query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
-                                    LegacyToken(penalty=0.5, token=0, count=1,
+                                    LegacyToken(penalty=0.5, token=0, count=1, addr_count=1,
                                                 lookup_word=part, word_token=part,
                                                 category=None, country=None,
                                                 operator=None, is_indexed=True))
diff --git a/nominatim/api/search/query.py b/nominatim/api/search/query.py
index bd91c2ece..a0d7add1b 100644
--- a/nominatim/api/search/query.py
+++ b/nominatim/api/search/query.py
@@ -99,10 +99,10 @@ class Token(ABC):
     penalty: float
     token: int
     count: int
+    addr_count: int
     lookup_word: str
     is_indexed: bool
 
-    addr_count: int = 1
 
     @abstractmethod
     def get_category(self) -> Tuple[str, str]:
diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py
index 29bcc8e19..12c826eb2 100644
--- a/nominatim/tokenizer/base.py
+++ b/nominatim/tokenizer/base.py
@@ -201,7 +201,7 @@ def check_database(self, config: Configuration) -> Optional[str]:
 
 
     @abstractmethod
-    def update_statistics(self, config: Configuration) -> None:
+    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
         """ Recompute any tokenizer statistics necessary for efficient lookup.
             This function is meant to be called from time to time by the user
             to improve performance. However, the tokenizer must not depend on
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index f3a00839a..93808cc39 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -210,7 +210,7 @@ def migrate_database(self, config: Configuration) -> None:
             self._save_config(conn, config)
 
 
-    def update_statistics(self, _: Configuration) -> None:
+    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
         """ Recompute the frequency of full words.
         """
         with connect(self.dsn) as conn:
diff --git a/test/python/api/search/test_api_search_query.py b/test/python/api/search/test_api_search_query.py
index fe850ce90..bfdceb416 100644
--- a/test/python/api/search/test_api_search_query.py
+++ b/test/python/api/search/test_api_search_query.py
@@ -18,7 +18,8 @@ def get_category(self):
 
 
 def mktoken(tid: int):
-    return MyToken(3.0, tid, 1, 'foo', True)
+    return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
+                   lookup_word='foo', is_indexed=True)
 
 
 @pytest.mark.parametrize('ptype,ttype', [('NONE', 'WORD'),
diff --git a/test/python/api/search/test_db_search_builder.py b/test/python/api/search/test_db_search_builder.py
index d3aea9000..68f71298c 100644
--- a/test/python/api/search/test_db_search_builder.py
+++ b/test/python/api/search/test_db_search_builder.py
@@ -31,7 +31,9 @@ def make_query(*args):
         for end, ttype, tinfo in tlist:
             for tid, word in tinfo:
                 q.add_token(TokenRange(start, end), ttype,
-                            MyToken(0.5 if ttype == TokenType.PARTIAL else 0.0, tid, 1, word, True))
+                            MyToken(penalty=0.5 if ttype == TokenType.PARTIAL else 0.0,
+                                    token=tid, count=1, addr_count=1,
+                                    lookup_word=word, is_indexed=True))
 
     return q
 
@@ -395,14 +397,14 @@ def make_counted_searches(name_part, name_full, address_part, address_full,
     q.add_node(BreakType.END, PhraseType.NONE)
 
     q.add_token(TokenRange(0, 1), TokenType.PARTIAL,
-                MyToken(0.5, 1, name_part, 'name_part', True))
+                MyToken(0.5, 1, name_part, 1, 'name_part', True))
     q.add_token(TokenRange(0, 1), TokenType.WORD,
-                MyToken(0, 101, name_full, 'name_full', True))
+                MyToken(0, 101, name_full, 1, 'name_full', True))
     for i in range(num_address_parts):
         q.add_token(TokenRange(i + 1, i + 2), TokenType.PARTIAL,
-                    MyToken(0.5, 2, address_part, 'address_part', True))
+                    MyToken(0.5, 2, address_part, 1, 'address_part', True))
         q.add_token(TokenRange(i + 1, i + 2), TokenType.WORD,
-                    MyToken(0, 102, address_full, 'address_full', True))
+                    MyToken(0, 102, address_full, 1, 'address_full', True))
 
     builder = SearchBuilder(q, SearchDetails())
diff --git a/test/python/api/search/test_token_assignment.py b/test/python/api/search/test_token_assignment.py
index 54e8af14c..cde8495d0 100644
--- a/test/python/api/search/test_token_assignment.py
+++ b/test/python/api/search/test_token_assignment.py
@@ -19,7 +19,8 @@ def get_category(self):
 
 def make_query(*args):
     q = QueryStruct([Phrase(args[0][1], '')])
-    dummy = MyToken(3.0, 45, 1, 'foo', True)
+    dummy = MyToken(penalty=3.0, token=45, count=1, addr_count=1,
+                    lookup_word='foo', is_indexed=True)
 
     for btype, ptype, _ in args[1:]:
         q.add_node(btype, ptype)
diff --git a/test/python/cli/conftest.py b/test/python/cli/conftest.py
index 1bb393fb2..28aba597e 100644
--- a/test/python/cli/conftest.py
+++ b/test/python/cli/conftest.py
@@ -32,16 +32,16 @@ def __init__(self, *args, **kwargs):
         self.update_statistics_called = False
         self.update_word_tokens_called = False
 
-    def update_sql_functions(self, *args):
+    def update_sql_functions(self, *args, **kwargs):
         self.update_sql_functions_called = True
 
-    def finalize_import(self, *args):
+    def finalize_import(self, *args, **kwargs):
         self.finalize_import_called = True
 
-    def update_statistics(self, *args):
+    def update_statistics(self, *args, **kwargs):
         self.update_statistics_called = True
 
-    def update_word_tokens(self, *args):
+    def update_word_tokens(self, *args, **kwargs):
         self.update_word_tokens_called = True
 
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py
index aa1afe160..9f6eae62e 100644
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -227,16 +227,20 @@ def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_conf
 
 def test_update_statistics(word_table, table_factory, temp_db_cursor,
                            tokenizer_factory, test_config):
     word_table.add_full_word(1000, 'hello')
+    word_table.add_full_word(1001, 'bye')
     table_factory('search_name',
-                  'place_id BIGINT, name_vector INT[]',
-                  [(12, [1000])])
+                  'place_id BIGINT, name_vector INT[], nameaddress_vector INT[]',
+                  [(12, [1000], [1001])])
     tok = tokenizer_factory()
     tok.update_statistics(test_config)
 
     assert temp_db_cursor.scalar("""SELECT count(*) FROM word
-                                    WHERE type = 'W' and
-                                          (info->>'count')::int > 0""") > 0
+                                    WHERE type = 'W' and word_id = 1000 and
+                                          (info->>'count')::int > 0""") == 1
+    assert temp_db_cursor.scalar("""SELECT count(*) FROM word
+                                    WHERE type = 'W' and word_id = 1001 and
+                                          (info->>'addr_count')::int > 0""") == 1
 
 
 def test_normalize_postcode(analyzer):
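
Reviewer note (not part of the patch): `addr_count` is now a required field on the `Token` dataclass, declared before `lookup_word` and with its default removed, so every call site must supply it explicitly; that is why each positional `MyToken(...)` and `ICUToken(...)` construction above gains an extra `1` argument. The sketch below is modelled on the `MyToken` helper from the tests and shows both call forms; it assumes the `nominatim` package from this branch is importable, and the category value is a dummy.

    from typing import Tuple

    from nominatim.api.search.query import Token


    class MyToken(Token):
        """ Dummy token: Token is a dataclass ABC, so a subclass inherits
            __init__ taking the declared fields in order and only needs to
            implement get_category().
        """

        def get_category(self) -> Tuple[str, str]:
            return 'this', 'that'   # dummy value, as in the tests

    # Keyword form used by the updated test helpers:
    tok = MyToken(penalty=3.0, token=45, count=1, addr_count=1,
                  lookup_word='foo', is_indexed=True)

    # Positional form: addr_count is now the fourth argument, hence the
    # extra `1` threaded through the make_counted_searches() calls above.
    tok = MyToken(3.0, 45, 1, 1, 'foo', True)

The `update_statistics()` change is analogous on the tokenizer side: implementations gain a `threads` parameter defaulting to 1, which is why the CLI test doubles in `conftest.py` are loosened to accept `*args, **kwargs`.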