
add address counts to tokens
lonvia committed Mar 15, 2024
1 parent 23ad804 commit 016434f
Showing 10 changed files with 32 additions and 21 deletions.
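The change gives every search token a second frequency, addr_count, next to the existing count: the test changes further down populate count from search_name.name_vector and addr_count from nameaddress_vector, i.e. how often a term occurs as a name versus as part of an address. The snippet below is only a sketch of how two separate frequencies could be used; the class and heuristic are assumptions, not code from this commit:

```python
from dataclasses import dataclass

@dataclass
class TokenFreq:
    """Illustrative stand-in for a token's two frequencies."""
    count: int       # occurrences of the term in names (name_vector)
    addr_count: int  # occurrences of the term in address context (nameaddress_vector)

def estimated_hits(freq: TokenFreq, used_as_address: bool) -> int:
    """Hypothetical heuristic: pick the frequency matching the token's role
    in the query instead of always using the name frequency."""
    return freq.addr_count if used_as_address else freq.count

print(estimated_hits(TokenFreq(count=3, addr_count=25000), used_as_address=True))
```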
6 changes: 4 additions & 2 deletions nominatim/api/search/icu_tokenizer.py
@@ -97,6 +97,7 @@ def from_db_row(row: SaRow) -> 'ICUToken':
""" Create a ICUToken from the row of the word table.
"""
count = 1 if row.info is None else row.info.get('count', 1)
addr_count = 1 if row.info is None else row.info.get('addr_count', 1)

penalty = 0.0
if row.type == 'w':
@@ -123,7 +124,8 @@ def from_db_row(row: SaRow) -> 'ICUToken':

return ICUToken(penalty=penalty, token=row.word_id, count=count,
lookup_word=lookup_word, is_indexed=True,
word_token=row.word_token, info=row.info)
word_token=row.word_token, info=row.info,
addr_count=addr_count)



@@ -257,7 +259,7 @@ def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
if len(part.token) <= 4 and part[0].isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
ICUToken(0.5, 0, 1, part.token, True, part.token, None))
ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))


def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
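Both counts are read from the word table row's info JSON, falling back to 1 when the column is NULL or the key is absent. A small self-contained sketch of that defaulting behaviour (the helper function is illustrative, not part of the commit):

```python
from typing import Any, Dict, Optional, Tuple

def token_counts(info: Optional[Dict[str, Any]]) -> Tuple[int, int]:
    """Mirror the defaulting in ICUToken.from_db_row: a missing info column
    or a missing key both yield a frequency of 1."""
    if info is None:
        return 1, 1
    return info.get('count', 1), info.get('addr_count', 1)

assert token_counts(None) == (1, 1)
assert token_counts({'count': 42}) == (42, 1)
assert token_counts({'count': 42, 'addr_count': 7}) == (42, 7)
```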
3 changes: 2 additions & 1 deletion nominatim/api/search/legacy_tokenizer.py
Expand Up @@ -210,6 +210,7 @@ def make_token(self, row: SaRow) -> Tuple[LegacyToken, qmod.TokenType]:

return LegacyToken(penalty=penalty, token=row.word_id,
count=row.search_name_count or 1,
addr_count=1, # not supported
lookup_word=lookup_word,
word_token=row.word_token.strip(),
category=(rowclass, row.type) if rowclass is not None else None,
@@ -226,7 +227,7 @@ def add_extra_tokens(self, query: qmod.QueryStruct, parts: List[str]) -> None:
if len(part) <= 4 and part.isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
LegacyToken(penalty=0.5, token=0, count=1,
LegacyToken(penalty=0.5, token=0, count=1, addr_count=1,
lookup_word=part, word_token=part,
category=None, country=None,
operator=None, is_indexed=True))
2 changes: 1 addition & 1 deletion nominatim/api/search/query.py
@@ -99,10 +99,10 @@ class Token(ABC):
penalty: float
token: int
count: int
addr_count: int
lookup_word: str
is_indexed: bool

addr_count: int = 1

@abstractmethod
def get_category(self) -> Tuple[str, str]:
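addr_count moves from a defaulted trailing field to a required field between count and lookup_word. Assuming Token is a dataclass (the defaulted field in the old version suggests it is), this changes the generated constructor, so every positional construction now needs the address frequency in the fourth slot; that is why the test helpers below all gain an extra argument. A minimal illustration with a stand-in class:

```python
from dataclasses import dataclass

@dataclass
class DemoToken:
    # same field order as the new Token definition above
    penalty: float
    token: int
    count: int
    addr_count: int
    lookup_word: str
    is_indexed: bool

# Positional construction: the fourth argument is now the address frequency.
t = DemoToken(0.5, 1, 100, 5, 'foo', True)
assert t.addr_count == 5
```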
2 changes: 1 addition & 1 deletion nominatim/tokenizer/base.py
@@ -201,7 +201,7 @@ def check_database(self, config: Configuration) -> Optional[str]:


@abstractmethod
def update_statistics(self, config: Configuration) -> None:
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute any tokenizer statistics necessary for efficient lookup.
This function is meant to be called from time to time by the user
to improve performance. However, the tokenizer must not depend on
2 changes: 1 addition & 1 deletion nominatim/tokenizer/legacy_tokenizer.py
@@ -210,7 +210,7 @@ def migrate_database(self, config: Configuration) -> None:
self._save_config(conn, config)


def update_statistics(self, _: Configuration) -> None:
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute the frequency of full words.
"""
with connect(self.dsn) as conn:
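update_statistics now takes a threads argument in the abstract base class, and the legacy tokenizer adopts the same signature so it remains a valid implementation even if it runs single-threaded. A minimal conforming implementation might look like the following sketch (the class and its body are illustrative only):

```python
class DemoTokenizer:
    """Illustrative tokenizer honouring the new statistics signature."""

    def update_statistics(self, config, threads: int = 1) -> None:
        # A single-threaded implementation may simply ignore 'threads';
        # callers that request parallelism still get a working call.
        self._recompute_word_counts()

    def _recompute_word_counts(self) -> None:
        pass  # placeholder for the actual frequency update
```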
3 changes: 2 additions & 1 deletion test/python/api/search/test_api_search_query.py
@@ -18,7 +18,8 @@ def get_category(self):


def mktoken(tid: int):
return MyToken(3.0, tid, 1, 'foo', True)
return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
lookup_word='foo', is_indexed=True)


@pytest.mark.parametrize('ptype,ttype', [('NONE', 'WORD'),
12 changes: 7 additions & 5 deletions test/python/api/search/test_db_search_builder.py
@@ -31,7 +31,9 @@ def make_query(*args):
for end, ttype, tinfo in tlist:
for tid, word in tinfo:
q.add_token(TokenRange(start, end), ttype,
MyToken(0.5 if ttype == TokenType.PARTIAL else 0.0, tid, 1, word, True))
MyToken(penalty=0.5 if ttype == TokenType.PARTIAL else 0.0,
token=tid, count=1, addr_count=1,
lookup_word=word, is_indexed=True))


return q
@@ -395,14 +397,14 @@ def make_counted_searches(name_part, name_full, address_part, address_full,
q.add_node(BreakType.END, PhraseType.NONE)

q.add_token(TokenRange(0, 1), TokenType.PARTIAL,
MyToken(0.5, 1, name_part, 'name_part', True))
MyToken(0.5, 1, name_part, 1, 'name_part', True))
q.add_token(TokenRange(0, 1), TokenType.WORD,
MyToken(0, 101, name_full, 'name_full', True))
MyToken(0, 101, name_full, 1, 'name_full', True))
for i in range(num_address_parts):
q.add_token(TokenRange(i + 1, i + 2), TokenType.PARTIAL,
MyToken(0.5, 2, address_part, 'address_part', True))
MyToken(0.5, 2, address_part, 1, 'address_part', True))
q.add_token(TokenRange(i + 1, i + 2), TokenType.WORD,
MyToken(0, 102, address_full, 'address_full', True))
MyToken(0, 102, address_full, 1, 'address_full', True))

builder = SearchBuilder(q, SearchDetails())

3 changes: 2 additions & 1 deletion test/python/api/search/test_token_assignment.py
@@ -19,7 +19,8 @@ def get_category(self):

def make_query(*args):
q = QueryStruct([Phrase(args[0][1], '')])
dummy = MyToken(3.0, 45, 1, 'foo', True)
dummy = MyToken(penalty=3.0, token=45, count=1, addr_count=1,
lookup_word='foo', is_indexed=True)

for btype, ptype, _ in args[1:]:
q.add_node(btype, ptype)
8 changes: 4 additions & 4 deletions test/python/cli/conftest.py
@@ -32,16 +32,16 @@ def __init__(self, *args, **kwargs):
self.update_statistics_called = False
self.update_word_tokens_called = False

def update_sql_functions(self, *args):
def update_sql_functions(self, *args, **kwargs):
self.update_sql_functions_called = True

def finalize_import(self, *args):
def finalize_import(self, *args, **kwargs):
self.finalize_import_called = True

def update_statistics(self, *args):
def update_statistics(self, *args, **kwargs):
self.update_statistics_called = True

def update_word_tokens(self, *args):
def update_word_tokens(self, *args, **kwargs):
self.update_word_tokens_called = True


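The CLI test doubles widen their signatures from *args to *args, **kwargs so they keep absorbing calls that now pass keyword arguments such as threads=…. A short sketch of why the broader signature matters (the dummy class is illustrative):

```python
class DummyTokenizer:
    def __init__(self) -> None:
        self.update_statistics_called = False

    def update_statistics(self, *args, **kwargs):
        # Accepts positional and keyword arguments alike, so a call such as
        # update_statistics(config, threads=4) is still recorded.
        self.update_statistics_called = True

tok = DummyTokenizer()
tok.update_statistics(object(), threads=4)  # would raise TypeError with *args only
assert tok.update_statistics_called
```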
12 changes: 8 additions & 4 deletions test/python/tokenizer/test_icu.py
@@ -227,16 +227,20 @@ def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_conf
def test_update_statistics(word_table, table_factory, temp_db_cursor,
tokenizer_factory, test_config):
word_table.add_full_word(1000, 'hello')
word_table.add_full_word(1001, 'bye')
table_factory('search_name',
'place_id BIGINT, name_vector INT[]',
[(12, [1000])])
'place_id BIGINT, name_vector INT[], nameaddress_vector INT[]',
[(12, [1000], [1001])])
tok = tokenizer_factory()

tok.update_statistics(test_config)

assert temp_db_cursor.scalar("""SELECT count(*) FROM word
WHERE type = 'W' and
(info->>'count')::int > 0""") > 0
WHERE type = 'W' and word_id = 1000 and
(info->>'count')::int > 0""") == 1
assert temp_db_cursor.scalar("""SELECT count(*) FROM word
WHERE type = 'W' and word_id = 1001 and
(info->>'addr_count')::int > 0""") == 1


def test_normalize_postcode(analyzer):
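The reworked test seeds one word (1000) into a name_vector and another (1001) into a nameaddress_vector, then expects the statistics run to record a positive count for the former and a positive addr_count for the latter. The dictionaries below are invented examples of info JSON shapes that would satisfy those assertions, not output of the actual SQL:

```python
# Invented examples of word.info contents accepted by the assertions above.
info_1000 = {'count': 1}        # word 1000: occurs in a name_vector
info_1001 = {'addr_count': 1}   # word 1001: occurs in a nameaddress_vector

assert info_1000.get('count', 0) > 0
assert info_1001.get('addr_count', 0) > 0
```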
