
add address counts to tokens
lonvia committed Mar 15, 2024
1 parent 23ad804 commit 016434f
Showing 10 changed files with 32 additions and 21 deletions.
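The change gives every search token a second frequency, addr_count, next to the existing count: the test changes further down populate count from search_name.name_vector and addr_count from nameaddress_vector, i.e. how often a term occurs as a name versus as part of an address. The snippet below is only a sketch of how two separate frequencies could be used; the class and heuristic are assumptions, not code from this commit:

```python
from dataclasses import dataclass

@dataclass
class TokenFreq:
    """Illustrative stand-in for a token's two frequencies."""
    count: int       # occurrences of the term in names (name_vector)
    addr_count: int  # occurrences of the term in address context (nameaddress_vector)

def estimated_hits(freq: TokenFreq, used_as_address: bool) -> int:
    """Hypothetical heuristic: pick the frequency matching the token's role
    in the query instead of always using the name frequency."""
    return freq.addr_count if used_as_address else freq.count

print(estimated_hits(TokenFreq(count=3, addr_count=25000), used_as_address=True))
```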
6 changes: 4 additions & 2 deletions nominatim/api/search/icu_tokenizer.py
@@ -97,6 +97,7 @@ def from_db_row(row: SaRow) -> 'ICUToken':
""" Create a ICUToken from the row of the word table.
"""
count = 1 if row.info is None else row.info.get('count', 1)
addr_count = 1 if row.info is None else row.info.get('addr_count', 1)

penalty = 0.0
if row.type == 'w':
@@ -123,7 +124,8 @@ def from_db_row(row: SaRow) -> 'ICUToken':

return ICUToken(penalty=penalty, token=row.word_id, count=count,
lookup_word=lookup_word, is_indexed=True,
word_token=row.word_token, info=row.info)
word_token=row.word_token, info=row.info,
addr_count=addr_count)



@@ -257,7 +259,7 @@ def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
if len(part.token) <= 4 and part[0].isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
ICUToken(0.5, 0, 1, part.token, True, part.token, None))
ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))


def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
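Both counts are read from the word table row's info JSON, falling back to 1 when the column is NULL or the key is absent. A small self-contained sketch of that defaulting behaviour (the helper function is illustrative, not part of the commit):

```python
from typing import Any, Dict, Optional, Tuple

def token_counts(info: Optional[Dict[str, Any]]) -> Tuple[int, int]:
    """Mirror the defaulting in ICUToken.from_db_row: a missing info column
    or a missing key both yield a frequency of 1."""
    if info is None:
        return 1, 1
    return info.get('count', 1), info.get('addr_count', 1)

assert token_counts(None) == (1, 1)
assert token_counts({'count': 42}) == (42, 1)
assert token_counts({'count': 42, 'addr_count': 7}) == (42, 7)
```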
3 changes: 2 additions & 1 deletion nominatim/api/search/legacy_tokenizer.py
Expand Up @@ -210,6 +210,7 @@ def make_token(self, row: SaRow) -> Tuple[LegacyToken, qmod.TokenType]:

return LegacyToken(penalty=penalty, token=row.word_id,
count=row.search_name_count or 1,
addr_count=1, # not supported
lookup_word=lookup_word,
word_token=row.word_token.strip(),
category=(rowclass, row.type) if rowclass is not None else None,
@@ -226,7 +227,7 @@ def add_extra_tokens(self, query: qmod.QueryStruct, parts: List[str]) -> None:
if len(part) <= 4 and part.isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
LegacyToken(penalty=0.5, token=0, count=1,
LegacyToken(penalty=0.5, token=0, count=1, addr_count=1,
lookup_word=part, word_token=part,
category=None, country=None,
operator=None, is_indexed=True))
2 changes: 1 addition & 1 deletion nominatim/api/search/query.py
@@ -99,10 +99,10 @@ class Token(ABC):
penalty: float
token: int
count: int
addr_count: int
lookup_word: str
is_indexed: bool

addr_count: int = 1

@abstractmethod
def get_category(self) -> Tuple[str, str]:
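addr_count moves from a defaulted trailing field to a required field between count and lookup_word. Assuming Token is a dataclass (the defaulted field in the old version suggests it is), this changes the generated constructor, so every positional construction now needs the address frequency in the fourth slot; that is why the test helpers below all gain an extra argument. A minimal illustration with a stand-in class:

```python
from dataclasses import dataclass

@dataclass
class DemoToken:
    # same field order as the new Token definition above
    penalty: float
    token: int
    count: int
    addr_count: int
    lookup_word: str
    is_indexed: bool

# Positional construction: the fourth argument is now the address frequency.
t = DemoToken(0.5, 1, 100, 5, 'foo', True)
assert t.addr_count == 5
```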
2 changes: 1 addition & 1 deletion nominatim/tokenizer/base.py
@@ -201,7 +201,7 @@ def check_database(self, config: Configuration) -> Optional[str]:


@abstractmethod
def update_statistics(self, config: Configuration) -> None:
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute any tokenizer statistics necessary for efficient lookup.
This function is meant to be called from time to time by the user
to improve performance. However, the tokenizer must not depend on
2 changes: 1 addition & 1 deletion nominatim/tokenizer/legacy_tokenizer.py
@@ -210,7 +210,7 @@ def migrate_database(self, config: Configuration) -> None:
self._save_config(conn, config)


def update_statistics(self, _: Configuration) -> None:
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute the frequency of full words.
"""
with connect(self.dsn) as conn:
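update_statistics now takes a threads argument in the abstract base class, and the legacy tokenizer adopts the same signature so it remains a valid implementation even if it runs single-threaded. A minimal conforming implementation might look like the following sketch (the class and its body are illustrative only):

```python
class DemoTokenizer:
    """Illustrative tokenizer honouring the new statistics signature."""

    def update_statistics(self, config, threads: int = 1) -> None:
        # A single-threaded implementation may simply ignore 'threads';
        # callers that request parallelism still get a working call.
        self._recompute_word_counts()

    def _recompute_word_counts(self) -> None:
        pass  # placeholder for the actual frequency update
```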
3 changes: 2 additions & 1 deletion test/python/api/search/test_api_search_query.py
@@ -18,7 +18,8 @@ def get_category(self):


def mktoken(tid: int):
return MyToken(3.0, tid, 1, 'foo', True)
return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
lookup_word='foo', is_indexed=True)


@pytest.mark.parametrize('ptype,ttype', [('NONE', 'WORD'),
12 changes: 7 additions & 5 deletions test/python/api/search/test_db_search_builder.py
@@ -31,7 +31,9 @@ def make_query(*args):
for end, ttype, tinfo in tlist:
for tid, word in tinfo:
q.add_token(TokenRange(start, end), ttype,
MyToken(0.5 if ttype == TokenType.PARTIAL else 0.0, tid, 1, word, True))
MyToken(penalty=0.5 if ttype == TokenType.PARTIAL else 0.0,
token=tid, count=1, addr_count=1,
lookup_word=word, is_indexed=True))


return q
@@ -395,14 +397,14 @@ def make_counted_searches(name_part, name_full, address_part, address_full,
q.add_node(BreakType.END, PhraseType.NONE)

q.add_token(TokenRange(0, 1), TokenType.PARTIAL,
MyToken(0.5, 1, name_part, 'name_part', True))
MyToken(0.5, 1, name_part, 1, 'name_part', True))
q.add_token(TokenRange(0, 1), TokenType.WORD,
MyToken(0, 101, name_full, 'name_full', True))
MyToken(0, 101, name_full, 1, 'name_full', True))
for i in range(num_address_parts):
q.add_token(TokenRange(i + 1, i + 2), TokenType.PARTIAL,
MyToken(0.5, 2, address_part, 'address_part', True))
MyToken(0.5, 2, address_part, 1, 'address_part', True))
q.add_token(TokenRange(i + 1, i + 2), TokenType.WORD,
MyToken(0, 102, address_full, 'address_full', True))
MyToken(0, 102, address_full, 1, 'address_full', True))

builder = SearchBuilder(q, SearchDetails())

3 changes: 2 additions & 1 deletion test/python/api/search/test_token_assignment.py
@@ -19,7 +19,8 @@ def get_category(self):

def make_query(*args):
q = QueryStruct([Phrase(args[0][1], '')])
dummy = MyToken(3.0, 45, 1, 'foo', True)
dummy = MyToken(penalty=3.0, token=45, count=1, addr_count=1,
lookup_word='foo', is_indexed=True)

for btype, ptype, _ in args[1:]:
q.add_node(btype, ptype)
8 changes: 4 additions & 4 deletions test/python/cli/conftest.py
@@ -32,16 +32,16 @@ def __init__(self, *args, **kwargs):
self.update_statistics_called = False
self.update_word_tokens_called = False

def update_sql_functions(self, *args):
def update_sql_functions(self, *args, **kwargs):
self.update_sql_functions_called = True

def finalize_import(self, *args):
def finalize_import(self, *args, **kwargs):
self.finalize_import_called = True

def update_statistics(self, *args):
def update_statistics(self, *args, **kwargs):
self.update_statistics_called = True

def update_word_tokens(self, *args):
def update_word_tokens(self, *args, **kwargs):
self.update_word_tokens_called = True


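The CLI test doubles widen their signatures from *args to *args, **kwargs so they keep absorbing calls that now pass keyword arguments such as threads=…. A short sketch of why the broader signature matters (the dummy class is illustrative):

```python
class DummyTokenizer:
    def __init__(self) -> None:
        self.update_statistics_called = False

    def update_statistics(self, *args, **kwargs):
        # Accepts positional and keyword arguments alike, so a call such as
        # update_statistics(config, threads=4) is still recorded.
        self.update_statistics_called = True

tok = DummyTokenizer()
tok.update_statistics(object(), threads=4)  # would raise TypeError with *args only
assert tok.update_statistics_called
```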
12 changes: 8 additions & 4 deletions test/python/tokenizer/test_icu.py
@@ -227,16 +227,20 @@ def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_conf
def test_update_statistics(word_table, table_factory, temp_db_cursor,
tokenizer_factory, test_config):
word_table.add_full_word(1000, 'hello')
word_table.add_full_word(1001, 'bye')
table_factory('search_name',
'place_id BIGINT, name_vector INT[]',
[(12, [1000])])
'place_id BIGINT, name_vector INT[], nameaddress_vector INT[]',
[(12, [1000], [1001])])
tok = tokenizer_factory()

tok.update_statistics(test_config)

assert temp_db_cursor.scalar("""SELECT count(*) FROM word
WHERE type = 'W' and
(info->>'count')::int > 0""") > 0
WHERE type = 'W' and word_id = 1000 and
(info->>'count')::int > 0""") == 1
assert temp_db_cursor.scalar("""SELECT count(*) FROM word
WHERE type = 'W' and word_id = 1001 and
(info->>'addr_count')::int > 0""") == 1


def test_normalize_postcode(analyzer):
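The reworked test seeds one word (1000) into a name_vector and another (1001) into a nameaddress_vector, then expects the statistics run to record a positive count for the former and a positive addr_count for the latter. The dictionaries below are invented examples of info JSON shapes that would satisfy those assertions, not output of the actual SQL:

```python
# Invented examples of word.info contents accepted by the assertions above.
info_1000 = {'count': 1}        # word 1000: occurs in a name_vector
info_1001 = {'addr_count': 1}   # word 1001: occurs in a nameaddress_vector

assert info_1000.get('count', 0) > 0
assert info_1001.get('addr_count', 0) > 0
```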
