Skip to content

Commit

Permalink
try function for computing word frequencies
Browse files Browse the repository at this point in the history
  • Loading branch information
lonvia committed Mar 13, 2024
1 parent 184b666 commit 5c809ec
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 9 deletions.
1 change: 1 addition & 0 deletions nominatim/api/search/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ class Token(ABC):
lookup_word: str
is_indexed: bool

addr_count: int = 1

@abstractmethod
def get_category(self) -> Tuple[str, str]:
Expand Down
34 changes: 25 additions & 9 deletions nominatim/tokenizer/icu_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,21 +122,37 @@ def update_statistics(self, config: Configuration) -> None:
SELECT unnest(nameaddress_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute('CREATE INDEX ON addressword_frequencies(id)')
# Create the SQL helper that merges the recomputed frequencies for one
# word id into that word's existing info JSON.
# The function is declared STABLE rather than IMMUTABLE because its
# result depends on the contents of the two frequency tables, not only
# on its arguments.
cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
INOUT info JSONB)
AS $$
DECLARE rec RECORD;
BEGIN
-- Concatenating onto NULL yields NULL, so start from an empty object.
IF info is null THEN
info = '{}'::jsonb;
END IF;
-- Name frequency: sets or overwrites the 'count' key when a row exists.
FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
LOOP
info = info || jsonb_build_object('count', rec.count);
END LOOP;
-- Address frequency: sets or overwrites the 'addr_count' key when a row exists.
FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
LOOP
info = info || jsonb_build_object('addr_count', rec.count);
END LOOP;
-- An info object that is still empty is returned as NULL.
IF info = '{}'::jsonb THEN
info = null;
END IF;
END;
$$ LANGUAGE plpgsql STABLE;
""")
LOG.info('Update word table with recomputed frequencies')
cur.drop_table('tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
(CASE WHEN wf.count is null and af.count is null THEN info
ELSE info ||
CASE WHEN wf.count is null THEN '{}'::jsonb
ELSE jsonb_build_object('count', wf.count) END ||
CASE WHEN af.count is null THEN '{}'::jsonb
ELSE jsonb_build_object('addr_count', af.count) END
END) as info
word_freq_update(word_id, info) as info
FROM word
LEFT JOIN word_frequencies wf ON word.word_id = wf.id
LEFT JOIN addressword_frequencies af ON word.word_id = af.id""")
""")
cur.drop_table('word_frequencies')
cur.drop_table('addressword_frequencies')

sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn,
Expand Down

0 comments on commit 5c809ec

Please sign in to comment.