Skip to content

Commit

Permalink
Fix for spatial-tree index - use S2 index tokens
Browse files Browse the repository at this point in the history
Spatial-tree index wasn't using the S2 geometry library properly, so
queries would often return the wrong results. Specifically, the
S2 cell covering of a region is somehow different (but related) to
the S2 cell index terms of a feature - we were taking the first
instead of the second.
  • Loading branch information
olsen232 committed Sep 29, 2021
1 parent 77ae868 commit 92e590a
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 68 deletions.
146 changes: 101 additions & 45 deletions kart/spatial_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,19 @@

L = logging.getLogger("kart.spatial_tree")


# These three values cannot be changed without rewriting the entire index:
S2_MIN_LEVEL = 4
S2_MAX_LEVEL = 16
S2_LEVEL_MOD = 1

S2_PARAMETERS = {
"min_level": S2_MIN_LEVEL,
"max_level": S2_MAX_LEVEL,
"level_mod": S2_LEVEL_MOD,
}

# But this value can be changed at any time.
S2_MAX_CELLS_INDEX = 8
S2_MAX_LEVEL = 15


def _revlist_command(repo):
Expand Down Expand Up @@ -147,11 +157,21 @@ def transform_from_wkt(self, wkt):


class SpatialTreeTables(TableSet):
"""Tables for associating a variable number of S2 cells with each feature."""
"""Tables for associating a variable number of S2 tokens with each feature."""

def __init__(self):
super().__init__()

# "parameters" records the parameters used to create this index. The same parameters must be used
# when updating the index and when querying the index - it won't work if not kept consistent.
self.parameters = Table(
"parameters",
self.sqlalchemy_metadata,
Column("min_level", Integer, nullable=False),
Column("max_level", Integer, nullable=False),
Column("level_mod", Integer, nullable=False),
)

# "commits" tracks all the commits we have indexed.
# A commit is only considered indexed if ALL of its ancestors are also indexed - this means
# relatively few commits need to be recorded as being indexed in this table.
Expand All @@ -163,14 +183,14 @@ def __init__(self):
Column("commit_id", BLOB, nullable=False, primary_key=True),
)

# "blobs" tracks all the features we have indexed (even if they do not overlap any s2 cells).
# "blobs" tracks all the features we have indexed (even if they have no associated S2 tokens).
self.blobs = Table(
"blobs",
self.sqlalchemy_metadata,
# From a user-perspective, "rowid" isjust an arbitrary integer primary key.
# In more detail: This column aliases to the sqlite rowid of the table.
# See https://www.sqlite.org/lang_createtable.html#rowid
# Using the rowid directly as a foreign key (see "blob_cells") means faster joins.
# Using the rowid directly as a foreign key (see "blob_tokens") means faster joins.
# The rowid can be used without creating a column that aliases to it, but you shouldn't -
# rowids might change if they are not aliased. See https://sqlite.org/lang_vacuum.html)
Column("rowid", Integer, nullable=False, primary_key=True),
Expand All @@ -180,9 +200,12 @@ def __init__(self):
sqlite_autoincrement=True,
)

# "blob_cells" associates 0 or more S2 cell tokens with each feature that we have indexed.
self.blob_cells = Table(
"blob_cells",
# "blob_tokens" associates 0 or more S2 cell tokens with each feature that we have indexed.
# Technically these are "index terms" not S2 cell tokens since they are slightly more complicated
# - there are two types of term, ANCESTOR and COVERING. COVERING terms start with a "$" prefix.
# For more details on how indexing works, consult the S2RegionTermIndexer documentation.
self.blob_tokens = Table(
"blob_tokens",
self.sqlalchemy_metadata,
# Reference to blobs.rowid.
Column(
Expand All @@ -192,10 +215,11 @@ def __init__(self):
nullable=False,
primary_key=True,
),
# S2 cell token eg "6d6dd90351b31cbf".
# S2 index term eg "6d6dd90351b31cbf" or "$6e1".
# To locate an S2 cell by token, see https://s2.sidewalklabs.com/regioncoverer/
# (and remove any $ prefix from these index terms so that they are simply S2 cell tokens).
Column(
"cell_token",
"s2_token",
Text,
nullable=False,
primary_key=True,
Expand All @@ -207,7 +231,9 @@ def __init__(self):


def drop_tables(sess):
sess.execute("DROP TABLE IF EXISTS blob_cells;")
sess.execute("DROP TABLE IF EXISTS parameters;")
sess.execute("DROP TABLE IF EXISTS commits;")
sess.execute("DROP TABLE IF EXISTS blob_tokens;")
sess.execute("DROP TABLE IF EXISTS blobs;")


Expand Down Expand Up @@ -250,7 +276,7 @@ def _minimal_description_of_commit_set(repo, commits):
return set(r.stdout.splitlines())


def _build_on_last_index(repo, start_commits, sess):
def _build_on_last_index(repo, start_commits, engine, clear_existing=False):
"""
Given a set of commits to index (including their ancestors) - the "start-commits" - returns the following:
- the minimal description of the "start-commits"
Expand All @@ -264,15 +290,18 @@ def _build_on_last_index(repo, start_commits, sess):
last time the index was brought up to date (or completed up to a certain point).
"""

commits_table_exists = sess.scalar(
"SELECT count(*) FROM sqlite_master WHERE name = 'commits';"
)
if commits_table_exists:
stop_commits = {
row[0].hex() for row in sess.execute("SELECT commit_id FROM commits;")
}
else:
stop_commits = set()
stop_commits = set()

if not clear_existing:
with sessionmaker(bind=engine)() as sess:
commits_table_exists = sess.scalar(
"SELECT count(*) FROM sqlite_master WHERE name = 'commits';"
)
if commits_table_exists:
stop_commits = {
row[0].hex()
for row in sess.execute("SELECT commit_id FROM commits;")
}

all_independent_commits = _minimal_description_of_commit_set(
repo, start_commits | stop_commits
Expand All @@ -281,6 +310,39 @@ def _build_on_last_index(repo, start_commits, sess):
return (start_commits, stop_commits, all_independent_commits)


def _create_indexer_from_parameters(sess):
"""
Return an S2RegionTermIndexer configured according to the parameters table.
Storing these parameters in a table means we can change them if needed without breaking Kart.
"""

import s2_py as s2

parameters_count = sess.scalar("SELECT COUNT(*) FROM parameters;")
assert parameters_count <= 1
if not parameters_count:
sess.execute(
"""
INSERT INTO parameters (min_level, max_level, level_mod)
VALUES (:min_level, :max_level, :level_mod);
""",
S2_PARAMETERS,
)

row = sess.execute(
"SELECT min_level, max_level, level_mod FROM parameters;"
).fetchone()
assert row is not None
min_level, max_level, level_mod = row

s2_indexer = s2.S2RegionTermIndexer()
s2_indexer.set_min_level(min_level)
s2_indexer.set_max_level(max_level)
s2_indexer.set_level_mod(level_mod)
s2_indexer.set_max_cells(S2_MAX_CELLS_INDEX)
return s2_indexer


def _format_commits(repo, commit_ids):
if not commit_ids:
return None
Expand All @@ -297,29 +359,22 @@ def update_spatial_tree(repo, commits, verbosity=1, clear_existing=False):
verbosity - how much non-essential information to output.
clear_existing - when true, deletes any pre-existing data before re-indexing.
"""
import s2_py as s2

crs_helper = CrsHelper(repo)

db_path = repo.gitdir_file(KartRepoFiles.S2_INDEX)
engine = sqlite_engine(db_path)

# Find out where we were up to last time, don't reindex anything that's already indexed.
with sessionmaker(bind=engine)() as sess:
start_commits, stop_commits, all_independent_commits = _build_on_last_index(
repo, commits, sess
)
start_commits, stop_commits, all_independent_commits = _build_on_last_index(
repo, commits, engine, clear_existing=clear_existing
)

if not start_commits:
click.echo("Nothing to do: index already up to date.")
return

feature_oid_iter = iter_feature_oids(repo, start_commits, stop_commits)

s2_coverer = s2.S2RegionCoverer()
s2_coverer.set_max_cells(S2_MAX_CELLS_INDEX)
s2_coverer.set_max_level(S2_MAX_LEVEL)

progress_every = None
if verbosity >= 1:
progress_every = max(100, 100_000 // (10 ** (verbosity - 1)))
Expand All @@ -329,6 +384,7 @@ def update_spatial_tree(repo, commits, verbosity=1, clear_existing=False):
drop_tables(sess)

SpatialTreeTables.create_all(sess)
s2_indexer = _create_indexer_from_parameters(sess)

# We index from the most recent commits, and stop at the already-indexed ancestors -
# but in terms of logging it makes more sense to say: indexing from <ANCESTORS> to <CURRENT>.
Expand Down Expand Up @@ -360,9 +416,9 @@ def update_spatial_tree(repo, commits, verbosity=1, clear_existing=False):
if geom is None:
continue
try:
s2_cell_tokens = find_s2_cells(s2_coverer, geom, transforms)
s2_tokens = get_s2_tokens(s2_indexer, geom, transforms)
except Exception as e:
L.warning(f"Couldn't locate S2 cells for {feature_oid}:\n{e}")
L.warning(f"Couldn't generate S2 index for {feature_oid}:\n{e}")
continue

params = (bytes.fromhex(feature_oid),)
Expand All @@ -375,12 +431,12 @@ def update_spatial_tree(repo, commits, verbosity=1, clear_existing=False):
dbcur.execute("INSERT INTO blobs (blob_id) VALUES (?);", params)
rowid = dbcur.lastrowid

if not s2_cell_tokens:
if not s2_tokens:
continue

params = [(rowid, token) for token in s2_cell_tokens]
params = [(rowid, token) for token in s2_tokens]
dbcur.executemany(
"INSERT OR IGNORE INTO blob_cells (blob_rowid, cell_token) VALUES (?, ?);",
"INSERT OR IGNORE INTO blob_tokens (blob_rowid, s2_token) VALUES (?, ?);",
params,
)

Expand Down Expand Up @@ -418,13 +474,13 @@ def _find_geometry_column(fields):
return result


def find_s2_cells(s2_coverer, geom, transforms):
def get_s2_tokens(s2_indexer, geom, transforms):
is_point = geom.geometry_type == GeometryType.POINT

return (
_point_f2_cells(s2_coverer, geom, transforms)
_point_s2_tokens(s2_indexer, geom, transforms)
if is_point
else _general_s2_cells(s2_coverer, geom, transforms)
else _general_s2_tokens(s2_indexer, geom, transforms)
)


Expand All @@ -436,7 +492,7 @@ def _apply_transform(original, transform, overwrite_original=False):
return result


def _point_f2_cells(s2_coverer, geom, transforms):
def _point_s2_tokens(s2_indexer, geom, transforms):
import s2_py as s2

g = gpkg_geom_to_ogr(geom)
Expand All @@ -447,13 +503,13 @@ def _point_f2_cells(s2_coverer, geom, transforms):
g_transformed = _apply_transform(g, transform, overwrite_original=one_transform)
p = g_transformed.GetPoint()[:2]
s2_ll = s2.S2LatLng.FromDegrees(p[1], p[0]).Normalized()
s2_token = s2.S2CellId(s2_ll.ToPoint()).ToToken()
result.add(s2_token)
query_terms = s2_indexer.GetIndexTerms(s2_ll.ToPoint(), "")
result.update(query_terms)

return result


def _general_s2_cells(s2_coverer, geom, transforms):
def _general_s2_tokens(s2_indexer, geom, transforms):
import s2_py as s2

e = geom_envelope(geom)
Expand All @@ -474,8 +530,8 @@ def _general_s2_cells(s2_coverer, geom, transforms):
s2_ll.append(s2.S2LatLng.FromDegrees(p_dest[1], p_dest[0]).Normalized())

s2_llrect = s2.S2LatLngRect.FromPointPair(*s2_ll)
for s2_cell_id in s2_coverer.GetCovering(s2_llrect):
result.add(s2_cell_id.ToToken())
query_terms = s2_indexer.GetIndexTerms(s2_llrect, "")
result.update(query_terms)

return result

Expand Down
1 change: 0 additions & 1 deletion kart/sqlalchemy/sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ def sqlite_engine(path):
def _on_connect(pysqlite_conn, connection_record):
pysqlite_conn.isolation_level = None
dbcur = pysqlite_conn.cursor()
dbcur.execute("PRAGMA journal_mode = 'wal';")
dbcur.execute("PRAGMA foreign_keys = ON;")

path = os.path.expanduser(path)
Expand Down
Loading

0 comments on commit 92e590a

Please sign in to comment.