From 1c39cada9affa734b36a0717b40cbfc50d8bc0ba Mon Sep 17 00:00:00 2001 From: Andrew Olsen Date: Wed, 3 Mar 2021 14:41:04 +1300 Subject: [PATCH] Address SQL Server review comments --- sno/base_dataset.py | 4 + sno/checkout.py | 2 +- sno/clone.py | 2 +- sno/geometry.py | 8 + sno/init.py | 2 +- sno/repo.py | 4 +- sno/sqlalchemy.py | 32 ++- sno/working_copy/base.py | 123 +++++++----- sno/working_copy/db_server.py | 103 ++++++++++ sno/working_copy/gpkg.py | 172 +++++++++-------- sno/working_copy/postgis.py | 161 ++++------------ sno/working_copy/sqlserver.py | 267 ++++++++++---------------- sno/working_copy/sqlserver_adapter.py | 39 +--- sno/working_copy/table_defs.py | 4 +- tests/conftest.py | 6 +- tests/test_working_copy_sqlserver.py | 2 +- 16 files changed, 466 insertions(+), 465 deletions(-) create mode 100644 sno/working_copy/db_server.py diff --git a/sno/base_dataset.py b/sno/base_dataset.py index 56325600f..0a25f0592 100644 --- a/sno/base_dataset.py +++ b/sno/base_dataset.py @@ -9,6 +9,10 @@ class BaseDataset(ImportSource): """ Common interface for all datasets - mainly Dataset2, but there is also Dataset0 and Dataset1 used by `sno upgrade`. + + A Dataset instance is immutable since it is a view of a particular git tree. + To get a new version of a dataset, commit the desired changes, + then instantiate a new Dataset instance that references the new git tree. """ # Constants that subclasses should generally define. diff --git a/sno/checkout.py b/sno/checkout.py index cf3f80ace..587a49011 100644 --- a/sno/checkout.py +++ b/sno/checkout.py @@ -378,7 +378,7 @@ def create_workingcopy(ctx, discard_changes, wc_path): wc_path = WorkingCopy.default_path(repo.workdir_path) if wc_path != old_wc_path: - WorkingCopy.check_valid_creation_path(repo.workdir_path, wc_path) + WorkingCopy.check_valid_creation_path(wc_path, repo.workdir_path) # Finished sanity checks - start work: if old_wc and wc_path != old_wc_path: diff --git a/sno/clone.py b/sno/clone.py index c598746ad..63574566a 100644 --- a/sno/clone.py +++ b/sno/clone.py @@ -106,7 +106,7 @@ def clone( if repo_path.exists() and any(repo_path.iterdir()): raise InvalidOperation(f'"{repo_path}" isn\'t empty', param_hint="directory") - WorkingCopy.check_valid_creation_path(repo_path, wc_path) + WorkingCopy.check_valid_creation_path(wc_path, repo_path) if not repo_path.exists(): repo_path.mkdir(parents=True) diff --git a/sno/geometry.py b/sno/geometry.py index ca42f0d75..8873c8ddd 100644 --- a/sno/geometry.py +++ b/sno/geometry.py @@ -59,6 +59,11 @@ def with_crs_id(self, crs_id): @property def crs_id(self): + """ + Returns the CRS ID as it is embedded in the GPKG header - before the WKB. + Note that datasets V2 zeroes this field before committing, + so will return zero when called on Geometry where it has been zeroed. + """ wkb_offset, is_le, crs_id = parse_gpkg_geom(self) return crs_id @@ -296,6 +301,7 @@ def gpkg_geom_to_ogr(gpkg_geom, parse_crs=False): def wkt_to_gpkg_geom(wkt, **kwargs): + """Given a well-known-text string, returns a GPKG Geometry object.""" if wkt is None: return None @@ -304,6 +310,7 @@ def wkt_to_gpkg_geom(wkt, **kwargs): def wkb_to_gpkg_geom(wkb, **kwargs): + """Given a well-known-binary bytestring, returns a GPKG Geometry object.""" if wkb is None: return None @@ -312,6 +319,7 @@ def wkb_to_gpkg_geom(wkb, **kwargs): def hex_wkb_to_gpkg_geom(hex_wkb, **kwargs): + """Given a hex-encoded well-known-binary bytestring, returns a GPKG Geometry object.""" if hex_wkb is None: return None diff --git a/sno/init.py b/sno/init.py index 4f7dcc5de..a9d88b4bc 100644 --- a/sno/init.py +++ b/sno/init.py @@ -369,7 +369,7 @@ def init( if repo_path.exists() and any(repo_path.iterdir()): raise InvalidOperation(f'"{repo_path}" isn\'t empty', param_hint="directory") - WorkingCopy.check_valid_creation_path(repo_path, wc_path) + WorkingCopy.check_valid_creation_path(wc_path, repo_path) if not repo_path.exists(): repo_path.mkdir(parents=True) diff --git a/sno/repo.py b/sno/repo.py index 7fb303fa6..1e5420d1a 100644 --- a/sno/repo.py +++ b/sno/repo.py @@ -177,7 +177,7 @@ def init_repository( repo_root_path = repo_root_path.resolve() cls._ensure_exists_and_empty(repo_root_path) if not bare: - WorkingCopy.check_valid_creation_path(repo_root_path, wc_path) + WorkingCopy.check_valid_creation_path(wc_path, repo_root_path) extra_args = [] if initial_branch is not None: @@ -224,7 +224,7 @@ def clone_repository( repo_root_path = repo_root_path.resolve() cls._ensure_exists_and_empty(repo_root_path) if not bare: - WorkingCopy.check_valid_creation_path(repo_root_path, wc_path) + WorkingCopy.check_valid_creation_path(wc_path, repo_root_path) if bare: sno_repo = cls._create_with_git_command( diff --git a/sno/sqlalchemy.py b/sno/sqlalchemy.py index c3208f462..959bedadb 100644 --- a/sno/sqlalchemy.py +++ b/sno/sqlalchemy.py @@ -1,6 +1,6 @@ import re import socket -from urllib.parse import urlsplit, urlunsplit +from urllib.parse import urlsplit, urlunsplit, urlencode, parse_qs import sqlalchemy @@ -11,6 +11,7 @@ from sno import spatialite_path from sno.geometry import Geometry +from sno.exceptions import NotFound def gpkg_engine(path): @@ -110,7 +111,6 @@ def _on_connect(psycopg2_conn, connection_record): CANONICAL_SQL_SERVER_SCHEME = "mssql" INTERNAL_SQL_SERVER_SCHEME = "mssql+pyodbc" -SQL_SERVER_DRIVER_LIB = "ODBC+Driver+17+for+SQL+Server" def sqlserver_engine(msurl): @@ -122,7 +122,7 @@ def sqlserver_engine(msurl): url_netloc = re.sub(r"\blocalhost\b", _replace_with_localhost, url.netloc) url_query = _append_to_query( - url.query, {"driver": SQL_SERVER_DRIVER_LIB, "Application+Name": "sno"} + url.query, {"driver": get_sqlserver_driver(), "Application Name": "sno"} ) msurl = urlunsplit( @@ -133,18 +133,30 @@ def sqlserver_engine(msurl): return engine +def get_sqlserver_driver(): + import pyodbc + + drivers = [ + d for d in pyodbc.drivers() if re.search("SQL Server", d, flags=re.IGNORECASE) + ] + if not drivers: + drivers = pyodbc.drivers() + if not drivers: + raise NotFound("SQL Server driver was not found") + return sorted(drivers)[-1] # Latest driver + + def _replace_with_localhost(*args, **kwargs): return socket.gethostbyname("localhost") -def _append_query_to_url(uri, query_dict): +def _append_query_to_url(uri, new_query_dict): url = urlsplit(uri) - url_query = _append_to_query(url.query, query_dict) + url_query = _append_to_query(url.query, new_query_dict) return urlunsplit([url.scheme, url.netloc, url.path, url_query, ""]) -def _append_to_query(url_query, query_dict): - for key, value in query_dict.items(): - if key not in url_query: - url_query = "&".join(filter(None, [url_query, f"{key}={value}"])) - return url_query +def _append_to_query(existing_query, new_query_dict): + query_dict = parse_qs(existing_query) + # ignore new keys if they're already set in the querystring + return urlencode({**new_query_dict, **query_dict}) diff --git a/sno/working_copy/base.py b/sno/working_copy/base.py index 4dffdcac2..7332e3015 100644 --- a/sno/working_copy/base.py +++ b/sno/working_copy/base.py @@ -1,14 +1,13 @@ +from enum import Enum, auto import contextlib import functools import itertools import logging import time -from enum import Enum, auto - import click import pygit2 -import sqlalchemy +import sqlalchemy as sa from sno.base_dataset import BaseDataset from sno.diff_structs import RepoDiff, DatasetDiff, DeltaDiff, Delta @@ -48,7 +47,8 @@ def from_path(cls, path, allow_invalid=False): f"Unrecognised working copy type: {path}\n" "Try one of:\n" " PATH.gpkg\n" - " postgresql://[HOST]/DBNAME/SCHEMA\n" + " postgresql://[HOST]/DBNAME/DBSCHEMA\n" + " mssql://[HOST]/DBNAME/DBSCHEMA" ) @property @@ -103,12 +103,9 @@ class WorkingCopy: SNO_WORKINGCOPY_PATH = "sno.workingcopy.path" @property - @functools.lru_cache(maxsize=1) - def DB_SCHEMA(self): - """Escaped, dialect-specific name of the database-schema owned by this working copy (if any).""" - if self.db_schema is None: - raise RuntimeError("No schema to escape.") - return self.preparer.format_schema(self.db_schema) + def WORKING_COPY_TYPE_NAME(self): + """Human readable name of this type of working copy, eg "PostGIS".""" + raise NotImplementedError() @property @functools.lru_cache(maxsize=1) @@ -147,9 +144,7 @@ def table_identifier(self, dataset_or_table): else dataset_or_table ) sqlalchemy_table = ( - sqlalchemy.table(table, schema=self.db_schema) - if isinstance(table, str) - else table + sa.table(table, schema=self.db_schema) if isinstance(table, str) else table ) return self.preparer.format_table(sqlalchemy_table) @@ -157,7 +152,7 @@ def _table_def_for_column_schema(self, col, dataset): # This is just used for selects/inserts/updates - we don't need the full type information. # We only need type information if some automatic conversion needs to happen on read or write. # TODO: return the full type information so we can also use it for CREATE TABLE. - return sqlalchemy.column(col.name) + return sa.column(col.name) @functools.lru_cache() def _table_def_for_dataset(self, dataset, schema=None): @@ -166,7 +161,7 @@ def _table_def_for_dataset(self, dataset, schema=None): Specifies table and column names only - the minimum required such that an insert() or update() will work. """ schema = schema or dataset.schema - return sqlalchemy.table( + return sa.table( dataset.table_name, *[self._table_def_for_column_schema(c, dataset) for c in schema], schema=self.db_schema, @@ -261,7 +256,16 @@ def write_config(cls, repo, path=None, bare=False): repo_cfg[path_key] = str(path) @classmethod - def check_valid_creation_path(cls, workdir_path, wc_path): + def subclass_from_path(cls, wc_path): + wct = WorkingCopyType.from_path(wc_path) + if wct.class_ is cls: + raise RuntimeError( + f"No subclass found - don't call subclass_from_path on concrete implementation {cls}." + ) + return wct.class_ + + @classmethod + def check_valid_creation_path(cls, wc_path, workdir_path=None): """ Given a user-supplied string describing where to put the working copy, ensures it is a valid location, and nothing already exists there that prevents us from creating it. Raises InvalidOperation if it is not. @@ -269,20 +273,16 @@ def check_valid_creation_path(cls, workdir_path, wc_path): """ if not wc_path: wc_path = cls.default_path(workdir_path) - WorkingCopyType.from_path(wc_path).class_.check_valid_creation_path( - workdir_path, wc_path - ) + cls.subclass_from_path(wc_path).check_valid_creation_path(wc_path, workdir_path) @classmethod - def check_valid_path(cls, workdir_path, wc_path): + def check_valid_path(cls, wc_path, workdir_path=None): """ Given a user-supplied string describing where to put the working copy, ensures it is a valid location, and nothing already exists there that prevents us from creating it. Raises InvalidOperation if it is not. Doesn't check if we have permissions to create a working copy there. """ - WorkingCopyType.from_path(wc_path).class_.check_valid_path( - workdir_path, wc_path - ) + cls.subclass_from_path(wc_path).check_valid_path(wc_path, workdir_path) def check_valid_state(self): if self.is_created(): @@ -304,19 +304,39 @@ def default_path(cls, workdir_path): return f"{stem}.gpkg" @classmethod - def normalise_path(cls, repo, path): + def normalise_path(cls, repo, wc_path): """If the path is in a non-standard form, normalise it to the equivalent standard form.""" - return WorkingCopyType.from_path(path).class_.normalise_path(repo, path) + return cls.subclass_from_path(wc_path).normalise_path(repo, wc_path) @contextlib.contextmanager def session(self, bulk=0): """ - Context manager for DB sessions, yields a session object inside a transaction - Calling again yields the _same_ session, the transaction/etc only happen in the outer one. + Context manager for GeoPackage DB sessions, yields a connection object inside a transaction - @bulk controls bulk-loading operating mode - subject to change. See ./gpkg.py + Calling again yields the _same_ session, the transaction/etc only happen in the outer one. """ - raise NotImplementedError() + L = logging.getLogger(f"{self.__class__.__qualname__}.session") + + if hasattr(self, "_session"): + # Inner call - reuse existing session. + L.debug("session: existing...") + yield self._session + L.debug("session: existing/done") + return + + L.debug("session: new...") + self._session = self.sessionmaker() + try: + # TODO - use tidier syntax for opening transactions from sqlalchemy. + yield self._session + self._session.commit() + except Exception: + self._session.rollback() + raise + finally: + self._session.close() + del self._session + L.debug("session: new/done") def is_created(self): """ @@ -569,10 +589,10 @@ def _execute_dirty_rows_query( pk_column = table.columns[schema.pk_columns[0].name] tracking_col_type = sno_track.c.pk.type - base_query = sqlalchemy.select(columns=cols_to_select).select_from( + base_query = sa.select(columns=cols_to_select).select_from( sno_track.outerjoin( table, - sno_track.c.pk == sqlalchemy.cast(pk_column, tracking_col_type), + sno_track.c.pk == sa.cast(pk_column, tracking_col_type), ) ) @@ -581,7 +601,7 @@ def _execute_dirty_rows_query( else: pks = list(feature_filter) query = base_query.where( - sqlalchemy.and_( + sa.and_( sno_track.c.table_name == dataset.table_name, sno_track.c.pk.in_(pks), ) @@ -611,10 +631,10 @@ def reset_tracking_table(self, reset_filter=UNFILTERED): continue pks = list(dataset_filter.get("feature", [])) - t = sqlalchemy.text( + t = sa.text( f"DELETE FROM {self.SNO_TRACK} WHERE table_name=:table_name AND pk IN :pks;" ) - t = t.bindparams(sqlalchemy.bindparam("pks", expanding=True)) + t = t.bindparams(sa.bindparam("pks", expanding=True)) sess.execute(t, {"table_name": table_name, "pks": pks}) def can_find_renames(self, meta_diff): @@ -701,8 +721,7 @@ def write_full(self, commit, *datasets, **kwargs): self._write_meta(sess, dataset) if dataset.has_geometry: - # This should be called while the table is still empty. - self._create_spatial_index(sess, dataset) + self._create_spatial_index_pre(sess, dataset) L.info("Creating features...") sql = self._insert_into_dataset(dataset) @@ -739,6 +758,9 @@ def write_full(self, commit, *datasets, **kwargs): "Overall rate: %d features/s", (feat_progress / (t1 - t0 or 0.001)) ) + if dataset.has_geometry: + self._create_spatial_index_post(sess, dataset) + self._create_triggers(sess, dataset) self._update_last_write_time(sess, dataset, commit) @@ -754,18 +776,29 @@ def _write_meta(self, sess, dataset): """Write any non-feature data relating to dataset - title, description, CRS, etc.""" raise NotImplementedError() - def _create_spatial_index(self, sess, dataset): + def _create_spatial_index_pre(self, sess, dataset): """ Creates a spatial index for the table for the given dataset. - The spatial index is configured so that it is automatically updated when the table is modified. - It is not guaranteed that the spatial index will take into account features that are already present - in the table when this function is called - therefore, this should be called while the table is still empty. + This function comes in a pair - _pre is called before features are written, and _post is called afterwards. + Once both are called, the index must contain all the features currently in the table, and, be + configured such that any further writes cause the index to be updated automatically. """ - raise NotImplementedError() + + # Note that the simplest implementation is to add a trigger here so that any further writes update + # the index. Then _create_spatial_index_post needn't be implemented. + pass + + def _create_spatial_index_post(self, sess, dataset): + """Like _create_spatial_index_pre, but runs AFTER the bulk of features have been written.""" + + # Being able to create the index after the bulk of features have been written could be useful for two reasons: + # 1. It might be more efficient to write the features first, then index afterwards. + # 2. Certain working copies are not able to create an index without first knowing a rough bounding box. + pass def _drop_spatial_index(self, sess, dataset): - """Inverse of _create_spatial_index - deletes the spatial index.""" - raise NotImplementedError() + """Inverse of _create_spatial_index_* - deletes the spatial index.""" + pass def _update_last_write_time(self, sess, dataset, commit=None): """Hook for updating the last-modified timestamp stored for a particular dataset, if there is one.""" @@ -795,9 +828,7 @@ def _delete_features(self, sess, dataset, pk_list): pk_column = self.preparer.quote(dataset.primary_key) sql = f"""DELETE FROM {self.table_identifier(dataset)} WHERE {pk_column} IN :pks;""" - stmt = sqlalchemy.text(sql).bindparams( - sqlalchemy.bindparam("pks", expanding=True) - ) + stmt = sa.text(sql).bindparams(sa.bindparam("pks", expanding=True)) feat_count = 0 CHUNK_SIZE = 100 for pks in self._chunk(pk_list, CHUNK_SIZE): diff --git a/sno/working_copy/db_server.py b/sno/working_copy/db_server.py new file mode 100644 index 000000000..b5d73017d --- /dev/null +++ b/sno/working_copy/db_server.py @@ -0,0 +1,103 @@ +import functools +import re +from urllib.parse import urlsplit, urlunsplit + +import click + +from .base import WorkingCopy +from sno.exceptions import InvalidOperation + + +class DatabaseServer_WorkingCopy(WorkingCopy): + """Functionality common to working copies that connect to a database server.""" + + @property + def URI_SCHEME(self): + """The URI scheme to connect to this type of database, eg "postgresql".""" + raise NotImplementedError() + + @classmethod + def check_valid_creation_path(cls, wc_path, workdir_path=None): + cls.check_valid_path(wc_path, workdir_path) + + working_copy = cls(None, wc_path) + if working_copy.has_data(): + db_schema = working_copy.db_schema + container_text = f"schema '{db_schema}'" if db_schema else "working copy" + raise InvalidOperation( + f"Error creating {cls.WORKING_COPY_TYPE_NAME} working copy at {wc_path} - " + f"non-empty {container_text} already exists" + ) + + @classmethod + def check_valid_path(cls, wc_path, workdir_path=None): + cls.check_valid_db_uri(wc_path, workdir_path) + + @classmethod + def normalise_path(cls, repo, wc_path): + return wc_path + + @classmethod + def check_valid_db_uri(cls, db_uri, workdir_path=None): + """ + For working copies that connect to a database - checks the given URI is in the required form: + >>> URI_SCHEME::[HOST]/DBNAME/DBSCHEMA + """ + url = urlsplit(db_uri) + + if url.scheme != cls.URI_SCHEME: + raise click.UsageError( + f"Invalid {cls.WORKING_COPY_TYPE_NAME} URI - " + f"Expecting URI in form: {cls.URI_SCHEME}://[HOST]/DBNAME/DBSCHEMA" + ) + + url_path = url.path + path_parts = url_path[1:].split("/", 3) if url_path else [] + + suggestion_message = "" + if len(path_parts) == 1 and workdir_path is not None: + suggested_path = f"/{path_parts[0]}/{cls.default_db_schema(workdir_path)}" + suggested_uri = urlunsplit( + [url.scheme, url.netloc, suggested_path, url.query, ""] + ) + suggestion_message = f"\nFor example: {suggested_uri}" + + if len(path_parts) != 2: + raise click.UsageError( + f"Invalid {cls.WORKING_COPY_TYPE_NAME} URI - URI requires both database name and database schema:\n" + f"Expecting URI in form: {cls.URI_SCHEME}://[HOST]/DBNAME/DBSCHEMA" + + suggestion_message + ) + + @classmethod + def _separate_db_schema(cls, db_uri): + """ + Removes the DBSCHEMA part off the end of a uri in the form URI_SCHEME::[HOST]/DBNAME/DBSCHEMA - + and returns the URI and the DBSCHEMA separately. + Useful since generally, URI_SCHEME::[HOST]/DBNAME is what is needed to connect to the database, + and then DBSCHEMA must be specified in each query. + """ + url = urlsplit(db_uri) + url_path = url.path + path_parts = url_path[1:].split("/", 3) if url_path else [] + assert len(path_parts) == 2 + url_path = "/" + path_parts[0] + db_schema = path_parts[1] + return urlunsplit([url.scheme, url.netloc, url_path, url.query, ""]), db_schema + + @classmethod + def default_db_schema(cls, workdir_path): + """Returns a suitable default database schema - named after the folder this Sno repo is in.""" + stem = workdir_path.stem + schema = re.sub("[^a-z0-9]+", "_", stem.lower()) + "_sno" + if schema[0].isdigit(): + schema = "_" + schema + return schema + + @property + @functools.lru_cache(maxsize=1) + def DB_SCHEMA(self): + """Escaped, dialect-specific name of the database-schema owned by this working copy (if any).""" + if self.db_schema is None: + raise RuntimeError("No schema to escape.") + return self.preparer.format_schema(self.db_schema) diff --git a/sno/working_copy/gpkg.py b/sno/working_copy/gpkg.py index 9bcbb00dd..2a0846cd8 100644 --- a/sno/working_copy/gpkg.py +++ b/sno/working_copy/gpkg.py @@ -7,7 +7,7 @@ import click from osgeo import gdal -import sqlalchemy +import sqlalchemy as sa from sqlalchemy.orm import sessionmaker from sqlalchemy.sql.compiler import IdentifierPreparer from sqlalchemy.types import UserDefinedType @@ -26,6 +26,15 @@ class WorkingCopy_GPKG(WorkingCopy): + """ + GPKG working copy implementation. + + Requirements: + 1. Can read and write to the filesystem at the specified path. + """ + + WORKING_COPY_TYPE_NAME = "GPKG" + def __init__(self, repo, path): self.repo = repo self.path = path @@ -37,34 +46,34 @@ def __init__(self, repo, path): self.sno_tables = GpkgSnoTables @classmethod - def check_valid_creation_path(cls, workdir_path, path): - cls.check_valid_path(workdir_path, path) + def check_valid_creation_path(cls, wc_path, workdir_path=None): + cls.check_valid_path(wc_path, workdir_path) - gpkg_path = (workdir_path / path).resolve() + gpkg_path = (workdir_path / wc_path).resolve() if gpkg_path.exists(): desc = "path" if gpkg_path.is_dir() else "GPKG file" raise InvalidOperation( - f"Error creating GPKG working copy at {path} - {desc} already exists" + f"Error creating GPKG working copy at {wc_path} - {desc} already exists" ) @classmethod - def check_valid_path(cls, workdir_path, path): - if not str(path).endswith(".gpkg"): - suggested_path = f"{os.path.splitext(str(path))[0]}.gpkg" + def check_valid_path(cls, wc_path, workdir_path=None): + if not str(wc_path).endswith(".gpkg"): + suggested_path = f"{os.path.splitext(str(wc_path))[0]}.gpkg" raise click.UsageError( f"Invalid GPKG path - expected .gpkg suffix, eg {suggested_path}" ) @classmethod - def normalise_path(cls, repo, path): + def normalise_path(cls, repo, wc_path): """Rewrites a relative path (relative to the current directory) as relative to the repo.workdir_path.""" - path = Path(path) - if not path.is_absolute(): + wc_path = Path(wc_path) + if not wc_path.is_absolute(): try: - return str(path.resolve().relative_to(repo.workdir_path.resolve())) + return str(wc_path.resolve().relative_to(repo.workdir_path.resolve())) except ValueError: pass - return str(path) + return str(wc_path) @property def full_path(self): @@ -82,11 +91,11 @@ def _insert_or_replace_into_dataset(self, dataset): def _table_def_for_column_schema(self, col, dataset): if col.data_type == "geometry": - # This user-defined Geography type adapts WKB to SQL Server's native geography type. - return sqlalchemy.column(col.name, GeometryType) + # This user-defined GeometryType normalises GPKG geometry to the Sno V2 GPKG geometry. + return sa.column(col.name, GeometryType) else: # Don't need to specify type information for other columns at present, since we just pass through the values. - return sqlalchemy.column(col.name) + return sa.column(col.name) def _insert_or_replace_state_table_tree(self, sess, tree_id): r = sess.execute( @@ -119,37 +128,36 @@ def session(self, bulk=0): # - do something consistent and safe from then on. if hasattr(self, "_session"): - # inner - reuse + # Inner call - reuse existing session. L.debug(f"session(bulk={bulk}): existing...") yield self._session L.debug(f"session(bulk={bulk}): existing/done") + return - else: - L.debug(f"session(bulk={bulk}): new...") + # Outer call - create new session: + L.debug(f"session(bulk={bulk}): new...") + self._session = self.sessionmaker() - try: - self._session = self.sessionmaker() - - if bulk: - self._session.execute("PRAGMA synchronous = OFF;") - self._session.execute( - "PRAGMA cache_size = -1048576;" - ) # -KiB => 1GiB - if bulk >= 2: - self._session.execute("PRAGMA journal_mode = MEMORY;") - self._session.execute("PRAGMA locking_mode = EXCLUSIVE;") - - # TODO - use tidier syntax for opening transactions from sqlalchemy. - self._session.execute("BEGIN TRANSACTION;") - yield self._session - self._session.commit() - except Exception: - self._session.rollback() - raise - finally: - self._session.close() - del self._session - L.debug(f"session(bulk={bulk}): new/done") + try: + if bulk: + self._session.execute("PRAGMA synchronous = OFF;") + self._session.execute("PRAGMA cache_size = -1048576;") # -KiB => 1GiB + if bulk >= 2: + self._session.execute("PRAGMA journal_mode = MEMORY;") + self._session.execute("PRAGMA locking_mode = EXCLUSIVE;") + + # TODO - use tidier syntax for opening transactions from sqlalchemy. + self._session.execute("BEGIN TRANSACTION;") + yield self._session + self._session.commit() + + except Exception: + self._session.rollback() + raise + finally: + self._session.close() + del self._session + L.debug(f"session(bulk={bulk}): new/done") def delete(self, keep_db_schema_if_possible=False): """Delete the working copy files.""" @@ -428,12 +436,15 @@ def _delete_meta_metadata(self, sess, table_name): """DELETE FROM gpkg_metadata WHERE id IN :ids;""", ) for sql in sqls: - stmt = sqlalchemy.text(sql).bindparams( - sqlalchemy.bindparam("ids", expanding=True) - ) + stmt = sa.text(sql).bindparams(sa.bindparam("ids", expanding=True)) sess.execute(stmt, {"ids": ids}) - def _create_spatial_index(self, sess, dataset): + def _create_spatial_index_pre(self, sess, dataset): + # Implementing only _create_spatial_index_pre: + # gpkgAddSpatialIndex has to be called before writing any features, + # since it only adds on-write triggers to update the index - it doesn't + # add any pre-existing features to the index. + L = logging.getLogger(f"{self.__class__.__qualname__}._create_spatial_index") geom_col = dataset.geom_column_name @@ -480,56 +491,42 @@ def _create_triggers(self, sess, dataset): table_identifier = self.table_identifier(dataset) pk_column = self.quote(dataset.primary_key) - # SQLite doesn't let you do param substitutions in CREATE TRIGGER: - escaped_table_name = dataset.table_name.replace("'", "''") - - sess.execute( + insert_trigger = sa.text( f""" CREATE TRIGGER {self._quoted_trigger_name(dataset, 'ins')} AFTER INSERT ON {table_identifier} BEGIN - INSERT OR REPLACE INTO {self.SNO_TRACK} - (table_name, pk) - VALUES ('{escaped_table_name}', NEW.{pk_column}); + INSERT OR REPLACE INTO {self.SNO_TRACK} (table_name, pk) + VALUES (:table_name, NEW.{pk_column}); END; """ ) - sess.execute( + update_trigger = sa.text( f""" CREATE TRIGGER {self._quoted_trigger_name(dataset, 'upd')} AFTER UPDATE ON {table_identifier} BEGIN - INSERT OR REPLACE INTO {self.SNO_TRACK} - (table_name, pk) - VALUES - ('{escaped_table_name}', NEW.{pk_column}), - ('{escaped_table_name}', OLD.{pk_column}); + INSERT OR REPLACE INTO {self.SNO_TRACK} (table_name, pk) + VALUES (:table_name, NEW.{pk_column}), (:table_name, OLD.{pk_column}); END; """ ) - sess.execute( + delete_trigger = sa.text( f""" CREATE TRIGGER {self._quoted_trigger_name(dataset, 'del')} AFTER DELETE ON {table_identifier} BEGIN - INSERT OR REPLACE INTO {self.SNO_TRACK} - (table_name, pk) - VALUES - ('{escaped_table_name}', OLD.{pk_column}); + INSERT OR REPLACE INTO {self.SNO_TRACK} (table_name, pk) + VALUES (:table_name, OLD.{pk_column}); END; """ ) - - def _db_geom_to_gpkg_geom(self, g): - # Its possible in GPKG to put arbitrary values in columns, regardless of type. - # We don't try to convert them here - we let the commit validation step report this as an error. - if not isinstance(g, bytes): - return g - # We normalise geometries to avoid spurious diffs - diffs where nothing - # of any consequence has changed (eg, only endianness has changed). - # This includes setting the SRID to zero for each geometry so that we don't store a separate SRID per geometry, - # but only one per column at most. - return normalise_gpkg_geom(g) + for trigger in (insert_trigger, update_trigger, delete_trigger): + # Placeholders not allowed in CREATE TRIGGER - have to use literal_binds. + # See https://docs.sqlalchemy.org/en/13/faq/sqlexpressions.html#faq-sql-expression-string + trigger.bindparams(table_name=dataset.table_name).compile( + sess.connection(), compile_kwargs={"literal_binds": True} + ).execute() def _is_meta_update_supported(self, dataset_version, meta_diff): """ @@ -623,6 +620,21 @@ def _apply_meta_metadata_dataset_json(self, sess, dataset, src_value, dest_value def _update_last_write_time(self, sess, dataset, commit=None): self._update_gpkg_contents(sess, dataset, commit) + def _get_geom_extent(self, sess, dataset, default=None): + """Returns the envelope around the entire dataset as (min_x, min_y, max_x, max_y).""" + # FIXME: Why doesn't Extent(geom) work here as an aggregate? + geom_col = dataset.geom_column_name + r = sess.execute( + f""" + WITH _E AS ( + SELECT Extent({self.quote(geom_col)}) AS extent FROM {self.table_identifier(dataset)} + ) + SELECT ST_MinX(extent), ST_MinY(extent), ST_MaxX(extent), ST_MaxY(extent) FROM _E; + """ + ) + result = r.fetchone() + return default if result == (None, None, None, None) else result + def _update_gpkg_contents(self, sess, dataset, commit=None): """ Update the metadata for the given table in gpkg_contents to have the new bounding-box / last-updated timestamp. @@ -634,17 +646,11 @@ def _update_gpkg_contents(self, sess, dataset, commit=None): # GPKG Spec Req. 15: gpkg_change_time = change_time.strftime("%Y-%m-%dT%H:%M:%S.%fZ") - table_identifer = self.table_identifier(dataset) geom_col = dataset.geom_column_name if geom_col is not None: - # FIXME: Why doesn't Extent(geom) work here as an aggregate? - r = sess.execute( - f""" - WITH _E AS (SELECT extent({self.quote(geom_col)}) AS extent FROM {table_identifer}) - SELECT ST_MinX(extent), ST_MinY(extent), ST_MaxX(extent), ST_MaxY(extent) FROM _E - """ + min_x, min_y, max_x, max_y = self._get_geom_extent( + sess, dataset, default=(None, None, None, None) ) - min_x, min_y, max_x, max_y = r.fetchone() rc = sess.execute( """ UPDATE gpkg_contents diff --git a/sno/working_copy/postgis.py b/sno/working_copy/postgis.py index 611bb63ee..975045fdf 100644 --- a/sno/working_copy/postgis.py +++ b/sno/working_copy/postgis.py @@ -1,37 +1,39 @@ import contextlib import logging -import re import time -from urllib.parse import urlsplit, urlunsplit +from urllib.parse import urlsplit -import click + +from sqlalchemy import Index from sqlalchemy.dialects.postgresql import insert as postgresql_insert from sqlalchemy.sql.compiler import IdentifierPreparer from sqlalchemy.orm import sessionmaker -from .base import WorkingCopy from . import postgis_adapter +from .db_server import DatabaseServer_WorkingCopy from .table_defs import PostgisSnoTables from sno import crs_util -from sno.exceptions import InvalidOperation from sno.schema import Schema from sno.sqlalchemy import postgis_engine -""" -* database needs to exist -* database needs to have postgis enabled -* database user needs to be able to: - 1. create 'sno' schema & tables - 2. create & alter tables in the default (or specified) schema - 3. create triggers -""" +class WorkingCopy_Postgis(DatabaseServer_WorkingCopy): + """ + PosttGIS working copy implementation. -L = logging.getLogger("sno.working_copy.postgis") + Requirements: + 1. The database needs to exist + 2. If the dataset has geometry, then PostGIS (https://postgis.net/) v2.4 or newer needs + to be installed into the database and available in the database user's search path + 3. The database user needs to be able to: + - Create the specified schema (unless it already exists). + - Create, delete and alter tables and triggers in the specified schema. + """ + WORKING_COPY_TYPE_NAME = "PostGIS" + URI_SCHEME = "postgresql" -class WorkingCopy_Postgis(WorkingCopy): def __init__(self, repo, uri): """ uri: connection string of the form postgresql://[user[:password]@][netloc][:port][/dbname/schema][?param1=value1&...] @@ -42,75 +44,18 @@ def __init__(self, repo, uri): self.uri = uri self.path = uri - url = urlsplit(uri) - - if url.scheme != "postgresql": - raise ValueError("Expecting postgresql://") + self.check_valid_db_uri(uri) + self.db_uri, self.db_schema = self._separate_db_schema(uri) - url_path = url.path - path_parts = url_path[1:].split("/", 3) if url_path else [] - if len(path_parts) != 2: - raise ValueError("Expecting postgresql://[HOST]/DBNAME/SCHEMA") - url_path = f"/{path_parts[0]}" - self.db_schema = path_parts[1] - - # Rebuild DB URL suitable for postgres - self.dburl = urlunsplit([url.scheme, url.netloc, url_path, url.query, ""]) - self.engine = postgis_engine(self.dburl) + self.engine = postgis_engine(self.db_uri) self.sessionmaker = sessionmaker(bind=self.engine) self.preparer = IdentifierPreparer(self.engine.dialect) self.sno_tables = PostgisSnoTables(self.db_schema) @classmethod - def check_valid_creation_path(cls, workdir_path, path): - cls.check_valid_path(workdir_path, path) - postgis_wc = cls(None, path) - - # Less strict on Postgis - we are okay with the schema being already created, so long as its empty. - if postgis_wc.has_data(): - raise InvalidOperation( - f"Error creating Postgis working copy at {path} - non-empty schema already exists" - ) - - @classmethod - def check_valid_path(cls, workdir_path, path): - url = urlsplit(path) - - if url.scheme != "postgresql": - raise click.UsageError( - "Invalid postgres URI - Expecting URI in form: postgresql://[HOST]/DBNAME/SCHEMA" - ) - - url_path = url.path - path_parts = url_path[1:].split("/", 3) if url_path else [] - - suggestion_message = "" - if len(path_parts) == 1 and workdir_path is not None: - suggested_path = f"/{path_parts[0]}/{cls.default_schema(workdir_path)}" - suggested_uri = urlunsplit( - [url.scheme, url.netloc, suggested_path, url.query, ""] - ) - suggestion_message = f"\nFor example: {suggested_uri}" - - if len(path_parts) != 2: - raise click.UsageError( - "Invalid postgres URI - postgis working copy requires both dbname and schema:\n" - "Expecting URI in form: postgresql://[HOST]/DBNAME/SCHEMA" - + suggestion_message - ) - - @classmethod - def normalise_path(cls, repo, path): - return path - - @classmethod - def default_schema(cls, workdir_path): - stem = workdir_path.stem - schema = re.sub("[^a-z0-9]+", "_", stem.lower()) + "_sno" - if schema[0].isdigit(): - schema = "_" + schema - return schema + def check_valid_path(cls, wc_path, workdir_path=None): + cls.check_valid_db_uri(wc_path, workdir_path) def __str__(self): p = urlsplit(self.uri) @@ -124,44 +69,11 @@ def __str__(self): p._replace(netloc=nl) return p.geturl() - @contextlib.contextmanager - def session(self, bulk=0): - """ - Context manager for GeoPackage DB sessions, yields a connection object inside a transaction - - Calling again yields the _same_ connection, the transaction/etc only happen in the outer one. - """ - L = logging.getLogger(f"{self.__class__.__qualname__}.session") - - if hasattr(self, "_session"): - # inner - reuse - L.debug("session: existing...") - yield self._session - L.debug("session: existing/done") - - else: - L.debug("session: new...") - - try: - self._session = self.sessionmaker() - - # TODO - use tidier syntax for opening transactions from sqlalchemy. - self._session.execute("BEGIN TRANSACTION;") - yield self._session - self._session.commit() - except Exception: - self._session.rollback() - raise - finally: - self._session.close() - del self._session - L.debug("session: new/done") - def is_created(self): """ - Returns true if the postgres schema referred to by this working copy exists and + Returns true if the DB schema referred to by this working copy exists and contains at least one table. If it exists but is empty, it is treated as uncreated. - This is so the postgres schema can be created ahead of time before a repo is created + This is so the DB schema can be created ahead of time before a repo is created or configured, without it triggering code that checks for corrupted working copies. Note that it might not be initialised as a working copy - see self.is_initialised. """ @@ -177,7 +89,7 @@ def is_created(self): def is_initialised(self): """ - Returns true if the postgis working copy is initialised - + Returns true if the PostGIS working copy is initialised - the schema exists and has the necessary sno tables, _sno_state and _sno_track. """ with self.session() as sess: @@ -192,7 +104,7 @@ def is_initialised(self): def has_data(self): """ - Returns true if the postgis working copy seems to have user-created content already. + Returns true if the PostGIS working copy seems to have user-created content already. """ with self.session() as sess: count = sess.scalar( @@ -283,6 +195,7 @@ def _create_table_for_dataset(self, sess, dataset): ) def _insert_or_replace_into_dataset(self, dataset): + # See https://docs.sqlalchemy.org/en/14/dialects/postgresql.html#insert-on-conflict-upsert pk_col_names = [c.name for c in dataset.schema.pk_columns] stmt = postgresql_insert(self._table_def_for_dataset(dataset)) update_dict = {c.name: c for c in stmt.excluded if c.name not in pk_col_names} @@ -326,21 +239,25 @@ def delete_meta(self, dataset): """Delete any metadata that is only needed by this dataset.""" pass # There is no metadata except for the spatial_ref_sys table. - def _create_spatial_index(self, sess, dataset): + def _create_spatial_index_post(self, sess, dataset): + # Only implemented as _create_spatial_index_post: + # It is more efficient to write the features first, then index them all in bulk. L = logging.getLogger(f"{self.__class__.__qualname__}._create_spatial_index") geom_col = dataset.geom_column_name + index_name = f"{dataset.table_name}_idx_{geom_col}" + table = self._table_def_for_dataset(dataset) - # Create the PostGIS Spatial Index L.debug("Creating spatial index for %s.%s", dataset.table_name, geom_col) t0 = time.monotonic() - index_name = f"{dataset.table_name}_idx_{geom_col}" - sess.execute( - f""" - CREATE INDEX {self.quote(index_name)} - ON {self.table_identifier(dataset)} USING GIST ({self.quote(geom_col)}); - """ + + spatial_index = Index( + index_name, table.columns[geom_col], postgres_using="GIST" ) + spatial_index.table = table + spatial_index.create(sess.connection()) + sess.execute(f"""ANALYZE {self.table_identifier(dataset)};""") + L.info("Created spatial index in %ss", time.monotonic() - t0) def _drop_spatial_index(self, sess, dataset): diff --git a/sno/working_copy/sqlserver.py b/sno/working_copy/sqlserver.py index ee4d8bc33..6156a4c5b 100644 --- a/sno/working_copy/sqlserver.py +++ b/sno/working_copy/sqlserver.py @@ -1,11 +1,9 @@ import contextlib import logging -import re import time -from urllib.parse import urlsplit, urlunsplit +from urllib.parse import urlsplit -import click -import sqlalchemy +import sqlalchemy as sa from sqlalchemy import literal_column from sqlalchemy.ext.compiler import compiles from sqlalchemy.orm import sessionmaker @@ -15,28 +13,28 @@ from sqlalchemy.sql.compiler import IdentifierPreparer from sqlalchemy.types import UserDefinedType -from .base import WorkingCopy from . import sqlserver_adapter +from .db_server import DatabaseServer_WorkingCopy from .table_defs import SqlServerSnoTables from sno import crs_util from sno.geometry import Geometry -from sno.exceptions import InvalidOperation from sno.sqlalchemy import sqlserver_engine -""" -* database needs to exist -* database needs to have postgis enabled -* database user needs to be able to: - 1. create 'sno' schema & tables - 2. create & alter tables in the default (or specified) schema - 3. create triggers -""" +class WorkingCopy_SqlServer(DatabaseServer_WorkingCopy): + """ + SQL Server working copy implementation. -L = logging.getLogger("sno.working_copy.postgis") + Requirements: + 1. The database needs to exist + 2. The database user needs to be able to: + - Create the specified schema (unless it already exists). + - Create, delete and alter tables and triggers in the specified schema. + """ + WORKING_COPY_TYPE_NAME = "SQL Server" + URI_SCHEME = "mssql" -class WorkingCopy_SqlServer(WorkingCopy): def __init__(self, repo, uri): """ uri: connection string of the form mssql://[user[:password]@][netloc][:port][/dbname/schema][?param1=value1&...] @@ -47,78 +45,15 @@ def __init__(self, repo, uri): self.uri = uri self.path = uri - url = urlsplit(uri) - - if url.scheme != "mssql": - raise ValueError("Expecting mssql://") - - url_path = url.path - path_parts = url_path[1:].split("/", 3) if url_path else [] - if len(path_parts) != 2: - raise ValueError("Expecting mssql://[HOST]/DBNAME/SCHEMA") - url_path = f"/{path_parts[0]}" - self.db_schema = path_parts[1] + self.check_valid_db_uri(uri) + self.db_uri, self.db_schema = self._separate_db_schema(uri) - # Rebuild DB URL suitable for sqlserver_engine. - self.dburl = urlunsplit([url.scheme, url.netloc, url_path, url.query, ""]) - self.engine = sqlserver_engine(self.dburl) + self.engine = sqlserver_engine(self.db_uri) self.sessionmaker = sessionmaker(bind=self.engine) self.preparer = IdentifierPreparer(self.engine.dialect) self.sno_tables = SqlServerSnoTables(self.db_schema) - @classmethod - def check_valid_creation_path(cls, workdir_path, path): - # TODO - promote to superclass - cls.check_valid_path(workdir_path, path) - sqlserver_wc = cls(None, path) - - # We are okay with the schema being already created, so long as its empty. - if sqlserver_wc.has_data(): - raise InvalidOperation( - f"Error creating SQL Server working copy at {path} - non-empty schema already exists" - ) - - @classmethod - def check_valid_path(cls, workdir_path, path): - url = urlsplit(path) - - if url.scheme != "mssql": - raise click.UsageError( - "Invalid postgres URI - Expecting URI in form: mssql://[HOST]/DBNAME/SCHEMA" - ) - - url_path = url.path - path_parts = url_path[1:].split("/", 3) if url_path else [] - - suggestion_message = "" - if len(path_parts) == 1 and workdir_path is not None: - suggested_path = f"/{path_parts[0]}/{cls.default_schema(workdir_path)}" - suggested_uri = urlunsplit( - [url.scheme, url.netloc, suggested_path, url.query, ""] - ) - suggestion_message = f"\nFor example: {suggested_uri}" - - if len(path_parts) != 2: - raise click.UsageError( - "Invalid mssql URI - SQL Server working copy requires both dbname and schema:\n" - "Expecting URI in form: mssql://[HOST]/DBNAME/SCHEMA" - + suggestion_message - ) - - @classmethod - def normalise_path(cls, repo, path): - return path - - @classmethod - def default_schema(cls, workdir_path): - # TODO - promote to superclass - stem = workdir_path.stem - schema = re.sub("[^a-z0-9]+", "_", stem.lower()) + "_sno" - if schema[0].isdigit(): - schema = "_" + schema - return schema - def __str__(self): p = urlsplit(self.uri) if p.password is not None: @@ -131,38 +66,6 @@ def __str__(self): p._replace(netloc=nl) return p.geturl() - @contextlib.contextmanager - def session(self, bulk=0): - """ - Context manager for GeoPackage DB sessions, yields a connection object inside a transaction - - Calling again yields the _same_ connection, the transaction/etc only happen in the outer one. - """ - L = logging.getLogger(f"{self.__class__.__qualname__}.session") - - if hasattr(self, "_session"): - # inner - reuse - L.debug("session: existing...") - yield self._session - L.debug("session: existing/done") - - else: - L.debug("session: new...") - - try: - self._session = self.sessionmaker() - - # TODO - use tidier syntax for opening transactions from sqlalchemy. - yield self._session - self._session.commit() - except Exception: - self._session.rollback() - raise - finally: - self._session.close() - del self._session - L.debug("session: new/done") - def is_created(self): """ Returns true if the db schema referred to by this working copy exists and @@ -180,7 +83,7 @@ def is_created(self): def is_initialised(self): """ - Returns true if the postgis working copy is initialised - + Returns true if the SQL server working copy is initialised - the schema exists and has the necessary sno tables, _sno_state and _sno_track. """ with self.session() as sess: @@ -196,7 +99,7 @@ def is_initialised(self): def has_data(self): """ - Returns true if the postgis working copy seems to have user-created content already. + Returns true if the SQL server working copy seems to have user-created content already. """ with self.session() as sess: count = sess.scalar( @@ -245,15 +148,15 @@ def _create_table_for_dataset(self, sess, dataset): def _table_def_for_column_schema(self, col, dataset): if col.data_type == "geometry": - # This user-defined Geography type adapts WKB to SQL Server's native geography type. crs_name = col.extra_type_info.get("geometryCRS", None) crs_id = crs_util.get_identifier_int_from_dataset(dataset, crs_name) or 0 - return sqlalchemy.column(col.name, GeographyType(crs_id)) + # This user-defined GeometryType adapts Sno's GPKG geometry to SQL Server's native geometry type. + return sa.column(col.name, GeometryType(crs_id)) elif col.data_type in ("date", "time", "timestamp"): - return sqlalchemy.column(col.name, BaseDateOrTimeType) + return sa.column(col.name, BaseDateOrTimeType) else: # Don't need to specify type information for other columns at present, since we just pass through the values. - return sqlalchemy.column(col.name) + return sa.column(col.name) def _insert_or_replace_into_dataset(self, dataset): pk_col_names = [c.name for c in dataset.schema.pk_columns] @@ -282,33 +185,85 @@ def _insert_or_replace_state_table_tree(self, sess, tree_id): return r.rowcount def _write_meta(self, sess, dataset): - """Write the title (as a comment) and the CRS. Other metadata is not stored in a PostGIS WC.""" + """Write the title. Other metadata is not stored in a SQL Server WC.""" self._write_meta_title(sess, dataset) def _write_meta_title(self, sess, dataset): """Write the dataset title as a comment on the table.""" - # TODO - probably need to use sp_addextendedproperty @name=N'MS_Description' + # TODO - dataset title is not stored anywhere in SQL server working copy right now. + # We can probably store it using function sp_addextendedproperty to add property 'MS_Description' pass def delete_meta(self, dataset): """Delete any metadata that is only needed by this dataset.""" - pass # There is no metadata except for the spatial_ref_sys table. + # There is no metadata stored anywhere except the table itself. + pass + + def _get_geom_extent(self, sess, dataset, default=None): + """Returns the envelope around the entire dataset as (min_x, min_y, max_x, max_y).""" + geom_col = dataset.geom_column_name + r = sess.execute( + f""" + WITH _E AS ( + SELECT geometry::EnvelopeAggregate({self.quote(geom_col)}) AS envelope + FROM {self.table_identifier(dataset)} + ) + SELECT + envelope.STPointN(1).STX AS min_x, + envelope.STPointN(1).STY AS min_y, + envelope.STPointN(3).STX AS max_x, + envelope.STPointN(3).STY AS max_y + FROM _E; + """ + ) + result = r.fetchone() + return default if result == (None, None, None, None) else result + + def _grow_rectangle(self, rectangle, scale_factor): + # scale_factor = 1 -> no change, >1 -> grow, <1 -> shrink. + min_x, min_y, max_x, max_y = rectangle + centre_x, centre_y = (min_x + max_x) / 2, (min_y + max_y) / 2 + min_x = (min_x - centre_x) * scale_factor + centre_x + min_y = (min_y - centre_y) * scale_factor + centre_y + max_x = (max_x - centre_x) * scale_factor + centre_x + max_y = (max_y - centre_y) * scale_factor + centre_y + return min_x, min_y, max_x, max_y + + def _create_spatial_index_post(self, sess, dataset): + # Only implementing _create_spatial_index_post: + # We need to know the rough extent of the data to create an index in that area, + # so we create the spatial index once the bulk of the features have been written. - def _create_spatial_index(self, sess, dataset): L = logging.getLogger(f"{self.__class__.__qualname__}._create_spatial_index") + extent = self._get_geom_extent(sess, dataset) + if not extent: + # Can't create a spatial index if we don't know the rough bounding box we need to index. + return + + # Add 20% room to grow. + GROW_FACTOR = 1.2 + min_x, min_y, max_x, max_y = self._grow_rectangle(extent, GROW_FACTOR) + geom_col = dataset.geom_column_name + index_name = f"{dataset.table_name}_idx_{geom_col}" - # Create the SQL Server Spatial Index L.debug("Creating spatial index for %s.%s", dataset.table_name, geom_col) t0 = time.monotonic() - index_name = f"{dataset.table_name}_idx_{geom_col}" - sess.execute( + + create_index = sa.text( f""" CREATE SPATIAL INDEX {self.quote(index_name)} - ON {self.table_identifier(dataset)} ({self.quote(geom_col)}); + ON {self.table_identifier(dataset)} ({self.quote(geom_col)}) + WITH (BOUNDING_BOX = (:min_x, :min_y, :max_x, :max_y)) """ - ) + ).bindparams(min_x=min_x, min_y=min_y, max_x=max_x, max_y=max_y) + # Placeholders not allowed in CREATE SPATIAL INDEX - have to use literal_binds. + # See https://docs.sqlalchemy.org/en/13/faq/sqlexpressions.html#faq-sql-expression-string + create_index.compile( + sess.connection(), compile_kwargs={"literal_binds": True} + ).execute() + L.info("Created spatial index in %ss", time.monotonic() - t0) def _drop_spatial_index(self, sess, dataset): @@ -321,24 +276,26 @@ def _quoted_trigger_name(self, dataset): def _create_triggers(self, sess, dataset): pk_name = dataset.primary_key - escaped_table_name = dataset.table_name.replace("'", "''") - - sess.execute( + create_trigger = sa.text( f""" CREATE TRIGGER {self._quoted_trigger_name(dataset)} ON {self.table_identifier(dataset)} AFTER INSERT, UPDATE, DELETE AS BEGIN MERGE {self.SNO_TRACK} TRA USING - (SELECT '{escaped_table_name}', {self.quote(pk_name)} FROM inserted - UNION SELECT '{escaped_table_name}', {self.quote(pk_name)} FROM deleted) + (SELECT :table_name, {self.quote(pk_name)} FROM inserted + UNION SELECT :table_name, {self.quote(pk_name)} FROM deleted) AS SRC (table_name, pk) ON SRC.table_name = TRA.table_name AND SRC.pk = TRA.pk WHEN NOT MATCHED THEN INSERT (table_name, pk) VALUES (SRC.table_name, SRC.pk); END; - """, - {"table_name": dataset.table_name}, - ) + """ + ).bindparams(table_name=dataset.table_name) + # Placeholders not allowed in CREATE TRIGGER - have to use literal_binds. + # See https://docs.sqlalchemy.org/en/13/faq/sqlexpressions.html#faq-sql-expression-string + create_trigger.compile( + sess.connection(), compile_kwargs={"literal_binds": True} + ).execute() @contextlib.contextmanager def _suspend_triggers(self, sess, dataset): @@ -352,24 +309,6 @@ def _suspend_triggers(self, sess, dataset): def meta_items(self, dataset): with self.session() as sess: - table_info_sql = """ - SELECT - C.column_name, C.ordinal_position, C.data_type, C.udt_name, - C.character_maximum_length, C.numeric_precision, C.numeric_scale, - KCU.ordinal_position AS pk_ordinal_position, - upper(postgis_typmod_type(A.atttypmod)) AS geometry_type, - postgis_typmod_srid(A.atttypmod) AS geometry_srid - FROM information_schema.columns C - LEFT OUTER JOIN information_schema.key_column_usage KCU - ON (KCU.table_schema = C.table_schema) - AND (KCU.table_name = C.table_name) - AND (KCU.column_name = C.column_name) - LEFT OUTER JOIN pg_attribute A - ON (A.attname = C.column_name) - AND (A.attrelid = (C.table_schema || '.' || C.table_name)::regclass::oid) - WHERE C.table_schema=:table_schema AND C.table_name=:table_name - ORDER BY C.ordinal_position; - """ table_info_sql = """ SELECT C.column_name, C.ordinal_position, C.data_type, @@ -405,10 +344,6 @@ def try_align_schema_col(cls, old_col_dict, new_col_dict): old_type = old_col_dict["dataType"] new_type = new_col_dict["dataType"] - # Some types have to be approximated as other types in SQL Server. - if sqlserver_adapter.APPROXIMATED_TYPES.get(old_type) == new_type: - new_col_dict["dataType"] = new_type = old_type - # Geometry type loses its extra type info when roundtripped through SQL Server. if new_type == "geometry": new_col_dict["geometryType"] = old_col_dict.get("geometryType") @@ -419,7 +354,7 @@ def try_align_schema_col(cls, old_col_dict, new_col_dict): def _remove_hidden_meta_diffs(self, dataset, ds_meta_items, wc_meta_items): super()._remove_hidden_meta_diffs(dataset, ds_meta_items, wc_meta_items) - # Nowhere to put these in postgis WC + # Nowhere to put these in SQL Server WC for key in self._UNSUPPORTED_META_ITEMS: if key in ds_meta_items: del ds_meta_items[key] @@ -447,15 +382,13 @@ class InstanceFunction(Function): >>> function(element) """ - pass - @compiles(InstanceFunction) def compile_instance_function(element, compiler, **kw): return "(%s).%s()" % (element.clauses, element.name) -class GeographyType(UserDefinedType): +class GeometryType(UserDefinedType): """UserDefinedType so that V2 geometry is adapted to MS binary format.""" def __init__(self, crs_id): @@ -468,7 +401,7 @@ def bind_processor(self, dialect): def bind_expression(self, bindvalue): # 2. Writing - SQL layer - wrap in call to STGeomFromWKB to convert WKB to MS binary. return Function( - quoted_name("geography::STGeomFromWKB", False), + quoted_name("geometry::STGeomFromWKB", False), bindvalue, self.crs_id, type_=self, @@ -484,10 +417,14 @@ def result_processor(self, dialect, coltype): class BaseDateOrTimeType(UserDefinedType): - """UserDefinedType so we read dates, times, and datetimes as text.""" + """ + UserDefinedType so we read dates, times, and datetimes as text. + They are stored as date / time / datetime in SQL Server, but read back out as text. + """ def column_expression(self, col): # When reading, convert dates and times to strings using style 127: ISO8601 with time zone Z. + # https://docs.microsoft.com/en-us/sql/t-sql/functions/cast-and-convert-transact-sql return Function( "CONVERT", literal_column("NVARCHAR"), diff --git a/sno/working_copy/sqlserver_adapter.py b/sno/working_copy/sqlserver_adapter.py index de64fe778..86f10b329 100644 --- a/sno/working_copy/sqlserver_adapter.py +++ b/sno/working_copy/sqlserver_adapter.py @@ -1,4 +1,3 @@ -from sno import crs_util from sno.schema import Schema, ColumnSchema from sqlalchemy.sql.compiler import IdentifierPreparer @@ -17,7 +16,7 @@ def quote(ident): "blob": "varbinary", "date": "date", "float": {0: "real", 32: "real", 64: "float"}, - "geometry": "geography", + "geometry": "geometry", "integer": { 0: "int", 8: "tinyint", @@ -25,7 +24,7 @@ def quote(ident): 32: "int", 64: "bigint", }, - "interval": "nvarchar", + "interval": "text", "numeric": "numeric", "text": "nvarchar", "time": "time", @@ -40,6 +39,8 @@ def quote(ident): "bigint": ("integer", 64), "real": ("float", 32), "float": ("float", 64), + "binary": "blob", + "char": "text", "date": "date", "datetime": "timestamp", "datetime2": "timestamp", @@ -47,9 +48,10 @@ def quote(ident): "decimal": "numeric", "geography": "geometry", "geometry": "geometry", - "interval": "interval", + "nchar": "text", "numeric": "numeric", "nvarchar": "text", + "ntext": "text", "text": "text", "time": "time", "varchar": "text", @@ -91,13 +93,6 @@ def v2_type_to_ms_type(column_schema, v2_obj): return ms_type_info.get(extra_type_info.get("size", 0)) ms_type = ms_type_info - if ms_type == "geometry": - geometry_type = extra_type_info.get("geometryType") - crs_name = extra_type_info.get("geometryCRS") - crs_id = None - if crs_name is not None: - crs_id = crs_util.get_identifier_int_from_dataset(v2_obj, crs_name) - return _v2_geometry_type_to_ms_type(geometry_type, crs_id) if ms_type in ("varchar", "nvarchar", "varbinary"): length = extra_type_info.get("length", None) @@ -116,33 +111,19 @@ def v2_type_to_ms_type(column_schema, v2_obj): return ms_type -def _v2_geometry_type_to_ms_type(geometry_type, crs_id): - if geometry_type is not None: - geometry_type = geometry_type.replace(" ", "") - - if geometry_type is not None and crs_id is not None: - return f"geometry({geometry_type},{crs_id})" - elif geometry_type is not None: - return f"geometry({geometry_type})" - else: - return "geometry" - - def sqlserver_to_v2_schema(ms_table_info, id_salt): - """Generate a V2 schema from the given postgis metadata tables.""" + """Generate a V2 schema from the given SQL server metadata.""" return Schema([_sqlserver_to_column_schema(col, id_salt) for col in ms_table_info]) def _sqlserver_to_column_schema(ms_col_info, id_salt): """ - Given the postgis column info for a particular column, and some extra context in - case it is a geometry column, converts it to a ColumnSchema. The extra context will - only be used if the given ms_col_info is the geometry column. + Given the MS column info for a particular column, converts it to a ColumnSchema. + Parameters: ms_col_info - info about a single column from ms_table_info. - ms_spatial_ref_sys - rows of the "spatial_ref_sys" table that are referenced by this dataset. id_salt - the UUIDs of the generated ColumnSchema are deterministic and depend on - the name and type of the column, and on this salt. + the name and type of the column, and on this salt. """ name = ms_col_info["column_name"] pk_index = ms_col_info["pk_ordinal_position"] diff --git a/sno/working_copy/table_defs.py b/sno/working_copy/table_defs.py index 5f8fbd2dc..ef500911c 100644 --- a/sno/working_copy/table_defs.py +++ b/sno/working_copy/table_defs.py @@ -104,9 +104,11 @@ def create_all(self, session): class SqlServerSnoTables(TableSet): """ - Tables for sno-specific metadata - PostGIS variant. + Tables for sno-specific metadata - SQL Server variant. Table names have a user-defined schema, and so unlike other table sets, we need to construct an instance with the appropriate schema. + Primary keys have to be NVARCHAR of a fixed maximum length - + if the total maximum length is too long, SQL Server cannot generate an index. """ def __init__(self, schema=None): diff --git a/tests/conftest.py b/tests/conftest.py index 014dbda6c..47cdf6deb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -930,8 +930,8 @@ def ctx(create=False): def sqlserver_db(): """ Using docker, you can run a SQL Server test - such as those in test_working_copy_sqlserver - as follows: - docker run -it --rm -d -p 11433:1433 -e ACCEPT_EULA=Y -e 'SA_PASSWORD=Sql(server)' mcr.microsoft.com/mssql/server - SNO_SQLSERVER_URL='mssql://sa:Sql(server)@127.0.0.1:11433/master' spytest -k sqlserver --pdb -vvs + docker run -it --rm -d -p 11433:1433 -e ACCEPT_EULA=Y -e 'SA_PASSWORD=PassWord1' mcr.microsoft.com/mssql/server + SNO_SQLSERVER_URL='mssql://sa:PassWord1@127.0.0.1:11433/master' pytest -k sqlserver --pdb -vvs """ if "SNO_SQLSERVER_URL" not in os.environ: raise pytest.skip( @@ -939,7 +939,7 @@ def sqlserver_db(): ) engine = sqlserver_engine(os.environ["SNO_SQLSERVER_URL"]) with engine.connect() as conn: - # test connection and postgis support + # Test connection try: conn.execute("SELECT @@version;") except sqlalchemy.exc.DBAPIError: diff --git a/tests/test_working_copy_sqlserver.py b/tests/test_working_copy_sqlserver.py index 2853f2263..aa5e5f6a0 100644 --- a/tests/test_working_copy_sqlserver.py +++ b/tests/test_working_copy_sqlserver.py @@ -308,6 +308,6 @@ def test_types_roundtrip(data_archive, cli_runner, new_sqlserver_db_schema): r = cli_runner.invoke(["checkout"]) # If type-approximation roundtrip code isn't working, - # we would get spurious diffs on types that PostGIS doesn't support. + # we would get spurious diffs on types that SQL server doesn't support. r = cli_runner.invoke(["diff", "--exit-code"]) assert r.exit_code == 0, r.stdout