From 973a63d4d8a33dd77bd1fe7e11a24ee63771ad42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=A1s=20Aldecoa?= Date: Tue, 17 Jan 2023 17:19:35 -0300 Subject: [PATCH 1/5] support PG json vs RS super matching --- sqeleton/abcs/database_types.py | 12 +++++++++ sqeleton/databases/base.py | 4 +++ sqeleton/databases/postgresql.py | 36 ++++++++++++++++++++++++++ sqeleton/databases/redshift.py | 44 +++++++++++++++++++++++++++++++- 4 files changed, 95 insertions(+), 1 deletion(-) diff --git a/sqeleton/abcs/database_types.py b/sqeleton/abcs/database_types.py index bc022b5..9a66bcd 100644 --- a/sqeleton/abcs/database_types.py +++ b/sqeleton/abcs/database_types.py @@ -134,6 +134,18 @@ class Text(StringType): supported = False +class JSONType(ColType): + pass + + +class RedShiftSuper(JSONType): + pass + + +class PostgresqlJSON(JSONType): + pass + + @dataclass class Integer(NumericType, IKey): precision: int = 0 diff --git a/sqeleton/databases/base.py b/sqeleton/databases/base.py index cc6abc9..f4f9076 100644 --- a/sqeleton/databases/base.py +++ b/sqeleton/databases/base.py @@ -34,6 +34,7 @@ DbTime, DbPath, Boolean, + JSONType ) from ..abcs.mixins import Compilable from ..abcs.mixins import AbstractMixin_Schema, AbstractMixin_RandomSample, AbstractMixin_NormalizeValue @@ -246,6 +247,9 @@ def parse_type( elif issubclass(cls, (Text, Native_UUID)): return cls() + elif issubclass(cls, JSONType): + return cls() + raise TypeError(f"Parsing {type_repr} returned an unknown type '{cls}'.") def _convert_db_precision_to_digits(self, p: int) -> int: diff --git a/sqeleton/databases/postgresql.py b/sqeleton/databases/postgresql.py index 7c7b47d..ffa12d1 100644 --- a/sqeleton/databases/postgresql.py +++ b/sqeleton/databases/postgresql.py @@ -1,15 +1,18 @@ from ..abcs.database_types import ( + ColType, Timestamp, TimestampTZ, Float, Decimal, Integer, TemporalType, + ColType_UUID, Native_UUID, Text, FractionalType, Boolean, Date, + PostgresqlJSON ) from ..abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue from .base import BaseDialect, ThreadedDatabase, import_helper, ConnectError, Mixin_Schema @@ -48,6 +51,37 @@ def normalize_number(self, value: str, coltype: FractionalType) -> str: def normalize_boolean(self, value: str, _coltype: Boolean) -> str: return self.to_string(f"{value}::int") + def normalize_json(self, value: str, _coltype: PostgresqlJSON) -> str: + return f"replace({value}::text, '\": \"', '\":\"')" # minified json + + def normalize_value_by_type(self, value: str, coltype: ColType) -> str: + """Creates an SQL expression, that converts 'value' to a normalized representation. + + The returned expression must accept any SQL value, and return a string. + + The default implementation dispatches to a method according to `coltype`: + + :: + + TemporalType -> normalize_timestamp() + FractionalType -> normalize_number() + *else* -> to_string() + + (`Integer` falls in the *else* category) + + """ + if isinstance(coltype, TemporalType): + return self.normalize_timestamp(value, coltype) + elif isinstance(coltype, FractionalType): + return self.normalize_number(value, coltype) + elif isinstance(coltype, ColType_UUID): + return self.normalize_uuid(value, coltype) + elif isinstance(coltype, Boolean): + return self.normalize_boolean(value, coltype) + elif isinstance(coltype, PostgresqlJSON): + return self.normalize_json(value, coltype) + return self.to_string(value) + class PostgresqlDialect(BaseDialect, Mixin_Schema): name = "PostgreSQL" @@ -74,6 +108,8 @@ class PostgresqlDialect(BaseDialect, Mixin_Schema): "character varying": Text, "varchar": Text, "text": Text, + # JSON + "json": PostgresqlJSON, # UUID "uuid": Native_UUID, # Boolean diff --git a/sqeleton/databases/redshift.py b/sqeleton/databases/redshift.py index 4eaf171..8a68983 100644 --- a/sqeleton/databases/redshift.py +++ b/sqeleton/databases/redshift.py @@ -1,5 +1,14 @@ from typing import List, Dict -from ..abcs.database_types import Float, TemporalType, FractionalType, DbPath +from ..abcs.database_types import ( + Float, + TemporalType, + FractionalType, + DbPath, + RedShiftSuper, + ColType, + ColType_UUID, + Boolean +) from ..abcs.mixins import AbstractMixin_MD5 from .postgresql import ( PostgreSQL, @@ -40,6 +49,37 @@ def normalize_timestamp(self, value: str, coltype: TemporalType) -> str: def normalize_number(self, value: str, coltype: FractionalType) -> str: return self.to_string(f"{value}::decimal(38,{coltype.precision})") + def normalize_super(self, value: str, _coltype: RedShiftSuper) -> str: + return f'nvl2({value}, json_serialize({value}), NULL)' # only ::varchar causes redshift to return null + + def normalize_value_by_type(self, value: str, coltype: ColType) -> str: + """Creates an SQL expression, that converts 'value' to a normalized representation. + + The returned expression must accept any SQL value, and return a string. + + The default implementation dispatches to a method according to `coltype`: + + :: + + TemporalType -> normalize_timestamp() + FractionalType -> normalize_number() + *else* -> to_string() + + (`Integer` falls in the *else* category) + + """ + if isinstance(coltype, TemporalType): + return self.normalize_timestamp(value, coltype) + elif isinstance(coltype, FractionalType): + return self.normalize_number(value, coltype) + elif isinstance(coltype, ColType_UUID): + return self.normalize_uuid(value, coltype) + elif isinstance(coltype, Boolean): + return self.normalize_boolean(value, coltype) + elif isinstance(coltype, RedShiftSuper): + return self.normalize_super(value, coltype) + return self.to_string(value) + class Dialect(PostgresqlDialect): name = "Redshift" @@ -47,6 +87,8 @@ class Dialect(PostgresqlDialect): **PostgresqlDialect.TYPE_CLASSES, "double": Float, "real": Float, + # JSON + "super": RedShiftSuper } SUPPORTS_INDEXES = False From 93b6a2b8f09d816aae05cb80b117cf8962af8a31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=A1s=20Aldecoa?= Date: Mon, 30 Jan 2023 16:26:53 -0300 Subject: [PATCH 2/5] remove spaces after comma in PG json, refactor --- sqeleton/abcs/mixins.py | 8 ++++++- sqeleton/databases/postgresql.py | 32 +-------------------------- sqeleton/databases/redshift.py | 37 +++----------------------------- 3 files changed, 11 insertions(+), 66 deletions(-) diff --git a/sqeleton/abcs/mixins.py b/sqeleton/abcs/mixins.py index 34d3250..099065d 100644 --- a/sqeleton/abcs/mixins.py +++ b/sqeleton/abcs/mixins.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from .database_types import TemporalType, FractionalType, ColType_UUID, Boolean, ColType, String_UUID +from .database_types import TemporalType, FractionalType, ColType_UUID, Boolean, ColType, String_UUID, JSONType from .compiler import Compilable @@ -49,6 +49,10 @@ def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str: return f"TRIM({value})" return self.to_string(value) + def normalize_json(self, value: str, _coltype: JSONType) -> str: + """Creates an SQL expression, that converts 'value' to its minified json string representation.""" + raise NotImplementedError() + def normalize_value_by_type(self, value: str, coltype: ColType) -> str: """Creates an SQL expression, that converts 'value' to a normalized representation. @@ -73,6 +77,8 @@ def normalize_value_by_type(self, value: str, coltype: ColType) -> str: return self.normalize_uuid(value, coltype) elif isinstance(coltype, Boolean): return self.normalize_boolean(value, coltype) + elif isinstance(coltype, JSONType): + return self.normalize_json(value, coltype) return self.to_string(value) diff --git a/sqeleton/databases/postgresql.py b/sqeleton/databases/postgresql.py index ffa12d1..f0f2ddc 100644 --- a/sqeleton/databases/postgresql.py +++ b/sqeleton/databases/postgresql.py @@ -1,12 +1,10 @@ from ..abcs.database_types import ( - ColType, Timestamp, TimestampTZ, Float, Decimal, Integer, TemporalType, - ColType_UUID, Native_UUID, Text, FractionalType, @@ -52,35 +50,7 @@ def normalize_boolean(self, value: str, _coltype: Boolean) -> str: return self.to_string(f"{value}::int") def normalize_json(self, value: str, _coltype: PostgresqlJSON) -> str: - return f"replace({value}::text, '\": \"', '\":\"')" # minified json - - def normalize_value_by_type(self, value: str, coltype: ColType) -> str: - """Creates an SQL expression, that converts 'value' to a normalized representation. - - The returned expression must accept any SQL value, and return a string. - - The default implementation dispatches to a method according to `coltype`: - - :: - - TemporalType -> normalize_timestamp() - FractionalType -> normalize_number() - *else* -> to_string() - - (`Integer` falls in the *else* category) - - """ - if isinstance(coltype, TemporalType): - return self.normalize_timestamp(value, coltype) - elif isinstance(coltype, FractionalType): - return self.normalize_number(value, coltype) - elif isinstance(coltype, ColType_UUID): - return self.normalize_uuid(value, coltype) - elif isinstance(coltype, Boolean): - return self.normalize_boolean(value, coltype) - elif isinstance(coltype, PostgresqlJSON): - return self.normalize_json(value, coltype) - return self.to_string(value) + return f"replace(replace({value}::text, '\": ', '\":'), ', \"', ',\"')" class PostgresqlDialect(BaseDialect, Mixin_Schema): diff --git a/sqeleton/databases/redshift.py b/sqeleton/databases/redshift.py index 8a68983..f7c4bec 100644 --- a/sqeleton/databases/redshift.py +++ b/sqeleton/databases/redshift.py @@ -4,10 +4,7 @@ TemporalType, FractionalType, DbPath, - RedShiftSuper, - ColType, - ColType_UUID, - Boolean + RedShiftSuper ) from ..abcs.mixins import AbstractMixin_MD5 from .postgresql import ( @@ -49,36 +46,8 @@ def normalize_timestamp(self, value: str, coltype: TemporalType) -> str: def normalize_number(self, value: str, coltype: FractionalType) -> str: return self.to_string(f"{value}::decimal(38,{coltype.precision})") - def normalize_super(self, value: str, _coltype: RedShiftSuper) -> str: - return f'nvl2({value}, json_serialize({value}), NULL)' # only ::varchar causes redshift to return null - - def normalize_value_by_type(self, value: str, coltype: ColType) -> str: - """Creates an SQL expression, that converts 'value' to a normalized representation. - - The returned expression must accept any SQL value, and return a string. - - The default implementation dispatches to a method according to `coltype`: - - :: - - TemporalType -> normalize_timestamp() - FractionalType -> normalize_number() - *else* -> to_string() - - (`Integer` falls in the *else* category) - - """ - if isinstance(coltype, TemporalType): - return self.normalize_timestamp(value, coltype) - elif isinstance(coltype, FractionalType): - return self.normalize_number(value, coltype) - elif isinstance(coltype, ColType_UUID): - return self.normalize_uuid(value, coltype) - elif isinstance(coltype, Boolean): - return self.normalize_boolean(value, coltype) - elif isinstance(coltype, RedShiftSuper): - return self.normalize_super(value, coltype) - return self.to_string(value) + def normalize_json(self, value: str, _coltype: RedShiftSuper) -> str: + return f'nvl2({value}, json_serialize({value}), NULL)' class Dialect(PostgresqlDialect): From 34e931734a33dffb2f81570fe22a930bbdf04e92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=A1s=20Aldecoa?= Date: Tue, 31 Jan 2023 13:42:28 -0300 Subject: [PATCH 3/5] add docstring --- sqeleton/databases/postgresql.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sqeleton/databases/postgresql.py b/sqeleton/databases/postgresql.py index f0f2ddc..76db9d6 100644 --- a/sqeleton/databases/postgresql.py +++ b/sqeleton/databases/postgresql.py @@ -50,6 +50,15 @@ def normalize_boolean(self, value: str, _coltype: Boolean) -> str: return self.to_string(f"{value}::int") def normalize_json(self, value: str, _coltype: PostgresqlJSON) -> str: + """ + Converts json or bjson values to its minified (most compact) string representation. + + Removes whitespaces after json separators (':' and ',') using string replacement. + '{"a": 1, "b": true, "c": "x"}' (bjson::text) -> '{"a":1,"b":true,"c":"x"}' + + Comparisons to jsons of other db types can give false positives if they contain string + values that include any of the replaced patterns, or if the items have different order. + """ return f"replace(replace({value}::text, '\": ', '\":'), ', \"', ',\"')" From 97e3569f4727ea1428bdc9cca0470307b0d1d59b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=A1s=20Aldecoa?= Date: Tue, 31 Jan 2023 17:10:35 -0300 Subject: [PATCH 4/5] add jsonb type --- sqeleton/abcs/database_types.py | 4 ++++ sqeleton/databases/postgresql.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/sqeleton/abcs/database_types.py b/sqeleton/abcs/database_types.py index 9a66bcd..2a901d3 100644 --- a/sqeleton/abcs/database_types.py +++ b/sqeleton/abcs/database_types.py @@ -146,6 +146,10 @@ class PostgresqlJSON(JSONType): pass +class PostgresqlJSONB(JSONType): + pass + + @dataclass class Integer(NumericType, IKey): precision: int = 0 diff --git a/sqeleton/databases/postgresql.py b/sqeleton/databases/postgresql.py index 76db9d6..46961ed 100644 --- a/sqeleton/databases/postgresql.py +++ b/sqeleton/databases/postgresql.py @@ -10,7 +10,8 @@ FractionalType, Boolean, Date, - PostgresqlJSON + PostgresqlJSON, + PostgresqlJSONB ) from ..abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue from .base import BaseDialect, ThreadedDatabase, import_helper, ConnectError, Mixin_Schema @@ -89,6 +90,7 @@ class PostgresqlDialect(BaseDialect, Mixin_Schema): "text": Text, # JSON "json": PostgresqlJSON, + "jsonb": PostgresqlJSONB, # UUID "uuid": Native_UUID, # Boolean From 85ded0e938fe0ce95916562c1c87ce2745d5aaed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=A1s=20Aldecoa?= Date: Wed, 8 Feb 2023 14:18:18 -0300 Subject: [PATCH 5/5] changes related to https://github.com/datafold/data-diff/pull/383 --- sqeleton/abcs/__init__.py | 1 + sqeleton/databases/postgresql.py | 11 +---------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/sqeleton/abcs/__init__.py b/sqeleton/abcs/__init__.py index 8659875..6359a7f 100644 --- a/sqeleton/abcs/__init__.py +++ b/sqeleton/abcs/__init__.py @@ -10,5 +10,6 @@ PrecisionType, StringType, Boolean, + JSONType, ) from .compiler import AbstractCompiler, Compilable diff --git a/sqeleton/databases/postgresql.py b/sqeleton/databases/postgresql.py index 46961ed..147b801 100644 --- a/sqeleton/databases/postgresql.py +++ b/sqeleton/databases/postgresql.py @@ -51,16 +51,7 @@ def normalize_boolean(self, value: str, _coltype: Boolean) -> str: return self.to_string(f"{value}::int") def normalize_json(self, value: str, _coltype: PostgresqlJSON) -> str: - """ - Converts json or bjson values to its minified (most compact) string representation. - - Removes whitespaces after json separators (':' and ',') using string replacement. - '{"a": 1, "b": true, "c": "x"}' (bjson::text) -> '{"a":1,"b":true,"c":"x"}' - - Comparisons to jsons of other db types can give false positives if they contain string - values that include any of the replaced patterns, or if the items have different order. - """ - return f"replace(replace({value}::text, '\": ', '\":'), ', \"', ',\"')" + return f"{value}::text" class PostgresqlDialect(BaseDialect, Mixin_Schema):