From 973a63d4d8a33dd77bd1fe7e11a24ee63771ad42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=A1s=20Aldecoa?= <nicolas.aldecoa@emilabs.ai>
Date: Tue, 17 Jan 2023 17:19:35 -0300
Subject: [PATCH 1/5] support matching PostgreSQL json against Redshift super

---
 sqeleton/abcs/database_types.py  | 12 +++++++++
 sqeleton/databases/base.py       |  4 +++
 sqeleton/databases/postgresql.py | 36 ++++++++++++++++++++++++++
 sqeleton/databases/redshift.py   | 44 +++++++++++++++++++++++++++++++-
 4 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/sqeleton/abcs/database_types.py b/sqeleton/abcs/database_types.py
index bc022b5..9a66bcd 100644
--- a/sqeleton/abcs/database_types.py
+++ b/sqeleton/abcs/database_types.py
@@ -134,6 +134,18 @@ class Text(StringType):
     supported = False
 
 
+class JSONType(ColType):
+    pass
+
+
+class RedShiftSuper(JSONType):
+    pass
+
+
+class PostgresqlJSON(JSONType):
+    pass
+
+
 @dataclass
 class Integer(NumericType, IKey):
     precision: int = 0
diff --git a/sqeleton/databases/base.py b/sqeleton/databases/base.py
index cc6abc9..f4f9076 100644
--- a/sqeleton/databases/base.py
+++ b/sqeleton/databases/base.py
@@ -34,6 +34,7 @@
     DbTime,
     DbPath,
     Boolean,
+    JSONType
 )
 from ..abcs.mixins import Compilable
 from ..abcs.mixins import AbstractMixin_Schema, AbstractMixin_RandomSample, AbstractMixin_NormalizeValue
@@ -246,6 +247,9 @@ def parse_type(
         elif issubclass(cls, (Text, Native_UUID)):
             return cls()
 
+        elif issubclass(cls, JSONType):
+            return cls()
+
         raise TypeError(f"Parsing {type_repr} returned an unknown type '{cls}'.")
 
     def _convert_db_precision_to_digits(self, p: int) -> int:
diff --git a/sqeleton/databases/postgresql.py b/sqeleton/databases/postgresql.py
index 7c7b47d..ffa12d1 100644
--- a/sqeleton/databases/postgresql.py
+++ b/sqeleton/databases/postgresql.py
@@ -1,15 +1,18 @@
 from ..abcs.database_types import (
+    ColType,
     Timestamp,
     TimestampTZ,
     Float,
     Decimal,
     Integer,
     TemporalType,
+    ColType_UUID,
     Native_UUID,
     Text,
     FractionalType,
     Boolean,
     Date,
+    PostgresqlJSON
 )
 from ..abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
 from .base import BaseDialect, ThreadedDatabase, import_helper, ConnectError, Mixin_Schema
@@ -48,6 +51,37 @@ def normalize_number(self, value: str, coltype: FractionalType) -> str:
     def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
         return self.to_string(f"{value}::int")
 
+    def normalize_json(self, value: str, _coltype: PostgresqlJSON) -> str:
+        return f"replace({value}::text, '\": \"', '\":\"')"  # minified json
+
+    def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
+        """Creates an SQL expression, that converts 'value' to a normalized representation.
+
+        The returned expression must accept any SQL value, and return a string.
+
+        The default implementation dispatches to a method according to `coltype`:
+
+        ::
+
+            TemporalType    -> normalize_timestamp()
+            FractionalType  -> normalize_number()
+            *else*          -> to_string()
+
+            (`Integer` falls in the *else* category)
+
+        """
+        if isinstance(coltype, TemporalType):
+            return self.normalize_timestamp(value, coltype)
+        elif isinstance(coltype, FractionalType):
+            return self.normalize_number(value, coltype)
+        elif isinstance(coltype, ColType_UUID):
+            return self.normalize_uuid(value, coltype)
+        elif isinstance(coltype, Boolean):
+            return self.normalize_boolean(value, coltype)
+        elif isinstance(coltype, PostgresqlJSON):
+            return self.normalize_json(value, coltype)
+        return self.to_string(value)
+
 
 class PostgresqlDialect(BaseDialect, Mixin_Schema):
     name = "PostgreSQL"
@@ -74,6 +108,8 @@ class PostgresqlDialect(BaseDialect, Mixin_Schema):
         "character varying": Text,
         "varchar": Text,
         "text": Text,
+        # JSON
+        "json": PostgresqlJSON,
         # UUID
         "uuid": Native_UUID,
         # Boolean
diff --git a/sqeleton/databases/redshift.py b/sqeleton/databases/redshift.py
index 4eaf171..8a68983 100644
--- a/sqeleton/databases/redshift.py
+++ b/sqeleton/databases/redshift.py
@@ -1,5 +1,14 @@
 from typing import List, Dict
-from ..abcs.database_types import Float, TemporalType, FractionalType, DbPath
+from ..abcs.database_types import (
+    Float,
+    TemporalType,
+    FractionalType,
+    DbPath,
+    RedShiftSuper,
+    ColType,
+    ColType_UUID,
+    Boolean
+)
 from ..abcs.mixins import AbstractMixin_MD5
 from .postgresql import (
     PostgreSQL,
@@ -40,6 +49,37 @@ def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
     def normalize_number(self, value: str, coltype: FractionalType) -> str:
         return self.to_string(f"{value}::decimal(38,{coltype.precision})")
 
+    def normalize_super(self, value: str, _coltype: RedShiftSuper) -> str:
+        return f'nvl2({value}, json_serialize({value}), NULL)'  # only ::varchar causes redshift to return null
+
+    def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
+        """Creates an SQL expression, that converts 'value' to a normalized representation.
+
+        The returned expression must accept any SQL value, and return a string.
+
+        The default implementation dispatches to a method according to `coltype`:
+
+        ::
+
+            TemporalType    -> normalize_timestamp()
+            FractionalType  -> normalize_number()
+            *else*          -> to_string()
+
+            (`Integer` falls in the *else* category)
+
+        """
+        if isinstance(coltype, TemporalType):
+            return self.normalize_timestamp(value, coltype)
+        elif isinstance(coltype, FractionalType):
+            return self.normalize_number(value, coltype)
+        elif isinstance(coltype, ColType_UUID):
+            return self.normalize_uuid(value, coltype)
+        elif isinstance(coltype, Boolean):
+            return self.normalize_boolean(value, coltype)
+        elif isinstance(coltype, RedShiftSuper):
+            return self.normalize_super(value, coltype)
+        return self.to_string(value)
+
 
 class Dialect(PostgresqlDialect):
     name = "Redshift"
@@ -47,6 +87,8 @@ class Dialect(PostgresqlDialect):
         **PostgresqlDialect.TYPE_CLASSES,
         "double": Float,
         "real": Float,
+        # JSON
+        "super": RedShiftSuper
     }
     SUPPORTS_INDEXES = False
 

From 93b6a2b8f09d816aae05cb80b117cf8962af8a31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=A1s=20Aldecoa?= <nicolas.aldecoa@emilabs.ai>
Date: Mon, 30 Jan 2023 16:26:53 -0300
Subject: [PATCH 2/5] remove spaces after comma in PG json, refactor

---
 sqeleton/abcs/mixins.py          |  8 ++++++-
 sqeleton/databases/postgresql.py | 32 +--------------------------
 sqeleton/databases/redshift.py   | 37 +++-----------------------------
 3 files changed, 11 insertions(+), 66 deletions(-)

diff --git a/sqeleton/abcs/mixins.py b/sqeleton/abcs/mixins.py
index 34d3250..099065d 100644
--- a/sqeleton/abcs/mixins.py
+++ b/sqeleton/abcs/mixins.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from .database_types import TemporalType, FractionalType, ColType_UUID, Boolean, ColType, String_UUID
+from .database_types import TemporalType, FractionalType, ColType_UUID, Boolean, ColType, String_UUID, JSONType
 from .compiler import Compilable
 
 
@@ -49,6 +49,10 @@ def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
             return f"TRIM({value})"
         return self.to_string(value)
 
+    def normalize_json(self, value: str, _coltype: JSONType) -> str:
+        """Creates an SQL expression, that converts 'value' to its minified json string representation."""
+        raise NotImplementedError()
+
     def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
         """Creates an SQL expression, that converts 'value' to a normalized representation.
 
@@ -73,6 +77,8 @@ def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
             return self.normalize_uuid(value, coltype)
         elif isinstance(coltype, Boolean):
             return self.normalize_boolean(value, coltype)
+        elif isinstance(coltype, JSONType):
+            return self.normalize_json(value, coltype)
         return self.to_string(value)
 
 
diff --git a/sqeleton/databases/postgresql.py b/sqeleton/databases/postgresql.py
index ffa12d1..f0f2ddc 100644
--- a/sqeleton/databases/postgresql.py
+++ b/sqeleton/databases/postgresql.py
@@ -1,12 +1,10 @@
 from ..abcs.database_types import (
-    ColType,
     Timestamp,
     TimestampTZ,
     Float,
     Decimal,
     Integer,
     TemporalType,
-    ColType_UUID,
     Native_UUID,
     Text,
     FractionalType,
@@ -52,35 +50,7 @@ def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
         return self.to_string(f"{value}::int")
 
     def normalize_json(self, value: str, _coltype: PostgresqlJSON) -> str:
-        return f"replace({value}::text, '\": \"', '\":\"')"  # minified json
-
-    def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
-        """Creates an SQL expression, that converts 'value' to a normalized representation.
-
-        The returned expression must accept any SQL value, and return a string.
-
-        The default implementation dispatches to a method according to `coltype`:
-
-        ::
-
-            TemporalType    -> normalize_timestamp()
-            FractionalType  -> normalize_number()
-            *else*          -> to_string()
-
-            (`Integer` falls in the *else* category)
-
-        """
-        if isinstance(coltype, TemporalType):
-            return self.normalize_timestamp(value, coltype)
-        elif isinstance(coltype, FractionalType):
-            return self.normalize_number(value, coltype)
-        elif isinstance(coltype, ColType_UUID):
-            return self.normalize_uuid(value, coltype)
-        elif isinstance(coltype, Boolean):
-            return self.normalize_boolean(value, coltype)
-        elif isinstance(coltype, PostgresqlJSON):
-            return self.normalize_json(value, coltype)
-        return self.to_string(value)
+        return f"replace(replace({value}::text, '\": ', '\":'), ', \"', ',\"')"
 
 
 class PostgresqlDialect(BaseDialect, Mixin_Schema):
diff --git a/sqeleton/databases/redshift.py b/sqeleton/databases/redshift.py
index 8a68983..f7c4bec 100644
--- a/sqeleton/databases/redshift.py
+++ b/sqeleton/databases/redshift.py
@@ -4,10 +4,7 @@
     TemporalType,
     FractionalType,
     DbPath,
-    RedShiftSuper,
-    ColType,
-    ColType_UUID,
-    Boolean
+    RedShiftSuper
 )
 from ..abcs.mixins import AbstractMixin_MD5
 from .postgresql import (
@@ -49,36 +46,8 @@ def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
     def normalize_number(self, value: str, coltype: FractionalType) -> str:
         return self.to_string(f"{value}::decimal(38,{coltype.precision})")
 
-    def normalize_super(self, value: str, _coltype: RedShiftSuper) -> str:
-        return f'nvl2({value}, json_serialize({value}), NULL)'  # only ::varchar causes redshift to return null
-
-    def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
-        """Creates an SQL expression, that converts 'value' to a normalized representation.
-
-        The returned expression must accept any SQL value, and return a string.
-
-        The default implementation dispatches to a method according to `coltype`:
-
-        ::
-
-            TemporalType    -> normalize_timestamp()
-            FractionalType  -> normalize_number()
-            *else*          -> to_string()
-
-            (`Integer` falls in the *else* category)
-
-        """
-        if isinstance(coltype, TemporalType):
-            return self.normalize_timestamp(value, coltype)
-        elif isinstance(coltype, FractionalType):
-            return self.normalize_number(value, coltype)
-        elif isinstance(coltype, ColType_UUID):
-            return self.normalize_uuid(value, coltype)
-        elif isinstance(coltype, Boolean):
-            return self.normalize_boolean(value, coltype)
-        elif isinstance(coltype, RedShiftSuper):
-            return self.normalize_super(value, coltype)
-        return self.to_string(value)
+    def normalize_json(self, value: str, _coltype: RedShiftSuper) -> str:
+        return f'nvl2({value}, json_serialize({value}), NULL)'
 
 
 class Dialect(PostgresqlDialect):

From 34e931734a33dffb2f81570fe22a930bbdf04e92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=A1s=20Aldecoa?= <nicolas.aldecoa@emilabs.ai>
Date: Tue, 31 Jan 2023 13:42:28 -0300
Subject: [PATCH 3/5] add docstring

---
 sqeleton/databases/postgresql.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/sqeleton/databases/postgresql.py b/sqeleton/databases/postgresql.py
index f0f2ddc..76db9d6 100644
--- a/sqeleton/databases/postgresql.py
+++ b/sqeleton/databases/postgresql.py
@@ -50,6 +50,15 @@ def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
         return self.to_string(f"{value}::int")
 
     def normalize_json(self, value: str, _coltype: PostgresqlJSON) -> str:
+        """
+        Converts json or bjson values to its minified (most compact) string representation.
+
+        Removes whitespaces after json separators (':' and ',') using string replacement.
+        '{"a": 1, "b": true, "c": "x"}' (bjson::text) -> '{"a":1,"b":true,"c":"x"}'
+
+        Comparisons to jsons of other db types can give false positives if they contain string
+        values that include any of the replaced patterns, or if the items have different order.
+        """
         return f"replace(replace({value}::text, '\": ', '\":'), ', \"', ',\"')"
 
 

From 97e3569f4727ea1428bdc9cca0470307b0d1d59b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=A1s=20Aldecoa?= <nicolas.aldecoa@emilabs.ai>
Date: Tue, 31 Jan 2023 17:10:35 -0300
Subject: [PATCH 4/5] add jsonb type

---
 sqeleton/abcs/database_types.py  | 4 ++++
 sqeleton/databases/postgresql.py | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/sqeleton/abcs/database_types.py b/sqeleton/abcs/database_types.py
index 9a66bcd..2a901d3 100644
--- a/sqeleton/abcs/database_types.py
+++ b/sqeleton/abcs/database_types.py
@@ -146,6 +146,10 @@ class PostgresqlJSON(JSONType):
     pass
 
 
+class PostgresqlJSONB(JSONType):
+    pass
+
+
 @dataclass
 class Integer(NumericType, IKey):
     precision: int = 0
diff --git a/sqeleton/databases/postgresql.py b/sqeleton/databases/postgresql.py
index 76db9d6..46961ed 100644
--- a/sqeleton/databases/postgresql.py
+++ b/sqeleton/databases/postgresql.py
@@ -10,7 +10,8 @@
     FractionalType,
     Boolean,
     Date,
-    PostgresqlJSON
+    PostgresqlJSON,
+    PostgresqlJSONB
 )
 from ..abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
 from .base import BaseDialect, ThreadedDatabase, import_helper, ConnectError, Mixin_Schema
@@ -89,6 +90,7 @@ class PostgresqlDialect(BaseDialect, Mixin_Schema):
         "text": Text,
         # JSON
         "json": PostgresqlJSON,
+        "jsonb": PostgresqlJSONB,
         # UUID
         "uuid": Native_UUID,
         # Boolean

From 85ded0e938fe0ce95916562c1c87ce2745d5aaed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=A1s=20Aldecoa?= <nicolas.aldecoa@emilabs.ai>
Date: Wed, 8 Feb 2023 14:18:18 -0300
Subject: [PATCH 5/5] export JSONType and cast PG json to text without
 minifying; see https://github.com/datafold/data-diff/pull/383

---
 sqeleton/abcs/__init__.py        |  1 +
 sqeleton/databases/postgresql.py | 11 +----------
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/sqeleton/abcs/__init__.py b/sqeleton/abcs/__init__.py
index 8659875..6359a7f 100644
--- a/sqeleton/abcs/__init__.py
+++ b/sqeleton/abcs/__init__.py
@@ -10,5 +10,6 @@
     PrecisionType,
     StringType,
     Boolean,
+    JSONType,
 )
 from .compiler import AbstractCompiler, Compilable
diff --git a/sqeleton/databases/postgresql.py b/sqeleton/databases/postgresql.py
index 46961ed..147b801 100644
--- a/sqeleton/databases/postgresql.py
+++ b/sqeleton/databases/postgresql.py
@@ -51,16 +51,7 @@ def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
         return self.to_string(f"{value}::int")
 
     def normalize_json(self, value: str, _coltype: PostgresqlJSON) -> str:
-        """
-        Converts json or bjson values to its minified (most compact) string representation.
-
-        Removes whitespaces after json separators (':' and ',') using string replacement.
-        '{"a": 1, "b": true, "c": "x"}' (bjson::text) -> '{"a":1,"b":true,"c":"x"}'
-
-        Comparisons to jsons of other db types can give false positives if they contain string
-        values that include any of the replaced patterns, or if the items have different order.
-        """
-        return f"replace(replace({value}::text, '\": ', '\":'), ', \"', ',\"')"
+        return f"{value}::text"
 
 
 class PostgresqlDialect(BaseDialect, Mixin_Schema):