Skip to content

Commit

Permalink
Json UUID any
Browse files Browse the repository at this point in the history
Fixes ibis-project#1944. This does *not* add support for JSON, JSONB, and UUID
PostgreSQL types. Instead, it marks them as having `dt.Any` type,
allowing the tables to be loaded while those columns are mostly ignored.
Some lines not otherwise touched by this change were also reformatted with black.
Author: Ian Rose <ian.r.rose@gmail.com>
Author: Ivan Ogasawara <ivan.ogasawara@gmail.com>

Closes ibis-project#1962 from ian-r-rose/json-uuid-any and squashes the following commits:

6a878b0 [Ian Rose] Merge pull request #1 from Quansight/json-uuid-any
298efc2 [Ivan Ogasawara] Added JSON JSONB and UUID data types.
79bab94 [Ian Rose] Move test to postgres client test suite.
3748ef9 [Ian Rose] Add some light type testing for json, jsonb, uuid.
b514069 [Ian Rose] Allow postgres client to read tables with UUID, JSON, JSONB types.
  • Loading branch information
ian-r-rose authored and toryhaavik committed Oct 2, 2019
1 parent a57ac5f commit e8e1397
Show file tree
Hide file tree
Showing 6 changed files with 208 additions and 4 deletions.
86 changes: 84 additions & 2 deletions ibis/expr/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,23 @@ def _literal_value_hash_key(self, value):
return self, _tuplize(value.items())


class JSON(String):
    """JSON (JavaScript Object Notation) text format.

    Modeled as a subtype of ``String``: the JSON document is handled
    through its serialized text representation.
    """

    # Expression classes used for scalar/column expressions of this type.
    scalar = ir.JSONScalar
    column = ir.JSONColumn

    # No per-instance attributes; consistent with the other DataType
    # subclasses in this module (e.g. UUID, MultiPolygon).
    __slots__ = ()


class JSONB(Binary):
    """JSON (JavaScript Object Notation) data stored as a binary
    representation, which eliminates whitespace, duplicate keys,
    and key ordering.

    Modeled as a subtype of ``Binary``: values are handled as the
    encoded binary form rather than text.
    """

    # Expression classes used for scalar/column expressions of this type.
    scalar = ir.JSONBScalar
    column = ir.JSONBColumn

    # No per-instance attributes; consistent with the other DataType
    # subclasses in this module (e.g. UUID, MultiPolygon).
    __slots__ = ()


class GeoSpatial(DataType):
__slots__ = 'geotype', 'srid'

Expand Down Expand Up @@ -779,6 +796,17 @@ class MultiPolygon(GeoSpatial):
__slots__ = ()


class UUID(String):
    """A universally unique identifier (UUID) is a 128-bit number used to
    identify information in computer systems.

    Modeled as a subtype of ``String``: values are handled through their
    canonical textual form.
    """

    # Expression classes used for scalar/column expressions of this type.
    scalar = ir.UUIDScalar
    column = ir.UUIDColumn

    # No per-instance attributes beyond the base DataType.
    __slots__ = ()


# ---------------------------------------------------------------------
any = Any()
null = Null()
Expand Down Expand Up @@ -815,7 +843,11 @@ class MultiPolygon(GeoSpatial):
multilinestring = MultiLineString()
multipoint = MultiPoint()
multipolygon = MultiPolygon()

# json
json = JSON()
jsonb = JSONB()
# special string based data type
uuid = UUID()

_primitive_types = [
('any', any),
Expand Down Expand Up @@ -881,6 +913,9 @@ class Tokens:
MULTIPOINT = 28
MULTIPOLYGON = 29
SEMICOLON = 30
JSON = 31
JSONB = 32
UUID = 33

@staticmethod
def name(value):
Expand All @@ -891,7 +926,6 @@ def name(value):
(getattr(Tokens, n), n) for n in dir(Tokens) if n.isalpha() and n.isupper()
)


Token = collections.namedtuple('Token', ('type', 'value'))


Expand Down Expand Up @@ -1005,6 +1039,22 @@ def name(value):
),
)
]
+ [
# json data type
(
'(?P<{}>{})'.format(token.upper(), token),
lambda token, toktype=toktype: Token(toktype, token),
)
for token, toktype in zip(
# note: `jsonb` should be first to avoid conflict with `json`
('jsonb', 'json'),
(Tokens.JSONB, Tokens.JSON),
)
]
+ [
# special string based data types
('(?P<UUID>uuid)', lambda token: Token(Tokens.UUID, token))
]
+ [
# integers, for decimal spec
(r'(?P<INTEGER>\d+)', lambda token: Token(Tokens.INTEGER, int(token))),
Expand Down Expand Up @@ -1209,6 +1259,12 @@ def type(self) -> DataType:
| "multipolygon" ":" geotype
| "multipolygon" ";" srid ":" geotype
json : "json"
jsonb : "jsonb"
uuid : "uuid"
"""
if self._accept(Tokens.PRIMITIVE):
assert self.tok is not None
Expand Down Expand Up @@ -1322,6 +1378,13 @@ def type(self) -> DataType:
self._expect(Tokens.RBRACKET)
return Struct(names, types)

# json data types
elif self._accept(Tokens.JSON):
return JSON()

elif self._accept(Tokens.JSONB):
return JSONB()

# geo spatial data type
elif self._accept(Tokens.GEOMETRY):
return Geometry()
Expand Down Expand Up @@ -1431,6 +1494,10 @@ def type(self) -> DataType:

return MultiPolygon(geotype=geotype, srid=srid)

# special string based data types
elif self._accept(Tokens.UUID):
return UUID()

else:
raise SyntaxError('Type cannot be parsed: {}'.format(self.text))

Expand Down Expand Up @@ -1763,6 +1830,16 @@ def can_cast_variadic(
return castable(source.value_type, target.value_type)


@castable.register(JSON, JSON)
def can_cast_json(source, target, **kwargs):
    """A JSON type is always castable to another JSON type."""
    return True


@castable.register(JSONB, JSONB)
def can_cast_jsonb(source, target, **kwargs):
    """A JSONB type is always castable to another JSONB type."""
    return True


# geo spatial data type
# cast between same type, used to cast from/to geometry and geography
GEO_TYPES = (
Expand All @@ -1782,6 +1859,11 @@ def can_cast_geospatial(source, target, **kwargs):
return True


@castable.register(UUID, UUID)
def can_cast_special_string(source, target, **kwargs):
    """A UUID type is always castable to another UUID type.

    Despite the generic name, this registration currently covers only
    UUID-to-UUID casts.
    """
    return True


# @castable.register(Map, Map)
# def can_cast_maps(source, target):
# return (source.equals(target) or
Expand Down
36 changes: 36 additions & 0 deletions ibis/expr/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,6 +794,30 @@ class MapColumn(AnyColumn, MapValue):
pass # noqa: E701,E302


class JSONValue(StringValue):
    """Value expression for the JSON datatype; inherits string behavior."""

    pass  # noqa: E701,E302


class JSONScalar(StringScalar, JSONValue):
    """Scalar expression for the JSON datatype."""

    pass  # noqa: E701,E302


class JSONColumn(StringColumn, JSONValue):
    """Column expression for the JSON datatype."""

    pass  # noqa: E701,E302


class JSONBValue(BinaryValue):
    """Value expression for the JSONB datatype; inherits binary behavior."""

    pass  # noqa: E701,E302


class JSONBScalar(BinaryScalar, JSONBValue):
    """Scalar expression for the JSONB datatype."""

    pass  # noqa: E701,E302


class JSONBColumn(BinaryColumn, JSONBValue):
    """Column expression for the JSONB datatype."""

    pass  # noqa: E701,E302


class StructValue(AnyValue):
def __dir__(self):
return sorted(
Expand Down Expand Up @@ -909,6 +933,18 @@ class MultiPolygonColumn(GeoSpatialColumn, MultiPolygonValue): # noqa: E302
pass # noqa: E701


class UUIDValue(StringValue):
    """Value expression for the UUID datatype; inherits string behavior."""

    pass  # noqa: E701,E302


class UUIDScalar(StringScalar, UUIDValue):
    """Scalar expression for the UUID datatype."""

    pass  # noqa: E701,E302


class UUIDColumn(StringColumn, UUIDValue):
    """Column expression for the UUID datatype."""

    pass  # noqa: E701,E302


class ListExpr(ColumnExpr, AnyValue):
@property
def values(self):
Expand Down
15 changes: 15 additions & 0 deletions ibis/sql/alchemy.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,21 @@ def sa_double(_, satype, nullable=True):
return dt.Double(nullable=nullable)


@dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.UUID)
def sa_uuid(_, satype, nullable=True):
    """Map the SQLAlchemy PostgreSQL UUID type to the ibis UUID dtype."""
    return dt.UUID(nullable=nullable)


@dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.JSON)
def sa_json(_, satype, nullable=True):
    """Map the SQLAlchemy PostgreSQL JSON type to the ibis JSON dtype."""
    return dt.JSON(nullable=nullable)


@dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.JSONB)
def sa_jsonb(_, satype, nullable=True):
    """Map the SQLAlchemy PostgreSQL JSONB type to the ibis JSONB dtype."""
    return dt.JSONB(nullable=nullable)


if geospatial_supported:

@dt.dtype.register(SQLAlchemyDialect, (ga.Geometry, ga.types._GISType))
Expand Down
30 changes: 29 additions & 1 deletion ibis/sql/postgres/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@
import ibis
import ibis.expr.datatypes as dt
import ibis.expr.types as ir
import ibis.sql.alchemy as alch # noqa: E402
from ibis.tests.util import assert_equal

pytest.importorskip('sqlalchemy')
sa = pytest.importorskip('sqlalchemy')
pytest.importorskip('psycopg2')

pytestmark = pytest.mark.postgresql
Expand Down Expand Up @@ -136,6 +137,33 @@ def test_schema_table():
assert isinstance(schema['tables'], ir.TableExpr)


def test_schema_type_conversion():
    """PostgreSQL JSON/JSONB/UUID columns map to the matching ibis dtypes.

    No live database is required: the table is built in-memory from
    SQLAlchemy metadata and its schema reflected via
    ``alch.schema_from_table``.
    """
    # (column name, SQLAlchemy type, nullable, expected ibis dtype)
    typespec = [
        ('json', sa.dialects.postgresql.JSON, True, dt.JSON),
        ('jsonb', sa.dialects.postgresql.JSONB, True, dt.JSONB),
        ('uuid', sa.dialects.postgresql.UUID, True, dt.UUID),
    ]

    sqla_columns = []
    ibis_pairs = []
    for name, satype, nullable, ibis_type in typespec:
        sqla_columns.append(sa.Column(name, satype, nullable=nullable))
        ibis_pairs.append((name, ibis_type(nullable=nullable)))

    # Build a table containing JSON, JSONB, and UUID columns.
    engine = sa.create_engine('postgresql://')
    table = sa.Table('tname', sa.MetaData(bind=engine), *sqla_columns)

    # The reflected ibis schema must use the dedicated JSON/JSONB/UUID
    # dtypes (not dt.any) for these columns.
    schema = alch.schema_from_table(table)
    expected = ibis.schema(ibis_pairs)

    assert_equal(schema, expected)


def test_interval_films_schema(con):
t = con.table("films")
assert t.len.type() == dt.Interval(unit="m")
Expand Down
31 changes: 31 additions & 0 deletions ibis/tests/all/test_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
""" Tests for json data types"""
import json

import pytest
from pytest import param

import ibis
from ibis.tests.backends import PostgreSQL

# Backends that support JSON types; add new backends here.
# NOTE(review): the name appears copy-pasted from the geospatial test
# suite — these backends support JSON, not geo types. Consider renaming.
all_db_geo_supported = [PostgreSQL]


@pytest.mark.parametrize('data', [param({'status': True}, id='status')])
@pytest.mark.only_on_backends(all_db_geo_supported)
def test_json(backend, con, data, alltypes):
    """A JSON literal round-trips through the backend unchanged."""
    serialized = json.dumps(data)
    json_lit = ibis.literal(serialized, type='json').name('tmp')
    projection = alltypes[[alltypes.id, json_lit]].head(1)
    result = projection.execute()
    assert result['tmp'].iloc[0] == serialized


@pytest.mark.parametrize('data', [param({'status': True}, id='status')])
@pytest.mark.only_on_backends(all_db_geo_supported)
def test_jsonb(backend, con, data, alltypes):
    """A JSONB literal (UTF-8 encoded bytes) round-trips unchanged."""
    encoded = json.dumps(data).encode('utf8')
    jsonb_lit = ibis.literal(encoded, type='jsonb').name('tmp')
    projection = alltypes[[alltypes.id, jsonb_lit]].head(1)
    result = projection.execute()
    assert result['tmp'].iloc[0] == encoded
14 changes: 13 additions & 1 deletion ibis/tests/all/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import ibis
import ibis.expr.datatypes as dt
from ibis.tests.backends import Clickhouse, Impala, PySpark, Spark
from ibis.tests.backends import Clickhouse, Impala, PostgreSQL, PySpark, Spark


def test_string_col_is_unicode(backend, alltypes, df):
Expand Down Expand Up @@ -233,3 +233,15 @@ def test_string(backend, alltypes, df, result_func, expected_func):

expected = backend.default_series_rename(expected_func(df))
backend.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    'data, data_type',
    [param('123e4567-e89b-12d3-a456-426655440000', 'uuid', id='uuid')],
)
@pytest.mark.only_on_backends([PostgreSQL])
def test_special_strings(backend, con, alltypes, data, data_type):
    """A special string literal (e.g. UUID) round-trips unchanged."""
    typed_lit = ibis.literal(data, type=data_type).name('tmp')
    projection = alltypes[[alltypes.id, typed_lit]].head(1)
    result = projection.execute()
    assert result['tmp'].iloc[0] == data

0 comments on commit e8e1397

Please sign in to comment.