Merge pull request #1051 from guzman-raphael/json
Add `json` data type
dimitri-yatsenko authored Feb 10, 2023
2 parents f28a3b9 + 477d270 commit baf445a
Showing 14 changed files with 1,411 additions and 63 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,7 @@
## Release notes

### 0.14.0 -- TBA
### 0.14.0 -- Feb 10, 2023
- Added - `json` data type ([#245](https://github.com/datajoint/datajoint-python/issues/245)) PR [#1051](https://github.com/datajoint/datajoint-python/pull/1051)
- Fixed - Activating a schema requires all tables to exist even if `create_tables=False` PR [#1058](https://github.com/datajoint/datajoint-python/pull/1058)
- Changed - Populate call with `reserve_jobs=True` to exclude `error` and `ignore` keys - PR [#1062](https://github.com/datajoint/datajoint-python/pull/1062)
- Added - Support for inserting data with CSV files - PR [#1067](https://github.com/datajoint/datajoint-python/pull/1067)
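For context on the headline change, here is a minimal sketch of declaring the new `json` type in a table definition (illustrative only; the schema and table names are assumptions, not part of this diff):

```python
import datajoint as dj

schema = dj.Schema("tutorial")  # hypothetical schema name

@schema
class Team(dj.Manual):
    definition = """
    name: varchar(40)
    ---
    car=null: json  # secondary attributes may now be declared as json
    """
```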
103 changes: 71 additions & 32 deletions datajoint/condition.py
@@ -8,8 +8,29 @@
import decimal
import numpy
import pandas
import json
from .errors import DataJointError

JSON_PATTERN = re.compile(
r"^(?P<attr>\w+)(\.(?P<path>[\w.*\[\]]+))?(:(?P<type>[\w(,\s)]+))?$"
)


def translate_attribute(key):
match = JSON_PATTERN.match(key)
if match is None:
return match, key
match = match.groupdict()
if match["path"] is None:
return match, match["attr"]
else:
return match, "json_value(`{}`, _utf8mb4'$.{}'{})".format(
*[
((f" returning {v}" if k == "type" else v) if v else "")
for k, v in match.items()
]
)


class PromiscuousOperand:
"""
@@ -94,35 +115,56 @@ def make_condition(query_expression, condition, columns):
from .expression import QueryExpression, Aggregation, U

def prep_value(k, v):
"""prepare value v for inclusion as a string in an SQL condition"""
if query_expression.heading[k].uuid:
"""prepare SQL condition"""
key_match, k = translate_attribute(k)
if key_match["path"] is None:
k = f"`{k}`"
if (
query_expression.heading[key_match["attr"]].json
and key_match["path"] is not None
and isinstance(v, dict)
):
return f"{k}='{json.dumps(v)}'"
if v is None:
return f"{k} IS NULL"
if query_expression.heading[key_match["attr"]].uuid:
if not isinstance(v, uuid.UUID):
try:
v = uuid.UUID(v)
except (AttributeError, ValueError):
raise DataJointError(
"Badly formed UUID {v} in restriction by `{k}`".format(k=k, v=v)
)
return "X'%s'" % v.bytes.hex()
return f"{k}=X'{v.bytes.hex()}'"
if isinstance(
v, (datetime.date, datetime.datetime, datetime.time, decimal.Decimal)
v,
(
datetime.date,
datetime.datetime,
datetime.time,
decimal.Decimal,
list,
),
):
return '"%s"' % v
return f'{k}="{v}"'
if isinstance(v, str):
return '"%s"' % v.replace("%", "%%").replace("\\", "\\\\")
return "%r" % v
v = v.replace("%", "%%").replace("\\", "\\\\")
return f'{k}="{v}"'
return f"{k}={v}"

def combine_conditions(negate, conditions):
return f"{'NOT ' if negate else ''} ({')AND('.join(conditions)})"

negate = False
while isinstance(condition, Not):
negate = not negate
condition = condition.restriction
template = "NOT (%s)" if negate else "%s"

# restrict by string
if isinstance(condition, str):
columns.update(extract_column_names(condition))
return template % condition.strip().replace(
"%", "%%"
return combine_conditions(
negate, conditions=[condition.strip().replace("%", "%%")]
) # escape %, see issue #376

# restrict by AndList
@@ -139,7 +181,7 @@ def prep_value(k, v):
return negate # if any item is False, the whole thing is False
if not items:
return not negate # an empty AndList is True
return template % ("(" + ") AND (".join(items) + ")")
return combine_conditions(negate, conditions=items)

# restriction by dj.U evaluates to True
if isinstance(condition, U):
@@ -151,23 +193,19 @@

# restrict by a mapping/dict -- convert to an AndList of string equality conditions
if isinstance(condition, collections.abc.Mapping):
common_attributes = set(condition).intersection(query_expression.heading.names)
common_attributes = set(c.split(".", 1)[0] for c in condition).intersection(
query_expression.heading.names
)
if not common_attributes:
return not negate # no matching attributes -> evaluates to True
columns.update(common_attributes)
return template % (
"("
+ ") AND (".join(
"`%s`%s"
% (
k,
" IS NULL"
if condition[k] is None
else f"={prep_value(k, condition[k])}",
)
for k in common_attributes
)
+ ")"
return combine_conditions(
negate,
conditions=[
prep_value(k, v)
for k, v in condition.items()
if k.split(".", 1)[0] in common_attributes # handle json indexing
],
)

# restrict by a numpy record -- convert to an AndList of string equality conditions
@@ -178,12 +216,9 @@ def prep_value(k, v):
if not common_attributes:
return not negate # no matching attributes -> evaluate to True
columns.update(common_attributes)
return template % (
"("
+ ") AND (".join(
"`%s`=%s" % (k, prep_value(k, condition[k])) for k in common_attributes
)
+ ")"
return combine_conditions(
negate,
conditions=[prep_value(k, condition[k]) for k in common_attributes],
)

# restrict by a QueryExpression subclass -- trigger instantiation and move on
Expand Down Expand Up @@ -231,7 +266,11 @@ def prep_value(k, v):
] # ignore False conditions
if any(item is True for item in or_list): # if any item is True, entirely True
return not negate
return template % ("(%s)" % " OR ".join(or_list)) if or_list else negate
return (
f"{'NOT ' if negate else ''} ({' OR '.join(or_list)})"
if or_list
else negate
)


def extract_column_names(sql_expression):
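Taken together, the changes to `condition.py` let restrictions address values nested inside a json attribute. A sketch of the resulting behavior, reusing the hypothetical `Team` table from above:

```python
# Dict restrictions may use dotted paths into a json attribute;
# translate_attribute() compiles them into MySQL json_value() calls.
Team & {"car.name": "Chaching"}

# An optional ":type" suffix becomes json_value's RETURNING clause, e.g.
#   json_value(`car`, _utf8mb4'$.length' returning decimal(4,1))
Team & {"car.length:decimal(4,1)": 20.5}

# Restricting a path to a dict compares against its JSON serialization
Team & {"car.headlights[0]": {"side": "left", "hyper_white": None}}
```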
38 changes: 18 additions & 20 deletions datajoint/declare.py
@@ -7,6 +7,7 @@
import logging
from .errors import DataJointError, _support_filepath_types, FILEPATH_FEATURE_SWITCH
from .attribute_adapter import get_adapter
from .condition import translate_attribute

UUID_DATA_TYPE = "binary(16)"
MAX_TABLE_NAME_LENGTH = 64
@@ -23,6 +24,7 @@
DECIMAL=r"(decimal|numeric)(\s*\(.+\))?(\s+unsigned)?$",
FLOAT=r"(double|float|real)(\s*\(.+\))?(\s+unsigned)?$",
STRING=r"(var)?char\s*\(.+\)$",
JSON=r"json$",
ENUM=r"enum\s*\(.+\)$",
BOOL=r"bool(ean)?$", # aliased to tinyint(1)
TEMPORAL=r"(date|datetime|time|timestamp|year)(\s*\(.+\))?$",
@@ -129,25 +131,9 @@ def build_attribute_parser():
return attribute_name + pp.Optional(default) + colon + data_type + comment


def build_index_parser():
left = pp.Literal("(").suppress()
right = pp.Literal(")").suppress()
unique = pp.Optional(pp.CaselessKeyword("unique")).setResultsName("unique")
index = pp.CaselessKeyword("index").suppress()
attribute_name = pp.Word(pp.srange("[a-z]"), pp.srange("[a-z0-9_]"))
return (
unique
+ index
+ left
+ pp.delimitedList(attribute_name).setResultsName("attr_list")
+ right
)


foreign_key_parser_old = build_foreign_key_parser_old()
foreign_key_parser = build_foreign_key_parser()
attribute_parser = build_attribute_parser()
index_parser = build_index_parser()


def is_foreign_key(line):
@@ -275,7 +261,7 @@ def prepare_declare(definition, context):
foreign_key_sql,
index_sql,
)
elif re.match(r"^(unique\s+)?index[^:]*$", line, re.I): # index
elif re.match(r"^(unique\s+)?index\s*.*$", line, re.I): # index
compile_index(line, index_sql)
else:
name, sql, store = compile_attribute(line, in_key, foreign_key_sql, context)
@@ -449,10 +435,22 @@ def alter(definition, old_definition, context):


def compile_index(line, index_sql):
match = index_parser.parseString(line)
def format_attribute(attr):
match, attr = translate_attribute(attr)
if match is None:
return attr
if match["path"] is None:
return f"`{attr}`"
return f"({attr})"

match = re.match(
r"(?P<unique>unique\s+)?index\s*\(\s*(?P<args>.*)\)", line, re.I
).groupdict()
attr_list = re.findall(r"(?:[^,(]|\([^)]*\))+", match["args"])
index_sql.append(
"{unique} index ({attrs})".format(
unique=match.unique, attrs=",".join("`%s`" % a for a in match.attr_list)
"{unique}index ({attrs})".format(
unique="unique " if match["unique"] else "",
attrs=",".join(format_attribute(a.strip()) for a in attr_list),
)
)

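With `compile_index()` generalized, a table definition can declare a functional index over a json path. A hedged sketch (the attribute and path are assumptions):

```python
definition = """
name: varchar(40)
---
car=null: json
unique index(car.name:char(50))
"""
# compile_index() turns the index line into:
#   unique index ((json_value(`car`, _utf8mb4'$.name' returning char(50))))
```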
4 changes: 4 additions & 0 deletions datajoint/expression.py
@@ -14,6 +14,7 @@
assert_join_compatibility,
extract_column_names,
PromiscuousOperand,
translate_attribute,
)
from .declare import CONSTANT_LITERALS

@@ -342,6 +343,9 @@ def proj(self, *attributes, **named_attributes):
from other attributes available before the projection.
Each attribute name can only be used once.
"""
named_attributes = {
k: translate_attribute(v)[1] for k, v in named_attributes.items()
}
# new attributes in parentheses are included again with the new name without removing original
duplication_pattern = re.compile(
rf'^\s*\(\s*(?!{"|".join(CONSTANT_LITERALS)})(?P<name>[a-zA-Z_]\w*)\s*\)\s*$'
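Because `proj()` now passes named attributes through `translate_attribute()`, projections can also reach into json values. A sketch under the same assumed table:

```python
# Pull nested json fields out as ordinary attributes of the result
Team.proj(car_name="car.name", car_length="car.length:decimal(4,1)")
```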
4 changes: 3 additions & 1 deletion datajoint/fetch.py
@@ -4,6 +4,7 @@
import pandas
import itertools
import re
import json
import numpy as np
import uuid
import numbers
@@ -47,6 +48,8 @@ def _get(connection, attr, data, squeeze, download_path):
"""
if data is None:
return
if attr.json:
return json.loads(data)

extern = (
connection.schemas[attr.database].external[attr.store]
@@ -59,7 +62,6 @@

if attr.is_filepath:
return adapt(extern.download_filepath(uuid.UUID(bytes=data))[0])

if attr.is_attachment:
# Steps:
# 1. get the attachment filename
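On the way out, json columns are decoded with `json.loads`, so fetched values arrive as native Python objects. A sketch (the restriction key is hypothetical):

```python
rec = (Team & {"name": "engineering"}).fetch1()
# rec["car"] is whatever json.loads() yields for the stored document:
# a dict, list, str, or number -- or None for SQL NULL
assert rec["car"] is None or isinstance(rec["car"], (dict, list, str, int, float))
```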
16 changes: 12 additions & 4 deletions datajoint/heading.py
@@ -28,6 +28,7 @@
numeric=None,
string=None,
uuid=False,
json=None,
is_blob=False,
is_attachment=False,
is_filepath=False,
@@ -142,7 +143,7 @@ def non_blobs(self):
return [
k
for k, v in self.attributes.items()
if not v.is_blob and not v.is_attachment and not v.is_filepath
if not (v.is_blob or v.is_attachment or v.is_filepath or v.json)
]

@property
@@ -290,6 +291,7 @@ def _init_from_database(self):
),
is_blob=bool(TYPE_PATTERN["INTERNAL_BLOB"].match(attr["type"])),
uuid=False,
json=bool(TYPE_PATTERN["JSON"].match(attr["type"])),
is_attachment=False,
is_filepath=False,
adapter=None,
@@ -375,10 +377,15 @@ def _init_from_database(self):
)

if attr["in_key"] and any(
(attr["is_blob"], attr["is_attachment"], attr["is_filepath"])
(
attr["is_blob"],
attr["is_attachment"],
attr["is_filepath"],
attr["json"],
)
):
raise DataJointError(
"Blob, attachment, or filepath attributes are not allowed in the primary key"
"Json, Blob, attachment, or filepath attributes are not allowed in the primary key"
)

if (
Expand Down Expand Up @@ -419,7 +426,8 @@ def _init_from_database(self):
):
if item["Key_name"] != "PRIMARY":
keys[item["Key_name"]][item["Seq_in_index"]] = dict(
column=item["Column_name"],
column=item["Column_name"]
or f"({item['Expression']})".replace(r"\'", "'"),
unique=(item["Non_unique"] == 0),
nullable=item["Null"].lower() == "yes",
)
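Note the new guard: json attributes, like blobs, attachments, and filepaths, are rejected in primary keys. A sketch of a declaration that now fails with DataJointError (names are hypothetical):

```python
@schema
class BadKey(dj.Manual):
    definition = """
    meta: json   # json above the --- divider is not allowed
    ---
    value: int
    """
```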
3 changes: 3 additions & 0 deletions datajoint/table.py
@@ -8,6 +8,7 @@
import uuid
import csv
import re
import json
from pathlib import Path
from .settings import config
from .declare import declare, alter
@@ -831,6 +832,8 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False):
value = self.external[attr.store].upload_filepath(value).bytes
elif attr.numeric:
value = str(int(value) if isinstance(value, bool) else value)
elif attr.json:
value = json.dumps(value)
return name, placeholder, value

def __make_row_to_insert(self, row, field_list, ignore_extra_fields):
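On insert, values destined for a json column are serialized with `json.dumps`, so plain dicts and lists can be passed directly. A sketch (row contents assumed):

```python
Team.insert1({
    "name": "engineering",
    "car": {"name": "Chaching", "length": 20.5, "inspected": True},
})
Team.insert1({"name": "marketing", "car": None})  # stores SQL NULL
```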
1 change: 1 addition & 0 deletions docs/.docker/pip_requirements.txt
@@ -7,3 +7,4 @@ mdx-truly-sane-lists
mkdocs-gen-files
mkdocs-literate-nav
mkdocs-exclude-search
mkdocs-jupyter
5 changes: 4 additions & 1 deletion docs/mkdocs.yaml
@@ -15,7 +15,8 @@ nav:
- Reproducibility:
- Table Tiers: reproduce/table-tiers.md
- Make Method: reproduce/make-method.md
- Tutorials: tutorials.md
- Tutorials:
- tutorials/json.ipynb
- Develop: develop.md
- Changelog: about/changelog.md
- API: api/ # defer to gen-files + literate-nav
@@ -72,6 +73,8 @@ plugins:
exclude:
- "*/navigation.md"
- "*/archive/*md"
- mkdocs-jupyter:
include: ["*.ipynb"]
markdown_extensions:
- attr_list
- toc:
Empty file removed docs/src/concepts.md
3 changes: 0 additions & 3 deletions docs/src/tutorials.md

This file was deleted.
