Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Renaming (on top of Pydantic heaven) #48

Merged
merged 35 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
a6b8db2
rm _is_file from Feature
dmpetrov Jul 13, 2024
366c8d8
Get rid of Feature class
dmpetrov Jul 14, 2024
c6883ff
unflatten_to_json: support list of objects
dmpetrov Jul 14, 2024
6cf05ea
claude: no Feature, no common classes
dmpetrov Jul 14, 2024
80d4927
convert_type_to_datachain: extract JSON check
dmpetrov Jul 14, 2024
3be3909
Clean up dead code
dmpetrov Jul 14, 2024
860be4f
rm Feature class completely
dmpetrov Jul 14, 2024
64a2e87
Fix unit-tests
dmpetrov Jul 14, 2024
cc718ae
Auto-register data classes & misc changes
dmpetrov Jul 14, 2024
c3fc27e
unit-test: fix
dmpetrov Jul 14, 2024
4e2fb5c
fix unit test
dmpetrov Jul 15, 2024
5936827
Merge branch 'main' into pydantic_heaven
dmpetrov Jul 15, 2024
aa7d755
after merge fix
dmpetrov Jul 15, 2024
2ca79f7
Intro DataModel and move register feature there
dmpetrov Jul 15, 2024
cabcfd0
into is_pydantic()
dmpetrov Jul 15, 2024
c1156db
linter
dmpetrov Jul 15, 2024
b912acc
fix tests for python<3.10
dmpetrov Jul 15, 2024
520af9c
Rename core data structure: ChainType, ModelStore
dmpetrov Jul 15, 2024
525b076
Rename internal data structure: type_converter
dmpetrov Jul 15, 2024
b6c22cb
Rename internal data structure: extract to converters/ dir
dmpetrov Jul 15, 2024
77d1851
fix test in is_chain_type()
dmpetrov Jul 15, 2024
1d794ba
Merge branch 'main' into pydantic_heaven
dmpetrov Jul 15, 2024
60a11e9
Merge branch 'main' into pydantic_heaven
dmpetrov Jul 15, 2024
7a01162
fixes after main merge
dmpetrov Jul 15, 2024
813a8b0
Merge branch 'main' into pydantic_heaven
dmpetrov Jul 16, 2024
1750951
after review changes & print_schema small fix
dmpetrov Jul 16, 2024
9f62196
linter
dmpetrov Jul 16, 2024
58cc659
after-merge fixes
dmpetrov Jul 16, 2024
78e2a81
merge to main
dmpetrov Jul 16, 2024
66b4d53
small bugfix
dmpetrov Jul 16, 2024
863862c
after-review: rename ChainType to DataType
dmpetrov Jul 16, 2024
f909a6c
after-review: rename module name convertors to convert
dmpetrov Jul 16, 2024
052abe1
linter
dmpetrov Jul 16, 2024
4c47b27
Merge branch 'main' into pydantic_heaven_renaming
dmpetrov Jul 16, 2024
30cfd7c
linter
dmpetrov Jul 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/openimage-detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import pandas as pd
from PIL import Image

from datachain.lib.convert.flatten import ShallowFeature
from datachain.lib.dc import DataChain
from datachain.lib.feature import ShallowFeature
from datachain.lib.feature_udf import FeatureAggregator
from datachain.lib.file import File, FileInfo
from datachain.query.schema import C
Expand Down
7 changes: 4 additions & 3 deletions src/datachain/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from datachain.lib.data_model import DataModel, FileBasic
from datachain.lib.dc import C, DataChain
from datachain.lib.data_model import DataModel, DataType, FileBasic, is_chain_type
from datachain.lib.dc import C, Column, DataChain
from datachain.lib.file import File, FileError, IndexedFile, TarVFile
from datachain.lib.image import ImageFile
from datachain.lib.udf import Aggregator, Generator, Mapper
from datachain.lib.utils import AbstractUDF, DataChainError
from datachain.query.dataset import UDF as BaseUDF # noqa: N811
from datachain.query.schema import Column
from datachain.query.session import Session

__all__ = [
Expand All @@ -17,6 +16,7 @@
"DataChain",
"DataChainError",
"DataModel",
"DataType",
"File",
"FileBasic",
"FileError",
Expand All @@ -26,4 +26,5 @@
"Mapper",
"Session",
"TarVFile",
"is_chain_type",
]
Empty file.
67 changes: 67 additions & 0 deletions src/datachain/lib/convert/flatten.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from datetime import datetime

from pydantic import BaseModel

from datachain.lib.model_store import ModelStore
from datachain.sql.types import (
JSON,
Array,
Binary,
Boolean,
DateTime,
Float,
Int,
Int32,
Int64,
NullType,
String,
)

# Maps DataChain SQL column types to the plain Python types they correspond to.
# NOTE(review): most keys are classes, but Array(NullType) is an *instance* —
# confirm callers normalize keys consistently when looking types up here.
DATACHAIN_TO_TYPE = {
    Int: int,
    Int32: int,
    Int64: int,
    String: str,
    Float: float,
    Boolean: bool,
    DateTime: datetime,
    Binary: bytes,
    Array(NullType): list,
    JSON: dict,
}


def flatten(obj: BaseModel):
    """Return all field values of *obj* flattened into a single tuple."""
    values = _flatten_fields_values(obj.model_fields, obj)
    return tuple(values)


def flatten_list(obj_list):
    """Flatten every model in *obj_list* into one combined tuple of values."""
    flat = []
    for item in obj_list:
        flat.extend(_flatten_fields_values(item.model_fields, item))
    return tuple(flat)


def _flatten_fields_values(fields, obj: BaseModel):
    """Yield the leaf values of *obj* in field order, recursing into nested models.

    Lists and dicts are yielded as single values, with any pydantic elements
    serialized via ``model_dump()``.
    """
    for field_name, field_info in fields.items():
        annotation = field_info.annotation
        # Optimization: Access attributes directly to skip the model_dump() call.
        value = getattr(obj, field_name)

        if isinstance(value, list):
            yield [
                item.model_dump() if ModelStore.is_pydantic(type(item)) else item
                for item in value
            ]
        elif isinstance(value, dict):
            yield {
                key: (
                    item.model_dump() if ModelStore.is_pydantic(type(item)) else item
                )
                for key, item in value.items()
            }
        elif ModelStore.is_pydantic(annotation):
            # Nested model: expand its fields in place of a single value.
            yield from _flatten_fields_values(annotation.model_fields, value)
        else:
            yield value


def _flatten(obj):
    """Flatten *obj* into a tuple of leaf values.

    This was an exact duplicate of :func:`flatten`; delegate to it so the
    flattening logic lives in one place.
    """
    return flatten(obj)
96 changes: 96 additions & 0 deletions src/datachain/lib/convert/type_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import inspect
from datetime import datetime
from enum import Enum
from typing import Annotated, Literal, Union, get_args, get_origin

from pydantic import BaseModel
from typing_extensions import Literal as LiteralEx

from datachain.lib.model_store import ModelStore
from datachain.sql.types import (
JSON,
Array,
Binary,
Boolean,
DateTime,
Float,
Int64,
SQLType,
String,
)

# Maps Python/typing annotations to DataChain SQL column types.
# Consulted by convert_to_db_type() for exact (non-generic) type matches.
TYPE_TO_DATACHAIN = {
    int: Int64,
    str: String,
    Literal: String,
    LiteralEx: String,
    Enum: String,
    float: Float,
    bool: Boolean,
    datetime: DateTime,  # Note, list of datetime is not supported yet
    bytes: Binary,  # Note, list of bytes is not supported yet
    list: Array,
    dict: JSON,
}


def convert_to_db_type(typ):  # noqa: PLR0911
    """Map a Python/typing annotation to a DataChain SQL column type.

    Handles plain types (via TYPE_TO_DATACHAIN), Literal, list/tuple,
    Annotated, dict, Optional and Union annotations, and nested pydantic
    models (stored as JSON).

    Raises:
        TypeError: if the annotation cannot be mapped to a column type.
    """
    if inspect.isclass(typ):
        if issubclass(typ, SQLType):
            return typ
        if issubclass(typ, Enum):
            return str

    res = TYPE_TO_DATACHAIN.get(typ)
    if res:
        return res

    orig = get_origin(typ)

    if orig in (Literal, LiteralEx):
        return String

    args = get_args(typ)
    if inspect.isclass(orig) and (issubclass(list, orig) or issubclass(tuple, orig)):
        if args is None or len(args) != 1:
            raise TypeError(f"Cannot resolve type '{typ}' for flattening features")

        args0 = args[0]
        if ModelStore.is_pydantic(args0):
            return Array(JSON())

        next_type = convert_to_db_type(args0)
        return Array(next_type)

    if orig is Annotated:
        # Ignoring annotations
        return convert_to_db_type(args[0])

    if inspect.isclass(orig) and issubclass(dict, orig):
        return JSON

    if orig == Union:
        if len(args) == 2 and (type(None) in args):
            # Optional[X]: convert the non-None member. Bug fix: the previous
            # code used args[0] unconditionally, which is NoneType for
            # annotations written as Union[None, X] (typing preserves
            # declaration order) and raised a spurious TypeError.
            inner = args[0] if args[0] is not type(None) else args[1]
            return convert_to_db_type(inner)

        if _is_json_inside_union(orig, args):
            return JSON

    raise TypeError(f"Cannot recognize type {typ}")


def _is_json_inside_union(orig, args) -> bool:
if orig == Union and len(args) >= 2:
# List in JSON: Union[dict, list[dict]]
args_no_nones = [arg for arg in args if arg != type(None)]
if len(args_no_nones) == 2:
args_no_dicts = [arg for arg in args_no_nones if arg is not dict]
if len(args_no_dicts) == 1 and get_origin(args_no_dicts[0]) is list:
arg = get_args(args_no_dicts[0])
if len(arg) == 1 and arg[0] is dict:
return True

# List of objects: Union[MyClass, OtherClass]
if any(inspect.isclass(arg) and issubclass(arg, BaseModel) for arg in args):
return True
return False
69 changes: 69 additions & 0 deletions src/datachain/lib/convert/unflatten.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import copy
import inspect
import re
from collections.abc import Sequence
from typing import Any, get_origin

from pydantic import BaseModel

from datachain.query.schema import DEFAULT_DELIMITER


def unflatten_to_json(model: type[BaseModel], row: Sequence[Any], pos=0) -> dict:
    """Rebuild a nested dict for *model* from the flat *row*, starting at *pos*."""
    result, _ = unflatten_to_json_pos(model, row, pos)
    return result


def unflatten_to_json_pos(
    model: type[BaseModel], row: Sequence[Any], pos=0
) -> tuple[dict, int]:
    """Rebuild a nested dict for *model* from *row*.

    Consumes one row value per non-model field, recursing into nested
    pydantic model fields. Returns the dict and the next unread position.
    """
    result: dict = {}
    for field_name, field_info in model.model_fields.items():
        annotation = field_info.annotation
        is_nested_model = (
            get_origin(annotation) not in (list, dict)
            and inspect.isclass(annotation)
            and issubclass(annotation, BaseModel)
        )
        if is_nested_model:
            result[field_name], pos = unflatten_to_json_pos(annotation, row, pos)
        else:
            result[field_name] = row[pos]
            pos += 1
    return result, pos


def _normalize(name: str) -> str:
    """Snake-case *name*, rejecting names that contain the column delimiter."""
    if DEFAULT_DELIMITER in name:
        message = (
            f"variable '{name}' cannot be used "
            f"because it contains {DEFAULT_DELIMITER}"
        )
        raise RuntimeError(message)
    return _to_snake_case(name)


def _to_snake_case(name: str) -> str:
"""Convert a CamelCase name to snake_case."""
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()


def _unflatten_with_path(model: type[BaseModel], dump, name_path: list[str]):
    """Build a *model* instance from the flat *dump* mapping.

    Field values are looked up under delimiter-joined, snake-cased key paths;
    nested pydantic model fields are reconstructed recursively.
    """
    values = {}
    for field_name, field_info in model.model_fields.items():
        annotation = field_info.annotation
        # Extend the key path with this field's normalized name (copy, don't
        # mutate the caller's list).
        path = [*name_path, _normalize(field_name)]

        if inspect.isclass(annotation) and issubclass(annotation, BaseModel):
            values[field_name] = _unflatten_with_path(annotation, dump, path)
        else:
            values[field_name] = dump[DEFAULT_DELIMITER.join(path)]
    return model(**values)


def unflatten(model: type[BaseModel], dump):
    """Instantiate *model* from the flat *dump* mapping of delimited keys."""
    return _unflatten_with_path(model, dump, [])
Original file line number Diff line number Diff line change
@@ -1,71 +1,56 @@
import string
from collections.abc import Sequence
from typing import Any, Union

from pydantic import BaseModel, create_model

from datachain.lib.feature import (
FeatureType,
FeatureTypeNames,
is_feature_type,
)
from datachain.lib.data_model import DataType, DataTypeNames, is_chain_type
from datachain.lib.utils import DataChainParamsError

AUTO_FEATURE_PREFIX = "_auto_fr"
SUFFIX_SYMBOLS = string.digits + string.ascii_lowercase


class FeatureToTupleError(DataChainParamsError):
class ValuesToTupleError(DataChainParamsError):
def __init__(self, ds_name, msg):
if ds_name:
ds_name = f"' {ds_name}'"
super().__init__(f"Cannot convert features for dataset{ds_name}: {msg}")


def dict_to_feature(name: str, data_dict: dict[str, FeatureType]) -> type[BaseModel]:
fields = {name: (anno, ...) for name, anno in data_dict.items()}
return create_model(name, **fields) # type: ignore[call-overload]


def features_to_tuples(
def values_to_tuples(
ds_name: str = "",
output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
**fr_map,
) -> tuple[Any, Any, Any]:
types_map = {}
length = -1
for k, v in fr_map.items():
if not isinstance(v, Sequence) or isinstance(v, str):
raise FeatureToTupleError(ds_name, f"features '{k}' is not a sequence")
raise ValuesToTupleError(ds_name, f"features '{k}' is not a sequence")
len_ = len(v)

if len_ == 0:
raise FeatureToTupleError(ds_name, f"feature '{k}' is empty list")
raise ValuesToTupleError(ds_name, f"feature '{k}' is empty list")

if length < 0:
length = len_
elif length != len_:
raise FeatureToTupleError(
raise ValuesToTupleError(
ds_name,
f"feature '{k}' should have length {length} while {len_} is given",
)
typ = type(v[0])
if not is_feature_type(typ):
raise FeatureToTupleError(
if not is_chain_type(typ):
raise ValuesToTupleError(
ds_name,
f"feature '{k}' has unsupported type '{typ.__name__}'."
f" Please use Feature types: {FeatureTypeNames}",
f" Please use Feature types: {DataTypeNames}",
)
types_map[k] = typ
if output:
if not isinstance(output, Sequence) and not isinstance(output, str):
if len(fr_map) != 1:
raise FeatureToTupleError(
raise ValuesToTupleError(
ds_name,
f"only one output type was specified, {len(fr_map)} expected",
)
if not isinstance(output, type):
raise FeatureToTupleError(
raise ValuesToTupleError(
ds_name,
f"output must specify a type while '{output}' was given",
)
Expand All @@ -74,13 +59,13 @@ def features_to_tuples(
output = {key: output} # type: ignore[dict-item]

if len(output) != len(fr_map):
raise FeatureToTupleError(
raise ValuesToTupleError(
ds_name,
f"number of outputs '{len(output)}' should match"
f" number of features '{len(fr_map)}'",
)
if isinstance(output, dict):
raise FeatureToTupleError(
raise ValuesToTupleError(
ds_name,
"output type must be dict[str, FeatureType] while "
f"'{type(output).__name__}' is given",
Expand Down
Loading
Loading