Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Renaming (on top of Pydantic heaven) #48

Merged
merged 35 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
a6b8db2
rm _is_file from Feature
dmpetrov Jul 13, 2024
366c8d8
Get rid of Feature class
dmpetrov Jul 14, 2024
c6883ff
unflatten_to_json: support list of objects
dmpetrov Jul 14, 2024
6cf05ea
claude: no Feature, no common classes
dmpetrov Jul 14, 2024
80d4927
convert_type_to_datachain: extract JSON check
dmpetrov Jul 14, 2024
3be3909
Clean up dead code
dmpetrov Jul 14, 2024
860be4f
rm Feature class completely
dmpetrov Jul 14, 2024
64a2e87
Fix unit-tests
dmpetrov Jul 14, 2024
cc718ae
Auto-register data classes & misc changes
dmpetrov Jul 14, 2024
c3fc27e
unit-test: fix
dmpetrov Jul 14, 2024
4e2fb5c
fix unit test
dmpetrov Jul 15, 2024
5936827
Merge branch 'main' into pydantic_heaven
dmpetrov Jul 15, 2024
aa7d755
after merge fix
dmpetrov Jul 15, 2024
2ca79f7
Intro DataModel and move register feature there
dmpetrov Jul 15, 2024
cabcfd0
into is_pydantic()
dmpetrov Jul 15, 2024
c1156db
linter
dmpetrov Jul 15, 2024
b912acc
fix tests for python<3.10
dmpetrov Jul 15, 2024
520af9c
Rename core data structure: ChainType, ModelStore
dmpetrov Jul 15, 2024
525b076
Rename internal data structure: type_converter
dmpetrov Jul 15, 2024
b6c22cb
Rename internal data structure: extract to converters/ dir
dmpetrov Jul 15, 2024
77d1851
fix test in is_chain_type()
dmpetrov Jul 15, 2024
1d794ba
Merge branch 'main' into pydantic_heaven
dmpetrov Jul 15, 2024
60a11e9
Merge branch 'main' into pydantic_heaven
dmpetrov Jul 15, 2024
7a01162
fixes after main merge
dmpetrov Jul 15, 2024
813a8b0
Merge branch 'main' into pydantic_heaven
dmpetrov Jul 16, 2024
1750951
after review changes & print_schema small fix
dmpetrov Jul 16, 2024
9f62196
linter
dmpetrov Jul 16, 2024
58cc659
after-merge fixes
dmpetrov Jul 16, 2024
78e2a81
merge to main
dmpetrov Jul 16, 2024
66b4d53
small bugfix
dmpetrov Jul 16, 2024
863862c
after-review: rename ChainType to DataType
dmpetrov Jul 16, 2024
f909a6c
after-review: rename module name convertors to convert
dmpetrov Jul 16, 2024
052abe1
linter
dmpetrov Jul 16, 2024
4c47b27
Merge branch 'main' into pydantic_heaven_renaming
dmpetrov Jul 16, 2024
30cfd7c
linter
dmpetrov Jul 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/openimage-detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import pandas as pd
from PIL import Image

from datachain.lib.convert.flatten import ShallowFeature
from datachain.lib.dc import DataChain
from datachain.lib.feature import ShallowFeature
from datachain.lib.feature_udf import FeatureAggregator
from datachain.lib.file import File, FileInfo
from datachain.query.schema import C
Expand Down
7 changes: 4 additions & 3 deletions src/datachain/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from datachain.lib.data_model import DataModel, FileBasic
from datachain.lib.dc import C, DataChain
from datachain.lib.data_model import DataModel, DataType, FileBasic, is_chain_type
from datachain.lib.dc import C, Column, DataChain
from datachain.lib.file import File, FileError, IndexedFile, TarVFile
from datachain.lib.image import ImageFile
from datachain.lib.udf import Aggregator, Generator, Mapper
from datachain.lib.utils import AbstractUDF, DataChainError
from datachain.query.dataset import UDF as BaseUDF # noqa: N811
from datachain.query.schema import Column
from datachain.query.session import Session

__all__ = [
Expand All @@ -17,6 +16,7 @@
"DataChain",
"DataChainError",
"DataModel",
"DataType",
"File",
"FileBasic",
"FileError",
Expand All @@ -26,4 +26,5 @@
"Mapper",
"Session",
"TarVFile",
"is_chain_type",
]
Empty file.
67 changes: 67 additions & 0 deletions src/datachain/lib/convert/flatten.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from datetime import datetime

from pydantic import BaseModel

from datachain.lib.model_store import ModelStore
from datachain.sql.types import (
JSON,
Array,
Binary,
Boolean,
DateTime,
Float,
Int,
Int32,
Int64,
NullType,
String,
)

# Maps DataChain SQL column types to the plain Python types they correspond to.
# NOTE(review): most keys are classes, but Array(NullType) is an *instance* —
# confirm callers normalize keys consistently when looking types up here.
DATACHAIN_TO_TYPE = {
    Int: int,
    Int32: int,
    Int64: int,
    String: str,
    Float: float,
    Boolean: bool,
    DateTime: datetime,
    Binary: bytes,
    Array(NullType): list,
    JSON: dict,
}


def flatten(obj: BaseModel):
    """Return all field values of *obj* flattened into a single tuple."""
    values = _flatten_fields_values(obj.model_fields, obj)
    return tuple(values)


def flatten_list(obj_list):
    """Flatten every model in *obj_list* into one combined tuple of values."""
    flat = []
    for item in obj_list:
        flat.extend(_flatten_fields_values(item.model_fields, item))
    return tuple(flat)


def _flatten_fields_values(fields, obj: BaseModel):
    """Yield the leaf values of *obj* in field order, recursing into nested models.

    Lists and dicts are yielded as single values, with any pydantic elements
    serialized via ``model_dump()``.
    """
    for field_name, field_info in fields.items():
        annotation = field_info.annotation
        # Optimization: Access attributes directly to skip the model_dump() call.
        value = getattr(obj, field_name)

        if isinstance(value, list):
            yield [
                item.model_dump() if ModelStore.is_pydantic(type(item)) else item
                for item in value
            ]
        elif isinstance(value, dict):
            yield {
                key: (
                    item.model_dump() if ModelStore.is_pydantic(type(item)) else item
                )
                for key, item in value.items()
            }
        elif ModelStore.is_pydantic(annotation):
            # Nested model: expand its fields in place of a single value.
            yield from _flatten_fields_values(annotation.model_fields, value)
        else:
            yield value


def _flatten(obj):
    """Flatten *obj* into a tuple of leaf values.

    This was an exact duplicate of :func:`flatten`; delegate to it so the
    flattening logic lives in one place.
    """
    return flatten(obj)
96 changes: 96 additions & 0 deletions src/datachain/lib/convert/type_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import inspect
from datetime import datetime
from enum import Enum
from typing import Annotated, Literal, Union, get_args, get_origin

from pydantic import BaseModel
from typing_extensions import Literal as LiteralEx

from datachain.lib.model_store import ModelStore
from datachain.sql.types import (
JSON,
Array,
Binary,
Boolean,
DateTime,
Float,
Int64,
SQLType,
String,
)

# Maps Python/typing annotations to DataChain SQL column types.
# Consulted by convert_to_db_type() for exact (non-generic) type matches.
TYPE_TO_DATACHAIN = {
    int: Int64,
    str: String,
    Literal: String,
    LiteralEx: String,
    Enum: String,
    float: Float,
    bool: Boolean,
    datetime: DateTime,  # Note, list of datetime is not supported yet
    bytes: Binary,  # Note, list of bytes is not supported yet
    list: Array,
    dict: JSON,
}


def convert_to_db_type(typ):  # noqa: PLR0911
    """Map a Python/typing annotation to a DataChain SQL column type.

    Handles plain types (via TYPE_TO_DATACHAIN), Literal, list/tuple,
    Annotated, dict, Optional and Union annotations, and nested pydantic
    models (stored as JSON).

    Raises:
        TypeError: if the annotation cannot be mapped to a column type.
    """
    if inspect.isclass(typ):
        if issubclass(typ, SQLType):
            return typ
        if issubclass(typ, Enum):
            return str

    res = TYPE_TO_DATACHAIN.get(typ)
    if res:
        return res

    orig = get_origin(typ)

    if orig in (Literal, LiteralEx):
        return String

    args = get_args(typ)
    if inspect.isclass(orig) and (issubclass(list, orig) or issubclass(tuple, orig)):
        if args is None or len(args) != 1:
            raise TypeError(f"Cannot resolve type '{typ}' for flattening features")

        args0 = args[0]
        if ModelStore.is_pydantic(args0):
            return Array(JSON())

        next_type = convert_to_db_type(args0)
        return Array(next_type)

    if orig is Annotated:
        # Ignoring annotations
        return convert_to_db_type(args[0])

    if inspect.isclass(orig) and issubclass(dict, orig):
        return JSON

    if orig == Union:
        if len(args) == 2 and (type(None) in args):
            # Optional[X]: convert the non-None member. Bug fix: the previous
            # code used args[0] unconditionally, which is NoneType for
            # annotations written as Union[None, X] (typing preserves
            # declaration order) and raised a spurious TypeError.
            inner = args[0] if args[0] is not type(None) else args[1]
            return convert_to_db_type(inner)

        if _is_json_inside_union(orig, args):
            return JSON

    raise TypeError(f"Cannot recognize type {typ}")


def _is_json_inside_union(orig, args) -> bool:
if orig == Union and len(args) >= 2:
# List in JSON: Union[dict, list[dict]]
args_no_nones = [arg for arg in args if arg != type(None)]
if len(args_no_nones) == 2:
args_no_dicts = [arg for arg in args_no_nones if arg is not dict]
if len(args_no_dicts) == 1 and get_origin(args_no_dicts[0]) is list:
arg = get_args(args_no_dicts[0])
if len(arg) == 1 and arg[0] is dict:
return True

# List of objects: Union[MyClass, OtherClass]
if any(inspect.isclass(arg) and issubclass(arg, BaseModel) for arg in args):
return True
return False
69 changes: 69 additions & 0 deletions src/datachain/lib/convert/unflatten.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import copy
import inspect
import re
from collections.abc import Sequence
from typing import Any, get_origin

from pydantic import BaseModel

from datachain.query.schema import DEFAULT_DELIMITER


def unflatten_to_json(model: type[BaseModel], row: Sequence[Any], pos=0) -> dict:
    """Rebuild a nested dict for *model* from the flat *row*, starting at *pos*."""
    result, _ = unflatten_to_json_pos(model, row, pos)
    return result


def unflatten_to_json_pos(
    model: type[BaseModel], row: Sequence[Any], pos=0
) -> tuple[dict, int]:
    """Rebuild a nested dict for *model* from *row*.

    Consumes one row value per non-model field, recursing into nested
    pydantic model fields. Returns the dict and the next unread position.
    """
    result: dict = {}
    for field_name, field_info in model.model_fields.items():
        annotation = field_info.annotation
        is_nested_model = (
            get_origin(annotation) not in (list, dict)
            and inspect.isclass(annotation)
            and issubclass(annotation, BaseModel)
        )
        if is_nested_model:
            result[field_name], pos = unflatten_to_json_pos(annotation, row, pos)
        else:
            result[field_name] = row[pos]
            pos += 1
    return result, pos


def _normalize(name: str) -> str:
    """Snake-case *name*, rejecting names that contain the column delimiter."""
    if DEFAULT_DELIMITER in name:
        message = (
            f"variable '{name}' cannot be used "
            f"because it contains {DEFAULT_DELIMITER}"
        )
        raise RuntimeError(message)
    return _to_snake_case(name)


def _to_snake_case(name: str) -> str:
"""Convert a CamelCase name to snake_case."""
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()


def _unflatten_with_path(model: type[BaseModel], dump, name_path: list[str]):
    """Build a *model* instance from the flat *dump* mapping.

    Field values are looked up under delimiter-joined, snake-cased key paths;
    nested pydantic model fields are reconstructed recursively.
    """
    values = {}
    for field_name, field_info in model.model_fields.items():
        annotation = field_info.annotation
        # Extend the key path with this field's normalized name (copy, don't
        # mutate the caller's list).
        path = [*name_path, _normalize(field_name)]

        if inspect.isclass(annotation) and issubclass(annotation, BaseModel):
            values[field_name] = _unflatten_with_path(annotation, dump, path)
        else:
            values[field_name] = dump[DEFAULT_DELIMITER.join(path)]
    return model(**values)


def unflatten(model: type[BaseModel], dump):
    """Instantiate *model* from the flat *dump* mapping of delimited keys."""
    return _unflatten_with_path(model, dump, [])
Original file line number Diff line number Diff line change
@@ -1,71 +1,56 @@
import string
from collections.abc import Sequence
from typing import Any, Union

from pydantic import BaseModel, create_model

from datachain.lib.feature import (
FeatureType,
FeatureTypeNames,
is_feature_type,
)
from datachain.lib.data_model import DataType, DataTypeNames, is_chain_type
from datachain.lib.utils import DataChainParamsError

AUTO_FEATURE_PREFIX = "_auto_fr"
SUFFIX_SYMBOLS = string.digits + string.ascii_lowercase


class FeatureToTupleError(DataChainParamsError):
class ValuesToTupleError(DataChainParamsError):
def __init__(self, ds_name, msg):
if ds_name:
ds_name = f"' {ds_name}'"
super().__init__(f"Cannot convert features for dataset{ds_name}: {msg}")


def dict_to_feature(name: str, data_dict: dict[str, FeatureType]) -> type[BaseModel]:
fields = {name: (anno, ...) for name, anno in data_dict.items()}
return create_model(name, **fields) # type: ignore[call-overload]


def features_to_tuples(
def values_to_tuples(
ds_name: str = "",
output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
**fr_map,
) -> tuple[Any, Any, Any]:
types_map = {}
length = -1
for k, v in fr_map.items():
if not isinstance(v, Sequence) or isinstance(v, str):
raise FeatureToTupleError(ds_name, f"features '{k}' is not a sequence")
raise ValuesToTupleError(ds_name, f"features '{k}' is not a sequence")
len_ = len(v)

if len_ == 0:
raise FeatureToTupleError(ds_name, f"feature '{k}' is empty list")
raise ValuesToTupleError(ds_name, f"feature '{k}' is empty list")

if length < 0:
length = len_
elif length != len_:
raise FeatureToTupleError(
raise ValuesToTupleError(
ds_name,
f"feature '{k}' should have length {length} while {len_} is given",
)
typ = type(v[0])
if not is_feature_type(typ):
raise FeatureToTupleError(
if not is_chain_type(typ):
raise ValuesToTupleError(
ds_name,
f"feature '{k}' has unsupported type '{typ.__name__}'."
f" Please use Feature types: {FeatureTypeNames}",
f" Please use Feature types: {DataTypeNames}",
)
types_map[k] = typ
if output:
if not isinstance(output, Sequence) and not isinstance(output, str):
if len(fr_map) != 1:
raise FeatureToTupleError(
raise ValuesToTupleError(
ds_name,
f"only one output type was specified, {len(fr_map)} expected",
)
if not isinstance(output, type):
raise FeatureToTupleError(
raise ValuesToTupleError(
ds_name,
f"output must specify a type while '{output}' was given",
)
Expand All @@ -74,13 +59,13 @@ def features_to_tuples(
output = {key: output} # type: ignore[dict-item]

if len(output) != len(fr_map):
raise FeatureToTupleError(
raise ValuesToTupleError(
ds_name,
f"number of outputs '{len(output)}' should match"
f" number of features '{len(fr_map)}'",
)
if isinstance(output, dict):
raise FeatureToTupleError(
raise ValuesToTupleError(
ds_name,
"output type must be dict[str, FeatureType] while "
f"'{type(output).__name__}' is given",
Expand Down
Loading
Loading