ml6team · RobbeSneyders · Jun 9, 2023 · Jun 7, 2023 · Jun 7, 2023 · Jun 8, 2023
diff --git a/components/embedding_based_laion_retrieval/fondant_component.yaml b/components/embedding_based_laion_retrieval/fondant_component.yaml
@@ -6,7 +6,9 @@ consumes:
   embeddings:
     fields:
       data:
-        type: float32_list
+        type: array
+        items:
+          type: float32
 
 produces:
   images:

diff --git a/components/image_embedding/fondant_component.yaml b/components/image_embedding/fondant_component.yaml
@@ -12,7 +12,9 @@ produces:
   embeddings:
     fields:
       data:
-        type: float32_list
+        type: array
+        items:
+          type: float32
 
 args:
   model_id:

diff --git a/components/segment_images/fondant_component.yaml b/components/segment_images/fondant_component.yaml
@@ -12,7 +12,9 @@ produces:
   segmentations:
     fields:
       data:
-        type: binary
+        type: array
+        items:
+          type: binary
 
 args:
   model_id:

diff --git a/...ines/controlnet-interior-design/components/write_to_hub_controlnet/fondant_component.yaml b/...ines/controlnet-interior-design/components/write_to_hub_controlnet/fondant_component.yaml
@@ -16,7 +16,9 @@ consumes:
   segmentations:
     fields:
       data:
-        type: binary
+        type: array
+        items:
+          type: binary
 
 args:
   hf_token:

diff --git a/fondant/component_spec.py b/fondant/component_spec.py
@@ -71,7 +71,7 @@ def __repr__(self) -> str:
     def fields(self) -> t.Mapping[str, Field]:
         return types.MappingProxyType(
             {
-                name: Field(name=name, type=Type[field["type"]])
+                name: Field(name=name, type=Type.from_json(field))
                 for name, field in self._specification["fields"].items()
             }
         )

diff --git a/fondant/manifest.py b/fondant/manifest.py
@@ -38,7 +38,7 @@ def fields(self) -> t.Mapping[str, Field]:
         """The fields of the subset returned as an immutable mapping."""
         return types.MappingProxyType(
             {
-                name: Field(name=name, type=Type[field["type"]])
+                name: Field(name=name, type=Type.from_json(field))
                 for name, field in self._specification["fields"].items()
             }
         )
@@ -62,8 +62,8 @@ class Index(Subset):
     @property
     def fields(self) -> t.Dict[str, Field]:
         return {
-            "id": Field(name="id", type=Type.string),
-            "source": Field(name="source", type=Type.string),
+            "id": Field(name="id", type=Type("string")),
+            "source": Field(name="source", type=Type("string")),
         }
 
 

diff --git a/fondant/schema.py b/fondant/schema.py
@@ -2,56 +2,129 @@
 and pipelines.
 """
 
-import enum
 import typing as t
 
 import pyarrow as pa
 
 KubeflowCommandArguments = t.List[t.Union[str, t.Dict[str, str]]]
 
-
-class Type(enum.Enum):
-    """Supported types.
-
-    Based on:
-    - https://arrow.apache.org/docs/python/api/datatypes.html#api-types
-    - https://pola-rs.github.io/polars/py-polars/html/reference/datatypes.html
+"""
+Types based on:
+- https://arrow.apache.org/docs/python/api/datatypes.html#api-types
+- https://pola-rs.github.io/polars/py-polars/html/reference/datatypes.html
+"""
+_TYPES: t.Dict[str, pa.DataType] = {
+    "null": pa.null(),
+    "bool": pa.bool_(),
+    "int8": pa.int8(),
+    "int16": pa.int16(),
+    "int32": pa.int32(),
+    "int64": pa.int64(),
+    "uint8": pa.uint8(),
+    "uint16": pa.uint16(),
+    "uint32": pa.uint32(),
+    "uint64": pa.uint64(),
+    "float16": pa.float16(),
+    "float32": pa.float32(),
+    "float64": pa.float64(),
+    "decimal128": pa.decimal128(38),
+    "time32": pa.time32("s"),
+    "time64": pa.time64("us"),
+    "timestamp": pa.timestamp("us"),
+    "date32": pa.date32(),
+    "date64": pa.date64(),
+    "duration": pa.duration("us"),
+    "string": pa.string(),
+    "utf8": pa.utf8(),
+    "binary": pa.binary(),
+    "large_binary": pa.large_binary(),
+    "large_utf8": pa.large_utf8(),
+}
+
+
+class Type:
+    """
+    The `Type` class provides a way to define and validate data types for various purposes. It
+      supports different data types including primitive types and complex types like lists.
     """
 
-    bool = pa.bool_()
-
-    int8 = pa.int8()
-    int16 = pa.int16()
-    int32 = pa.int32()
-    int64 = pa.int64()
-
-    uint8 = pa.uint8()
-    uint16 = pa.uint16()
-    uint32 = pa.uint32()
-    uint64 = pa.uint64()
-
-    float16 = pa.float16()
-    float32 = pa.float32()
-    float64 = pa.float64()
-
-    decimal = pa.decimal128(38)
-
-    time32 = pa.time32("s")
-    time64 = pa.time64("us")
-    timestamp = pa.timestamp("us")
-
-    date32 = pa.date32()
-    date64 = pa.date64()
-    duration = pa.duration("us")
-
-    string = pa.string()
-    utf8 = pa.utf8()
-
-    binary = pa.binary()
-
-    int8_list = pa.list_(pa.int8())
-
-    float32_list = pa.list_(pa.float32())
+    def __init__(self, data_type: t.Union[str, pa.DataType]):
+        self.value = self._validate_data_type(data_type)
+
+    @staticmethod
+    def _validate_data_type(data_type: t.Union[str, pa.DataType]):
+        """
+        Validates the provided data type; raises `ValueError` for an unregistered type name.
+
+        Args:
+            data_type: A type name registered in `_TYPES`, or an already constructed type.
+
+        Returns:
+            The `pa.DataType` registered for a name; non-string input is returned unchanged.
+        """
+        if isinstance(data_type, str):  # only names are looked up; objects pass through
+            try:
+                data_type = _TYPES[data_type]
+            except KeyError:
+                raise ValueError(
+                    f"Invalid schema provided. Current available data types are:"
+                    f" {_TYPES.keys()}"
+                )
+        return data_type
+
+    @classmethod
+    def list(cls, data_type: t.Union[str, pa.DataType, "Type"]) -> "Type":
+        """
+        Creates a new `Type` instance representing a list of the specified data type.
+
+        Args:
+            data_type: The data type for the list elements. It can be a string representing the
+            data type or an existing `pa.DataType` object.
+
+        Returns:
+            A new `Type` instance representing a list of the specified data type.
+        """
+        data_type = cls._validate_data_type(data_type)
+        return cls(
+            pa.list_(data_type.value if isinstance(data_type, Type) else data_type)
+        )
+
+    @classmethod
+    def from_json(cls, json_schema: dict):
+        """
+        Creates a new `Type` instance based on a dictionary representation of the json schema
+          of a data type (https://swagger.io/docs/specification/data-models/data-types/).
+
+        Args:
+            json_schema: The dictionary representation of the data type, can represent nested values
+        Returns:
+            A new `Type` instance representing the specified data type.
+        Raises:
+            ValueError: If the type is unknown or an array's `items` is not a single schema.
+        """
+        if json_schema["type"] in _TYPES:
+            return cls(json_schema["type"])
+        if json_schema["type"] == "array":
+            items = json_schema["items"]
+            # Only a single nested schema is supported; anything else must not return None.
+            if isinstance(items, dict):
+                return cls.list(cls.from_json(items))
+        raise ValueError(f"Invalid schema provided: {json_schema}")
+
+    @property
+    def name(self):
+        """Name of the data type."""
+        return str(self.value)
+
+    def __repr__(self):
+        """Returns a string representation of the `Type` instance."""
+        return f"Type({repr(self.value)})"
+
+    def __eq__(self, other):
+        if isinstance(other, Type):
+            return self.value == other.value
+
+        return False
 
 
 class Field(t.NamedTuple):

diff --git a/fondant/schemas/common.json b/fondant/schemas/common.json
@@ -26,8 +26,7 @@
         "binary",
         "list",
         "struct",
-        "int8_list",
-        "float32_list"
+        "array"
       ]
     },
     "field": {
@@ -36,11 +35,23 @@
         "type": {
           "type": "string",
           "$ref": "#/definitions/subset_data_type"
+        },
+        "items": {
+          "oneOf": [
+            {
+              "$ref": "#/definitions/field"
+            },
+            {
+              "type": "array",
+              "items": {
+                "$ref": "#/definitions/field"
+              }
+            }
+          ]
         }
       },
-      "required": [
-        "type"
-      ]
+      "required": ["type"],
+      "additionalProperties": false
     },
     "fields": {
       "type": "object",

diff --git a/tests/example_specs/component_specs/valid_component.yaml b/tests/example_specs/component_specs/valid_component.yaml
@@ -11,7 +11,9 @@ consumes:
   embeddings:
     fields:
       data:
-        type: int8_list
+        type: array
+        items:
+          type: binary
 
 produces:
   captions:

diff --git a/tests/test_component_specs.py b/tests/test_component_specs.py
@@ -1,7 +1,6 @@
 """Fondant component specs test."""
 from pathlib import Path
 
-import pyarrow as pa
 import pytest
 import yaml
 
@@ -53,13 +52,10 @@ def test_attribute_access(valid_fondant_schema):
 
     assert fondant_component.name == "Example component"
     assert fondant_component.description == "This is an example component"
-    assert fondant_component.consumes["images"].fields["data"].type == Type.binary
-    assert (
-        fondant_component.consumes["embeddings"].fields["data"].type == Type.int8_list
+    assert fondant_component.consumes["images"].fields["data"].type == Type("binary")
+    assert fondant_component.consumes["embeddings"].fields["data"].type == Type.list(
+        Type("binary")
     )
-    assert fondant_component.consumes["embeddings"].fields[
-        "data"
-    ].type.value == pa.list_(pa.int8())
 
 
 def test_kfp_component_creation(valid_fondant_schema, valid_kubeflow_schema):

diff --git a/tests/test_manifest.py b/tests/test_manifest.py
@@ -60,16 +60,16 @@ def test_subset_fields():
     subset = Subset(specification=subset_spec, base_path="/tmp")
 
     # add a field
-    subset.add_field(name="data2", type_=Type.binary)
+    subset.add_field(name="data2", type_=Type("binary"))
     assert "data2" in subset.fields
 
     # add a duplicate field
     with pytest.raises(ValueError):
-        subset.add_field(name="data2", type_=Type.binary)
+        subset.add_field(name="data2", type_=Type("binary"))
 
     # add a duplicate field but overwrite
-    subset.add_field(name="data2", type_=Type.string, overwrite=True)
-    assert subset.fields["data2"].type == Type.string
+    subset.add_field(name="data2", type_=Type("string"), overwrite=True)
+    assert subset.fields["data2"].type.value == Type("string").value
 
     # remove a field
     subset.remove_field(name="data2")
@@ -111,7 +111,7 @@ def test_attribute_access(valid_manifest):
     assert manifest.metadata == valid_manifest["metadata"]
     assert manifest.index.location == "gs://bucket/index"
     assert manifest.subsets["images"].location == "gs://bucket/images"
-    assert manifest.subsets["images"].fields["data"].type == Type.binary
+    assert manifest.subsets["images"].fields["data"].type == Type("binary")
 
 
 def test_manifest_creation():
@@ -123,8 +123,8 @@ def test_manifest_creation():
     manifest = Manifest.create(
         base_path=base_path, run_id=run_id, component_id=component_id
     )
-    manifest.add_subset("images", [("width", Type.int32), ("height", Type.int32)])
-    manifest.subsets["images"].add_field("data", Type.binary)
+    manifest.add_subset("images", [("width", Type("int32")), ("height", Type("int32"))])
+    manifest.subsets["images"].add_field("data", Type("binary"))
 
     assert manifest._specification == {
         "metadata": {
@@ -166,12 +166,16 @@ def test_manifest_alteration(valid_manifest):
     manifest = Manifest(valid_manifest)
 
     # test adding a subset
-    manifest.add_subset("images2", [("width", Type.int32), ("height", Type.int32)])
+    manifest.add_subset(
+        "images2", [("width", Type("int32")), ("height", Type("int32"))]
+    )
     assert "images2" in manifest.subsets
 
     # test adding a duplicate subset
     with pytest.raises(ValueError):
-        manifest.add_subset("images2", [("width", Type.int32), ("height", Type.int32)])
+        manifest.add_subset(
+            "images2", [("width", Type("int32")), ("height", Type("int32"))]
+        )
 
     # test removing a subset
     manifest.remove_subset("images2")