Dataset pull fixes #560

Merged: 7 commits, Nov 6, 2024
Changes from 5 commits
8 changes: 2 additions & 6 deletions src/datachain/catalog/catalog.py
@@ -58,7 +58,7 @@
from datachain.node import DirType, Node, NodeWithPath
from datachain.nodes_thread_pool import NodesThreadPool
from datachain.remote.studio import StudioClient
from datachain.sql.types import DateTime, SQLType, String
from datachain.sql.types import DateTime, SQLType
from datachain.utils import (
DataChainDir,
batched,
@@ -196,11 +196,6 @@ def fix_columns(self, df) -> None:
for c in [c for c, t in self.schema.items() if t == DateTime]:
df[c] = pd.to_datetime(df[c], unit="s")

# strings are represented as binaries in parquet export so need to
# decode it back to strings
for c in [c for c, t in self.schema.items() if t == String]:
df[c] = df[c].str.decode("utf-8")

def do_task(self, urls):
import lz4.frame
import pandas as pd
@@ -1403,6 +1398,7 @@ def _instantiate_dataset():
query_script=remote_dataset_version.query_script,
create_rows=True,
columns=columns,
feature_schema=remote_dataset_version.feature_schema,
validate_version=False,
)

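With the decode step removed, fix_columns only needs to restore datetime columns from the epoch seconds stored in the pulled parquet; string columns are expected to arrive as plain Python strings, matching the test_pull.py change further below. A minimal sketch of the remaining conversion (the schema and values here are illustrative only, not the real pulled data):

import pandas as pd

from datachain.sql.types import DateTime, String

# Illustrative schema mapping column names to datachain SQL types.
schema = {"file__source": String, "file__last_modified": DateTime}

df = pd.DataFrame(
    {
        "file__source": ["s3://dogs"],
        "file__last_modified": [1714000000],  # epoch seconds, as stored in the export
    }
)

# The same conversion fix_columns performs: epoch seconds -> pandas datetimes.
for c in [c for c, t in schema.items() if t == DateTime]:
    df[c] = pd.to_datetime(df[c], unit="s")

print(df.dtypes)  # file__last_modified is now datetime64[ns]
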
6 changes: 4 additions & 2 deletions src/datachain/data_storage/schema.py
@@ -145,6 +145,8 @@ def query(self, q):


class DataTable:
MAX_RANDOM = 2**63 - 1

def __init__(
self,
name: str,
@@ -269,8 +271,8 @@ def update(self):
def delete(self):
return self.apply_conditions(self.table.delete())

@staticmethod
def sys_columns():
@classmethod
def sys_columns(cls):
return [
sa.Column("sys__id", Int, primary_key=True),
sa.Column(
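The new MAX_RANDOM constant pins the upper bound of sys__rand to 2^63 - 1 for every warehouse backend, and sys_columns becomes a classmethod, presumably so the column definitions can reference the class constant. A minimal sketch of the invariant it encodes (the randint draw is purely illustrative, not the warehouse implementation):

import random

from datachain.data_storage.schema import DataTable

assert DataTable.MAX_RANDOM == 2**63 - 1

# Illustrative only: draw values the way a backend conceptually would and
# check they stay inside the shared range [0, MAX_RANDOM].
values = [random.randint(0, DataTable.MAX_RANDOM) for _ in range(1_000)]
assert all(0 <= v <= DataTable.MAX_RANDOM for v in values)
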
31 changes: 31 additions & 0 deletions src/datachain/sql/types.py
@@ -187,6 +187,22 @@
return read_converter(dialect).int32(value)


class UInt32(Int):
def load_dialect_impl(self, dialect):
return converter(dialect).uint32()

@staticmethod
def default_value(dialect):
return type_defaults(dialect).uint32()

Codecov (codecov/patch): added line src/datachain/sql/types.py#L196 was not covered by tests.

@staticmethod
def db_default_value(dialect):
return db_defaults(dialect).uint32()

def on_read_convert(self, value, dialect):
return read_converter(dialect).uint32(value)


class Int64(Int):
def load_dialect_impl(self, dialect):
return converter(dialect).int64()
@@ -395,6 +411,9 @@
def int32(self, value):
return value

def uint32(self, value):
return value

def int64(self, value):
return value

@@ -421,6 +440,8 @@

def json(self, value):
if isinstance(value, str):
if value == "":
return {}

Codecov (codecov/patch): added line src/datachain/sql/types.py#L444 was not covered by tests.

return orjson.loads(value)
return value
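
The new empty-string guard matters because orjson.loads raises on empty input instead of returning an empty object. A standalone sketch of the intended read-side behaviour (read_json is a hypothetical helper mirroring the converter logic, not the library API):

import orjson


def read_json(value):
    # Mirrors the converter change: empty strings become an empty dict,
    # other strings are parsed as JSON, non-strings pass through unchanged.
    if isinstance(value, str):
        if value == "":
            return {}
        return orjson.loads(value)
    return value


assert read_json("") == {}
assert read_json('{"a": 1}') == {"a": 1}
assert read_json(None) is None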

@@ -446,6 +467,9 @@
def int32(self):
return self.int()

def uint32(self):
return self.int()

def int64(self):
return self.int()

@@ -487,6 +511,9 @@
def int32(self):
return None

def uint32(self):
return None

Codecov (codecov/patch): added line src/datachain/sql/types.py#L515 was not covered by tests.

def int64(self):
return None

@@ -528,6 +555,9 @@
def int32(self):
return self.int()

def uint32(self):
return self.int()

def int64(self):
return self.int()

@@ -561,6 +591,7 @@
Boolean,
Int,
Int32,
UInt32,
Int64,
UInt64,
Float,
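For context, a minimal sketch of how the new UInt32 type could appear in a table definition, following the same sa.Column pattern the diff uses for sys__id (the example table and the width column are hypothetical):

import sqlalchemy as sa

from datachain.sql.types import Int, UInt32

metadata = sa.MetaData()

# Hypothetical table: UInt32 slots in like the other datachain integer types,
# with the dialect-specific storage type resolved later via load_dialect_impl.
example = sa.Table(
    "example",
    metadata,
    sa.Column("sys__id", Int, primary_key=True),
    sa.Column("width", UInt32),
)

print(example.c.width.type)  # the column carries the UInt32 type
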
7 changes: 2 additions & 5 deletions tests/func/test_datasets.py
@@ -5,7 +5,7 @@
import pytest
import sqlalchemy as sa

from datachain.data_storage.sqlite import SQLiteWarehouse
from datachain.data_storage.schema import DataTable
from datachain.dataset import DatasetDependencyType, DatasetStatus
from datachain.error import (
DatasetInvalidVersionError,
@@ -827,10 +827,7 @@ def test_row_random(cloud_test_catalog):
# Random values are unique
assert len(set(random_values)) == len(random_values)

if isinstance(catalog.warehouse, SQLiteWarehouse):
RAND_MAX = 2**63 # noqa: N806
else:
RAND_MAX = 2**64 # noqa: N806
RAND_MAX = DataTable.MAX_RANDOM # noqa: N806
Contributor:

[Q] Won't this test fail when the number drawn by ClickHouse is > 2^63?

If my math is correct, the chance of that happening is:

((2^64) - (2^63)) / (2^64) = 0.5, or 50%.

@ilongin (Contributor, Author), Nov 6, 2024:

In ClickHouse we have an explicit limit of 2^63 - 1: https://github.com/iterative/studio/pull/10860/files#diff-1136c5032c82c12345c3103e18093d3256798daa2d4b13e5f5f2d5e8670ac32bR328

In general, we now actually know the range of that random number regardless of the DB implementation.


# Values are drawn uniformly from range(2**63)
assert 0 <= min(random_values) < 0.4 * RAND_MAX
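The arithmetic in the thread above holds: without an explicit cap, roughly half of all uint64 draws would land at or above 2^63, so pinning the bound in one place removes the backend-dependent range. A quick worked check of both the probability and the shared constant:

from datachain.data_storage.schema import DataTable

# Probability that an unbounded uint64 draw lands in the top half of the
# range, i.e. at or above 2**63.
p = (2**64 - 2**63) / 2**64
assert p == 0.5

# With the explicit ClickHouse-side cap, every backend agrees on the range.
assert DataTable.MAX_RANDOM == 2**63 - 1
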
10 changes: 4 additions & 6 deletions tests/func/test_pull.py
@@ -49,19 +49,17 @@ def _adapt_row(row):
"""
adapted = {}
for k, v in row.items():
if isinstance(v, str):
adapted[k] = v.encode("utf-8")
elif isinstance(v, datetime):
if isinstance(v, datetime):
adapted[k] = v.timestamp()
elif v is None:
adapted[k] = b""
adapted[k] = ""
else:
adapted[k] = v

adapted["sys__id"] = 1
adapted["sys__rand"] = 1
adapted["file__location"] = b""
adapted["file__source"] = b"s3://dogs"
adapted["file__location"] = ""
adapted["file__source"] = "s3://dogs"
return adapted

dog_entries = [_adapt_row(e) for e in dog_entries]
2 changes: 2 additions & 0 deletions tests/unit/lib/test_signal_schema.py
@@ -27,6 +27,7 @@
Int32,
Int64,
String,
UInt32,
UInt64,
)

@@ -721,6 +722,7 @@ def test_mutate_change_type():
[Boolean, bool],
[Int, int],
[Int32, int],
[UInt32, int],
[Int64, int],
[UInt64, int],
[Float, float],
2 changes: 2 additions & 0 deletions tests/unit/test_data_storage.py
@@ -18,6 +18,7 @@
Int32,
Int64,
String,
UInt32,
UInt64,
)
from tests.utils import (
@@ -173,6 +174,7 @@ def run_convert_type(value, sql_type):
[Boolean(), False],
[Int(), 0],
[Int32(), 0],
[UInt32(), 0],
[Int64(), 0],
[UInt64(), 0],
[Float(), lambda val: math.isnan(val)],