iterative · dberenbaum · Jul 22, 2024 · Jul 20, 2024 · dberenbaum · Jul 20, 2024
diff --git a/src/datachain/lib/arrow.py b/src/datachain/lib/arrow.py
@@ -4,6 +4,7 @@
 
 import pyarrow as pa
 from pyarrow.dataset import dataset
+from tqdm import tqdm
 
 from datachain.lib.file import File, IndexedFile
 from datachain.lib.udf import Generator
@@ -30,11 +31,13 @@ def process(self, file: File):
         path = file.get_path()
         ds = dataset(path, filesystem=file.get_fs(), schema=self.schema, **self.kwargs)
         index = 0
-        for record_batch in ds.to_batches():
-            for record in record_batch.to_pylist():
-                source = IndexedFile(file=file, index=index)
-                yield [source, *record.values()]
-                index += 1
+        with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
+            for record_batch in ds.to_batches():
+                for record in record_batch.to_pylist():
+                    source = IndexedFile(file=file, index=index)
+                    yield [source, *record.values()]
+                    index += 1
+                pbar.update(len(record_batch))  # one update per batch, not per row
 
 
 def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:

diff --git a/src/datachain/lib/file.py b/src/datachain/lib/file.py
@@ -185,10 +185,18 @@ def __init__(self, **kwargs):
         self._caching_enabled = False
 
     @contextmanager
-    def open(self):
-        """Stream in binary mode like `with file.open()`."""
+    def open(self, mode: Literal["rb", "r"] = "rb"):
+        """Open the file and return a file object.
+
+        Use as a context manager: `with file.open() as f`. With `mode="r"`
+        the stream from `client.open_object` is wrapped in
+        `io.TextIOWrapper`; `mode` is not consulted when `self.location` is set.
+        """
         if self.location:
-            with VFileRegistry.resolve(self, self.location) as f:
+            with VFileRegistry.resolve(self, self.location) as f:  # type: ignore[arg-type]
                 yield f
+            # A @contextmanager generator must yield exactly once, so stop
+            # here instead of falling through to the second yield below.
+            return
 
         uid = self.get_uid()
@@ -198,11 +198,20 @@ def open(self):
         with client.open_object(
             uid, use_cache=self._caching_enabled, cb=self._download_cb
         ) as f:
-            yield f
+            yield io.TextIOWrapper(f) if mode == "r" else f
 
-    def read(self):
+    def read(self, length: int = -1):
-        """Returns file contents."""
+        """Returns file contents; `length` is passed through to `stream.read`."""
         with self.open() as stream:
+            return stream.read(length)
+
+    def read_bytes(self):
+        """Returns the entire file contents as bytes, read through `read()`."""
+        return self.read()
+
+    def read_text(self):
+        """Returns the entire file contents as text, opened with `mode="r"`."""
+        with self.open(mode="r") as stream:
             return stream.read()
 
     def export(
@@ -308,9 +317,13 @@ class TextFile(File):
 
     @contextmanager
-    def open(self):
-        """Stream in text mode like `with file.open()`."""
-        with super().open() as binary:
-            yield io.TextIOWrapper(binary)
+    def open(self, mode: Literal["rb", "r"] = "r"):
+        """Open the file and return a file object, in text mode by default.
+
+        Accepts the same `mode` keyword as `File.open` so inherited helpers
+        that call `self.open(mode=...)`, such as `read_text`, keep working.
+        """
+        with super().open(mode=mode) as stream:
+            yield stream
 
 
 class ImageFile(File):

diff --git a/tests/unit/lib/test_file.py b/tests/unit/lib/test_file.py
@@ -220,3 +220,56 @@ def test_get_fs(catalog):
     file = File(name="file", parent="dir", source="file:///")
     file._catalog = catalog
     assert isinstance(file.get_fs(), LocalFileSystem)
+
+
+def test_open_mode(tmp_path, catalog: Catalog):
+    """`File.open(mode="r")` must hand back a stream that reads as text."""
+    name = "myfile"
+    expected = "this is a TexT data..."
+
+    target = tmp_path / name
+    target.write_text(expected)
+
+    text_file = File(name=name, source=f"file://{tmp_path}")
+    text_file._set_stream(catalog, True)
+    with text_file.open(mode="r") as reader:
+        assert reader.read() == expected
+
+
+def test_read_length(tmp_path, catalog):
+    """`File.read(length=n)` returns only the first `n` bytes of the file."""
+    name = "myfile"
+    blob = b"some\x00data\x00is\x48\x65\x6c\x57\x6f\x72\x6c\x64\xff\xffheRe"
+
+    target = tmp_path / name
+    target.write_bytes(blob)
+
+    binary_file = File(name=name, source=f"file://{tmp_path}")
+    binary_file._set_stream(catalog, False)
+    assert binary_file.read(length=4) == blob[:4]
+
+
+def test_read_bytes(tmp_path, catalog):
+    """`File.read_bytes()` returns the stored bytes unchanged."""
+    name = "myfile"
+    blob = b"some\x00data\x00is\x48\x65\x6c\x57\x6f\x72\x6c\x64\xff\xffheRe"
+
+    target = tmp_path / name
+    target.write_bytes(blob)
+
+    binary_file = File(name=name, source=f"file://{tmp_path}")
+    binary_file._set_stream(catalog, False)
+    assert binary_file.read_bytes() == blob
+
+
+def test_read_text(tmp_path, catalog):
+    """`File.read_text()` returns the stored contents as a string."""
+    name = "myfile"
+    expected = "this is a TexT data..."
+
+    target = tmp_path / name
+    target.write_text(expected)
+
+    text_file = File(name=name, source=f"file://{tmp_path}")
+    text_file._set_stream(catalog, True)
+    assert text_file.read_text() == expected