Adding DataChain.export_files(...) #30

Merged

merged 33 commits into main from ilongin/1721-datachain-export-files on Jul 18, 2024

Commits
facabdb
added logic to export files from chain
ilongin Jul 13, 2024
21e95d5
added logic to export files from chain
ilongin Jul 13, 2024
a47f219
fixing mypy
ilongin Jul 13, 2024
f382739
fixing progress bar
ilongin Jul 13, 2024
c8625cc
Merge branch 'main' into ilongin/1721-datachain-export-files
ilongin Jul 15, 2024
8ba52a7
added simplified method to export files
ilongin Jul 15, 2024
4f0b2df
Merge branch 'main' into ilongin/1721-datachain-export-files
ilongin Jul 16, 2024
9b0fca1
refactoring output directory generation
ilongin Jul 16, 2024
3217021
returning old fetcher classes and files
ilongin Jul 16, 2024
90c3ac1
returning old fetcher classes and files
ilongin Jul 16, 2024
86355dd
added file export strategy and distinct method in dataset query
ilongin Jul 16, 2024
f66cac3
adding tests
ilongin Jul 16, 2024
55b2253
fixing strategy type
ilongin Jul 16, 2024
bbba52d
merging with main
ilongin Jul 16, 2024
2285554
using posixpath
ilongin Jul 16, 2024
bb6fb50
added tests for export_files
ilongin Jul 16, 2024
505b1e7
added file fixtures
ilongin Jul 16, 2024
0a3fa78
merging with main
ilongin Jul 17, 2024
e82a48a
using fixtures in tests
ilongin Jul 17, 2024
e04d0dd
adding tests
ilongin Jul 17, 2024
6a9c43b
removed tests
ilongin Jul 17, 2024
406feb6
removed tests
ilongin Jul 17, 2024
4c91fdb
added print
ilongin Jul 17, 2024
e630077
better prints
ilongin Jul 17, 2024
175d40d
fixing tests
ilongin Jul 17, 2024
4775cb1
fixing tests
ilongin Jul 17, 2024
caaaa5f
removed only one test
ilongin Jul 17, 2024
a8ceb22
removed not needed function
ilongin Jul 17, 2024
35b47b5
merging with main
ilongin Jul 18, 2024
1b1f15a
added using cache
ilongin Jul 18, 2024
9459eb6
added using cache
ilongin Jul 18, 2024
cdd6a87
refactored tests and added print
ilongin Jul 18, 2024
8fffba0
renamed strategy to placement
ilongin Jul 18, 2024
17 changes: 17 additions & 0 deletions src/datachain/lib/dc.py
@@ -19,6 +19,7 @@
from datachain.lib.convert.values_to_tuples import values_to_tuples
from datachain.lib.data_model import DataType
from datachain.lib.dataset_info import DatasetInfo
from datachain.lib.file import ExportPlacement as FileExportPlacement
from datachain.lib.file import File, IndexedFile, get_file
from datachain.lib.meta_formats import read_meta, read_schema
from datachain.lib.model_store import ModelStore
@@ -1009,3 +1010,19 @@ def setup(self, **kwargs) -> "Self":

        self._setup = self._setup | kwargs
        return self

    def export_files(
        self,
        output: str,
        signal="file",
        placement: FileExportPlacement = "fullpath",
        use_cache: bool = True,
    ) -> None:
        """Export all files from the chain into the given output directory."""
        if placement == "filename":
            print("Checking if file names are unique")
            if self.select(f"{signal}.name").distinct().count() != self.count():
                raise ValueError("Files with the same name found")

        for file in self.collect_one(signal):
            file.export(output, placement, use_cache)  # type: ignore[union-attr]
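
For orientation, a minimal usage sketch of the new method (the bucket URI and output directories below are illustrative, not taken from this PR):

from datachain.lib.dc import DataChain

chain = DataChain.from_storage("s3://my-bucket/photos/")

# Copy every file in the chain into ./export, preserving each file's full source path.
chain.export_files("export", placement="fullpath")

# Or flatten to bare file names; this raises ValueError if two files share a name.
chain.export_files("export_flat", placement="filename")
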
43 changes: 43 additions & 0 deletions src/datachain/lib/file.py
@@ -1,5 +1,7 @@
import io
import json
import os
import posixpath
from abc import ABC, abstractmethod
from contextlib import contextmanager
from datetime import datetime
@@ -24,6 +26,9 @@
if TYPE_CHECKING:
    from datachain.catalog import Catalog

# how to create file path when exporting
ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]


class VFileError(DataChainError):
    def __init__(self, file: "File", message: str, vtype: str = ""):

@@ -186,6 +191,21 @@ def open(self):
        ) as f:
            yield f

    def export(
        self,
        output: str,
        placement: ExportPlacement = "fullpath",
        use_cache: bool = True,
    ) -> None:
        if use_cache:
            self._caching_enabled = use_cache
        dst = self.get_destination_path(output, placement)
        dst_dir = os.path.dirname(dst)
        os.makedirs(dst_dir, exist_ok=True)

        with open(dst, mode="wb") as f:
            f.write(self.read())

    def _set_stream(
        self,
        catalog: "Catalog",

@@ -233,6 +253,29 @@ def get_path(self) -> str:
        path = url2pathname(path)
        return path

    def get_destination_path(self, output: str, placement: ExportPlacement) -> str:
        """
        Returns the full destination path of a file for exporting to the given
        output directory, based on the export placement.
        """
        if placement == "filename":
            path = unquote(self.name)
        elif placement == "etag":
            path = f"{self.etag}{self.get_file_suffix()}"
        elif placement == "fullpath":
            fs = self.get_fs()
            if isinstance(fs, LocalFileSystem):
                path = unquote(self.get_full_name())
            else:
                path = (
                    Path(urlparse(self.source).netloc) / unquote(self.get_full_name())
                ).as_posix()
        elif placement == "checksum":
            raise NotImplementedError("Checksum placement not implemented yet")
        else:
            raise ValueError(f"Unsupported file export placement: {placement}")
        return posixpath.join(output, path)  # type: ignore[union-attr]

    def get_fs(self):
        return self._catalog.get_client(self.source).fs

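To make the placement options concrete, here is a small illustration of get_destination_path (the file name and etag are made up; it mirrors the unit tests added further down in this PR):

from datachain.lib.file import File

f = File(
    name="report.pdf",
    parent="dir1/dir2",
    source="s3://my-bucket",
    etag="0123456789abcdef",
)
assert f.get_destination_path("output", "filename") == "output/report.pdf"
assert f.get_destination_path("output", "etag") == "output/0123456789abcdef.pdf"
# "fullpath" would also prefix the bucket name ("output/my-bucket/dir1/dir2/report.pdf"),
# but it needs a catalog attached via _set_stream, so it is exercised in the unit tests below.
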
12 changes: 12 additions & 0 deletions src/datachain/query/dataset.py
@@ -865,6 +865,12 @@ def apply_sql_clause(self, query):
        return sqlalchemy.select(f.count(1)).select_from(query.subquery())


@frozen
class SQLDistinct(SQLClause):
    def apply_sql_clause(self, query):
        return query.distinct()


@frozen
class SQLUnion(Step):
    query1: "DatasetQuery"

@@ -1407,6 +1413,12 @@ def offset(self, offset: int) -> "Self":
        query.steps.append(SQLOffset(offset))
        return query

    @detach
    def distinct(self) -> "Self":
Contributor commented:

This signature is not comprehensive.

The main use case for distinct() in the datasets is removal of duplicate entries - for that, the function should take a signal (or a list of signals) as an argument.

Member commented:

Right! @ilongin, could you please implement this as a follow-up issue?

Member commented:

Created #89

Contributor Author commented:

Yes, I will create a follow-up issue. It seems we need something like PostgreSQL's DISTINCT ON, which is not available in SQLite (it only has a "normal" DISTINCT that returns unique columns), so we will probably need to implement it with a GROUP BY or something similar under the hood.

Member @dmpetrov commented on Jul 19, 2024:

    if self.select(f"{signal}.name").distinct().count() != self.count():
        raise ValueError("Files with the same name found")

This statement might not be ideal for two reasons:

  1. There might be an issue if the original dataset contains duplicates (we cannot guarantee it does not).
  2. It calls count() twice.

A GROUP BY with a count seems like the right way to solve this, not DISTINCT.
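
As a rough sketch of that suggestion (not part of this PR; the table and column names are hypothetical), the uniqueness check could be a single GROUP BY ... HAVING query instead of comparing two counts:

import sqlalchemy as sa

def has_duplicate_names(conn, files_table):
    # files_table is assumed to be a SQLAlchemy Table with a "name" column; any name
    # occurring more than once means the "filename" placement would overwrite files.
    query = (
        sa.select(files_table.c.name)
        .group_by(files_table.c.name)
        .having(sa.func.count() > 1)
        .limit(1)
    )
    return conn.execute(query).first() is not None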

        query = self.clone()
        query.steps.append(SQLDistinct())
        return query

    def as_scalar(self) -> Any:
        with self.as_iterable() as rows:
            row = next(iter(rows))
57 changes: 57 additions & 0 deletions tests/func/test_datachain.py
@@ -1,3 +1,5 @@
import os

import pytest

from datachain.lib.dc import DataChain
@@ -56,3 +58,58 @@ def test_read_file(cloud_test_catalog, use_cache):
    assert file.get_local_path() is None
    file.read()
    assert bool(file.get_local_path()) is use_cache


@pytest.mark.parametrize("placement", ["fullpath", "filename"])
@pytest.mark.parametrize("use_map", [True, False])
@pytest.mark.parametrize("use_cache", [True, False])
@pytest.mark.parametrize("cloud_type", ["file"], indirect=True)
def test_export_files(tmp_dir, cloud_test_catalog, placement, use_map, use_cache):
ctc = cloud_test_catalog
df = DataChain.from_storage(ctc.src_uri)
if use_map:
df.export_files(tmp_dir / "output", placement=placement, use_cache=use_cache)
df.map(
res=lambda file: file.export(
tmp_dir / "output", placement=placement, use_cache=use_cache
)
).exec()
else:
df.export_files(tmp_dir / "output", placement=placement)

expected = {
"description": "Cats and Dogs",
"cat1": "meow",
"cat2": "mrow",
"dog1": "woof",
"dog2": "arf",
"dog3": "bark",
"dog4": "ruff",
}

for file in df.collect_one("file"):
if placement == "filename":
file_path = file.name
else:
file_path = file.get_full_name()
with open(tmp_dir / "output" / file_path) as f:
assert f.read() == expected[file.name]


def test_export_files_filename_placement_not_unique_files(tmp_dir, catalog):
data = b"some\x00data\x00is\x48\x65\x6c\x57\x6f\x72\x6c\x64\xff\xffheRe"
bucket_name = "mybucket"
files = ["dir1/a.json", "dir1/dir2/a.json"]

# create bucket dir with duplicate file names
bucket_dir = tmp_dir / bucket_name
bucket_dir.mkdir(parents=True)
for file_path in files:
file_path = bucket_dir / file_path
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, "wb") as fd:
fd.write(data)

df = DataChain.from_storage((tmp_dir / bucket_name).as_uri())
with pytest.raises(ValueError):
df.export_files(tmp_dir / "output", placement="filename")
34 changes: 34 additions & 0 deletions tests/func/test_dataset_query.py
@@ -434,6 +434,40 @@ def test_select_except(cloud_test_catalog):
    ]


@pytest.mark.parametrize(
    "cloud_type,version_aware",

Member commented:

Can we check distinct without File? It should not touch other parts if there is no need.

Member commented:

I'd try to distinct on a list of integers...

Contributor Author commented:

As we discussed, I would leave this for a separate issue, as there are multiple tests in this file that could be refactored in this way.

    [("s3", True)],
    indirect=True,
)
def test_distinct(cloud_test_catalog):
    catalog = cloud_test_catalog.catalog
    path = cloud_test_catalog.src_uri
    ds = DatasetQuery(path=path, catalog=catalog)

    q = ds.select(C.parent).order_by(C.parent).distinct()
    result = q.results()

    assert result == [
        ("",),
        ("cats",),
        ("dogs",),
        ("dogs/others",),
    ]


@pytest.mark.parametrize(
    "cloud_type,version_aware",
    [("s3", True)],
    indirect=True,
)
def test_distinct_count(cloud_test_catalog):
    catalog = cloud_test_catalog.catalog
    path = cloud_test_catalog.src_uri
    ds = DatasetQuery(path=path, catalog=catalog)

    assert ds.select(C.parent).order_by(C.parent).distinct().count() == 4


@pytest.mark.parametrize("save", [True, False])
@pytest.mark.parametrize(
    "cloud_type,version_aware",
50 changes: 50 additions & 0 deletions tests/unit/lib/test_file.py
@@ -8,6 +8,15 @@
from datachain.lib.file import File, TextFile


def create_file(source: str):
    return File(
        name="test.txt",
        parent="dir1/dir2",
        source=source,
        etag="ed779276108738fdb2179ccabf9680d9",
    )


def test_uid_missing_location():
    name = "my_name"
    vtype = "vt1"

@@ -65,6 +74,47 @@ def test_cache_get_path(catalog: Catalog):
    assert f.read() == data


def test_get_destination_path_wrong_strategy():
    file = create_file("s3://mybkt")
    with pytest.raises(ValueError):
        file.get_destination_path("", "wrong")


def test_get_destination_path_filename_strategy():
    file = create_file("s3://mybkt")
    assert file.get_destination_path("output", "filename") == "output/test.txt"


def test_get_destination_path_empty_output():
    file = create_file("s3://mybkt")
    assert file.get_destination_path("", "filename") == "test.txt"


def test_get_destination_path_etag_strategy():
    file = create_file("s3://mybkt")
    assert (
        file.get_destination_path("output", "etag")
        == "output/ed779276108738fdb2179ccabf9680d9.txt"
    )


def test_get_destination_path_fullpath_strategy(catalog):
    file = create_file("s3://mybkt")
    file._set_stream(catalog, False)
    assert (
        file.get_destination_path("output", "fullpath")
        == "output/mybkt/dir1/dir2/test.txt"
    )


def test_get_destination_path_fullpath_strategy_file_source(catalog, tmp_path):
    file = create_file("file:///")
    file._set_stream(catalog, False)
    assert (
        file.get_destination_path("output", "fullpath") == "output/dir1/dir2/test.txt"
    )


def test_read_binary_data(tmp_path, catalog: Catalog):
    file_name = "myfile"
    data = b"some\x00data\x00is\x48\x65\x6c\x57\x6f\x72\x6c\x64\xff\xffheRe"