From 403c7946fce888be1230b15d7c3abd7645ca7e49 Mon Sep 17 00:00:00 2001 From: shibuiwilliam Date: Sat, 7 Dec 2024 02:19:51 +0900 Subject: [PATCH 01/10] add some missing type hint (#648) * add some missing type hint * Move import under TYPE_CHECKING --------- Co-authored-by: Vladimir Rudnykh --- src/datachain/client/fsspec.py | 6 ++++-- src/datachain/client/local.py | 13 +++++++++---- src/datachain/lib/convert/flatten.py | 12 +++++++----- src/datachain/lib/convert/unflatten.py | 4 ++-- src/datachain/lib/convert/values_to_tuples.py | 2 +- src/datachain/lib/utils.py | 2 +- src/datachain/query/dataset.py | 2 +- 7 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/datachain/client/fsspec.py b/src/datachain/client/fsspec.py index 91774e381..b03abae12 100644 --- a/src/datachain/client/fsspec.py +++ b/src/datachain/client/fsspec.py @@ -172,7 +172,7 @@ def is_root_url(cls, url) -> bool: return url == cls.PREFIX @classmethod - def get_uri(cls, name) -> "StorageURI": + def get_uri(cls, name: str) -> "StorageURI": from datachain.dataset import StorageURI return StorageURI(f"{cls.PREFIX}{name}") @@ -278,7 +278,9 @@ async def _fetch_default( ) -> None: await self._fetch_nested(start_prefix, result_queue) - async def _fetch_dir(self, prefix, pbar, result_queue: ResultQueue) -> set[str]: + async def _fetch_dir( + self, prefix: str, pbar, result_queue: ResultQueue + ) -> set[str]: path = f"{self.name}/{prefix}" infos = await self.ls_dir(path) files = [] diff --git a/src/datachain/client/local.py b/src/datachain/client/local.py index d76c53681..d654ad600 100644 --- a/src/datachain/client/local.py +++ b/src/datachain/client/local.py @@ -12,6 +12,7 @@ from .fsspec import Client if TYPE_CHECKING: + from datachain.cache import DataChainCache from datachain.dataset import StorageURI @@ -21,7 +22,11 @@ class FileClient(Client): protocol = "file" def __init__( - self, name: str, fs_kwargs: dict[str, Any], cache, use_symlinks: bool = False + self, + name: str, + fs_kwargs: 
dict[str, Any], + cache: "DataChainCache", + use_symlinks: bool = False, ) -> None: super().__init__(name, fs_kwargs, cache) self.use_symlinks = use_symlinks @@ -30,7 +35,7 @@ def url(self, path: str, expires: int = 3600, **kwargs) -> str: raise TypeError("Signed urls are not implemented for local file system") @classmethod - def get_uri(cls, name) -> "StorageURI": + def get_uri(cls, name: str) -> "StorageURI": from datachain.dataset import StorageURI return StorageURI(f'{cls.PREFIX}/{name.removeprefix("/")}') @@ -77,7 +82,7 @@ def split_url(cls, url: str) -> tuple[str, str]: return bucket, path @classmethod - def from_name(cls, name: str, cache, kwargs) -> "FileClient": + def from_name(cls, name: str, cache: "DataChainCache", kwargs) -> "FileClient": use_symlinks = kwargs.pop("use_symlinks", False) return cls(name, kwargs, cache, use_symlinks=use_symlinks) @@ -85,7 +90,7 @@ def from_name(cls, name: str, cache, kwargs) -> "FileClient": def from_source( cls, uri: str, - cache, + cache: "DataChainCache", use_symlinks: bool = False, **kwargs, ) -> "FileClient": diff --git a/src/datachain/lib/convert/flatten.py b/src/datachain/lib/convert/flatten.py index 7b73c5e2d..a35df07d9 100644 --- a/src/datachain/lib/convert/flatten.py +++ b/src/datachain/lib/convert/flatten.py @@ -1,19 +1,21 @@ +from collections.abc import Generator + from pydantic import BaseModel from datachain.lib.model_store import ModelStore -def flatten(obj: BaseModel): +def flatten(obj: BaseModel) -> tuple: return tuple(_flatten_fields_values(obj.model_fields, obj)) -def flatten_list(obj_list): +def flatten_list(obj_list: list[BaseModel]) -> tuple: return tuple( val for obj in obj_list for val in _flatten_fields_values(obj.model_fields, obj) ) -def _flatten_list_field(value: list): +def _flatten_list_field(value: list) -> list: assert isinstance(value, list) if value and ModelStore.is_pydantic(type(value[0])): return [val.model_dump() for val in value] @@ -22,7 +24,7 @@ def _flatten_list_field(value: 
list): return value -def _flatten_fields_values(fields, obj: BaseModel): +def _flatten_fields_values(fields: dict, obj: BaseModel) -> Generator: for name, f_info in fields.items(): anno = f_info.annotation # Optimization: Access attributes directly to skip the model_dump() call. @@ -40,5 +42,5 @@ def _flatten_fields_values(fields, obj: BaseModel): yield value -def _flatten(obj): +def _flatten(obj: BaseModel) -> tuple: return tuple(_flatten_fields_values(obj.model_fields, obj)) diff --git a/src/datachain/lib/convert/unflatten.py b/src/datachain/lib/convert/unflatten.py index 2a38959d8..a41a4b65a 100644 --- a/src/datachain/lib/convert/unflatten.py +++ b/src/datachain/lib/convert/unflatten.py @@ -9,12 +9,12 @@ from datachain.query.schema import DEFAULT_DELIMITER -def unflatten_to_json(model: type[BaseModel], row: Sequence[Any], pos=0) -> dict: +def unflatten_to_json(model: type[BaseModel], row: Sequence[Any], pos: int = 0) -> dict: return unflatten_to_json_pos(model, row, pos)[0] def unflatten_to_json_pos( - model: type[BaseModel], row: Sequence[Any], pos=0 + model: type[BaseModel], row: Sequence[Any], pos: int = 0 ) -> tuple[dict, int]: res = {} for name, f_info in model.model_fields.items(): diff --git a/src/datachain/lib/convert/values_to_tuples.py b/src/datachain/lib/convert/values_to_tuples.py index c75f77da7..f92206912 100644 --- a/src/datachain/lib/convert/values_to_tuples.py +++ b/src/datachain/lib/convert/values_to_tuples.py @@ -11,7 +11,7 @@ class ValuesToTupleError(DataChainParamsError): - def __init__(self, ds_name, msg): + def __init__(self, ds_name: str, msg: str): if ds_name: ds_name = f"' {ds_name}'" super().__init__(f"Cannot convert signals for dataset{ds_name}: {msg}") diff --git a/src/datachain/lib/utils.py b/src/datachain/lib/utils.py index c7d90e65a..7865ac70b 100644 --- a/src/datachain/lib/utils.py +++ b/src/datachain/lib/utils.py @@ -28,7 +28,7 @@ def __init__(self, message): class DataChainColumnError(DataChainParamsError): - def __init__(self, 
col_name, msg): + def __init__(self, col_name: str, msg: str): super().__init__(f"Error for column {col_name}: {msg}") diff --git a/src/datachain/query/dataset.py b/src/datachain/query/dataset.py index effc24caf..46058ba83 100644 --- a/src/datachain/query/dataset.py +++ b/src/datachain/query/dataset.py @@ -215,7 +215,7 @@ def query( Should return select query that calculates desired diff between dataset queries """ - def apply(self, query_generator, temp_tables: list[str]): + def apply(self, query_generator, temp_tables: list[str]) -> "StepResult": source_query = query_generator.exclude(("sys__id",)) target_query = self.dq.apply_steps().select() temp_tables.extend(self.dq.temp_table_names) From e881c1fef3724a590c1da43c9fc253b2200d1d38 Mon Sep 17 00:00:00 2001 From: Ivan Shcheklein Date: Fri, 6 Dec 2024 17:01:50 -0800 Subject: [PATCH 02/10] update readme - lower use cases to the ground (#668) * update readme - lower use cases to the ground * lint --- README.rst | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index 8cfd74d35..d2025b29d 100644 --- a/README.rst +++ b/README.rst @@ -28,16 +28,20 @@ in an internal database for easy and efficient querying. Use Cases ========= -1. **Multimodal Dataset Preparation and Curation**: ideal for organizing and - refining data in pre-training, finetuning or LLM evaluating stages. -2. **GenAI Data Analytics**: Enables advanced analytics for multimodal data and - ad-hoc analytics using LLMs. +1. **ETL.** Pythonic framework for describing and running unstructured data transformations + and enrichments, applying models to data, including LLMs. +2. **Analytics.** DataChain dataset is a table that combines all the information about data + objects in one place + it provides dataframe-like API and vecrorized engine to do analytics + on these tables at scale. +3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC). 
+ Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs. + Key Features ============ 📂 **Multimodal Dataset Versioning.** - - Version unstructured data without redundant data copies, by supporting + - Version unstructured data without moving or creating data copies, by supporting references to S3, GCP, Azure, and local file systems. - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc. - Unite files and metadata together into persistent, versioned, columnar datasets. @@ -145,7 +149,7 @@ detected are then copied to the local directory. LLM judging chatbots -============================= +==================== LLMs can work as universal classifiers. In the example below, we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free @@ -255,7 +259,7 @@ Output: Iterating over Python data structures -============================================= +===================================== In the previous examples, datasets were saved in the embedded database (`SQLite`_ in folder `.datachain` of the working directory). @@ -343,7 +347,7 @@ DataChain Studio Platform - **Centralized dataset registry** to manage data, code and dependency dependencies in one place. -- **Data Lineage** for data sources as well as direvative dataset. +- **Data Lineage** for data sources as well as derivative dataset. - **UI for Multimodal Data** like images, videos, and PDFs. - **Scalable Compute** to handle large datasets (100M+ files) and in-house AI model inference. @@ -368,7 +372,7 @@ To learn more, see the `Contributor Guide`_. 
Community and Support --------------------- -* `Docs `_ +* `Docs `_ * `File an issue`_ if you encounter any problems * `Discord Chat `_ * `Email `_ From d4cd7a8681e06f38be56c4b7f93038ac46647652 Mon Sep 17 00:00:00 2001 From: Vladimir Rudnykh Date: Mon, 9 Dec 2024 08:56:26 +0700 Subject: [PATCH 03/10] Add 'bit_hamming_distance' and 'byte_hamming_distance' SQLite functions (#669) --- src/datachain/func/__init__.py | 5 ++- src/datachain/func/numeric.py | 46 +++++++++++++++++++ src/datachain/func/string.py | 46 +++++++++++++++++++ src/datachain/sql/functions/numeric.py | 12 +++++ src/datachain/sql/functions/string.py | 12 +++++ src/datachain/sql/sqlite/base.py | 40 +++++++++++++++++ tests/unit/test_func.py | 62 +++++++++++++++++++++++++- 7 files changed, 220 insertions(+), 3 deletions(-) diff --git a/src/datachain/func/__init__.py b/src/datachain/func/__init__.py index cfbbedea0..7e77d770b 100644 --- a/src/datachain/func/__init__.py +++ b/src/datachain/func/__init__.py @@ -17,8 +17,9 @@ ) from .array import cosine_distance, euclidean_distance, length, sip_hash_64 from .conditional import greatest, least -from .numeric import bit_and, bit_or, bit_xor, int_hash_64 +from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64 from .random import rand +from .string import byte_hamming_distance from .window import window __all__ = [ @@ -26,8 +27,10 @@ "array", "avg", "bit_and", + "bit_hamming_distance", "bit_or", "bit_xor", + "byte_hamming_distance", "case", "collect", "concat", diff --git a/src/datachain/func/numeric.py b/src/datachain/func/numeric.py index a26e7ff51..7ae60460f 100644 --- a/src/datachain/func/numeric.py +++ b/src/datachain/func/numeric.py @@ -160,3 +160,49 @@ def int_hash_64(col: Union[ColT, int]) -> Func: return Func( "int_hash_64", inner=numeric.int_hash_64, cols=cols, args=args, result_type=int ) + + +def bit_hamming_distance(*args: Union[ColT, int]) -> Func: + """ + Computes the Hamming distance between the bit representations of 
two integer values. + + The Hamming distance is the number of positions at which the corresponding bits + are different. This function returns the dissimilarity between the integers, + where 0 indicates identical integers and values closer to the number of bits + in the integer indicate higher dissimilarity. + + Args: + args (str | int): Two integers to compute the Hamming distance between. + If a str is provided, it is assumed to be the name of the column. + If an int is provided, it is assumed to be an integer literal. + + Returns: + Func: A Func object that represents the Hamming distance function. + + Example: + ```py + dc.mutate( + ham_dist=func.bit_hamming_distance("embed1", 123456), + ) + ``` + + Notes: + - Result column will always be of type int. + """ + cols, func_args = [], [] + for arg in args: + if isinstance(arg, int): + func_args.append(arg) + else: + cols.append(arg) + + if len(cols) + len(func_args) != 2: + raise ValueError("bit_hamming_distance() requires exactly two arguments") + + return Func( + "bit_hamming_distance", + inner=numeric.bit_hamming_distance, + cols=cols, + args=func_args, + result_type=int, + ) diff --git a/src/datachain/func/string.py b/src/datachain/func/string.py index 33aa5a22d..448c7f447 100644 --- a/src/datachain/func/string.py +++ b/src/datachain/func/string.py @@ -152,3 +152,49 @@ def inner(arg): args = None return Func("regexp_replace", inner=inner, cols=cols, args=args, result_type=str) + + +def byte_hamming_distance(*args: Union[str, Func]) -> Func: + """ + Computes the Hamming distance between two strings. + + The Hamming distance is the number of positions at which the corresponding + characters are different. This function returns the dissimilarity between + the strings, where 0 indicates identical strings and values closer to the length + of the strings indicate higher dissimilarity. + + Args: + args (str | literal): Two strings to compute the Hamming distance between. 
+ If a str is provided, it is assumed to be the name of the column. + If a Literal is provided, it is assumed to be a string literal. + + Returns: + Func: A Func object that represents the Hamming distance function. + + Example: + ```py + dc.mutate( + ham_dist=func.byte_hamming_distance("file.phash", literal("hello")), + ) + ``` + + Notes: + - Result column will always be of type int. + """ + cols, func_args = [], [] + for arg in args: + if get_origin(arg) is literal: + func_args.append(arg) + else: + cols.append(arg) + + if len(cols) + len(func_args) != 2: + raise ValueError("byte_hamming_distance() requires exactly two arguments") + + return Func( + "byte_hamming_distance", + inner=string.byte_hamming_distance, + cols=cols, + args=func_args, + result_type=int, + ) diff --git a/src/datachain/sql/functions/numeric.py b/src/datachain/sql/functions/numeric.py index 2a7a82d6c..968eecc8d 100644 --- a/src/datachain/sql/functions/numeric.py +++ b/src/datachain/sql/functions/numeric.py @@ -35,9 +35,21 @@ class int_hash_64(GenericFunction): # noqa: N801 inherit_cache = True +class bit_hamming_distance(GenericFunction): # noqa: N801 + """ + Returns the Hamming distance between two integers. + """ + + type = Int64() + package = "numeric" + name = "hamming_distance" + inherit_cache = True + + compiler_not_implemented(bit_and) compiler_not_implemented(bit_or) compiler_not_implemented(bit_xor) compiler_not_implemented(bit_rshift) compiler_not_implemented(bit_lshift) compiler_not_implemented(int_hash_64) +compiler_not_implemented(bit_hamming_distance) diff --git a/src/datachain/sql/functions/string.py b/src/datachain/sql/functions/string.py index 4ccee8444..545c706a9 100644 --- a/src/datachain/sql/functions/string.py +++ b/src/datachain/sql/functions/string.py @@ -48,7 +48,19 @@ class replace(GenericFunction): # noqa: N801 inherit_cache = True +class byte_hamming_distance(GenericFunction): # noqa: N801 + """ + Returns the Hamming distance between two strings. 
+ """ + + type = Int64() + package = "string" + name = "hamming_distance" + inherit_cache = True + + compiler_not_implemented(length) compiler_not_implemented(split) compiler_not_implemented(regexp_replace) compiler_not_implemented(replace) +compiler_not_implemented(byte_hamming_distance) diff --git a/src/datachain/sql/sqlite/base.py b/src/datachain/sql/sqlite/base.py index d5589cf85..552d39efd 100644 --- a/src/datachain/sql/sqlite/base.py +++ b/src/datachain/sql/sqlite/base.py @@ -90,6 +90,7 @@ def setup(): compiles(string.split, "sqlite")(compile_string_split) compiles(string.regexp_replace, "sqlite")(compile_string_regexp_replace) compiles(string.replace, "sqlite")(compile_string_replace) + compiles(string.byte_hamming_distance, "sqlite")(compile_byte_hamming_distance) compiles(conditional.greatest, "sqlite")(compile_greatest) compiles(conditional.least, "sqlite")(compile_least) compiles(Values, "sqlite")(compile_values) @@ -104,6 +105,7 @@ def setup(): compiles(numeric.bit_rshift, "sqlite")(compile_bitwise_rshift) compiles(numeric.bit_lshift, "sqlite")(compile_bitwise_lshift) compiles(numeric.int_hash_64, "sqlite")(compile_int_hash_64) + compiles(numeric.bit_hamming_distance, "sqlite")(compile_bit_hamming_distance) if load_usearch_extension(sqlite3.connect(":memory:")): compiles(array.cosine_distance, "sqlite")(compile_cosine_distance_ext) @@ -191,6 +193,26 @@ def sqlite_int_hash_64(x: int) -> int: return x if x < 1 << 63 else (x & MAX_INT64) - (1 << 64) +def sqlite_bit_hamming_distance(a: int, b: int) -> int: + """Calculate the Hamming distance between two integers.""" + diff = (a & MAX_INT64) ^ (b & MAX_INT64) + if hasattr(diff, "bit_count"): + return diff.bit_count() + return bin(diff).count("1") + + +def sqlite_byte_hamming_distance(a: str, b: str) -> int: + """Calculate the Hamming distance between two strings.""" + diff = 0 + if len(a) < len(b): + diff = len(b) - len(a) + b = b[: len(a)] + elif len(b) < len(a): + diff = len(a) - len(b) + a = a[: len(b)] + 
return diff + sum(c1 != c2 for c1, c2 in zip(a, b)) + + def register_user_defined_sql_functions() -> None: # Register optional functions if we have the necessary dependencies # and otherwise register functions that will raise an exception with @@ -225,6 +247,9 @@ def create_numeric_functions(conn): "bitwise_lshift", 2, lambda a, b: a << b, deterministic=True ) conn.create_function("int_hash_64", 1, sqlite_int_hash_64, deterministic=True) + conn.create_function( + "bit_hamming_distance", 2, sqlite_bit_hamming_distance, deterministic=True + ) _registered_function_creators["numeric_functions"] = create_numeric_functions @@ -237,6 +262,9 @@ def create_string_functions(conn): conn.create_function( "regexp_replace", 3, sqlite_regexp_replace, deterministic=True ) + conn.create_function( + "byte_hamming_distance", 2, sqlite_byte_hamming_distance, deterministic=True + ) _registered_function_creators["string_functions"] = create_string_functions @@ -383,6 +411,18 @@ def compile_int_hash_64(element, compiler, **kwargs): return compiler.process(func.int_hash_64(*element.clauses.clauses), **kwargs) +def compile_bit_hamming_distance(element, compiler, **kwargs): + return compiler.process( + func.bit_hamming_distance(*element.clauses.clauses), **kwargs + ) + + +def compile_byte_hamming_distance(element, compiler, **kwargs): + return compiler.process( + func.byte_hamming_distance(*element.clauses.clauses), **kwargs + ) + + def py_json_array_length(arr): return len(orjson.loads(arr)) diff --git a/tests/unit/test_func.py b/tests/unit/test_func.py index 7ca4d3dde..fc9e93455 100644 --- a/tests/unit/test_func.py +++ b/tests/unit/test_func.py @@ -2,11 +2,20 @@ from sqlalchemy import Label from datachain import DataChain -from datachain.func import int_hash_64 +from datachain.func import ( + bit_hamming_distance, + byte_hamming_distance, + int_hash_64, + literal, +) from datachain.func.random import rand from datachain.func.string import length as strlen from datachain.lib.signal_schema 
import SignalSchema -from datachain.sql.sqlite.base import sqlite_int_hash_64 +from datachain.sql.sqlite.base import ( + sqlite_bit_hamming_distance, + sqlite_byte_hamming_distance, + sqlite_int_hash_64, +) @pytest.fixture() @@ -584,3 +593,52 @@ def test_int_hash_64_mutate(dc): 7766709361750702608, 15228578409069794350, ] + + +@pytest.mark.parametrize( + "value1,value2,distance", + [ + [0, 0, 0], + [0, 1, 1], + [2, 3, 1], + [2, 4, 2], + [0, 2**64 - 1, 64], + [-(2**63), 2**63 - 1, 64], + [-(2**63), 2**63, 0], + ], +) +def test_sqlite_bit_hamming_distance(value1, value2, distance): + assert sqlite_bit_hamming_distance(value1, value2) == distance + + +def test_bit_hamming_distance_mutate(dc): + res = ( + dc.mutate(test=bit_hamming_distance(strlen("val"), 5)) + .order_by("num") + .collect("test") + ) + assert list(res) == [1, 3, 2, 1, 0] + + +@pytest.mark.parametrize( + "value1,value2,distance", + [ + ["", "", 0], + ["", "a", 1], + ["foo", "foo", 0], + ["foo", "bar", 3], + ["foo", "foobar", 3], + ["karolin", "kathrin", 3], + ], +) +def test_sqlite_byte_hamming_distance(value1, value2, distance): + assert sqlite_byte_hamming_distance(value1, value2) == distance + + +def test_byte_hamming_distance_mutate(dc): + res = ( + dc.mutate(test=byte_hamming_distance("val", literal("xxx"))) + .order_by("num") + .collect("test") + ) + assert list(res) == [2, 1, 0, 1, 2] From 2c8226ac23b4af0505c1d33931bcb96524668b9e Mon Sep 17 00:00:00 2001 From: Thomas Kunwar <20840228+yathomasi@users.noreply.github.com> Date: Mon, 9 Dec 2024 11:35:51 +0545 Subject: [PATCH 04/10] feat: update documentation and enhance navigation structure (#670) * feat: update documentation and enhance navigation structure * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * chore: minor updates and fixes * chore: update readme (#672) * chore: update readme * chore: minor update --------- Co-authored-by: pre-commit-ci[bot] 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CONTRIBUTING.rst | 129 ------------- README.rst | 317 ++---------------------------- docs/contributing.md | 111 +++++++++++ docs/examples.md | 289 ++++++++++++++++++++++++++++ docs/index.md | 405 ++++++++++----------------------------- docs/quick-start.md | 286 +++++++++++++++++++++++++++ docs/references/index.md | 14 +- docs/tutorials.md | 5 + mkdocs.yml | 15 +- 9 files changed, 829 insertions(+), 742 deletions(-) delete mode 100644 CONTRIBUTING.rst create mode 100644 docs/contributing.md create mode 100644 docs/examples.md create mode 100644 docs/quick-start.md create mode 100644 docs/tutorials.md diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst deleted file mode 100644 index e20a29e09..000000000 --- a/CONTRIBUTING.rst +++ /dev/null @@ -1,129 +0,0 @@ -Contributor Guide -================= - -Thank you for your interest in improving this project. -This project is open-source under the `Apache 2.0 license`_ and -welcomes contributions in the form of bug reports, feature requests, and pull requests. - -Here is a list of important resources for contributors: - -- `Source Code`_ -- `Documentation`_ -- `Issue Tracker`_ -- `Code of Conduct`_ - -.. _Apache 2.0 license: https://opensource.org/licenses/Apache-2.0 -.. _Source Code: https://github.com/iterative/datachain -.. _Documentation: https://docs.dvc.ai/datachain -.. _Issue Tracker: https://github.com/iterative/datachain/issues - -How to report a bug -------------------- - -Report bugs on the `Issue Tracker`_. - -When filing an issue, make sure to answer these questions: - -- Which operating system and Python version are you using? -- Which version of this project are you using? -- What did you do? -- What did you expect to see? -- What did you see instead? - -The best way to get your bug fixed is to provide a test case, -and/or steps to reproduce the issue. - - -How to request a feature ------------------------- - -Request features on the `Issue Tracker`_. 
- - -How to set up your development environment ------------------------------------------- - -You need Python 3.8+ and the following tools: - -- Nox_ - -Install the package with development requirements: - -.. code:: console - - $ pip install nox - -.. _Nox: https://nox.thea.codes/ - - -How to test the project ------------------------ - -Run the full test suite: - -.. code:: console - - $ nox - -List the available Nox sessions: - -.. code:: console - - $ nox --list-sessions - -You can also run a specific Nox session. -For example, invoke the unit test suite like this: - -.. code:: console - - $ nox --session=tests - -Unit tests are located in the ``tests`` directory, -and are written using the pytest_ testing framework. - -.. _pytest: https://pytest.readthedocs.io/ - - -Build documentation -------------------- - -If you've made any changes to the documentation (including changes to function signatures, -class definitions, or docstrings that will appear in the API documentation), -make sure it builds successfully. - -.. code:: console - - $ nox -s docs - -In order to run this locally with hot reload on changes: - -.. code:: console - - $ mkdocs serve - - -How to submit changes ---------------------- - -Open a `pull request`_ to submit changes to this project. - -Your pull request needs to meet the following guidelines for acceptance: - -- The Nox test suite must pass without errors and warnings. -- Include unit tests. This project maintains 100% code coverage. -- If your changes add functionality, update the documentation accordingly. - -Feel free to submit early, though—we can always iterate on this. - -To run linting and code formatting checks, you can invoke a `lint` session in nox: - -.. code:: console - - $ nox -s lint - -It is recommended to open an issue before starting work on anything. -This will allow a chance to talk it over with the owners and validate your approach. - -.. _pull request: https://github.com/iterative/datachain/pulls -.. github-only -.. 
_Code of Conduct: CODE_OF_CONDUCT.rst diff --git a/README.rst b/README.rst index d2025b29d..a79b48f86 100644 --- a/README.rst +++ b/README.rst @@ -21,7 +21,7 @@ DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured data like images, audio, videos, text and PDFs. It integrates with external storage -(e.g., S3) to process data efficiently without data duplication and manages metadata +(e.g. S3) to process data efficiently without data duplication and manages metadata in an internal database for easy and efficient querying. @@ -59,289 +59,30 @@ Key Features - Pass datasets to Pytorch and Tensorflow, or export them back into storage. -Quick Start ------------ +Getting Started +=============== -.. code:: console +Visit `Quick Start `_ to get started with `DataChain` and learn more. - $ pip install datachain +Contributing +============ -Selecting files using JSON metadata -====================================== - -A storage consists of images of cats and dogs (`dog.1048.jpg`, `cat.1009.jpg`), -annotated with ground truth and model inferences in the 'json-pairs' format, -where each image has a matching JSON file like `cat.1009.json`: - -.. code:: json - - { - "class": "cat", "id": "1009", "num_annotators": 8, - "inference": {"class": "dog", "confidence": 0.68} - } - -Example of downloading only "high-confidence cat" inferred images using JSON metadata: - - -.. 
code:: py - - from datachain import Column, DataChain - - meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta") - images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg") - - images_id = images.map(id=lambda file: file.path.split('.')[-2]) - annotated = images_id.merge(meta, on="id", right_on="meta.id") - - likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \ - & (Column("meta.inference.class_") == "cat")) - likely_cats.export_files("high-confidence-cats/", signal="file") - - -Data curation with a local AI model -=================================== -Batch inference with a simple sentiment model using the `transformers` library: - -.. code:: shell - - pip install transformers - -The code below downloads files from the cloud, and applies a user-defined function -to each one of them. All files with a positive sentiment -detected are then copied to the local directory. - -.. code:: py - - from transformers import pipeline - from datachain import DataChain, Column - - classifier = pipeline("sentiment-analysis", device="cpu", - model="distilbert/distilbert-base-uncased-finetuned-sst-2-english") - - def is_positive_dialogue_ending(file) -> bool: - dialogue_ending = file.read()[-512:] - return classifier(dialogue_ending)[0]["label"] == "POSITIVE" - - chain = ( - DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", - object_name="file", type="text") - .settings(parallel=8, cache=True) - .map(is_positive=is_positive_dialogue_ending) - .save("file_response") - ) - - positive_chain = chain.filter(Column("is_positive") == True) - positive_chain.export_files("./output") - - print(f"{positive_chain.count()} files were exported") - - - -13 files were exported - -.. code:: shell - - $ ls output/datachain-demo/chatbot-KiT/ - 15.txt 20.txt 24.txt 27.txt 28.txt 29.txt 33.txt 37.txt 38.txt 43.txt ... 
- $ ls output/datachain-demo/chatbot-KiT/ | wc -l - 13 - - -LLM judging chatbots -==================== - -LLMs can work as universal classifiers. In the example below, -we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free -Mistral API key at https://console.mistral.ai - - -.. code:: shell - - $ pip install mistralai (Requires version >=1.0.0) - $ export MISTRAL_API_KEY=_your_key_ - -DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time. - -.. code:: py - - from mistralai import Mistral - from datachain import File, DataChain, Column - - PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure." - - def eval_dialogue(file: File) -> bool: - client = Mistral() - response = client.chat.complete( - model="open-mixtral-8x22b", - messages=[{"role": "system", "content": PROMPT}, - {"role": "user", "content": file.read()}]) - result = response.choices[0].message.content - return result.lower().startswith("success") - - chain = ( - DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file") - .settings(parallel=4, cache=True) - .map(is_success=eval_dialogue) - .save("mistral_files") - ) - - successful_chain = chain.filter(Column("is_success") == True) - successful_chain.export_files("./output_mistral") - - print(f"{successful_chain.count()} files were exported") - - -With the instruction above, the Mistral model considers 31/50 files to hold the successful dialogues: - -.. code:: shell - - $ ls output_mistral/datachain-demo/chatbot-KiT/ - 1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ... - $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l - 31 - - - -Serializing Python-objects -========================== - -LLM responses may contain valuable information for analytics – such as the number of tokens used, or the -model performance parameters. 
- -Instead of extracting this information from the Mistral response data structure (class -`ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB: - - -.. code:: py - - from mistralai import Mistral - from mistralai.models import ChatCompletionResponse - from datachain import File, DataChain, Column - - PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure." - - def eval_dialog(file: File) -> ChatCompletionResponse: - client = MistralClient() - return client.chat( - model="open-mixtral-8x22b", - messages=[{"role": "system", "content": PROMPT}, - {"role": "user", "content": file.read()}]) - - chain = ( - DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file") - .settings(parallel=4, cache=True) - .map(response=eval_dialog) - .map(status=lambda response: response.choices[0].message.content.lower()[:7]) - .save("response") - ) - - chain.select("file.name", "status", "response.usage").show(5) - - success_rate = chain.filter(Column("status") == "success").count() / chain.count() - print(f"{100*success_rate:.1f}% dialogs were successful") - -Output: - -.. code:: shell - - file status response response response - name usage usage usage - prompt_tokens total_tokens completion_tokens - 0 1.txt success 547 548 1 - 1 10.txt failure 3576 3578 2 - 2 11.txt failure 626 628 2 - 3 12.txt failure 1144 1182 38 - 4 13.txt success 1100 1101 1 - - [Limited by 5 rows] - 64.0% dialogs were successful - - -Iterating over Python data structures -===================================== - -In the previous examples, datasets were saved in the embedded database -(`SQLite`_ in folder `.datachain` of the working directory). -These datasets were automatically versioned, and can be accessed using -`DataChain.from_dataset("dataset_name")`. - -Here is how to retrieve a saved dataset and iterate over the objects: - -.. 
code:: py - - chain = DataChain.from_dataset("response") - - # Iterating one-by-one: support out-of-memory workflow - for file, response in chain.limit(5).collect("file", "response"): - # verify the collected Python objects - assert isinstance(response, ChatCompletionResponse) - - status = response.choices[0].message.content[:7] - tokens = response.usage.total_tokens - print(f"{file.get_uri()}: {status}, file size: {file.size}, tokens: {tokens}") - -Output: - -.. code:: shell - - gs://datachain-demo/chatbot-KiT/1.txt: Success, file size: 1776, tokens: 548 - gs://datachain-demo/chatbot-KiT/10.txt: Failure, file size: 11576, tokens: 3578 - gs://datachain-demo/chatbot-KiT/11.txt: Failure, file size: 2045, tokens: 628 - gs://datachain-demo/chatbot-KiT/12.txt: Failure, file size: 3833, tokens: 1207 - gs://datachain-demo/chatbot-KiT/13.txt: Success, file size: 3657, tokens: 1101 - - -Vectorized analytics over Python objects -======================================== - -Some operations can run inside the DB without deserialization. -For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens: - -.. code:: py - - chain = DataChain.from_dataset("mistral_dataset") - - cost = chain.sum("response.usage.prompt_tokens")*0.000002 \ - + chain.sum("response.usage.completion_tokens")*0.000006 - print(f"Spent ${cost:.2f} on {chain.count()} calls") - -Output: - -.. code:: shell - - Spent $0.08 on 50 calls - - -PyTorch data loader -=================== - -Chain results can be exported or passed directly to PyTorch dataloader. -For example, if we are interested in passing image and a label based on file -name suffix, the following code will do it: - -.. code:: py - - from torch.utils.data import DataLoader - from transformers import CLIPProcessor +Contributions are very welcome. To learn more, see the `Contributor Guide`_. 
- from datachain import C, DataChain - processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") +Community and Support +===================== - chain = ( - DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image") - .map(label=lambda name: name.split(".")[0], params=["file.name"]) - .select("file", "label").to_pytorch( - transform=processor.image_processor, - tokenizer=processor.tokenizer, - ) - ) - loader = DataLoader(chain, batch_size=1) +* `Docs `_ +* `File an issue`_ if you encounter any problems +* `Discord Chat `_ +* `Email `_ +* `Twitter `_ DataChain Studio Platform -------------------------- +======================== `DataChain Studio`_ is a proprietary solution for teams that offers: @@ -353,36 +94,10 @@ DataChain Studio Platform AI model inference. - **Access control** including SSO and team based collaboration. -Tutorials ---------- - -* `Getting Started`_ -* `Multimodal `_ (try in `Colab `__) -* `LLM evaluations `_ (try in `Colab `__) -* `Reading JSON metadata `_ (try in `Colab `__) - - -Contributions -------------- - -Contributions are very welcome. -To learn more, see the `Contributor Guide`_. - - -Community and Support ---------------------- - -* `Docs `_ -* `File an issue`_ if you encounter any problems -* `Discord Chat `_ -* `Email `_ -* `Twitter `_ - - .. _PyPI: https://pypi.org/ .. _file an issue: https://github.com/iterative/datachain/issues .. github-only -.. _Contributor Guide: CONTRIBUTING.rst +.. _Contributor Guide: https://docs.datachain.ai/contributing .. _Pydantic: https://github.com/pydantic/pydantic .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot .. 
_SQLite: https://www.sqlite.org/ diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 000000000..5d8a2e32c --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,111 @@ +# Contributor Guide + +Thank you for your interest in improving this project. This project is +open-source under the [Apache 2.0 +license](https://opensource.org/licenses/Apache-2.0) and welcomes +contributions in the form of bug reports, feature requests, and pull +requests. + +Here is a list of important resources for contributors: + +- [Source Code](https://github.com/iterative/datachain) +- [Documentation](https://docs.dvc.ai/datachain) +- [Issue Tracker](https://github.com/iterative/datachain/issues) +- [Code of Conduct](https://github.com/iterative/datachain?tab=coc-ov-file) + +## How to report a bug + +Report bugs on the [Issue +Tracker](https://github.com/iterative/datachain/issues). + +When filing an issue, make sure to answer these questions: + +- Which operating system and Python version are you using? +- Which version of this project are you using? +- What did you do? +- What did you expect to see? +- What did you see instead? + +The best way to get your bug fixed is to provide a test case, and/or +steps to reproduce the issue. + +## How to request a feature + +Request features on the [Issue +Tracker](https://github.com/iterative/datachain/issues). + +## How to set up your development environment + +You need Python 3.8+ and the following tools: + +- [Nox](https://nox.thea.codes/) + +Install the package with development requirements: + +``` console +$ pip install nox +``` + +## How to test the project + +Run the full test suite: + +``` console +$ nox +``` + +List the available Nox sessions: + +``` console +$ nox --list-sessions +``` + +You can also run a specific Nox session. 
For example, invoke the unit +test suite like this: + +``` console +$ nox --session=tests +``` + +Unit tests are located in the `tests` directory, and are written using +the [pytest](https://pytest.readthedocs.io/) testing framework. + +## Build documentation + +If you've made any changes to the documentation (including changes to +function signatures, class definitions, or docstrings that will appear +in the API documentation), make sure it builds successfully. + +``` console +$ nox -s docs +``` + +In order to run this locally with hot reload on changes: + +``` console +$ mkdocs serve +``` + +## How to submit changes + +Open a [pull request](https://github.com/iterative/datachain/pulls) to +submit changes to this project. + +Your pull request needs to meet the following guidelines for acceptance: + +- The Nox test suite must pass without errors and warnings. +- Include unit tests. This project maintains 100% code coverage. +- If your changes add functionality, update the documentation + accordingly. + +Feel free to submit early, though---we can always iterate on this. + +To run linting and code formatting checks, you can invoke a `lint` session in nox: + +``` console +$ nox -s lint +``` + +It is recommended to open an issue before starting work on anything. +This will allow a chance to talk it over with the owners and validate +your approach. diff --git a/docs/examples.md b/docs/examples.md new file mode 100644 index 000000000..6409cea8a --- /dev/null +++ b/docs/examples.md @@ -0,0 +1,289 @@ + +# Examples + +## DataChain Basics + +!!! example "DataChain Basics" + + Datachain is built by composing wrangling operations. + + For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. 
The below code takes images from the cloud, and applies PaliGemma model to caption the first five of them and put the results in the column “scene”: + + ```python + from datachain.lib.dc import Column, DataChain, File # (1)! + from transformers import AutoProcessor, PaliGemmaForConditionalGeneration # (2)! + + images = DataChain.from_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image") + + model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-mix-224") + processor = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224") + + def process(file: File) -> str: + image=file.read().convert("RGB") + inputs = processor(text="caption", images=image, return_tensors="pt") + generate_ids = model.generate(**inputs, max_new_tokens=100) + return processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + chain = ( + images.limit(5) + .settings(cache=True) + .map(scene=lambda file: process(file), output = str) + .save() + ) + ``` + + 1. `pip install datachain` + 2. `pip install transformers` + + Here is how we can view the results in a plot: + + ```python + import matplotlib.pyplot as plt + import re + from textwrap import wrap + + def trim_text(text): + match = re.search(r'[A-Z][^.]*\.', text) + return match.group(0) if match else '' + + images = chain.collect("file") + captions = chain.collect("scene") + _ , axes = plt.subplots(1, len(captions), figsize=(15, 5)) + + for ax, img, caption in zip(axes, images, captions): + ax.imshow(img.read(),cmap='gray') + ax.axis('off') + wrapped_caption = "\n".join(wrap(trim_text(caption), 30)) + ax.set_title(wrapped_caption, fontsize=6) + + plt.show() + ``` + + ![Untitled](assets/captioned_cartoons.png) + +If interested to see more examples, please check out the [tutorials](tutorials.md). + +### Handling Python objects + +In addition to storing primitive Python data types like strings, DataChain is also capable of using data models. 
+ +For example, most LLMs return objects that carry additional fields. If provider offers a Pydantic model for their LLM, Datachain can use it as a schema. + +In the below example, we are calling a Mixtral 8x22b model to judge the “service chatbot” dataset from [Karlsruhe Institute of Technology](https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot) and saving the results as Mistral’s *ChatCompletionResponse* objects: + +```python +# pip install mistralai +# this example requires a free Mistral API key, get yours at https://console.mistral.ai +# $ export MISTRAL_API_KEY='your key' + +import os +from datachain.lib.feature import Feature +from datachain.lib.dc import Column, DataChain +from mistralai.client import MistralClient +from mistralai.models.chat_completion import ChatMessage +from mistralai.models.chat_completion import ChatCompletionResponse as MistralModel +from datachain.lib.data_model import DataModel + +prompt = "Was this dialog successful? 
Describe the 'result' as 'Yes' or 'No' in a short JSON"
+api_key = os.environ["MISTRAL_API_KEY"]
+
+## register the data model ###
+DataModel.register(MistralModel)
+
+chain = (
+    DataChain
+    .from_storage("gs://datachain-demo/chatbot-KiT/", type="text")
+    .filter(Column("file.name").glob("*.txt"))
+    .limit(5)
+    .settings(parallel=4, cache=True)
+    .map(
+        mistral=lambda file: MistralClient(api_key=api_key).chat(
+            model="open-mixtral-8x22b",
+            response_format={"type": "json_object"},
+            messages= [
+                ChatMessage(role="system", content=f"{prompt}"),
+                ChatMessage(role="user", content=f"{file.read()}")
+            ]
+        ),
+        output=MistralModel
+    )
+    .save("dialog-rating")
+)
+
+iter = chain.collect("mistral")
+print(*map(lambda chat_response: chat_response.choices[0].message.content, iter))
+```
+
+```
+{"result": "Yes"} {"result": "No"} {"result": "No"} {"result": "Yes"}
+```
+
+If you are interested in more LLM evaluation examples for DataChain, please follow this tutorial:
+
+[https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb](https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb)
+
+### Vectorized analytics
+
+Datachain internally represents datasets as tables, so analytical queries on the chain are automatically vectorized:
+
+```python
+# continued from the previous example
+
+mistral_cost = chain.sum("mistral.usage.prompt_tokens")*0.000002 + \
+               chain.sum("mistral.usage.completion_tokens")*0.000006
+
+print(f"The cost of {chain.count()} calls to Mixtral 8x22b : ${mistral_cost:.4f}")
+```
+
+```
+The cost of 5 calls to Mixtral 8x22b : $0.0142
+```
+
+### Dataset persistence
+
+The “save” operation makes the chain dataset persistent in the current (working) directory of the query. A hidden folder `.datachain/` holds the records. 
A persistent dataset can be accessed later to start a derivative chain: + +```python +DataChain.from_dataset("rating").limit(2).save("dialog-rating") +``` + +Persistent datasets are immutable and automatically versioned. Here is how to access the dataset registry: + +```python +mydatasets = DataChain.datasets() +for ds in mydatasets.collect("dataset"): + print(f"{ds.name}@v{ds.version}") + +``` + +``` +Processed: 1 rows [00:00, 777.59 rows/s] +Generated: 14 rows [00:00, 11279.34 rows/s] +dialog-rating@v1 +dialog-rating@v2 +``` + +By default, when a saved dataset is loaded, the latest version is fetched but another version can be requested: + +```python +ds = DataChain.from_dataset("dialog-rating", version = 1) +``` + +### Chain execution, optimization and parallelism + +Datachain avoids redundant operations. Execution is triggered only when a downstream operation requests the processed results. However, it would be inefficient to run, say, LLM queries again every time you just want to collect several objects from the chain. + +“Save” operation nails execution results and automatically refers to them every time the downstream functions ask for data. Saving without an explicit name generates an auto-named dataset which serves the same purpose. + +Datachain natively supports parallelism in execution. If an API or a local model supports parallel requests, the `settings` operator can split the load across multiple workers (see the [code example above](#handling-python-objects)) + +### Reading external metadata + +It is common for AI data to come with pre-computed metadata (annotations, classes, etc). + +DataChain library understands common annotation formats (JSON, CSV, webdataset and parquet), and can unite data samples from storage with side-loaded metadata. The schema for metadata can be set explicitly or be inferred. 
+
+Here is an example of reading a simple CSV file where the schema is heuristically derived from the header:
+
+```python
+from datachain.lib.dc import DataChain
+
+uri="gs://datachain-demo/chatbot-csv/"
+csv_dataset = DataChain.from_csv(uri)
+
+print(csv_dataset.to_pandas())
+```
+
+Reading metadata from JSON format is a more complicated scenario because a JSON-annotated dataset typically references data samples in blocks within JSON files.
+
+Here is an example from MS COCO “captions” JSON which employs separate sections for image meta and captions:
+
+```json
+{
+  "images": [
+    {
+      "license": 4,
+      "file_name": "000000397133.jpg",
+      "coco_url": "http://images.cocodataset.org/val2017/000000397133.jpg",
+      "height": 427,
+      "width": 640,
+      "date_captured": "2013-11-14 17:02:52",
+      "flickr_url": "http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg",
+      "id": 397133
+    },
+    ...
+  ],
+  "annotations": [
+    {
+      "image_id" : "179765",
+      "id" : 38,
+      "caption" : "A black Honda motorcycle parked in front of a garage."
+    },
+    ...
+  ],
+  ...
+}
+```
+
+Note how complicated the setup is. Every image is referenced by its file name, and the metadata for this file is keyed by the “id” field. This same field is referenced later in the “annotations” array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array.
+ +However, Datachain can easily parse the entire COCO structure via several reading and merging operators: + +```python + +from datachain.lib.dc import Column, DataChain + +images_uri="gs://datachain-demo/coco2017/images/val/" +captions_uri="gs://datachain-demo/coco2017/annotations/captions_val2017.json" + +images = DataChain.from_storage(images_uri) +meta = DataChain.from_json(captions_uri, jmespath = "images") +captions = DataChain.from_json(captions_uri, jmespath = "annotations") + +images_meta = images.merge(meta, on="file.name", right_on="images.file_name") +captioned_images = images_meta.merge(captions, on="images.id", right_on="annotations.image_id") +``` + +The resulting dataset has image entries as files decorated with all the metadata and captions: + +```python +images_with_dogs = captioned_images.filter(Column("annotations.caption").glob("*dog*")) +images_with_dogs.select("annotations", "file.name").show() +``` + +``` + captions captions captions file + image_id id caption name +0 17029 778902 a dog jumping to catch a frisbee in a yard 000000017029.jpg +1 17029 779838 A dog jumping to catch a red frisbee in a garden 000000017029.jpg +2 17029 781941 The dog is catching the Frisbee in mid air in ... 000000017029.jpg +3 17029 782283 A dog catches a frisbee outside in the grass. 000000017029.jpg +4 17029 783543 A dog leaping to catch a red frisbee. 000000017029.jpg +5 18193 278544 A woman in green sweater eating a hotdog by ca... 000000018193.jpg +... + +[Limited by 20 rows] +``` +For in-depth review of working with JSON metadata, please follow this tutorial: + +[GitHub](https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) + +### Passing data to training + +Chain results can be exported or passed directly to Pytorch dataloader. 
For example, if we are interested in passing three columns to training, the following Pytorch code will do it: + +```python + +ds = train.select("file", "caption_choices", "label_ind").to_pytorch( + transform=preprocess, + tokenizer=clip.tokenize, +) + +loader = DataLoader(ds, batch_size=2) +optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) +train(loader, model, optimizer) +``` + +See a larger example for CLIP fine-tuning here: + +[GitHub](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) diff --git a/docs/index.md b/docs/index.md index 19067e362..fc86d14d7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,302 +1,103 @@ -# Get Started with DataChain - -🔨Wrangle unstructured AI data at scale - - -Datachain enables multimodal API calls and local AI inferences to run in parallel over many samples as chained operations. The resulting datasets can be saved, versioned, and sent directly to PyTorch and TensorFlow for training. Datachain can persist features of Python objects returned by AI models, and enables vectorized analytical operations over them. - -The typical use cases are data curation, LLM analytics and validation, image segmentation, pose detection, and GenAI alignment. Datachain is especially helpful if batch operations can be optimized – for instance, when synchronous API calls can be parallelized or where an LLM API offers batch processing. - ---- - -`pip install datachain` - ---- - -### Operation basics - -Datachain is built by composing wrangling operations. - -For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. 
The below code takes images from the cloud, and applies PaliGemma model to caption the first five of them and put the results in the column “scene”: - -```python -# -# pip install transformers -# - -from datachain.lib.dc import Column, DataChain, File -from transformers import AutoProcessor, PaliGemmaForConditionalGeneration - -images = DataChain.from_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image") - -model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-mix-224") -processor = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224") - -def process(file: File) -> str: - image=file.read().convert("RGB") - inputs = processor(text="caption", images=image, return_tensors="pt") - generate_ids = model.generate(**inputs, max_new_tokens=100) - return processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - -chain = ( - images.limit(5) - .settings(cache=True) - .map(scene=lambda file: process(file), output = str) - .save() -) -``` - -Here is how we can view the results in a plot: - -```python -import matplotlib.pyplot as plt -import re -from textwrap import wrap - -def trim_text(text): - match = re.search(r'[A-Z][^.]*\.', text) - return match.group(0) if match else '' - -images = chain.collect("file") -captions = chain.collect("scene") -_ , axes = plt.subplots(1, len(captions), figsize=(15, 5)) - -for ax, img, caption in zip(axes, images, captions): - ax.imshow(img.read(),cmap='gray') - ax.axis('off') - wrapped_caption = "\n".join(wrap(trim_text(caption), 30)) - ax.set_title(wrapped_caption, fontsize=6) - -plt.show() -``` - -![Untitled](assets/captioned_cartoons.png) - -If interested to see more multimodal examples for DataChain, please follow this tutorial: - -[https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) [Google 
Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) - -### Handling Python objects - -In addition to storing primitive Python data types like strings, DataChain is also capable of using data models. - -For example, most LLMs return objects that carry additional fields. If provider offers a Pydantic model for their LLM, Datachain can use it as a schema. - -In the below example, we are calling a Mixtral 8x22b model to judge the “service chatbot” dataset from [Karlsruhe Institute of Technology](https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot) and saving the results as Mistral’s *ChatCompletionResponse* objects: - -```python -# pip install mistralai -# this example requires a free Mistral API key, get yours at https://console.mistral.ai -# $ export MISTRAL_API_KEY='your key' - -import os -from datachain.lib.feature import Feature -from datachain.lib.dc import Column, DataChain -from mistralai.client import MistralClient -from mistralai.models.chat_completion import ChatMessage -from mistralai.models.chat_completion import ChatCompletionResponse as MistralModel -from datachain.lib.data_model import DataModel - -prompt = "Was this dialog successful? 
Describe the 'result' as 'Yes' or 'No' in a short JSON" -api_key = os.environ["MISTRAL_API_KEY"] - -## register the data model ### -DataModel.register(MistralModel) - -chain = ( - DataChain - .from_storage("gs://datachain-demo/chatbot-KiT/", type="text") - .filter(Column("file.name").glob("*.txt")) - .limit(5) - .settings(parallel=4, cache=True) - .map( - mistral=lambda file: MistralClient(api_key=api_key).chat( - model="open-mixtral-8x22b", - response_format={"type": "json_object"}, - messages= [ - ChatMessage(role="system", content=f"{prompt}"), - ChatMessage(role="user", content=f"{file.read()}") - ] - ), - output=MistralModel - ) - .save("dialog-rating") -) - -**iter = chain.collect("mistral") -**print(*map(lambda chat_response: chat_response.choices[0].message.content, iter)) -``` - -``` -{"result": "Yes"} {"result": "No"} {"result": "No"} {"result": "Yes"} -``` - -If you are interested in more LLM evaluation examples for DataChain, please follow this tutorial: - -[https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb](https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb) - -### Vectorized analytics - -Datachain internally represents datasets as tables, so analytical queries on the chain are automatically vectorized: - -```python -# continued from the previous example - -mistral_cost = chain.sum("mistral.usage.prompt_tokens")*0.000002 + \ - chain.sum("mistral.usage.completion_tokens")*0.000006 - -print(f"The cost of {chain.count()} calls to Mixtral 8x22b : ${mistral_cost:.4f}") -``` - -``` -The cost of 5 calls to Mixtral 8x22b : $0.0142 -``` - -### Dataset persistence - -The “save” operation makes chain dataset persistent in the current (working) directory of the query. A hidden folder `.datachain/` holds the records. 
A persistent dataset can be accessed later to start a derivative chain: - -```python -DataChain.from_dataset("rating").limit(2).save("dialog-rating") -``` - -Persistent datasets are immutable and automatically versioned. Here is how to access the dataset registry: - -```python -mydatasets = DataChain.datasets() -for ds in mydatasets.collect("dataset"): - print(f"{ds.name}@v{ds.version}") - -``` - -``` -Processed: 1 rows [00:00, 777.59 rows/s] -Generated: 14 rows [00:00, 11279.34 rows/s] -dialog-rating@v1 -dialog-rating@v2 -``` - -By default, when a saved dataset is loaded, the latest version is fetched but another version can be requested: - -```python -ds = DataChain.from_dataset("dialog-rating", version = 1) -``` - -### Chain execution, optimization and parallelism - -Datachain avoids redundant operations. Execution is triggered only when a downstream operation requests the processed results. However, it would be inefficient to run, say, LLM queries again every time you just want to collect several objects from the chain. - -“Save” operation nails execution results and automatically refers to them every time the downstream functions ask for data. Saving without an explicit name generates an auto-named dataset which serves the same purpose. - -Datachain natively supports parallelism in execution. If an API or a local model supports parallel requests, the `settings` operator can split the load across multiple workers (see the [code example above](https://www.notion.so/DataChain-Getting-Started-3ed9414febac48f888f90cdaa2ca7667?pvs=21)) - -### Reading external metadata - -It is common for AI data to come with pre-computed metadata (annotations, classes, etc). - -DataChain library understands common annotation formats (JSON, CSV, webdataset and parquet), and can unite data samples from storage with side-loaded metadata. The schema for metadata can be set explicitly or be inferred. 
- -Here is an example of reading a simple CSV file where schema is heuristically derived from the header: - -```python -from datachain.lib.dc import DataChain - -uri="gs://datachain-demo/chatbot-csv/" -csv_dataset = DataChain.from_csv(uri) - -print(csv_dataset.to_pandas()) -``` - -Reading metadata from JSON format is a more complicated scenario because a JSON-annotated dataset typically references data samples in blocks within JSON files. - -Here is an example from MS COCO “captions” JSON which employs separate sections for image meta and captions: - -```json -{ - "images": [ - { - "license": 4, - "file_name": "000000397133.jpg", - "coco_url": "http://images.cocodataset.org/val2017/000000397133.jpg", - "height": 427, - "width": 640, - "date_captured": "2013-11-14 17:02:52", - "flickr_url": "http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg", - "id": 397133 - }, - ... - ], - "annotations": [ - { - "image_id" : "179765", - "id" : 38, - "caption" : "A black Honda motorcycle parked in front of a garage." - }, - ... - ], - ... -} -``` - -Note how complicated the setup is. Every image is references by the name, and the metadata for this file is keyed by the “id” field. This same field is references later in the “annotations’ array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array. 
- -However, Datachain can easily parse the entire COCO structure via several reading and merging operators: - -```python - -from datachain.lib.dc import Column, DataChain - -images_uri="gs://datachain-demo/coco2017/images/val/" -captions_uri="gs://datachain-demo/coco2017/annotations/captions_val2017.json" - -images = DataChain.from_storage(images_uri) -meta = DataChain.from_json(captions_uri, jmespath = "images") -captions = DataChain.from_json(captions_uri, jmespath = "annotations") - -images_meta = images.merge(meta, on="file.name", right_on="images.file_name") -captioned_images = images_meta.merge(captions, on="images.id", right_on="annotations.image_id") -``` - -The resulting dataset has image entries as files decorated with all the metadata and captions: - -```python -images_with_dogs = captioned_images.filter(Column("annotations.caption").glob("*dog*")) -images_with_dogs.select("annotations", "file.name").show() -``` - -``` - captions captions captions file - image_id id caption name -0 17029 778902 a dog jumping to catch a frisbee in a yard 000000017029.jpg -1 17029 779838 A dog jumping to catch a red frisbee in a garden 000000017029.jpg -2 17029 781941 The dog is catching the Frisbee in mid air in ... 000000017029.jpg -3 17029 782283 A dog catches a frisbee outside in the grass. 000000017029.jpg -4 17029 783543 A dog leaping to catch a red frisbee. 000000017029.jpg -5 18193 278544 A woman in green sweater eating a hotdog by ca... 000000018193.jpg -... 
- -[Limited by 20 rows] -``` -For in-depth review of working with JSON metadata, please follow this tutorial: - -[https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb](https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) - -### Passing data to training - -Chain results can be exported or passed directly to Pytorch dataloader. For example, if we are interested in passing three columns to training, the following Pytorch code will do it: - -```python - -ds = train.select("file", "caption_choices", "label_ind").to_pytorch( - transform=preprocess, - tokenizer=clip.tokenize, -) - -loader = DataLoader(ds, batch_size=2) -optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) -train(loader, model, optimizer) -``` - -See a larger example for CLIP fine-tuning here: - -[https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) +# DataChain DataChain + + + +

+ + PyPI + + + Python Version + + + Codecov + + + Tests + +

+ +

+🔨 Wrangle unstructured AI data at scale +

+
+
+DataChain is a Python-based AI-data warehouse for transforming and
+analyzing unstructured data like images, audio, videos, text and PDFs.
+It integrates with external storage (e.g. S3, GCP, Azure, HuggingFace) to process data
+efficiently without data duplication and manages metadata in an internal
+database for easy and efficient querying.
+
+## Use Cases
+
+1. **ETL.** Pythonic framework for describing and running unstructured
+   data transformations and enrichments, applying models to data,
+   including LLMs.
+2. **Analytics.** A DataChain dataset is a table that combines all the
+   information about data objects in one place + it provides a
+   dataframe-like API and a vectorized engine to do analytics on these
+   tables at scale.
+3. **Versioning.** DataChain doesn't store, require moving or copying
+   data (unlike DVC). The perfect use case is a bucket with thousands or
+   millions of images, videos, audio, PDFs.
+
+## Key Features
+
+📂 **Multimodal Dataset Versioning.**
+
+:   - Version unstructured data without moving or creating data
+      copies, by supporting references to S3, GCP, Azure, and local
+      file systems.
+    - Multimodal data support: images, video, text, PDFs, JSONs, CSVs,
+      parquet, etc.
+    - Unite files and metadata together into persistent, versioned,
+      columnar datasets.
+
+🐍 **Python-friendly.**
+
+:   - Operate on Python objects and object fields: float scores,
+      strings, matrices, LLM response objects.
+    - Run Python code over high-scale, terabyte-sized datasets, with
+      built-in parallelization and memory-efficient computing --- no
+      SQL or Spark required.
+
+🧠 **Data Enrichment and Processing.**
+
+:   - Generate metadata using local AI models and LLM APIs.
+    - Filter, join, and group datasets by metadata. Search by vector
+      embeddings.
+    - High-performance vectorized operations on Python objects: sum,
+      count, avg, etc.
+    - Pass datasets to Pytorch and Tensorflow, or export them back
+      into storage.
+ + +## Documentation Guide + +The following pages provide detailed documentation on DataChain's features, architecture, and usage patterns. You'll learn how to effectively use DataChain for managing and processing unstructured data at scale. + +- [🏃🏼‍♂️ Quick Start](quick-start.md): Get up and running with DataChain in no time. +- [🎯 Examples](examples.md): Explore practical examples and use cases. +- [📚 Tutorials](tutorials.md): Learn how to use DataChain for specific tasks. +- [📚 API Reference](references/index.md): Dive into the technical details and API reference. +- [🤝 Contributing](contributing.md): Learn how to contribute to DataChain. + + + + +## Open Source and Studio + +DataChain is available as an open source project and Studio as a proprietary solution for teams. + +- [DataChain Studio](https://studio.datachain.ai/): + - **Centralized dataset registry** to manage data, code and dependencies in one place. + - **Data Lineage** for data sources as well as derivative dataset. + - **UI for Multimodal Data** like images, videos, and PDFs. + - **Scalable Compute** to handle large datasets (100M+ files) and in-house AI model inference. + - **Access control** including SSO and team based collaboration. +- [DataChain Open Source](https://github.com/iterative/datachain): + - Python-based AI-data warehouse for transforming and analyzing unstructured data like images, audio, videos, text and PDFs. 
diff --git a/docs/quick-start.md b/docs/quick-start.md new file mode 100644 index 000000000..9d72a32a9 --- /dev/null +++ b/docs/quick-start.md @@ -0,0 +1,286 @@ +# Quick Start + +## Installation + +=== "pip" + + ```bash + pip install datachain + ``` + +=== "uv" + + ```bash + uv add datachain + ``` + + +## Selecting files using JSON metadata + +A storage consists of images of cats and dogs +(`dog.1048.jpg`, `cat.1009.jpg`), annotated with +ground truth and model inferences in the _`json-pairs`_ format, where +each image has a matching JSON file like `cat.1009.json`: + +``` json +{ + "class": "cat", "id": "1009", "num_annotators": 8, + "inference": {"class": "dog", "confidence": 0.68} +} +``` + +Example of downloading only _`high-confidence cat`_ inferred images +using JSON metadata: + +``` py +from datachain import Column, DataChain + +meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta") +images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg") + +images_id = images.map(id=lambda file: file.path.split('.')[-2]) +annotated = images_id.merge(meta, on="id", right_on="meta.id") + +likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \ + & (Column("meta.inference.class_") == "cat")) +likely_cats.export_files("high-confidence-cats/", signal="file") +``` + +## Data curation with a local AI model + +Batch inference with a simple sentiment model using the +`transformers` library: + +``` shell +pip install transformers +``` + +The code below downloads files from the cloud, and applies a +user-defined function to each one of them. All files with a positive +sentiment detected are then copied to the local directory. 
+ +``` py +from transformers import pipeline +from datachain import DataChain, Column + +classifier = pipeline("sentiment-analysis", device="cpu", + model="distilbert/distilbert-base-uncased-finetuned-sst-2-english") + +def is_positive_dialogue_ending(file) -> bool: + dialogue_ending = file.read()[-512:] + return classifier(dialogue_ending)[0]["label"] == "POSITIVE" + +chain = ( + DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", + object_name="file", type="text") + .settings(parallel=8, cache=True) + .map(is_positive=is_positive_dialogue_ending) + .save("file_response") +) + +positive_chain = chain.filter(Column("is_positive") == True) +positive_chain.export_files("./output") + +print(f"{positive_chain.count()} files were exported") +``` + +13 files were exported + +``` shell +$ ls output/datachain-demo/chatbot-KiT/ +15.txt 20.txt 24.txt 27.txt 28.txt 29.txt 33.txt 37.txt 38.txt 43.txt ... +$ ls output/datachain-demo/chatbot-KiT/ | wc -l +13 +``` + +## LLM judging chatbots + +LLMs can work as universal classifiers. In the example below, we employ +a free API from Mistral to judge the [publicly +available](https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot) +chatbot dialogs. Please get a free Mistral API key at + + +``` shell +$ pip install mistralai (Requires version >=1.0.0) +$ export MISTRAL_API_KEY=_your_key_ +``` + +DataChain can parallelize API calls; the free Mistral tier supports up +to 4 requests at the same time. + +``` py +from mistralai import Mistral +from datachain import File, DataChain, Column + +PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure." 
+ +def eval_dialogue(file: File) -> bool: + client = Mistral() + response = client.chat.complete( + model="open-mixtral-8x22b", + messages=[{"role": "system", "content": PROMPT}, + {"role": "user", "content": file.read()}]) + result = response.choices[0].message.content + return result.lower().startswith("success") + +chain = ( + DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file") + .settings(parallel=4, cache=True) + .map(is_success=eval_dialogue) + .save("mistral_files") +) + +successful_chain = chain.filter(Column("is_success") == True) +successful_chain.export_files("./output_mistral") + +print(f"{successful_chain.count()} files were exported") +``` + +With the instruction above, the Mistral model considers 31/50 files to +hold the successful dialogues: + +``` shell +$ ls output_mistral/datachain-demo/chatbot-KiT/ +1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ... +$ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l +31 +``` + +## Serializing Python-objects + +LLM responses may contain valuable information for analytics -- such as +the number of tokens used, or the model performance parameters. + +Instead of extracting this information from the Mistral response data +structure (class `ChatCompletionResponse`), DataChain can +serialize the entire LLM response to the internal DB: + +``` py +from mistralai import Mistral +from mistralai.models import ChatCompletionResponse +from datachain import File, DataChain, Column + +PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure." 
+
+def eval_dialog(file: File) -> ChatCompletionResponse:
+    client = Mistral()
+    return client.chat.complete(
+        model="open-mixtral-8x22b",
+        messages=[{"role": "system", "content": PROMPT},
+                  {"role": "user", "content": file.read()}])
+
+chain = (
+    DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
+    .settings(parallel=4, cache=True)
+    .map(response=eval_dialog)
+    .map(status=lambda response: response.choices[0].message.content.lower()[:7])
+    .save("response")
+)
+
+chain.select("file.name", "status", "response.usage").show(5)
+
+success_rate = chain.filter(Column("status") == "success").count() / chain.count()
+print(f"{100*success_rate:.1f}% dialogs were successful")
+```
+
+Output:
+
+``` shell
+     file   status       response     response          response
+     name                   usage        usage             usage
+                   prompt_tokens total_tokens completion_tokens
+0   1.txt  success           547          548                 1
+1  10.txt  failure          3576         3578                 2
+2  11.txt  failure           626          628                 2
+3  12.txt  failure          1144         1182                38
+4  13.txt  success          1100         1101                 1
+
+[Limited by 5 rows]
+64.0% dialogs were successful
+```
+
+## Iterating over Python data structures
+
+In the previous examples, datasets were saved in the embedded database
+(`SQLite` in folder `.datachain` of the working directory). These datasets were automatically versioned, and
+can be accessed using `DataChain.from_dataset("dataset_name")`.
+ +Here is how to retrieve a saved dataset and iterate over the objects: + +``` py +chain = DataChain.from_dataset("response") + +# Iterating one-by-one: support out-of-memory workflow +for file, response in chain.limit(5).collect("file", "response"): + # verify the collected Python objects + assert isinstance(response, ChatCompletionResponse) + + status = response.choices[0].message.content[:7] + tokens = response.usage.total_tokens + print(f"{file.get_uri()}: {status}, file size: {file.size}, tokens: {tokens}") +``` + +Output: + +``` shell +gs://datachain-demo/chatbot-KiT/1.txt: Success, file size: 1776, tokens: 548 +gs://datachain-demo/chatbot-KiT/10.txt: Failure, file size: 11576, tokens: 3578 +gs://datachain-demo/chatbot-KiT/11.txt: Failure, file size: 2045, tokens: 628 +gs://datachain-demo/chatbot-KiT/12.txt: Failure, file size: 3833, tokens: 1207 +gs://datachain-demo/chatbot-KiT/13.txt: Success, file size: 3657, tokens: 1101 +``` + +## Vectorized analytics over Python objects + +Some operations can run inside the DB without deserialization. For +instance, let's calculate the total cost of using the LLM APIs, +assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M +output tokens: + +``` py +chain = DataChain.from_dataset("mistral_dataset") + +cost = chain.sum("response.usage.prompt_tokens")*0.000002 \ + + chain.sum("response.usage.completion_tokens")*0.000006 +print(f"Spent ${cost:.2f} on {chain.count()} calls") +``` + +Output: + +``` shell +Spent $0.08 on 50 calls +``` + +## PyTorch data loader + +Chain results can be exported or passed directly to PyTorch dataloader. 
+For example, if we are interested in passing image and a label based on +file name suffix, the following code will do it: + +``` py +from torch.utils.data import DataLoader +from transformers import CLIPProcessor + +from datachain import C, DataChain + +processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + +chain = ( + DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image") + .map(label=lambda name: name.split(".")[0], params=["file.name"]) + .select("file", "label").to_pytorch( + transform=processor.image_processor, + tokenizer=processor.tokenizer, + ) +) + +loader = DataLoader(chain, batch_size=1) + +``` + +**See also:** + +- [Examples](examples.md) +- [Tutorials](tutorials.md) +- [API Reference](references/index.md) diff --git a/docs/references/index.md b/docs/references/index.md index 6a6ea7492..389cf366a 100644 --- a/docs/references/index.md +++ b/docs/references/index.md @@ -1,8 +1,10 @@ # API Reference -- [DataChain](./datachain.md) -- [DataType](./datatype.md) -- [File](./file.md) -- [UDF](./udf.md) -- [SQL](./sql.md) -- [Torch](./torch.md) +DataChain's API is organized into several modules: + +- [DataChain](./datachain.md) - Core chain operations and dataset management +- [DataType](./datatype.md) - Type system and schema definitions +- [File](./file.md) - File handling and storage operations +- [UDF](./udf.md) - User-defined functions and transformations +- [SQL](./sql.md) - SQL query integration +- [Torch](./torch.md) - PyTorch data loading utilities diff --git a/docs/tutorials.md b/docs/tutorials.md new file mode 100644 index 000000000..dae91d81b --- /dev/null +++ b/docs/tutorials.md @@ -0,0 +1,5 @@ +# Tutorials + +* Multimodal: [GitHub](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) +* LLM evaluations: 
[GitHub](https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb) +* Reading JSON metadata: [GitHub](https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) diff --git a/mkdocs.yml b/mkdocs.yml index 25c830cc8..b62e815f9 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,4 +1,4 @@ -site_name: '' +site_name: 'DataChain' site_url: https://docs.datachain.ai site_description: Wrangle unstructured AI data at scale @@ -16,7 +16,7 @@ validation: theme: name: material custom_dir: docs/overrides - logo: assets/datachain-white.svg + logo: assets/datachain.svg favicon: assets/datachain.svg icon: repo: fontawesome/brands/github @@ -63,8 +63,14 @@ theme: name: Switch to system preference nav: - - Home: index.md - - API reference: + - Home: + - 🔗 Welcome to DataChain: index.md + - 🏃🏼‍♂️ Quick Start: quick-start.md + - 🎯 Examples: examples.md + - 📚 Tutorials: tutorials.md + - 🐍 API Reference: references/index.md + - 🤝 Contributing: contributing.md + - API Reference: - references/index.md - references/datachain.md - references/datatype.md @@ -73,6 +79,7 @@ nav: - references/torch.md - references/sql.md - DataChain Website: https://datachain.ai" target="_blank" + - Studio: https://studio.datachain.ai" target="_blank" markdown_extensions: - abbr From 2954ce5319a8009707420c540cd384287d74e051 Mon Sep 17 00:00:00 2001 From: Helio Machado <0x2b3bfa0+git@googlemail.com> Date: Mon, 9 Dec 2024 09:44:59 +0100 Subject: [PATCH 05/10] Allow testing external contributions using secrets (#667) --- .github/workflows/tests.yml | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml 
b/.github/workflows/tests.yml index bd4dcba7e..eeec26b61 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -3,7 +3,7 @@ name: Tests on: push: branches: [main] - pull_request: + pull_request_target: workflow_dispatch: env: @@ -14,13 +14,22 @@ concurrency: cancel-in-progress: true jobs: + authorize: + environment: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository && 'external' || 'internal' }} + runs-on: ubuntu-latest + steps: + - run: true + lint: + needs: authorize + runs-on: ubuntu-latest steps: - name: Check out the repository uses: actions/checkout@v4 with: fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha || github.ref }} - name: Set up Python 3.9 uses: actions/setup-python@v5 @@ -53,6 +62,8 @@ jobs: run: nox -s lint datachain: + needs: authorize + timeout-minutes: 40 runs-on: ${{ matrix.os }} strategy: @@ -75,6 +86,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha || github.ref }} - name: Set up Python ${{ matrix.pyv }} uses: actions/setup-python@v5 @@ -117,6 +129,8 @@ jobs: run: nox -s docs examples: + needs: authorize + runs-on: ${{ matrix.os }} timeout-minutes: 60 strategy: @@ -132,9 +146,10 @@ jobs: - {os: ubuntu-latest-4-cores, pyv: "3.9", group: multimodal} - {os: ubuntu-latest-4-cores, pyv: "3.12", group: multimodal} - steps: - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha || github.ref }} - name: Set up Python ${{ matrix.pyv }} uses: actions/setup-python@v5 From 0bf3134eba12e9f8cbe1f2d5a1210222c5c34b1c Mon Sep 17 00:00:00 2001 From: Vladimir Rudnykh Date: Mon, 9 Dec 2024 17:02:33 +0700 Subject: [PATCH 06/10] Fix readme (#674) --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index a79b48f86..ca670d669 100644 --- a/README.rst +++ b/README.rst @@ -82,7 +82,7 @@ Community and Support DataChain Studio Platform 
-======================== +========================= `DataChain Studio`_ is a proprietary solution for teams that offers: From 9c685b289039e9a8321d754359464c5f72efc51a Mon Sep 17 00:00:00 2001 From: Vladimir Rudnykh Date: Mon, 9 Dec 2024 18:05:07 +0700 Subject: [PATCH 07/10] Allow adding new signal from existing signal field (#658) --- src/datachain/lib/signal_schema.py | 34 +++++++++++++++++++++++----- tests/unit/lib/test_signal_schema.py | 5 ++++ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/src/datachain/lib/signal_schema.py b/src/datachain/lib/signal_schema.py index c851d761e..1436cc584 100644 --- a/src/datachain/lib/signal_schema.py +++ b/src/datachain/lib/signal_schema.py @@ -402,9 +402,20 @@ def _set_file_stream( if ModelStore.is_pydantic(finfo.annotation): SignalSchema._set_file_stream(getattr(obj, field), catalog, cache) - def get_column_type(self, col_name: str) -> DataType: + def get_column_type(self, col_name: str, with_subtree: bool = False) -> DataType: + """ + Returns column type by column name. + + If `with_subtree` is True, then it will return the type of the column + even if it has a subtree (e.g. model with nested fields), otherwise it will + return the type of the column (standard type field, not the model). + + If column is not found, raises `SignalResolvingError`. 
+ """ for path, _type, has_subtree, _ in self.get_flat_tree(): - if not has_subtree and DEFAULT_DELIMITER.join(path) == col_name: + if (with_subtree or not has_subtree) and DEFAULT_DELIMITER.join( + path + ) == col_name: return _type raise SignalResolvingError([col_name], "is not found") @@ -492,14 +503,25 @@ def mutate(self, args_map: dict) -> "SignalSchema": # renaming existing signal del new_values[value.name] new_values[name] = self.values[value.name] - elif isinstance(value, Func): + continue + if isinstance(value, Column): + # adding new signal from existing signal field + try: + new_values[name] = self.get_column_type( + value.name, with_subtree=True + ) + continue + except SignalResolvingError: + pass + if isinstance(value, Func): # adding new signal with function new_values[name] = value.get_result_type(self) - elif isinstance(value, ColumnElement): + continue + if isinstance(value, ColumnElement): # adding new signal new_values[name] = sql_to_python(value) - else: - new_values[name] = value + continue + new_values[name] = value return SignalSchema(new_values) diff --git a/tests/unit/lib/test_signal_schema.py b/tests/unit/lib/test_signal_schema.py index d3464a87c..7a33e11f1 100644 --- a/tests/unit/lib/test_signal_schema.py +++ b/tests/unit/lib/test_signal_schema.py @@ -703,6 +703,11 @@ def test_mutate_rename(): assert schema.values == {"new_name": str} +def test_mutate_rename_leaf(nested_file_schema): + schema = nested_file_schema.mutate({"new_name": Column("my_f__nested_file")}) + assert schema.values == {**nested_file_schema.values, "new_name": File} + + def test_mutate_new_signal(): schema = SignalSchema({"name": str}) schema = schema.mutate({"age": Column("age", Float)}) From ddee0a7c0d14e46c54e66bd47f6f7583f53a3018 Mon Sep 17 00:00:00 2001 From: LeviLovie <107021730+LeviLovie@users.noreply.github.com> Date: Mon, 9 Dec 2024 19:45:17 +0700 Subject: [PATCH 08/10] Fix a typo in `docs/examples.md` (#675) --- docs/examples.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-)

diff --git a/docs/examples.md b/docs/examples.md
index 6409cea8a..2d4362862 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -225,7 +225,7 @@ Here is an example from MS COCO “captions” JSON which employs separate secti
 }
 ```
 
-Note how complicated the setup is. Every image is references by the name, and the metadata for this file is keyed by the “id” field. This same field is references later in the “annotations’ array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array.
+Note how complicated the setup is. Every image is referenced by the name, and the metadata for this file is keyed by the “id” field. This same field is referenced later in the “annotations” array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array.
 
 However, Datachain can easily parse the entire COCO structure via several reading and merging operators:

From 200dc73f9b189b6e9a6844d74f4e3fafa67b4f13 Mon Sep 17 00:00:00 2001
From: Thomas Kunwar <20840228+yathomasi@users.noreply.github.com>
Date: Mon, 9 Dec 2024 18:36:53 +0545
Subject: [PATCH 09/10] chore: follow up docs navigation update (#673)

* chore: follow up docs navigaiton update

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* chore: update dark theme color

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 docs/contributing.md                |  4 +++
 docs/css/github-permalink-style.css | 39 +++++++++++++++++++++++++++++
 docs/examples.md                    |  3 +++
 docs/index.md                       |  5 +++-
 docs/quick-start.md                 |  4 +++
 docs/references/index.md            |  4 +++
 docs/tutorials.md                   |  4 +++
 mkdocs.yml                          | 34 +++++++++++++------------
 8 files changed, 80 insertions(+), 17 deletions(-)
 create mode 100644 docs/css/github-permalink-style.css

diff --git 
a/docs/contributing.md b/docs/contributing.md index 5d8a2e32c..ad4d22d37 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -1,3 +1,7 @@ +--- +title: Contributing +--- + # Contributor Guide Thank you for your interest in improving this project. This project is diff --git a/docs/css/github-permalink-style.css b/docs/css/github-permalink-style.css new file mode 100644 index 000000000..77ff3e827 --- /dev/null +++ b/docs/css/github-permalink-style.css @@ -0,0 +1,39 @@ +.headerlink { + --permalink-size: 16px; /* for font-relative sizes, 0.6em is a good choice */ + --permalink-spacing: 4px; + + width: calc(var(--permalink-size) + var(--permalink-spacing)); + height: var(--permalink-size); + vertical-align: middle; + background-color: var(--md-default-fg-color--lighter); + background-size: var(--permalink-size); + mask-size: var(--permalink-size); + -webkit-mask-size: var(--permalink-size); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + visibility: visible; + mask-image: url('data:image/svg+xml;utf8,'); + -webkit-mask-image: url('data:image/svg+xml;utf8,'); +} + +[id]:target .headerlink { + background-color: var(--md-typeset-a-color); +} + +.headerlink:hover { + background-color: var(--md-accent-fg-color) !important; +} + +@media screen and (min-width: 76.25em) { + h1, h2, h3, h4, h5, h6 { + display: flex; + align-items: center; + flex-direction: row; + column-gap: 0.2em; /* fixes spaces in titles */ + } + + .headerlink { + order: -1; + margin-left: calc(var(--permalink-size) * -1 - var(--permalink-spacing)) !important; + } +} diff --git a/docs/examples.md b/docs/examples.md index 2d4362862..5b0d0b236 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -1,3 +1,6 @@ +--- +title: Examples +--- # Examples diff --git a/docs/index.md b/docs/index.md index fc86d14d7..af1341677 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,3 +1,6 @@ +--- +title: Welcome to DataChain +--- # DataChain DataChain