From 403c7946fce888be1230b15d7c3abd7645ca7e49 Mon Sep 17 00:00:00 2001 From: shibuiwilliam Date: Sat, 7 Dec 2024 02:19:51 +0900 Subject: [PATCH 01/10] add some missing type hint (#648) * add some missing type hint * Move import under TYPE_CHECKING --------- Co-authored-by: Vladimir Rudnykh --- src/datachain/client/fsspec.py | 6 ++++-- src/datachain/client/local.py | 13 +++++++++---- src/datachain/lib/convert/flatten.py | 12 +++++++----- src/datachain/lib/convert/unflatten.py | 4 ++-- src/datachain/lib/convert/values_to_tuples.py | 2 +- src/datachain/lib/utils.py | 2 +- src/datachain/query/dataset.py | 2 +- 7 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/datachain/client/fsspec.py b/src/datachain/client/fsspec.py index 91774e381..b03abae12 100644 --- a/src/datachain/client/fsspec.py +++ b/src/datachain/client/fsspec.py @@ -172,7 +172,7 @@ def is_root_url(cls, url) -> bool: return url == cls.PREFIX @classmethod - def get_uri(cls, name) -> "StorageURI": + def get_uri(cls, name: str) -> "StorageURI": from datachain.dataset import StorageURI return StorageURI(f"{cls.PREFIX}{name}") @@ -278,7 +278,9 @@ async def _fetch_default( ) -> None: await self._fetch_nested(start_prefix, result_queue) - async def _fetch_dir(self, prefix, pbar, result_queue: ResultQueue) -> set[str]: + async def _fetch_dir( + self, prefix: str, pbar, result_queue: ResultQueue + ) -> set[str]: path = f"{self.name}/{prefix}" infos = await self.ls_dir(path) files = [] diff --git a/src/datachain/client/local.py b/src/datachain/client/local.py index d76c53681..d654ad600 100644 --- a/src/datachain/client/local.py +++ b/src/datachain/client/local.py @@ -12,6 +12,7 @@ from .fsspec import Client if TYPE_CHECKING: + from datachain.cache import DataChainCache from datachain.dataset import StorageURI @@ -21,7 +22,11 @@ class FileClient(Client): protocol = "file" def __init__( - self, name: str, fs_kwargs: dict[str, Any], cache, use_symlinks: bool = False + self, + name: str, + fs_kwargs: 
dict[str, Any], + cache: "DataChainCache", + use_symlinks: bool = False, ) -> None: super().__init__(name, fs_kwargs, cache) self.use_symlinks = use_symlinks @@ -30,7 +35,7 @@ def url(self, path: str, expires: int = 3600, **kwargs) -> str: raise TypeError("Signed urls are not implemented for local file system") @classmethod - def get_uri(cls, name) -> "StorageURI": + def get_uri(cls, name: str) -> "StorageURI": from datachain.dataset import StorageURI return StorageURI(f'{cls.PREFIX}/{name.removeprefix("/")}') @@ -77,7 +82,7 @@ def split_url(cls, url: str) -> tuple[str, str]: return bucket, path @classmethod - def from_name(cls, name: str, cache, kwargs) -> "FileClient": + def from_name(cls, name: str, cache: "DataChainCache", kwargs) -> "FileClient": use_symlinks = kwargs.pop("use_symlinks", False) return cls(name, kwargs, cache, use_symlinks=use_symlinks) @@ -85,7 +90,7 @@ def from_name(cls, name: str, cache, kwargs) -> "FileClient": def from_source( cls, uri: str, - cache, + cache: "DataChainCache", use_symlinks: bool = False, **kwargs, ) -> "FileClient": diff --git a/src/datachain/lib/convert/flatten.py b/src/datachain/lib/convert/flatten.py index 7b73c5e2d..a35df07d9 100644 --- a/src/datachain/lib/convert/flatten.py +++ b/src/datachain/lib/convert/flatten.py @@ -1,19 +1,21 @@ +from collections.abc import Generator + from pydantic import BaseModel from datachain.lib.model_store import ModelStore -def flatten(obj: BaseModel): +def flatten(obj: BaseModel) -> tuple: return tuple(_flatten_fields_values(obj.model_fields, obj)) -def flatten_list(obj_list): +def flatten_list(obj_list: list[BaseModel]) -> tuple: return tuple( val for obj in obj_list for val in _flatten_fields_values(obj.model_fields, obj) ) -def _flatten_list_field(value: list): +def _flatten_list_field(value: list) -> list: assert isinstance(value, list) if value and ModelStore.is_pydantic(type(value[0])): return [val.model_dump() for val in value] @@ -22,7 +24,7 @@ def _flatten_list_field(value: 
list): return value -def _flatten_fields_values(fields, obj: BaseModel): +def _flatten_fields_values(fields: dict, obj: BaseModel) -> Generator: for name, f_info in fields.items(): anno = f_info.annotation # Optimization: Access attributes directly to skip the model_dump() call. @@ -40,5 +42,5 @@ def _flatten_fields_values(fields, obj: BaseModel): yield value -def _flatten(obj): +def _flatten(obj: BaseModel) -> tuple: return tuple(_flatten_fields_values(obj.model_fields, obj)) diff --git a/src/datachain/lib/convert/unflatten.py b/src/datachain/lib/convert/unflatten.py index 2a38959d8..a41a4b65a 100644 --- a/src/datachain/lib/convert/unflatten.py +++ b/src/datachain/lib/convert/unflatten.py @@ -9,12 +9,12 @@ from datachain.query.schema import DEFAULT_DELIMITER -def unflatten_to_json(model: type[BaseModel], row: Sequence[Any], pos=0) -> dict: +def unflatten_to_json(model: type[BaseModel], row: Sequence[Any], pos: int = 0) -> dict: return unflatten_to_json_pos(model, row, pos)[0] def unflatten_to_json_pos( - model: type[BaseModel], row: Sequence[Any], pos=0 + model: type[BaseModel], row: Sequence[Any], pos: int = 0 ) -> tuple[dict, int]: res = {} for name, f_info in model.model_fields.items(): diff --git a/src/datachain/lib/convert/values_to_tuples.py b/src/datachain/lib/convert/values_to_tuples.py index c75f77da7..f92206912 100644 --- a/src/datachain/lib/convert/values_to_tuples.py +++ b/src/datachain/lib/convert/values_to_tuples.py @@ -11,7 +11,7 @@ class ValuesToTupleError(DataChainParamsError): - def __init__(self, ds_name, msg): + def __init__(self, ds_name: str, msg: str): if ds_name: ds_name = f"' {ds_name}'" super().__init__(f"Cannot convert signals for dataset{ds_name}: {msg}") diff --git a/src/datachain/lib/utils.py b/src/datachain/lib/utils.py index c7d90e65a..7865ac70b 100644 --- a/src/datachain/lib/utils.py +++ b/src/datachain/lib/utils.py @@ -28,7 +28,7 @@ def __init__(self, message): class DataChainColumnError(DataChainParamsError): - def __init__(self, 
col_name, msg): + def __init__(self, col_name: str, msg: str): super().__init__(f"Error for column {col_name}: {msg}") diff --git a/src/datachain/query/dataset.py b/src/datachain/query/dataset.py index effc24caf..46058ba83 100644 --- a/src/datachain/query/dataset.py +++ b/src/datachain/query/dataset.py @@ -215,7 +215,7 @@ def query( Should return select query that calculates desired diff between dataset queries """ - def apply(self, query_generator, temp_tables: list[str]): + def apply(self, query_generator, temp_tables: list[str]) -> "StepResult": source_query = query_generator.exclude(("sys__id",)) target_query = self.dq.apply_steps().select() temp_tables.extend(self.dq.temp_table_names) From e881c1fef3724a590c1da43c9fc253b2200d1d38 Mon Sep 17 00:00:00 2001 From: Ivan Shcheklein Date: Fri, 6 Dec 2024 17:01:50 -0800 Subject: [PATCH 02/10] update readme - lower use cases to the ground (#668) * update readme - lower use cases to the ground * lint --- README.rst | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index 8cfd74d35..d2025b29d 100644 --- a/README.rst +++ b/README.rst @@ -28,16 +28,20 @@ in an internal database for easy and efficient querying. Use Cases ========= -1. **Multimodal Dataset Preparation and Curation**: ideal for organizing and - refining data in pre-training, finetuning or LLM evaluating stages. -2. **GenAI Data Analytics**: Enables advanced analytics for multimodal data and - ad-hoc analytics using LLMs. +1. **ETL.** Pythonic framework for describing and running unstructured data transformations + and enrichments, applying models to data, including LLMs. +2. **Analytics.** DataChain dataset is a table that combines all the information about data + objects in one place + it provides dataframe-like API and vecrorized engine to do analytics + on these tables at scale. +3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC). 
+ Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs. + Key Features ============ 📂 **Multimodal Dataset Versioning.** - - Version unstructured data without redundant data copies, by supporting + - Version unstructured data without moving or creating data copies, by supporting references to S3, GCP, Azure, and local file systems. - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc. - Unite files and metadata together into persistent, versioned, columnar datasets. @@ -145,7 +149,7 @@ detected are then copied to the local directory. LLM judging chatbots -============================= +==================== LLMs can work as universal classifiers. In the example below, we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free @@ -255,7 +259,7 @@ Output: Iterating over Python data structures -============================================= +===================================== In the previous examples, datasets were saved in the embedded database (`SQLite`_ in folder `.datachain` of the working directory). @@ -343,7 +347,7 @@ DataChain Studio Platform - **Centralized dataset registry** to manage data, code and dependency dependencies in one place. -- **Data Lineage** for data sources as well as direvative dataset. +- **Data Lineage** for data sources as well as derivative dataset. - **UI for Multimodal Data** like images, videos, and PDFs. - **Scalable Compute** to handle large datasets (100M+ files) and in-house AI model inference. @@ -368,7 +372,7 @@ To learn more, see the `Contributor Guide`_. 
Community and Support --------------------- -* `Docs `_ +* `Docs `_ * `File an issue`_ if you encounter any problems * `Discord Chat `_ * `Email `_ From d4cd7a8681e06f38be56c4b7f93038ac46647652 Mon Sep 17 00:00:00 2001 From: Vladimir Rudnykh Date: Mon, 9 Dec 2024 08:56:26 +0700 Subject: [PATCH 03/10] Add 'bit_hamming_distance' and 'byte_hamming_distance' SQLite functions (#669) --- src/datachain/func/__init__.py | 5 ++- src/datachain/func/numeric.py | 46 +++++++++++++++++++ src/datachain/func/string.py | 46 +++++++++++++++++++ src/datachain/sql/functions/numeric.py | 12 +++++ src/datachain/sql/functions/string.py | 12 +++++ src/datachain/sql/sqlite/base.py | 40 +++++++++++++++++ tests/unit/test_func.py | 62 +++++++++++++++++++++++++- 7 files changed, 220 insertions(+), 3 deletions(-) diff --git a/src/datachain/func/__init__.py b/src/datachain/func/__init__.py index cfbbedea0..7e77d770b 100644 --- a/src/datachain/func/__init__.py +++ b/src/datachain/func/__init__.py @@ -17,8 +17,9 @@ ) from .array import cosine_distance, euclidean_distance, length, sip_hash_64 from .conditional import greatest, least -from .numeric import bit_and, bit_or, bit_xor, int_hash_64 +from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64 from .random import rand +from .string import byte_hamming_distance from .window import window __all__ = [ @@ -26,8 +27,10 @@ "array", "avg", "bit_and", + "bit_hamming_distance", "bit_or", "bit_xor", + "byte_hamming_distance", "case", "collect", "concat", diff --git a/src/datachain/func/numeric.py b/src/datachain/func/numeric.py index a26e7ff51..7ae60460f 100644 --- a/src/datachain/func/numeric.py +++ b/src/datachain/func/numeric.py @@ -160,3 +160,49 @@ def int_hash_64(col: Union[ColT, int]) -> Func: return Func( "int_hash_64", inner=numeric.int_hash_64, cols=cols, args=args, result_type=int ) + + +def bit_hamming_distance(*args: Union[ColT, int]) -> Func: + """ + Computes the Hamming distance between the bit representations of 
two integer values. + + The Hamming distance is the number of positions at which the corresponding bits + are different. This function returns the dissimilarity between the integers, + where 0 indicates identical integers and values closer to the number of bits + in the integer indicate higher dissimilarity. + + Args: + args (str | int): Two integers to compute the Hamming distance between. + If a str is provided, it is assumed to be the name of the column. + If an int is provided, it is assumed to be an integer literal. + + Returns: + Func: A Func object that represents the Hamming distance function. + + Example: + ```py + dc.mutate( + ham_dist=func.bit_hamming_distance("embed1", 123456), + ) + ``` + + Notes: + - Result column will always be of type int. + """ + cols, func_args = [], [] + for arg in args: + if isinstance(arg, int): + func_args.append(arg) + else: + cols.append(arg) + + if len(cols) + len(func_args) != 2: + raise ValueError("bit_hamming_distance() requires exactly two arguments") + + return Func( + "bit_hamming_distance", + inner=numeric.bit_hamming_distance, + cols=cols, + args=func_args, + result_type=int, + ) diff --git a/src/datachain/func/string.py b/src/datachain/func/string.py index 33aa5a22d..448c7f447 100644 --- a/src/datachain/func/string.py +++ b/src/datachain/func/string.py @@ -152,3 +152,49 @@ def inner(arg): args = None return Func("regexp_replace", inner=inner, cols=cols, args=args, result_type=str) + + +def byte_hamming_distance(*args: Union[str, Func]) -> Func: + """ + Computes the Hamming distance between two strings. + + The Hamming distance is the number of positions at which the corresponding + characters are different. This function returns the dissimilarity between + the strings, where 0 indicates identical strings and values closer to the length + of the strings indicate higher dissimilarity. + + Args: + args (str | literal): Two strings to compute the Hamming distance between. 
+ If a str is provided, it is assumed to be the name of the column. + If a Literal is provided, it is assumed to be a string literal. + + Returns: + Func: A Func object that represents the Hamming distance function. + + Example: + ```py + dc.mutate( + ham_dist=func.byte_hamming_distance("file.phash", literal("hello")), + ) + ``` + + Notes: + - Result column will always be of type int. + """ + cols, func_args = [], [] + for arg in args: + if get_origin(arg) is literal: + func_args.append(arg) + else: + cols.append(arg) + + if len(cols) + len(func_args) != 2: + raise ValueError("byte_hamming_distance() requires exactly two arguments") + + return Func( + "byte_hamming_distance", + inner=string.byte_hamming_distance, + cols=cols, + args=func_args, + result_type=int, + ) diff --git a/src/datachain/sql/functions/numeric.py b/src/datachain/sql/functions/numeric.py index 2a7a82d6c..968eecc8d 100644 --- a/src/datachain/sql/functions/numeric.py +++ b/src/datachain/sql/functions/numeric.py @@ -35,9 +35,21 @@ class int_hash_64(GenericFunction): # noqa: N801 inherit_cache = True +class bit_hamming_distance(GenericFunction): # noqa: N801 + """ + Returns the Hamming distance between two integers. + """ + + type = Int64() + package = "numeric" + name = "hamming_distance" + inherit_cache = True + + compiler_not_implemented(bit_and) compiler_not_implemented(bit_or) compiler_not_implemented(bit_xor) compiler_not_implemented(bit_rshift) compiler_not_implemented(bit_lshift) compiler_not_implemented(int_hash_64) +compiler_not_implemented(bit_hamming_distance) diff --git a/src/datachain/sql/functions/string.py b/src/datachain/sql/functions/string.py index 4ccee8444..545c706a9 100644 --- a/src/datachain/sql/functions/string.py +++ b/src/datachain/sql/functions/string.py @@ -48,7 +48,19 @@ class replace(GenericFunction): # noqa: N801 inherit_cache = True +class byte_hamming_distance(GenericFunction): # noqa: N801 + """ + Returns the Hamming distance between two strings. 
+ """ + + type = Int64() + package = "string" + name = "hamming_distance" + inherit_cache = True + + compiler_not_implemented(length) compiler_not_implemented(split) compiler_not_implemented(regexp_replace) compiler_not_implemented(replace) +compiler_not_implemented(byte_hamming_distance) diff --git a/src/datachain/sql/sqlite/base.py b/src/datachain/sql/sqlite/base.py index d5589cf85..552d39efd 100644 --- a/src/datachain/sql/sqlite/base.py +++ b/src/datachain/sql/sqlite/base.py @@ -90,6 +90,7 @@ def setup(): compiles(string.split, "sqlite")(compile_string_split) compiles(string.regexp_replace, "sqlite")(compile_string_regexp_replace) compiles(string.replace, "sqlite")(compile_string_replace) + compiles(string.byte_hamming_distance, "sqlite")(compile_byte_hamming_distance) compiles(conditional.greatest, "sqlite")(compile_greatest) compiles(conditional.least, "sqlite")(compile_least) compiles(Values, "sqlite")(compile_values) @@ -104,6 +105,7 @@ def setup(): compiles(numeric.bit_rshift, "sqlite")(compile_bitwise_rshift) compiles(numeric.bit_lshift, "sqlite")(compile_bitwise_lshift) compiles(numeric.int_hash_64, "sqlite")(compile_int_hash_64) + compiles(numeric.bit_hamming_distance, "sqlite")(compile_bit_hamming_distance) if load_usearch_extension(sqlite3.connect(":memory:")): compiles(array.cosine_distance, "sqlite")(compile_cosine_distance_ext) @@ -191,6 +193,26 @@ def sqlite_int_hash_64(x: int) -> int: return x if x < 1 << 63 else (x & MAX_INT64) - (1 << 64) +def sqlite_bit_hamming_distance(a: int, b: int) -> int: + """Calculate the Hamming distance between two integers.""" + diff = (a & MAX_INT64) ^ (b & MAX_INT64) + if hasattr(diff, "bit_count"): + return diff.bit_count() + return bin(diff).count("1") + + +def sqlite_byte_hamming_distance(a: str, b: str) -> int: + """Calculate the Hamming distance between two strings.""" + diff = 0 + if len(a) < len(b): + diff = len(b) - len(a) + b = b[: len(a)] + elif len(b) < len(a): + diff = len(a) - len(b) + a = a[: len(b)] + 
return diff + sum(c1 != c2 for c1, c2 in zip(a, b)) + + def register_user_defined_sql_functions() -> None: # Register optional functions if we have the necessary dependencies # and otherwise register functions that will raise an exception with @@ -225,6 +247,9 @@ def create_numeric_functions(conn): "bitwise_lshift", 2, lambda a, b: a << b, deterministic=True ) conn.create_function("int_hash_64", 1, sqlite_int_hash_64, deterministic=True) + conn.create_function( + "bit_hamming_distance", 2, sqlite_bit_hamming_distance, deterministic=True + ) _registered_function_creators["numeric_functions"] = create_numeric_functions @@ -237,6 +262,9 @@ def create_string_functions(conn): conn.create_function( "regexp_replace", 3, sqlite_regexp_replace, deterministic=True ) + conn.create_function( + "byte_hamming_distance", 2, sqlite_byte_hamming_distance, deterministic=True + ) _registered_function_creators["string_functions"] = create_string_functions @@ -383,6 +411,18 @@ def compile_int_hash_64(element, compiler, **kwargs): return compiler.process(func.int_hash_64(*element.clauses.clauses), **kwargs) +def compile_bit_hamming_distance(element, compiler, **kwargs): + return compiler.process( + func.bit_hamming_distance(*element.clauses.clauses), **kwargs + ) + + +def compile_byte_hamming_distance(element, compiler, **kwargs): + return compiler.process( + func.byte_hamming_distance(*element.clauses.clauses), **kwargs + ) + + def py_json_array_length(arr): return len(orjson.loads(arr)) diff --git a/tests/unit/test_func.py b/tests/unit/test_func.py index 7ca4d3dde..fc9e93455 100644 --- a/tests/unit/test_func.py +++ b/tests/unit/test_func.py @@ -2,11 +2,20 @@ from sqlalchemy import Label from datachain import DataChain -from datachain.func import int_hash_64 +from datachain.func import ( + bit_hamming_distance, + byte_hamming_distance, + int_hash_64, + literal, +) from datachain.func.random import rand from datachain.func.string import length as strlen from datachain.lib.signal_schema 
import SignalSchema -from datachain.sql.sqlite.base import sqlite_int_hash_64 +from datachain.sql.sqlite.base import ( + sqlite_bit_hamming_distance, + sqlite_byte_hamming_distance, + sqlite_int_hash_64, +) @pytest.fixture() @@ -584,3 +593,52 @@ def test_int_hash_64_mutate(dc): 7766709361750702608, 15228578409069794350, ] + + +@pytest.mark.parametrize( + "value1,value2,distance", + [ + [0, 0, 0], + [0, 1, 1], + [2, 3, 1], + [2, 4, 2], + [0, 2**64 - 1, 64], + [-(2**63), 2**63 - 1, 64], + [-(2**63), 2**63, 0], + ], +) +def test_sqlite_bit_hamming_distance(value1, value2, distance): + assert sqlite_bit_hamming_distance(value1, value2) == distance + + +def test_bit_hamming_distance_mutate(dc): + res = ( + dc.mutate(test=bit_hamming_distance(strlen("val"), 5)) + .order_by("num") + .collect("test") + ) + assert list(res) == [1, 3, 2, 1, 0] + + +@pytest.mark.parametrize( + "value1,value2,distance", + [ + ["", "", 0], + ["", "a", 1], + ["foo", "foo", 0], + ["foo", "bar", 3], + ["foo", "foobar", 3], + ["karolin", "kathrin", 3], + ], +) +def test_sqlite_byte_hamming_distance(value1, value2, distance): + assert sqlite_byte_hamming_distance(value1, value2) == distance + + +def test_byte_hamming_distance_mutate(dc): + res = ( + dc.mutate(test=byte_hamming_distance("val", literal("xxx"))) + .order_by("num") + .collect("test") + ) + assert list(res) == [2, 1, 0, 1, 2] From 2c8226ac23b4af0505c1d33931bcb96524668b9e Mon Sep 17 00:00:00 2001 From: Thomas Kunwar <20840228+yathomasi@users.noreply.github.com> Date: Mon, 9 Dec 2024 11:35:51 +0545 Subject: [PATCH 04/10] feat: update documentation and enhance navigation structure (#670) * feat: update documentation and enhance navigation structure * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * chore: minor updates and fixes * chore: update readme (#672) * chore: update readme * chore: minor update --------- Co-authored-by: pre-commit-ci[bot] 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CONTRIBUTING.rst | 129 ------------- README.rst | 317 ++---------------------------- docs/contributing.md | 111 +++++++++++ docs/examples.md | 289 ++++++++++++++++++++++++++++ docs/index.md | 405 ++++++++++----------------------------- docs/quick-start.md | 286 +++++++++++++++++++++++++++ docs/references/index.md | 14 +- docs/tutorials.md | 5 + mkdocs.yml | 15 +- 9 files changed, 829 insertions(+), 742 deletions(-) delete mode 100644 CONTRIBUTING.rst create mode 100644 docs/contributing.md create mode 100644 docs/examples.md create mode 100644 docs/quick-start.md create mode 100644 docs/tutorials.md diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst deleted file mode 100644 index e20a29e09..000000000 --- a/CONTRIBUTING.rst +++ /dev/null @@ -1,129 +0,0 @@ -Contributor Guide -================= - -Thank you for your interest in improving this project. -This project is open-source under the `Apache 2.0 license`_ and -welcomes contributions in the form of bug reports, feature requests, and pull requests. - -Here is a list of important resources for contributors: - -- `Source Code`_ -- `Documentation`_ -- `Issue Tracker`_ -- `Code of Conduct`_ - -.. _Apache 2.0 license: https://opensource.org/licenses/Apache-2.0 -.. _Source Code: https://github.com/iterative/datachain -.. _Documentation: https://docs.dvc.ai/datachain -.. _Issue Tracker: https://github.com/iterative/datachain/issues - -How to report a bug -------------------- - -Report bugs on the `Issue Tracker`_. - -When filing an issue, make sure to answer these questions: - -- Which operating system and Python version are you using? -- Which version of this project are you using? -- What did you do? -- What did you expect to see? -- What did you see instead? - -The best way to get your bug fixed is to provide a test case, -and/or steps to reproduce the issue. - - -How to request a feature ------------------------- - -Request features on the `Issue Tracker`_. 
- - -How to set up your development environment ------------------------------------------- - -You need Python 3.8+ and the following tools: - -- Nox_ - -Install the package with development requirements: - -.. code:: console - - $ pip install nox - -.. _Nox: https://nox.thea.codes/ - - -How to test the project ------------------------ - -Run the full test suite: - -.. code:: console - - $ nox - -List the available Nox sessions: - -.. code:: console - - $ nox --list-sessions - -You can also run a specific Nox session. -For example, invoke the unit test suite like this: - -.. code:: console - - $ nox --session=tests - -Unit tests are located in the ``tests`` directory, -and are written using the pytest_ testing framework. - -.. _pytest: https://pytest.readthedocs.io/ - - -Build documentation -------------------- - -If you've made any changes to the documentation (including changes to function signatures, -class definitions, or docstrings that will appear in the API documentation), -make sure it builds successfully. - -.. code:: console - - $ nox -s docs - -In order to run this locally with hot reload on changes: - -.. code:: console - - $ mkdocs serve - - -How to submit changes ---------------------- - -Open a `pull request`_ to submit changes to this project. - -Your pull request needs to meet the following guidelines for acceptance: - -- The Nox test suite must pass without errors and warnings. -- Include unit tests. This project maintains 100% code coverage. -- If your changes add functionality, update the documentation accordingly. - -Feel free to submit early, though—we can always iterate on this. - -To run linting and code formatting checks, you can invoke a `lint` session in nox: - -.. code:: console - - $ nox -s lint - -It is recommended to open an issue before starting work on anything. -This will allow a chance to talk it over with the owners and validate your approach. - -.. _pull request: https://github.com/iterative/datachain/pulls -.. github-only -.. 
_Code of Conduct: CODE_OF_CONDUCT.rst diff --git a/README.rst b/README.rst index d2025b29d..a79b48f86 100644 --- a/README.rst +++ b/README.rst @@ -21,7 +21,7 @@ DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured data like images, audio, videos, text and PDFs. It integrates with external storage -(e.g., S3) to process data efficiently without data duplication and manages metadata +(e.g. S3) to process data efficiently without data duplication and manages metadata in an internal database for easy and efficient querying. @@ -59,289 +59,30 @@ Key Features - Pass datasets to Pytorch and Tensorflow, or export them back into storage. -Quick Start ------------ +Getting Started +=============== -.. code:: console +Visit `Quick Start `_ to get started with `DataChain` and learn more. - $ pip install datachain +Contributing +============ -Selecting files using JSON metadata -====================================== - -A storage consists of images of cats and dogs (`dog.1048.jpg`, `cat.1009.jpg`), -annotated with ground truth and model inferences in the 'json-pairs' format, -where each image has a matching JSON file like `cat.1009.json`: - -.. code:: json - - { - "class": "cat", "id": "1009", "num_annotators": 8, - "inference": {"class": "dog", "confidence": 0.68} - } - -Example of downloading only "high-confidence cat" inferred images using JSON metadata: - - -.. 
code:: py - - from datachain import Column, DataChain - - meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta") - images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg") - - images_id = images.map(id=lambda file: file.path.split('.')[-2]) - annotated = images_id.merge(meta, on="id", right_on="meta.id") - - likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \ - & (Column("meta.inference.class_") == "cat")) - likely_cats.export_files("high-confidence-cats/", signal="file") - - -Data curation with a local AI model -=================================== -Batch inference with a simple sentiment model using the `transformers` library: - -.. code:: shell - - pip install transformers - -The code below downloads files from the cloud, and applies a user-defined function -to each one of them. All files with a positive sentiment -detected are then copied to the local directory. - -.. code:: py - - from transformers import pipeline - from datachain import DataChain, Column - - classifier = pipeline("sentiment-analysis", device="cpu", - model="distilbert/distilbert-base-uncased-finetuned-sst-2-english") - - def is_positive_dialogue_ending(file) -> bool: - dialogue_ending = file.read()[-512:] - return classifier(dialogue_ending)[0]["label"] == "POSITIVE" - - chain = ( - DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", - object_name="file", type="text") - .settings(parallel=8, cache=True) - .map(is_positive=is_positive_dialogue_ending) - .save("file_response") - ) - - positive_chain = chain.filter(Column("is_positive") == True) - positive_chain.export_files("./output") - - print(f"{positive_chain.count()} files were exported") - - - -13 files were exported - -.. code:: shell - - $ ls output/datachain-demo/chatbot-KiT/ - 15.txt 20.txt 24.txt 27.txt 28.txt 29.txt 33.txt 37.txt 38.txt 43.txt ... 
- $ ls output/datachain-demo/chatbot-KiT/ | wc -l - 13 - - -LLM judging chatbots -==================== - -LLMs can work as universal classifiers. In the example below, -we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free -Mistral API key at https://console.mistral.ai - - -.. code:: shell - - $ pip install mistralai (Requires version >=1.0.0) - $ export MISTRAL_API_KEY=_your_key_ - -DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time. - -.. code:: py - - from mistralai import Mistral - from datachain import File, DataChain, Column - - PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure." - - def eval_dialogue(file: File) -> bool: - client = Mistral() - response = client.chat.complete( - model="open-mixtral-8x22b", - messages=[{"role": "system", "content": PROMPT}, - {"role": "user", "content": file.read()}]) - result = response.choices[0].message.content - return result.lower().startswith("success") - - chain = ( - DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file") - .settings(parallel=4, cache=True) - .map(is_success=eval_dialogue) - .save("mistral_files") - ) - - successful_chain = chain.filter(Column("is_success") == True) - successful_chain.export_files("./output_mistral") - - print(f"{successful_chain.count()} files were exported") - - -With the instruction above, the Mistral model considers 31/50 files to hold the successful dialogues: - -.. code:: shell - - $ ls output_mistral/datachain-demo/chatbot-KiT/ - 1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ... - $ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l - 31 - - - -Serializing Python-objects -========================== - -LLM responses may contain valuable information for analytics – such as the number of tokens used, or the -model performance parameters. 
- -Instead of extracting this information from the Mistral response data structure (class -`ChatCompletionResponse`), DataChain can serialize the entire LLM response to the internal DB: - - -.. code:: py - - from mistralai import Mistral - from mistralai.models import ChatCompletionResponse - from datachain import File, DataChain, Column - - PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure." - - def eval_dialog(file: File) -> ChatCompletionResponse: - client = MistralClient() - return client.chat( - model="open-mixtral-8x22b", - messages=[{"role": "system", "content": PROMPT}, - {"role": "user", "content": file.read()}]) - - chain = ( - DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file") - .settings(parallel=4, cache=True) - .map(response=eval_dialog) - .map(status=lambda response: response.choices[0].message.content.lower()[:7]) - .save("response") - ) - - chain.select("file.name", "status", "response.usage").show(5) - - success_rate = chain.filter(Column("status") == "success").count() / chain.count() - print(f"{100*success_rate:.1f}% dialogs were successful") - -Output: - -.. code:: shell - - file status response response response - name usage usage usage - prompt_tokens total_tokens completion_tokens - 0 1.txt success 547 548 1 - 1 10.txt failure 3576 3578 2 - 2 11.txt failure 626 628 2 - 3 12.txt failure 1144 1182 38 - 4 13.txt success 1100 1101 1 - - [Limited by 5 rows] - 64.0% dialogs were successful - - -Iterating over Python data structures -===================================== - -In the previous examples, datasets were saved in the embedded database -(`SQLite`_ in folder `.datachain` of the working directory). -These datasets were automatically versioned, and can be accessed using -`DataChain.from_dataset("dataset_name")`. - -Here is how to retrieve a saved dataset and iterate over the objects: - -.. 
code:: py - - chain = DataChain.from_dataset("response") - - # Iterating one-by-one: support out-of-memory workflow - for file, response in chain.limit(5).collect("file", "response"): - # verify the collected Python objects - assert isinstance(response, ChatCompletionResponse) - - status = response.choices[0].message.content[:7] - tokens = response.usage.total_tokens - print(f"{file.get_uri()}: {status}, file size: {file.size}, tokens: {tokens}") - -Output: - -.. code:: shell - - gs://datachain-demo/chatbot-KiT/1.txt: Success, file size: 1776, tokens: 548 - gs://datachain-demo/chatbot-KiT/10.txt: Failure, file size: 11576, tokens: 3578 - gs://datachain-demo/chatbot-KiT/11.txt: Failure, file size: 2045, tokens: 628 - gs://datachain-demo/chatbot-KiT/12.txt: Failure, file size: 3833, tokens: 1207 - gs://datachain-demo/chatbot-KiT/13.txt: Success, file size: 3657, tokens: 1101 - - -Vectorized analytics over Python objects -======================================== - -Some operations can run inside the DB without deserialization. -For instance, let's calculate the total cost of using the LLM APIs, assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M output tokens: - -.. code:: py - - chain = DataChain.from_dataset("mistral_dataset") - - cost = chain.sum("response.usage.prompt_tokens")*0.000002 \ - + chain.sum("response.usage.completion_tokens")*0.000006 - print(f"Spent ${cost:.2f} on {chain.count()} calls") - -Output: - -.. code:: shell - - Spent $0.08 on 50 calls - - -PyTorch data loader -=================== - -Chain results can be exported or passed directly to PyTorch dataloader. -For example, if we are interested in passing image and a label based on file -name suffix, the following code will do it: - -.. code:: py - - from torch.utils.data import DataLoader - from transformers import CLIPProcessor +Contributions are very welcome. To learn more, see the `Contributor Guide`_. 
- from datachain import C, DataChain - processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") +Community and Support +===================== - chain = ( - DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image") - .map(label=lambda name: name.split(".")[0], params=["file.name"]) - .select("file", "label").to_pytorch( - transform=processor.image_processor, - tokenizer=processor.tokenizer, - ) - ) - loader = DataLoader(chain, batch_size=1) +* `Docs `_ +* `File an issue`_ if you encounter any problems +* `Discord Chat `_ +* `Email `_ +* `Twitter `_ DataChain Studio Platform -------------------------- +======================== `DataChain Studio`_ is a proprietary solution for teams that offers: @@ -353,36 +94,10 @@ DataChain Studio Platform AI model inference. - **Access control** including SSO and team based collaboration. -Tutorials ---------- - -* `Getting Started`_ -* `Multimodal `_ (try in `Colab `__) -* `LLM evaluations `_ (try in `Colab `__) -* `Reading JSON metadata `_ (try in `Colab `__) - - -Contributions -------------- - -Contributions are very welcome. -To learn more, see the `Contributor Guide`_. - - -Community and Support ---------------------- - -* `Docs `_ -* `File an issue`_ if you encounter any problems -* `Discord Chat `_ -* `Email `_ -* `Twitter `_ - - .. _PyPI: https://pypi.org/ .. _file an issue: https://github.com/iterative/datachain/issues .. github-only -.. _Contributor Guide: CONTRIBUTING.rst +.. _Contributor Guide: https://docs.datachain.ai/contributing .. _Pydantic: https://github.com/pydantic/pydantic .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot .. 
_SQLite: https://www.sqlite.org/ diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 000000000..5d8a2e32c --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,111 @@ +# Contributor Guide + +Thank you for your interest in improving this project. This project is +open-source under the [Apache 2.0 +license](https://opensource.org/licenses/Apache-2.0) and welcomes +contributions in the form of bug reports, feature requests, and pull +requests. + +Here is a list of important resources for contributors: + +- [Source Code](https://github.com/iterative/datachain) +- [Documentation](https://docs.dvc.ai/datachain) +- [Issue Tracker](https://github.com/iterative/datachain/issues) +- [Code of Conduct](https://github.com/iterative/datachain?tab=coc-ov-file) + +## How to report a bug + +Report bugs on the [Issue +Tracker](https://github.com/iterative/datachain/issues). + +When filing an issue, make sure to answer these questions: + +- Which operating system and Python version are you using? +- Which version of this project are you using? +- What did you do? +- What did you expect to see? +- What did you see instead? + +The best way to get your bug fixed is to provide a test case, and/or +steps to reproduce the issue. + +## How to request a feature + +Request features on the [Issue +Tracker](https://github.com/iterative/datachain/issues). + +## How to set up your development environment + +You need Python 3.8+ and the following tools: + +- [Nox](https://nox.thea.codes/) + +Install the package with development requirements: + +``` console +$ pip install nox +``` + +## How to test the project + +Run the full test suite: + +``` console +$ nox +``` + +List the available Nox sessions: + +``` console +$ nox --list-sessions +``` + +You can also run a specific Nox session. 
For example, invoke the unit +test suite like this: + +``` console +$ nox --session=tests +``` + +Unit tests are located in the `tests` directory, and are written using +the [pytest](https://pytest.readthedocs.io/) testing framework. + +## Build documentation + +If you've made any changes to the documentation (including changes to +function signatures, class definitions, or docstrings that will appear +in the API documentation), make sure it builds successfully. + +``` console +$ nox -s docs +``` + +In order to run this locally with hot reload on changes: + +``` console +$ mkdocs serve +``` + +## How to submit changes + +Open a [pull request](https://github.com/iterative/datachain/pulls) to +submit changes to this project. + +Your pull request needs to meet the following guidelines for acceptance: + +- The Nox test suite must pass without errors and warnings. +- Include unit tests. This project maintains 100% code coverage. +- If your changes add functionality, update the documentation + accordingly. + +Feel free to submit early, though---we can always iterate on this. + +To run linting and code formatting checks, you can invoke a `lint` session in nox: + +``` console +$ nox -s lint +``` + +It is recommended to open an issue before starting work on anything. +This will allow a chance to talk it over with the owners and validate +your approach. diff --git a/docs/examples.md b/docs/examples.md new file mode 100644 index 000000000..6409cea8a --- /dev/null +++ b/docs/examples.md @@ -0,0 +1,289 @@ + +# Examples + +## DataChain Basics + +!!! example "DataChain Basics" + + Datachain is built by composing wrangling operations. + + For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. 
The below code takes images from the cloud, and applies PaliGemma model to caption the first five of them and put the results in the column “scene”: + + ```python + from datachain.lib.dc import Column, DataChain, File # (1)! + from transformers import AutoProcessor, PaliGemmaForConditionalGeneration # (2)! + + images = DataChain.from_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image") + + model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-mix-224") + processor = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224") + + def process(file: File) -> str: + image=file.read().convert("RGB") + inputs = processor(text="caption", images=image, return_tensors="pt") + generate_ids = model.generate(**inputs, max_new_tokens=100) + return processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + chain = ( + images.limit(5) + .settings(cache=True) + .map(scene=lambda file: process(file), output = str) + .save() + ) + ``` + + 1. `pip install datachain` + 2. `pip install transformers` + + Here is how we can view the results in a plot: + + ```python + import matplotlib.pyplot as plt + import re + from textwrap import wrap + + def trim_text(text): + match = re.search(r'[A-Z][^.]*\.', text) + return match.group(0) if match else '' + + images = chain.collect("file") + captions = chain.collect("scene") + _ , axes = plt.subplots(1, len(captions), figsize=(15, 5)) + + for ax, img, caption in zip(axes, images, captions): + ax.imshow(img.read(),cmap='gray') + ax.axis('off') + wrapped_caption = "\n".join(wrap(trim_text(caption), 30)) + ax.set_title(wrapped_caption, fontsize=6) + + plt.show() + ``` + + ![Untitled](assets/captioned_cartoons.png) + +If interested to see more examples, please check out the [tutorials](tutorials.md). + +### Handling Python objects + +In addition to storing primitive Python data types like strings, DataChain is also capable of using data models. 
+ +For example, most LLMs return objects that carry additional fields. If provider offers a Pydantic model for their LLM, Datachain can use it as a schema. + +In the below example, we are calling a Mixtral 8x22b model to judge the “service chatbot” dataset from [Karlsruhe Institute of Technology](https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot) and saving the results as Mistral’s *ChatCompletionResponse* objects: + +```python +# pip install mistralai +# this example requires a free Mistral API key, get yours at https://console.mistral.ai +# $ export MISTRAL_API_KEY='your key' + +import os +from datachain.lib.feature import Feature +from datachain.lib.dc import Column, DataChain +from mistralai.client import MistralClient +from mistralai.models.chat_completion import ChatMessage +from mistralai.models.chat_completion import ChatCompletionResponse as MistralModel +from datachain.lib.data_model import DataModel + +prompt = "Was this dialog successful? 
Describe the 'result' as 'Yes' or 'No' in a short JSON"
+api_key = os.environ["MISTRAL_API_KEY"]
+
+## register the data model ###
+DataModel.register(MistralModel)
+
+chain = (
+    DataChain
+    .from_storage("gs://datachain-demo/chatbot-KiT/", type="text")
+    .filter(Column("file.name").glob("*.txt"))
+    .limit(5)
+    .settings(parallel=4, cache=True)
+    .map(
+        mistral=lambda file: MistralClient(api_key=api_key).chat(
+            model="open-mixtral-8x22b",
+            response_format={"type": "json_object"},
+            messages= [
+                ChatMessage(role="system", content=f"{prompt}"),
+                ChatMessage(role="user", content=f"{file.read()}")
+            ]
+        ),
+        output=MistralModel
+    )
+    .save("dialog-rating")
+)
+
+iter = chain.collect("mistral")
+print(*map(lambda chat_response: chat_response.choices[0].message.content, iter))
+```
+
+```
+{"result": "Yes"} {"result": "No"} {"result": "No"} {"result": "Yes"}
+```
+
+If you are interested in more LLM evaluation examples for DataChain, please follow this tutorial:
+
+[https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb](https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb)
+
+### Vectorized analytics
+
+Datachain internally represents datasets as tables, so analytical queries on the chain are automatically vectorized:
+
+```python
+# continued from the previous example
+
+mistral_cost = chain.sum("mistral.usage.prompt_tokens")*0.000002 + \
+               chain.sum("mistral.usage.completion_tokens")*0.000006
+
+print(f"The cost of {chain.count()} calls to Mixtral 8x22b : ${mistral_cost:.4f}")
+```
+
+```
+The cost of 5 calls to Mixtral 8x22b : $0.0142
+```
+
+### Dataset persistence
+
+The “save” operation makes the chain dataset persistent in the current (working) directory of the query. A hidden folder `.datachain/` holds the records. 
A persistent dataset can be accessed later to start a derivative chain: + +```python +DataChain.from_dataset("rating").limit(2).save("dialog-rating") +``` + +Persistent datasets are immutable and automatically versioned. Here is how to access the dataset registry: + +```python +mydatasets = DataChain.datasets() +for ds in mydatasets.collect("dataset"): + print(f"{ds.name}@v{ds.version}") + +``` + +``` +Processed: 1 rows [00:00, 777.59 rows/s] +Generated: 14 rows [00:00, 11279.34 rows/s] +dialog-rating@v1 +dialog-rating@v2 +``` + +By default, when a saved dataset is loaded, the latest version is fetched but another version can be requested: + +```python +ds = DataChain.from_dataset("dialog-rating", version = 1) +``` + +### Chain execution, optimization and parallelism + +Datachain avoids redundant operations. Execution is triggered only when a downstream operation requests the processed results. However, it would be inefficient to run, say, LLM queries again every time you just want to collect several objects from the chain. + +“Save” operation nails execution results and automatically refers to them every time the downstream functions ask for data. Saving without an explicit name generates an auto-named dataset which serves the same purpose. + +Datachain natively supports parallelism in execution. If an API or a local model supports parallel requests, the `settings` operator can split the load across multiple workers (see the [code example above](#handling-python-objects)) + +### Reading external metadata + +It is common for AI data to come with pre-computed metadata (annotations, classes, etc). + +DataChain library understands common annotation formats (JSON, CSV, webdataset and parquet), and can unite data samples from storage with side-loaded metadata. The schema for metadata can be set explicitly or be inferred. 
+
+Here is an example of reading a simple CSV file where the schema is heuristically derived from the header:
+
+```python
+from datachain.lib.dc import DataChain
+
+uri="gs://datachain-demo/chatbot-csv/"
+csv_dataset = DataChain.from_csv(uri)
+
+print(csv_dataset.to_pandas())
+```
+
+Reading metadata from JSON format is a more complicated scenario because a JSON-annotated dataset typically references data samples in blocks within JSON files.
+
+Here is an example from MS COCO “captions” JSON which employs separate sections for image meta and captions:
+
+```json
+{
+  "images": [
+    {
+      "license": 4,
+      "file_name": "000000397133.jpg",
+      "coco_url": "http://images.cocodataset.org/val2017/000000397133.jpg",
+      "height": 427,
+      "width": 640,
+      "date_captured": "2013-11-14 17:02:52",
+      "flickr_url": "http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg",
+      "id": 397133
+    },
+    ...
+  ],
+  "annotations": [
+    {
+      "image_id" : "179765",
+      "id" : 38,
+      "caption" : "A black Honda motorcycle parked in front of a garage."
+    },
+    ...
+  ],
+  ...
+}
+```
+
+Note how complicated the setup is. Every image is referenced by its file name, and the metadata for this file is keyed by the “id” field. This same field is referenced later in the “annotations” array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array.
+ +However, Datachain can easily parse the entire COCO structure via several reading and merging operators: + +```python + +from datachain.lib.dc import Column, DataChain + +images_uri="gs://datachain-demo/coco2017/images/val/" +captions_uri="gs://datachain-demo/coco2017/annotations/captions_val2017.json" + +images = DataChain.from_storage(images_uri) +meta = DataChain.from_json(captions_uri, jmespath = "images") +captions = DataChain.from_json(captions_uri, jmespath = "annotations") + +images_meta = images.merge(meta, on="file.name", right_on="images.file_name") +captioned_images = images_meta.merge(captions, on="images.id", right_on="annotations.image_id") +``` + +The resulting dataset has image entries as files decorated with all the metadata and captions: + +```python +images_with_dogs = captioned_images.filter(Column("annotations.caption").glob("*dog*")) +images_with_dogs.select("annotations", "file.name").show() +``` + +``` + captions captions captions file + image_id id caption name +0 17029 778902 a dog jumping to catch a frisbee in a yard 000000017029.jpg +1 17029 779838 A dog jumping to catch a red frisbee in a garden 000000017029.jpg +2 17029 781941 The dog is catching the Frisbee in mid air in ... 000000017029.jpg +3 17029 782283 A dog catches a frisbee outside in the grass. 000000017029.jpg +4 17029 783543 A dog leaping to catch a red frisbee. 000000017029.jpg +5 18193 278544 A woman in green sweater eating a hotdog by ca... 000000018193.jpg +... + +[Limited by 20 rows] +``` +For in-depth review of working with JSON metadata, please follow this tutorial: + +[GitHub](https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) + +### Passing data to training + +Chain results can be exported or passed directly to Pytorch dataloader. 
For example, if we are interested in passing three columns to training, the following Pytorch code will do it: + +```python + +ds = train.select("file", "caption_choices", "label_ind").to_pytorch( + transform=preprocess, + tokenizer=clip.tokenize, +) + +loader = DataLoader(ds, batch_size=2) +optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) +train(loader, model, optimizer) +``` + +See a larger example for CLIP fine-tuning here: + +[GitHub](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) diff --git a/docs/index.md b/docs/index.md index 19067e362..fc86d14d7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,302 +1,103 @@ -# Get Started with DataChain - -🔨Wrangle unstructured AI data at scale - - -Datachain enables multimodal API calls and local AI inferences to run in parallel over many samples as chained operations. The resulting datasets can be saved, versioned, and sent directly to PyTorch and TensorFlow for training. Datachain can persist features of Python objects returned by AI models, and enables vectorized analytical operations over them. - -The typical use cases are data curation, LLM analytics and validation, image segmentation, pose detection, and GenAI alignment. Datachain is especially helpful if batch operations can be optimized – for instance, when synchronous API calls can be parallelized or where an LLM API offers batch processing. - ---- - -`pip install datachain` - ---- - -### Operation basics - -Datachain is built by composing wrangling operations. - -For example, let us consider the New Yorker Cartoon caption contest dataset, where cartoons are matched against the potential titles. Let us imagine we want to augment this dataset with synthetic scene descriptions coming from an AI model. 
The below code takes images from the cloud, and applies PaliGemma model to caption the first five of them and put the results in the column “scene”: - -```python -# -# pip install transformers -# - -from datachain.lib.dc import Column, DataChain, File -from transformers import AutoProcessor, PaliGemmaForConditionalGeneration - -images = DataChain.from_storage("gs://datachain-demo/newyorker_caption_contest/images", type="image") - -model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-mix-224") -processor = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224") - -def process(file: File) -> str: - image=file.read().convert("RGB") - inputs = processor(text="caption", images=image, return_tensors="pt") - generate_ids = model.generate(**inputs, max_new_tokens=100) - return processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - -chain = ( - images.limit(5) - .settings(cache=True) - .map(scene=lambda file: process(file), output = str) - .save() -) -``` - -Here is how we can view the results in a plot: - -```python -import matplotlib.pyplot as plt -import re -from textwrap import wrap - -def trim_text(text): - match = re.search(r'[A-Z][^.]*\.', text) - return match.group(0) if match else '' - -images = chain.collect("file") -captions = chain.collect("scene") -_ , axes = plt.subplots(1, len(captions), figsize=(15, 5)) - -for ax, img, caption in zip(axes, images, captions): - ax.imshow(img.read(),cmap='gray') - ax.axis('off') - wrapped_caption = "\n".join(wrap(trim_text(caption), 30)) - ax.set_title(wrapped_caption, fontsize=6) - -plt.show() -``` - -![Untitled](assets/captioned_cartoons.png) - -If interested to see more multimodal examples for DataChain, please follow this tutorial: - -[https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) [Google 
Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) - -### Handling Python objects - -In addition to storing primitive Python data types like strings, DataChain is also capable of using data models. - -For example, most LLMs return objects that carry additional fields. If provider offers a Pydantic model for their LLM, Datachain can use it as a schema. - -In the below example, we are calling a Mixtral 8x22b model to judge the “service chatbot” dataset from [Karlsruhe Institute of Technology](https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot) and saving the results as Mistral’s *ChatCompletionResponse* objects: - -```python -# pip install mistralai -# this example requires a free Mistral API key, get yours at https://console.mistral.ai -# $ export MISTRAL_API_KEY='your key' - -import os -from datachain.lib.feature import Feature -from datachain.lib.dc import Column, DataChain -from mistralai.client import MistralClient -from mistralai.models.chat_completion import ChatMessage -from mistralai.models.chat_completion import ChatCompletionResponse as MistralModel -from datachain.lib.data_model import DataModel - -prompt = "Was this dialog successful? 
Describe the 'result' as 'Yes' or 'No' in a short JSON" -api_key = os.environ["MISTRAL_API_KEY"] - -## register the data model ### -DataModel.register(MistralModel) - -chain = ( - DataChain - .from_storage("gs://datachain-demo/chatbot-KiT/", type="text") - .filter(Column("file.name").glob("*.txt")) - .limit(5) - .settings(parallel=4, cache=True) - .map( - mistral=lambda file: MistralClient(api_key=api_key).chat( - model="open-mixtral-8x22b", - response_format={"type": "json_object"}, - messages= [ - ChatMessage(role="system", content=f"{prompt}"), - ChatMessage(role="user", content=f"{file.read()}") - ] - ), - output=MistralModel - ) - .save("dialog-rating") -) - -**iter = chain.collect("mistral") -**print(*map(lambda chat_response: chat_response.choices[0].message.content, iter)) -``` - -``` -{"result": "Yes"} {"result": "No"} {"result": "No"} {"result": "Yes"} -``` - -If you are interested in more LLM evaluation examples for DataChain, please follow this tutorial: - -[https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb](https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb) - -### Vectorized analytics - -Datachain internally represents datasets as tables, so analytical queries on the chain are automatically vectorized: - -```python -# continued from the previous example - -mistral_cost = chain.sum("mistral.usage.prompt_tokens")*0.000002 + \ - chain.sum("mistral.usage.completion_tokens")*0.000006 - -print(f"The cost of {chain.count()} calls to Mixtral 8x22b : ${mistral_cost:.4f}") -``` - -``` -The cost of 5 calls to Mixtral 8x22b : $0.0142 -``` - -### Dataset persistence - -The “save” operation makes chain dataset persistent in the current (working) directory of the query. A hidden folder `.datachain/` holds the records. 
A persistent dataset can be accessed later to start a derivative chain: - -```python -DataChain.from_dataset("rating").limit(2).save("dialog-rating") -``` - -Persistent datasets are immutable and automatically versioned. Here is how to access the dataset registry: - -```python -mydatasets = DataChain.datasets() -for ds in mydatasets.collect("dataset"): - print(f"{ds.name}@v{ds.version}") - -``` - -``` -Processed: 1 rows [00:00, 777.59 rows/s] -Generated: 14 rows [00:00, 11279.34 rows/s] -dialog-rating@v1 -dialog-rating@v2 -``` - -By default, when a saved dataset is loaded, the latest version is fetched but another version can be requested: - -```python -ds = DataChain.from_dataset("dialog-rating", version = 1) -``` - -### Chain execution, optimization and parallelism - -Datachain avoids redundant operations. Execution is triggered only when a downstream operation requests the processed results. However, it would be inefficient to run, say, LLM queries again every time you just want to collect several objects from the chain. - -“Save” operation nails execution results and automatically refers to them every time the downstream functions ask for data. Saving without an explicit name generates an auto-named dataset which serves the same purpose. - -Datachain natively supports parallelism in execution. If an API or a local model supports parallel requests, the `settings` operator can split the load across multiple workers (see the [code example above](https://www.notion.so/DataChain-Getting-Started-3ed9414febac48f888f90cdaa2ca7667?pvs=21)) - -### Reading external metadata - -It is common for AI data to come with pre-computed metadata (annotations, classes, etc). - -DataChain library understands common annotation formats (JSON, CSV, webdataset and parquet), and can unite data samples from storage with side-loaded metadata. The schema for metadata can be set explicitly or be inferred. 
- -Here is an example of reading a simple CSV file where schema is heuristically derived from the header: - -```python -from datachain.lib.dc import DataChain - -uri="gs://datachain-demo/chatbot-csv/" -csv_dataset = DataChain.from_csv(uri) - -print(csv_dataset.to_pandas()) -``` - -Reading metadata from JSON format is a more complicated scenario because a JSON-annotated dataset typically references data samples in blocks within JSON files. - -Here is an example from MS COCO “captions” JSON which employs separate sections for image meta and captions: - -```json -{ - "images": [ - { - "license": 4, - "file_name": "000000397133.jpg", - "coco_url": "http://images.cocodataset.org/val2017/000000397133.jpg", - "height": 427, - "width": 640, - "date_captured": "2013-11-14 17:02:52", - "flickr_url": "http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg", - "id": 397133 - }, - ... - ], - "annotations": [ - { - "image_id" : "179765", - "id" : 38, - "caption" : "A black Honda motorcycle parked in front of a garage." - }, - ... - ], - ... -} -``` - -Note how complicated the setup is. Every image is references by the name, and the metadata for this file is keyed by the “id” field. This same field is references later in the “annotations’ array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array. 
- -However, Datachain can easily parse the entire COCO structure via several reading and merging operators: - -```python - -from datachain.lib.dc import Column, DataChain - -images_uri="gs://datachain-demo/coco2017/images/val/" -captions_uri="gs://datachain-demo/coco2017/annotations/captions_val2017.json" - -images = DataChain.from_storage(images_uri) -meta = DataChain.from_json(captions_uri, jmespath = "images") -captions = DataChain.from_json(captions_uri, jmespath = "annotations") - -images_meta = images.merge(meta, on="file.name", right_on="images.file_name") -captioned_images = images_meta.merge(captions, on="images.id", right_on="annotations.image_id") -``` - -The resulting dataset has image entries as files decorated with all the metadata and captions: - -```python -images_with_dogs = captioned_images.filter(Column("annotations.caption").glob("*dog*")) -images_with_dogs.select("annotations", "file.name").show() -``` - -``` - captions captions captions file - image_id id caption name -0 17029 778902 a dog jumping to catch a frisbee in a yard 000000017029.jpg -1 17029 779838 A dog jumping to catch a red frisbee in a garden 000000017029.jpg -2 17029 781941 The dog is catching the Frisbee in mid air in ... 000000017029.jpg -3 17029 782283 A dog catches a frisbee outside in the grass. 000000017029.jpg -4 17029 783543 A dog leaping to catch a red frisbee. 000000017029.jpg -5 18193 278544 A woman in green sweater eating a hotdog by ca... 000000018193.jpg -... 
- -[Limited by 20 rows] -``` -For in-depth review of working with JSON metadata, please follow this tutorial: - -[https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb](https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) - -### Passing data to training - -Chain results can be exported or passed directly to Pytorch dataloader. For example, if we are interested in passing three columns to training, the following Pytorch code will do it: - -```python - -ds = train.select("file", "caption_choices", "label_ind").to_pytorch( - transform=preprocess, - tokenizer=clip.tokenize, -) - -loader = DataLoader(ds, batch_size=2) -optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) -train(loader, model, optimizer) -``` - -See a larger example for CLIP fine-tuning here: - -[https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) +# DataChain DataChain + + + +

+ + PyPI + + + Python Version + + + Codecov + + + Tests + +

+ +

+🔨 Wrangle unstructured AI data at scale +

+
+
+DataChain is a Python-based AI-data warehouse for transforming and
+analyzing unstructured data like images, audio, videos, text and PDFs.
+It integrates with external storage (e.g. S3, GCP, Azure, HuggingFace) to process data
+efficiently without data duplication and manages metadata in an internal
+database for easy and efficient querying.
+
+## Use Cases
+
+1. **ETL.** Pythonic framework for describing and running unstructured
+   data transformations and enrichments, applying models to data,
+   including LLMs.
+2. **Analytics.** A DataChain dataset is a table that combines all the
+   information about data objects in one place + it provides a
+   dataframe-like API and a vectorized engine to do analytics on these
+   tables at scale.
+3. **Versioning.** DataChain doesn't store, require moving or copying
+   data (unlike DVC). The perfect use case is a bucket with thousands or
+   millions of images, videos, audio, PDFs.
+
+## Key Features
+
+📂 **Multimodal Dataset Versioning.**
+
+:   - Version unstructured data without moving or creating data
+      copies, by supporting references to S3, GCP, Azure, and local
+      file systems.
+    - Multimodal data support: images, video, text, PDFs, JSONs, CSVs,
+      parquet, etc.
+    - Unite files and metadata together into persistent, versioned,
+      columnar datasets.
+
+🐍 **Python-friendly.**
+
+:   - Operate on Python objects and object fields: float scores,
+      strings, matrices, LLM response objects.
+    - Run Python code over high-scale, terabyte-sized datasets, with
+      built-in parallelization and memory-efficient computing --- no
+      SQL or Spark required.
+
+🧠 **Data Enrichment and Processing.**
+
+:   - Generate metadata using local AI models and LLM APIs.
+    - Filter, join, and group datasets by metadata. Search by vector
+      embeddings.
+    - High-performance vectorized operations on Python objects: sum,
+      count, avg, etc.
+    - Pass datasets to Pytorch and Tensorflow, or export them back
+      into storage.
+ + +## Documentation Guide + +The following pages provide detailed documentation on DataChain's features, architecture, and usage patterns. You'll learn how to effectively use DataChain for managing and processing unstructured data at scale. + +- [🏃🏼‍♂️ Quick Start](quick-start.md): Get up and running with DataChain in no time. +- [🎯 Examples](examples.md): Explore practical examples and use cases. +- [📚 Tutorials](tutorials.md): Learn how to use DataChain for specific tasks. +- [📚 API Reference](references/index.md): Dive into the technical details and API reference. +- [🤝 Contributing](contributing.md): Learn how to contribute to DataChain. + + + + +## Open Source and Studio + +DataChain is available as an open source project and Studio as a proprietary solution for teams. + +- [DataChain Studio](https://studio.datachain.ai/): + - **Centralized dataset registry** to manage data, code and dependencies in one place. + - **Data Lineage** for data sources as well as derivative dataset. + - **UI for Multimodal Data** like images, videos, and PDFs. + - **Scalable Compute** to handle large datasets (100M+ files) and in-house AI model inference. + - **Access control** including SSO and team based collaboration. +- [DataChain Open Source](https://github.com/iterative/datachain): + - Python-based AI-data warehouse for transforming and analyzing unstructured data like images, audio, videos, text and PDFs. 
diff --git a/docs/quick-start.md b/docs/quick-start.md new file mode 100644 index 000000000..9d72a32a9 --- /dev/null +++ b/docs/quick-start.md @@ -0,0 +1,286 @@ +# Quick Start + +## Installation + +=== "pip" + + ```bash + pip install datachain + ``` + +=== "uv" + + ```bash + uv add datachain + ``` + + +## Selecting files using JSON metadata + +A storage consists of images of cats and dogs +(`dog.1048.jpg`, `cat.1009.jpg`), annotated with +ground truth and model inferences in the _`json-pairs`_ format, where +each image has a matching JSON file like `cat.1009.json`: + +``` json +{ + "class": "cat", "id": "1009", "num_annotators": 8, + "inference": {"class": "dog", "confidence": 0.68} +} +``` + +Example of downloading only _`high-confidence cat`_ inferred images +using JSON metadata: + +``` py +from datachain import Column, DataChain + +meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta") +images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg") + +images_id = images.map(id=lambda file: file.path.split('.')[-2]) +annotated = images_id.merge(meta, on="id", right_on="meta.id") + +likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \ + & (Column("meta.inference.class_") == "cat")) +likely_cats.export_files("high-confidence-cats/", signal="file") +``` + +## Data curation with a local AI model + +Batch inference with a simple sentiment model using the +`transformers` library: + +``` shell +pip install transformers +``` + +The code below downloads files from the cloud, and applies a +user-defined function to each one of them. All files with a positive +sentiment detected are then copied to the local directory. 
+ +``` py +from transformers import pipeline +from datachain import DataChain, Column + +classifier = pipeline("sentiment-analysis", device="cpu", + model="distilbert/distilbert-base-uncased-finetuned-sst-2-english") + +def is_positive_dialogue_ending(file) -> bool: + dialogue_ending = file.read()[-512:] + return classifier(dialogue_ending)[0]["label"] == "POSITIVE" + +chain = ( + DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", + object_name="file", type="text") + .settings(parallel=8, cache=True) + .map(is_positive=is_positive_dialogue_ending) + .save("file_response") +) + +positive_chain = chain.filter(Column("is_positive") == True) +positive_chain.export_files("./output") + +print(f"{positive_chain.count()} files were exported") +``` + +13 files were exported + +``` shell +$ ls output/datachain-demo/chatbot-KiT/ +15.txt 20.txt 24.txt 27.txt 28.txt 29.txt 33.txt 37.txt 38.txt 43.txt ... +$ ls output/datachain-demo/chatbot-KiT/ | wc -l +13 +``` + +## LLM judging chatbots + +LLMs can work as universal classifiers. In the example below, we employ +a free API from Mistral to judge the [publicly +available](https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot) +chatbot dialogs. Please get a free Mistral API key at + + +``` shell +$ pip install mistralai (Requires version >=1.0.0) +$ export MISTRAL_API_KEY=_your_key_ +``` + +DataChain can parallelize API calls; the free Mistral tier supports up +to 4 requests at the same time. + +``` py +from mistralai import Mistral +from datachain import File, DataChain, Column + +PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure." 
+ +def eval_dialogue(file: File) -> bool: + client = Mistral() + response = client.chat.complete( + model="open-mixtral-8x22b", + messages=[{"role": "system", "content": PROMPT}, + {"role": "user", "content": file.read()}]) + result = response.choices[0].message.content + return result.lower().startswith("success") + +chain = ( + DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file") + .settings(parallel=4, cache=True) + .map(is_success=eval_dialogue) + .save("mistral_files") +) + +successful_chain = chain.filter(Column("is_success") == True) +successful_chain.export_files("./output_mistral") + +print(f"{successful_chain.count()} files were exported") +``` + +With the instruction above, the Mistral model considers 31/50 files to +hold the successful dialogues: + +``` shell +$ ls output_mistral/datachain-demo/chatbot-KiT/ +1.txt 15.txt 18.txt 2.txt 22.txt 25.txt 28.txt 33.txt 37.txt 4.txt 41.txt ... +$ ls output_mistral/datachain-demo/chatbot-KiT/ | wc -l +31 +``` + +## Serializing Python-objects + +LLM responses may contain valuable information for analytics -- such as +the number of tokens used, or the model performance parameters. + +Instead of extracting this information from the Mistral response data +structure (class `ChatCompletionResponse`), DataChain can +serialize the entire LLM response to the internal DB: + +``` py +from mistralai import Mistral +from mistralai.models import ChatCompletionResponse +from datachain import File, DataChain, Column + +PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure." 
+
+def eval_dialog(file: File) -> ChatCompletionResponse:
+    client = Mistral()
+    return client.chat.complete(
+        model="open-mixtral-8x22b",
+        messages=[{"role": "system", "content": PROMPT},
+                  {"role": "user", "content": file.read()}])
+
+chain = (
+    DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
+    .settings(parallel=4, cache=True)
+    .map(response=eval_dialog)
+    .map(status=lambda response: response.choices[0].message.content.lower()[:7])
+    .save("response")
+)
+
+chain.select("file.name", "status", "response.usage").show(5)
+
+success_rate = chain.filter(Column("status") == "success").count() / chain.count()
+print(f"{100*success_rate:.1f}% dialogs were successful")
+```
+
+Output:
+
+``` shell
+     file   status       response     response          response
+     name                   usage        usage             usage
+                   prompt_tokens total_tokens completion_tokens
+0   1.txt  success           547          548                 1
+1  10.txt  failure          3576         3578                 2
+2  11.txt  failure           626          628                 2
+3  12.txt  failure          1144         1182                38
+4  13.txt  success          1100         1101                 1
+
+[Limited by 5 rows]
+64.0% dialogs were successful
+```
+
+## Iterating over Python data structures
+
+In the previous examples, datasets were saved in the embedded database
+(`SQLite` in folder `.datachain` of the working directory). These datasets were automatically versioned, and
+can be accessed using `DataChain.from_dataset("dataset_name")`.
+ +Here is how to retrieve a saved dataset and iterate over the objects: + +``` py +chain = DataChain.from_dataset("response") + +# Iterating one-by-one: support out-of-memory workflow +for file, response in chain.limit(5).collect("file", "response"): + # verify the collected Python objects + assert isinstance(response, ChatCompletionResponse) + + status = response.choices[0].message.content[:7] + tokens = response.usage.total_tokens + print(f"{file.get_uri()}: {status}, file size: {file.size}, tokens: {tokens}") +``` + +Output: + +``` shell +gs://datachain-demo/chatbot-KiT/1.txt: Success, file size: 1776, tokens: 548 +gs://datachain-demo/chatbot-KiT/10.txt: Failure, file size: 11576, tokens: 3578 +gs://datachain-demo/chatbot-KiT/11.txt: Failure, file size: 2045, tokens: 628 +gs://datachain-demo/chatbot-KiT/12.txt: Failure, file size: 3833, tokens: 1207 +gs://datachain-demo/chatbot-KiT/13.txt: Success, file size: 3657, tokens: 1101 +``` + +## Vectorized analytics over Python objects + +Some operations can run inside the DB without deserialization. For +instance, let's calculate the total cost of using the LLM APIs, +assuming the Mixtral call costs $2 per 1M input tokens and $6 per 1M +output tokens: + +``` py +chain = DataChain.from_dataset("mistral_dataset") + +cost = chain.sum("response.usage.prompt_tokens")*0.000002 \ + + chain.sum("response.usage.completion_tokens")*0.000006 +print(f"Spent ${cost:.2f} on {chain.count()} calls") +``` + +Output: + +``` shell +Spent $0.08 on 50 calls +``` + +## PyTorch data loader + +Chain results can be exported or passed directly to PyTorch dataloader. 
+For example, if we are interested in passing image and a label based on +file name suffix, the following code will do it: + +``` py +from torch.utils.data import DataLoader +from transformers import CLIPProcessor + +from datachain import C, DataChain + +processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + +chain = ( + DataChain.from_storage("gs://datachain-demo/dogs-and-cats/", type="image") + .map(label=lambda name: name.split(".")[0], params=["file.name"]) + .select("file", "label").to_pytorch( + transform=processor.image_processor, + tokenizer=processor.tokenizer, + ) +) + +loader = DataLoader(chain, batch_size=1) + +``` + +**See also:** + +- [Examples](examples.md) +- [Tutorials](tutorials.md) +- [API Reference](references/index.md) diff --git a/docs/references/index.md b/docs/references/index.md index 6a6ea7492..389cf366a 100644 --- a/docs/references/index.md +++ b/docs/references/index.md @@ -1,8 +1,10 @@ # API Reference -- [DataChain](./datachain.md) -- [DataType](./datatype.md) -- [File](./file.md) -- [UDF](./udf.md) -- [SQL](./sql.md) -- [Torch](./torch.md) +DataChain's API is organized into several modules: + +- [DataChain](./datachain.md) - Core chain operations and dataset management +- [DataType](./datatype.md) - Type system and schema definitions +- [File](./file.md) - File handling and storage operations +- [UDF](./udf.md) - User-defined functions and transformations +- [SQL](./sql.md) - SQL query integration +- [Torch](./torch.md) - PyTorch data loading utilities diff --git a/docs/tutorials.md b/docs/tutorials.md new file mode 100644 index 000000000..dae91d81b --- /dev/null +++ b/docs/tutorials.md @@ -0,0 +1,5 @@ +# Tutorials + +* Multimodal: [GitHub](https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb) +* LLM evaluations: 
[GitHub](https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb) +* Reading JSON metadata: [GitHub](https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) or [Google Colab](https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb) diff --git a/mkdocs.yml b/mkdocs.yml index 25c830cc8..b62e815f9 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,4 +1,4 @@ -site_name: '' +site_name: 'DataChain' site_url: https://docs.datachain.ai site_description: Wrangle unstructured AI data at scale @@ -16,7 +16,7 @@ validation: theme: name: material custom_dir: docs/overrides - logo: assets/datachain-white.svg + logo: assets/datachain.svg favicon: assets/datachain.svg icon: repo: fontawesome/brands/github @@ -63,8 +63,14 @@ theme: name: Switch to system preference nav: - - Home: index.md - - API reference: + - Home: + - 🔗 Welcome to DataChain: index.md + - 🏃🏼‍♂️ Quick Start: quick-start.md + - 🎯 Examples: examples.md + - 📚 Tutorials: tutorials.md + - 🐍 API Reference: references/index.md + - 🤝 Contributing: contributing.md + - API Reference: - references/index.md - references/datachain.md - references/datatype.md @@ -73,6 +79,7 @@ nav: - references/torch.md - references/sql.md - DataChain Website: https://datachain.ai" target="_blank" + - Studio: https://studio.datachain.ai" target="_blank" markdown_extensions: - abbr From 2954ce5319a8009707420c540cd384287d74e051 Mon Sep 17 00:00:00 2001 From: Helio Machado <0x2b3bfa0+git@googlemail.com> Date: Mon, 9 Dec 2024 09:44:59 +0100 Subject: [PATCH 05/10] Allow testing external contributions using secrets (#667) --- .github/workflows/tests.yml | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml 
b/.github/workflows/tests.yml index bd4dcba7e..eeec26b61 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -3,7 +3,7 @@ name: Tests on: push: branches: [main] - pull_request: + pull_request_target: workflow_dispatch: env: @@ -14,13 +14,22 @@ concurrency: cancel-in-progress: true jobs: + authorize: + environment: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.repo.full_name != github.repository && 'external' || 'internal' }} + runs-on: ubuntu-latest + steps: + - run: true + lint: + needs: authorize + runs-on: ubuntu-latest steps: - name: Check out the repository uses: actions/checkout@v4 with: fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha || github.ref }} - name: Set up Python 3.9 uses: actions/setup-python@v5 @@ -53,6 +62,8 @@ jobs: run: nox -s lint datachain: + needs: authorize + timeout-minutes: 40 runs-on: ${{ matrix.os }} strategy: @@ -75,6 +86,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha || github.ref }} - name: Set up Python ${{ matrix.pyv }} uses: actions/setup-python@v5 @@ -117,6 +129,8 @@ jobs: run: nox -s docs examples: + needs: authorize + runs-on: ${{ matrix.os }} timeout-minutes: 60 strategy: @@ -132,9 +146,10 @@ jobs: - {os: ubuntu-latest-4-cores, pyv: "3.9", group: multimodal} - {os: ubuntu-latest-4-cores, pyv: "3.12", group: multimodal} - steps: - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha || github.ref }} - name: Set up Python ${{ matrix.pyv }} uses: actions/setup-python@v5 From 0bf3134eba12e9f8cbe1f2d5a1210222c5c34b1c Mon Sep 17 00:00:00 2001 From: Vladimir Rudnykh Date: Mon, 9 Dec 2024 17:02:33 +0700 Subject: [PATCH 06/10] Fix readme (#674) --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index a79b48f86..ca670d669 100644 --- a/README.rst +++ b/README.rst @@ -82,7 +82,7 @@ Community and Support DataChain Studio Platform 
-======================== +========================= `DataChain Studio`_ is a proprietary solution for teams that offers: From 9c685b289039e9a8321d754359464c5f72efc51a Mon Sep 17 00:00:00 2001 From: Vladimir Rudnykh Date: Mon, 9 Dec 2024 18:05:07 +0700 Subject: [PATCH 07/10] Allow adding new signal from existing signal field (#658) --- src/datachain/lib/signal_schema.py | 34 +++++++++++++++++++++++----- tests/unit/lib/test_signal_schema.py | 5 ++++ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/src/datachain/lib/signal_schema.py b/src/datachain/lib/signal_schema.py index c851d761e..1436cc584 100644 --- a/src/datachain/lib/signal_schema.py +++ b/src/datachain/lib/signal_schema.py @@ -402,9 +402,20 @@ def _set_file_stream( if ModelStore.is_pydantic(finfo.annotation): SignalSchema._set_file_stream(getattr(obj, field), catalog, cache) - def get_column_type(self, col_name: str) -> DataType: + def get_column_type(self, col_name: str, with_subtree: bool = False) -> DataType: + """ + Returns column type by column name. + + If `with_subtree` is True, then it will return the type of the column + even if it has a subtree (e.g. model with nested fields), otherwise it will + return the type of the column (standard type field, not the model). + + If column is not found, raises `SignalResolvingError`. 
+ """ for path, _type, has_subtree, _ in self.get_flat_tree(): - if not has_subtree and DEFAULT_DELIMITER.join(path) == col_name: + if (with_subtree or not has_subtree) and DEFAULT_DELIMITER.join( + path + ) == col_name: return _type raise SignalResolvingError([col_name], "is not found") @@ -492,14 +503,25 @@ def mutate(self, args_map: dict) -> "SignalSchema": # renaming existing signal del new_values[value.name] new_values[name] = self.values[value.name] - elif isinstance(value, Func): + continue + if isinstance(value, Column): + # adding new signal from existing signal field + try: + new_values[name] = self.get_column_type( + value.name, with_subtree=True + ) + continue + except SignalResolvingError: + pass + if isinstance(value, Func): # adding new signal with function new_values[name] = value.get_result_type(self) - elif isinstance(value, ColumnElement): + continue + if isinstance(value, ColumnElement): # adding new signal new_values[name] = sql_to_python(value) - else: - new_values[name] = value + continue + new_values[name] = value return SignalSchema(new_values) diff --git a/tests/unit/lib/test_signal_schema.py b/tests/unit/lib/test_signal_schema.py index d3464a87c..7a33e11f1 100644 --- a/tests/unit/lib/test_signal_schema.py +++ b/tests/unit/lib/test_signal_schema.py @@ -703,6 +703,11 @@ def test_mutate_rename(): assert schema.values == {"new_name": str} +def test_mutate_rename_leaf(nested_file_schema): + schema = nested_file_schema.mutate({"new_name": Column("my_f__nested_file")}) + assert schema.values == {**nested_file_schema.values, "new_name": File} + + def test_mutate_new_signal(): schema = SignalSchema({"name": str}) schema = schema.mutate({"age": Column("age", Float)}) From ddee0a7c0d14e46c54e66bd47f6f7583f53a3018 Mon Sep 17 00:00:00 2001 From: LeviLovie <107021730+LeviLovie@users.noreply.github.com> Date: Mon, 9 Dec 2024 19:45:17 +0700 Subject: [PATCH 08/10] Fix a typo in `docs/examples.md` (#675) --- docs/examples.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-)

diff --git a/docs/examples.md b/docs/examples.md
index 6409cea8a..2d4362862 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -225,7 +225,7 @@ Here is an example from MS COCO “captions” JSON which employs separate secti
 }
 ```
 
-Note how complicated the setup is. Every image is references by the name, and the metadata for this file is keyed by the “id” field. This same field is references later in the “annotations’ array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array.
+Note how complicated the setup is. Every image is referenced by the name, and the metadata for this file is keyed by the “id” field. This same field is referenced later in the “annotations” array, which is present in JSON files describing captions and the detected instances. The categories for the instances are stored in the “categories” array.
 
 However, Datachain can easily parse the entire COCO structure via several reading and merging operators:

From 200dc73f9b189b6e9a6844d74f4e3fafa67b4f13 Mon Sep 17 00:00:00 2001
From: Thomas Kunwar <20840228+yathomasi@users.noreply.github.com>
Date: Mon, 9 Dec 2024 18:36:53 +0545
Subject: [PATCH 09/10] chore: follow up docs navigation update (#673)

* chore: follow up docs navigaiton update

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* chore: update dark theme color

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 docs/contributing.md                |  4 +++
 docs/css/github-permalink-style.css | 39 +++++++++++++++++++++++++++++
 docs/examples.md                    |  3 +++
 docs/index.md                       |  5 +++-
 docs/quick-start.md                 |  4 +++
 docs/references/index.md            |  4 +++
 docs/tutorials.md                   |  4 +++
 mkdocs.yml                          | 34 +++++++++++++------------
 8 files changed, 80 insertions(+), 17 deletions(-)
 create mode 100644 docs/css/github-permalink-style.css

diff --git 
a/docs/contributing.md b/docs/contributing.md index 5d8a2e32c..ad4d22d37 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -1,3 +1,7 @@ +--- +title: Contributing +--- + # Contributor Guide Thank you for your interest in improving this project. This project is diff --git a/docs/css/github-permalink-style.css b/docs/css/github-permalink-style.css new file mode 100644 index 000000000..77ff3e827 --- /dev/null +++ b/docs/css/github-permalink-style.css @@ -0,0 +1,39 @@ +.headerlink { + --permalink-size: 16px; /* for font-relative sizes, 0.6em is a good choice */ + --permalink-spacing: 4px; + + width: calc(var(--permalink-size) + var(--permalink-spacing)); + height: var(--permalink-size); + vertical-align: middle; + background-color: var(--md-default-fg-color--lighter); + background-size: var(--permalink-size); + mask-size: var(--permalink-size); + -webkit-mask-size: var(--permalink-size); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + visibility: visible; + mask-image: url('data:image/svg+xml;utf8,'); + -webkit-mask-image: url('data:image/svg+xml;utf8,'); +} + +[id]:target .headerlink { + background-color: var(--md-typeset-a-color); +} + +.headerlink:hover { + background-color: var(--md-accent-fg-color) !important; +} + +@media screen and (min-width: 76.25em) { + h1, h2, h3, h4, h5, h6 { + display: flex; + align-items: center; + flex-direction: row; + column-gap: 0.2em; /* fixes spaces in titles */ + } + + .headerlink { + order: -1; + margin-left: calc(var(--permalink-size) * -1 - var(--permalink-spacing)) !important; + } +} diff --git a/docs/examples.md b/docs/examples.md index 2d4362862..5b0d0b236 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -1,3 +1,6 @@ +--- +title: Examples +--- # Examples diff --git a/docs/index.md b/docs/index.md index fc86d14d7..af1341677 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,3 +1,6 @@ +--- +title: Welcome to DataChain +--- # DataChain DataChain