From 7ba447743cb4dd7374de087318eaeaac010f974e Mon Sep 17 00:00:00 2001 From: Ivan Shcheklein Date: Fri, 1 Nov 2024 10:38:57 -0700 Subject: [PATCH] fix(hf): use proper source when we create a file entry (#555) * fix(hf): use proper source when we create a file entry * add more details to the unsupported PyArrow type message * add example: HF -> OpenAI -> HF -> analyze * use HF inference endpoint Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> * use to_parquet / from_parquet to preserve schema * add a bit of comments, fix them * use HF_TOKEN to run e2e HF example --------- Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> --- .github/workflows/tests.yml | 2 + examples/llm_and_nlp/hf-dataset-llm-eval.py | 59 +++++++++++++++++++++ src/datachain/client/hf.py | 1 + src/datachain/lib/arrow.py | 2 +- 4 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 examples/llm_and_nlp/hf-dataset-llm-eval.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 70e3e03f2..ef32bf47f 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -152,4 +152,6 @@ jobs: run: uv pip install nox --system - name: Run examples + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}" diff --git a/examples/llm_and_nlp/hf-dataset-llm-eval.py b/examples/llm_and_nlp/hf-dataset-llm-eval.py new file mode 100644 index 000000000..9a89e8440 --- /dev/null +++ b/examples/llm_and_nlp/hf-dataset-llm-eval.py @@ -0,0 +1,59 @@ +from huggingface_hub import InferenceClient + +from datachain import C, DataChain, DataModel + +PROMPT = """ +Was this dialog successful? Put result as a single word: Success or Failure. +Explain the reason in a few words. +""" + + +class DialogEval(DataModel): + result: str + reason: str + + +# DataChain function to evaluate a dialog. +# DataChain uses the input and result type annotations to infer the schema. 
+def eval_dialog(user_input: str, bot_response: str) -> DialogEval: + client = InferenceClient("meta-llama/Llama-3.1-70B-Instruct") + + completion = client.chat_completion( + messages=[ + { + "role": "user", + "content": f"{PROMPT}\n\nUser: {user_input}\nBot: {bot_response}", + }, + ], + response_format={"type": "json", "value": DialogEval.model_json_schema()}, + ) + + message = completion.choices[0].message + try: + return DialogEval.model_validate_json(message.content) + except ValueError: + return DialogEval(result="Error", reason="Failed to parse response.") + + +# Run HF inference in parallel for each example. +# Get the result as a Pydantic model that DataChain can understand and serialize. +# Save to HF as Parquet. Dataset can be previewed here: +# https://huggingface.co/datasets/dvcorg/test-datachain-llm-eval/viewer +( + DataChain.from_csv( + "hf://datasets/infinite-dataset-hub/MobilePlanAssistant/data.csv" + ) + .settings(parallel=10) + .map(response=eval_dialog) + .to_parquet("hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet") +) + +# Read it back to filter and show. +# It restores the Pydantic model from Parquet under the hood. 
+( + DataChain.from_parquet( + "hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet", source=False + ) + .filter(C("response.result") == "Failure") + .show(3) +) diff --git a/src/datachain/client/hf.py b/src/datachain/client/hf.py index 68b02cfa6..9a2bee234 100644 --- a/src/datachain/client/hf.py +++ b/src/datachain/client/hf.py @@ -23,6 +23,7 @@ def create_fs(cls, **kwargs) -> HfFileSystem: def info_to_file(self, v: dict[str, Any], path: str) -> File: return File( + source=self.uri, path=path, size=v["size"], version=v["last_commit"].oid, diff --git a/src/datachain/lib/arrow.py b/src/datachain/lib/arrow.py index 4048848d5..47a3418e4 100644 --- a/src/datachain/lib/arrow.py +++ b/src/datachain/lib/arrow.py @@ -175,7 +175,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa: return dict if isinstance(col_type, pa.lib.DictionaryType): return arrow_type_mapper(col_type.value_type) # type: ignore[return-value] - raise TypeError(f"{col_type!r} datatypes not supported") + raise TypeError(f"{col_type!r} datatypes not supported, column: {column}") def _nrows_file(file: File, nrows: int) -> str: