From 7ba447743cb4dd7374de087318eaeaac010f974e Mon Sep 17 00:00:00 2001
From: Ivan Shcheklein <shcheklein@gmail.com>
Date: Fri, 1 Nov 2024 10:38:57 -0700
Subject: [PATCH] fix(hf): use proper source when we create a file entry (#555)

* fix(hf): use proper source when we create a file entry

* add more details to the unsupported PyArrow type message

* add example: HF -> OpenAI -> HF -> analyze

* use HF inference endpoint

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>

* use to_parquet / from_parquet to preserve schema

* add a bit of comments, fix them

* use HF_TOKEN to run e2e HF example

---------

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 .github/workflows/tests.yml                 |  2 +
 examples/llm_and_nlp/hf-dataset-llm-eval.py | 59 +++++++++++++++++++++
 src/datachain/client/hf.py                  |  1 +
 src/datachain/lib/arrow.py                  |  2 +-
 4 files changed, 63 insertions(+), 1 deletion(-)
 create mode 100644 examples/llm_and_nlp/hf-dataset-llm-eval.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 70e3e03f2..ef32bf47f 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -152,4 +152,6 @@ jobs:
         run: uv pip install nox --system
 
       - name: Run examples
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
diff --git a/examples/llm_and_nlp/hf-dataset-llm-eval.py b/examples/llm_and_nlp/hf-dataset-llm-eval.py
new file mode 100644
index 000000000..9a89e8440
--- /dev/null
+++ b/examples/llm_and_nlp/hf-dataset-llm-eval.py
@@ -0,0 +1,59 @@
+from huggingface_hub import InferenceClient
+
+from datachain import C, DataChain, DataModel
+
+PROMPT = """
+Was this dialog successful? Put result as a single word: Success or Failure.
+Explain the reason in a few words.
+"""
+
+
+class DialogEval(DataModel):
+    result: str
+    reason: str
+
+
+# DataChain function that evaluates a single dialog.
+# DataChain infers the schema automatically from the input and result types.
+def eval_dialog(user_input: str, bot_response: str) -> DialogEval:
+    client = InferenceClient("meta-llama/Llama-3.1-70B-Instruct")
+
+    completion = client.chat_completion(
+        messages=[
+            {
+                "role": "user",
+                "content": f"{PROMPT}\n\nUser: {user_input}\nBot: {bot_response}",
+            },
+        ],
+        response_format={"type": "json", "value": DialogEval.model_json_schema()},
+    )
+
+    message = completion.choices[0].message
+    try:
+        return DialogEval.model_validate_json(message.content)
+    except ValueError:
+        return DialogEval(result="Error", reason="Failed to parse response.")
+
+
+# Run HF inference in parallel for each example.
+# Get the result as a Pydantic model that DataChain can understand and serialize.
+# Save to HF as Parquet. Dataset can be previewed here:
+# https://huggingface.co/datasets/dvcorg/test-datachain-llm-eval/viewer
+(
+    DataChain.from_csv(
+        "hf://datasets/infinite-dataset-hub/MobilePlanAssistant/data.csv"
+    )
+    .settings(parallel=10)
+    .map(response=eval_dialog)
+    .to_parquet("hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet")
+)
+
+# Read it back to filter and show.
+# It restores the Pydantic model from Parquet under the hood.
+(
+    DataChain.from_parquet(
+        "hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet", source=False
+    )
+    .filter(C("response.result") == "Failure")
+    .show(3)
+)
diff --git a/src/datachain/client/hf.py b/src/datachain/client/hf.py
index 68b02cfa6..9a2bee234 100644
--- a/src/datachain/client/hf.py
+++ b/src/datachain/client/hf.py
@@ -23,6 +23,7 @@ def create_fs(cls, **kwargs) -> HfFileSystem:
 
     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
+            source=self.uri,
             path=path,
             size=v["size"],
             version=v["last_commit"].oid,
diff --git a/src/datachain/lib/arrow.py b/src/datachain/lib/arrow.py
index 4048848d5..47a3418e4 100644
--- a/src/datachain/lib/arrow.py
+++ b/src/datachain/lib/arrow.py
@@ -175,7 +175,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa:
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
         return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
-    raise TypeError(f"{col_type!r} datatypes not supported")
+    raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")
 
 
 def _nrows_file(file: File, nrows: int) -> str: