fix(hf): use proper source when we create a file entry (#555)
* fix(hf): use proper source when we create a file entry

* add more details to the unsupported PyArrow type message

* add example: HF -> OpenAI -> HF -> analyze

* use HF inference endpoint

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>

* use to_parquet / from_parquet to preserve schema

* add a bit of comments, fix them

* use HF_TOKEN to run e2e HF example

---------

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
shcheklein and lhoestq authored Nov 1, 2024
1 parent a516c94 commit 7ba4477
Showing 4 changed files with 63 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .github/workflows/tests.yml
@@ -152,4 +152,6 @@ jobs:
        run: uv pip install nox --system

      - name: Run examples
+       env:
+         HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
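For local runs, huggingface_hub picks the token up from the environment, so the example only needs HF_TOKEN to be set. A minimal sketch of failing fast when it is missing (the variable name follows the workflow above; the check itself is not part of this commit):

import os

# huggingface_hub reads HF_TOKEN from the environment by default;
# exit early with a clear message instead of failing mid-inference.
if not os.environ.get("HF_TOKEN"):
    raise SystemExit("Set HF_TOKEN to run the e2e Hugging Face example.")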
59 changes: 59 additions & 0 deletions examples/llm_and_nlp/hf-dataset-llm-eval.py
@@ -0,0 +1,59 @@
from huggingface_hub import InferenceClient

from datachain import C, DataChain, DataModel

PROMPT = """
Was this dialog successful? Put result as a single word: Success or Failure.
Explain the reason in a few words.
"""


class DialogEval(DataModel):
    result: str
    reason: str


# DataChain function to evaluate a single dialog via the HF Inference API.
# DataChain uses the input and result type hints to infer the schema automatically.
def eval_dialog(user_input: str, bot_response: str) -> DialogEval:
    client = InferenceClient("meta-llama/Llama-3.1-70B-Instruct")

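    # Ask the endpoint to reply with JSON that matches the DialogEval schema,
    # so the response can be parsed straight into the model below.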
    completion = client.chat_completion(
        messages=[
            {
                "role": "user",
                "content": f"{PROMPT}\n\nUser: {user_input}\nBot: {bot_response}",
            },
        ],
        response_format={"type": "json", "value": DialogEval.model_json_schema()},
    )

    message = completion.choices[0].message
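    # Pydantic's ValidationError subclasses ValueError, so a malformed or
    # non-JSON reply is caught below instead of crashing the whole run.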
    try:
        return DialogEval.model_validate_json(message.content)
    except ValueError:
        return DialogEval(result="Error", reason="Failed to parse response.")


# Run HF inference in parallel for each example.
# Each result is a Pydantic model that DataChain can understand and serialize.
# Save to HF as Parquet. The dataset can be previewed here:
# https://huggingface.co/datasets/dvcorg/test-datachain-llm-eval/viewer
(
    DataChain.from_csv(
        "hf://datasets/infinite-dataset-hub/MobilePlanAssistant/data.csv"
    )
    .settings(parallel=10)
    .map(response=eval_dialog)
    .to_parquet("hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet")
)

# Read it back to filter and preview.
# Under the hood, the Pydantic model is restored from the Parquet schema.
(
    DataChain.from_parquet(
        "hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet", source=False
    )
    .filter(C("response.result") == "Failure")
    .show(3)
)
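A possible follow-up (a sketch, not part of this commit): aggregate both outcomes instead of previewing only failures. count() is assumed to be available on the chain:

results = DataChain.from_parquet(
    "hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet", source=False
)
print("success:", results.filter(C("response.result") == "Success").count())
print("failure:", results.filter(C("response.result") == "Failure").count())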
1 change: 1 addition & 0 deletions src/datachain/client/hf.py
@@ -23,6 +23,7 @@ def create_fs(cls, **kwargs) -> HfFileSystem:

    def info_to_file(self, v: dict[str, Any], path: str) -> File:
        return File(
+            source=self.uri,
            path=path,
            size=v["size"],
            version=v["last_commit"].oid,
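With this fix, file entries created by the HF client carry the client URI in their source field. A minimal sketch of the resulting entry (values are hypothetical, and the import path is assumed):

from datachain.lib.file import File  # import path assumed

# Roughly what info_to_file now builds: source records the hf:// client URI
# instead of being left unset.
entry = File(
    source="hf://datasets/infinite-dataset-hub/MobilePlanAssistant",  # hypothetical
    path="data.csv",
    size=4096,          # hypothetical
    version="abc123",   # last_commit.oid, hypothetical
)
print(entry.source, entry.path)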
2 changes: 1 addition & 1 deletion src/datachain/lib/arrow.py
@@ -175,7 +175,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa:
        return dict
    if isinstance(col_type, pa.lib.DictionaryType):
        return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
-    raise TypeError(f"{col_type!r} datatypes not supported")
+    raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")


def _nrows_file(file: File, nrows: int) -> str:
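The added column name makes it much easier to locate the offending field in a wide table. A quick sketch of the new message (assuming decimal types are among the unsupported ones; chosen purely for illustration):

import pyarrow as pa

from datachain.lib.arrow import arrow_type_mapper  # module path as in the diff

try:
    arrow_type_mapper(pa.decimal128(38, 9), column="price")
except TypeError as exc:
    print(exc)  # e.g. "Decimal128Type(...) datatypes not supported, column: price"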
