Skip to content

Commit

Permalink
Configure Hugging Face cache directories for dataset preparation
Browse files Browse the repository at this point in the history
  • Loading branch information
strickvl committed Jan 27, 2025
1 parent df2de46 commit aecfe12
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 2 deletions.
7 changes: 6 additions & 1 deletion llm-finetuning/configs/generate_code_dataset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
settings:
docker:
requirements: requirements.txt
apt_packages:
- git
environment:
# Both Hugging Face cache variables point at the same directory under /tmp.
HF_HOME: "/tmp/huggingface"
HF_HUB_CACHE: "/tmp/huggingface"

# pipeline configuration
parameters:
Expand All @@ -11,4 +16,4 @@ steps:
mirror_repositories:
parameters:
repositories:
- zenml
- zenml
8 changes: 7 additions & 1 deletion llm-finetuning/steps/prepare_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,14 @@
"""

import os
from typing import Dict
from pathlib import Path

# Redirect the Hugging Face caches to a directory under /tmp and make sure it
# exists. Kept ahead of the `datasets` / `huggingface_hub` imports below, which
# presumably read these variables when they are first imported.
os.environ.update(
    HF_HOME="/tmp/huggingface",
    HF_HUB_CACHE="/tmp/huggingface",
)
os.makedirs("/tmp/huggingface", exist_ok=True)

from typing import Dict
import pandas as pd
from datasets import Dataset
from huggingface_hub import HfApi
Expand Down

0 comments on commit aecfe12

Please sign in to comment.