Commit a30bfbe

Working commit looking at reading in from huggingface.

Mark committed May 2, 2024
1 parent 9239cdd · commit a30bfbe
Showing 26 changed files with 47 additions and 9 deletions.
16 changes: 15 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -43,6 +43,7 @@ sentence-transformers = { version = "^2.2.2", optional = true }
 torch = { version = "^2.0.0", optional = true }
 spacy = { version = "^3.5.1", optional = true }
 poetry = "^1.8.2"
+flatten-dict = "^0.4.2"
 
 [tool.poetry.extras]
 vespa = ["pyvespa", "pyyaml", "sentence-transformers", "torch"]
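The new flatten-dict entry presumably came in via poetry add flatten-dict, which updates pyproject.toml and poetry.lock together; that would account for the paired poetry.lock changes above.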
21 changes: 19 additions & 2 deletions src/cpr_sdk/models/__init__.py
@@ -13,6 +13,7 @@
     TypeVar,
     Literal,
     Annotated,
+    Iterator,
 )
 from pathlib import Path
 import datetime

@@ -35,6 +36,7 @@
 from tqdm.auto import tqdm
 import numpy as np
 import random
+from flatten_dict import unflatten as unflatten_dict
 
 from datasets import Dataset as HFDataset, DatasetInfo, load_dataset
 import cpr_sdk.data_adaptors as adaptors
@@ -1279,6 +1281,7 @@ def _from_huggingface_parquet(
         self,
         huggingface_dataset: HFDataset,
         limit: Optional[int] = None,
+        unflatten: bool = False,
     ) -> "Dataset":
         """
         Create a dataset from a huggingface dataset.

@@ -1287,9 +1290,23 @@ def _from_huggingface_parquet(
         :param limit: optionally limit the number of documents to load
         :return self: with documents loaded from huggingface dataset
         """
-        # TODO: validate that we really do have a DataFrame & not an iterator
-        hf_dataframe: pd.DataFrame = huggingface_dataset.to_pandas() # type: ignore
+        hf_dataframe = huggingface_dataset.to_pandas()
+        if not isinstance(hf_dataframe, pd.DataFrame):
+            raise ValueError(
+                "The huggingface dataset is not a DataFrame, it is a: "
+                f"{type(hf_dataframe)}."
+            )
+
+        if unflatten:
+            unflattened_columns = unflatten_dict(
+                {k: None for k in hf_dataframe.columns}, splitter="dot"
+            )
+            df_unflattened = pd.DataFrame({}, columns=unflattened_columns)
+            for indx, row in hf_dataframe.iterrows():
+                unflattened_row = unflatten_dict(row.to_dict(), splitter="dot")
+                df_unflattened.loc[indx] = pd.Series(unflattened_row)
+            hf_dataframe = df_unflattened
 
         # This undoes the renaming of columns done in to_huggingface()
         hf_dataframe = hf_dataframe.rename(columns={"document_languages": "languages"})
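For context on the new unflatten path: flatten_dict's unflatten with splitter="dot" rebuilds nested dictionaries from dot-delimited keys, which is how the flattened parquet columns get mapped back onto the nested document model. A minimal sketch of that behaviour, using illustrative column names rather than the real schema:

from flatten_dict import unflatten

# A flattened row as it might come out of the parquet-backed dataset.
# The dot-delimited keys are illustrative, not the actual schema.
flat_row = {
    "document_id": "doc-123",
    "document_metadata.geography": "FRA",
    "document_metadata.category": "Law",
    "text_block.text": "Some passage text.",
    "text_block.language": "en",
}

# splitter="dot" splits each key on "." while rebuilding the nesting;
# keys without a dot stay at the top level.
nested_row = unflatten(flat_row, splitter="dot")

print(nested_row)
# {'document_id': 'doc-123',
#  'document_metadata': {'geography': 'FRA', 'category': 'Law'},
#  'text_block': {'text': 'Some passage text.', 'language': 'en'}}

The commit applies this row by row via DataFrame.iterrows, which keeps the logic simple at the cost of speed on large datasets.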
22 binary files changed (not shown).
18 changes: 12 additions & 6 deletions tests/test_models.py
@@ -89,13 +89,19 @@ def test_huggingface_dataset_gst() -> HuggingFaceDataset:
 @pytest.fixture
 def test_huggingface_dataset_cpr_passage_level_flat() -> HuggingFaceDataset:
     """Test HuggingFace dataset with flattened passage level schema."""
+    # TODO: make sure we have some translated documents in this dataset sample
     dataset_dir = "tests/test_data/huggingface/cpr_passage_level_flat"
     dataset_files = os.listdir(dataset_dir)
-    dataset = HuggingFaceDataset.from_parquet(
-        path_or_paths=[os.path.join(dataset_dir, f) for f in dataset_files]
-    )
-    assert isinstance(dataset, HuggingFaceDataset)
-    return dataset
+    # TODO: read in each file to a df, fill missing columns with None, and concatenate
+    dfs = []
+    for f in [os.path.join(dataset_dir, f) for f in dataset_files]:
+        df = pd.read_parquet(f)
+        dfs.append(df)
+
+    df_all = pd.concat(dfs)
+
+    return HuggingFaceDataset.from_pandas(df_all)
 
 
 def test_dataset_metadata_df(test_dataset):
@@ -463,7 +469,7 @@ def test_dataset_from_huggingface_gst(
 
     # CPR Dataset from passage level flat dataset schema
     dataset = Dataset(document_model=CPRDocument)._from_huggingface_parquet(
-        test_huggingface_dataset_cpr_passage_level_flat
+        test_huggingface_dataset_cpr_passage_level_flat, unflatten=True
     )
 
     assert isinstance(dataset, Dataset)
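The rewritten fixture leans on pandas concat semantics: when the sample parquet files don't share an identical column set, pd.concat takes the union of columns and fills the gaps with NaN (not None, as the TODO above suggests). A toy sketch with made-up columns:

import pandas as pd

# Two toy frames with partially overlapping columns (made up for illustration).
df_a = pd.DataFrame({"document_id": ["a"], "text": ["hello"]})
df_b = pd.DataFrame({"document_id": ["b"], "translated": [True]})

# pd.concat takes the union of columns; values a frame lacks become NaN.
df_all = pd.concat([df_a, df_b])

print(df_all)
# Roughly:
#   document_id   text translated
# 0           a  hello        NaN
# 0           b    NaN       True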
