
Commit

Adding a failing test for dataset instantiation from passage level / flat data.
Mark committed May 1, 2024
1 parent 17bbff4 commit 9239cdd
Showing 6 changed files with 30 additions and 7 deletions.
5 binary files not shown.
37 changes: 30 additions & 7 deletions tests/test_models.py
@@ -1,5 +1,6 @@
from pathlib import Path
from typing import Iterable
import os

import pandas as pd
import pytest
@@ -85,6 +86,18 @@ def test_huggingface_dataset_gst() -> HuggingFaceDataset:
)


@pytest.fixture
def test_huggingface_dataset_cpr_passage_level_flat() -> HuggingFaceDataset:
    """Test HuggingFace dataset with flattened passage level schema."""
    dataset_dir = "tests/test_data/huggingface/cpr_passage_level_flat"
    dataset_files = os.listdir(dataset_dir)
    dataset = HuggingFaceDataset.from_parquet(
        path_or_paths=[os.path.join(dataset_dir, f) for f in dataset_files]
    )
    assert isinstance(dataset, HuggingFaceDataset)
    return dataset


def test_dataset_metadata_df(test_dataset):
    metadata_df = test_dataset.metadata_df

@@ -427,25 +440,35 @@ def test_dataset_from_huggingface_cpr(test_huggingface_dataset_cpr, limit):
    assert len(dataset) == limit


def test_dataset_from_huggingface_gst(test_huggingface_dataset_gst):
def test_dataset_from_huggingface_gst(
    test_huggingface_dataset_gst, test_huggingface_dataset_cpr_passage_level_flat
):
    """Test that a dataset can be created from a HuggingFace dataset."""
    # GST Dataset
    dataset = Dataset(document_model=GSTDocument)._from_huggingface_parquet(
        test_huggingface_dataset_gst
    )

    assert isinstance(dataset, Dataset)
    assert all(isinstance(doc, GSTDocument) for doc in dataset.documents)

    assert any(doc.languages is not None for doc in dataset.documents)

    # Check huggingface dataset has the same number of documents as the dataset
    assert len(dataset) == len({d["document_id"] for d in test_huggingface_dataset_gst})
    unique_document_ids = set(d["document_id"] for d in test_huggingface_dataset_gst)
    assert len(dataset) == len(unique_document_ids)

    # Check huggingface dataset has the same number of text blocks as the dataset
    assert sum(len(doc.text_blocks or []) for doc in dataset.documents) == len(
        test_huggingface_dataset_gst
    dataset_text_blocks_number = sum(
        len(doc.text_blocks or []) for doc in dataset.documents
    )
    assert dataset_text_blocks_number == len(test_huggingface_dataset_gst)

    # CPR Dataset from passage level flat dataset schema
    dataset = Dataset(document_model=CPRDocument)._from_huggingface_parquet(
        test_huggingface_dataset_cpr_passage_level_flat
    )

    assert isinstance(dataset, Dataset)
    assert all(isinstance(doc, CPRDocument) for doc in dataset.documents)


def test_dataset_indexable(test_dataset):
    """Tests that the dataset can be indexed to get documents"""
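
Note on the new assertions: they rely on the relationship between the flat, passage-level HuggingFace dataset and the reconstructed Dataset, namely that each parquet row is one passage tagged with a document_id, and documents are rebuilt by grouping rows on that id. The following is a minimal illustrative sketch of that relationship, not part of the commit; the real reconstruction happens inside Dataset._from_huggingface_parquet, and field names other than "document_id" are assumptions.

from collections import defaultdict

# Illustrative passage-level "flat" rows: one row per passage, each tagged
# with the document it belongs to. The "text" field name is an assumption.
rows = [
    {"document_id": "doc-1", "text": "First passage of doc-1."},
    {"document_id": "doc-1", "text": "Second passage of doc-1."},
    {"document_id": "doc-2", "text": "Only passage of doc-2."},
]

# Group passages by document, mirroring what the test's assertions check.
passages_by_document = defaultdict(list)
for row in rows:
    passages_by_document[row["document_id"]].append(row["text"])

# One reconstructed document per unique document_id ...
assert len(passages_by_document) == len({r["document_id"] for r in rows})
# ... and the text blocks across all documents add up to the number of rows.
assert sum(len(blocks) for blocks in passages_by_document.values()) == len(rows)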
