add remaining properties to splink_datasets

moj-analytical-services · Sep 4, 2024 · b837676 · b837676
1 parent 10907eb
commit b837676
Show file tree

Hide file tree

Showing 2 changed files with 71 additions and 1 deletion.
diff --git a/splink/internals/datasets/metadata.py b/splink/internals/datasets/metadata.py
@@ -127,7 +127,7 @@ def __post_init__(self):
     "febrl4a": _ds_febrl4a,
     "febrl4b": _ds_febrl4b,
     "transactions_origin": _ds_transactions_origin,
-    "transactions_destinations": _ds_transactions_destination,
+    "transactions_destination": _ds_transactions_destination,
 }
 dataset_labels = {
     "fake_1000": _dsl_fake_1000,

diff --git a/splink/internals/datasets/splink_datasets.py b/splink/internals/datasets/splink_datasets.py
@@ -90,6 +90,76 @@ def historical_50k(self):
         """  # NOQA: E501
         return datasets["historical_50k"]
 
+    @property
+    @dataset_property
+    def febrl3(self):
+        """
+        The Freely Extensible Biomedical Record Linkage (FEBRL) datasets consist
+        of comparison patterns from an epidemiological cancer study in Germany.
+        The FEBRL3 data set contains 5000 records (2000 originals and 3000 duplicates),
+        with a maximum of 5 duplicates based on one original record.
+
+        Columns:
+        rec_id, given_name, surname, street_number, address_1, address_2, suburb, postcode, state, date_of_birth, soc_sec_id
+        """  # NOQA: E501
+        return datasets["febrl3"]
+
+    @property
+    @dataset_property
+    def febrl4a(self):
+        """
+        The Freely Extensible Biomedical Record Linkage (FEBRL) datasets consist
+        of comparison patterns from an epidemiological cancer study in Germany.
+        FEBRL4a contains 5000 original records.
+
+        Columns:
+        rec_id, given_name, surname, street_number, address_1, address_2, suburb, postcode, state, date_of_birth, soc_sec_id
+        """  # NOQA: E501
+        return datasets["febrl4a"]
+
+    @property
+    @dataset_property
+    def febrl4b(self):
+        """
+        The Freely Extensible Biomedical Record Linkage (FEBRL) datasets consist
+        of comparison patterns from an epidemiological cancer study in Germany.
+        FEBRL4b contains 5000 duplicate records, one for each record in FEBRL4a.
+
+        Columns:
+        rec_id, given_name, surname, street_number, address_1, address_2, suburb, postcode, state, date_of_birth, soc_sec_id
+        """  # NOQA: E501
+        return datasets["febrl4b"]
+
+    @property
+    @dataset_property
+    def transactions_origin(self):
+        """
+        This data has been generated to resemble bank transactions leaving an
+        account. There are no duplicates within the dataset and each transaction
+        is designed to have a counterpart arriving in 'transactions_destination'.
+        Memo is sometimes truncated or missing.
+
+        Columns:
+        ground_truth, memo, transaction_date, amount, unique_id
+        """
+        return datasets["transactions_origin"]
+
+    @property
+    @dataset_property
+    def transactions_destination(self):
+        """
+        This data has been generated to resemble bank transactions arriving in an
+        account. There are no duplicates within the dataset and each transaction
+        is designed to have a counterpart sent from 'transactions_origin'.
+        There may be a delay between the source and destination account,
+        and the amount may vary due to hidden fees and foreign exchange rates.
+        Memo is sometimes truncated or missing.
+
+        Columns:
+        ground_truth, memo, transaction_date, amount, unique_id
+        """
+        return datasets["transactions_destination"]
+
 
 class SplinkDataSetLabels:
     def __init__(self):