Skip to content

Commit

Permalink
add remaining properties to splink_datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
ADBond committed Sep 4, 2024
1 parent 10907eb commit b837676
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 1 deletion.
2 changes: 1 addition & 1 deletion splink/internals/datasets/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def __post_init__(self):
"febrl4a": _ds_febrl4a,
"febrl4b": _ds_febrl4b,
"transactions_origin": _ds_transactions_origin,
"transactions_destinations": _ds_transactions_destination,
"transactions_destination": _ds_transactions_destination,
}
dataset_labels = {
"fake_1000": _dsl_fake_1000,
Expand Down
70 changes: 70 additions & 0 deletions splink/internals/datasets/splink_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,76 @@ def historical_50k(self):
""" # NOQA: E501
return datasets["historical_50k"]

@property
@dataset_property
def febrl3(self):
    """
    FEBRL3 - one of the Freely Extensible Biomedical Record Linkage (FEBRL)
    datasets, which consist of comparison patterns from an epidemiological
    cancer study in Germany.

    The data set holds 5000 records in total: 2000 originals and 3000
    duplicates, with at most 5 duplicates based on any one original record.

    Columns:
    rec_id, given_name, surname, street_number, address_1, address_2, suburb, postcode, state, date_of_birth, soc_sec_id
    """  # NOQA: E501
    # Hand back the entry stored under this property's own name.
    entry = datasets["febrl3"]
    return entry

@property
@dataset_property
def febrl4a(self):
    """
    FEBRL4a - one of the Freely Extensible Biomedical Record Linkage (FEBRL)
    datasets, which consist of comparison patterns from an epidemiological
    cancer study in Germany.

    The data set holds 5000 original records.

    Columns:
    rec_id, given_name, surname, street_number, address_1, address_2, suburb, postcode, state, date_of_birth, soc_sec_id
    """  # NOQA: E501
    # Hand back the entry stored under this property's own name.
    entry = datasets["febrl4a"]
    return entry

@property
@dataset_property
def febrl4b(self):
    """
    FEBRL4b - one of the Freely Extensible Biomedical Record Linkage (FEBRL)
    datasets, which consist of comparison patterns from an epidemiological
    cancer study in Germany.

    The data set holds 5000 duplicate records, one for each record in
    FEBRL4a.

    Columns:
    rec_id, given_name, surname, street_number, address_1, address_2, suburb, postcode, state, date_of_birth, soc_sec_id
    """  # NOQA: E501
    # Hand back the entry stored under this property's own name.
    entry = datasets["febrl4b"]
    return entry

@property
@dataset_property
def transactions_origin(self):
    """
    This data has been generated to resemble bank transactions leaving an
    account. There are no duplicates within the dataset and each transaction
    is designed to have a counterpart arriving in 'transactions_destination'.
    Memo is sometimes truncated or missing.

    Columns:
    ground_truth, memo, transaction_date, amount, unique_id
    """
    # The key must match this property's name in the `datasets` lookup.
    return datasets["transactions_origin"]

@property
@dataset_property
def transactions_destination(self):
    """
    This data has been generated to resemble bank transactions arriving in
    an account. There are no duplicates within the dataset and each
    transaction is designed to have a counterpart sent from
    'transactions_origin'.
    There may be a delay between the source and destination account,
    and the amount may vary due to hidden fees and foreign exchange rates.
    Memo is sometimes truncated or missing.

    Columns:
    ground_truth, memo, transaction_date, amount, unique_id
    """
    # The key must match this property's name in the `datasets` lookup.
    return datasets["transactions_destination"]


class SplinkDataSetLabels:
def __init__(self):
Expand Down

0 comments on commit b837676

Please sign in to comment.