forked from insilicomedicine/Fair-Evaluation-BERT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_data.py
28 lines (21 loc) · 964 Bytes
/
process_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from argparse import ArgumentParser, Namespace
from typing import List, Dict

import pandas as pd

from utils import read_dataset, save_dataset
def process_dataset(test_dataset: List[Dict[str, str]], train_dataset: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Remove from the test set every record whose entity also occurs in train.

    A test record is kept only when its ``entity_text`` value does not appear
    among the ``entity_text`` values of the training records. Rows that are
    exact duplicates of an earlier row (all columns equal) are then dropped.

    Note the argument order: the test set comes first, the train set second.

    Args:
        test_dataset: Test records; each one is expected to carry an
            ``entity_text`` key.
        train_dataset: Training records; each one is expected to carry an
            ``entity_text`` key.

    Returns:
        The surviving test records as a list of dicts, in their original
        order. An empty test set yields an empty list; an empty train set
        leaves the test set unfiltered (only de-duplicated).

    Raises:
        AttributeError: If a non-empty dataset has no ``entity_text`` column.
    """
    test_df = pd.DataFrame(test_dataset)
    # A DataFrame built from no records has no columns at all, so reading
    # ``entity_text`` from it would raise; there is nothing to refine anyway.
    if test_df.empty:
        return []

    # Resolved before looking at the train set so a malformed test set is
    # still reported even when there is no training data.
    test_entities = test_df.entity_text

    train_df = pd.DataFrame(train_dataset)
    if train_df.empty:
        # No training entities, hence nothing can overlap.
        refined_set = test_df
    else:
        refined_set = test_df[~test_entities.isin(train_df.entity_text)]

    return refined_set.drop_duplicates().to_dict('records')
def get_arguments() -> Namespace:
    """Parse the command-line options of this script from ``sys.argv``.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()`` with the
        attributes ``train_data_folder``, ``test_data_folder`` and
        ``save_to``. None of the options is marked as required, so an
        omitted option is ``None``.
    """
    parser = ArgumentParser()
    parser.add_argument('--train_data_folder',
                        help='location of the training set, passed to read_dataset')
    parser.add_argument('--test_data_folder',
                        help='location of the test set, passed to read_dataset')
    parser.add_argument('--save_to',
                        help='destination of the refined test set, passed to save_dataset')
    # parse_args() returns the parsed values (a Namespace), not the parser.
    return parser.parse_args()
if __name__ == '__main__':
    # Script entry point: read both splits with utils.read_dataset, strip the
    # test records whose entity_text also occurs in train, and hand the
    # result to utils.save_dataset together with the --save_to value.
    cli_args = get_arguments()
    train_records = read_dataset(cli_args.train_data_folder)
    test_records = read_dataset(cli_args.test_data_folder)
    # process_dataset takes the test set first, the train set second.
    save_dataset(process_dataset(test_records, train_records), cli_args.save_to)