diff --git a/preprocess/feature_engineering.py b/preprocess/feature_engineering.py index 630c5f2..e960ac1 100644 --- a/preprocess/feature_engineering.py +++ b/preprocess/feature_engineering.py @@ -149,44 +149,3 @@ def cal_features(path_to_tx_log:str, banks=None, windows:int=1, overlap:float=0. datasets.append((trainset, testset)) return datasets - - -def main(): - - dataset = '10K_accts' - path = f'../AMLsim/outputs/{dataset}/tx_log.csv' - df = load_data(path) - banks = ['handelsbanken', 'swedbank'] - overlap = 0.9 # overlap of training and testing data - windows = [(0, 182), (183, 365), (0, 365)] # int or list of tuples - if int then the number of windows, if list of tuples then the start and end step for each window - - for bank in banks: - df_bank = df[(df['bankOrig'] == bank) | (df['bankDest'] == bank)] - train_start = df_bank['step'].min() - train_end = df_bank['step'].min() + (df_bank['step'].max() - df_bank['step'].min()) * (overlap+(1-overlap)/2) - test_start = df_bank['step'].min() + (df_bank['step'].max() - df_bank['step'].min()) * (1-overlap)/2 - test_end = df_bank['step'].max() - df_bank_train = df_bank[(df_bank['step'] >= train_start) & (df_bank['step'] <= train_end)] - df_bank_test = df_bank[(df_bank['step'] >= test_start) & (df_bank['step'] <= test_end)] - df_nodes_train = get_nodes(df_bank_train, bank, windows) - df_edges_train = get_edges(df_bank_train[(df_bank_train['bankOrig'] == bank) & (df_bank_train['bankDest'] == bank)], windows, aggregated=True, directional=True) # TODO: enable edges to/from the bank? the node features use these txs but unclear how to ceate a edge in this case, the edge can't be connected to a node with node features (could create node features based on edge txs, then the node features and edge features will look the same and some node features will be missing) - df_nodes_test = get_nodes(df_bank_test, bank, windows) - df_edges_test = get_edges(df_bank_test[(df_bank_test['bankOrig'] == bank) & (df_bank_test['bankDest'] == bank)], windows, aggregated=True, directional=True) - df_nodes_train.reset_index(inplace=True) - node_to_index = pd.Series(df_nodes_train.index, index=df_nodes_train['account']).to_dict() - df_edges_train['src'] = df_edges_train['src'].map(node_to_index) # OBS: in the csv files it looks like the edge src refers to the node two rows above the acculat node, this is due to the column head and that it starts counting at 0 - df_edges_train['dst'] = df_edges_train['dst'].map(node_to_index) - df_nodes_test.reset_index(inplace=True) - node_to_index = pd.Series(df_nodes_test.index, index=df_nodes_test['account']).to_dict() - df_edges_test['src'] = df_edges_test['src'].map(node_to_index) - df_edges_test['dst'] = df_edges_test['dst'].map(node_to_index) - os.makedirs(f'data/{dataset}/{bank}/train', exist_ok=True) - os.makedirs(f'data/{dataset}/{bank}/test', exist_ok=True) - df_nodes_train.to_csv(f'data/{dataset}/{bank}/train/nodes.csv', index=False) - df_edges_train.to_csv(f'data/{dataset}/{bank}/train/edges.csv', index=False) - df_nodes_test.to_csv(f'data/{dataset}/{bank}/test/nodes.csv', index=False) - df_edges_test.to_csv(f'data/{dataset}/{bank}/test/edges.csv', index=False) - - -if __name__ == "__main__": - main() diff --git a/preprocess/noise.py b/preprocess/noise.py index 9927913..91bf489 100644 --- a/preprocess/noise.py +++ b/preprocess/noise.py @@ -81,10 +81,3 @@ def topology_noise(nodes:pd.DataFrame, alert_members:pd.DataFrame, topologies:li nodes.loc[nodes['account'].isin(accounts_to_flip), 'is_sar'] = 0 return nodes - - -def main(): - pass - -if __name__ == '__main__': - main() \ No newline at end of file