rm main functions

aidotse · Aug 26, 2024 · fca1f84 · fca1f84
1 parent 51357dd
commit fca1f84
Show file tree

Hide file tree

Showing 2 changed files with 0 additions and 48 deletions.
diff --git a/preprocess/feature_engineering.py b/preprocess/feature_engineering.py
@@ -149,44 +149,3 @@ def cal_features(path_to_tx_log:str, banks=None, windows:int=1, overlap:float=0.
         datasets.append((trainset, testset))
 
     return datasets
-
-
-def main():
-
-    dataset = '10K_accts'
-    path = f'../AMLsim/outputs/{dataset}/tx_log.csv'
-    df = load_data(path)
-    banks = ['handelsbanken', 'swedbank']
-    overlap = 0.9 # overlap of training and testing data
-    windows = [(0, 182), (183, 365), (0, 365)] # int or list of tuples - if int then the number of windows, if list of tuples then the start and end step for each window
-
-    for bank in banks:
-        df_bank = df[(df['bankOrig'] == bank) | (df['bankDest'] == bank)]
-        train_start = df_bank['step'].min()
-        train_end = df_bank['step'].min() + (df_bank['step'].max() - df_bank['step'].min()) * (overlap+(1-overlap)/2)
-        test_start = df_bank['step'].min() + (df_bank['step'].max() - df_bank['step'].min()) * (1-overlap)/2
-        test_end = df_bank['step'].max()
-        df_bank_train = df_bank[(df_bank['step'] >= train_start) & (df_bank['step'] <= train_end)]
-        df_bank_test = df_bank[(df_bank['step'] >= test_start) & (df_bank['step'] <= test_end)]
-        df_nodes_train = get_nodes(df_bank_train, bank, windows)
-        df_edges_train = get_edges(df_bank_train[(df_bank_train['bankOrig'] == bank) & (df_bank_train['bankDest'] == bank)], windows, aggregated=True, directional=True) # TODO: enable edges to/from the bank? The node features use these txs, but it is unclear how to create an edge in this case: the edge can't be connected to a node with node features (could create node features based on edge txs, but then the node features and edge features will look the same and some node features will be missing)
-        df_nodes_test = get_nodes(df_bank_test, bank, windows)
-        df_edges_test = get_edges(df_bank_test[(df_bank_test['bankOrig'] == bank) & (df_bank_test['bankDest'] == bank)], windows, aggregated=True, directional=True)
-        df_nodes_train.reset_index(inplace=True)
-        node_to_index = pd.Series(df_nodes_train.index, index=df_nodes_train['account']).to_dict()
-        df_edges_train['src'] = df_edges_train['src'].map(node_to_index) # OBS: in the csv files it looks like the edge src refers to the node two rows above the actual node; this is due to the column header row and to the index starting at 0
-        df_edges_train['dst'] = df_edges_train['dst'].map(node_to_index)
-        df_nodes_test.reset_index(inplace=True)
-        node_to_index = pd.Series(df_nodes_test.index, index=df_nodes_test['account']).to_dict()
-        df_edges_test['src'] = df_edges_test['src'].map(node_to_index)
-        df_edges_test['dst'] = df_edges_test['dst'].map(node_to_index)
-        os.makedirs(f'data/{dataset}/{bank}/train', exist_ok=True)
-        os.makedirs(f'data/{dataset}/{bank}/test', exist_ok=True)
-        df_nodes_train.to_csv(f'data/{dataset}/{bank}/train/nodes.csv', index=False)
-        df_edges_train.to_csv(f'data/{dataset}/{bank}/train/edges.csv', index=False)
-        df_nodes_test.to_csv(f'data/{dataset}/{bank}/test/nodes.csv', index=False)
-        df_edges_test.to_csv(f'data/{dataset}/{bank}/test/edges.csv', index=False)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/preprocess/noise.py b/preprocess/noise.py
@@ -81,10 +81,3 @@ def topology_noise(nodes:pd.DataFrame, alert_members:pd.DataFrame, topologies:li
         nodes.loc[nodes['account'].isin(accounts_to_flip), 'is_sar'] = 0
 
     return nodes
-
-
-def main():
-    pass
-
-if __name__ == '__main__':
-    main()