Skip to content

Commit

Permalink
rm main functions
Browse files: browse the repository at this point in the history
  • Loading branch information
TheColdIce committed Aug 26, 2024
1 parent 51357dd commit fca1f84
Show file tree
Hide file tree
Showing 2 changed files with 0 additions and 48 deletions.
41 changes: 0 additions & 41 deletions preprocess/feature_engineering.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,44 +149,3 @@ def cal_features(path_to_tx_log:str, banks=None, windows:int=1, overlap:float=0.
datasets.append((trainset, testset))

return datasets


def _split_bounds(steps, overlap):
    """Return ``(train_start, train_end, test_start, test_end)`` for a step column.

    The training window starts at the first step and the test window ends at
    the last step. Each window is trimmed by ``(1 - overlap) / 2`` of the
    total step range on its far side, so the two windows share a fraction
    ``overlap`` of the range.

    Args:
        steps: Column of transaction steps (supports ``min()`` / ``max()``).
        overlap: Fraction of the step range shared by train and test.
    """
    first_step = steps.min()
    last_step = steps.max()
    span = last_step - first_step
    train_end = first_step + span * (overlap + (1 - overlap) / 2)
    test_start = first_step + span * (1 - overlap) / 2
    return first_step, train_end, test_start, last_step


def _build_graph(df_split, bank, windows):
    """Build the node and edge tables of one split for one bank.

    Node features come from every transaction in ``df_split``; edges are
    built only from transactions where both originator and destination
    belong to ``bank``. Edge endpoints are rewritten from account ids to the
    row position of the account in the returned node table.

    Returns:
        Tuple ``(nodes, edges)`` of DataFrames.
    """
    nodes = get_nodes(df_split, bank, windows)
    # TODO: enable edges to/from the bank? The node features use these txs,
    # but it is unclear how to create an edge in this case: the edge can't be
    # connected to a node with node features (could create node features
    # based on edge txs, but then the node features and edge features will
    # look the same and some node features will be missing).
    is_internal = (df_split['bankOrig'] == bank) & (df_split['bankDest'] == bank)
    edges = get_edges(df_split[is_internal], windows, aggregated=True, directional=True)

    nodes.reset_index(inplace=True)
    node_to_index = pd.Series(nodes.index, index=nodes['account']).to_dict()
    # NOTE: in the CSV files an edge's src/dst appears to refer to the node
    # two rows above the actual node; this is only due to the header row and
    # the fact that indexing starts at 0.
    edges['src'] = edges['src'].map(node_to_index)
    edges['dst'] = edges['dst'].map(node_to_index)
    return nodes, edges


def main():
    """Create per-bank train/test node and edge CSV files from a tx log.

    Loads the transaction log of the configured dataset and, for every bank,
    splits the bank's transactions into overlapping train and test step
    ranges. It then builds node and edge tables for each split and writes
    them to ``data/<dataset>/<bank>/<split>/{nodes,edges}.csv``.
    """
    dataset = '10K_accts'
    path = f'../AMLsim/outputs/{dataset}/tx_log.csv'
    df = load_data(path)
    banks = ['handelsbanken', 'swedbank']
    overlap = 0.9  # overlap of training and testing data
    # int or list of tuples - if int then the number of windows, if list of
    # tuples then the start and end step for each window
    windows = [(0, 182), (183, 365), (0, 365)]

    for bank in banks:
        # Keep every transaction the bank takes part in, on either side.
        df_bank = df[(df['bankOrig'] == bank) | (df['bankDest'] == bank)]
        train_start, train_end, test_start, test_end = _split_bounds(df_bank['step'], overlap)
        step = df_bank['step']
        splits = {
            'train': df_bank[(step >= train_start) & (step <= train_end)],
            'test': df_bank[(step >= test_start) & (step <= test_end)],
        }

        # Build both splits before writing anything, so a failure while
        # building does not leave a half-written output directory.
        graphs = {
            split: _build_graph(df_split, bank, windows)
            for split, df_split in splits.items()
        }

        for split, (df_nodes, df_edges) in graphs.items():
            out_dir = f'data/{dataset}/{bank}/{split}'
            os.makedirs(out_dir, exist_ok=True)
            df_nodes.to_csv(f'{out_dir}/nodes.csv', index=False)
            df_edges.to_csv(f'{out_dir}/edges.csv', index=False)


# Run the feature-engineering pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
7 changes: 0 additions & 7 deletions preprocess/noise.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,3 @@ def topology_noise(nodes:pd.DataFrame, alert_members:pd.DataFrame, topologies:li
nodes.loc[nodes['account'].isin(accounts_to_flip), 'is_sar'] = 0

return nodes


def main():
    """Placeholder script entry point; intentionally does nothing."""
    pass

# Invoke the (currently empty) entry point only when run as a script.
if __name__ == '__main__':
    main()

0 comments on commit fca1f84

Please sign in to comment.