Skip to content

Commit

Permalink
Merge pull request #37 from aidotse/dev-auto-tuner
Browse files Browse the repository at this point in the history
Dev auto tuner
  • Loading branch information
johanos1 authored Sep 2, 2024
2 parents 63c9170 + 9c71700 commit 963f598
Show file tree
Hide file tree
Showing 47 changed files with 308 additions and 10,900 deletions.
3 changes: 3 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
__pycache__
*.pyc
.vscode
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
datasets
__pycache__
.vscode
*.pyc
.vscode
.venv
8 changes: 7 additions & 1 deletion AMLsim/paramFiles/10K_accts/alertPatterns.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
count,type,schedule_id,min_accounts,max_accounts,min_amount,max_amount,min_period,max_period,bank_id,is_sar,source_type
50,stack,2,10,20,100,1000,1,28,bank,True,CASH
1,fan_out,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,fan_in,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,cycle,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,bipartite,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,stack,2,5,5,100,1000,2,28,bank,True,TRANSFER
1,gather_scatter,2,6,6,100,1000,2,28,bank,True,TRANSFER
1,scatter_gather,2,6,6,100,1000,2,28,bank,True,TRANSFER
14 changes: 7 additions & 7 deletions AMLsim/paramFiles/10K_accts/conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"max_amount": 150000,
"mean_amount": 637,
"std_amount": 300,
"mean_amount_sar": 643,
"mean_amount_sar": 637,
"std_amount_sar": 300,
"prob_income": 0.0,
"mean_income": 0.0,
Expand All @@ -21,18 +21,18 @@
"std_outcome": 100.0,
"mean_outcome_sar": 500.0,
"std_outcome_sar": 100.0,
"prob_spend_cash": 0.15,
"prob_spend_cash": 0.0,
"n_steps_balance_history": 7,
"mean_phone_change_frequency": 1460,
"std_phone_change_frequency": 365,
"mean_phone_change_frequency_sar": 1330,
"std_phone_change_frequency_sar": 543,
"mean_phone_change_frequency_sar": 1460,
"std_phone_change_frequency_sar": 365,
"mean_bank_change_frequency": 1460,
"std_bank_change_frequency": 365,
"mean_bank_change_frequency_sar": 1414,
"std_bank_change_frequency_sar": 541,
"mean_bank_change_frequency_sar": 1460,
"std_bank_change_frequency_sar": 365,
"margin_ratio": 0.1,
"prob_participate_in_multiple_sars": 0.06
"prob_participate_in_multiple_sars": 0.0
},
"input": {
"directory": "paramFiles/10K_accts",
Expand Down
25 changes: 17 additions & 8 deletions AMLsim/scripts/transaction_graph_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -1229,13 +1229,19 @@ def add_edge(_orig, _bene, _amount, _date):
n_origs = random.randint(1, len(members) - 1)
origs = members[:n_origs]
benes = members[n_origs:]
for orig, bene in zip(origs, benes):
scatter_amount = RandomAmount(min_amount, max_amount).getAmount()
scatter_date = random.randrange(start_date, end_date)
add_edge(orig, mid_acct, scatter_amount, scatter_date)
gather_amount = scatter_amount - scatter_amount * self.margin_ratio
gather_date = random.randrange(scatter_date, end_date)
add_edge(mid_acct, bene, gather_amount, gather_date)
sum_gather = 0.0
last_gather_date = 0
for orig in origs:
gather_amount = RandomAmount(min_amount, max_amount).getAmount()
sum_gather += gather_amount
gather_date = random.randrange(start_date, end_date)
add_edge(orig, mid_acct, gather_amount, gather_date)
last_gather_date = max(last_gather_date, gather_date)
sum_gather *= self.margin_ratio
scatter_amount = sum_gather / len(benes)
for bene in benes:
scatter_date = random.randrange(last_gather_date, end_date)
add_edge(mid_acct, bene, scatter_amount, scatter_date)

# TODO: User-defined typology implementations go here

Expand Down Expand Up @@ -1320,7 +1326,10 @@ def get_out_edge_attrs(g, vid, name):
for n in sub_g.nodes(): # go over all nodes in the subgraph
is_main = "true" if n == main_id else "false"
is_sar = "true" if sub_g.graph[IS_SAR_KEY] else "false"
min_amt = '{:.2f}'.format(min(get_out_edge_attrs(sub_g, n, "amount")))
try:
min_amt = '{:.2f}'.format(min(get_out_edge_attrs(sub_g, n, "amount")))
except:
pass
max_amt = '{:.2f}'.format(max(get_out_edge_attrs(sub_g, n, "amount")))
min_step = start
max_step = end
Expand Down
66 changes: 55 additions & 11 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,62 @@
# Base image
FROM ubuntu:22.04

WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive

# Set the working directory
WORKDIR /flib

# Install dependencies
RUN apt-get update && apt-get install -y \
python3 \
python3-pip \
python3-dev \
python3-setuptools \
python3-wheel \
&& rm -rf /var/lib/apt/lists/*

COPY federated-learning-v2/requirements.txt .
wget \
openjdk-11-jdk \
python3.10 \
python3-pip && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Download and install Maven
RUN wget https://downloads.apache.org/maven/maven-3/3.9.6/binaries/apache-maven-3.9.6-bin.tar.gz -O - | tar xzf - -C /usr/share && \
ln -s /usr/share/apache-maven-3.9.6 /usr/share/maven && \
ln -s /usr/share/maven/bin/mvn /usr/bin/mvn

# Install java dependencies
COPY AMLsim/jars AMLsim/jars
RUN mvn install:install-file \
-Dfile=AMLsim/jars/mason.20.jar \
-DgroupId=mason \
-DartifactId=mason \
-Dversion=20 \
-Dpackaging=jar \
-DgeneratePom=true

# Set the default Python version to Python 3.10
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1

# Install Python dependencies
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

COPY federated-learning-v2/ .
# Setup AMLsim
WORKDIR /flib/AMLsim
COPY AMLsim/scripts scripts
COPY AMLsim/src src
COPY AMLsim/pom.xml pom.xml
RUN mvn clean package -DskipTests
RUN sh scripts/run.sh

# Setup preprocess
WORKDIR /flib
COPY preprocess/ preprocess/

# Setup auto-aml-data-gen
WORKDIR /flib/auto-aml-data-gen
COPY auto-aml-data-gen/classifier.py classifier.py
COPY auto-aml-data-gen/main.py main.py
COPY auto-aml-data-gen/optimizer.py optimizer.py
COPY auto-aml-data-gen/simulate.py simulate.py
COPY auto-aml-data-gen/utils.py utils.py
RUN mkdir data

RUN echo "hello"
# Run main.py with python3 when the container starts
ENTRYPOINT ["python3", "main.py"]
2 changes: 2 additions & 0 deletions auto-aml-data-gen/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
param_files
data
21 changes: 0 additions & 21 deletions auto-aml-data-gen/README.md

This file was deleted.

Empty file added auto-aml-data-gen/__init__.py
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file removed auto-aml-data-gen/__pycache__/train.cpython-37.pyc
Binary file not shown.
Binary file removed auto-aml-data-gen/__pycache__/utils.cpython-37.pyc
Binary file not shown.
10 changes: 0 additions & 10 deletions auto-aml-data-gen/best_params.txt

This file was deleted.

41 changes: 31 additions & 10 deletions auto-aml-data-gen/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,27 @@ def train(self, model='RandomForestClassifier', tune_hyperparameters=False):
self.model = grid.best_estimator_
else:
self.model = model().fit(self.X_train, self.y_train)
elif model == 'GradientBoostingClassifier':
model = getattr(sklearn.ensemble, model)
if tune_hyperparameters:
param_grid = {
'loss': ['log_loss', 'exponential'], # 'log_loss', 'exponential'
'learning_rate': [0.01, 0.1], # [0.0, inf)
'n_estimators': [100, 200], # [1, inf)
'criterion': ['friedman_mse', 'squared_error'], # 'friedman_mse', 'squared_error'
'min_samples_split': [2, 5], # [2, inf)
'min_samples_leaf': [1, 5], # [1, inf)
'min_weight_fraction_leaf': [0.0, 0.1], # [0.0, 0.5]
'max_depth': [None, 3, 5], # None or [1, inf), tune for best performance
'min_impurity_decrease': [0.0, 0.1], # [0.0, inf)
'max_leaf_nodes': [None, 10], # None or [2, inf)
'random_state': [42],
}
grid = GridSearchCV(model(), param_grid, scoring='balanced_accuracy', verbose=1, n_jobs=-1)
grid.fit(self.X_train, self.y_train)
self.model = grid.best_estimator_
else:
self.model = model().fit(self.X_train, self.y_train)
else:
self.model = model.fit(self.X_train, self.y_train)
return self.model
Expand All @@ -80,12 +101,20 @@ def train(self, model='RandomForestClassifier', tune_hyperparameters=False):
def evaluate(self, operating_recall:int=0.8):
y_pred = self.model.predict_proba(self.X_test)[:,1]
precision, recall, thresholds = precision_recall_curve(self.y_test, y_pred)
if len(thresholds) == 1: # if only one threshold, all predict_proba are the same -> fpr = 1.0
return 1.0, self.model.feature_importances_
threshold = thresholds[np.argmax(recall <= operating_recall)]
y_pred = (y_pred > threshold).astype(int)

# calc recall
recall = recall_score(self.y_test, y_pred)
print(f'Recall: {recall:.4f}')

tn, fp, fn, tp = confusion_matrix(self.y_test, y_pred).ravel()
#print(f'tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}')
fpr = fp/(fp+tp)
if tp+fp == 0:
fpr = 1.0
else:
fpr = fp/(fp+tp)
print(f'False positive rate: {fpr:.4f}')

# Print the important features
Expand All @@ -101,12 +130,4 @@ def evaluate(self, operating_recall:int=0.8):
print(f'Average importance error: {sum_avg_importance_error:.4f}')

return fpr, importances


def precision_after_recall(self, X, y_true, operating_recall=0.75):
    """Return the precision at the first curve point whose recall is <= a target.

    Scores are taken from the second column of ``self.model.predict_proba(X)``
    and passed, together with ``y_true``, to ``precision_recall_curve``.

    Args:
        X: Feature matrix forwarded unchanged to ``self.model.predict_proba``.
        y_true: Ground-truth labels for the rows of ``X``.
        operating_recall: Recall target used to pick the operating point.
            Defaults to 0.75, the value the previous implementation
            hard-coded.

    Returns:
        The entry of the precision array at the first index where the recall
        array is less than or equal to ``operating_recall``.
    """
    y_score = self.model.predict_proba(X)[:, 1]
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    # The earlier version rebound ``recall`` to the scalar 0.75 and then
    # evaluated ``recall <= recall``, which is always True, so argmax always
    # returned 0 and the target was ignored. Keep the array and the target in
    # separate names so the comparison is element-wise.
    # NOTE(review): relies on the recall array ending at 0 so that a match
    # always exists -- confirm against the scikit-learn documentation.
    idx = np.argmax(recall <= operating_recall)
    return precision[idx]

Loading

0 comments on commit 963f598

Please sign in to comment.