adding updated scripts to automate testing #22

Open · wants to merge 1 commit into base: master
5 changes: 3 additions & 2 deletions datasets.py
@@ -137,8 +137,9 @@ def prepare_bosch(dataset_folder, nrows):

    os.system("kaggle competitions download -c bosch-production-line-performance -f " +
              filename + " -p " + dataset_folder)
-   X = pd.read_csv(local_url, index_col=0, compression='zip', dtype=np.float32,
-                   nrows=nrows)
+   X = pd.read_csv(local_url, compression='zip', dtype=np.float32)
+   X = X.set_index('Id')
+   X.index = X.index.astype('int64')
    y = X.iloc[:, -1].to_numpy(dtype=np.float32)
    X.drop(X.columns[-1], axis=1, inplace=True)
    X = X.to_numpy(dtype=np.float32)
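The added lines make the Bosch 'Id' index an explicit int64 instead of relying on index_col=0 during the blanket float32 read (they also drop the nrows cap for this dataset). A minimal sketch of the new pattern in isolation; the archive path is hypothetical:

import numpy as np
import pandas as pd

# Read every column, 'Id' included, as float32 straight from the zip archive.
X = pd.read_csv("bosch/train_numeric.csv.zip", compression='zip', dtype=np.float32)
# Promote 'Id' to the index, then restore the integer type that the
# blanket float32 read discarded.
X = X.set_index('Id')
X.index = X.index.astype('int64')
# The last column is the label ('Response' in the Kaggle Bosch data).
y = X.iloc[:, -1].to_numpy(dtype=np.float32)
X = X.drop(X.columns[-1], axis=1).to_numpy(dtype=np.float32)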
54 changes: 54 additions & 0 deletions plotter.py
@@ -0,0 +1,54 @@
import os
import sys
import csv
import json
import numpy as np
import pandas as pd
import argparse
import matplotlib.pyplot as plt
import json2csv
import csv_merger

def parse_args():
    parser = argparse.ArgumentParser(
        description="Visualize benchmarks against another version")
    parser.add_argument("-d1", required=True, type=str,
                        help=("Colon-separated groups of comma-separated csv "
                              "files to merge; one plot line per group"))
    parser.add_argument("-metric", default="train_time", type=str,
                        help="The metric we want to visualize")
    parser.add_argument("-dataset", required=True, type=str,
                        help="dataset to plot")
    parser.add_argument("-title", default="graph", type=str,
                        help="The title of the graph")
    parser.add_argument("-output", default=sys.path[0] + "/results.png", type=str,
                        help="Output image file with the visualization")
    args = parser.parse_args()
    return args

def plot_error_bars(df_lis, args):
    fig, ax = plt.subplots()
    labels = ["xgb", "cat"]
    # GPU counts for the x-axis; hard-coded to the benchmark configurations.
    ngpu = [1, 2, 4, 6, 8]
    for idx, df in enumerate(df_lis):
        gp = df.groupby("dataset")
        means = gp.mean()
        # Standard deviations are computed but not drawn yet; only means are plotted.
        errors = gp.std()
        means = means.T
        # Keep only the column belonging to the requested dataset.
        means = means.drop(columns=[col for col in means.columns if col != args.dataset])
        plt.plot(ngpu, means[args.dataset].tolist(), label=labels[idx])
    ax.legend()
    plt.ylabel(args.metric)
    plt.xticks(ngpu)
    plt.title(f"{args.title} plot")
    plt.savefig(args.output)

def main():
    df_lis = []
    args = parse_args()
    groups = args.d1.split(":")
    for x in groups:
        df_lis.append(csv_merger.import_main(x))
    plot_error_bars(df_lis, args)

if __name__ == '__main__':
    main()
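A usage sketch for the plotter (file names are hypothetical; the script draws one line per colon-separated group, labelled "xgb" and "cat" in that order, against the hard-coded GPU counts 1, 2, 4, 6, 8):

python plotter.py -d1 "xgb_1gpu.csv,xgb_2gpu.csv:cat_1gpu.csv,cat_2gpu.csv" -dataset higgs -metric train_time -title "higgs train_time" -output higgs.png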
86 changes: 86 additions & 0 deletions trainer.py
@@ -0,0 +1,86 @@
import os
import sys
import argparse
import json
import ast
import psutil
import algorithms
from metrics import get_metrics
from runme import benchmark
from runme import get_number_processors
from runme import print_sys_info
from datasets import prepare_dataset

def parse_args():
    parser = argparse.ArgumentParser(
        description="Benchmark xgboost/lightgbm/catboost on real datasets")
    parser.add_argument("-dataset", default="all", type=str,
                        help="The dataset to be used for benchmarking. 'all' for all datasets.")
    parser.add_argument("-root", default="/opt/gbm-datasets",
                        type=str, help="The root datasets folder")
    parser.add_argument("-algorithm", default="all", type=str,
                        help=("Comma-separated list of algorithms to run; "
                              "'all' runs all of them"))
    parser.add_argument("-gpus", default=-1, type=int,
                        help=("#GPUs to use for the benchmarks; "
                              "ignored when not supported. Default is to use all."))
    parser.add_argument("-ngpus", default='1', type=str,
                        help=("Comma-separated list of GPU counts, one per "
                              "training cycle; overrides -gpus for each cycle."))
    parser.add_argument("-cpus", default=0, type=int,
                        help=("#CPUs to use for the benchmarks; "
                              "0 means psutil.cpu_count(logical=False)"))
    parser.add_argument("-output", default=sys.path[0] + "/results.json", type=str,
                        help="Output json file with runtime/accuracy stats")
    parser.add_argument("-ntrees", default=500, type=int,
                        help=("Number of trees. Default is as specified in "
                              "the respective dataset configuration"))
    parser.add_argument("-nrows", default=None, type=int,
                        help=(
                            "Subset of rows in the datasets to use. Useful for test running "
                            "benchmarks on small amounts of data. WARNING: Some datasets will "
                            "give incorrect accuracy results if nrows is specified as they have "
                            "predefined train/test splits."))
    parser.add_argument("-cycles", default=1, type=int,
                        help="Number of training cycles for each iteration")
    parser.add_argument("-train_cycles", default=1, type=int,
                        help="Number of training cycles; must match the length of the -ngpus list")
    parser.add_argument("-warmup", action="store_true",
                        help="Whether to run a small benchmark (fraud) as a warmup")
    parser.add_argument("-verbose", action="store_true", help="Produce verbose output")
    parser.add_argument("-extra", default='{}', help="Extra arguments as a python dictionary")
    args = parser.parse_args()
    # Fallback output name (unused while -output has a non-empty default).
    if not args.output:
        args.output = "%s.json" % args.dataset
    return args
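
# The -ngpus string pairs with -train_cycles in main() below: cycle i runs
# with int(ngpus[i]) GPUs, so e.g. -ngpus 1,2,4 pairs with -train_cycles 3
# (example values only).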

def main():
    args = parse_args()
    args.cpus = get_number_processors(args)
    args.extra = ast.literal_eval(args.extra)
    print_sys_info(args)
    if args.warmup:
        benchmark(args, os.path.join(args.root, "fraud"), "fraud")
    if args.dataset == 'all':
        args.dataset = 'airline,bosch,fraud,higgs,year,epsilon,covtype'
    gpu_lis = args.ngpus.split(",")
    if len(gpu_lis) != args.train_cycles:
        print("Please make the -ngpus list length match -train_cycles")
    else:
        for idx in range(args.train_cycles):
            results = {}
            # Each cycle runs the full dataset list with that cycle's GPU count.
            args.gpus = int(gpu_lis[idx])
            for dataset in args.dataset.split(","):
                folder = os.path.join(args.root, dataset)
                results.update({dataset: benchmark(args, folder, dataset)})
                print(json.dumps({dataset: results[dataset]}, indent=2, sort_keys=True))
            output = json.dumps(results, indent=2, sort_keys=True)
            # One output file per cycle: the cycle index is spliced in before
            # the '.json' suffix (assumes args.output ends in '.json').
            out_path = args.output[:-5] + str(idx) + args.output[-5:]
            with open(out_path, "w") as output_file:
                output_file.write(output + "\n")
            print("Results written to file '%s'" % out_path)

if __name__ == '__main__':
    main()
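A usage sketch for the trainer (the dataset list and output name are illustrative):

python trainer.py -root /opt/gbm-datasets -dataset higgs,airline -ngpus 1,2,4 -train_cycles 3 -output results.json

This runs three full cycles over both datasets with 1, 2, and 4 GPUs respectively, writing results0.json, results1.json, and results2.json.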