forked from IntelPython/scikit-learn_bench
-
Notifications
You must be signed in to change notification settings - Fork 0
/
df_clsf.py
130 lines (104 loc) · 5.04 KB
/
df_clsf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# ===============================================================================
# Copyright 2020-2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===============================================================================
import argparse
import bench
import numpy as np
from daal4py import (decision_forest_classification_prediction,
decision_forest_classification_training, engines_mt2203)
from daal4py.sklearn._utils import getFPType
from sklearn.metrics import accuracy_score
def df_clsf_fit(X, y, n_classes, n_trees=100, seed=12345,
                n_features_per_node=0, max_depth=0, min_impurity=0,
                bootstrap=True, verbose=False):
    """Train a daal4py decision forest classifier on ``(X, y)``.

    Parameters
    ----------
    X : training data; ``X.shape[1]`` is read as the feature count.
    y : training labels, handed to ``compute`` unchanged.
    n_classes : forwarded as ``nClasses``.
    n_trees : forwarded as ``nTrees``.
    seed : seed of the ``engines_mt2203`` engine given to the trainer.
    n_features_per_node : requested ``featuresPerNode``; used only when it
        is positive and strictly smaller than the number of columns of
        ``X``, otherwise every column is used.
    max_depth : forwarded as ``maxTreeDepth``.
    min_impurity : forwarded as ``impurityThreshold``.
    bootstrap : forwarded as ``bootstrap``.
    verbose : accepted for interface compatibility; not used here.

    Returns
    -------
    The result object produced by the training algorithm's ``compute``.
    """
    fp_type = getFPType(X)

    total_features = X.shape[1]
    if 0 < n_features_per_node < total_features:
        split_features = n_features_per_node
    else:
        # Non-positive or too-large requests fall back to all columns.
        split_features = total_features

    # Collect the trainer configuration in one place; the engine is built
    # with the same floating-point type as the trainer itself.
    training_options = dict(
        nClasses=n_classes,
        fptype=fp_type,
        method='defaultDense',
        nTrees=n_trees,
        observationsPerTreeFraction=1.,
        featuresPerNode=split_features,
        maxTreeDepth=max_depth,
        minObservationsInLeafNode=1,
        engine=engines_mt2203(seed=seed, fptype=fp_type),
        impurityThreshold=min_impurity,
        varImportance='MDI',
        resultsToCompute='',
        memorySavingMode=False,
        bootstrap=bootstrap,
    )
    trainer = decision_forest_classification_training(**training_options)
    return trainer.compute(X, y)
def df_clsf_predict(X, training_result, n_classes, verbose=False):
    """Predict labels for ``X`` with a trained decision forest.

    Parameters
    ----------
    X : data to classify, handed to ``compute`` unchanged.
    training_result : object exposing the trained forest as ``.model``.
    n_classes : forwarded as ``nClasses``.
    verbose : accepted for interface compatibility; not used here.

    Returns
    -------
    The ``prediction`` attribute of the prediction algorithm's result.
    """
    predictor = decision_forest_classification_prediction(
        nClasses=n_classes,
        # we give float here specifically to match sklearn
        fptype='float',
    )
    forest_model = training_result.model
    return predictor.compute(X, forest_model).prediction
if __name__ == '__main__':
    # Command-line entry point: parse options, load the data set, time the
    # fit and predict stages, and report timings plus accuracy via `bench`.
    parser = argparse.ArgumentParser(description='daal4py random forest '
                                     'classification benchmark')
    # NOTE: `choices` must be a real tuple. The former `('gini')` was just
    # the string 'gini', so argparse did a substring test and accepted
    # values such as 'g' or 'in'.
    parser.add_argument('--criterion', type=str, default='gini',
                        choices=('gini',),
                        help='The function to measure the quality of a split')
    parser.add_argument('--num-trees', type=int, default=100,
                        help='Number of trees in the forest')
    parser.add_argument('--max-features', type=bench.float_or_int, default=0,
                        help='Upper bound on features used at each split')
    parser.add_argument('--max-depth', type=int, default=0,
                        help='Upper bound on depth of constructed trees')
    # NOTE(review): --criterion, --min-samples-split and --max-leaf-nodes are
    # parsed but never passed to df_clsf_fit below.
    parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2,
                        help='Minimum samples number for node splitting')
    parser.add_argument('--max-leaf-nodes', type=int, default=None,
                        help='Maximum leaf nodes per tree')
    parser.add_argument('--min-impurity-decrease', type=float, default=0.,
                        help='Needed impurity decrease for node splitting')
    parser.add_argument('--no-bootstrap', dest='bootstrap', default=True,
                        action='store_false',
                        help="Don't control bootstraping")
    params = bench.parse_args(parser, prefix='daal4py')

    # Load data
    X_train, X_test, y_train, y_test = bench.load_data(
        params, add_dtype=True, label_2d=True)

    # The number of classes is taken from the distinct training labels.
    params.n_classes = len(np.unique(y_train))

    # A float --max-features is treated as a fraction of the column count
    # and converted to an absolute number of features (truncated).
    if isinstance(params.max_features, float):
        params.max_features = int(X_train.shape[1] * params.max_features)

    # Time fit and predict
    fit_time, res = bench.measure_function_time(
        df_clsf_fit, X_train, y_train,
        params.n_classes,
        n_trees=params.num_trees,
        n_features_per_node=params.max_features,
        max_depth=params.max_depth,
        min_impurity=params.min_impurity_decrease,
        bootstrap=params.bootstrap,
        seed=params.seed,
        params=params)

    # Training accuracy comes from an untimed prediction on the training
    # set. accuracy_score takes (y_true, y_pred); the score is the same in
    # either order, but the documented order is used here.
    yp = df_clsf_predict(X_train, res, params.n_classes)
    train_acc = 100 * accuracy_score(y_train, yp)

    predict_time, yp = bench.measure_function_time(
        df_clsf_predict, X_test, res, params.n_classes, params=params)
    test_acc = 100 * accuracy_score(y_test, yp)

    bench.print_output(library='daal4py', algorithm='decision_forest_classification',
                       stages=['training', 'prediction'], params=params,
                       functions=['df_clsf.fit', 'df_clsf.predict'],
                       times=[fit_time, predict_time], metric_type='accuracy[%]',
                       metrics=[train_acc, test_acc], data=[X_train, X_test])