-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
171 lines (142 loc) · 7.53 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import typing
import pandas as pd
# Supported values for ``performance_type``. They live at module level so that
# compress_performance can also be called after importing this module, not only
# when the file is executed as a script.
RESULT_TYPE_NEGOTIATION = 'negotiation'
RESULT_TYPE_DYNAMIC = 'dynamic'
RESULT_TYPES = [RESULT_TYPE_NEGOTIATION, RESULT_TYPE_DYNAMIC]


def compress_performance(input_file: str, performance_type: str, base_output_directory: str):
    """
    Average a performance CSV per (SERVICES, SERVICE_DATA) pair and export it in wide format.

    The input is grouped twice, once by ``['SERVICES', 'SERVICE_DATA']`` and once by
    ``['SERVICE_DATA', 'SERVICES']``. For each grouping the numeric columns are averaged,
    the MIN/MAX/STD columns are removed, the innermost group key is moved to the columns,
    and the result is written to
    ``{base_output_directory}/performance_{performance_type}_service_first.csv`` and
    ``{base_output_directory}/performance_{performance_type}_servicedata_first.csv``.

    Parameters
    ----------
    input_file
        Path of the CSV file to read; it must contain the columns ``SERVICES`` and ``SERVICE_DATA``.
    performance_type
        One of ``RESULT_TYPES``. With ``'negotiation'`` the columns named exactly
        ``MIN``, ``MAX`` and ``STD`` are dropped; otherwise every column whose name
        contains ``STD``, ``MAX`` or ``MIN`` is dropped.
    base_output_directory
        Directory in which the two output files are written.

    Raises
    ------
    ValueError
        If ``performance_type`` is not one of ``RESULT_TYPES``.
    """
    if performance_type not in RESULT_TYPES:
        raise ValueError(f'performance_type must be one equal to {RESULT_TYPES}, got: {performance_type}')

    df = pd.read_csv(input_file)

    # The columns to drop depend only on the input and on the mode, not on the
    # grouping order, so they are computed once before the loop.
    if performance_type == RESULT_TYPE_NEGOTIATION:
        columns_to_drop = ['MIN', 'MAX', 'STD']
    else:
        columns_to_drop = [col for col in df.columns if 'STD' in col or 'MAX' in col or 'MIN' in col]

    headers_to_group = [
        (['SERVICES', 'SERVICE_DATA'], f'performance_{performance_type}_service_first'),
        (['SERVICE_DATA', 'SERVICES'], f'performance_{performance_type}_servicedata_first'),
    ]

    for header, base_name in headers_to_group:
        grouped = df.groupby(header).mean(numeric_only=True).drop(columns_to_drop, axis='columns')

        # Move the innermost group key from the index to the columns, so it is easier to plot.
        grouped = grouped.unstack(level=-1)

        # The columns are now a two-level index: flatten it into plain tuples such as ('AVG', 10) ...
        grouped.columns = grouped.columns.to_flat_index()

        # ... and join every tuple into a single string such as 'AVG_10'.
        # The remaining group key stays in the index and is exported as the first CSV column.
        grouped = grouped.rename(lambda col: f'{col[0]}_{col[1]}', axis='columns')

        grouped.to_csv(f'{base_output_directory}/{base_name}.csv', index=True)
def compress(input_files: typing.List[str], base_output_directory: str, prefix: str = '',
             columns_to_remove: typing.Optional[typing.List[str]] = None,
             drop_std: bool = False):
    """
    Merge several CSV files, average them per (SERVICES, SERVICE_DATA) pair and export in wide format.

    The input files are stacked one under the other, the unwanted columns are removed,
    and the result is grouped twice (``SERVICES`` first, then ``SERVICE_DATA`` first).
    Each grouping is averaged over its numeric columns, the innermost group key is moved
    to the columns, and the table is written to
    ``{base_output_directory}/{prefix}_service_first.csv`` and
    ``{base_output_directory}/{prefix}_servicedata_first.csv``.

    Parameters
    ----------
    input_files
        Paths of the CSV files to read and concatenate.
    base_output_directory
        Directory in which the two output files are written.
    prefix
        Leading part of the output file names.
    columns_to_remove
        Columns to drop before grouping; ``None`` means no explicit column.
    drop_std
        When true, every column whose name contains ``STD`` is dropped as well.
    """
    # Stack all the inputs into a single table.
    df = pd.concat([pd.read_csv(path) for path in input_files])

    # STD-related columns are simply appended to the explicit removal list.
    if drop_std:
        std_columns = [name for name in df.columns if 'STD' in name]
        if columns_to_remove is None:
            columns_to_remove = std_columns
        else:
            columns_to_remove = columns_to_remove + std_columns

    if columns_to_remove:
        df = df.drop(columns_to_remove, axis='columns')

    # Group-by keys paired with the suffix of the file that receives the result.
    groupings = (
        (['SERVICES', 'SERVICE_DATA'], 'service_first'),
        (['SERVICE_DATA', 'SERVICES'], 'servicedata_first'),
    )

    for keys, suffix in groupings:
        # Average the numeric columns, then pivot the innermost key into the columns.
        wide = df.groupby(keys).mean(numeric_only=True).unstack(level=-1)

        # Every column is now a (col_name, key_value) pair: join it into 'col_name_keyvalue'.
        # The remaining key stays in the index and becomes the first CSV column.
        wide.columns = [f'{name}_{level}' for name, level in wide.columns.to_flat_index()]

        wide.to_csv(f'{base_output_directory}/{prefix}_{suffix}.csv', index=True)
def group_func_group_change(setting: str) -> str:
    """
    Replace the last dot-separated component of a setting name with ``X``.

    The setting must be made of exactly three components separated by dots.

    Raises
    ------
    ValueError
        If the setting does not split into exactly three components.

    Examples
    --------
    >>> group_func_group_change('G2.3.3')
    'G2.3.X'
    """
    pieces = setting.split('.')
    if len(pieces) == 3:
        # keep the first two components untouched and mask the third one.
        head, middle, _ = pieces
        return f'{head}.{middle}.X'
    raise ValueError(f'Split on {setting} failed, got {pieces}')
def group_func_group_basic(setting: str) -> str:
    """
    Mask the first two components of a setting name, keeping only the last one.

    The setting must be made of exactly three components separated by dots;
    the result is always ``GX.X.`` followed by the third component.

    Raises
    ------
    ValueError
        If the setting does not split into exactly three components.

    Examples
    --------
    >>> group_func_group_basic('G2.3.3')
    'GX.X.3'
    """
    pieces = setting.split('.')
    if len(pieces) == 3:
        return f'GX.X.{pieces[2]}'
    raise ValueError(f'Split on {setting} failed, got {pieces}')
def average(input_file: str, grouping_func, output_file: str, drop_std: bool = False):
    """
    Average the rows of a CSV file that map to the same group of settings.

    Parameters
    ----------
    input_file
        Path of the CSV file to read. The setting name of each row is expected in a
        column called ``Setting``, or in an unnamed first column (as produced when a
        DataFrame is exported together with its index).
    grouping_func
        Callable invoked as ``grouping_func(setting=<setting name>)`` that returns the
        name of the group the setting belongs to.
    output_file
        Path of the CSV file to write; its ``Setting`` column holds the group names.
    drop_std
        When true, every column whose name contains ``STD`` is removed from the output.
    """
    # first we read the csv file.
    df = pd.read_csv(input_file)

    # rename the first column which is unnamed because it is an index (when it has been exported)
    df = df.rename({'Unnamed: 0': 'Setting'}, axis='columns')

    # Replace each setting with the name of its group. Only this column changes, so a
    # column-wise map is enough: there is no need to rebuild every row as a new Series.
    df['Setting'] = df['Setting'].map(lambda value: grouping_func(setting=value))

    # now, we just group and we are almost done.
    # NOTE: reset_index moves the group-by index back to the first column
    grouped = df.groupby('Setting').mean().reset_index()

    if drop_std:
        grouped = grouped.drop([col for col in df.columns if 'STD' in col], axis='columns')

    grouped.to_csv(output_file, index=False)
if __name__ == '__main__':
    # Command-line entry point: one sub-command per exported operation.
    import argparse

    RESULT_TYPE_NEGOTIATION = 'negotiation'
    RESULT_TYPE_DYNAMIC = 'dynamic'
    RESULT_TYPES = [RESULT_TYPE_NEGOTIATION, RESULT_TYPE_DYNAMIC]

    def _str_to_bool(value: str) -> bool:
        """Parse a command-line boolean; ``type=bool`` would turn 'False' into True."""
        lowered = value.strip().lower()
        if lowered in ('1', 'true', 't', 'yes', 'y'):
            return True
        if lowered in ('', '0', 'false', 'f', 'no', 'n'):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got: {value!r}')

    parser = argparse.ArgumentParser()
    # A sub-command is mandatory: without it there is no ``func`` to dispatch to.
    sub_parsers = parser.add_subparsers(dest='command', required=True)

    parser_compress_performance = sub_parsers.add_parser('compress-performance')
    parser_compress_performance.add_argument('--base-output-directory', required=True, type=str)
    parser_compress_performance.add_argument('--input-file', required=True, type=str)
    parser_compress_performance.add_argument('--mode', choices=RESULT_TYPES, required=True, type=str)
    parser_compress_performance.set_defaults(func=lambda args_: compress_performance(
        input_file=args_.input_file,
        base_output_directory=args_.base_output_directory,
        performance_type=args_.mode,
    ))

    parser_compress_quality = sub_parsers.add_parser('compress-quality')
    parser_compress_quality.add_argument('--base-output-directory', required=True, type=str)
    # At least one input file is needed, otherwise there is nothing to concatenate.
    parser_compress_quality.add_argument('--input-files', required=True, nargs='+', type=str)
    # Default to the empty string (the default of compress) so that a missing prefix
    # is never formatted as the literal text 'None' in the output file names.
    parser_compress_quality.add_argument('--prefix', required=False, type=str, default='')
    parser_compress_quality.add_argument('--columns-to-remove', nargs='*', required=False, type=str)
    # Accepts both a bare ``--drop-std`` and an explicit ``--drop-std true|false``.
    parser_compress_quality.add_argument('--drop-std', required=False, nargs='?', const=True,
                                         default=False, type=_str_to_bool)
    parser_compress_quality.set_defaults(func=lambda args_: compress(
        input_files=args_.input_files,
        prefix=args_.prefix,
        columns_to_remove=args_.columns_to_remove,
        drop_std=args_.drop_std,
        base_output_directory=args_.base_output_directory,
    ))

    parser_compress_average = sub_parsers.add_parser('compress-average')
    parser_compress_average.add_argument('--input-file', required=True, type=str)
    parser_compress_average.add_argument('--output-file', required=True, type=str)
    parser_compress_average.add_argument('--drop-std', required=False, nargs='?', const=True,
                                         default=False, type=_str_to_bool)
    parser_compress_average.add_argument('--mode', choices=RESULT_TYPES, required=True,
                                         type=str)
    # The mode selects how setting names are collapsed into groups before averaging.
    parser_compress_average.set_defaults(func=lambda args_: average(
        input_file=args_.input_file,
        output_file=args_.output_file,
        drop_std=args_.drop_std,
        grouping_func=group_func_group_change if args_.mode == RESULT_TYPE_NEGOTIATION
        else group_func_group_basic,
    ))

    # Each sub-parser stores its handler in ``func``; call it with the parsed arguments.
    args = parser.parse_args()
    args.func(args)