# 3_plot_results.py (from a fork of salvacarrion/autonmt)
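"""Plot the average number of unknown tokens (read from each dataset's
stats.json) across subword models and vocabulary sizes, using autonmt's
multivariable report utilities."""
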
import pandas as pd
from autonmt.bundle import utils
from autonmt.bundle.report import generate_multivariable_report
from autonmt.preprocessing import DatasetBuilder
from autonmt.preprocessing.processors import preprocess_pairs, preprocess_lines, normalize_lines

# Preprocess functions
normalize_fn = lambda x: normalize_lines(x)
preprocess_raw_fn = lambda x, y: preprocess_pairs(x, y, normalize_fn=normalize_fn, min_len=1, max_len=None, remove_duplicates=True, shuffle_lines=True)
preprocess_splits_fn = lambda x, y: preprocess_pairs(x, y, normalize_fn=normalize_fn)
preprocess_predict_fn = lambda x: preprocess_lines(x, normalize_fn=normalize_fn)
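# Note: preprocess_predict_fn is kept for inference-time preprocessing; it is
# not used anywhere in this plotting script.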


def main():
    # Create preprocessing for training
    builder = DatasetBuilder(
        # Root folder for datasets
        base_path="datasets/translate",

        # Set of datasets, languages, training sizes to try
        datasets=[
            {"name": "europarl", "languages": ["es-en", "fr-en", "de-en"], "sizes": [("original", None), ("100k", 100000)]},
            {"name": "scielo/health", "languages": ["es-en"], "sizes": [("100k", 100000)], "split_sizes": (None, 1000, 1000)},
        ],

        # Set of subword models and vocab sizes to try
        encoding=[
            {"subword_models": ["bpe", "unigram+bytes"], "vocab_sizes": [8000, 16000, 32000]},
            {"subword_models": ["bytes", "char", "char+bytes"], "vocab_sizes": [1000]},
        ],

        # Preprocessing functions
        preprocess_raw_fn=preprocess_raw_fn,
        preprocess_splits_fn=preprocess_splits_fn,

        # Additional args
        merge_vocabs=False,
    ).build(make_plots=False, force_overwrite=False)
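
    # build() runs the preprocessing pipeline for every dataset/encoding
    # combination; as the flag name suggests, force_overwrite=False reuses
    # artifacts already on disk rather than regenerating them.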

    # Get the preprocessed train/test datasets (only the train side is used below)
    tr_datasets = builder.get_train_ds()
    ts_datasets = builder.get_test_ds()
# Train & Score a model for each dataset
stats = []
for ds in tr_datasets:
# Get ds stats
ds_stats = utils.load_json(ds.get_stats_path("stats.json"))
# Add stats
ds_stats["scores"] = {}
row = {
"subword_model": ds.subword_model,
"vocab_size": ds.vocab_size,
"unknown_avg_tokens": ds_stats["val.en"]["unknown_avg_tokens"],
}
stats.append(row)

    # Build the report dataframe
    df_report = pd.DataFrame(stats)
    df_report["dataset"] = [f"{ds.dataset_name}-{ds.dataset_size_name}".replace("_lc", "").title() for ds in tr_datasets]
    df_report["vocab_size"] = df_report["vocab_size"].astype(int)

    # Generate the report (figures + CSV)
    output_path = ".outputs/myplots"
    prefix = "unknowns_"
    generate_multivariable_report(data=df_report,
                                  x="vocab_size",
                                  y_left=("unknown_avg_tokens", "subword_model"), y_right=None,
                                  output_path=output_path, prefix=prefix,
                                  save_figures=True, show_figures=False, save_csv=True)

    # Print a summary table
    print("Summary:")
    print(df_report.to_string(index=False))


if __name__ == "__main__":
    main()
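
# Running this script (e.g. `python 3_plot_results.py`) writes the figures and
# a CSV under .outputs/myplots with the "unknowns_" prefix, and prints the
# summary table to stdout.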