Show only bergamot supported languages in aggregated section #137

Draft · wants to merge 6 commits into main
50 changes: 36 additions & 14 deletions evals/eval/evaluate.py
@@ -1,17 +1,18 @@
import os
import re
import shutil
import subprocess
import os
from collections import defaultdict
import statistics
import subprocess
import traceback
from sacrebleu import dataset
import click
from toolz import groupby
from collections import defaultdict
from glob import glob
from os.path import exists

import click
import pandas as pd
from mtdata import iso
from os.path import exists
from sacrebleu import dataset
from toolz import groupby

EVALUATION_LANGUAGES = [
"bg",
@@ -686,13 +687,22 @@ def evaluate(pair, set_name, translator, evaluation_engine, gpus, models_dir, re
print("Attempt failed, retrying")


def is_supported(translator, source, target):
return (
translator in SUPPORTED_LANGUAGES
and source in SUPPORTED_LANGUAGES[translator]
and target in SUPPORTED_LANGUAGES[translator][source]
)


def run_dir(
lang_pairs, skip_existing, translators, evaluation_engines, gpus, results_dir, models_dir
):
reordered = sorted(translators.split(","), key=lambda x: TRANS_ORDER[x])

for evaluation_engine in evaluation_engines.split(","):
for pair in lang_pairs:
source, target = pair
if "nn" in pair:
print(
"There are no evaluation datasets for Norwegian Nynorsk "
@@ -702,9 +712,8 @@ def run_dir(

for dataset_name in find_datasets(pair):
for translator in reordered:
if translator in SUPPORTED_LANGUAGES and pair[1] not in SUPPORTED_LANGUAGES[
translator
].get(pair[0], {}):
if not is_supported(translator, source, target):
print(f"Language pair {source}-{target} is not supported for {translator}")
continue

print(
@@ -763,12 +772,15 @@ def run_comet_compare(lang_pairs, skip_existing, translators, gpus, models_dir,
and os.path.isfile(output_filename)
and os.stat(output_filename).st_size > 0
):
print(f"Comparison exists. Skipping...")
print("Comparison exists. Skipping...")
continue

source_dataset = f"{dataset_name}.{source}"
targets = ""
for translator in translators.split(","):
if not is_supported(translator, source, target):
print(f"Language pair {source}-{target} is not supported for {translator}")
continue
targets += f"{dataset_name}.{translator}.{target} "
command = ""
if dataset_name in CUSTOM_DATASETS:
@@ -801,7 +813,8 @@ def build_report(res_dir, evaluation_engines):
lines = [l.strip() for l in f.readlines()]

avg_results = get_avg_scores(results)
build_section(avg_results, "avg", lines, res_dir, evaluation_engine)
# show only bergamot supported languages in the aggregated section
build_section(avg_results, "avg", lines, res_dir, evaluation_engine, require_bergamot=True)

for lang_pair, datasets in results.items():
build_section(datasets, lang_pair, lines, res_dir, evaluation_engine)
@@ -812,7 +825,14 @@ def build_report(res_dir, evaluation_engines):
print(f"Results are written to {results_path}")


def build_section(datasets, key, lines, res_dir, evaluation_engine):
def build_section(datasets, key, lines, res_dir, evaluation_engine, require_bergamot=False):
if require_bergamot:
datasets = {
dataset_name: translators
for dataset_name, translators in datasets.items()
if "bergamot" in translators
}

lines.append(f"\n## {key}\n")
lines.append(f'| Translator/Dataset | {" | ".join(datasets.keys())} |')
lines.append(f"| {' | '.join(['---' for _ in range(len(datasets) + 1)])} |")
@@ -822,8 +842,10 @@ def build_section(datasets, key, lines, res_dir, evaluation_engine):
comet_comparisons = defaultdict(dict)
for dataset_name, translators in datasets.items():
bergamot_res = translators.get("bergamot")
reordered = sorted(translators.items(), key=lambda x: TRANS_ORDER[x[0]])
if require_bergamot and bergamot_res is None:
continue

reordered = sorted(translators.items(), key=lambda x: TRANS_ORDER[x[0]])
for translator, score in reordered:
if score == 0:
formatted_score = "N/A"
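
A minimal sketch (not part of the diff) of how the two additions behave, using hypothetical dataset names and scores: `is_supported` gates each translator against the nested `SUPPORTED_LANGUAGES` mapping, and the `require_bergamot` flag makes `build_section` drop datasets without a bergamot result from the aggregated "avg" section.

```python
# Hypothetical SUPPORTED_LANGUAGES contents and toy scores, for illustration only.
SUPPORTED_LANGUAGES = {
    "bergamot": {"en": {"de", "fr"}},
    "google": {"en": {"de", "fr", "fi"}},
}


def is_supported(translator, source, target):
    # Same shape as the helper added in evaluate.py.
    return (
        translator in SUPPORTED_LANGUAGES
        and source in SUPPORTED_LANGUAGES[translator]
        and target in SUPPORTED_LANGUAGES[translator][source]
    )


print(is_supported("bergamot", "en", "fi"))  # False -> en-fi is skipped for bergamot

# require_bergamot filtering, as applied in build_section for the "avg" section.
datasets = {
    "wmt22": {"bergamot": 0.82, "google": 0.88},  # kept: has a bergamot score
    "flores": {"google": 0.90},                   # dropped: no bergamot result
}
require_bergamot = True
if require_bergamot:
    datasets = {
        dataset_name: translators
        for dataset_name, translators in datasets.items()
        if "bergamot" in translators
    }
print(list(datasets))  # ['wmt22']
```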