From 766d8fb0066dd67fb10980d3df3b38d7fd7522f8 Mon Sep 17 00:00:00 2001 From: ashutoshml Date: Sun, 26 Dec 2021 21:53:03 +0530 Subject: [PATCH 1/8] Stage 1: Standardize BLEU and CHRF --- tests/text/test_bleu.py | 54 ++-- tests/text/test_chrf.py | 46 ++-- torchmetrics/functional/text/bleu.py | 92 +++---- torchmetrics/functional/text/chrf.py | 354 +++++++++++++-------------- torchmetrics/text/bleu.py | 32 +-- torchmetrics/text/chrf.py | 33 +-- 6 files changed, 306 insertions(+), 305 deletions(-) diff --git a/tests/text/test_bleu.py b/tests/text/test_bleu.py index 1866094fb6f..01713aeacd3 100644 --- a/tests/text/test_bleu.py +++ b/tests/text/test_bleu.py @@ -27,12 +27,12 @@ smooth_func = SmoothingFunction().method2 -def _compute_bleu_metric_nltk(list_of_references, hypotheses, weights, smoothing_function, **kwargs): - hypotheses_ = [hypothesis.split() for hypothesis in hypotheses] - list_of_references_ = [[line.split() for line in ref] for ref in list_of_references] +def _compute_bleu_metric_nltk(predictions, list_of_targets, weights, smoothing_function, **kwargs): + predictions_ = [prediction.split() for prediction in predictions] + list_of_targets_ = [[line.split() for line in target] for target in list_of_targets] return corpus_bleu( - list_of_references=list_of_references_, - hypotheses=hypotheses_, + list_of_references=list_of_targets_, + hypotheses=predictions_, weights=weights, smoothing_function=smoothing_function, **kwargs @@ -49,74 +49,74 @@ def _compute_bleu_metric_nltk(list_of_references, hypotheses, weights, smoothing ], ) @pytest.mark.parametrize( - ["preds", "targets"], + ["predictions", "targets"], [(_inputs_multiple_references.preds, _inputs_multiple_references.targets)], ) class TestBLEUScore(TextTester): @pytest.mark.parametrize("ddp", [False, True]) @pytest.mark.parametrize("dist_sync_on_step", [False, True]) - def test_bleu_score_class(self, ddp, dist_sync_on_step, preds, targets, weights, n_gram, smooth_func, smooth): + def test_bleu_score_class(self, ddp, dist_sync_on_step, predictions, targets, weights, n_gram, smooth_func, smooth): metric_args = {"n_gram": n_gram, "smooth": smooth} compute_bleu_metric_nltk = partial(_compute_bleu_metric_nltk, weights=weights, smoothing_function=smooth_func) self.run_class_metric_test( ddp=ddp, - preds=preds, + preds=predictions, targets=targets, metric_class=BLEUScore, sk_metric=compute_bleu_metric_nltk, dist_sync_on_step=dist_sync_on_step, metric_args=metric_args, - input_order=INPUT_ORDER.TARGETS_FIRST, + input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_bleu_score_functional(self, preds, targets, weights, n_gram, smooth_func, smooth): + def test_bleu_score_functional(self, predictions, targets, weights, n_gram, smooth_func, smooth): metric_args = {"n_gram": n_gram, "smooth": smooth} compute_bleu_metric_nltk = partial(_compute_bleu_metric_nltk, weights=weights, smoothing_function=smooth_func) self.run_functional_metric_test( - preds, + predictions, targets, metric_functional=bleu_score, sk_metric=compute_bleu_metric_nltk, metric_args=metric_args, - input_order=INPUT_ORDER.TARGETS_FIRST, + input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_bleu_score_differentiability(self, preds, targets, weights, n_gram, smooth_func, smooth): + def test_bleu_score_differentiability(self, predictions, targets, weights, n_gram, smooth_func, smooth): metric_args = {"n_gram": n_gram, "smooth": smooth} self.run_differentiability_test( - preds=preds, + preds=predictions, targets=targets, metric_module=BLEUScore, metric_functional=bleu_score, 
metric_args=metric_args, - input_order=INPUT_ORDER.TARGETS_FIRST, + input_order=INPUT_ORDER.PREDS_FIRST, ) def test_bleu_empty_functional(): - hyp = [[]] - ref = [[[]]] - assert bleu_score(ref, hyp) == tensor(0.0) + prediction = [[]] + target = [[[]]] + assert bleu_score(prediction, target) == tensor(0.0) def test_no_4_gram_functional(): - hyps = ["My full pytorch-lightning"] - refs = [["My full pytorch-lightning test", "Completely Different"]] - assert bleu_score(refs, hyps) == tensor(0.0) + predictions = ["My full pytorch-lightning"] + targets = [["My full pytorch-lightning test", "Completely Different"]] + assert bleu_score(predictions, targets) == tensor(0.0) def test_bleu_empty_class(): bleu = BLEUScore() - hyp = [[]] - ref = [[[]]] - assert bleu(ref, hyp) == tensor(0.0) + prediction = [[]] + target = [[[]]] + assert bleu(prediction, target) == tensor(0.0) def test_no_4_gram_class(): bleu = BLEUScore() - hyps = ["My full pytorch-lightning"] - refs = [["My full pytorch-lightning test", "Completely Different"]] - assert bleu(refs, hyps) == tensor(0.0) + predictions = ["My full pytorch-lightning"] + targets = [["My full pytorch-lightning test", "Completely Different"]] + assert bleu(predictions, targets) == tensor(0.0) diff --git a/tests/text/test_chrf.py b/tests/text/test_chrf.py index 76743b6ecd4..09068d7e733 100644 --- a/tests/text/test_chrf.py +++ b/tests/text/test_chrf.py @@ -15,8 +15,8 @@ def sacrebleu_chrf_fn( - targets: Sequence[Sequence[str]], preds: Sequence[str], + targets: Sequence[Sequence[str]], char_order: int, word_order: int, lowercase: bool, @@ -43,7 +43,7 @@ def sacrebleu_chrf_fn( ], ) @pytest.mark.parametrize( - ["preds", "targets"], + ["predictions", "targets"], [(_inputs_multiple_references.preds, _inputs_multiple_references.targets)], ) @pytest.mark.skipif(not _SACREBLEU_AVAILABLE, reason="test requires sacrebleu") @@ -51,7 +51,7 @@ class TestCHRFScore(TextTester): @pytest.mark.parametrize("ddp", [False, True]) @pytest.mark.parametrize("dist_sync_on_step", [False, True]) def test_chrf_score_class( - self, ddp, dist_sync_on_step, preds, targets, char_order, word_order, lowercase, whitespace + self, ddp, dist_sync_on_step, predictions, targets, char_order, word_order, lowercase, whitespace ): metric_args = { "n_char_order": char_order, @@ -65,16 +65,16 @@ def test_chrf_score_class( self.run_class_metric_test( ddp=ddp, - preds=preds, + preds=predictions, targets=targets, metric_class=CHRFScore, sk_metric=nltk_metric, dist_sync_on_step=dist_sync_on_step, metric_args=metric_args, - input_order=INPUT_ORDER.TARGETS_FIRST, + input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_chrf_score_functional(self, preds, targets, char_order, word_order, lowercase, whitespace): + def test_chrf_score_functional(self, predictions, targets, char_order, word_order, lowercase, whitespace): metric_args = { "n_char_order": char_order, "n_word_order": word_order, @@ -86,15 +86,15 @@ def test_chrf_score_functional(self, preds, targets, char_order, word_order, low ) self.run_functional_metric_test( - preds, + predictions, targets, metric_functional=chrf_score, sk_metric=nltk_metric, metric_args=metric_args, - input_order=INPUT_ORDER.TARGETS_FIRST, + input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_chrf_score_differentiability(self, preds, targets, char_order, word_order, lowercase, whitespace): + def test_chrf_score_differentiability(self, predictions, targets, char_order, word_order, lowercase, whitespace): metric_args = { "n_char_order": char_order, "n_word_order": word_order, @@ -103,38 
+103,38 @@ def test_chrf_score_differentiability(self, preds, targets, char_order, word_ord } self.run_differentiability_test( - preds=preds, + preds=predictions, targets=targets, metric_module=CHRFScore, metric_functional=chrf_score, metric_args=metric_args, - input_order=INPUT_ORDER.TARGETS_FIRST, + input_order=INPUT_ORDER.PREDS_FIRST, ) def test_chrf_empty_functional(): - hyp = [] - ref = [[]] - assert chrf_score(ref, hyp) == tensor(0.0) + prediction = [] + target = [[]] + assert chrf_score(prediction, target) == tensor(0.0) def test_chrf_empty_class(): chrf = CHRFScore() - hyp = [] - ref = [[]] - assert chrf(ref, hyp) == tensor(0.0) + prediction = [] + target = [[]] + assert chrf(prediction, target) == tensor(0.0) def test_chrf_return_sentence_level_score_functional(): - hyp = _inputs_single_sentence_multiple_references.preds - ref = _inputs_single_sentence_multiple_references.targets - _, chrf_sentence_score = chrf_score(ref, hyp, return_sentence_level_score=True) + prediction = _inputs_single_sentence_multiple_references.preds + target = _inputs_single_sentence_multiple_references.targets + _, chrf_sentence_score = chrf_score(prediction, target, return_sentence_level_score=True) isinstance(chrf_sentence_score, Tensor) def test_chrf_return_sentence_level_class(): chrf = CHRFScore(return_sentence_level_score=True) - hyp = _inputs_single_sentence_multiple_references.preds - ref = _inputs_single_sentence_multiple_references.targets - _, chrf_sentence_score = chrf(ref, hyp) + prediction = _inputs_single_sentence_multiple_references.preds + target = _inputs_single_sentence_multiple_references.targets + _, chrf_sentence_score = chrf(prediction, target) isinstance(chrf_sentence_score, Tensor) diff --git a/torchmetrics/functional/text/bleu.py b/torchmetrics/functional/text/bleu.py index 81b6e100e9e..06281d9f009 100644 --- a/torchmetrics/functional/text/bleu.py +++ b/torchmetrics/functional/text/bleu.py @@ -57,62 +57,62 @@ def _tokenize_fn(sentence: str) -> Sequence[str]: def _bleu_score_update( - reference_corpus: Sequence[Sequence[str]], - translate_corpus: Sequence[str], + prediction_corpus: Sequence[str], + target_corpus: Sequence[Sequence[str]], numerator: Tensor, denominator: Tensor, - trans_len: Tensor, - ref_len: Tensor, + prediction_len: Tensor, + target_len: Tensor, n_gram: int = 4, tokenizer: Callable[[str], Sequence[str]] = _tokenize_fn, ) -> Tuple[Tensor, Tensor]: """Updates and returns variables required to compute the BLEU score. 
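# A hedged sketch of the cross-check the updated tests above perform, using the
# example pair from this patch's doctests: torchmetrics' bleu_score now takes
# predictions first, while NLTK's corpus_bleu keeps its references-first keywords.
from nltk.translate.bleu_score import corpus_bleu
from torchmetrics.functional import bleu_score

predictions = ["the cat is on the mat"]
targets = [["there is a cat on the mat", "a cat is on the mat"]]

tm_score = bleu_score(predictions, targets)  # preds-first (new order)
nltk_score = corpus_bleu(
    list_of_references=[[t.split() for t in target] for target in targets],
    hypotheses=[p.split() for p in predictions],
)
assert abs(tm_score.item() - nltk_score) < 1e-4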
Args: - reference_corpus: An iterable of iterables of reference corpus - translate_corpus: An iterable of machine translated corpus + target_corpus: An iterable of iterables of reference corpus + prediction_corpus: An iterable of machine translated corpus numerator: Numerator of precision score (true positives) denominator: Denominator of precision score (true positives + false positives) - trans_len: count of words in a candidate translation - ref_len: count of words in a reference translation + prediction_len: count of words in a candidate prediction + target_len: count of words in a reference translation n_gram: gram value ranged 1 to 4 tokenizer: A function that turns sentence into list of words """ - reference_corpus_: Sequence[Sequence[Sequence[str]]] = [ - [tokenizer(line) if line else [] for line in reference] for reference in reference_corpus + target_corpus_: Sequence[Sequence[Sequence[str]]] = [ + [tokenizer(line) if line else [] for line in target] for target in target_corpus ] - translate_corpus_: Sequence[Sequence[str]] = [tokenizer(line) if line else [] for line in translate_corpus] + prediction_corpus_: Sequence[Sequence[str]] = [tokenizer(line) if line else [] for line in prediction_corpus] - for (translation, references) in zip(translate_corpus_, reference_corpus_): - trans_len += len(translation) - ref_len_list = [len(ref) for ref in references] - ref_len_diff = [abs(len(translation) - x) for x in ref_len_list] - ref_len += ref_len_list[ref_len_diff.index(min(ref_len_diff))] - translation_counter: Counter = _count_ngram(translation, n_gram) - reference_counter: Counter = Counter() + for (prediction, targets) in zip(prediction_corpus_, target_corpus_): + prediction_len += len(prediction) + target_len_list = [len(ref) for ref in targets] + target_len_diff = [abs(len(prediction) - x) for x in target_len_list] + target_len += target_len_list[target_len_diff.index(min(target_len_diff))] + prediction_counter: Counter = _count_ngram(prediction, n_gram) + target_counter: Counter = Counter() - for ref in references: - reference_counter |= _count_ngram(ref, n_gram) + for ref in targets: + target_counter |= _count_ngram(ref, n_gram) - ngram_counter_clip = translation_counter & reference_counter + ngram_counter_clip = prediction_counter & target_counter for counter_clip in ngram_counter_clip: numerator[len(counter_clip) - 1] += ngram_counter_clip[counter_clip] - for counter in translation_counter: - denominator[len(counter) - 1] += translation_counter[counter] + for counter in prediction_counter: + denominator[len(counter) - 1] += prediction_counter[counter] - return trans_len, ref_len + return prediction_len, target_len def _bleu_score_compute( - trans_len: Tensor, ref_len: Tensor, numerator: Tensor, denominator: Tensor, n_gram: int = 4, smooth: bool = False + prediction_len: Tensor, target_len: Tensor, numerator: Tensor, denominator: Tensor, n_gram: int = 4, smooth: bool = False ) -> Tensor: """Computes the BLEU score. 
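# A self-contained sketch of the clipped n-gram counting done in the renamed
# _bleu_score_update above.  count_ngrams is a simplified stand-in for the
# module's _count_ngram helper (not shown in this hunk); the Counter union (|)
# over targets and intersection (&) with the prediction mirror the clipping.
from collections import Counter

def count_ngrams(tokens, n_gram):
    counter = Counter()
    for n in range(1, n_gram + 1):
        for start in range(len(tokens) - n + 1):
            counter[tuple(tokens[start:start + n])] += 1
    return counter

prediction = "the cat is on the mat".split()
targets = [t.split() for t in ["there is a cat on the mat", "a cat is on the mat"]]

prediction_counter = count_ngrams(prediction, n_gram=4)
target_counter = Counter()
for target in targets:  # keep the highest count seen in any target
    target_counter |= count_ngrams(target, n_gram=4)

clipped = prediction_counter & target_counter  # true positives per n-gram
numerator = [0] * 4
denominator = [0] * 4
for ngram, count in clipped.items():
    numerator[len(ngram) - 1] += count
for ngram, count in prediction_counter.items():
    denominator[len(ngram) - 1] += count
# per-order precisions numerator/denominator -> [5/6, 4/5, 3/4, 2/3] for this pair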
Args: - trans_len: count of words in a candidate translation - ref_len: count of words in a reference translation + prediction_len: count of words in a candidate prediction + target_len: count of words in a reference translation numerator: Numerator of precision score (true positives) denominator: Denominator of precision score (true positives + false positives) n_gram: gram value ranged 1 to 4 @@ -133,25 +133,25 @@ def _bleu_score_compute( log_precision_scores = tensor([1.0 / n_gram] * n_gram, device=device) * torch.log(precision_scores) geometric_mean = torch.exp(torch.sum(log_precision_scores)) - brevity_penalty = tensor(1.0, device=device) if trans_len > ref_len else torch.exp(1 - (ref_len / trans_len)) + brevity_penalty = tensor(1.0, device=device) if prediction_len > target_len else torch.exp(1 - (target_len / prediction_len)) bleu = brevity_penalty * geometric_mean return bleu def bleu_score( - reference_corpus: Sequence[Union[str, Sequence[str]]], - translate_corpus: Union[str, Sequence[str]], + prediction_corpus: Union[str, Sequence[str]], + target_corpus: Sequence[Union[str, Sequence[str]]], n_gram: int = 4, smooth: bool = False, ) -> Tensor: """Calculate `BLEU score`_ of machine translated text with one or more references. Args: - reference_corpus: - An iterable of iterables of reference corpus - translate_corpus: + prediction_corpus: An iterable of machine translated corpus + target_corpus: + An iterable of iterables of reference corpus n_gram: Gram value ranged from 1 to 4 (Default 4) smooth: @@ -162,9 +162,9 @@ def bleu_score( Example: >>> from torchmetrics.functional import bleu_score - >>> translate_corpus = ['the cat is on the mat'] - >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] - >>> bleu_score(reference_corpus, translate_corpus) + >>> prediction_corpus = ['the cat is on the mat'] + >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> bleu_score(prediction_corpus, target_corpus) tensor(0.7598) References: @@ -174,21 +174,21 @@ def bleu_score( [2] Automatic Evaluation of Machine Translation Quality Using Longest Common Subsequence and Skip-Bigram Statistics by Chin-Yew Lin and Franz Josef Och `Machine Translation Evolution`_ """ - translate_corpus_ = [translate_corpus] if isinstance(translate_corpus, str) else translate_corpus - reference_corpus_ = [ - [reference_text] if isinstance(reference_text, str) else reference_text for reference_text in reference_corpus + prediction_corpus_ = [prediction_corpus] if isinstance(prediction_corpus, str) else prediction_corpus + target_corpus_ = [ + [target_text] if isinstance(target_text, str) else target_text for target_text in target_corpus ] - if len(translate_corpus_) != len(reference_corpus_): - raise ValueError(f"Corpus has different size {len(translate_corpus_)} != {len(reference_corpus_)}") + if len(prediction_corpus_) != len(target_corpus_): + raise ValueError(f"Corpus has different size {len(prediction_corpus_)} != {len(target_corpus_)}") numerator = torch.zeros(n_gram) denominator = torch.zeros(n_gram) - trans_len = tensor(0, dtype=torch.float) - ref_len = tensor(0, dtype=torch.float) + prediction_len = tensor(0, dtype=torch.float) + target_len = tensor(0, dtype=torch.float) - trans_len, ref_len = _bleu_score_update( - reference_corpus_, translate_corpus_, numerator, denominator, trans_len, ref_len, n_gram, _tokenize_fn + prediction_len, target_len = _bleu_score_update( + prediction_corpus_, target_corpus_, numerator, denominator, prediction_len, target_len, 
n_gram, _tokenize_fn ) - return _bleu_score_compute(trans_len, ref_len, numerator, denominator, n_gram, smooth) + return _bleu_score_compute(prediction_len, target_len, numerator, denominator, n_gram, smooth) diff --git a/torchmetrics/functional/text/chrf.py b/torchmetrics/functional/text/chrf.py index fd646742f27..6b701ebaf5c 100644 --- a/torchmetrics/functional/text/chrf.py +++ b/torchmetrics/functional/text/chrf.py @@ -51,7 +51,7 @@ def _prepare_n_grams_dicts( ) -> Tuple[ Dict[int, Tensor], Dict[int, Tensor], Dict[int, Tensor], Dict[int, Tensor], Dict[int, Tensor], Dict[int, Tensor] ]: - """Prepare dictionaries dictionaries with default zero values for total reference, hypothesis and matching + """Prepare dictionaries dictionaries with default zero values for total target, prediction and matching character and word n-grams. Args: @@ -61,21 +61,21 @@ def _prepare_n_grams_dicts( A word n-gram order. Return: - Dictionaries with default zero values for total reference, hypothesis and matching character and word + Dictionaries with default zero values for total target, prediction and matching character and word n-grams. """ - total_ref_char_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_char_order)} - total_ref_word_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_word_order)} - total_hyp_char_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_char_order)} - total_hyp_word_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_word_order)} + total_target_char_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_char_order)} + total_target_word_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_word_order)} + total_prediction_char_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_char_order)} + total_prediction_word_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_word_order)} total_matching_char_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_char_order)} total_matching_word_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_word_order)} return ( - total_ref_char_n_grams, - total_ref_word_n_grams, - total_hyp_char_n_grams, - total_hyp_word_n_grams, + total_prediction_char_n_grams, + total_prediction_word_n_grams, + total_target_char_n_grams, + total_target_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, ) @@ -209,23 +209,23 @@ def _get_total_ngrams(n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]]) def _get_ngram_matches( - ref_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], - hyp_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], + prediction_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], + target_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], ) -> Dict[int, Tensor]: - """Get a number of n-gram matches between reference and hypothesis n-grams. + """Get a number of n-gram matches between target and prediction n-grams. 
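# Usage of the renamed interfaces in this patch, with predictions passed first;
# the expected values come from the doctests in the hunks above and below
# (BLEU 0.7598, chrF 0.8640 for this example pair).
from torchmetrics import BLEUScore
from torchmetrics.functional import bleu_score, chrf_score

prediction_corpus = ["the cat is on the mat"]
target_corpus = [["there is a cat on the mat", "a cat is on the mat"]]

bleu_score(prediction_corpus, target_corpus)   # tensor(0.7598)
chrf_score(prediction_corpus, target_corpus)   # tensor(0.8640)
BLEUScore()(prediction_corpus, target_corpus)  # tensor(0.7598)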
Args: - ref_n_grams_counts: - ref_n_grams_counts: + target_n_grams_counts: + prediction_n_grams_counts: Return: """ matching_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) - for n in hyp_n_grams_counts: + for n in prediction_n_grams_counts: matching_n_grams[n] = tensor( sum( - torch.min(ref_n_grams_counts[n][n_gram], hyp_n_grams_counts[n][n_gram]) - for n_gram in hyp_n_grams_counts[n] + torch.min(target_n_grams_counts[n][n_gram], prediction_n_grams_counts[n][n_gram]) + for n_gram in prediction_n_grams_counts[n] ) ) return matching_n_grams @@ -251,29 +251,29 @@ def _sum_over_dicts(total_n_grams: Dict[int, Tensor], n_grams: Dict[int, Tensor] def _calculate_fscore( matching_char_n_grams: Dict[int, Tensor], matching_word_n_grams: Dict[int, Tensor], - ref_char_n_grams: Dict[int, Tensor], - ref_word_n_grams: Dict[int, Tensor], - hyp_char_n_grams: Dict[int, Tensor], - hyp_word_n_grams: Dict[int, Tensor], + prediction_char_n_grams: Dict[int, Tensor], + prediction_word_n_grams: Dict[int, Tensor], + target_char_n_grams: Dict[int, Tensor], + target_word_n_grams: Dict[int, Tensor], n_order: float, beta: float, ) -> Tensor: - """Calculate sentence-level chrF/chrF++ score. For given hypothesis and reference statistics (either sentence- + """Calculate sentence-level chrF/chrF++ score. For given prediction and target statistics (either sentence- level or corpus-level) the chrF/chrF++ score is returned. Args: matching_char_n_grams: - A total number of matching character n-grams between the best matching reference and hypothesis. + A total number of matching character n-grams between the best matching target and prediction. matching_word_n_grams: - A total number of matching word n-grams between the best matching reference and hypothesis. - ref_char_n_grams: - A total number of reference character n-grams. - ref_word_n_grams: - A total number of reference word n-grams. - hyp_char_n_grams: - A total number of hypothesis character n-grams. - hyp_word_n_grams: - A total number of hypothesis word n-grams. + A total number of matching word n-grams between the best matching target and prediction. + target_char_n_grams: + A total number of target character n-grams. + target_word_n_grams: + A total number of target word n-grams. + prediction_char_n_grams: + A total number of prediction character n-grams. + prediction_word_n_grams: + A total number of prediction word n-grams. n_order: A sum of character and word n-gram order. 
beta: @@ -302,19 +302,19 @@ def _get_n_gram_fscore( return f_score - char_n_gram_f_score = _get_n_gram_fscore(matching_char_n_grams, ref_char_n_grams, hyp_char_n_grams, beta) - word_n_gram_f_score = _get_n_gram_fscore(matching_word_n_grams, ref_word_n_grams, hyp_word_n_grams, beta) + char_n_gram_f_score = _get_n_gram_fscore(matching_char_n_grams, target_char_n_grams, prediction_char_n_grams, beta) + word_n_gram_f_score = _get_n_gram_fscore(matching_word_n_grams, target_word_n_grams, prediction_word_n_grams, beta) f_score = (sum(char_n_gram_f_score.values()) + sum(word_n_gram_f_score.values())) / tensor(n_order) # type: ignore return f_score def _calculate_sentence_level_chrf_score( - references: List[str], - hyp_char_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], - hyp_word_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], - hyp_char_n_grams: Dict[int, Tensor], - hyp_word_n_grams: Dict[int, Tensor], + targets: List[str], + prediction_char_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], + prediction_word_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], + prediction_char_n_grams: Dict[int, Tensor], + prediction_word_n_grams: Dict[int, Tensor], n_char_order: int, n_word_order: int, n_order: float, @@ -322,20 +322,20 @@ def _calculate_sentence_level_chrf_score( lowercase: bool, whitespace: bool, ) -> Tuple[Tensor, Dict[int, Tensor], Dict[int, Tensor], Dict[int, Tensor], Dict[int, Tensor]]: - """Calculate the best sentence-level chrF/chrF++ score. For a given pre-processed hypothesis, all references - are evaluated and score and statistics for the best matching reference is returned. + """Calculate the best sentence-level chrF/chrF++ score. For a given pre-processed prediction, all targets + are evaluated and score and statistics for the best matching target is returned. Args: - references: - An iterable of references. - hyp_char_n_grams_counts: - A dictionary of dictionaries with hypothesis character n-grams. - hyp_word_n_grams_counts: - A dictionary of dictionaries with hypothesis word n-grams. - hyp_char_n_grams: - A total number of hypothesis character n-grams. - hyp_word_n_grams: - A total number of hypothesis word n-grams. + targets: + An iterable of targets. + prediction_char_n_grams_counts: + A dictionary of dictionaries with prediction character n-grams. + prediction_word_n_grams_counts: + A dictionary of dictionaries with prediction word n-grams. + prediction_char_n_grams: + A total number of prediction character n-grams. + prediction_word_n_grams: + A total number of prediction word n-grams. n_char_order: A character n-gram order. n_word_order: @@ -350,43 +350,43 @@ def _calculate_sentence_level_chrf_score( An indication whether to keep whitespaces during character n-gram extraction. Return: - Return chrF/chrF++ score and statistics for the best matching hypothesis and reference. + Return chrF/chrF++ score and statistics for the best matching prediction and target. f_score: A sentence-level chrF/chrF++ score. matching_char_n_grams: - A total number of matching character n-grams between the best matching reference and hypothesis. + A total number of matching character n-grams between the best matching target and prediction. matching_word_n_grams: - A total number of matching word n-grams between the best matching reference and hypothesis. - ref_char_n_grams: - A total number of reference character n-grams. - ref_word_n_grams: - A total number of reference word n-grams. 
+ A total number of matching word n-grams between the best matching target and prediction. + target_char_n_grams: + A total number of target character n-grams. + target_word_n_grams: + A total number of target word n-grams. """ best_f_score = tensor(0.0) best_matching_char_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) best_matching_word_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) - best_ref_char_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) - best_ref_word_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) + best_target_char_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) + best_target_word_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) - for reference in references: + for target in targets: ( - ref_char_n_grams_counts, - ref_word_n_grams_counts, - ref_char_n_grams, - ref_word_n_grams, - ) = _get_n_grams_counts_and_total_ngrams(reference, n_char_order, n_word_order, lowercase, whitespace) - matching_char_n_grams = _get_ngram_matches(ref_char_n_grams_counts, hyp_char_n_grams_counts) - matching_word_n_grams = _get_ngram_matches(ref_word_n_grams_counts, hyp_word_n_grams_counts) + target_char_n_grams_counts, + target_word_n_grams_counts, + target_char_n_grams, + target_word_n_grams, + ) = _get_n_grams_counts_and_total_ngrams(target, n_char_order, n_word_order, lowercase, whitespace) + matching_char_n_grams = _get_ngram_matches(prediction_char_n_grams_counts, target_char_n_grams_counts) + matching_word_n_grams = _get_ngram_matches(prediction_word_n_grams_counts, target_word_n_grams_counts) f_score = _calculate_fscore( matching_char_n_grams, matching_word_n_grams, - ref_char_n_grams, - ref_word_n_grams, - hyp_char_n_grams, - hyp_word_n_grams, + prediction_char_n_grams, + prediction_word_n_grams, + target_char_n_grams, + target_word_n_grams, n_order, beta, ) @@ -395,25 +395,25 @@ def _calculate_sentence_level_chrf_score( best_f_score = f_score best_matching_char_n_grams = matching_char_n_grams best_matching_word_n_grams = matching_word_n_grams - best_ref_char_n_grams = ref_char_n_grams - best_ref_word_n_grams = ref_word_n_grams + best_target_char_n_grams = target_char_n_grams + best_target_word_n_grams = target_word_n_grams return ( best_f_score, best_matching_char_n_grams, best_matching_word_n_grams, - best_ref_char_n_grams, - best_ref_word_n_grams, + best_target_char_n_grams, + best_target_word_n_grams, ) def _chrf_score_update( - reference_corpus: Union[Sequence[str], Sequence[Sequence[str]]], - hypothesis_corpus: Union[str, Sequence[str]], - total_ref_char_n_grams: Dict[int, Tensor], - total_ref_word_n_grams: Dict[int, Tensor], - total_hyp_char_n_grams: Dict[int, Tensor], - total_hyp_word_n_grams: Dict[int, Tensor], + prediction_corpus: Union[str, Sequence[str]], + target_corpus: Union[Sequence[str], Sequence[Sequence[str]]], + total_prediction_char_n_grams: Dict[int, Tensor], + total_prediction_word_n_grams: Dict[int, Tensor], + total_target_char_n_grams: Dict[int, Tensor], + total_target_word_n_grams: Dict[int, Tensor], total_matching_char_n_grams: Dict[int, Tensor], total_matching_word_n_grams: Dict[int, Tensor], n_char_order: int, @@ -434,22 +434,22 @@ def _chrf_score_update( ]: """ Args: - reference_corpus: - An iterable of iterables of reference corpus. - hypothesis_corpus: - An iterable of hypothesis corpus. - total_ref_char_n_grams: - A dictionary containing a total number of reference character n-grams. - total_ref_word_n_grams: - A dictionary containing a total number of reference word n-grams. 
- total_hyp_char_n_grams: - A dictionary containing a total number of hypothesis character n-grams. - total_hyp_word_n_grams: - A dictionary containing a total number of hypothesis word n-grams. + prediction_corpus: + An iterable of prediction corpus. + target_corpus: + An iterable of iterables of target corpus. + total_target_char_n_grams: + A dictionary containing a total number of target character n-grams. + total_target_word_n_grams: + A dictionary containing a total number of target word n-grams. + total_prediction_char_n_grams: + A dictionary containing a total number of prediction character n-grams. + total_prediction_word_n_grams: + A dictionary containing a total number of prediction word n-grams. total_matching_char_n_grams: - A dictionary containing a total number of matching character n-grams between references and hypotheses. + A dictionary containing a total number of matching character n-grams between targets and hypotheses. total_matching_word_n_grams: - A dictionary containing a total number of total matching word n-grams between references and hypotheses. + A dictionary containing a total number of total matching word n-grams between targets and hypotheses. n_char_order: A character n-gram order. n_word_order: @@ -466,51 +466,51 @@ def _chrf_score_update( A list of sentence-level chrF/chrF++ scores. Return: - total_ref_char_n_grams: - An updated dictionary containing a total number of reference character n-grams. - total_ref_word_n_grams: - An updated dictionary containing a total number of reference word n-grams. - total_hyp_char_n_grams: - An updated dictionary containing a total number of hypothesis character n-grams. - total_hyp_word_n_grams: - An updated dictionary containing a total number of hypothesis word n-grams. + total_target_char_n_grams: + An updated dictionary containing a total number of target character n-grams. + total_target_word_n_grams: + An updated dictionary containing a total number of target word n-grams. + total_prediction_char_n_grams: + An updated dictionary containing a total number of prediction character n-grams. + total_prediction_word_n_grams: + An updated dictionary containing a total number of prediction word n-grams. total_matching_char_n_grams: - An updated dictionary containing a total number of matching character n-grams between references and + An updated dictionary containing a total number of matching character n-grams between targets and hypotheses. total_matching_word_n_grams: - An updated dictionary containing a total number of total matching word n-grams between references and + An updated dictionary containing a total number of total matching word n-grams between targets and hypotheses. sentence_chrf_score: (Optionally) A list of sentence-level chrF/chrF++ scores. Raises: ValueError: - If length of `reference_corpus` and `hypothesis_corpus` differs. + If length of `target_corpus` and `prediction_corpus` differs. 
""" - reference_corpus, hypothesis_corpus = _validate_inputs(reference_corpus, hypothesis_corpus) + target_corpus, prediction_corpus = _validate_inputs(target_corpus, prediction_corpus) - for (references, hypothesis) in zip(reference_corpus, hypothesis_corpus): + for (targets, prediction) in zip(target_corpus, prediction_corpus): ( - hyp_char_n_grams_counts, - hyp_word_n_grams_counts, - hyp_char_n_grams, - hyp_word_n_grams, - ) = _get_n_grams_counts_and_total_ngrams(hypothesis, n_char_order, n_word_order, lowercase, whitespace) - total_hyp_char_n_grams = _sum_over_dicts(total_hyp_char_n_grams, hyp_char_n_grams) - total_hyp_word_n_grams = _sum_over_dicts(total_hyp_word_n_grams, hyp_word_n_grams) + prediction_char_n_grams_counts, + prediction_word_n_grams_counts, + prediction_char_n_grams, + prediction_word_n_grams, + ) = _get_n_grams_counts_and_total_ngrams(prediction, n_char_order, n_word_order, lowercase, whitespace) + total_prediction_char_n_grams = _sum_over_dicts(total_prediction_char_n_grams, prediction_char_n_grams) + total_prediction_word_n_grams = _sum_over_dicts(total_prediction_word_n_grams, prediction_word_n_grams) ( sentence_level_f_score, matching_char_n_grams, matching_word_n_grams, - ref_char_n_grams, - ref_word_n_grams, + target_char_n_grams, + target_word_n_grams, ) = _calculate_sentence_level_chrf_score( - references, # type: ignore - hyp_char_n_grams_counts, - hyp_word_n_grams_counts, - hyp_char_n_grams, - hyp_word_n_grams, + targets, # type: ignore + prediction_char_n_grams_counts, + prediction_word_n_grams_counts, + prediction_char_n_grams, + prediction_word_n_grams, n_char_order, n_word_order, n_order, @@ -522,16 +522,16 @@ def _chrf_score_update( if sentence_chrf_score is not None: sentence_chrf_score.append(sentence_level_f_score.unsqueeze(0)) - total_ref_char_n_grams = _sum_over_dicts(total_ref_char_n_grams, ref_char_n_grams) - total_ref_word_n_grams = _sum_over_dicts(total_ref_word_n_grams, ref_word_n_grams) + total_target_char_n_grams = _sum_over_dicts(total_target_char_n_grams, target_char_n_grams) + total_target_word_n_grams = _sum_over_dicts(total_target_word_n_grams, target_word_n_grams) total_matching_char_n_grams = _sum_over_dicts(total_matching_char_n_grams, matching_char_n_grams) total_matching_word_n_grams = _sum_over_dicts(total_matching_word_n_grams, matching_word_n_grams) return ( - total_ref_char_n_grams, - total_ref_word_n_grams, - total_hyp_char_n_grams, - total_hyp_word_n_grams, + total_prediction_char_n_grams, + total_prediction_word_n_grams, + total_target_char_n_grams, + total_target_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, sentence_chrf_score, @@ -539,31 +539,31 @@ def _chrf_score_update( def _chrf_score_compute( - total_ref_char_n_grams: Dict[int, Tensor], - total_ref_word_n_grams: Dict[int, Tensor], - total_hyp_char_n_grams: Dict[int, Tensor], - total_hyp_word_n_grams: Dict[int, Tensor], + total_prediction_char_n_grams: Dict[int, Tensor], + total_prediction_word_n_grams: Dict[int, Tensor], + total_target_char_n_grams: Dict[int, Tensor], + total_target_word_n_grams: Dict[int, Tensor], total_matching_char_n_grams: Dict[int, Tensor], total_matching_word_n_grams: Dict[int, Tensor], n_order: float, beta: float, ) -> Tensor: - """Compute chrF/chrF++ score based on pre-computed reference, hypothesis and matching character and word + """Compute chrF/chrF++ score based on pre-computed target, prediction and matching character and word n-grams. 
Args: - total_ref_char_n_grams: - A dictionary containing a total number of reference character n-grams. - total_ref_word_n_grams: - A dictionary containing a total number of reference word n-grams. - total_hyp_char_n_grams: - A dictionary containing a total number of hypothesis character n-grams. - total_hyp_word_n_grams: - A dictionary containing a total number of hypothesis word n-grams. + total_prediction_char_n_grams: + A dictionary containing a total number of prediction character n-grams. + total_prediction_word_n_grams: + A dictionary containing a total number of prediction word n-grams. + total_target_char_n_grams: + A dictionary containing a total number of target character n-grams. + total_target_word_n_grams: + A dictionary containing a total number of target word n-grams. total_matching_char_n_grams: - A dictionary containing a total number of matching character n-grams between references and hypotheses. + A dictionary containing a total number of matching character n-grams between targets and hypotheses. total_matching_word_n_grams: - A dictionary containing a total number of total matching word n-grams between references and hypotheses. + A dictionary containing a total number of total matching word n-grams between targets and hypotheses. n_order: A sum of charachter and word n-gram order. beta: @@ -575,10 +575,10 @@ def _chrf_score_compute( chrf_f_score = _calculate_fscore( total_matching_char_n_grams, total_matching_word_n_grams, - total_ref_char_n_grams, - total_ref_word_n_grams, - total_hyp_char_n_grams, - total_hyp_word_n_grams, + total_prediction_char_n_grams, + total_prediction_word_n_grams, + total_target_char_n_grams, + total_target_word_n_grams, n_order, beta, ) @@ -586,8 +586,8 @@ def _chrf_score_compute( def chrf_score( - reference_corpus: Union[Sequence[str], Sequence[Sequence[str]]], - hypothesis_corpus: Union[str, Sequence[str]], + prediction_corpus: Union[str, Sequence[str]], + target_corpus: Union[Sequence[str], Sequence[Sequence[str]]], n_char_order: int = 6, n_word_order: int = 2, beta: float = 2.0, @@ -595,16 +595,16 @@ def chrf_score( whitespace: bool = False, return_sentence_level_score: bool = False, ) -> Union[Tensor, Tuple[Tensor, Tensor]]: - """Calculate `chrF score`_ of machine translated text with one or more references. This implementation + """Calculate `chrF score`_ of machine translated text with one or more targets. This implementation supports both chrF score computation introduced in [1] and chrF++ score introduced in `chrF++ score`_. This implementation follows the implmenetaions from https://github.com/m-popovic/chrF and https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/chrf.py. Args: - reference_corpus: - An iterable of iterables of reference corpus. - hypothesis_corpus: - An iterable of hypothesis corpus. + prediction_corpus: + An iterable of prediction corpus. + target_corpus: + An iterable of iterables of target corpus. n_char_order: A character n-gram order. If `n_char_order=6`, the metrics refers to the official chrF/chrF++. 
n_word_order: @@ -633,9 +633,9 @@ def chrf_score( Example: >>> from torchmetrics.functional import chrf_score - >>> hypothesis_corpus = ['the cat is on the mat'] - >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] - >>> chrf_score(reference_corpus, hypothesis_corpus) + >>> prediction_corpus = ['the cat is on the mat'] + >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> chrf_score(prediction_corpus, target_corpus) tensor(0.8640) References: @@ -652,10 +652,10 @@ def chrf_score( n_order = float(n_char_order + n_word_order) ( - total_ref_char_n_grams, - total_ref_word_n_grams, - total_hyp_char_n_grams, - total_hyp_word_n_grams, + total_prediction_char_n_grams, + total_prediction_word_n_grams, + total_target_char_n_grams, + total_target_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, ) = _prepare_n_grams_dicts(n_char_order, n_word_order) @@ -663,20 +663,20 @@ def chrf_score( sentence_chrf_score: Optional[List[Tensor]] = [] if return_sentence_level_score else None ( - total_ref_char_n_grams, - total_ref_word_n_grams, - total_hyp_char_n_grams, - total_hyp_word_n_grams, + total_prediction_char_n_grams, + total_prediction_word_n_grams, + total_target_char_n_grams, + total_target_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, sentence_chrf_score, ) = _chrf_score_update( - reference_corpus, - hypothesis_corpus, - total_ref_char_n_grams, - total_ref_word_n_grams, - total_hyp_char_n_grams, - total_hyp_word_n_grams, + prediction_corpus, + target_corpus, + total_prediction_char_n_grams, + total_prediction_word_n_grams, + total_target_char_n_grams, + total_target_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, n_char_order, @@ -689,10 +689,10 @@ def chrf_score( ) chrf_f_score = _chrf_score_compute( - total_ref_char_n_grams, - total_ref_word_n_grams, - total_hyp_char_n_grams, - total_hyp_word_n_grams, + total_prediction_char_n_grams, + total_prediction_word_n_grams, + total_target_char_n_grams, + total_target_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, n_order, diff --git a/torchmetrics/text/bleu.py b/torchmetrics/text/bleu.py index c28bf9bc1df..b157bfac91e 100644 --- a/torchmetrics/text/bleu.py +++ b/torchmetrics/text/bleu.py @@ -45,10 +45,10 @@ class BLEUScore(Metric): will be used to perform the allgather. 
Example: - >>> translate_corpus = ['the cat is on the mat'] - >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> prediction_corpus = ['the cat is on the mat'] + >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] >>> metric = BLEUScore() - >>> metric(reference_corpus, translate_corpus) + >>> metric(prediction_corpus, target_corpus) tensor(0.7598) References: @@ -61,8 +61,8 @@ class BLEUScore(Metric): is_differentiable = False higher_is_better = True - trans_len: Tensor - ref_len: Tensor + prediction_len: Tensor + target_len: Tensor numerator: Tensor denominator: Tensor @@ -85,28 +85,28 @@ def __init__( self.n_gram = n_gram self.smooth = smooth - self.add_state("trans_len", tensor(0, dtype=torch.float), dist_reduce_fx="sum") - self.add_state("ref_len", tensor(0, dtype=torch.float), dist_reduce_fx="sum") + self.add_state("prediction_len", tensor(0, dtype=torch.float), dist_reduce_fx="sum") + self.add_state("target_len", tensor(0, dtype=torch.float), dist_reduce_fx="sum") self.add_state("numerator", torch.zeros(self.n_gram), dist_reduce_fx="sum") self.add_state("denominator", torch.zeros(self.n_gram), dist_reduce_fx="sum") def update( # type: ignore - self, reference_corpus: Sequence[Sequence[str]], translate_corpus: Sequence[str] + self, prediction_corpus: Sequence[str], target_corpus: Sequence[Sequence[str]] ) -> None: """Compute Precision Scores. Args: - reference_corpus: An iterable of iterables of reference corpus - translate_corpus: An iterable of machine translated corpus + prediction_corpus: An iterable of machine translated corpus + target_corpus: An iterable of iterables of reference corpus """ - self.trans_len, self.ref_len = _bleu_score_update( - reference_corpus, - translate_corpus, + self.prediction_len, self.target_len = _bleu_score_update( + prediction_corpus, + target_corpus, self.numerator, self.denominator, - self.trans_len, - self.ref_len, + self.prediction_len, + self.target_len, self.n_gram, _tokenize_fn, ) @@ -118,5 +118,5 @@ def compute(self) -> Tensor: Tensor with BLEU Score """ return _bleu_score_compute( - self.trans_len, self.ref_len, self.numerator, self.denominator, self.n_gram, self.smooth + self.prediction_len, self.target_len, self.numerator, self.denominator, self.n_gram, self.smooth ) diff --git a/torchmetrics/text/chrf.py b/torchmetrics/text/chrf.py index b413402246c..51630ae5878 100644 --- a/torchmetrics/text/chrf.py +++ b/torchmetrics/text/chrf.py @@ -27,13 +27,13 @@ from torchmetrics.functional.text.chrf import _chrf_score_compute, _chrf_score_update, _prepare_n_grams_dicts _N_GRAM_LEVELS = ("char", "word") -_TEXT_LEVELS = ("ref", "hyp", "matching") +_TEXT_LEVELS = ("target", "prediction", "matching") _DICT_STATES_NAMES = ( - "total_ref_char_n_grams", - "total_ref_word_n_grams", - "total_hyp_char_n_grams", - "total_hyp_word_n_grams", + "total_prediction_char_n_grams", + "total_prediction_word_n_grams", + "total_target_char_n_grams", + "total_target_word_n_grams", "total_matching_char_n_grams", "total_matching_word_n_grams", ) @@ -83,10 +83,10 @@ class CHRFScore(Metric): If ``beta`` is smaller than 0. 
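# A sketch of batched accumulation with the renamed update signature: every
# update(prediction_corpus, target_corpus) call folds new n-gram statistics into
# the summed states, and with return_sentence_level_score=True compute() also
# returns the per-sentence chrF scores gathered across updates, as the tests
# above exercise.  The sentences here are illustrative only.
from torchmetrics import CHRFScore

metric = CHRFScore(return_sentence_level_score=True)
metric.update(["the cat is on the mat"], [["a cat is on the mat"]])
metric.update(["there is a dog"], [["there is a dog in the yard"]])
corpus_chrf, sentence_chrf = metric.compute()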
Example: - >>> hypothesis_corpus = ['the cat is on the mat'] - >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> prediction_corpus = ['the cat is on the mat'] + >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] >>> metric = CHRFScore() - >>> metric(reference_corpus, hypothesis_corpus) + >>> metric(prediction_corpus, target_corpus) tensor(0.8640) References: @@ -143,19 +143,20 @@ def __init__( self.add_state("sentence_chrf_score", [], dist_reduce_fx="cat") def update( # type: ignore - self, reference_corpus: Sequence[Sequence[str]], hypothesis_corpus: Sequence[str] + self, prediction_corpus: Sequence[str], target_corpus: Sequence[Sequence[str]] ) -> None: """Compute Precision Scores. Args: - reference_corpus: - An iterable of iterables of reference corpus. - hypothesis_corpus: - An iterable of hypothesis corpus. + prediction_corpus: + An iterable of prediction corpus. + target_corpus: + An iterable of iterables of target corpus. + """ n_grams_dicts_tuple = _chrf_score_update( - reference_corpus, - hypothesis_corpus, + prediction_corpus, + target_corpus, *self._convert_states_to_dicts(), self.n_char_order, self.n_word_order, @@ -222,5 +223,5 @@ def _get_state_name(text: str, n_gram_level: str, n: int) -> str: return f"total_{text}_{n_gram_level}_{n}_grams" def _get_text_n_gram_iterator(self) -> Iterator[Tuple[Tuple[str, int], str]]: - """Get iterator over char/word and reference/hypothesis/matching n-gram level.""" + """Get iterator over char/word and target/prediction/matching n-gram level.""" return itertools.product(zip(_N_GRAM_LEVELS, [self.n_char_order, self.n_word_order]), _TEXT_LEVELS) From 1e135217bf1b0eb590856558c032c28950e824bb Mon Sep 17 00:00:00 2001 From: ashutoshml Date: Tue, 28 Dec 2021 12:26:48 +0530 Subject: [PATCH 2/8] Standardize TER metric --- tests/text/test_ter.py | 66 +++--- torchmetrics/functional/text/bleu.py | 15 +- torchmetrics/functional/text/chrf.py | 11 +- torchmetrics/functional/text/ter.py | 287 +++++++++++++-------------- torchmetrics/text/bleu.py | 4 +- torchmetrics/text/chrf.py | 5 +- torchmetrics/text/ter.py | 32 +-- 7 files changed, 210 insertions(+), 210 deletions(-) diff --git a/tests/text/test_ter.py b/tests/text/test_ter.py index 4f49cc4665c..2961575b712 100644 --- a/tests/text/test_ter.py +++ b/tests/text/test_ter.py @@ -15,8 +15,8 @@ def sacrebleu_ter_fn( + predictions: Sequence[str], targets: Sequence[Sequence[str]], - preds: Sequence[str], normalized: bool, no_punct: bool, asian_support: bool, @@ -27,7 +27,7 @@ def sacrebleu_ter_fn( ) # Sacrebleu CHRF expects different format of input targets = [[target[i] for target in targets] for i in range(len(targets[0]))] - sacrebleu_ter = sacrebleu_ter.corpus_score(preds, targets).score / 100 + sacrebleu_ter = sacrebleu_ter.corpus_score(predictions, targets).score / 100 return tensor(sacrebleu_ter) @@ -43,7 +43,7 @@ def sacrebleu_ter_fn( ], ) @pytest.mark.parametrize( - ["preds", "targets"], + ["predictions", "targets"], [(_inputs_multiple_references.preds, _inputs_multiple_references.targets)], ) @pytest.mark.skipif(not _SACREBLEU_AVAILABLE, reason="test requires sacrebleu") @@ -51,7 +51,7 @@ class TestTER(TextTester): @pytest.mark.parametrize("ddp", [False, True]) @pytest.mark.parametrize("dist_sync_on_step", [False, True]) def test_chrf_score_class( - self, ddp, dist_sync_on_step, preds, targets, normalize, no_punctuation, asian_support, lowercase + self, ddp, dist_sync_on_step, predictions, targets, normalize, no_punctuation, asian_support, 
lowercase ): metric_args = { "normalize": normalize, @@ -69,16 +69,16 @@ def test_chrf_score_class( self.run_class_metric_test( ddp=ddp, - preds=preds, + preds=predictions, targets=targets, metric_class=TER, sk_metric=nltk_metric, dist_sync_on_step=dist_sync_on_step, metric_args=metric_args, - input_order=INPUT_ORDER.TARGETS_FIRST, + input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_ter_score_functional(self, preds, targets, normalize, no_punctuation, asian_support, lowercase): + def test_ter_score_functional(self, predictions, targets, normalize, no_punctuation, asian_support, lowercase): metric_args = { "normalize": normalize, "no_punctuation": no_punctuation, @@ -94,15 +94,17 @@ def test_ter_score_functional(self, preds, targets, normalize, no_punctuation, a ) self.run_functional_metric_test( - preds, + predictions, targets, metric_functional=ter, sk_metric=nltk_metric, metric_args=metric_args, - input_order=INPUT_ORDER.TARGETS_FIRST, + input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_chrf_score_differentiability(self, preds, targets, normalize, no_punctuation, asian_support, lowercase): + def test_chrf_score_differentiability( + self, predictions, targets, normalize, no_punctuation, asian_support, lowercase + ): metric_args = { "normalize": normalize, "no_punctuation": no_punctuation, @@ -111,51 +113,51 @@ def test_chrf_score_differentiability(self, preds, targets, normalize, no_punctu } self.run_differentiability_test( - preds=preds, + preds=predictions, targets=targets, metric_module=TER, metric_functional=ter, metric_args=metric_args, - input_order=INPUT_ORDER.TARGETS_FIRST, + input_order=INPUT_ORDER.PREDS_FIRST, ) def test_ter_empty_functional(): - hyp = [] - ref = [[]] - assert ter(ref, hyp) == tensor(0.0) + prediction = [] + target = [[]] + assert ter(prediction, target) == tensor(0.0) def test_ter_empty_class(): ter_metric = TER() - hyp = [] - ref = [[]] - assert ter_metric(ref, hyp) == tensor(0.0) + prediction = [] + target = [[]] + assert ter_metric(prediction, target) == tensor(0.0) -def test_ter_empty_with_non_empty_hyp_functional(): - hyp = ["python"] - ref = [[]] - assert ter(ref, hyp) == tensor(0.0) +def test_ter_empty_with_non_empty_prediction_functional(): + prediction = ["python"] + target = [[]] + assert ter(prediction, target) == tensor(0.0) -def test_ter_empty_with_non_empty_hyp_class(): +def test_ter_empty_with_non_empty_prediction_class(): ter_metric = TER() - hyp = ["python"] - ref = [[]] - assert ter_metric(ref, hyp) == tensor(0.0) + prediction = ["python"] + target = [[]] + assert ter_metric(prediction, target) == tensor(0.0) def test_ter_return_sentence_level_score_functional(): - hyp = _inputs_single_sentence_multiple_references.preds - ref = _inputs_single_sentence_multiple_references.targets - _, sentence_ter = ter(ref, hyp, return_sentence_level_score=True) + prediction = _inputs_single_sentence_multiple_references.preds + target = _inputs_single_sentence_multiple_references.targets + _, sentence_ter = ter(prediction, target, return_sentence_level_score=True) isinstance(sentence_ter, Tensor) def test_ter_return_sentence_level_class(): ter_metric = TER(return_sentence_level_score=True) - hyp = _inputs_single_sentence_multiple_references.preds - ref = _inputs_single_sentence_multiple_references.targets - _, sentence_ter = ter_metric(ref, hyp) + prediction = _inputs_single_sentence_multiple_references.preds + target = _inputs_single_sentence_multiple_references.targets + _, sentence_ter = ter_metric(prediction, target) isinstance(sentence_ter, Tensor) diff 
--git a/torchmetrics/functional/text/bleu.py b/torchmetrics/functional/text/bleu.py index 06281d9f009..55802feecb9 100644 --- a/torchmetrics/functional/text/bleu.py +++ b/torchmetrics/functional/text/bleu.py @@ -106,7 +106,12 @@ def _bleu_score_update( def _bleu_score_compute( - prediction_len: Tensor, target_len: Tensor, numerator: Tensor, denominator: Tensor, n_gram: int = 4, smooth: bool = False + prediction_len: Tensor, + target_len: Tensor, + numerator: Tensor, + denominator: Tensor, + n_gram: int = 4, + smooth: bool = False, ) -> Tensor: """Computes the BLEU score. @@ -133,7 +138,9 @@ def _bleu_score_compute( log_precision_scores = tensor([1.0 / n_gram] * n_gram, device=device) * torch.log(precision_scores) geometric_mean = torch.exp(torch.sum(log_precision_scores)) - brevity_penalty = tensor(1.0, device=device) if prediction_len > target_len else torch.exp(1 - (target_len / prediction_len)) + brevity_penalty = ( + tensor(1.0, device=device) if prediction_len > target_len else torch.exp(1 - (target_len / prediction_len)) + ) bleu = brevity_penalty * geometric_mean return bleu @@ -175,9 +182,7 @@ def bleu_score( and Skip-Bigram Statistics by Chin-Yew Lin and Franz Josef Och `Machine Translation Evolution`_ """ prediction_corpus_ = [prediction_corpus] if isinstance(prediction_corpus, str) else prediction_corpus - target_corpus_ = [ - [target_text] if isinstance(target_text, str) else target_text for target_text in target_corpus - ] + target_corpus_ = [[target_text] if isinstance(target_text, str) else target_text for target_text in target_corpus] if len(prediction_corpus_) != len(target_corpus_): raise ValueError(f"Corpus has different size {len(prediction_corpus_)} != {len(target_corpus_)}") diff --git a/torchmetrics/functional/text/chrf.py b/torchmetrics/functional/text/chrf.py index 6b701ebaf5c..501e34e83c5 100644 --- a/torchmetrics/functional/text/chrf.py +++ b/torchmetrics/functional/text/chrf.py @@ -322,8 +322,8 @@ def _calculate_sentence_level_chrf_score( lowercase: bool, whitespace: bool, ) -> Tuple[Tensor, Dict[int, Tensor], Dict[int, Tensor], Dict[int, Tensor], Dict[int, Tensor]]: - """Calculate the best sentence-level chrF/chrF++ score. For a given pre-processed prediction, all targets - are evaluated and score and statistics for the best matching target is returned. + """Calculate the best sentence-level chrF/chrF++ score. For a given pre-processed prediction, all targets are + evaluated and score and statistics for the best matching target is returned. Args: targets: @@ -548,8 +548,7 @@ def _chrf_score_compute( n_order: float, beta: float, ) -> Tensor: - """Compute chrF/chrF++ score based on pre-computed target, prediction and matching character and word - n-grams. + """Compute chrF/chrF++ score based on pre-computed target, prediction and matching character and word n-grams. Args: total_prediction_char_n_grams: @@ -595,8 +594,8 @@ def chrf_score( whitespace: bool = False, return_sentence_level_score: bool = False, ) -> Union[Tensor, Tuple[Tensor, Tensor]]: - """Calculate `chrF score`_ of machine translated text with one or more targets. This implementation - supports both chrF score computation introduced in [1] and chrF++ score introduced in `chrF++ score`_. This + """Calculate `chrF score`_ of machine translated text with one or more targets. This implementation supports + both chrF score computation introduced in [1] and chrF++ score introduced in `chrF++ score`_. 
This implementation follows the implmenetaions from https://github.com/m-popovic/chrF and https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/chrf.py. diff --git a/torchmetrics/functional/text/ter.py b/torchmetrics/functional/text/ter.py index c63efb666bd..62e09750f0f 100644 --- a/torchmetrics/functional/text/ter.py +++ b/torchmetrics/functional/text/ter.py @@ -206,84 +206,85 @@ def _preprocess_sentence(sentence: str, tokenizer: _TercomTokenizer) -> str: return tokenizer(sentence.rstrip()) -def _find_shifted_pairs(reference_words: List[str], hypothesis_words: List[str]) -> Iterator[Tuple[int, int, int]]: +def _find_shifted_pairs(prediction_words: List[str], target_words: List[str]) -> Iterator[Tuple[int, int, int]]: """Find matching word sub-sequences in two lists of words. Ignores sub-sequences starting at the same position. Args: - reference_words: - A list of a tokenized reference sentence. - hypothesis_words: - A list of a tokenized hypothesis sentence. + prediction_words: + A list of a tokenized prediction sentence. + target_words: + A list of a tokenized target sentence. + Return: - Yields tuples of `(reference_start, hypothesis_start, length` such that: - reference_words[reference_start : reference_start + length] ==\ - hypothesis_words[hypothesis_start : hypothesis_start + length] - - reference_start: - A list of reference start indices. - hypothesis_start: - A list of hypothesis start indices. + Yields tuples of `(target_start, prediction_start, length` such that: + target_words[target_start : target_start + length] ==\ + prediction_words[prediction_start : prediction_start + length] + + target_start: + A list of target start indices. + prediction_start: + A list of prediction start indices. length: A length of a word span to be considered. """ - for hypothesis_start in range(len(hypothesis_words)): - for reference_start in range(len(reference_words)): + for prediction_start in range(len(prediction_words)): + for target_start in range(len(target_words)): # this is slightly different from what tercom does but this should # really only kick in in degenerate cases - if abs(reference_start - hypothesis_start) > _MAX_SHIFT_DIST: + if abs(target_start - prediction_start) > _MAX_SHIFT_DIST: continue for length in range(1, _MAX_SHIFT_SIZE): - # Check if hypothesis and reference are equal so far - if hypothesis_words[hypothesis_start + length - 1] != reference_words[reference_start + length - 1]: + # Check if prediction and target are equal so far + if prediction_words[prediction_start + length - 1] != target_words[target_start + length - 1]: break - yield reference_start, hypothesis_start, length + yield prediction_start, target_start, length # Stop processing once a sequence is consumed. - _hyp = len(hypothesis_words) == hypothesis_start + length - _ref = len(reference_words) == reference_start + length - if _hyp or _ref: + _pred = len(prediction_words) == prediction_start + length + _target = len(target_words) == target_start + length + if _pred or _target: break def _handle_corner_cases_during_shifting( alignments: Dict[int, int], - reference_errors: List[int], - hypothesis_errors: List[int], - reference_start: int, - hypothesis_start: int, + prediction_errors: List[int], + target_errors: List[int], + prediction_start: int, + target_start: int, length: int, ) -> bool: """A helper function which returns `True` if any of corner cases has been met. Otherwise, `False` is returned. Args: alignments: - A dictionary mapping aligned positions between a reference and a hypothesis. 
- reference_errors: - A list of error positions in a reference. - hypothesis_errors: - A list of error positions in a hypothesis. - reference_start: - A reference start index. - hypothesis_start: - A hypothesis start index. + A dictionary mapping aligned positions between a target and a prediction. + prediction_errors: + A list of error positions in a prediction. + target_errors: + A list of error positions in a target. + prediction_start: + A prediction start index. + target_start: + A target start index. length: A length of a word span to be considered. Return: An indication whether any of conrner cases has been met. """ - # don't do the shift unless both the hypothesis was wrong and the - # reference doesn't match hypothesis at the target position - if sum(hypothesis_errors[hypothesis_start : hypothesis_start + length]) == 0: + # don't do the shift unless both the prediction was wrong and the + # target doesn't match prediction at the target position + if sum(prediction_errors[prediction_start : prediction_start + length]) == 0: return True - if sum(reference_errors[reference_start : reference_start + length]) == 0: + if sum(target_errors[target_start : target_start + length]) == 0: return True # don't try to shift within the subsequence - if hypothesis_start <= alignments[reference_start] < hypothesis_start + length: + if prediction_start <= alignments[target_start] < prediction_start + length: return True return False @@ -327,55 +328,55 @@ def _shift_word_within_shifted_string(words: List[str], start: int, target: int, def _shift_words( - reference_words: List[str], - hypothesis_words: List[str], + prediction_words: List[str], + target_words: List[str], cached_edit_distance: _LevenshteinEditDistance, checked_candidates: int, ) -> Tuple[int, List[str], int]: - """Attempt to shift words to match a hypothesis with a reference. It returns the lowest number of required - edits between a hypothesis and a provided reference, a list of shifted words and number of checked candidates. + """Attempt to shift words to match a prediction with a target. It returns the lowest number of required edits + between a prediction and a provided target, a list of shifted words and number of checked candidates. Note that the filtering of possible shifts and shift selection are heavily based on somewhat arbitrary heuristics. The code here follows as closely as possible the logic in Tercom, not always justifying the particular design choices. (The paragraph copied from https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/lib_ter.py) Args: - reference_words: - A list of lists of tokenized reference sentences. - hypothesis_words: - A list of tokenized hypothesis sentence. + prediction_words: + A list of tokenized prediction sentence. + target_words: + A list of lists of tokenized target sentences. cached_edit_distance: - A pre-computed edit distance between a hypothesis and a reference. + A pre-computed edit distance between a prediction and a target. checked_candidates: - A number of checked hypothesis candidates to match a provided reference. + A number of checked prediction candidates to match a provided target. Return: best_score: - The best (lowest) number of required edits to match hypothesis and reference sentences. + The best (lowest) number of required edits to match prediction and target sentences. shifted_words: - A list of shifted words in hypothesis sentences. + A list of shifted words in prediction sentences. 
checked_candidates: - A number of checked hypothesis candidates to match a provided reference. + A number of checked prediction candidates to match a provided target. """ - edit_distance, inverted_trace = cached_edit_distance(hypothesis_words) + edit_distance, inverted_trace = cached_edit_distance(prediction_words) trace = _flip_trace(inverted_trace) - alignments, reference_errors, hypothesis_errors = _trace_to_alignment(trace) + alignments, target_errors, prediction_errors = _trace_to_alignment(trace) best: Optional[Tuple[int, int, int, int, List[str]]] = None - for reference_start, hypothesis_start, length in _find_shifted_pairs(reference_words, hypothesis_words): + for prediction_start, target_start, length in _find_shifted_pairs(prediction_words, target_words): if _handle_corner_cases_during_shifting( - alignments, reference_errors, hypothesis_errors, reference_start, hypothesis_start, length + alignments, prediction_errors, target_errors, prediction_start, target_start, length ): continue prev_idx = -1 for offset in range(-1, length): - if reference_start + offset == -1: + if target_start + offset == -1: idx = 0 - elif reference_start + offset in alignments: - idx = alignments[reference_start + offset] + 1 - # offset is out of bounds => aims past reference + elif target_start + offset in alignments: + idx = alignments[target_start + offset] + 1 + # offset is out of bounds => aims past target else: break # Skip idx if already tried @@ -384,13 +385,13 @@ def _shift_words( prev_idx = idx - shifted_words = _perform_shift(hypothesis_words, hypothesis_start, length, idx) + shifted_words = _perform_shift(prediction_words, prediction_start, length, idx) # Elements of the tuple are designed to replicate Tercom ranking of shifts: candidate = ( edit_distance - cached_edit_distance(shifted_words)[0], # highest score first length, # then, longest match first - -hypothesis_start, # then, earliest match first + -prediction_start, # then, earliest match first -idx, # then, earliest target position first shifted_words, ) @@ -404,35 +405,35 @@ def _shift_words( break if not best: - return 0, hypothesis_words, checked_candidates + return 0, prediction_words, checked_candidates best_score, _, _, _, shifted_words = best return best_score, shifted_words, checked_candidates -def _translation_edit_rate(reference_words: List[str], hypothesis_words: List[str]) -> Tensor: - """Compute translation edit rate between reference and hypothesis sentences. +def _translation_edit_rate(prediction_words: List[str], target_words: List[str]) -> Tensor: + """Compute translation edit rate between target and prediction sentences. Args: - reference_words: - A list of lists of tokenized reference sentences. - hypothesis_words: - A list of a tokenized hypothesis sentence. + prediction_words: + A list of a tokenized prediction sentence. + target_words: + A list of lists of tokenized target sentences. Return: - A number of required edits to match hypothesis and reference sentences. + A number of required edits to match prediction and target sentences. 
""" - if len(reference_words) == 0: + if len(target_words) == 0: return tensor(0.0) - cached_edit_distance = _LevenshteinEditDistance(reference_words) + cached_edit_distance = _LevenshteinEditDistance(target_words) num_shifts = 0 checked_candidates = 0 - input_words = hypothesis_words + input_words = prediction_words while True: # do shifts until they stop reducing the edit distance delta, new_input_words, checked_candidates = _shift_words( - reference_words, input_words, cached_edit_distance, checked_candidates + input_words, target_words, cached_edit_distance, checked_candidates ) if checked_candidates >= _MAX_SHIFT_CANDIDATES or delta <= 0: break @@ -445,50 +446,48 @@ def _translation_edit_rate(reference_words: List[str], hypothesis_words: List[st return tensor(total_edits) -def _compute_sentence_statistics( - references_words: List[List[str]], hypothesis_words: List[str] -) -> Tuple[Tensor, Tensor]: - """Compute sentence TER statistics between hypothesis and provided references. +def _compute_sentence_statistics(prediction_words: List[str], target_words: List[List[str]]) -> Tuple[Tensor, Tensor]: + """Compute sentence TER statistics between prediction and provided targets. Args: - reference_words: - A list of lists of tokenized reference sentences. - hypothesis_words: - A list of tokenized hypothesis sentence. + prediction_words: + A list of tokenized prediction sentence. + target_words: + A list of lists of tokenized target sentences. Return: best_num_edits: - The best (lowest) number of required edits to match hypothesis and reference sentences. - avg_ref_len: - Average length of tokenized reference sentences. + The best (lowest) number of required edits to match prediction and target sentences. + avg_target_len: + Average length of tokenized target sentences. """ - ref_lengths = tensor(0.0) + target_lengths = tensor(0.0) best_num_edits = tensor(2e16) - for reference_words in references_words: - num_edits = _translation_edit_rate(reference_words, hypothesis_words) - ref_lengths += len(reference_words) + for tgt_words in target_words: + num_edits = _translation_edit_rate(prediction_words, tgt_words) + target_lengths += len(tgt_words) if num_edits < best_num_edits: best_num_edits = num_edits - avg_ref_len = ref_lengths / len(references_words) - return best_num_edits, avg_ref_len + avg_target_len = target_lengths / len(target_words) + return best_num_edits, avg_target_len -def _compute_ter_score_from_statistics(num_edits: Tensor, ref_length: Tensor) -> Tensor: - """Compute TER score based on pre-computed a number of edits and an average reference length. +def _compute_ter_score_from_statistics(num_edits: Tensor, target_length: Tensor) -> Tensor: + """Compute TER score based on pre-computed a number of edits and an average target length. num_edits: - A number of required edits to match hypothesis and reference sentences. - ref_length: - An average length of reference sentences. + A number of required edits to match prediction and target sentences. + target_length: + An average length of target sentences. Return: - A corpus-level TER score or 1 if reference_length == 0. + A corpus-level TER score or 1 if target_length == 0. 
""" - if ref_length > 0 and num_edits > 0: - score = num_edits / ref_length - elif ref_length == 0 and num_edits > 0: + if target_length > 0 and num_edits > 0: + score = num_edits / target_length + elif target_length == 0 and num_edits > 0: score = tensor(1.0) else: score = tensor(0.0) @@ -496,86 +495,86 @@ def _compute_ter_score_from_statistics(num_edits: Tensor, ref_length: Tensor) -> def _ter_update( - reference_corpus: Sequence[Union[str, Sequence[str]]], - hypothesis_corpus: Union[str, Sequence[str]], + prediction_corpus: Union[str, Sequence[str]], + target_corpus: Sequence[Union[str, Sequence[str]]], tokenizer: _TercomTokenizer, total_num_edits: Tensor, - total_ref_length: Tensor, + total_target_length: Tensor, sentence_ter: Optional[List[Tensor]] = None, ) -> Tuple[Tensor, Tensor, Optional[List[Tensor]]]: """Update TER statistics. Args: - reference_corpus: - An iterable of iterables of reference corpus. - hypothesis_corpus: - An iterable of hypothesis corpus. + prediction_corpus: + An iterable of prediction corpus. + target_corpus: + An iterable of iterables of target corpus. tokenizer: total_num_edits: - A total number of required edits to match hypothesis and reference sentences. - total_ref_length: - A total average length of reference sentences. + A total number of required edits to match prediction and target sentences. + total_target_length: + A total average length of target sentences. Return: total_num_edits: - A total number of required edits to match hypothesis and reference sentences. - total_ref_length: - A total average length of reference sentences. + A total number of required edits to match prediction and target sentences. + total_target_length: + A total average length of target sentences. sentence_ter: (Optionally) A list of sentence-level TER. Raises: ValueError: - If length of `reference_corpus` and `hypothesis_corpus` differs. + If length of `target_corpus` and `prediction_corpus` differs. """ - reference_corpus, hypothesis_corpus = _validate_inputs(reference_corpus, hypothesis_corpus) + target_corpus, prediction_corpus = _validate_inputs(target_corpus, prediction_corpus) - for (references, hypothesis) in zip(reference_corpus, hypothesis_corpus): - references_words_: List[List[str]] = [ - [word for word in _preprocess_sentence(ref, tokenizer).split()] for ref in references + for (prediction, targets) in zip(prediction_corpus, target_corpus): + target_words_: List[List[str]] = [ + [word for word in _preprocess_sentence(target, tokenizer).split()] for target in targets ] - hypothesis_words_: List[str] = [word for word in _preprocess_sentence(hypothesis, tokenizer).split()] - num_edits, ref_length = _compute_sentence_statistics(references_words_, hypothesis_words_) + prediction_words_: List[str] = [word for word in _preprocess_sentence(prediction, tokenizer).split()] + num_edits, target_length = _compute_sentence_statistics(prediction_words_, target_words_) total_num_edits += num_edits - total_ref_length += ref_length + total_target_length += target_length if sentence_ter is not None: - sentence_ter.append(_compute_ter_score_from_statistics(num_edits, ref_length).unsqueeze(0)) - return total_num_edits, total_ref_length, sentence_ter + sentence_ter.append(_compute_ter_score_from_statistics(num_edits, target_length).unsqueeze(0)) + return total_num_edits, total_target_length, sentence_ter -def _ter_compute(total_num_edits: Tensor, total_ref_length: Tensor) -> Tensor: - """Compute TER based on pre-computed a total number of edits and a total average reference length. 
+def _ter_compute(total_num_edits: Tensor, total_target_length: Tensor) -> Tensor: + """Compute TER based on pre-computed a total number of edits and a total average target length. Args: total_num_edits: - A total number of required edits to match hypothesis and reference sentences. - total_ref_length: - A total average length of reference sentences. + A total number of required edits to match prediction and target sentences. + total_target_length: + A total average length of target sentences. Return: A corpus-level TER score. """ - return _compute_ter_score_from_statistics(total_num_edits, total_ref_length) + return _compute_ter_score_from_statistics(total_num_edits, total_target_length) def ter( - reference_corpus: Sequence[Union[str, Sequence[str]]], - hypothesis_corpus: Union[str, Sequence[str]], + prediction_corpus: Union[str, Sequence[str]], + target_corpus: Sequence[Union[str, Sequence[str]]], normalize: bool = False, no_punctuation: bool = False, lowercase: bool = True, asian_support: bool = False, return_sentence_level_score: bool = False, ) -> Union[Tensor, Tuple[Tensor, List[Tensor]]]: - """Calculate Translation edit rate (`TER`_) of machine translated text with one or more references. This + """Calculate Translation edit rate (`TER`_) of machine translated text with one or more targets. This implementation follows the implmenetaions from https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/ter.py. The `sacrebleu` implmenetation is a near-exact reimplementation of the Tercom algorithm, produces identical results on all "sane" outputs. Args: - reference_corpus: - An iterable of iterables of reference corpus. - hypothesis_corpus: - An iterable of hypothesis corpus. + prediction_corpus: + An iterable of prediction corpus. + target_corpus: + An iterable of iterables of target corpus. normalize: An indication whether a general tokenization to be applied. no_punctuation: @@ -592,9 +591,9 @@ def ter( (Optionally) A list of sentence-level translation_edit_rate (TER) if `return_sentence_level_score=True`. 
Example: - >>> hypothesis_corpus = ['the cat is on the mat'] - >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] - >>> ter(reference_corpus, hypothesis_corpus) + >>> prediction_corpus = ['the cat is on the mat'] + >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> ter(prediction_corpus, target_corpus) tensor(0.1538) References: @@ -613,13 +612,13 @@ def ter( tokenizer: _TercomTokenizer = _TercomTokenizer(normalize, no_punctuation, lowercase, asian_support) total_num_edits = tensor(0.0) - total_ref_length = tensor(0.0) + total_target_length = tensor(0.0) sentence_ter: Optional[List[Tensor]] = [] if return_sentence_level_score else None - total_num_edits, total_ref_length, sentence_ter = _ter_update( - reference_corpus, hypothesis_corpus, tokenizer, total_num_edits, total_ref_length, sentence_ter + total_num_edits, total_target_length, sentence_ter = _ter_update( + prediction_corpus, target_corpus, tokenizer, total_num_edits, total_target_length, sentence_ter ) - ter_score = _ter_compute(total_num_edits, total_ref_length) + ter_score = _ter_compute(total_num_edits, total_target_length) if sentence_ter: return ter_score, sentence_ter diff --git a/torchmetrics/text/bleu.py b/torchmetrics/text/bleu.py index b157bfac91e..1929ea3668a 100644 --- a/torchmetrics/text/bleu.py +++ b/torchmetrics/text/bleu.py @@ -90,9 +90,7 @@ def __init__( self.add_state("numerator", torch.zeros(self.n_gram), dist_reduce_fx="sum") self.add_state("denominator", torch.zeros(self.n_gram), dist_reduce_fx="sum") - def update( # type: ignore - self, prediction_corpus: Sequence[str], target_corpus: Sequence[Sequence[str]] - ) -> None: + def update(self, prediction_corpus: Sequence[str], target_corpus: Sequence[Sequence[str]]) -> None: # type: ignore """Compute Precision Scores. Args: diff --git a/torchmetrics/text/chrf.py b/torchmetrics/text/chrf.py index 51630ae5878..de302fe4eee 100644 --- a/torchmetrics/text/chrf.py +++ b/torchmetrics/text/chrf.py @@ -142,9 +142,7 @@ def __init__( if self.return_sentence_level_score: self.add_state("sentence_chrf_score", [], dist_reduce_fx="cat") - def update( # type: ignore - self, prediction_corpus: Sequence[str], target_corpus: Sequence[Sequence[str]] - ) -> None: + def update(self, prediction_corpus: Sequence[str], target_corpus: Sequence[Sequence[str]]) -> None: # type: ignore """Compute Precision Scores. Args: @@ -152,7 +150,6 @@ def update( # type: ignore An iterable of prediction corpus. target_corpus: An iterable of iterables of target corpus. 
- """ n_grams_dicts_tuple = _chrf_score_update( prediction_corpus, diff --git a/torchmetrics/text/ter.py b/torchmetrics/text/ter.py index e9c1bd3cb15..f9ce0f2c80c 100644 --- a/torchmetrics/text/ter.py +++ b/torchmetrics/text/ter.py @@ -50,10 +50,10 @@ class TER(Metric): will be used to perform the allgather Example: - >>> hypothesis_corpus = ['the cat is on the mat'] - >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> prediction_corpus = ['the cat is on the mat'] + >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] >>> metric = TER() - >>> metric(reference_corpus, hypothesis_corpus) + >>> metric(prediction_corpus, target_corpus) tensor(0.1538) References: @@ -64,7 +64,7 @@ class TER(Metric): is_differentiable = False higher_is_better = False total_num_edits: Tensor - total_ref_len: Tensor + total_target_len: Tensor sentence_ter: Optional[List[Tensor]] = None def __init__( @@ -98,29 +98,29 @@ def __init__( self.return_sentence_level_score = return_sentence_level_score self.add_state("total_num_edits", tensor(0.0), dist_reduce_fx="sum") - self.add_state("total_ref_len", tensor(0.0), dist_reduce_fx="sum") + self.add_state("total_target_len", tensor(0.0), dist_reduce_fx="sum") if self.return_sentence_level_score: self.add_state("sentence_ter", [], dist_reduce_fx="cat") def update( # type: ignore self, - reference_corpus: Sequence[Union[str, Sequence[str]]], - hypothesis_corpus: Union[str, Sequence[str]], + prediction_corpus: Union[str, Sequence[str]], + target_corpus: Sequence[Union[str, Sequence[str]]], ) -> None: """Update TER statistics. Args: - reference_corpus: - An iterable of iterables of reference corpus. - hypothesis_corpus: - An iterable of hypothesis corpus. + prediction_corpus: + An iterable of prediction corpus. + target_corpus: + An iterable of iterables of target corpus. """ - self.total_num_edits, self.total_ref_len, self.sentence_ter = _ter_update( - reference_corpus, - hypothesis_corpus, + self.total_num_edits, self.total_target_len, self.sentence_ter = _ter_update( + prediction_corpus, + target_corpus, self.tokenizer, self.total_num_edits, - self.total_ref_len, + self.total_target_len, self.sentence_ter, ) @@ -131,7 +131,7 @@ def compute(self) -> Union[Tensor, Tuple[Tensor, Tensor]]: A corpus-level translation edit rate (TER). (Optionally) A list of sentence-level translation_edit_rate (TER) if `return_sentence_level_score=True`. 
""" - ter = _ter_compute(self.total_num_edits, self.total_ref_len) + ter = _ter_compute(self.total_num_edits, self.total_target_len) if self.sentence_ter is not None: return ter, torch.cat(self.sentence_ter) return ter From 18e76e979b2ca628462e5a542e3724f92dc30cab Mon Sep 17 00:00:00 2001 From: ashutoshml Date: Tue, 28 Dec 2021 13:38:25 +0530 Subject: [PATCH 3/8] Standardize SacreBLEU --- tests/text/test_sacre_bleu.py | 26 ++++++++-------- torchmetrics/functional/text/sacre_bleu.py | 36 +++++++++++----------- torchmetrics/text/sacre_bleu.py | 24 +++++++-------- 3 files changed, 43 insertions(+), 43 deletions(-) diff --git a/tests/text/test_sacre_bleu.py b/tests/text/test_sacre_bleu.py index 8cd34a807ff..3e121b026d3 100644 --- a/tests/text/test_sacre_bleu.py +++ b/tests/text/test_sacre_bleu.py @@ -31,16 +31,18 @@ TOKENIZERS = ("none", "13a", "zh", "intl", "char") -def sacrebleu_fn(targets: Sequence[Sequence[str]], preds: Sequence[str], tokenize: str, lowercase: bool) -> Tensor: +def sacrebleu_fn( + predictions: Sequence[str], targets: Sequence[Sequence[str]], tokenize: str, lowercase: bool +) -> Tensor: sacrebleu_fn = BLEU(tokenize=tokenize, lowercase=lowercase) # Sacrebleu expects different format of input targets = [[target[i] for target in targets] for i in range(len(targets[0]))] - sacrebleu_score = sacrebleu_fn.corpus_score(preds, targets).score / 100 + sacrebleu_score = sacrebleu_fn.corpus_score(predictions, targets).score / 100 return tensor(sacrebleu_score) @pytest.mark.parametrize( - ["preds", "targets"], + ["predictions", "targets"], [(_inputs_multiple_references.preds, _inputs_multiple_references.targets)], ) @pytest.mark.parametrize(["lowercase"], [(False,), (True,)]) @@ -49,42 +51,42 @@ def sacrebleu_fn(targets: Sequence[Sequence[str]], preds: Sequence[str], tokeniz class TestSacreBLEUScore(TextTester): @pytest.mark.parametrize("ddp", [False, True]) @pytest.mark.parametrize("dist_sync_on_step", [False, True]) - def test_bleu_score_class(self, ddp, dist_sync_on_step, preds, targets, tokenize, lowercase): + def test_bleu_score_class(self, ddp, dist_sync_on_step, predictions, targets, tokenize, lowercase): metric_args = {"tokenize": tokenize, "lowercase": lowercase} original_sacrebleu = partial(sacrebleu_fn, tokenize=tokenize, lowercase=lowercase) self.run_class_metric_test( ddp=ddp, - preds=preds, + preds=predictions, targets=targets, metric_class=SacreBLEUScore, sk_metric=original_sacrebleu, dist_sync_on_step=dist_sync_on_step, metric_args=metric_args, - input_order=INPUT_ORDER.TARGETS_FIRST, + input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_bleu_score_functional(self, preds, targets, tokenize, lowercase): + def test_bleu_score_functional(self, predictions, targets, tokenize, lowercase): metric_args = {"tokenize": tokenize, "lowercase": lowercase} original_sacrebleu = partial(sacrebleu_fn, tokenize=tokenize, lowercase=lowercase) self.run_functional_metric_test( - preds, + predictions, targets, metric_functional=sacre_bleu_score, sk_metric=original_sacrebleu, metric_args=metric_args, - input_order=INPUT_ORDER.TARGETS_FIRST, + input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_bleu_score_differentiability(self, preds, targets, tokenize, lowercase): + def test_bleu_score_differentiability(self, predictions, targets, tokenize, lowercase): metric_args = {"tokenize": tokenize, "lowercase": lowercase} self.run_differentiability_test( - preds=preds, + preds=predictions, targets=targets, metric_module=SacreBLEUScore, metric_functional=sacre_bleu_score, metric_args=metric_args, - 
input_order=INPUT_ORDER.TARGETS_FIRST, + input_order=INPUT_ORDER.PREDS_FIRST, ) diff --git a/torchmetrics/functional/text/sacre_bleu.py b/torchmetrics/functional/text/sacre_bleu.py index e42409c4f3c..57c7c134c0f 100644 --- a/torchmetrics/functional/text/sacre_bleu.py +++ b/torchmetrics/functional/text/sacre_bleu.py @@ -277,8 +277,8 @@ def _lower(line: str, lowercase: bool) -> str: def sacre_bleu_score( - reference_corpus: Sequence[Sequence[str]], - translate_corpus: Sequence[str], + prediction_corpus: Sequence[str], + target_corpus: Sequence[Sequence[str]], n_gram: int = 4, smooth: bool = False, tokenize: Literal["none", "13a", "zh", "intl", "char"] = "13a", @@ -288,10 +288,10 @@ def sacre_bleu_score( follows the behaviour of SacreBLEU [2] implementation from https://github.com/mjpost/sacrebleu. Args: - reference_corpus: - An iterable of iterables of reference corpus - translate_corpus: + prediction_corpus: An iterable of machine translated corpus + target_corpus: + An iterable of iterables of reference corpus n_gram: Gram value ranged from 1 to 4 (Default 4) smooth: @@ -307,9 +307,9 @@ def sacre_bleu_score( Example: >>> from torchmetrics.functional import sacre_bleu_score - >>> translate_corpus = ['the cat is on the mat'] - >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] - >>> sacre_bleu_score(reference_corpus, translate_corpus) + >>> prediction_corpus = ['the cat is on the mat'] + >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> sacre_bleu_score(prediction_corpus, target_corpus) tensor(0.7598) References: @@ -328,8 +328,8 @@ def sacre_bleu_score( raise ValueError( f"Unsupported tokenizer selected. Please, choose one of {list(_SacreBLEUTokenizer._TOKENIZE_FN.keys())}" ) - if len(translate_corpus) != len(reference_corpus): - raise ValueError(f"Corpus has different size {len(translate_corpus)} != {len(reference_corpus)}") + if len(prediction_corpus) != len(target_corpus): + raise ValueError(f"Corpus has different size {len(prediction_corpus)} != {len(target_corpus)}") if tokenize == "intl" and not _REGEX_AVAILABLE: raise ValueError( "`'intl'` tokenization requires `regex` installed. 
Use `pip install regex` or `pip install " @@ -338,19 +338,19 @@ def sacre_bleu_score( numerator = torch.zeros(n_gram) denominator = torch.zeros(n_gram) - trans_len = tensor(0, dtype=torch.float) - ref_len = tensor(0, dtype=torch.float) + prediction_len = tensor(0, dtype=torch.float) + target_len = tensor(0, dtype=torch.float) tokenize_fn = partial(_SacreBLEUTokenizer.tokenize, tokenize=tokenize, lowercase=lowercase) - trans_len, ref_len = _bleu_score_update( - reference_corpus, - translate_corpus, + prediction_len, target_len = _bleu_score_update( + prediction_corpus, + target_corpus, numerator, denominator, - trans_len, - ref_len, + prediction_len, + target_len, n_gram, tokenize_fn, ) - return _bleu_score_compute(trans_len, ref_len, numerator, denominator, n_gram, smooth) + return _bleu_score_compute(prediction_len, target_len, numerator, denominator, n_gram, smooth) diff --git a/torchmetrics/text/sacre_bleu.py b/torchmetrics/text/sacre_bleu.py index b8c59f3c646..44d1e8233de 100644 --- a/torchmetrics/text/sacre_bleu.py +++ b/torchmetrics/text/sacre_bleu.py @@ -64,10 +64,10 @@ class SacreBLEUScore(BLEUScore): Example: - >>> translate_corpus = ['the cat is on the mat'] - >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> prediction_corpus = ['the cat is on the mat'] + >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] >>> metric = SacreBLEUScore() - >>> metric(reference_corpus, translate_corpus) + >>> metric(prediction_corpus, target_corpus) tensor(0.7598) References: @@ -109,22 +109,20 @@ def __init__( ) self.tokenizer = _SacreBLEUTokenizer(tokenize, lowercase) - def update( # type: ignore - self, reference_corpus: Sequence[Sequence[str]], translate_corpus: Sequence[str] - ) -> None: + def update(self, prediction_corpus: Sequence[str], target_corpus: Sequence[Sequence[str]]) -> None: # type: ignore """Compute Precision Scores. 
Args: - reference_corpus: An iterable of iterables of reference corpus - translate_corpus: An iterable of machine translated corpus + prediction_corpus: An iterable of machine translated corpus + target_corpus: An iterable of iterables of reference corpus """ - self.trans_len, self.ref_len = _bleu_score_update( - reference_corpus, - translate_corpus, + self.prediction_len, self.target_len = _bleu_score_update( + prediction_corpus, + target_corpus, self.numerator, self.denominator, - self.trans_len, - self.ref_len, + self.prediction_len, + self.target_len, self.n_gram, self.tokenizer, ) From b057fe0a486008fd609509c0819f64747815a422 Mon Sep 17 00:00:00 2001 From: ashutoshml Date: Tue, 28 Dec 2021 13:44:58 +0530 Subject: [PATCH 4/8] Add warnings for breaking change files --- torchmetrics/functional/text/bleu.py | 6 ++++++ torchmetrics/functional/text/sacre_bleu.py | 5 +++++ torchmetrics/text/bleu.py | 6 +++++- torchmetrics/text/sacre_bleu.py | 6 ++++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/torchmetrics/functional/text/bleu.py b/torchmetrics/functional/text/bleu.py index 55802feecb9..3b0d457d297 100644 --- a/torchmetrics/functional/text/bleu.py +++ b/torchmetrics/functional/text/bleu.py @@ -16,6 +16,7 @@ # Authors: torchtext authors and @sluks # Date: 2020-07-18 # Link: https://pytorch.org/text/_modules/torchtext/data/metrics.html#bleu_score +import warnings from collections import Counter from typing import Callable, Sequence, Tuple, Union @@ -181,6 +182,11 @@ def bleu_score( [2] Automatic Evaluation of Machine Translation Quality Using Longest Common Subsequence and Skip-Bigram Statistics by Chin-Yew Lin and Franz Josef Och `Machine Translation Evolution`_ """ + warnings.warn( + "Input order of preds and targets were changed to target firsts and predictions \ + second in v0.7. Warning will be removed in v0.8" + ) + prediction_corpus_ = [prediction_corpus] if isinstance(prediction_corpus, str) else prediction_corpus target_corpus_ = [[target_text] if isinstance(target_text, str) else target_text for target_text in target_corpus] diff --git a/torchmetrics/functional/text/sacre_bleu.py b/torchmetrics/functional/text/sacre_bleu.py index 57c7c134c0f..cf59511407c 100644 --- a/torchmetrics/functional/text/sacre_bleu.py +++ b/torchmetrics/functional/text/sacre_bleu.py @@ -39,6 +39,7 @@ import re +import warnings from functools import partial from typing import Sequence @@ -321,6 +322,10 @@ def sacre_bleu_score( [3] Automatic Evaluation of Machine Translation Quality Using Longest Common Subsequence and Skip-Bigram Statistics by Chin-Yew Lin and Franz Josef Och `Machine Translation Evolution`_ """ + warnings.warn( + "Input order of preds and targets were changed to target firsts and predictions \ + second in v0.7. 
Warning will be removed in v0.8" + ) if tokenize not in AVAILABLE_TOKENIZERS: raise ValueError(f"Argument `tokenize` expected to be one of {AVAILABLE_TOKENIZERS} but got {tokenize}.") diff --git a/torchmetrics/text/bleu.py b/torchmetrics/text/bleu.py index 1929ea3668a..0f006de3a7b 100644 --- a/torchmetrics/text/bleu.py +++ b/torchmetrics/text/bleu.py @@ -16,6 +16,7 @@ # Authors: torchtext authors and @sluks # Date: 2020-07-18 # Link: https://pytorch.org/text/_modules/torchtext/data/metrics.html#bleu_score +import warnings from typing import Any, Callable, Optional, Sequence import torch @@ -81,7 +82,10 @@ def __init__( process_group=process_group, dist_sync_fn=dist_sync_fn, ) - + warnings.warn( + "Input order of preds and targets were changed to target firsts and predictions \ + second in v0.7. Warning will be removed in v0.8" + ) self.n_gram = n_gram self.smooth = smooth diff --git a/torchmetrics/text/sacre_bleu.py b/torchmetrics/text/sacre_bleu.py index 44d1e8233de..5f7f044b108 100644 --- a/torchmetrics/text/sacre_bleu.py +++ b/torchmetrics/text/sacre_bleu.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings + # referenced from # Library Name: torchtext # Authors: torchtext authors and @sluks @@ -99,6 +101,10 @@ def __init__( process_group=process_group, dist_sync_fn=dist_sync_fn, ) + warnings.warn( + "Input order of preds and targets were changed to target firsts and predictions \ + second in v0.7. Warning will be removed in v0.8" + ) if tokenize not in AVAILABLE_TOKENIZERS: raise ValueError(f"Argument `tokenize` expected to be one of {AVAILABLE_TOKENIZERS} but got {tokenize}.") From fc482a7e0ed63e14045743a88a17e4e5713d5e6b Mon Sep 17 00:00:00 2001 From: ashutoshml Date: Tue, 28 Dec 2021 18:11:17 +0530 Subject: [PATCH 5/8] Update docstrings --- torchmetrics/functional/text/bleu.py | 2 +- torchmetrics/functional/text/chrf.py | 27 ++++++++++++++------------- torchmetrics/functional/text/ter.py | 6 +++--- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/torchmetrics/functional/text/bleu.py b/torchmetrics/functional/text/bleu.py index 3b0d457d297..a0774b01bb4 100644 --- a/torchmetrics/functional/text/bleu.py +++ b/torchmetrics/functional/text/bleu.py @@ -70,8 +70,8 @@ def _bleu_score_update( """Updates and returns variables required to compute the BLEU score. Args: - target_corpus: An iterable of iterables of reference corpus prediction_corpus: An iterable of machine translated corpus + target_corpus: An iterable of iterables of reference corpus numerator: Numerator of precision score (true positives) denominator: Denominator of precision score (true positives + false positives) prediction_len: count of words in a candidate prediction diff --git a/torchmetrics/functional/text/chrf.py b/torchmetrics/functional/text/chrf.py index 501e34e83c5..42436f62d7c 100644 --- a/torchmetrics/functional/text/chrf.py +++ b/torchmetrics/functional/text/chrf.py @@ -215,10 +215,11 @@ def _get_ngram_matches( """Get a number of n-gram matches between target and prediction n-grams. Args: - target_n_grams_counts: prediction_n_grams_counts: + target_n_grams_counts: Return: + matching_n_grams """ matching_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) for n in prediction_n_grams_counts: @@ -266,14 +267,14 @@ def _calculate_fscore( A total number of matching character n-grams between the best matching target and prediction. 
matching_word_n_grams: A total number of matching word n-grams between the best matching target and prediction. - target_char_n_grams: - A total number of target character n-grams. - target_word_n_grams: - A total number of target word n-grams. prediction_char_n_grams: A total number of prediction character n-grams. prediction_word_n_grams: A total number of prediction word n-grams. + target_char_n_grams: + A total number of target character n-grams. + target_word_n_grams: + A total number of target word n-grams. n_order: A sum of character and word n-gram order. beta: @@ -438,14 +439,14 @@ def _chrf_score_update( An iterable of prediction corpus. target_corpus: An iterable of iterables of target corpus. - total_target_char_n_grams: - A dictionary containing a total number of target character n-grams. - total_target_word_n_grams: - A dictionary containing a total number of target word n-grams. total_prediction_char_n_grams: A dictionary containing a total number of prediction character n-grams. total_prediction_word_n_grams: A dictionary containing a total number of prediction word n-grams. + total_target_char_n_grams: + A dictionary containing a total number of target character n-grams. + total_target_word_n_grams: + A dictionary containing a total number of target word n-grams. total_matching_char_n_grams: A dictionary containing a total number of matching character n-grams between targets and hypotheses. total_matching_word_n_grams: @@ -466,14 +467,14 @@ def _chrf_score_update( A list of sentence-level chrF/chrF++ scores. Return: - total_target_char_n_grams: - An updated dictionary containing a total number of target character n-grams. - total_target_word_n_grams: - An updated dictionary containing a total number of target word n-grams. total_prediction_char_n_grams: An updated dictionary containing a total number of prediction character n-grams. total_prediction_word_n_grams: An updated dictionary containing a total number of prediction word n-grams. + total_target_char_n_grams: + An updated dictionary containing a total number of target character n-grams. + total_target_word_n_grams: + An updated dictionary containing a total number of target word n-grams. total_matching_char_n_grams: An updated dictionary containing a total number of matching character n-grams between targets and hypotheses. diff --git a/torchmetrics/functional/text/ter.py b/torchmetrics/functional/text/ter.py index 62e09750f0f..b8ac23ce491 100644 --- a/torchmetrics/functional/text/ter.py +++ b/torchmetrics/functional/text/ter.py @@ -217,14 +217,14 @@ def _find_shifted_pairs(prediction_words: List[str], target_words: List[str]) -> Return: - Yields tuples of `(target_start, prediction_start, length` such that: + Yields tuples of `(prediction_start, target_start, length` such that: target_words[target_start : target_start + length] ==\ prediction_words[prediction_start : prediction_start + length] - target_start: - A list of target start indices. prediction_start: A list of prediction start indices. + target_start: + A list of target start indices. length: A length of a word span to be considered. 
""" From 0a48f777ebeea71bb1134b2d8414ce04dfab5170 Mon Sep 17 00:00:00 2001 From: ashutoshml Date: Tue, 28 Dec 2021 18:19:36 +0530 Subject: [PATCH 6/8] Update docstring + CHANGELOG.md --- CHANGELOG.md | 3 +++ torchmetrics/functional/text/bleu.py | 2 +- torchmetrics/functional/text/sacre_bleu.py | 2 +- torchmetrics/text/bleu.py | 4 ++-- torchmetrics/text/sacre_bleu.py | 4 ++-- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8fa95f171bc..4ce9d550cbf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `BLEUScore` now expects untokenized input to stay consistent with all the other text metrics ([#640](https://github.com/PyTorchLightning/metrics/pull/640)) +- `TER`, `BLEUScore`, `SacreBLEUScore`, `CHRFScore` now expect input order as predictions first and target second ([#696](https://github.com/PyTorchLightning/metrics/pull/696)) + + ### Deprecated - Renamed IoU -> Jaccard Index ([#662](https://github.com/PyTorchLightning/metrics/pull/662)) diff --git a/torchmetrics/functional/text/bleu.py b/torchmetrics/functional/text/bleu.py index a0774b01bb4..0a942554949 100644 --- a/torchmetrics/functional/text/bleu.py +++ b/torchmetrics/functional/text/bleu.py @@ -183,7 +183,7 @@ def bleu_score( and Skip-Bigram Statistics by Chin-Yew Lin and Franz Josef Och `Machine Translation Evolution`_ """ warnings.warn( - "Input order of preds and targets were changed to target firsts and predictions \ + "Input order of targets and preds were changed to predictions firsts and targets \ second in v0.7. Warning will be removed in v0.8" ) diff --git a/torchmetrics/functional/text/sacre_bleu.py b/torchmetrics/functional/text/sacre_bleu.py index cf59511407c..00401e4392a 100644 --- a/torchmetrics/functional/text/sacre_bleu.py +++ b/torchmetrics/functional/text/sacre_bleu.py @@ -323,7 +323,7 @@ def sacre_bleu_score( and Skip-Bigram Statistics by Chin-Yew Lin and Franz Josef Och `Machine Translation Evolution`_ """ warnings.warn( - "Input order of preds and targets were changed to target firsts and predictions \ + "Input order of targets and preds were changed to predictions firsts and targets \ second in v0.7. Warning will be removed in v0.8" ) if tokenize not in AVAILABLE_TOKENIZERS: diff --git a/torchmetrics/text/bleu.py b/torchmetrics/text/bleu.py index 0f006de3a7b..9c1d82a624d 100644 --- a/torchmetrics/text/bleu.py +++ b/torchmetrics/text/bleu.py @@ -83,8 +83,8 @@ def __init__( dist_sync_fn=dist_sync_fn, ) warnings.warn( - "Input order of preds and targets were changed to target firsts and predictions \ - second in v0.7. Warning will be removed in v0.8" + "Input order of targets and preds were changed to predictions firsts and targets \ + second in v0.7. Warning will be removed in v0.8" ) self.n_gram = n_gram self.smooth = smooth diff --git a/torchmetrics/text/sacre_bleu.py b/torchmetrics/text/sacre_bleu.py index 5f7f044b108..b577dc2f19b 100644 --- a/torchmetrics/text/sacre_bleu.py +++ b/torchmetrics/text/sacre_bleu.py @@ -102,8 +102,8 @@ def __init__( dist_sync_fn=dist_sync_fn, ) warnings.warn( - "Input order of preds and targets were changed to target firsts and predictions \ - second in v0.7. Warning will be removed in v0.8" + "Input order of targets and preds were changed to predictions firsts and targets \ + second in v0.7. 
Warning will be removed in v0.8" ) if tokenize not in AVAILABLE_TOKENIZERS: raise ValueError(f"Argument `tokenize` expected to be one of {AVAILABLE_TOKENIZERS} but got {tokenize}.") From 9b6a54102c87cf54e2f04f353f57c9272a8e222b Mon Sep 17 00:00:00 2001 From: ashutoshml Date: Mon, 3 Jan 2022 19:15:12 +0530 Subject: [PATCH 7/8] Update order and keep naming unchanged --- tests/text/test_bleu.py | 48 +-- tests/text/test_chrf.py | 38 +-- tests/text/test_sacre_bleu.py | 20 +- tests/text/test_ter.py | 60 ++-- torchmetrics/functional/text/bleu.py | 95 +++--- torchmetrics/functional/text/chrf.py | 354 ++++++++++----------- torchmetrics/functional/text/sacre_bleu.py | 34 +- torchmetrics/functional/text/ter.py | 287 ++++++++--------- torchmetrics/text/bleu.py | 34 +- torchmetrics/text/chrf.py | 34 +- torchmetrics/text/sacre_bleu.py | 24 +- torchmetrics/text/ter.py | 32 +- 12 files changed, 531 insertions(+), 529 deletions(-) diff --git a/tests/text/test_bleu.py b/tests/text/test_bleu.py index 01713aeacd3..48bbb12633d 100644 --- a/tests/text/test_bleu.py +++ b/tests/text/test_bleu.py @@ -27,12 +27,12 @@ smooth_func = SmoothingFunction().method2 -def _compute_bleu_metric_nltk(predictions, list_of_targets, weights, smoothing_function, **kwargs): - predictions_ = [prediction.split() for prediction in predictions] - list_of_targets_ = [[line.split() for line in target] for target in list_of_targets] +def _compute_bleu_metric_nltk(hypotheses, list_of_references, weights, smoothing_function, **kwargs): + hypotheses_ = [hypothesis.split() for hypothesis in hypotheses] + list_of_references_ = [[line.split() for line in ref] for ref in list_of_references] return corpus_bleu( - list_of_references=list_of_targets_, - hypotheses=predictions_, + list_of_references=list_of_references_, + hypotheses=hypotheses_, weights=weights, smoothing_function=smoothing_function, **kwargs @@ -49,19 +49,19 @@ def _compute_bleu_metric_nltk(predictions, list_of_targets, weights, smoothing_f ], ) @pytest.mark.parametrize( - ["predictions", "targets"], + ["preds", "targets"], [(_inputs_multiple_references.preds, _inputs_multiple_references.targets)], ) class TestBLEUScore(TextTester): @pytest.mark.parametrize("ddp", [False, True]) @pytest.mark.parametrize("dist_sync_on_step", [False, True]) - def test_bleu_score_class(self, ddp, dist_sync_on_step, predictions, targets, weights, n_gram, smooth_func, smooth): + def test_bleu_score_class(self, ddp, dist_sync_on_step, preds, targets, weights, n_gram, smooth_func, smooth): metric_args = {"n_gram": n_gram, "smooth": smooth} compute_bleu_metric_nltk = partial(_compute_bleu_metric_nltk, weights=weights, smoothing_function=smooth_func) self.run_class_metric_test( ddp=ddp, - preds=predictions, + preds=preds, targets=targets, metric_class=BLEUScore, sk_metric=compute_bleu_metric_nltk, @@ -70,12 +70,12 @@ def test_bleu_score_class(self, ddp, dist_sync_on_step, predictions, targets, we input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_bleu_score_functional(self, predictions, targets, weights, n_gram, smooth_func, smooth): + def test_bleu_score_functional(self, preds, targets, weights, n_gram, smooth_func, smooth): metric_args = {"n_gram": n_gram, "smooth": smooth} compute_bleu_metric_nltk = partial(_compute_bleu_metric_nltk, weights=weights, smoothing_function=smooth_func) self.run_functional_metric_test( - predictions, + preds, targets, metric_functional=bleu_score, sk_metric=compute_bleu_metric_nltk, @@ -83,11 +83,11 @@ def test_bleu_score_functional(self, predictions, targets, weights, n_gram, 
smoo input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_bleu_score_differentiability(self, predictions, targets, weights, n_gram, smooth_func, smooth): + def test_bleu_score_differentiability(self, preds, targets, weights, n_gram, smooth_func, smooth): metric_args = {"n_gram": n_gram, "smooth": smooth} self.run_differentiability_test( - preds=predictions, + preds=preds, targets=targets, metric_module=BLEUScore, metric_functional=bleu_score, @@ -97,26 +97,26 @@ def test_bleu_score_differentiability(self, predictions, targets, weights, n_gra def test_bleu_empty_functional(): - prediction = [[]] - target = [[[]]] - assert bleu_score(prediction, target) == tensor(0.0) + hyp = [[]] + ref = [[[]]] + assert bleu_score(hyp, ref) == tensor(0.0) def test_no_4_gram_functional(): - predictions = ["My full pytorch-lightning"] - targets = [["My full pytorch-lightning test", "Completely Different"]] - assert bleu_score(predictions, targets) == tensor(0.0) + hyps = ["My full pytorch-lightning"] + refs = [["My full pytorch-lightning test", "Completely Different"]] + assert bleu_score(hyps, refs) == tensor(0.0) def test_bleu_empty_class(): bleu = BLEUScore() - prediction = [[]] - target = [[[]]] - assert bleu(prediction, target) == tensor(0.0) + hyp = [[]] + ref = [[[]]] + assert bleu(hyp, ref) == tensor(0.0) def test_no_4_gram_class(): bleu = BLEUScore() - predictions = ["My full pytorch-lightning"] - targets = [["My full pytorch-lightning test", "Completely Different"]] - assert bleu(predictions, targets) == tensor(0.0) + hyps = ["My full pytorch-lightning"] + refs = [["My full pytorch-lightning test", "Completely Different"]] + assert bleu(hyps, refs) == tensor(0.0) diff --git a/tests/text/test_chrf.py b/tests/text/test_chrf.py index 09068d7e733..4863d850dfc 100644 --- a/tests/text/test_chrf.py +++ b/tests/text/test_chrf.py @@ -43,7 +43,7 @@ def sacrebleu_chrf_fn( ], ) @pytest.mark.parametrize( - ["predictions", "targets"], + ["preds", "targets"], [(_inputs_multiple_references.preds, _inputs_multiple_references.targets)], ) @pytest.mark.skipif(not _SACREBLEU_AVAILABLE, reason="test requires sacrebleu") @@ -51,7 +51,7 @@ class TestCHRFScore(TextTester): @pytest.mark.parametrize("ddp", [False, True]) @pytest.mark.parametrize("dist_sync_on_step", [False, True]) def test_chrf_score_class( - self, ddp, dist_sync_on_step, predictions, targets, char_order, word_order, lowercase, whitespace + self, ddp, dist_sync_on_step, preds, targets, char_order, word_order, lowercase, whitespace ): metric_args = { "n_char_order": char_order, @@ -65,7 +65,7 @@ def test_chrf_score_class( self.run_class_metric_test( ddp=ddp, - preds=predictions, + preds=preds, targets=targets, metric_class=CHRFScore, sk_metric=nltk_metric, @@ -74,7 +74,7 @@ def test_chrf_score_class( input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_chrf_score_functional(self, predictions, targets, char_order, word_order, lowercase, whitespace): + def test_chrf_score_functional(self, preds, targets, char_order, word_order, lowercase, whitespace): metric_args = { "n_char_order": char_order, "n_word_order": word_order, @@ -86,7 +86,7 @@ def test_chrf_score_functional(self, predictions, targets, char_order, word_orde ) self.run_functional_metric_test( - predictions, + preds, targets, metric_functional=chrf_score, sk_metric=nltk_metric, @@ -94,7 +94,7 @@ def test_chrf_score_functional(self, predictions, targets, char_order, word_orde input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_chrf_score_differentiability(self, predictions, targets, char_order, word_order, 
lowercase, whitespace): + def test_chrf_score_differentiability(self, preds, targets, char_order, word_order, lowercase, whitespace): metric_args = { "n_char_order": char_order, "n_word_order": word_order, @@ -103,7 +103,7 @@ def test_chrf_score_differentiability(self, predictions, targets, char_order, wo } self.run_differentiability_test( - preds=predictions, + preds=preds, targets=targets, metric_module=CHRFScore, metric_functional=chrf_score, @@ -113,28 +113,28 @@ def test_chrf_score_differentiability(self, predictions, targets, char_order, wo def test_chrf_empty_functional(): - prediction = [] - target = [[]] - assert chrf_score(prediction, target) == tensor(0.0) + hyp = [] + ref = [[]] + assert chrf_score(hyp, ref) == tensor(0.0) def test_chrf_empty_class(): chrf = CHRFScore() - prediction = [] - target = [[]] - assert chrf(prediction, target) == tensor(0.0) + hyp = [] + ref = [[]] + assert chrf(hyp, ref) == tensor(0.0) def test_chrf_return_sentence_level_score_functional(): - prediction = _inputs_single_sentence_multiple_references.preds - target = _inputs_single_sentence_multiple_references.targets - _, chrf_sentence_score = chrf_score(prediction, target, return_sentence_level_score=True) + hyp = _inputs_single_sentence_multiple_references.preds + ref = _inputs_single_sentence_multiple_references.targets + _, chrf_sentence_score = chrf_score(hyp, ref, return_sentence_level_score=True) isinstance(chrf_sentence_score, Tensor) def test_chrf_return_sentence_level_class(): chrf = CHRFScore(return_sentence_level_score=True) - prediction = _inputs_single_sentence_multiple_references.preds - target = _inputs_single_sentence_multiple_references.targets - _, chrf_sentence_score = chrf(prediction, target) + hyp = _inputs_single_sentence_multiple_references.preds + ref = _inputs_single_sentence_multiple_references.targets + _, chrf_sentence_score = chrf(hyp, ref) isinstance(chrf_sentence_score, Tensor) diff --git a/tests/text/test_sacre_bleu.py b/tests/text/test_sacre_bleu.py index 3e121b026d3..6cbe0aa8328 100644 --- a/tests/text/test_sacre_bleu.py +++ b/tests/text/test_sacre_bleu.py @@ -31,18 +31,16 @@ TOKENIZERS = ("none", "13a", "zh", "intl", "char") -def sacrebleu_fn( - predictions: Sequence[str], targets: Sequence[Sequence[str]], tokenize: str, lowercase: bool -) -> Tensor: +def sacrebleu_fn(preds: Sequence[str], targets: Sequence[Sequence[str]], tokenize: str, lowercase: bool) -> Tensor: sacrebleu_fn = BLEU(tokenize=tokenize, lowercase=lowercase) # Sacrebleu expects different format of input targets = [[target[i] for target in targets] for i in range(len(targets[0]))] - sacrebleu_score = sacrebleu_fn.corpus_score(predictions, targets).score / 100 + sacrebleu_score = sacrebleu_fn.corpus_score(preds, targets).score / 100 return tensor(sacrebleu_score) @pytest.mark.parametrize( - ["predictions", "targets"], + ["preds", "targets"], [(_inputs_multiple_references.preds, _inputs_multiple_references.targets)], ) @pytest.mark.parametrize(["lowercase"], [(False,), (True,)]) @@ -51,13 +49,13 @@ def sacrebleu_fn( class TestSacreBLEUScore(TextTester): @pytest.mark.parametrize("ddp", [False, True]) @pytest.mark.parametrize("dist_sync_on_step", [False, True]) - def test_bleu_score_class(self, ddp, dist_sync_on_step, predictions, targets, tokenize, lowercase): + def test_bleu_score_class(self, ddp, dist_sync_on_step, preds, targets, tokenize, lowercase): metric_args = {"tokenize": tokenize, "lowercase": lowercase} original_sacrebleu = partial(sacrebleu_fn, tokenize=tokenize, lowercase=lowercase) 
self.run_class_metric_test( ddp=ddp, - preds=predictions, + preds=preds, targets=targets, metric_class=SacreBLEUScore, sk_metric=original_sacrebleu, @@ -66,12 +64,12 @@ def test_bleu_score_class(self, ddp, dist_sync_on_step, predictions, targets, to input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_bleu_score_functional(self, predictions, targets, tokenize, lowercase): + def test_bleu_score_functional(self, preds, targets, tokenize, lowercase): metric_args = {"tokenize": tokenize, "lowercase": lowercase} original_sacrebleu = partial(sacrebleu_fn, tokenize=tokenize, lowercase=lowercase) self.run_functional_metric_test( - predictions, + preds, targets, metric_functional=sacre_bleu_score, sk_metric=original_sacrebleu, @@ -79,11 +77,11 @@ def test_bleu_score_functional(self, predictions, targets, tokenize, lowercase): input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_bleu_score_differentiability(self, predictions, targets, tokenize, lowercase): + def test_bleu_score_differentiability(self, preds, targets, tokenize, lowercase): metric_args = {"tokenize": tokenize, "lowercase": lowercase} self.run_differentiability_test( - preds=predictions, + preds=preds, targets=targets, metric_module=SacreBLEUScore, metric_functional=sacre_bleu_score, diff --git a/tests/text/test_ter.py b/tests/text/test_ter.py index 2961575b712..50c38049031 100644 --- a/tests/text/test_ter.py +++ b/tests/text/test_ter.py @@ -15,7 +15,7 @@ def sacrebleu_ter_fn( - predictions: Sequence[str], + preds: Sequence[str], targets: Sequence[Sequence[str]], normalized: bool, no_punct: bool, @@ -27,7 +27,7 @@ def sacrebleu_ter_fn( ) # Sacrebleu CHRF expects different format of input targets = [[target[i] for target in targets] for i in range(len(targets[0]))] - sacrebleu_ter = sacrebleu_ter.corpus_score(predictions, targets).score / 100 + sacrebleu_ter = sacrebleu_ter.corpus_score(preds, targets).score / 100 return tensor(sacrebleu_ter) @@ -43,7 +43,7 @@ def sacrebleu_ter_fn( ], ) @pytest.mark.parametrize( - ["predictions", "targets"], + ["preds", "targets"], [(_inputs_multiple_references.preds, _inputs_multiple_references.targets)], ) @pytest.mark.skipif(not _SACREBLEU_AVAILABLE, reason="test requires sacrebleu") @@ -51,7 +51,7 @@ class TestTER(TextTester): @pytest.mark.parametrize("ddp", [False, True]) @pytest.mark.parametrize("dist_sync_on_step", [False, True]) def test_chrf_score_class( - self, ddp, dist_sync_on_step, predictions, targets, normalize, no_punctuation, asian_support, lowercase + self, ddp, dist_sync_on_step, preds, targets, normalize, no_punctuation, asian_support, lowercase ): metric_args = { "normalize": normalize, @@ -69,7 +69,7 @@ def test_chrf_score_class( self.run_class_metric_test( ddp=ddp, - preds=predictions, + preds=preds, targets=targets, metric_class=TER, sk_metric=nltk_metric, @@ -78,7 +78,7 @@ def test_chrf_score_class( input_order=INPUT_ORDER.PREDS_FIRST, ) - def test_ter_score_functional(self, predictions, targets, normalize, no_punctuation, asian_support, lowercase): + def test_ter_score_functional(self, preds, targets, normalize, no_punctuation, asian_support, lowercase): metric_args = { "normalize": normalize, "no_punctuation": no_punctuation, @@ -94,7 +94,7 @@ def test_ter_score_functional(self, predictions, targets, normalize, no_punctuat ) self.run_functional_metric_test( - predictions, + preds, targets, metric_functional=ter, sk_metric=nltk_metric, @@ -102,9 +102,7 @@ def test_ter_score_functional(self, predictions, targets, normalize, no_punctuat input_order=INPUT_ORDER.PREDS_FIRST, ) - def 
test_chrf_score_differentiability( - self, predictions, targets, normalize, no_punctuation, asian_support, lowercase - ): + def test_chrf_score_differentiability(self, preds, targets, normalize, no_punctuation, asian_support, lowercase): metric_args = { "normalize": normalize, "no_punctuation": no_punctuation, @@ -113,7 +111,7 @@ def test_chrf_score_differentiability( } self.run_differentiability_test( - preds=predictions, + preds=preds, targets=targets, metric_module=TER, metric_functional=ter, @@ -123,41 +121,41 @@ def test_chrf_score_differentiability( def test_ter_empty_functional(): - prediction = [] - target = [[]] - assert ter(prediction, target) == tensor(0.0) + hyp = [] + ref = [[]] + assert ter(hyp, ref) == tensor(0.0) def test_ter_empty_class(): ter_metric = TER() - prediction = [] - target = [[]] - assert ter_metric(prediction, target) == tensor(0.0) + hyp = [] + ref = [[]] + assert ter_metric(hyp, ref) == tensor(0.0) -def test_ter_empty_with_non_empty_prediction_functional(): - prediction = ["python"] - target = [[]] - assert ter(prediction, target) == tensor(0.0) +def test_ter_empty_with_non_empty_hyp_functional(): + hyp = ["python"] + ref = [[]] + assert ter(hyp, ref) == tensor(0.0) -def test_ter_empty_with_non_empty_prediction_class(): +def test_ter_empty_with_non_empty_hyp_class(): ter_metric = TER() - prediction = ["python"] - target = [[]] - assert ter_metric(prediction, target) == tensor(0.0) + hyp = ["python"] + ref = [[]] + assert ter_metric(hyp, ref) == tensor(0.0) def test_ter_return_sentence_level_score_functional(): - prediction = _inputs_single_sentence_multiple_references.preds - target = _inputs_single_sentence_multiple_references.targets - _, sentence_ter = ter(prediction, target, return_sentence_level_score=True) + hyp = _inputs_single_sentence_multiple_references.preds + ref = _inputs_single_sentence_multiple_references.targets + _, sentence_ter = ter(hyp, ref, return_sentence_level_score=True) isinstance(sentence_ter, Tensor) def test_ter_return_sentence_level_class(): ter_metric = TER(return_sentence_level_score=True) - prediction = _inputs_single_sentence_multiple_references.preds - target = _inputs_single_sentence_multiple_references.targets - _, sentence_ter = ter_metric(prediction, target) + hyp = _inputs_single_sentence_multiple_references.preds + ref = _inputs_single_sentence_multiple_references.targets + _, sentence_ter = ter_metric(hyp, ref) isinstance(sentence_ter, Tensor) diff --git a/torchmetrics/functional/text/bleu.py b/torchmetrics/functional/text/bleu.py index 0a942554949..94df9495508 100644 --- a/torchmetrics/functional/text/bleu.py +++ b/torchmetrics/functional/text/bleu.py @@ -58,57 +58,57 @@ def _tokenize_fn(sentence: str) -> Sequence[str]: def _bleu_score_update( - prediction_corpus: Sequence[str], - target_corpus: Sequence[Sequence[str]], + translate_corpus: Sequence[str], + reference_corpus: Sequence[Sequence[str]], numerator: Tensor, denominator: Tensor, - prediction_len: Tensor, - target_len: Tensor, + trans_len: Tensor, + ref_len: Tensor, n_gram: int = 4, tokenizer: Callable[[str], Sequence[str]] = _tokenize_fn, ) -> Tuple[Tensor, Tensor]: """Updates and returns variables required to compute the BLEU score. 
Args: - prediction_corpus: An iterable of machine translated corpus - target_corpus: An iterable of iterables of reference corpus + translate_corpus: An iterable of machine translated corpus + reference_corpus: An iterable of iterables of reference corpus numerator: Numerator of precision score (true positives) denominator: Denominator of precision score (true positives + false positives) - prediction_len: count of words in a candidate prediction - target_len: count of words in a reference translation + trans_len: count of words in a candidate prediction + ref_len: count of words in a reference translation n_gram: gram value ranged 1 to 4 tokenizer: A function that turns sentence into list of words """ - target_corpus_: Sequence[Sequence[Sequence[str]]] = [ - [tokenizer(line) if line else [] for line in target] for target in target_corpus + reference_corpus_: Sequence[Sequence[Sequence[str]]] = [ + [tokenizer(line) if line else [] for line in reference] for reference in reference_corpus ] - prediction_corpus_: Sequence[Sequence[str]] = [tokenizer(line) if line else [] for line in prediction_corpus] + translate_corpus_: Sequence[Sequence[str]] = [tokenizer(line) if line else [] for line in translate_corpus] - for (prediction, targets) in zip(prediction_corpus_, target_corpus_): - prediction_len += len(prediction) - target_len_list = [len(ref) for ref in targets] - target_len_diff = [abs(len(prediction) - x) for x in target_len_list] - target_len += target_len_list[target_len_diff.index(min(target_len_diff))] - prediction_counter: Counter = _count_ngram(prediction, n_gram) - target_counter: Counter = Counter() + for (translation, references) in zip(translate_corpus_, reference_corpus_): + trans_len += len(translation) + ref_len_list = [len(ref) for ref in references] + ref_len_diff = [abs(len(translation) - x) for x in ref_len_list] + ref_len += ref_len_list[ref_len_diff.index(min(ref_len_diff))] + translation_counter: Counter = _count_ngram(translation, n_gram) + reference_counter: Counter = Counter() - for ref in targets: - target_counter |= _count_ngram(ref, n_gram) + for ref in references: + reference_counter |= _count_ngram(ref, n_gram) - ngram_counter_clip = prediction_counter & target_counter + ngram_counter_clip = translation_counter & reference_counter for counter_clip in ngram_counter_clip: numerator[len(counter_clip) - 1] += ngram_counter_clip[counter_clip] - for counter in prediction_counter: - denominator[len(counter) - 1] += prediction_counter[counter] + for counter in translation_counter: + denominator[len(counter) - 1] += translation_counter[counter] - return prediction_len, target_len + return trans_len, ref_len def _bleu_score_compute( - prediction_len: Tensor, - target_len: Tensor, + trans_len: Tensor, + ref_len: Tensor, numerator: Tensor, denominator: Tensor, n_gram: int = 4, @@ -117,8 +117,8 @@ def _bleu_score_compute( """Computes the BLEU score. 
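The compute step then folds those statistics into one score: an equally weighted geometric mean of the per-order precisions multiplied by the brevity penalty. A minimal, smoothing-free sketch of that arithmetic (the statistics in the call are made up, not taken from the doctest example):

import math
from typing import Sequence


def combine_bleu(numerators: Sequence[float], denominators: Sequence[float], pred_len: int, ref_len: int) -> float:
    """Geometric mean of modified n-gram precisions times the brevity penalty (no smoothing)."""
    if min(numerators) == 0:
        return 0.0  # a zero precision at any order zeroes the unsmoothed geometric mean
    n = len(numerators)
    geometric_mean = math.exp(sum(math.log(num / den) / n for num, den in zip(numerators, denominators)))
    brevity_penalty = 1.0 if pred_len > ref_len else math.exp(1 - ref_len / pred_len)
    return brevity_penalty * geometric_mean


# hypothetical stats for a 6-word prediction scored against a 6-word closest reference
print(combine_bleu([5, 4, 2, 1], [6, 5, 4, 3], pred_len=6, ref_len=6))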
Args: - prediction_len: count of words in a candidate prediction - target_len: count of words in a reference translation + trans_len: count of words in a candidate translation + ref_len: count of words in a reference translation numerator: Numerator of precision score (true positives) denominator: Denominator of precision score (true positives + false positives) n_gram: gram value ranged 1 to 4 @@ -139,26 +139,24 @@ def _bleu_score_compute( log_precision_scores = tensor([1.0 / n_gram] * n_gram, device=device) * torch.log(precision_scores) geometric_mean = torch.exp(torch.sum(log_precision_scores)) - brevity_penalty = ( - tensor(1.0, device=device) if prediction_len > target_len else torch.exp(1 - (target_len / prediction_len)) - ) + brevity_penalty = tensor(1.0, device=device) if trans_len > ref_len else torch.exp(1 - (ref_len / trans_len)) bleu = brevity_penalty * geometric_mean return bleu def bleu_score( - prediction_corpus: Union[str, Sequence[str]], - target_corpus: Sequence[Union[str, Sequence[str]]], + translate_corpus: Union[str, Sequence[str]], + reference_corpus: Sequence[Union[str, Sequence[str]]], n_gram: int = 4, smooth: bool = False, ) -> Tensor: """Calculate `BLEU score`_ of machine translated text with one or more references. Args: - prediction_corpus: + translate_corpus: An iterable of machine translated corpus - target_corpus: + reference_corpus: An iterable of iterables of reference corpus n_gram: Gram value ranged from 1 to 4 (Default 4) @@ -170,9 +168,9 @@ def bleu_score( Example: >>> from torchmetrics.functional import bleu_score - >>> prediction_corpus = ['the cat is on the mat'] - >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] - >>> bleu_score(prediction_corpus, target_corpus) + >>> translate_corpus = ['the cat is on the mat'] + >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> bleu_score(translate_corpus, reference_corpus) tensor(0.7598) References: @@ -186,20 +184,21 @@ def bleu_score( "Input order of targets and preds were changed to predictions firsts and targets \ second in v0.7. 
Warning will be removed in v0.8" ) + translate_corpus_ = [translate_corpus] if isinstance(translate_corpus, str) else translate_corpus + reference_corpus_ = [ + [reference_text] if isinstance(reference_text, str) else reference_text for reference_text in reference_corpus + ] - prediction_corpus_ = [prediction_corpus] if isinstance(prediction_corpus, str) else prediction_corpus - target_corpus_ = [[target_text] if isinstance(target_text, str) else target_text for target_text in target_corpus] - - if len(prediction_corpus_) != len(target_corpus_): - raise ValueError(f"Corpus has different size {len(prediction_corpus_)} != {len(target_corpus_)}") + if len(translate_corpus_) != len(reference_corpus_): + raise ValueError(f"Corpus has different size {len(translate_corpus_)} != {len(reference_corpus_)}") numerator = torch.zeros(n_gram) denominator = torch.zeros(n_gram) - prediction_len = tensor(0, dtype=torch.float) - target_len = tensor(0, dtype=torch.float) + trans_len = tensor(0, dtype=torch.float) + ref_len = tensor(0, dtype=torch.float) - prediction_len, target_len = _bleu_score_update( - prediction_corpus_, target_corpus_, numerator, denominator, prediction_len, target_len, n_gram, _tokenize_fn + trans_len, ref_len = _bleu_score_update( + translate_corpus_, reference_corpus_, numerator, denominator, trans_len, ref_len, n_gram, _tokenize_fn ) - return _bleu_score_compute(prediction_len, target_len, numerator, denominator, n_gram, smooth) + return _bleu_score_compute(trans_len, ref_len, numerator, denominator, n_gram, smooth) diff --git a/torchmetrics/functional/text/chrf.py b/torchmetrics/functional/text/chrf.py index 42436f62d7c..ba8b2214931 100644 --- a/torchmetrics/functional/text/chrf.py +++ b/torchmetrics/functional/text/chrf.py @@ -51,7 +51,7 @@ def _prepare_n_grams_dicts( ) -> Tuple[ Dict[int, Tensor], Dict[int, Tensor], Dict[int, Tensor], Dict[int, Tensor], Dict[int, Tensor], Dict[int, Tensor] ]: - """Prepare dictionaries dictionaries with default zero values for total target, prediction and matching + """Prepare dictionaries dictionaries with default zero values for total reference, hypothesis and matching character and word n-grams. Args: @@ -61,21 +61,21 @@ def _prepare_n_grams_dicts( A word n-gram order. Return: - Dictionaries with default zero values for total target, prediction and matching character and word + Dictionaries with default zero values for total reference, hypothesis and matching character and word n-grams. 
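For intuition, the character and word n-grams these totals refer to can be extracted roughly as follows. The helper below is hypothetical and only approximates the extraction rules (for example, spaces are dropped from character n-grams unless whitespace is kept, mirroring the whitespace flag described later).

from collections import Counter, defaultdict
from typing import Dict, List, Tuple


def char_and_word_ngrams(
    sentence: str, n_char_order: int = 6, n_word_order: int = 2, lowercase: bool = False, whitespace: bool = False
) -> Tuple[Dict[int, Counter], Dict[int, Counter]]:
    """Count character n-grams (orders 1..n_char_order) and word n-grams (orders 1..n_word_order)."""
    if lowercase:
        sentence = sentence.lower()
    chars: List[str] = list(sentence) if whitespace else list(sentence.replace(" ", ""))
    words: List[str] = sentence.split()

    def counts(seq: List[str], max_n: int) -> Dict[int, Counter]:
        out: Dict[int, Counter] = defaultdict(Counter)
        for n in range(1, max_n + 1):
            for i in range(len(seq) - n + 1):
                out[n][tuple(seq[i:i + n])] += 1
        return out

    return counts(chars, n_char_order), counts(words, n_word_order)


char_grams, word_grams = char_and_word_ngrams("the cat is on the mat")
print(sum(char_grams[6].values()), sum(word_grams[2].values()))  # totals of char 6-grams and word 2-grams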
""" - total_target_char_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_char_order)} - total_target_word_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_word_order)} - total_prediction_char_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_char_order)} - total_prediction_word_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_word_order)} + total_ref_char_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_char_order)} + total_ref_word_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_word_order)} + total_hyp_char_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_char_order)} + total_hyp_word_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_word_order)} total_matching_char_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_char_order)} total_matching_word_n_grams: Dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_word_order)} return ( - total_prediction_char_n_grams, - total_prediction_word_n_grams, - total_target_char_n_grams, - total_target_word_n_grams, + total_hyp_char_n_grams, + total_hyp_word_n_grams, + total_ref_char_n_grams, + total_ref_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, ) @@ -209,24 +209,24 @@ def _get_total_ngrams(n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]]) def _get_ngram_matches( - prediction_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], - target_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], + hyp_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], + ref_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], ) -> Dict[int, Tensor]: - """Get a number of n-gram matches between target and prediction n-grams. + """Get a number of n-gram matches between reference and hypothesis n-grams. Args: - prediction_n_grams_counts: - target_n_grams_counts: + hyp_n_grams_counts: + ref_n_grams_counts: Return: matching_n_grams """ matching_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) - for n in prediction_n_grams_counts: + for n in hyp_n_grams_counts: matching_n_grams[n] = tensor( sum( - torch.min(target_n_grams_counts[n][n_gram], prediction_n_grams_counts[n][n_gram]) - for n_gram in prediction_n_grams_counts[n] + torch.min(ref_n_grams_counts[n][n_gram], hyp_n_grams_counts[n][n_gram]) + for n_gram in hyp_n_grams_counts[n] ) ) return matching_n_grams @@ -252,29 +252,29 @@ def _sum_over_dicts(total_n_grams: Dict[int, Tensor], n_grams: Dict[int, Tensor] def _calculate_fscore( matching_char_n_grams: Dict[int, Tensor], matching_word_n_grams: Dict[int, Tensor], - prediction_char_n_grams: Dict[int, Tensor], - prediction_word_n_grams: Dict[int, Tensor], - target_char_n_grams: Dict[int, Tensor], - target_word_n_grams: Dict[int, Tensor], + hyp_char_n_grams: Dict[int, Tensor], + hyp_word_n_grams: Dict[int, Tensor], + ref_char_n_grams: Dict[int, Tensor], + ref_word_n_grams: Dict[int, Tensor], n_order: float, beta: float, ) -> Tensor: - """Calculate sentence-level chrF/chrF++ score. For given prediction and target statistics (either sentence- + """Calculate sentence-level chrF/chrF++ score. For given hypothesis and reference statistics (either sentence- level or corpus-level) the chrF/chrF++ score is returned. Args: matching_char_n_grams: - A total number of matching character n-grams between the best matching target and prediction. + A total number of matching character n-grams between the best matching reference and hypothesis. 
matching_word_n_grams: - A total number of matching word n-grams between the best matching target and prediction. - prediction_char_n_grams: - A total number of prediction character n-grams. - prediction_word_n_grams: - A total number of prediction word n-grams. - target_char_n_grams: - A total number of target character n-grams. - target_word_n_grams: - A total number of target word n-grams. + A total number of matching word n-grams between the best matching reference and hypothesis. + hyp_char_n_grams: + A total number of hypothesis character n-grams. + hyp_word_n_grams: + A total number of hypothesis word n-grams. + ref_char_n_grams: + A total number of reference character n-grams. + ref_word_n_grams: + A total number of reference word n-grams. n_order: A sum of character and word n-gram order. beta: @@ -303,19 +303,19 @@ def _get_n_gram_fscore( return f_score - char_n_gram_f_score = _get_n_gram_fscore(matching_char_n_grams, target_char_n_grams, prediction_char_n_grams, beta) - word_n_gram_f_score = _get_n_gram_fscore(matching_word_n_grams, target_word_n_grams, prediction_word_n_grams, beta) + char_n_gram_f_score = _get_n_gram_fscore(matching_char_n_grams, ref_char_n_grams, hyp_char_n_grams, beta) + word_n_gram_f_score = _get_n_gram_fscore(matching_word_n_grams, ref_word_n_grams, hyp_word_n_grams, beta) f_score = (sum(char_n_gram_f_score.values()) + sum(word_n_gram_f_score.values())) / tensor(n_order) # type: ignore return f_score def _calculate_sentence_level_chrf_score( - targets: List[str], - prediction_char_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], - prediction_word_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], - prediction_char_n_grams: Dict[int, Tensor], - prediction_word_n_grams: Dict[int, Tensor], + references: List[str], + hyp_char_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], + hyp_word_n_grams_counts: Dict[int, Dict[Tuple[str, ...], Tensor]], + hyp_char_n_grams: Dict[int, Tensor], + hyp_word_n_grams: Dict[int, Tensor], n_char_order: int, n_word_order: int, n_order: float, @@ -323,20 +323,20 @@ def _calculate_sentence_level_chrf_score( lowercase: bool, whitespace: bool, ) -> Tuple[Tensor, Dict[int, Tensor], Dict[int, Tensor], Dict[int, Tensor], Dict[int, Tensor]]: - """Calculate the best sentence-level chrF/chrF++ score. For a given pre-processed prediction, all targets are - evaluated and score and statistics for the best matching target is returned. + """Calculate the best sentence-level chrF/chrF++ score. For a given pre-processed hypothesis, all references + are evaluated and score and statistics for the best matching reference is returned. Args: - targets: - An iterable of targets. - prediction_char_n_grams_counts: - A dictionary of dictionaries with prediction character n-grams. - prediction_word_n_grams_counts: - A dictionary of dictionaries with prediction word n-grams. - prediction_char_n_grams: - A total number of prediction character n-grams. - prediction_word_n_grams: - A total number of prediction word n-grams. + references: + An iterable of references. + hyp_char_n_grams_counts: + A dictionary of dictionaries with hypothesis character n-grams. + hyp_word_n_grams_counts: + A dictionary of dictionaries with hypothesis word n-grams. + hyp_char_n_grams: + A total number of hypothesis character n-grams. + hyp_word_n_grams: + A total number of hypothesis word n-grams. n_char_order: A character n-gram order. 
n_word_order: @@ -351,43 +351,43 @@ def _calculate_sentence_level_chrf_score( An indication whether to keep whitespaces during character n-gram extraction. Return: - Return chrF/chrF++ score and statistics for the best matching prediction and target. + Return chrF/chrF++ score and statistics for the best matching hypothesis and reference. f_score: A sentence-level chrF/chrF++ score. matching_char_n_grams: - A total number of matching character n-grams between the best matching target and prediction. + A total number of matching character n-grams between the best matching reference and hypothesis. matching_word_n_grams: - A total number of matching word n-grams between the best matching target and prediction. - target_char_n_grams: - A total number of target character n-grams. - target_word_n_grams: - A total number of target word n-grams. + A total number of matching word n-grams between the best matching reference and hypothesis. + ref_char_n_grams: + A total number of reference character n-grams. + ref_word_n_grams: + A total number of reference word n-grams. """ best_f_score = tensor(0.0) best_matching_char_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) best_matching_word_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) - best_target_char_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) - best_target_word_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) + best_ref_char_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) + best_ref_word_n_grams: Dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) - for target in targets: + for reference in references: ( - target_char_n_grams_counts, - target_word_n_grams_counts, - target_char_n_grams, - target_word_n_grams, - ) = _get_n_grams_counts_and_total_ngrams(target, n_char_order, n_word_order, lowercase, whitespace) - matching_char_n_grams = _get_ngram_matches(prediction_char_n_grams_counts, target_char_n_grams_counts) - matching_word_n_grams = _get_ngram_matches(prediction_word_n_grams_counts, target_word_n_grams_counts) + ref_char_n_grams_counts, + ref_word_n_grams_counts, + ref_char_n_grams, + ref_word_n_grams, + ) = _get_n_grams_counts_and_total_ngrams(reference, n_char_order, n_word_order, lowercase, whitespace) + matching_char_n_grams = _get_ngram_matches(ref_char_n_grams_counts, hyp_char_n_grams_counts) + matching_word_n_grams = _get_ngram_matches(ref_word_n_grams_counts, hyp_word_n_grams_counts) f_score = _calculate_fscore( matching_char_n_grams, matching_word_n_grams, - prediction_char_n_grams, - prediction_word_n_grams, - target_char_n_grams, - target_word_n_grams, + hyp_char_n_grams, + hyp_word_n_grams, + ref_char_n_grams, + ref_word_n_grams, n_order, beta, ) @@ -396,25 +396,25 @@ def _calculate_sentence_level_chrf_score( best_f_score = f_score best_matching_char_n_grams = matching_char_n_grams best_matching_word_n_grams = matching_word_n_grams - best_target_char_n_grams = target_char_n_grams - best_target_word_n_grams = target_word_n_grams + best_ref_char_n_grams = ref_char_n_grams + best_ref_word_n_grams = ref_word_n_grams return ( best_f_score, best_matching_char_n_grams, best_matching_word_n_grams, - best_target_char_n_grams, - best_target_word_n_grams, + best_ref_char_n_grams, + best_ref_word_n_grams, ) def _chrf_score_update( - prediction_corpus: Union[str, Sequence[str]], - target_corpus: Union[Sequence[str], Sequence[Sequence[str]]], - total_prediction_char_n_grams: Dict[int, Tensor], - total_prediction_word_n_grams: Dict[int, Tensor], - total_target_char_n_grams: 
Dict[int, Tensor], - total_target_word_n_grams: Dict[int, Tensor], + hypothesis_corpus: Union[str, Sequence[str]], + reference_corpus: Union[Sequence[str], Sequence[Sequence[str]]], + total_hyp_char_n_grams: Dict[int, Tensor], + total_hyp_word_n_grams: Dict[int, Tensor], + total_ref_char_n_grams: Dict[int, Tensor], + total_ref_word_n_grams: Dict[int, Tensor], total_matching_char_n_grams: Dict[int, Tensor], total_matching_word_n_grams: Dict[int, Tensor], n_char_order: int, @@ -435,22 +435,22 @@ def _chrf_score_update( ]: """ Args: - prediction_corpus: - An iterable of prediction corpus. - target_corpus: - An iterable of iterables of target corpus. - total_prediction_char_n_grams: - A dictionary containing a total number of prediction character n-grams. - total_prediction_word_n_grams: - A dictionary containing a total number of prediction word n-grams. - total_target_char_n_grams: - A dictionary containing a total number of target character n-grams. - total_target_word_n_grams: - A dictionary containing a total number of target word n-grams. + hypothesis_corpus: + An iterable of hypothesis corpus. + reference_corpus: + An iterable of iterables of reference corpus. + total_hyp_char_n_grams: + A dictionary containing a total number of hypothesis character n-grams. + total_hyp_word_n_grams: + A dictionary containing a total number of hypothesis word n-grams. + total_ref_char_n_grams: + A dictionary containing a total number of reference character n-grams. + total_ref_word_n_grams: + A dictionary containing a total number of reference word n-grams. total_matching_char_n_grams: - A dictionary containing a total number of matching character n-grams between targets and hypotheses. + A dictionary containing a total number of matching character n-grams between references and hypotheses. total_matching_word_n_grams: - A dictionary containing a total number of total matching word n-grams between targets and hypotheses. + A dictionary containing a total number of total matching word n-grams between references and hypotheses. n_char_order: A character n-gram order. n_word_order: @@ -467,51 +467,51 @@ def _chrf_score_update( A list of sentence-level chrF/chrF++ scores. Return: - total_prediction_char_n_grams: - An updated dictionary containing a total number of prediction character n-grams. - total_prediction_word_n_grams: - An updated dictionary containing a total number of prediction word n-grams. - total_target_char_n_grams: - An updated dictionary containing a total number of target character n-grams. - total_target_word_n_grams: - An updated dictionary containing a total number of target word n-grams. + total_ref_char_n_grams: + An updated dictionary containing a total number of reference character n-grams. + total_ref_word_n_grams: + An updated dictionary containing a total number of reference word n-grams. + total_hyp_char_n_grams: + An updated dictionary containing a total number of hypothesis character n-grams. + total_hyp_word_n_grams: + An updated dictionary containing a total number of hypothesis word n-grams. total_matching_char_n_grams: - An updated dictionary containing a total number of matching character n-grams between targets and + An updated dictionary containing a total number of matching character n-grams between references and hypotheses. total_matching_word_n_grams: - An updated dictionary containing a total number of total matching word n-grams between targets and + An updated dictionary containing a total number of total matching word n-grams between references and hypotheses. 
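Downstream, each pair of matched/total counts is combined with the standard chrF F-beta formula, which is recall-weighted because beta defaults to 2. A small sketch of that formula for a single n-gram order (names are illustrative):

def chrf_fbeta(matching: float, total_hyp: float, total_ref: float, beta: float = 2.0) -> float:
    """F-beta of n-gram precision and recall, as combined per n-gram order in chrF/chrF++."""
    precision = matching / total_hyp if total_hyp > 0 else 0.0
    recall = matching / total_ref if total_ref > 0 else 0.0
    denominator = beta ** 2 * precision + recall
    if denominator == 0:
        return 0.0
    return (1 + beta ** 2) * precision * recall / denominator


# e.g. 18 matching character 3-grams out of 20 hypothesis and 24 reference 3-grams
print(round(chrf_fbeta(18.0, 20.0, 24.0), 4))  # 0.7759, closer to recall (0.75) than to precision (0.9)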
sentence_chrf_score: (Optionally) A list of sentence-level chrF/chrF++ scores. Raises: ValueError: - If length of `target_corpus` and `prediction_corpus` differs. + If length of `reference_corpus` and `hypothesis_corpus` differs. """ - target_corpus, prediction_corpus = _validate_inputs(target_corpus, prediction_corpus) + reference_corpus, hypothesis_corpus = _validate_inputs(reference_corpus, hypothesis_corpus) - for (targets, prediction) in zip(target_corpus, prediction_corpus): + for (hypothesis, references) in zip(hypothesis_corpus, reference_corpus): ( - prediction_char_n_grams_counts, - prediction_word_n_grams_counts, - prediction_char_n_grams, - prediction_word_n_grams, - ) = _get_n_grams_counts_and_total_ngrams(prediction, n_char_order, n_word_order, lowercase, whitespace) - total_prediction_char_n_grams = _sum_over_dicts(total_prediction_char_n_grams, prediction_char_n_grams) - total_prediction_word_n_grams = _sum_over_dicts(total_prediction_word_n_grams, prediction_word_n_grams) + hyp_char_n_grams_counts, + hyp_word_n_grams_counts, + hyp_char_n_grams, + hyp_word_n_grams, + ) = _get_n_grams_counts_and_total_ngrams(hypothesis, n_char_order, n_word_order, lowercase, whitespace) + total_hyp_char_n_grams = _sum_over_dicts(total_hyp_char_n_grams, hyp_char_n_grams) + total_hyp_word_n_grams = _sum_over_dicts(total_hyp_word_n_grams, hyp_word_n_grams) ( sentence_level_f_score, matching_char_n_grams, matching_word_n_grams, - target_char_n_grams, - target_word_n_grams, + ref_char_n_grams, + ref_word_n_grams, ) = _calculate_sentence_level_chrf_score( - targets, # type: ignore - prediction_char_n_grams_counts, - prediction_word_n_grams_counts, - prediction_char_n_grams, - prediction_word_n_grams, + references, # type: ignore + hyp_char_n_grams_counts, + hyp_word_n_grams_counts, + hyp_char_n_grams, + hyp_word_n_grams, n_char_order, n_word_order, n_order, @@ -523,16 +523,16 @@ def _chrf_score_update( if sentence_chrf_score is not None: sentence_chrf_score.append(sentence_level_f_score.unsqueeze(0)) - total_target_char_n_grams = _sum_over_dicts(total_target_char_n_grams, target_char_n_grams) - total_target_word_n_grams = _sum_over_dicts(total_target_word_n_grams, target_word_n_grams) + total_ref_char_n_grams = _sum_over_dicts(total_ref_char_n_grams, ref_char_n_grams) + total_ref_word_n_grams = _sum_over_dicts(total_ref_word_n_grams, ref_word_n_grams) total_matching_char_n_grams = _sum_over_dicts(total_matching_char_n_grams, matching_char_n_grams) total_matching_word_n_grams = _sum_over_dicts(total_matching_word_n_grams, matching_word_n_grams) return ( - total_prediction_char_n_grams, - total_prediction_word_n_grams, - total_target_char_n_grams, - total_target_word_n_grams, + total_hyp_char_n_grams, + total_hyp_word_n_grams, + total_ref_char_n_grams, + total_ref_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, sentence_chrf_score, @@ -540,10 +540,10 @@ def _chrf_score_update( def _chrf_score_compute( - total_prediction_char_n_grams: Dict[int, Tensor], - total_prediction_word_n_grams: Dict[int, Tensor], - total_target_char_n_grams: Dict[int, Tensor], - total_target_word_n_grams: Dict[int, Tensor], + total_hyp_char_n_grams: Dict[int, Tensor], + total_hyp_word_n_grams: Dict[int, Tensor], + total_ref_char_n_grams: Dict[int, Tensor], + total_ref_word_n_grams: Dict[int, Tensor], total_matching_char_n_grams: Dict[int, Tensor], total_matching_word_n_grams: Dict[int, Tensor], n_order: float, @@ -552,18 +552,18 @@ def _chrf_score_compute( """Compute chrF/chrF++ score based on 
pre-computed target, prediction and matching character and word n-grams. Args: - total_prediction_char_n_grams: - A dictionary containing a total number of prediction character n-grams. - total_prediction_word_n_grams: - A dictionary containing a total number of prediction word n-grams. - total_target_char_n_grams: - A dictionary containing a total number of target character n-grams. - total_target_word_n_grams: - A dictionary containing a total number of target word n-grams. + total_hyp_char_n_grams: + A dictionary containing a total number of hypothesis character n-grams. + total_hyp_word_n_grams: + A dictionary containing a total number of hypothesis word n-grams. + total_ref_char_n_grams: + A dictionary containing a total number of reference character n-grams. + total_ref_word_n_grams: + A dictionary containing a total number of reference word n-grams. total_matching_char_n_grams: - A dictionary containing a total number of matching character n-grams between targets and hypotheses. + A dictionary containing a total number of matching character n-grams between references and hypotheses. total_matching_word_n_grams: - A dictionary containing a total number of total matching word n-grams between targets and hypotheses. + A dictionary containing a total number of total matching word n-grams between references and hypotheses. n_order: A sum of charachter and word n-gram order. beta: @@ -575,10 +575,10 @@ def _chrf_score_compute( chrf_f_score = _calculate_fscore( total_matching_char_n_grams, total_matching_word_n_grams, - total_prediction_char_n_grams, - total_prediction_word_n_grams, - total_target_char_n_grams, - total_target_word_n_grams, + total_hyp_char_n_grams, + total_hyp_word_n_grams, + total_ref_char_n_grams, + total_ref_word_n_grams, n_order, beta, ) @@ -586,8 +586,8 @@ def _chrf_score_compute( def chrf_score( - prediction_corpus: Union[str, Sequence[str]], - target_corpus: Union[Sequence[str], Sequence[Sequence[str]]], + hypothesis_corpus: Union[str, Sequence[str]], + reference_corpus: Union[Sequence[str], Sequence[Sequence[str]]], n_char_order: int = 6, n_word_order: int = 2, beta: float = 2.0, @@ -595,16 +595,16 @@ def chrf_score( whitespace: bool = False, return_sentence_level_score: bool = False, ) -> Union[Tensor, Tuple[Tensor, Tensor]]: - """Calculate `chrF score`_ of machine translated text with one or more targets. This implementation supports - both chrF score computation introduced in [1] and chrF++ score introduced in `chrF++ score`_. This + """Calculate `chrF score`_ of machine translated text with one or more references. This implementation + supports both chrF score computation introduced in [1] and chrF++ score introduced in `chrF++ score`_. This implementation follows the implmenetaions from https://github.com/m-popovic/chrF and https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/chrf.py. Args: - prediction_corpus: - An iterable of prediction corpus. - target_corpus: - An iterable of iterables of target corpus. + hypothesis_corpus: + An iterable of hypothesis corpus. + reference_corpus: + An iterable of iterables of reference corpus. n_char_order: A character n-gram order. If `n_char_order=6`, the metrics refers to the official chrF/chrF++. 
n_word_order: @@ -633,9 +633,9 @@ def chrf_score( Example: >>> from torchmetrics.functional import chrf_score - >>> prediction_corpus = ['the cat is on the mat'] - >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] - >>> chrf_score(prediction_corpus, target_corpus) + >>> hypothesis_corpus = ['the cat is on the mat'] + >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> chrf_score(hypothesis_corpus, reference_corpus) tensor(0.8640) References: @@ -652,10 +652,10 @@ def chrf_score( n_order = float(n_char_order + n_word_order) ( - total_prediction_char_n_grams, - total_prediction_word_n_grams, - total_target_char_n_grams, - total_target_word_n_grams, + total_hyp_char_n_grams, + total_hyp_word_n_grams, + total_ref_char_n_grams, + total_ref_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, ) = _prepare_n_grams_dicts(n_char_order, n_word_order) @@ -663,20 +663,20 @@ def chrf_score( sentence_chrf_score: Optional[List[Tensor]] = [] if return_sentence_level_score else None ( - total_prediction_char_n_grams, - total_prediction_word_n_grams, - total_target_char_n_grams, - total_target_word_n_grams, + total_hyp_char_n_grams, + total_hyp_word_n_grams, + total_ref_char_n_grams, + total_ref_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, sentence_chrf_score, ) = _chrf_score_update( - prediction_corpus, - target_corpus, - total_prediction_char_n_grams, - total_prediction_word_n_grams, - total_target_char_n_grams, - total_target_word_n_grams, + hypothesis_corpus, + reference_corpus, + total_hyp_char_n_grams, + total_hyp_word_n_grams, + total_ref_char_n_grams, + total_ref_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, n_char_order, @@ -689,10 +689,10 @@ def chrf_score( ) chrf_f_score = _chrf_score_compute( - total_prediction_char_n_grams, - total_prediction_word_n_grams, - total_target_char_n_grams, - total_target_word_n_grams, + total_hyp_char_n_grams, + total_hyp_word_n_grams, + total_ref_char_n_grams, + total_ref_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, n_order, diff --git a/torchmetrics/functional/text/sacre_bleu.py b/torchmetrics/functional/text/sacre_bleu.py index 00401e4392a..9f2b51d470a 100644 --- a/torchmetrics/functional/text/sacre_bleu.py +++ b/torchmetrics/functional/text/sacre_bleu.py @@ -278,8 +278,8 @@ def _lower(line: str, lowercase: bool) -> str: def sacre_bleu_score( - prediction_corpus: Sequence[str], - target_corpus: Sequence[Sequence[str]], + translate_corpus: Sequence[str], + reference_corpus: Sequence[Sequence[str]], n_gram: int = 4, smooth: bool = False, tokenize: Literal["none", "13a", "zh", "intl", "char"] = "13a", @@ -289,9 +289,9 @@ def sacre_bleu_score( follows the behaviour of SacreBLEU [2] implementation from https://github.com/mjpost/sacrebleu. 
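The resulting scores can be sanity-checked against the upstream sacrebleu package, much as the unit tests do. A rough sketch, assuming sacrebleu is installed and remembering that it reports BLEU on a 0-100 scale; the transpose mirrors the reshaping done in the test helpers.

from sacrebleu import corpus_bleu  # reference implementation the tests compare against
from torchmetrics.functional import sacre_bleu_score

preds = ["the cat is on the mat"]
targets = [["there is a cat on the mat", "a cat is on the mat"]]

tm_score = sacre_bleu_score(preds, targets, tokenize="13a", lowercase=False)
# sacrebleu wants one stream per reference position, hence the transpose of `targets`
ref_streams = [[target[i] for target in targets] for i in range(len(targets[0]))]
sb_score = corpus_bleu(preds, ref_streams, tokenize="13a", lowercase=False).score / 100
print(float(tm_score), sb_score)  # the two values should be very close for this example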
Args: - prediction_corpus: + translate_corpus: An iterable of machine translated corpus - target_corpus: + reference_corpus: An iterable of iterables of reference corpus n_gram: Gram value ranged from 1 to 4 (Default 4) @@ -308,9 +308,9 @@ def sacre_bleu_score( Example: >>> from torchmetrics.functional import sacre_bleu_score - >>> prediction_corpus = ['the cat is on the mat'] - >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] - >>> sacre_bleu_score(prediction_corpus, target_corpus) + >>> translate_corpus = ['the cat is on the mat'] + >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> sacre_bleu_score(translate_corpus, reference_corpus) tensor(0.7598) References: @@ -333,8 +333,8 @@ def sacre_bleu_score( raise ValueError( f"Unsupported tokenizer selected. Please, choose one of {list(_SacreBLEUTokenizer._TOKENIZE_FN.keys())}" ) - if len(prediction_corpus) != len(target_corpus): - raise ValueError(f"Corpus has different size {len(prediction_corpus)} != {len(target_corpus)}") + if len(translate_corpus) != len(reference_corpus): + raise ValueError(f"Corpus has different size {len(translate_corpus)} != {len(reference_corpus)}") if tokenize == "intl" and not _REGEX_AVAILABLE: raise ValueError( "`'intl'` tokenization requires `regex` installed. Use `pip install regex` or `pip install " @@ -343,19 +343,19 @@ def sacre_bleu_score( numerator = torch.zeros(n_gram) denominator = torch.zeros(n_gram) - prediction_len = tensor(0, dtype=torch.float) - target_len = tensor(0, dtype=torch.float) + trans_len = tensor(0, dtype=torch.float) + ref_len = tensor(0, dtype=torch.float) tokenize_fn = partial(_SacreBLEUTokenizer.tokenize, tokenize=tokenize, lowercase=lowercase) - prediction_len, target_len = _bleu_score_update( - prediction_corpus, - target_corpus, + trans_len, ref_len = _bleu_score_update( + translate_corpus, + reference_corpus, numerator, denominator, - prediction_len, - target_len, + trans_len, + ref_len, n_gram, tokenize_fn, ) - return _bleu_score_compute(prediction_len, target_len, numerator, denominator, n_gram, smooth) + return _bleu_score_compute(trans_len, ref_len, numerator, denominator, n_gram, smooth) diff --git a/torchmetrics/functional/text/ter.py b/torchmetrics/functional/text/ter.py index b8ac23ce491..ba217c47225 100644 --- a/torchmetrics/functional/text/ter.py +++ b/torchmetrics/functional/text/ter.py @@ -206,85 +206,84 @@ def _preprocess_sentence(sentence: str, tokenizer: _TercomTokenizer) -> str: return tokenizer(sentence.rstrip()) -def _find_shifted_pairs(prediction_words: List[str], target_words: List[str]) -> Iterator[Tuple[int, int, int]]: +def _find_shifted_pairs(hypothesis_words: List[str], reference_words: List[str]) -> Iterator[Tuple[int, int, int]]: """Find matching word sub-sequences in two lists of words. Ignores sub-sequences starting at the same position. Args: - prediction_words: - A list of a tokenized prediction sentence. - target_words: - A list of a tokenized target sentence. - + hypothesis_words: + A list of a tokenized hypothesis sentence. + reference_words: + A list of a tokenized reference sentence. Return: - Yields tuples of `(prediction_start, target_start, length` such that: - target_words[target_start : target_start + length] ==\ - prediction_words[prediction_start : prediction_start + length] - - prediction_start: - A list of prediction start indices. - target_start: - A list of target start indices. 
+ Yields tuples of `(reference_start, hypothesis_start, length` such that: + reference_words[reference_start : reference_start + length] ==\ + hypothesis_words[hypothesis_start : hypothesis_start + length] + + hypothesis_start: + A list of hypothesis start indices. + reference_start: + A list of reference start indices. length: A length of a word span to be considered. """ - for prediction_start in range(len(prediction_words)): - for target_start in range(len(target_words)): + for hypothesis_start in range(len(hypothesis_words)): + for reference_start in range(len(reference_words)): # this is slightly different from what tercom does but this should # really only kick in in degenerate cases - if abs(target_start - prediction_start) > _MAX_SHIFT_DIST: + if abs(reference_start - hypothesis_start) > _MAX_SHIFT_DIST: continue for length in range(1, _MAX_SHIFT_SIZE): - # Check if prediction and target are equal so far - if prediction_words[prediction_start + length - 1] != target_words[target_start + length - 1]: + # Check if hypothesis and reference are equal so far + if hypothesis_words[hypothesis_start + length - 1] != reference_words[reference_start + length - 1]: break - yield prediction_start, target_start, length + yield hypothesis_start, reference_start, length # Stop processing once a sequence is consumed. - _pred = len(prediction_words) == prediction_start + length - _target = len(target_words) == target_start + length - if _pred or _target: + _hyp = len(hypothesis_words) == hypothesis_start + length + _ref = len(reference_words) == reference_start + length + if _hyp or _ref: break def _handle_corner_cases_during_shifting( alignments: Dict[int, int], - prediction_errors: List[int], - target_errors: List[int], - prediction_start: int, - target_start: int, + hypothesis_errors: List[int], + reference_errors: List[int], + hypothesis_start: int, + reference_start: int, length: int, ) -> bool: """A helper function which returns `True` if any of corner cases has been met. Otherwise, `False` is returned. Args: alignments: - A dictionary mapping aligned positions between a target and a prediction. - prediction_errors: - A list of error positions in a prediction. - target_errors: - A list of error positions in a target. - prediction_start: - A prediction start index. - target_start: - A target start index. + A dictionary mapping aligned positions between a reference and a hypothesis. + hypothesis_errors: + A list of error positions in a hypothesis. + reference_errors: + A list of error positions in a reference. + hypothesis_start: + A hypothesis start index. + reference_start: + A reference start index. length: A length of a word span to be considered. Return: An indication whether any of conrner cases has been met. 
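A stripped-down illustration of the span search sketched above, without Tercom's caps on shift distance and span length, just to make the yielded triples concrete. The function below is hypothetical, not the patched helper.

from typing import Iterator, List, Tuple


def shifted_pairs(hypothesis: List[str], reference: List[str]) -> Iterator[Tuple[int, int, int]]:
    """Yield (hypothesis_start, reference_start, length) for every matching word span."""
    for hyp_start in range(len(hypothesis)):
        for ref_start in range(len(reference)):
            length = 0
            while (hyp_start + length < len(hypothesis) and ref_start + length < len(reference)
                   and hypothesis[hyp_start + length] == reference[ref_start + length]):
                length += 1
                yield hyp_start, ref_start, length


hyp = "the cat sat on mat".split()
ref = "the cat is on the mat".split()
print(sorted(shifted_pairs(hyp, ref)))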
""" - # don't do the shift unless both the prediction was wrong and the - # target doesn't match prediction at the target position - if sum(prediction_errors[prediction_start : prediction_start + length]) == 0: + # don't do the shift unless both the hypothesis was wrong and the + # reference doesn't match hypothesis at the target position + if sum(hypothesis_errors[hypothesis_start : hypothesis_start + length]) == 0: return True - if sum(target_errors[target_start : target_start + length]) == 0: + if sum(reference_errors[reference_start : reference_start + length]) == 0: return True # don't try to shift within the subsequence - if prediction_start <= alignments[target_start] < prediction_start + length: + if hypothesis_start <= alignments[reference_start] < hypothesis_start + length: return True return False @@ -328,55 +327,55 @@ def _shift_word_within_shifted_string(words: List[str], start: int, target: int, def _shift_words( - prediction_words: List[str], - target_words: List[str], + hypothesis_words: List[str], + reference_words: List[str], cached_edit_distance: _LevenshteinEditDistance, checked_candidates: int, ) -> Tuple[int, List[str], int]: - """Attempt to shift words to match a prediction with a target. It returns the lowest number of required edits - between a prediction and a provided target, a list of shifted words and number of checked candidates. + """Attempt to shift words to match a hypothesis with a reference. It returns the lowest number of required + edits between a hypothesis and a provided reference, a list of shifted words and number of checked candidates. Note that the filtering of possible shifts and shift selection are heavily based on somewhat arbitrary heuristics. The code here follows as closely as possible the logic in Tercom, not always justifying the particular design choices. (The paragraph copied from https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/lib_ter.py) Args: - prediction_words: - A list of tokenized prediction sentence. - target_words: - A list of lists of tokenized target sentences. + hypothesis_words: + A list of tokenized hypothesis sentence. + reference_words: + A list of lists of tokenized reference sentences. cached_edit_distance: - A pre-computed edit distance between a prediction and a target. + A pre-computed edit distance between a hypothesis and a reference. checked_candidates: - A number of checked prediction candidates to match a provided target. + A number of checked hypothesis candidates to match a provided reference. Return: best_score: - The best (lowest) number of required edits to match prediction and target sentences. + The best (lowest) number of required edits to match hypothesis and reference sentences. shifted_words: - A list of shifted words in prediction sentences. + A list of shifted words in hypothesis sentences. checked_candidates: - A number of checked prediction candidates to match a provided target. + A number of checked hypothesis candidates to match a provided reference. 
""" - edit_distance, inverted_trace = cached_edit_distance(prediction_words) + edit_distance, inverted_trace = cached_edit_distance(hypothesis_words) trace = _flip_trace(inverted_trace) - alignments, target_errors, prediction_errors = _trace_to_alignment(trace) + alignments, reference_errors, hypothesis_errors = _trace_to_alignment(trace) best: Optional[Tuple[int, int, int, int, List[str]]] = None - for prediction_start, target_start, length in _find_shifted_pairs(prediction_words, target_words): + for hypothesis_start, reference_start, length in _find_shifted_pairs(hypothesis_words, reference_words): if _handle_corner_cases_during_shifting( - alignments, prediction_errors, target_errors, prediction_start, target_start, length + alignments, hypothesis_errors, reference_errors, hypothesis_start, reference_start, length ): continue prev_idx = -1 for offset in range(-1, length): - if target_start + offset == -1: + if reference_start + offset == -1: idx = 0 - elif target_start + offset in alignments: - idx = alignments[target_start + offset] + 1 - # offset is out of bounds => aims past target + elif reference_start + offset in alignments: + idx = alignments[reference_start + offset] + 1 + # offset is out of bounds => aims past reference else: break # Skip idx if already tried @@ -385,13 +384,13 @@ def _shift_words( prev_idx = idx - shifted_words = _perform_shift(prediction_words, prediction_start, length, idx) + shifted_words = _perform_shift(hypothesis_words, hypothesis_start, length, idx) # Elements of the tuple are designed to replicate Tercom ranking of shifts: candidate = ( edit_distance - cached_edit_distance(shifted_words)[0], # highest score first length, # then, longest match first - -prediction_start, # then, earliest match first + -hypothesis_start, # then, earliest match first -idx, # then, earliest target position first shifted_words, ) @@ -405,35 +404,35 @@ def _shift_words( break if not best: - return 0, prediction_words, checked_candidates + return 0, hypothesis_words, checked_candidates best_score, _, _, _, shifted_words = best return best_score, shifted_words, checked_candidates -def _translation_edit_rate(prediction_words: List[str], target_words: List[str]) -> Tensor: - """Compute translation edit rate between target and prediction sentences. +def _translation_edit_rate(hypothesis_words: List[str], reference_words: List[str]) -> Tensor: + """Compute translation edit rate between hypothesis and reference sentences. Args: - prediction_words: - A list of a tokenized prediction sentence. - target_words: - A list of lists of tokenized target sentences. + hypothesis_words: + A list of a tokenized hypothesis sentence. + reference_words: + A list of lists of tokenized reference sentences. Return: - A number of required edits to match prediction and target sentences. + A number of required edits to match hypothesis and reference sentences. 
""" - if len(target_words) == 0: + if len(reference_words) == 0: return tensor(0.0) - cached_edit_distance = _LevenshteinEditDistance(target_words) + cached_edit_distance = _LevenshteinEditDistance(reference_words) num_shifts = 0 checked_candidates = 0 - input_words = prediction_words + input_words = hypothesis_words while True: # do shifts until they stop reducing the edit distance delta, new_input_words, checked_candidates = _shift_words( - input_words, target_words, cached_edit_distance, checked_candidates + input_words, reference_words, cached_edit_distance, checked_candidates ) if checked_candidates >= _MAX_SHIFT_CANDIDATES or delta <= 0: break @@ -446,48 +445,50 @@ def _translation_edit_rate(prediction_words: List[str], target_words: List[str]) return tensor(total_edits) -def _compute_sentence_statistics(prediction_words: List[str], target_words: List[List[str]]) -> Tuple[Tensor, Tensor]: - """Compute sentence TER statistics between prediction and provided targets. +def _compute_sentence_statistics( + hypothesis_words: List[str], references_words: List[List[str]] +) -> Tuple[Tensor, Tensor]: + """Compute sentence TER statistics between hypothesis and provided references. Args: - prediction_words: - A list of tokenized prediction sentence. - target_words: - A list of lists of tokenized target sentences. + hypothesis_words: + A list of tokenized hypothesis sentence. + reference_words: + A list of lists of tokenized reference sentences. Return: best_num_edits: - The best (lowest) number of required edits to match prediction and target sentences. - avg_target_len: - Average length of tokenized target sentences. + The best (lowest) number of required edits to match hypothesis and reference sentences. + avg_ref_len: + Average length of tokenized reference sentences. """ - target_lengths = tensor(0.0) + ref_lengths = tensor(0.0) best_num_edits = tensor(2e16) - for tgt_words in target_words: - num_edits = _translation_edit_rate(prediction_words, tgt_words) - target_lengths += len(tgt_words) + for reference_words in references_words: + num_edits = _translation_edit_rate(reference_words, hypothesis_words) + ref_lengths += len(reference_words) if num_edits < best_num_edits: best_num_edits = num_edits - avg_target_len = target_lengths / len(target_words) - return best_num_edits, avg_target_len + avg_ref_len = ref_lengths / len(references_words) + return best_num_edits, avg_ref_len -def _compute_ter_score_from_statistics(num_edits: Tensor, target_length: Tensor) -> Tensor: - """Compute TER score based on pre-computed a number of edits and an average target length. +def _compute_ter_score_from_statistics(num_edits: Tensor, ref_length: Tensor) -> Tensor: + """Compute TER score based on pre-computed a number of edits and an average reference length. num_edits: - A number of required edits to match prediction and target sentences. - target_length: - An average length of target sentences. + A number of required edits to match hypothesis and reference sentences. + ref_length: + An average length of reference sentences. Return: - A corpus-level TER score or 1 if target_length == 0. + A corpus-level TER score or 1 if reference_length == 0. 
""" - if target_length > 0 and num_edits > 0: - score = num_edits / target_length - elif target_length == 0 and num_edits > 0: + if ref_length > 0 and num_edits > 0: + score = num_edits / ref_length + elif ref_length == 0 and num_edits > 0: score = tensor(1.0) else: score = tensor(0.0) @@ -495,86 +496,86 @@ def _compute_ter_score_from_statistics(num_edits: Tensor, target_length: Tensor) def _ter_update( - prediction_corpus: Union[str, Sequence[str]], - target_corpus: Sequence[Union[str, Sequence[str]]], + hypothesis_corpus: Union[str, Sequence[str]], + reference_corpus: Sequence[Union[str, Sequence[str]]], tokenizer: _TercomTokenizer, total_num_edits: Tensor, - total_target_length: Tensor, + total_ref_length: Tensor, sentence_ter: Optional[List[Tensor]] = None, ) -> Tuple[Tensor, Tensor, Optional[List[Tensor]]]: """Update TER statistics. Args: - prediction_corpus: - An iterable of prediction corpus. - target_corpus: - An iterable of iterables of target corpus. + hypothesis_corpus: + An iterable of hypothesis corpus. + reference_corpus: + An iterable of iterables of reference corpus. tokenizer: total_num_edits: - A total number of required edits to match prediction and target sentences. - total_target_length: - A total average length of target sentences. + A total number of required edits to match hypothesis and reference sentences. + total_ref_length: + A total average length of reference sentences. Return: total_num_edits: - A total number of required edits to match prediction and target sentences. - total_target_length: - A total average length of target sentences. + A total number of required edits to match hypothesis and reference sentences. + total_ref_length: + A total average length of reference sentences. sentence_ter: (Optionally) A list of sentence-level TER. Raises: ValueError: - If length of `target_corpus` and `prediction_corpus` differs. + If length of `reference_corpus` and `hypothesis_corpus` differs. """ - target_corpus, prediction_corpus = _validate_inputs(target_corpus, prediction_corpus) + reference_corpus, hypothesis_corpus = _validate_inputs(reference_corpus, hypothesis_corpus) - for (prediction, targets) in zip(prediction_corpus, target_corpus): - target_words_: List[List[str]] = [ - [word for word in _preprocess_sentence(target, tokenizer).split()] for target in targets + for (hypothesis, references) in zip(hypothesis_corpus, reference_corpus): + references_words_: List[List[str]] = [ + [word for word in _preprocess_sentence(ref, tokenizer).split()] for ref in references ] - prediction_words_: List[str] = [word for word in _preprocess_sentence(prediction, tokenizer).split()] - num_edits, target_length = _compute_sentence_statistics(prediction_words_, target_words_) + hypothesis_words_: List[str] = [word for word in _preprocess_sentence(hypothesis, tokenizer).split()] + num_edits, ref_length = _compute_sentence_statistics(hypothesis_words_, references_words_) total_num_edits += num_edits - total_target_length += target_length + total_ref_length += ref_length if sentence_ter is not None: - sentence_ter.append(_compute_ter_score_from_statistics(num_edits, target_length).unsqueeze(0)) - return total_num_edits, total_target_length, sentence_ter + sentence_ter.append(_compute_ter_score_from_statistics(num_edits, ref_length).unsqueeze(0)) + return total_num_edits, total_ref_length, sentence_ter -def _ter_compute(total_num_edits: Tensor, total_target_length: Tensor) -> Tensor: - """Compute TER based on pre-computed a total number of edits and a total average target length. 
+def _ter_compute(total_num_edits: Tensor, total_ref_length: Tensor) -> Tensor: + """Compute TER based on pre-computed a total number of edits and a total average reference length. Args: total_num_edits: - A total number of required edits to match prediction and target sentences. - total_target_length: - A total average length of target sentences. + A total number of required edits to match hypothesis and reference sentences. + total_ref_length: + A total average length of reference sentences. Return: A corpus-level TER score. """ - return _compute_ter_score_from_statistics(total_num_edits, total_target_length) + return _compute_ter_score_from_statistics(total_num_edits, total_ref_length) def ter( - prediction_corpus: Union[str, Sequence[str]], - target_corpus: Sequence[Union[str, Sequence[str]]], + hypothesis_corpus: Union[str, Sequence[str]], + reference_corpus: Sequence[Union[str, Sequence[str]]], normalize: bool = False, no_punctuation: bool = False, lowercase: bool = True, asian_support: bool = False, return_sentence_level_score: bool = False, ) -> Union[Tensor, Tuple[Tensor, List[Tensor]]]: - """Calculate Translation edit rate (`TER`_) of machine translated text with one or more targets. This + """Calculate Translation edit rate (`TER`_) of machine translated text with one or more references. This implementation follows the implmenetaions from https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/ter.py. The `sacrebleu` implmenetation is a near-exact reimplementation of the Tercom algorithm, produces identical results on all "sane" outputs. Args: - prediction_corpus: - An iterable of prediction corpus. - target_corpus: - An iterable of iterables of target corpus. + hypothesis_corpus: + An iterable of hypothesis corpus. + reference_corpus: + An iterable of iterables of reference corpus. normalize: An indication whether a general tokenization to be applied. no_punctuation: @@ -591,9 +592,9 @@ def ter( (Optionally) A list of sentence-level translation_edit_rate (TER) if `return_sentence_level_score=True`. Example: - >>> prediction_corpus = ['the cat is on the mat'] - >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] - >>> ter(prediction_corpus, target_corpus) + >>> hypothesis_corpus = ['the cat is on the mat'] + >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> ter(hypothesis_corpus, reference_corpus) tensor(0.1538) References: @@ -612,13 +613,13 @@ def ter( tokenizer: _TercomTokenizer = _TercomTokenizer(normalize, no_punctuation, lowercase, asian_support) total_num_edits = tensor(0.0) - total_target_length = tensor(0.0) + total_ref_length = tensor(0.0) sentence_ter: Optional[List[Tensor]] = [] if return_sentence_level_score else None - total_num_edits, total_target_length, sentence_ter = _ter_update( - prediction_corpus, target_corpus, tokenizer, total_num_edits, total_target_length, sentence_ter + total_num_edits, total_ref_length, sentence_ter = _ter_update( + hypothesis_corpus, reference_corpus, tokenizer, total_num_edits, total_ref_length, sentence_ter ) - ter_score = _ter_compute(total_num_edits, total_target_length) + ter_score = _ter_compute(total_num_edits, total_ref_length) if sentence_ter: return ter_score, sentence_ter diff --git a/torchmetrics/text/bleu.py b/torchmetrics/text/bleu.py index 9c1d82a624d..6731e131655 100644 --- a/torchmetrics/text/bleu.py +++ b/torchmetrics/text/bleu.py @@ -46,10 +46,10 @@ class BLEUScore(Metric): will be used to perform the allgather. 
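Because the length, numerator and denominator tensors registered by this class are summed across updates (and across processes), the metric can be fed batch by batch before a single compute. A usage sketch with made-up batches, following the predictions-first argument order introduced in v0.7:

from torchmetrics import BLEUScore

metric = BLEUScore(n_gram=4, smooth=False)

# contents are illustrative only
batches = [
    (["the cat is on the mat"], [["there is a cat on the mat", "a cat is on the mat"]]),
    (["a dog sleeps outside"], [["the dog is sleeping outside", "a dog sleeps outside"]]),
]

for predictions, references in batches:
    metric.update(predictions, references)  # accumulates clipped counts and lengths, no score yet

print(metric.compute())  # corpus-level BLEU over everything seen so far
metric.reset()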
Example: - >>> prediction_corpus = ['the cat is on the mat'] - >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> translate_corpus = ['the cat is on the mat'] + >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] >>> metric = BLEUScore() - >>> metric(prediction_corpus, target_corpus) + >>> metric(translate_corpus, reference_corpus) tensor(0.7598) References: @@ -62,8 +62,8 @@ class BLEUScore(Metric): is_differentiable = False higher_is_better = True - prediction_len: Tensor - target_len: Tensor + trans_len: Tensor + ref_len: Tensor numerator: Tensor denominator: Tensor @@ -89,26 +89,28 @@ def __init__( self.n_gram = n_gram self.smooth = smooth - self.add_state("prediction_len", tensor(0, dtype=torch.float), dist_reduce_fx="sum") - self.add_state("target_len", tensor(0, dtype=torch.float), dist_reduce_fx="sum") + self.add_state("trans_len", tensor(0, dtype=torch.float), dist_reduce_fx="sum") + self.add_state("ref_len", tensor(0, dtype=torch.float), dist_reduce_fx="sum") self.add_state("numerator", torch.zeros(self.n_gram), dist_reduce_fx="sum") self.add_state("denominator", torch.zeros(self.n_gram), dist_reduce_fx="sum") - def update(self, prediction_corpus: Sequence[str], target_corpus: Sequence[Sequence[str]]) -> None: # type: ignore + def update( # type: ignore + self, translate_corpus: Sequence[str], reference_corpus: Sequence[Sequence[str]] + ) -> None: """Compute Precision Scores. Args: - prediction_corpus: An iterable of machine translated corpus - target_corpus: An iterable of iterables of reference corpus + translate_corpus: An iterable of machine translated corpus + reference_corpus: An iterable of iterables of reference corpus """ - self.prediction_len, self.target_len = _bleu_score_update( - prediction_corpus, - target_corpus, + self.trans_len, self.ref_len = _bleu_score_update( + translate_corpus, + reference_corpus, self.numerator, self.denominator, - self.prediction_len, - self.target_len, + self.trans_len, + self.ref_len, self.n_gram, _tokenize_fn, ) @@ -120,5 +122,5 @@ def compute(self) -> Tensor: Tensor with BLEU Score """ return _bleu_score_compute( - self.prediction_len, self.target_len, self.numerator, self.denominator, self.n_gram, self.smooth + self.trans_len, self.ref_len, self.numerator, self.denominator, self.n_gram, self.smooth ) diff --git a/torchmetrics/text/chrf.py b/torchmetrics/text/chrf.py index de302fe4eee..1059f13cf37 100644 --- a/torchmetrics/text/chrf.py +++ b/torchmetrics/text/chrf.py @@ -27,13 +27,13 @@ from torchmetrics.functional.text.chrf import _chrf_score_compute, _chrf_score_update, _prepare_n_grams_dicts _N_GRAM_LEVELS = ("char", "word") -_TEXT_LEVELS = ("target", "prediction", "matching") +_TEXT_LEVELS = ("ref", "hyp", "matching") _DICT_STATES_NAMES = ( - "total_prediction_char_n_grams", - "total_prediction_word_n_grams", - "total_target_char_n_grams", - "total_target_word_n_grams", + "total_hyp_char_n_grams", + "total_hyp_word_n_grams", + "total_ref_char_n_grams", + "total_ref_word_n_grams", "total_matching_char_n_grams", "total_matching_word_n_grams", ) @@ -83,10 +83,10 @@ class CHRFScore(Metric): If ``beta`` is smaller than 0. 
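With ``return_sentence_level_score=True`` the class keeps a concatenable list state next to the corpus totals, so compute is expected to return both values; a usage sketch under that assumption:

from torchmetrics import CHRFScore

metric = CHRFScore(n_word_order=2, return_sentence_level_score=True)  # word order > 0 gives chrF++
metric.update(["the cat is on the mat"], [["there is a cat on the mat", "a cat is on the mat"]])
corpus_chrf, sentence_chrf = metric.compute()
print(corpus_chrf, sentence_chrf)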
Example: - >>> prediction_corpus = ['the cat is on the mat'] - >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> hypothesis_corpus = ['the cat is on the mat'] + >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] >>> metric = CHRFScore() - >>> metric(prediction_corpus, target_corpus) + >>> metric(hypothesis_corpus, reference_corpus) tensor(0.8640) References: @@ -142,18 +142,20 @@ def __init__( if self.return_sentence_level_score: self.add_state("sentence_chrf_score", [], dist_reduce_fx="cat") - def update(self, prediction_corpus: Sequence[str], target_corpus: Sequence[Sequence[str]]) -> None: # type: ignore + def update( # type: ignore + self, hypothesis_corpus: Sequence[str], reference_corpus: Sequence[Sequence[str]] + ) -> None: """Compute Precision Scores. Args: - prediction_corpus: - An iterable of prediction corpus. - target_corpus: - An iterable of iterables of target corpus. + hypothesis_corpus: + An iterable of hypothesis corpus. + reference_corpus: + An iterable of iterables of reference corpus. """ n_grams_dicts_tuple = _chrf_score_update( - prediction_corpus, - target_corpus, + hypothesis_corpus, + reference_corpus, *self._convert_states_to_dicts(), self.n_char_order, self.n_word_order, @@ -220,5 +222,5 @@ def _get_state_name(text: str, n_gram_level: str, n: int) -> str: return f"total_{text}_{n_gram_level}_{n}_grams" def _get_text_n_gram_iterator(self) -> Iterator[Tuple[Tuple[str, int], str]]: - """Get iterator over char/word and target/prediction/matching n-gram level.""" + """Get iterator over char/word and reference/hypothesis/matching n-gram level.""" return itertools.product(zip(_N_GRAM_LEVELS, [self.n_char_order, self.n_word_order]), _TEXT_LEVELS) diff --git a/torchmetrics/text/sacre_bleu.py b/torchmetrics/text/sacre_bleu.py index b577dc2f19b..97ae380a4a1 100644 --- a/torchmetrics/text/sacre_bleu.py +++ b/torchmetrics/text/sacre_bleu.py @@ -66,10 +66,10 @@ class SacreBLEUScore(BLEUScore): Example: - >>> prediction_corpus = ['the cat is on the mat'] - >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> translate_corpus = ['the cat is on the mat'] + >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] >>> metric = SacreBLEUScore() - >>> metric(prediction_corpus, target_corpus) + >>> metric(translate_corpus, reference_corpus) tensor(0.7598) References: @@ -115,20 +115,22 @@ def __init__( ) self.tokenizer = _SacreBLEUTokenizer(tokenize, lowercase) - def update(self, prediction_corpus: Sequence[str], target_corpus: Sequence[Sequence[str]]) -> None: # type: ignore + def update( # type: ignore + self, translate_corpus: Sequence[str], reference_corpus: Sequence[Sequence[str]] + ) -> None: """Compute Precision Scores. 
Args: - prediction_corpus: An iterable of machine translated corpus - target_corpus: An iterable of iterables of reference corpus + translate_corpus: An iterable of machine translated corpus + reference_corpus: An iterable of iterables of reference corpus """ - self.prediction_len, self.target_len = _bleu_score_update( - prediction_corpus, - target_corpus, + self.trans_len, self.ref_len = _bleu_score_update( + translate_corpus, + reference_corpus, self.numerator, self.denominator, - self.prediction_len, - self.target_len, + self.trans_len, + self.ref_len, self.n_gram, self.tokenizer, ) diff --git a/torchmetrics/text/ter.py b/torchmetrics/text/ter.py index f9ce0f2c80c..662afeccc25 100644 --- a/torchmetrics/text/ter.py +++ b/torchmetrics/text/ter.py @@ -50,10 +50,10 @@ class TER(Metric): will be used to perform the allgather Example: - >>> prediction_corpus = ['the cat is on the mat'] - >>> target_corpus = [['there is a cat on the mat', 'a cat is on the mat']] + >>> hypothesis_corpus = ['the cat is on the mat'] + >>> reference_corpus = [['there is a cat on the mat', 'a cat is on the mat']] >>> metric = TER() - >>> metric(prediction_corpus, target_corpus) + >>> metric(hypothesis_corpus, reference_corpus) tensor(0.1538) References: @@ -64,7 +64,7 @@ class TER(Metric): is_differentiable = False higher_is_better = False total_num_edits: Tensor - total_target_len: Tensor + total_ref_len: Tensor sentence_ter: Optional[List[Tensor]] = None def __init__( @@ -98,29 +98,29 @@ def __init__( self.return_sentence_level_score = return_sentence_level_score self.add_state("total_num_edits", tensor(0.0), dist_reduce_fx="sum") - self.add_state("total_target_len", tensor(0.0), dist_reduce_fx="sum") + self.add_state("total_ref_len", tensor(0.0), dist_reduce_fx="sum") if self.return_sentence_level_score: self.add_state("sentence_ter", [], dist_reduce_fx="cat") def update( # type: ignore self, - prediction_corpus: Union[str, Sequence[str]], - target_corpus: Sequence[Union[str, Sequence[str]]], + hypothesis_corpus: Union[str, Sequence[str]], + reference_corpus: Sequence[Union[str, Sequence[str]]], ) -> None: """Update TER statistics. Args: - prediction_corpus: - An iterable of prediction corpus. - target_corpus: - An iterable of iterables of target corpus. + hypothesis_corpus: + An iterable of hypothesis corpus. + reference_corpus: + An iterable of iterables of reference corpus. """ - self.total_num_edits, self.total_target_len, self.sentence_ter = _ter_update( - prediction_corpus, - target_corpus, + self.total_num_edits, self.total_ref_len, self.sentence_ter = _ter_update( + hypothesis_corpus, + reference_corpus, self.tokenizer, self.total_num_edits, - self.total_target_len, + self.total_ref_len, self.sentence_ter, ) @@ -131,7 +131,7 @@ def compute(self) -> Union[Tensor, Tuple[Tensor, Tensor]]: A corpus-level translation edit rate (TER). (Optionally) A list of sentence-level translation_edit_rate (TER) if `return_sentence_level_score=True`. 
""" - ter = _ter_compute(self.total_num_edits, self.total_target_len) + ter = _ter_compute(self.total_num_edits, self.total_ref_len) if self.sentence_ter is not None: return ter, torch.cat(self.sentence_ter) return ter From 2f5e461d784b8ecb711679b3147d2f24eac52e8d Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 3 Jan 2022 15:54:31 +0100 Subject: [PATCH 8/8] Apply suggestions from code review --- torchmetrics/functional/text/bleu.py | 4 ++-- torchmetrics/functional/text/sacre_bleu.py | 4 ++-- torchmetrics/text/bleu.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/torchmetrics/functional/text/bleu.py b/torchmetrics/functional/text/bleu.py index 94df9495508..19176f229fd 100644 --- a/torchmetrics/functional/text/bleu.py +++ b/torchmetrics/functional/text/bleu.py @@ -181,8 +181,8 @@ def bleu_score( and Skip-Bigram Statistics by Chin-Yew Lin and Franz Josef Och `Machine Translation Evolution`_ """ warnings.warn( - "Input order of targets and preds were changed to predictions firsts and targets \ - second in v0.7. Warning will be removed in v0.8" + "Input order of targets and preds were changed to predictions firsts and targets second in v0.7." + " Warning will be removed in v0.8." ) translate_corpus_ = [translate_corpus] if isinstance(translate_corpus, str) else translate_corpus reference_corpus_ = [ diff --git a/torchmetrics/functional/text/sacre_bleu.py b/torchmetrics/functional/text/sacre_bleu.py index 9f2b51d470a..835607579a0 100644 --- a/torchmetrics/functional/text/sacre_bleu.py +++ b/torchmetrics/functional/text/sacre_bleu.py @@ -323,8 +323,8 @@ def sacre_bleu_score( and Skip-Bigram Statistics by Chin-Yew Lin and Franz Josef Och `Machine Translation Evolution`_ """ warnings.warn( - "Input order of targets and preds were changed to predictions firsts and targets \ - second in v0.7. Warning will be removed in v0.8" + "Input order of targets and preds were changed to predictions firsts and targets second in v0.7." + " Warning will be removed in v0.8." ) if tokenize not in AVAILABLE_TOKENIZERS: raise ValueError(f"Argument `tokenize` expected to be one of {AVAILABLE_TOKENIZERS} but got {tokenize}.") diff --git a/torchmetrics/text/bleu.py b/torchmetrics/text/bleu.py index 6731e131655..10c1f5ab5a1 100644 --- a/torchmetrics/text/bleu.py +++ b/torchmetrics/text/bleu.py @@ -83,8 +83,8 @@ def __init__( dist_sync_fn=dist_sync_fn, ) warnings.warn( - "Input order of targets and preds were changed to predictions firsts and targets \ - second in v0.7. Warning will be removed in v0.8" + "Input order of targets and preds were changed to predictions firsts and targets second in v0.7." + " Warning will be removed in v0.8." ) self.n_gram = n_gram self.smooth = smooth