
Adding WER metric #383

Merged: 63 commits, Jul 24, 2021
Changes from 19 commits

Commits
f26a1bb
Update functional.rst
gagan3012 Jul 16, 2021
8c63e5b
Update modules.rst
gagan3012 Jul 16, 2021
9bc95ad
Update test.txt
gagan3012 Jul 16, 2021
de17c62
Create text.txt
gagan3012 Jul 16, 2021
7ee4abf
Update setup.py
gagan3012 Jul 16, 2021
90d7791
Update __init__.py
gagan3012 Jul 16, 2021
cbb9e6b
Update __init__.py
gagan3012 Jul 16, 2021
843326a
Update functional.rst
gagan3012 Jul 16, 2021
546b796
Create __init__.py
gagan3012 Jul 16, 2021
f4e7296
Create wer.py
gagan3012 Jul 16, 2021
c8fa0ee
Create __init__.py
gagan3012 Jul 16, 2021
3b07ff8
Create wer.py
gagan3012 Jul 16, 2021
c740905
Merge branch 'feature/wer' of https://github.com/gagan3012/metrics in…
gagan3012 Jul 16, 2021
05ef82e
Create text_wer.py
gagan3012 Jul 16, 2021
f0b4e80
Create __init__.py
gagan3012 Jul 16, 2021
93ad65e
Create test_wer.py
gagan3012 Jul 16, 2021
f5dc82c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 16, 2021
1ee8656
Update wer.py
gagan3012 Jul 16, 2021
fab3778
Merge branch 'master' into feature/wer
SkafteNicki Jul 17, 2021
2697b16
Requested changes
gagan3012 Jul 18, 2021
b697a89
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 18, 2021
da2ad70
Update wer.py
gagan3012 Jul 18, 2021
67fcd0b
Merge branch 'feature/wer' of https://github.com/gagan3012/metrics in…
gagan3012 Jul 18, 2021
05d7fac
Update test_wer.py
gagan3012 Jul 18, 2021
05f225b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 18, 2021
001c27f
Update wer.py
gagan3012 Jul 19, 2021
4da4400
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 19, 2021
04e8ca7
Update torchmetrics/functional/text/wer.py
gagan3012 Jul 19, 2021
79c8988
Update torchmetrics/text/wer.py
gagan3012 Jul 19, 2021
770528b
Fixes
gagan3012 Jul 19, 2021
04b9df9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 19, 2021
edd14a7
test updates
gagan3012 Jul 19, 2021
acdc481
Better explanation for tests, skip if JIWER not available
Jul 20, 2021
5cc099e
Merge remote-tracking branch 'upstream/master' into feature/wer
Jul 20, 2021
727dd12
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 20, 2021
5bd7b74
Combine functionality to rely on functional wer
Jul 21, 2021
7977731
Cleanup docs
Jul 21, 2021
09d96c0
Add indent
Jul 21, 2021
f9bf667
Add doc
Jul 21, 2021
484c51c
Add spaces
Jul 21, 2021
93fa477
Add space
Jul 21, 2021
c848b15
Apply suggestions from code review
Borda Jul 22, 2021
7e79377
Update torchmetrics/__init__.py
gagan3012 Jul 22, 2021
69fcb0c
Update torchmetrics/functional/text/wer.py
gagan3012 Jul 22, 2021
002b177
Update torchmetrics/functional/text/wer.py
gagan3012 Jul 22, 2021
317e3e8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 22, 2021
4dabca7
Adding extra arguments to metrics
gagan3012 Jul 22, 2021
198d8e4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 22, 2021
ca8809e
Update torchmetrics/functional/text/wer.py
Jul 23, 2021
5a54a7d
Cleanups
Jul 23, 2021
1c92d5b
Update torchmetrics/text/wer.py
gagan3012 Jul 23, 2021
e04b076
Update torchmetrics/functional/text/wer.py
gagan3012 Jul 23, 2021
9f41a49
Update torchmetrics/text/wer.py
gagan3012 Jul 23, 2021
603b105
Update torchmetrics/text/wer.py
gagan3012 Jul 23, 2021
a2e9a15
Update CHANGELOG.md
gagan3012 Jul 23, 2021
05d2d52
Merge branch 'master' into feature/wer
mergify[bot] Jul 24, 2021
8abdbb5
Apply suggestions from code review
Borda Jul 24, 2021
a085347
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 24, 2021
6ebb283
flake8
Borda Jul 24, 2021
9244fd2
Merge branch 'master' into feature/wer
mergify[bot] Jul 24, 2021
5446b2f
Merge branch 'master' into feature/wer
mergify[bot] Jul 24, 2021
0df5cb6
Merge branch 'master' into feature/wer
mergify[bot] Jul 24, 2021
7e22476
Merge branch 'master' into feature/wer
mergify[bot] Jul 24, 2021
11 changes: 11 additions & 0 deletions docs/source/references/functional.rst
@@ -301,6 +301,17 @@ bleu_score [func]
.. autofunction:: torchmetrics.functional.bleu_score
:noindex:

****
Text
****

wer [func]
~~~~~~~~~~

.. autofunction:: torchmetrics.functional.wer
:noindex:


********
Pairwise
********
9 changes: 9 additions & 0 deletions docs/source/references/modules.rst
@@ -507,6 +507,15 @@ RetrievalNormalizedDCG
.. autoclass:: torchmetrics.RetrievalNormalizedDCG
:noindex:

************
Text Metrics
************

WER
~~~

.. autoclass:: torchmetrics.WER
:noindex:

********
Wrappers
1 change: 1 addition & 0 deletions requirements/test.txt
@@ -19,6 +19,7 @@ nltk>=3.6

# add extra requirements
-r image.txt
-r text.txt

# audio
pypesq
1 change: 1 addition & 0 deletions requirements/text.txt
@@ -0,0 +1 @@
jiwer
1 change: 1 addition & 0 deletions setup.py
@@ -27,6 +27,7 @@ def _load_py_module(fname, pkg="torchmetrics"):
def _prepare_extras():
    extras = {
        'image': setup_tools._load_requirements(path_dir=_PATH_REQUIRE, file_name='image.txt'),
        'text': setup_tools._load_requirements(path_dir=_PATH_REQUIRE, file_name='text.txt'),
    }
    return extras

11 changes: 11 additions & 0 deletions tests/functional/text_wer.py
@@ -0,0 +1,11 @@
import pytest

from torchmetrics.functional.text.wer import wer


@pytest.mark.parametrize(
    "hyp,ref,score",
    [("hello world", "hello world", 0.0), ("hello world", "Firwww", 1.0)],
)
def test_wer_same(hyp, ref, score):
    assert wer(ref, hyp) == score
Empty file added tests/text/__init__.py
Empty file.
13 changes: 13 additions & 0 deletions tests/text/test_wer.py
@@ -0,0 +1,13 @@
import pytest

from torchmetrics.text.wer import WER


@pytest.mark.parametrize(
    "hyp,ref,score",
    [("hello world", "hello world", 0.0), ("hello world", "Firwww", 1.0)],
)
def test_wer_same(hyp, ref, score):
    metric = WER()
    metric.update(hyp, ref)
    assert metric.compute() == score
1 change: 1 addition & 0 deletions torchmetrics/__init__.py
@@ -60,4 +60,5 @@
RetrievalPrecision,
RetrievalRecall,
)
from torchmetrics.text import WER # noqa: F401 E402
from torchmetrics.wrappers import BootStrapper # noqa: F401 E402
1 change: 1 addition & 0 deletions torchmetrics/functional/__init__.py
@@ -58,3 +58,4 @@
from torchmetrics.functional.retrieval.recall import retrieval_recall # noqa: F401
from torchmetrics.functional.retrieval.reciprocal_rank import retrieval_reciprocal_rank # noqa: F401
from torchmetrics.functional.self_supervised import embedding_similarity # noqa: F401
from torchmetrics.functional.text.wer import wer # noqa: F401
1 change: 1 addition & 0 deletions torchmetrics/functional/text/__init__.py
@@ -0,0 +1 @@
from torchmetrics.functional.text.wer import wer # noqa: F401
29 changes: 29 additions & 0 deletions torchmetrics/functional/text/wer.py
@@ -0,0 +1,29 @@
from typing import Any

from jiwer import compute_measures


def wer(target: Any, preds: Any, concatenate_texts: bool = False) -> float:
    """Compute the word error rate (WER) of transcribed segments against references.

    Args:
        target: List of reference transcripts, one per speech input.
        preds: List of hypothesis transcripts to score.
        concatenate_texts: Whether to concatenate all input texts instead of computing WER iteratively.

    Returns:
        (float): the word error rate

    Examples:
        >>> predictions = ["this is the prediction", "there is an other sample"]
        >>> references = ["this is the reference", "there is another one"]
        >>> wer_score = wer(preds=predictions, target=references)
        >>> print(wer_score)
        0.5
    """
    if concatenate_texts:
        return compute_measures(target, preds)["wer"]
    incorrect = 0
    total = 0
    for prediction, reference in zip(preds, target):
        measures = compute_measures(reference, prediction)
        incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"]
        total += measures["substitutions"] + measures["deletions"] + measures["hits"]
    return incorrect / total
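To make the arithmetic in the loop above concrete: the minimum word-level edit distance between reference and hypothesis equals S + D + I, and dividing by the reference length N = S + D + C yields the WER. Below is a dependency-free sketch of per-sentence WER; the `word_error_rate` helper is illustrative only and is not part of this PR or of jiwer:

```python
# Minimal word-level WER sketch: the Levenshtein distance over words counts
# substitutions (S), deletions (D) and insertions (I); dividing by the
# reference length N gives WER. NOT the jiwer implementation.

def word_error_rate(reference: str, hypothesis: str) -> float:
    ref = reference.split()
    hyp = hypothesis.split()
    # dp[i][j] = minimum edit distance between ref[:i] and hyp[:j]
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dp[i][0] = i  # delete every reference word
    for j in range(len(hyp) + 1):
        dp[0][j] = j  # insert every hypothesis word
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dp[i][j] = min(
                dp[i - 1][j] + 1,        # deletion
                dp[i][j - 1] + 1,        # insertion
                dp[i - 1][j - 1] + cost,  # substitution or hit
            )
    return dp[len(ref)][len(hyp)] / len(ref)

print(word_error_rate("this is the reference", "this is the prediction"))  # 0.25
```

The same pairs used in the tests above score 0.0 for identical strings and 1.0 when every reference word is wrong, matching the parametrized cases.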
1 change: 1 addition & 0 deletions torchmetrics/text/__init__.py
@@ -0,0 +1 @@
from torchmetrics.text.wer import WER # noqa: F401
55 changes: 55 additions & 0 deletions torchmetrics/text/wer.py
@@ -0,0 +1,55 @@
from typing import Any

from jiwer import compute_measures

from torchmetrics.metric import Metric


class WER(Metric):
    """
    Word error rate (WER) is a common metric of the performance of an automatic speech recognition system.
    WER's output is always a number between 0 and 1.
    This value indicates the percentage of words that were incorrectly predicted.
    The lower the value, the better the performance of the ASR system, with a WER of 0 being a perfect score.
    Word error rate can then be computed as:

        WER = (S + D + I) / N = (S + D + I) / (S + D + C)

    where:
        S is the number of substitutions,
        D is the number of deletions,
        I is the number of insertions,
        C is the number of correct words,
        N is the number of words in the reference (N = S + D + C).

    Compute WER score of transcribed segments against references.

    Args:
        concatenate_texts: Whether to concatenate all input texts instead of computing WER iteratively.

    Returns:
        (float): the word error rate

    Examples:
        >>> predictions = ["this is the prediction", "there is an other sample"]
        >>> references = ["this is the reference", "there is another one"]
        >>> metric = WER()
        >>> metric.update(preds=predictions, target=references)
        >>> wer_score = metric.compute()
        >>> print(wer_score)
        0.5
    """

    def __init__(self, concatenate_texts: bool = False):
        super().__init__()
        self.concatenate_texts = concatenate_texts
        self.add_state("preds", default=[], dist_reduce_fx=None)
        self.add_state("target", default=[], dist_reduce_fx=None)

    def update(self, preds: Any, target: Any) -> None:
        self.preds.append(preds)
        self.target.append(target)

    def compute(self) -> float:
        if self.concatenate_texts:
            return compute_measures(self.target, self.preds)["wer"]
        incorrect = 0
        total = 0
        for prediction, reference in zip(self.preds, self.target):
            measures = compute_measures(reference, prediction)
            incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"]
            total += measures["substitutions"] + measures["deletions"] + measures["hits"]
        return incorrect / total
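The accumulate-then-compute pattern of the `WER` module above can be illustrated with a dependency-free stand-in: errors and reference-word counts are accumulated across `update` calls, and the ratio is taken once in `compute`. The names `SimpleWER` and `_edit_distance` are hypothetical, and jiwer's alignment is replaced here by a plain word-level edit distance; this is a sketch, not the torchmetrics implementation:

```python
# Dependency-free sketch of the update()/compute() accumulation pattern
# used by the WER module above. jiwer's measures are replaced by a
# rolling-array word-level Levenshtein distance (= S + D + I).

def _edit_distance(ref: list, hyp: list) -> int:
    dp = list(range(len(hyp) + 1))  # dp[j] = distance for the current row
    for i in range(1, len(ref) + 1):
        prev, dp[0] = dp[0], i  # prev holds the diagonal dp[i-1][j-1]
        for j in range(1, len(hyp) + 1):
            cur = dp[j]
            dp[j] = min(
                dp[j] + 1,      # deletion
                dp[j - 1] + 1,  # insertion
                prev + (ref[i - 1] != hyp[j - 1]),  # substitution / hit
            )
            prev = cur
    return dp[len(hyp)]


class SimpleWER:
    def __init__(self):
        self.errors = 0  # accumulated S + D + I
        self.total = 0   # accumulated reference words N

    def update(self, preds, target):
        for hyp, ref in zip(preds, target):
            ref_words, hyp_words = ref.split(), hyp.split()
            self.errors += _edit_distance(ref_words, hyp_words)
            self.total += len(ref_words)

    def compute(self) -> float:
        return self.errors / self.total


metric = SimpleWER()
metric.update(["this is the prediction", "there is an other sample"],
              ["this is the reference", "there is another one"])
print(metric.compute())  # 0.5
```

Accumulating raw counts rather than per-batch ratios is what makes the metric composable: two updates followed by one `compute` give the same result as one combined update, which is the property the `Metric` base class relies on for distributed synchronization.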