ArneBinder · ArneBinder · Sep 27, 2023 · Sep 27, 2023 · Sep 27, 2023 · Sep 27, 2023
diff --git a/src/pytorch_ie/metrics/statistics.py b/src/pytorch_ie/metrics/statistics.py
@@ -4,8 +4,11 @@
 
 from transformers import AutoTokenizer, PreTrainedTokenizer
 
+from pytorch_ie import tokenize_document
+from pytorch_ie.annotations import Span
 from pytorch_ie.core import Document, DocumentStatistic
-from pytorch_ie.documents import TextBasedDocument
+from pytorch_ie.documents import TextBasedDocument, TokenBasedDocument
+from pytorch_ie.utils.hydra import resolve_optional_document_type
 
 logger = logging.getLogger(__name__)
 
@@ -72,24 +75,113 @@ def _collect(self, doc: Document) -> List[int]:
         return lengths
 
 
-class LabeledSpanLengthCollector(DocumentStatistic):
-    """Collects the length of spans in a field per label, e.g. to collect the length of entities per type.
+class SpanLengthCollector(DocumentStatistic):
+    """Collects the lengths of Span annotations. If labels are provided, the lengths collected per
+    label.
 
-    The field should be a list of elements with a label, a start and end attribute.
+    If a tokenizer is provided, the span length is calculated in means of tokens, otherwise in
+    means of characters.
     """
 
-    DEFAULT_AGGREGATION_FUNCTIONS = ["mean", "std", "min", "max", "len"]
+    DEFAULT_AGGREGATION_FUNCTIONS = ["len", "mean", "std", "min", "max"]
 
-    def __init__(self, field: str, **kwargs):
+    def __init__(
+        self,
+        layer: str,
+        tokenize: bool = False,
+        tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
+        tokenized_document_type: Optional[Union[str, Type[TokenBasedDocument]]] = None,
+        labels: Optional[Union[List[str], str]] = None,
+        label_attribute: str = "label",
+        tokenize_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ):
         super().__init__(**kwargs)
-        self.field = field
+        self.layer = layer
+        if isinstance(labels, str) and labels != "INFERRED":
+            raise ValueError("labels must be a list of strings or 'INFERRED'")
+        if labels == "INFERRED":
+            logger.warning(
+                f"Inferring labels with {self.__class__.__name__} from data produces wrong results "
+                f"for certain aggregation functions (e.g. 'mean', 'std', 'min') because zero values "
+                f"are not included in the calculation. We remove these aggregation functions from "
+                f"this collector, but be aware that the results may be wrong for your own aggregation "
+                f"functions that rely on zero values."
+            )
+            self.aggregation_functions = {
+                name: func
+                for name, func in self.aggregation_functions.items()
+                if name not in ["mean", "std", "min"]
+            }
+        self.labels = labels
+        self.label_field = label_attribute
+        self.tokenize = tokenize
+        if self.tokenize:
+            if tokenizer is None:
+                raise ValueError(
+                    "tokenizer must be provided to calculate the span length in means of tokens"
+                )
+            if isinstance(tokenizer, str):
+                tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+            self.tokenizer = tokenizer
+            resolved_tokenized_document_type = resolve_optional_document_type(
+                tokenized_document_type
+            )
+            if resolved_tokenized_document_type is None:
+                raise ValueError(
+                    "tokenized_document_type must be provided to calculate the span length in means of tokens"
+                )
+            if not (
+                isinstance(resolved_tokenized_document_type, type)
+                and issubclass(resolved_tokenized_document_type, TokenBasedDocument)
+            ):
+                raise TypeError(
+                    f"tokenized_document_type must be a subclass of TokenBasedDocument, but it is: "
+                    f"{resolved_tokenized_document_type}"
+                )
+            self.tokenized_document_type = resolved_tokenized_document_type
+            self.tokenize_kwargs = tokenize_kwargs or {}
+
+    def _collect(self, doc: Document) -> Union[List[int], Dict[str, List[int]]]:
+        docs: Union[List[Document], List[TokenBasedDocument]]
+        if self.tokenize:
+            if not isinstance(doc, TextBasedDocument):
+                raise ValueError(
+                    "doc must be a TextBasedDocument to calculate the span length in means of tokens"
+                )
+            if not isinstance(doc, TextBasedDocument):
+                raise ValueError(
+                    "doc must be a TextBasedDocument to calculate the span length in means of tokens"
+                )
+            docs = tokenize_document(
+                doc,
+                tokenizer=self.tokenizer,
+                result_document_type=self.tokenized_document_type,
+                **self.tokenize_kwargs,
+            )
+        else:
+            docs = [doc]
 
-    def _collect(self, doc: Document) -> Dict[str, List[int]]:
-        field_obj = getattr(doc, self.field)
-        counts = defaultdict(list)
-        for elem in field_obj:
-            counts[elem.label].append(elem.end - elem.start)
-        return dict(counts)
+        values: Dict[str, List[int]]
+        if isinstance(self.labels, str):
+            values = defaultdict(list)
+        else:
+            values = {label: [] for label in self.labels or ["ALL"]}
+        for doc in docs:
+            layer_obj = getattr(doc, self.layer)
+            for span in layer_obj:
+                if not isinstance(span, Span):
+                    raise TypeError(
+                        f"span length calculation is not yet supported for {type(span)}"
+                    )
+                length = span.end - span.start
+                if self.labels is None:
+                    label = "ALL"
+                else:
+                    label = getattr(span, self.label_field)
+                values[label].append(length)
+
+        return values if self.labels is not None else values["ALL"]
 
 
 class DummyCollector(DocumentStatistic):

diff --git a/tests/core/test_statistic.py b/tests/core/test_statistic.py
@@ -5,12 +5,12 @@
 from pytorch_ie import DatasetDict
 from pytorch_ie.annotations import LabeledSpan
 from pytorch_ie.core import AnnotationList, annotation_field
-from pytorch_ie.documents import TextBasedDocument
+from pytorch_ie.documents import TextBasedDocument, TokenBasedDocument
 from pytorch_ie.metrics.statistics import (
     DummyCollector,
     FieldLengthCollector,
     LabelCountCollector,
-    LabeledSpanLengthCollector,
+    SpanLengthCollector,
     SubFieldLengthCollector,
     TokenCountCollector,
 )
@@ -113,30 +113,41 @@ def test_statistics(dataset):
         "validation": {"max": 187, "mean": 89.66666666666667, "min": 17, "std": 71.5603863103665},
     }
 
-    statistic = LabeledSpanLengthCollector(field="entities")
+    statistic = SpanLengthCollector(layer="entities")
+    values = statistic(dataset)
+    assert values == {
+        "train": {"len": 5, "mean": 7.6, "std": 4.223742416388575, "min": 2, "max": 15},
+        "validation": {
+            "len": 6,
+            "mean": 10.833333333333334,
+            "std": 2.9674156357941426,
+            "min": 6,
+            "max": 14,
+        },
+        "test": {"len": 5, "mean": 9.4, "std": 5.748043145279966, "min": 5, "max": 20},
+    }
+
+    statistic = SpanLengthCollector(layer="entities", labels="INFERRED")
     values = statistic(dataset)
     assert values == {
         "train": {
-            "ORG": {"mean": 2.0, "std": 0.0, "min": 2, "max": 2, "len": 1},
-            "MISC": {"mean": 6.5, "std": 0.5, "min": 6, "max": 7, "len": 2},
-            "PER": {"mean": 15.0, "std": 0.0, "min": 15, "max": 15, "len": 1},
-            "LOC": {"mean": 8.0, "std": 0.0, "min": 8, "max": 8, "len": 1},
+            "ORG": {"max": 2, "len": 1},
+            "MISC": {"max": 7, "len": 2},
+            "PER": {"max": 15, "len": 1},
+            "LOC": {"max": 8, "len": 1},
         },
         "test": {
             "LOC": {
-                "mean": 10.333333333333334,
-                "std": 6.847546194724712,
-                "min": 5,
                 "max": 20,
                 "len": 3,
             },
-            "PER": {"mean": 8.0, "std": 3.0, "min": 5, "max": 11, "len": 2},
+            "PER": {"max": 11, "len": 2},
         },
         "validation": {
-            "ORG": {"mean": 12.0, "std": 2.8284271247461903, "min": 8, "max": 14, "len": 3},
-            "LOC": {"mean": 6.0, "std": 0.0, "min": 6, "max": 6, "len": 1},
-            "MISC": {"mean": 11.0, "std": 0.0, "min": 11, "max": 11, "len": 1},
-            "PER": {"mean": 12.0, "std": 0.0, "min": 12, "max": 12, "len": 1},
+            "ORG": {"max": 14, "len": 3},
+            "LOC": {"max": 6, "len": 1},
+            "MISC": {"max": 11, "len": 1},
+            "PER": {"max": 12, "len": 1},
         },
     }
 
@@ -163,3 +174,26 @@ def test_statistics_with_tokenize(dataset):
         "train": {"max": 9, "mean": 5.666666666666667, "min": 2, "std": 2.8674417556808756},
         "validation": {"max": 38, "mean": 18.333333333333332, "min": 6, "std": 14.055445761538678},
     }
+
+    @dataclasses.dataclass
+    class TokenDocumentWithLabeledEntities(TokenBasedDocument):
+        entities: AnnotationList[LabeledSpan] = annotation_field(target="tokens")
+
+    statistic = SpanLengthCollector(
+        layer="entities",
+        tokenize=True,
+        tokenizer="bert-base-uncased",
+        tokenized_document_type=TokenDocumentWithLabeledEntities,
+    )
+    values = statistic(dataset)
+    assert values == {
+        "test": {"len": 5, "max": 4, "mean": 2.4, "min": 1, "std": 1.2000000000000002},
+        "train": {"len": 5, "max": 2, "mean": 1.2, "min": 1, "std": 0.4},
+        "validation": {
+            "len": 6,
+            "max": 2,
+            "mean": 1.3333333333333333,
+            "min": 1,
+            "std": 0.4714045207910317,
+        },
+    }