
Heavily improve automatic model card generation + Patch XLM-R #28

Merged: 95 commits, Sep 29, 2023
Changes from 62 commits
b413172
Uncomment pushing to the Hub
tomaarsen Aug 16, 2023
bd036b6
Initial version to improve automatic model card generation
tomaarsen Aug 16, 2023
782c1d2
Simplify label normalization
tomaarsen Aug 16, 2023
648c505
Automatically select some eval sentences for the widget
tomaarsen Aug 16, 2023
12a04bc
Improve language card
tomaarsen Aug 16, 2023
23ca683
Add automatic evaluation results
tomaarsen Aug 16, 2023
c762433
Use dash instead of underscore in model name
tomaarsen Aug 24, 2023
4c7d402
Add extra TODOs
tomaarsen Aug 24, 2023
cbdcdac
model.predict text as the first example
tomaarsen Aug 26, 2023
4c33537
Automatically set model name based on encoder & dataset
tomaarsen Aug 26, 2023
b9c5d4e
Remove accidental Dataset import
tomaarsen Aug 27, 2023
d7dc4ac
Rename examples to widget examples
tomaarsen Aug 27, 2023
a6d5e1e
Add table with label examples
tomaarsen Aug 27, 2023
ccd136b
Ensure complete metadata
tomaarsen Aug 27, 2023
28c67a7
Add tokenizer warning if punct must be split from words
tomaarsen Aug 27, 2023
c016e4d
Remove dead code
tomaarsen Aug 27, 2023
7e2e800
Rename poor variable names
tomaarsen Aug 27, 2023
c1d6967
Fix incorrect warning
tomaarsen Aug 27, 2023
f9fe787
Add " in the model labels
tomaarsen Aug 27, 2023
118e695
Set model_id based on args if possible
tomaarsen Aug 27, 2023
e2cb59a
Add training set metrics
tomaarsen Aug 28, 2023
ab4476f
Randomly select 100 samples for the widget examples
tomaarsen Aug 28, 2023
7f44cb9
Prevent duplicate widget examples
tomaarsen Aug 28, 2023
e693018
Remove completed TODO
tomaarsen Aug 28, 2023
41589c8
Use title case throughout model card
tomaarsen Aug 28, 2023
c2a06a0
Add useful comments if values not provided
tomaarsen Aug 28, 2023
6dd0a84
Add environmental impact with codecarbon
tomaarsen Aug 28, 2023
6188d8e
Merge branch 'main' of https://github.com/tomaarsen/SpanMarkerNER int…
tomaarsen Aug 29, 2023
19f7c3c
Ensure that the model card template is included in the install
tomaarsen Aug 29, 2023
caf09ba
Add training hardware section
tomaarsen Sep 5, 2023
56861e9
Add Python version
tomaarsen Sep 5, 2023
fdf9ecf
Make everything title case
tomaarsen Sep 5, 2023
be11866
Add missing docstring
tomaarsen Sep 5, 2023
d7848ac
Add docstring for SpanMarkerModelCardData
tomaarsen Sep 5, 2023
aecb6f4
Update CHANGELOG
tomaarsen Sep 5, 2023
6d15a37
Add SpanMarkerModelCardData to dunder init
tomaarsen Sep 5, 2023
a5d6b50
Add SpanMarkerModelCardData to snippets
tomaarsen Sep 5, 2023
aaf0545
Resolve breaking error if hub_model_id is set
tomaarsen Sep 5, 2023
e7e0a43
gpu_model -> hardware_used
tomaarsen Sep 6, 2023
8fd4c1b
Add "base_model" to metadata
tomaarsen Sep 6, 2023
151b3cf
Increment datasets min version to 2.14.0
tomaarsen Sep 6, 2023
5e6bf4d
Update trainer evaluate tests
tomaarsen Sep 6, 2023
aa5153e
Skip old model card test for now
tomaarsen Sep 6, 2023
280b601
Fix edge case: less than 5 examples
tomaarsen Sep 6, 2023
b71f96d
pytest.skip -> pytest.mark.skip
tomaarsen Sep 12, 2023
92b9de1
Merge branch 'main' of https://github.com/tomaarsen/SpanMarkerNER int…
tomaarsen Sep 12, 2023
b64119c
Try to infer the language from the dataset
tomaarsen Sep 12, 2023
179971f
Add citations and hidden sections
tomaarsen Sep 12, 2023
5db7f8a
Refactor inferring language
tomaarsen Sep 12, 2023
ff5db00
Merge branch 'main' of https://github.com/tomaarsen/SpanMarkerNER int…
tomaarsen Sep 13, 2023
0b3ec56
Remove unused import
tomaarsen Sep 13, 2023
e6d517a
Add comment explaining version
tomaarsen Sep 13, 2023
73b13ef
Override default Trainer create_model_card
tomaarsen Sep 13, 2023
9de5d24
Update model card template slightly
tomaarsen Sep 13, 2023
6daf082
Add newline to model card template
tomaarsen Sep 13, 2023
edf6015
Remove incorrect space
tomaarsen Sep 13, 2023
d301225
Add model card tests
tomaarsen Sep 13, 2023
f708afe
Improve Trainer tests regarding model card
tomaarsen Sep 13, 2023
f0d11fa
Remove commented out breakpoint
tomaarsen Sep 13, 2023
b271eb6
Add codecarbon to CI
tomaarsen Sep 13, 2023
c91f17f
Rename integration extra to codecarbon
tomaarsen Sep 13, 2023
0b56d28
Make hardware_used optional (if no GPU present)
tomaarsen Sep 13, 2023
3e46869
Apply suggestions to model_card_template
tomaarsen Sep 14, 2023
ef0ea18
Update model card test pattern alongside template changes
tomaarsen Sep 14, 2023
f6e730a
Don't include hardware_used when no GPU present
tomaarsen Sep 14, 2023
32617c6
Set "No GPU used" for GPU Model if hardware_used is None
tomaarsen Sep 14, 2023
7dc4acd
Don't store None in yaml
tomaarsen Sep 14, 2023
a6c5689
Ensure that emissions is a regular float
tomaarsen Sep 14, 2023
6ed39b5
kgs to g
tomaarsen Sep 14, 2023
53c7321
support e-05 notation
tomaarsen Sep 14, 2023
1a1480d
Add small test case for model cards
tomaarsen Sep 14, 2023
9ddcdd9
Update model tables in docs
tomaarsen Sep 14, 2023
96ec42b
Link to the spaCy integration in the tokenizer warning
tomaarsen Sep 14, 2023
3126200
Update README snippet
tomaarsen Sep 15, 2023
5619fcc
Update outdated docs: entity_max_length default is 8
tomaarsen Sep 15, 2023
40154e4
Remove /models from URL, caused 404s
tomaarsen Sep 26, 2023
bd838b0
Fix outdated type hint
tomaarsen Sep 26, 2023
f2edd06
🎉 Apply XLM-R patch
tomaarsen Sep 26, 2023
084e2d0
Remove /models from test
tomaarsen Sep 26, 2023
c5f72a5
Remove tokenizer warning after patch
tomaarsen Sep 26, 2023
eea3880
Update training docs with model card data etc.
tomaarsen Sep 26, 2023
4a70e18
Pad token embeddings to multiple of 8
tomaarsen Sep 26, 2023
ec90a80
Always attach list directly to header
tomaarsen Sep 27, 2023
457d75e
Tackle edge case where dataset card has no metadata
tomaarsen Sep 27, 2023
adb0de6
Allow installing nltk for detokenizing model card examples
tomaarsen Sep 27, 2023
35b43c4
Add model card docs
tomaarsen Sep 27, 2023
93f1689
Mention codecarbon install in docstring
tomaarsen Sep 27, 2023
4e02a16
overwrite the default codecarbon log level to "error"
tomaarsen Sep 27, 2023
aebb4aa
Update CHANGELOG
tomaarsen Sep 28, 2023
08000f8
Fix issue with inference example containing full quotes
tomaarsen Sep 28, 2023
cfb7577
Update CHANGELOG
tomaarsen Sep 28, 2023
e5911bf
Never print a model when printing SpanMarkerModelCardData
tomaarsen Sep 28, 2023
7d1fa8b
Try to infer the dataset_id from the training set
tomaarsen Sep 28, 2023
9321361
Merge branch 'main' into feat/improved_model_cards
tomaarsen Sep 29, 2023
2f753e5
Update the main docs landing page
tomaarsen Sep 29, 2023
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
@@ -38,7 +38,7 @@ jobs:
- name: Install external dependencies on cache miss
run: |
python -m pip install --no-cache-dir --upgrade pip
python -m pip install --no-cache-dir ".[dev]"
python -m pip install --no-cache-dir ".[dev, codecarbon]"
python -m spacy download en_core_web_sm
if: steps.restore-cache.outputs.cache-hit != 'true'

16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -15,6 +15,22 @@ Types of changes
* "Security" in case of vulnerabilities.
-->

## [Unreleased]

### Added

- Added `SpanMarkerModel.generate_model_card()` method to get a model card string.
- Added `SpanMarkerModelCardData` that should be passed to `SpanMarkerModel.from_pretrained` with additional information like
- `language`, `license`, `model_name`, `model_id`, `encoder_name`, `encoder_id`, `dataset_name`, `dataset_id`, `dataset_revision`.

### Changed

- Heavily improved automatic model card generation.
- Evaluating outside of training now returns per-label outputs instead of only the "overall" F1, precision and recall.
- Warn if the tokenizer in use distinguishes between punctuation directly attached to a word and punctuation separated from a word by a space.
  - If so, inference with that model will require the punctuation to be split from the words.
- Improved label normalization speed.

## [1.3.0]

### Added
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -0,0 +1 @@
include span_marker/model_card_template.md
21 changes: 15 additions & 6 deletions README.md
@@ -46,25 +46,36 @@ Please have a look at our [Getting Started](notebooks/getting_started.ipynb) not
```python
from datasets import load_dataset
from transformers import TrainingArguments
from span_marker import SpanMarkerModel, Trainer
from span_marker import SpanMarkerModel, Trainer, SpanMarkerModelCardData


def main() -> None:
# Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
dataset = load_dataset("DFKI-SLT/few-nerd", "supervised")
dataset_id = "DFKI-SLT/few-nerd"
dataset_name = "FewNERD"
dataset = load_dataset(dataset_id, "supervised")
dataset = dataset.remove_columns("ner_tags")
dataset = dataset.rename_column("fine_ner_tags", "ner_tags")
labels = dataset["train"].features["ner_tags"].feature.names

# Initialize a SpanMarker model using a pretrained BERT-style encoder
model_name = "bert-base-cased"
encoder_id = "bert-base-cased"
model = SpanMarkerModel.from_pretrained(
model_name,
encoder_id,
labels=labels,
# SpanMarker hyperparameters:
model_max_length=256,
marker_max_length=128,
entity_max_length=8,
# Model card arguments
model_card_data=SpanMarkerModelCardData(
model_id="tomaarsen/span-marker-bert-base-fewnerd-fine-super",
encoder_id=encoder_id,
dataset_name=dataset_name,
dataset_id=dataset_id,
license="cc-by-sa-4.0",
language="en",
),
)

# Prepare the 🤗 transformers training arguments
@@ -121,8 +132,6 @@ entities = model.predict("Amelia Earhart flew her single engine Lockheed Vega 5B
{'span': 'Paris', 'label': 'location-GPE', 'score': 0.9892390966415405, 'char_start_index': 78, 'char_end_index': 83}]
```

<!-- Because this work is based on [PL-Marker](https://arxiv.org/pdf/2109.06067v5.pdf), you may expect similar results to its [Papers with Code Leaderboard](https://paperswithcode.com/paper/pack-together-entity-and-relation-extraction) results. -->

## Pretrained Models

All models in this list contain `train.py` files that show the training scripts used to generate them. Additionally, all training scripts used are stored in the [training_scripts](training_scripts) directory.
74 changes: 37 additions & 37 deletions notebooks/getting_started.ipynb
@@ -76,20 +76,20 @@
"name": "stdout",
"output_type": "stream",
"text": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],\n",
" num_rows: 131767\n",
" })\n",
" validation: Dataset({\n",
" features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],\n",
" num_rows: 18824\n",
" })\n",
" test: Dataset({\n",
" features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],\n",
" num_rows: 37648\n",
" })\n",
"})"
]
}
],
@@ -317,9 +317,9 @@
"- 2 missed entities with 15 words (0.009828%)\n",
"- 1 missed entities with 17 words (0.004914%)\n",
"- 1 missed entities with 19 words (0.004914%)\n",
"Tracking run with wandb version 0.14.0\n",
"Run data is saved locally in ...\n",
"Syncing run colorful-leaf-761 to Weights & Biases\n"
]
},
{
@@ -462,7 +462,7 @@
"text": [
"{'eval_loss': 0.019159900024533272, 'eval_overall_precision': 0.7773279352226721, 'eval_overall_recall': 0.7774778249132279, 'eval_overall_f1': 0.7774028728429576, 'eval_overall_accuracy': 0.9399702095533473, 'eval_runtime': 28.0225, 'eval_samples_per_second': 87.394, 'eval_steps_per_second': 21.875, 'epoch': 0.98}\n",
"{'train_runtime': 453.1296, 'train_samples_per_second': 21.667, 'train_steps_per_second': 2.708, 'train_loss': 0.06319850289734186, 'epoch': 1.0}\n",
"TrainOutput(global_step=1227, training_loss=0.06319850289734186, metrics={'train_runtime': 453.1296, 'train_samples_per_second': 21.667, 'train_steps_per_second': 2.708, 'train_loss': 0.06319850289734186, 'epoch': 1.0})"
]
}
],
@@ -489,15 +489,15 @@
"text": [
"Loading cached processed dataset at ...\n",
"Loading cached processed dataset at ...\n",
"{'eval_loss': 0.019206691533327103,\n",
" 'eval_overall_precision': 0.7758985200845666,\n",
" 'eval_overall_recall': 0.7784419591207096,\n",
" 'eval_overall_f1': 0.7771681586293194,\n",
" 'eval_overall_accuracy': 0.9398477830602543,\n",
" 'eval_runtime': 28.0849,\n",
" 'eval_samples_per_second': 87.2,\n",
" 'eval_steps_per_second': 21.827,\n",
" 'epoch': 1.0}"
]
}
],
@@ -533,15 +533,15 @@
"- 1 missed entities with 17 words (0.019040%)\n",
"- 1 missed entities with 19 words (0.019040%)\n",
"- 1 missed entities with 40 words (0.019040%)\n",
"{'test_loss': 0.019189156591892242,\n",
" 'test_overall_precision': 0.769879287219774,\n",
" 'test_overall_recall': 0.7679663608562691,\n",
" 'test_overall_f1': 0.7689216342933691,\n",
" 'test_overall_accuracy': 0.938544749464231,\n",
" 'test_runtime': 28.0932,\n",
" 'test_samples_per_second': 86.854,\n",
" 'test_steps_per_second': 21.713,\n",
" 'epoch': 1.0}"
]
}
],
@@ -660,7 +660,7 @@
"metadata": {},
"outputs": [],
"source": [
"# trainer.push_to_hub()"
"trainer.push_to_hub()"
]
},
{
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -27,7 +27,7 @@ dependencies = [
"torch",
"accelerate",
"transformers>=4.19.0", # required for EvalPrediction.inputs
"datasets>=2.0.0",
"datasets>=2.14.0", # required for sorting with multiple columns
"packaging>=20.0",
"evaluate",
"seqeval",
@@ -59,6 +59,9 @@ docs = [
wandb = [
"wandb"
]
codecarbon = [
"codecarbon"
]

[project.urls]
Documentation = "https://tomaarsen.github.io/SpanMarkerNER"
1 change: 1 addition & 0 deletions span_marker/__init__.py
@@ -7,6 +7,7 @@
from transformers import AutoConfig, AutoModel, TrainingArguments

from span_marker.configuration import SpanMarkerConfig
from span_marker.model_card import SpanMarkerModelCardData
from span_marker.modeling import SpanMarkerModel
from span_marker.trainer import Trainer

12 changes: 7 additions & 5 deletions span_marker/evaluation.py
@@ -9,7 +9,9 @@
from span_marker.tokenizer import SpanMarkerTokenizer


def compute_f1_via_seqeval(tokenizer: SpanMarkerTokenizer, eval_prediction: EvalPrediction) -> Dict[str, float]:
def compute_f1_via_seqeval(
tokenizer: SpanMarkerTokenizer, eval_prediction: EvalPrediction, is_in_train: bool
) -> Dict[str, float]:
"""Compute micro-F1, recall, precision and accuracy scores using ``seqeval`` for the evaluation predictions.

Note:
@@ -98,7 +100,7 @@ def compute_f1_via_seqeval(tokenizer: SpanMarkerTokenizer, eval_prediction: Eval
with warnings.catch_warnings():
warnings.simplefilter("ignore", UndefinedMetricWarning)
results = seqeval.compute()
# `results` also contains e.g. "person-athlete": {'precision': 0.5982658959537572, 'recall': 0.9, 'f1': 0.71875, 'number': 230}
# logging this all is overkill. Tensorboard doesn't even support it, WandB does, but it's not very useful generally.
# I'd like to revisit this to expose this information somehow still
return {key: value for key, value in results.items() if isinstance(value, float)}

if is_in_train:
return {key: value for key, value in results.items() if isinstance(value, float)}
return results
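The `is_in_train` change above can be sketched in isolation. This is an illustrative stand-alone version of the filtering logic shown in the diff, not the library's exact API; the metric values are made up:

```python
# Illustrative sketch of the is_in_train gating from the diff above.
# During training, only scalar "overall" metrics are kept (per-label dicts
# would overwhelm loggers like TensorBoard); outside of training, the full
# per-label breakdown is returned as well.
results = {
    "overall_precision": 0.776,
    "overall_recall": 0.777,
    "overall_f1": 0.777,
    # seqeval also emits nested per-label dicts, e.g.:
    "person-athlete": {"precision": 0.598, "recall": 0.9, "f1": 0.719, "number": 230},
}

def select_metrics(results: dict, is_in_train: bool) -> dict:
    if is_in_train:
        # Keep only the flat float metrics.
        return {key: value for key, value in results.items() if isinstance(value, float)}
    return results

train_metrics = select_metrics(results, is_in_train=True)  # scalars only
full_metrics = select_metrics(results, is_in_train=False)  # everything
```

This mirrors why `Trainer.evaluate` outside of training can now surface per-label precision/recall/F1, while training-time logging stays compact.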
17 changes: 10 additions & 7 deletions span_marker/label_normalizer.py
@@ -27,8 +27,17 @@ def __init__(self, config: SpanMarkerConfig) -> None:
self.config = config

@abstractmethod
def ner_tags_to_entities(self, ner_tags: List[int]) -> Iterator[Entity]:
pass

def __call__(self, tokens: List[str], ner_tags: List[int]) -> Dict[str, List[Any]]:
raise NotImplementedError
output = {"ner_tags": [], "entity_count": [], "word_count": []}
for tokens, ner_tags in zip(tokens, ner_tags):
ner_tags = list(self.ner_tags_to_entities(ner_tags))
output["ner_tags"].append(ner_tags)
output["entity_count"].append(len(ner_tags))
output["word_count"].append(len(tokens))
return output


class LabelNormalizerScheme(LabelNormalizer):
@@ -57,9 +66,6 @@ def ner_tags_to_entities(self, ner_tags: List[int]) -> Iterator[Entity]:
if start_idx is not None:
yield (reduced_label_id, start_idx, idx + 1)

def __call__(self, tokens: List[str], ner_tags: List[int]) -> Dict[str, List[Any]]:
return {"tokens": tokens, "ner_tags": list(self.ner_tags_to_entities(ner_tags))}


class LabelNormalizerIOB(LabelNormalizerScheme):
def __init__(self, config: SpanMarkerConfig) -> None:
@@ -108,9 +114,6 @@ def ner_tags_to_entities(self, ner_tags: List[int]) -> Iterator[Entity]:
if start_idx is not None:
yield (entity_label_id, start_idx, idx + 1)

def __call__(self, tokens: List[str], ner_tags: List[int]) -> Dict[str, List[Any]]:
return {"tokens": tokens, "ner_tags": list(self.ner_tags_to_entities(ner_tags))}


class AutoLabelNormalizer:
"""Factory class to return the correct LabelNormalizer subclass."""
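The label normalizer refactor above moves `__call__` into the abstract base class, so each subclass only implements `ner_tags_to_entities` and the batching plus `entity_count`/`word_count` bookkeeping lives in one place. A minimal stand-alone sketch of that pattern follows; the single-word-entity tagging scheme here is a hypothetical stand-in, only the batching structure mirrors the diff:

```python
from typing import Any, Dict, Iterator, List, Tuple

# (label_id, start_word_index, end_word_index) -- matches the Entity tuples
# yielded by ner_tags_to_entities in the diff above.
Entity = Tuple[int, int, int]

class LabelNormalizerSketch:
    def ner_tags_to_entities(self, ner_tags: List[int]) -> Iterator[Entity]:
        # Toy scheme (hypothetical): every nonzero tag is a one-word entity.
        for idx, tag in enumerate(ner_tags):
            if tag != 0:
                yield (tag, idx, idx + 1)

    def __call__(self, tokens: List[List[str]], ner_tags: List[List[int]]) -> Dict[str, List[Any]]:
        # Shared batching logic, as in the refactored base class:
        # per sentence, collect entities and record entity/word counts.
        output = {"ner_tags": [], "entity_count": [], "word_count": []}
        for toks, tags in zip(tokens, ner_tags):
            entities = list(self.ner_tags_to_entities(tags))
            output["ner_tags"].append(entities)
            output["entity_count"].append(len(entities))
            output["word_count"].append(len(toks))
        return output

normalizer = LabelNormalizerSketch()
out = normalizer([["Tom", "Aarsen"]], [[1, 1]])
```

The extra `entity_count` and `word_count` columns are what make dataset-level statistics (and the sorting that motivated the `datasets>=2.14.0` bump) cheap to compute later.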