Improve citation deduplication (#101)

* feat: Improve author parsing in ris
* build: Bump dedupe min version
* feat: Improve dedupe record preproc, modeling
* feat: Update dedupe training data
* feat: Add new trained deduper data
* feat: Update dedupe model used in tasks
datakind · Mar 9, 2024 · 8badac0 · 8badac0
1 parent ae32e43
commit 8badac0
Show file tree

Hide file tree

Showing 10 changed files with 760 additions and 216 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -5,7 +5,7 @@ RUN mkdir -p ${COLANDR_APP_DIR}
 WORKDIR ${COLANDR_APP_DIR}
 
 RUN apt update \
-    && apt install -y gcc \
+    && apt install -y gcc git \
     && apt clean \
     && rm -rf /var/lib/apt/lists/* /usr/share/doc /usr/share/man
 

diff --git a/colandr/config.py b/colandr/config.py
@@ -82,7 +82,9 @@
 
 # files-on-disk config
 COLANDR_APP_DIR = os.environ.get("COLANDR_APP_DIR", "/tmp")
-DEDUPE_MODELS_DIR = os.path.join(COLANDR_APP_DIR, "colandr_data", "dedupe")
+DEDUPE_MODELS_DIR = os.path.join(
+    COLANDR_APP_DIR, "colandr_data", "dedupe-v2", "model_202403"
+)
 RANKING_MODELS_DIR = os.path.join(COLANDR_APP_DIR, "colandr_data", "ranking_models")
 CITATIONS_DIR = os.path.join(COLANDR_APP_DIR, "colandr_data", "citations")
 FULLTEXT_UPLOADS_DIR = os.path.join(COLANDR_APP_DIR, "colandr_data", "fulltexts")

diff --git a/colandr/lib/fileio/ris.py b/colandr/lib/fileio/ris.py
@@ -135,6 +135,9 @@ def _sanitize_reference(reference: dict) -> dict:
                 if alt_key in reference:
                     reference[default_key] = reference.pop(alt_key)
                     break
+    # handle authors specified all together on one line
+    if "authors" in reference:
+        reference["authors"] = _split_up_authors(reference["authors"])
     # clean notes text, which may contain html tags and markup
     if "notes" in reference:
         reference["notes"] = _strip_tags_from_notes(reference["notes"])
@@ -154,6 +157,17 @@ def _sanitize_reference(reference: dict) -> dict:
     return reference
 
 
+def _split_up_authors(authors: list[str]) -> list[str]:
+    """
+    Split a lone author entry that holds several comma-delimited values.
+
+    Only a list with exactly one entry is examined. If that entry contains
+    two or more commas, it is split at every comma and each piece is stripped
+    of surrounding whitespace. Any other input is returned unchanged.
+
+    Note: the split happens at *every* comma, so commas inside a single name
+    (e.g. "Last, First") also produce separate pieces, and a trailing comma
+    yields an empty-string piece.
+    """
+    if len(authors) == 1:
+        # two or more commas: treat the single entry as a comma-delimited list
+        if authors[0].count(",") >= 2:
+            authors = [author.strip() for author in authors[0].split(",")]
+        elif authors[0].count(" ") >= 5:
+            # TODO: this is probably bad data (all authors in one field w/o delimiters)
+            # but how to reliably fix?
+            pass
+    return authors
+
+
 def _strip_tags_from_notes(notes: list[str]) -> list[str]:
     """
     Run each note through ``markupsafe.Markup(...).striptags()`` and drop
     any note that is empty after stripping.
     """
     notes = [markupsafe.Markup(note).striptags() for note in notes]
     return [note for note in notes if note]
diff --git a/colandr/lib/models/deduper.py b/colandr/lib/models/deduper.py
@@ -1,28 +1,76 @@
 import functools
 import logging
 import pathlib
+import re
 import typing as t
 import urllib.parse
 from collections.abc import Iterable
 
 import dedupe
+from textacy import preprocessing
 
 from .. import utils
 
 
 LOGGER = logging.getLogger(__name__)
 
+RE_DOI_HTTP = re.compile(r"^https?(://)?", flags=re.IGNORECASE)
+
 SETTINGS_FNAME = "deduper_settings"
 TRAINING_FNAME = "deduper_training.json"
 VARIABLES: list[dict[str, t.Any]] = [
-    {"field": "type_of_reference", "type": "ShortString"},
+    {"field": "type_of_reference", "type": "Exact"},
+    {"field": "doi", "type": "String", "has missing": True},
     {"field": "title", "type": "String", "variable name": "title"},
-    {"field": "pub_year", "type": "Exact", "variable name": "pub_year"},
-    {"field": "authors", "type": "Set", "has missing": True},
-    {"field": "authors_joined", "type": "String", "has missing": True},
+    {
+        "field": "authors_joined",
+        "type": "String",
+        "has missing": True,
+        "variable name": "authors_joined",
+    },
+    {
+        "field": "authors_initials",
+        "type": "Set",
+        "has missing": True,
+        "variable name": "authors_initials",
+    },
+    {
+        "field": "pub_year",
+        "type": "Exact",
+        "has missing": True,
+        "variable name": "pub_year",
+    },
+    {
+        "field": "journal_name",
+        "type": "String",
+        "has missing": True,
+        "variable name": "journal_name",
+    },
+    {
+        "field": "journal_volume",
+        "type": "Exact",
+        "has missing": True,
+        "variable name": "journal_volume",
+    },
+    {
+        "field": "journal_issue_number",
+        "type": "Exact",
+        "has missing": True,
+        "variable name": "journal_issue_number",
+    },
+    {"field": "issn", "type": "String", "has missing": True, "variable name": "issn"},
     {"field": "abstract", "type": "Text", "has missing": True},
-    {"field": "doi", "type": "ShortString", "has missing": True},
-    {"type": "Interaction", "interaction variables": ["title", "pub_year"]},
+    {"type": "Interaction", "interaction variables": ["journal_name", "pub_year"]},
+    {
+        "type": "Interaction",
+        "interaction variables": [
+            "journal_name",
+            "journal_volume",
+            "journal_issue_number",
+        ],
+    },
+    {"type": "Interaction", "interaction variables": ["issn", "pub_year"]},
+    {"type": "Interaction", "interaction variables": ["title", "authors_joined"]},
 ]
 
 
@@ -79,28 +127,47 @@ def _preprocess_record(self, record: dict[str, t.Any]) -> dict[str, t.Any]:
                 if record.get("type_of_reference")
                 else None
             ),
+            "doi": (_sanitize_doi(record["doi"]) if record.get("doi") else None),
             "title": (
-                record["title"].strip().strip(".").lower()
-                if record.get("title")
-                else None
+                _standardize_str(record["title"]) if record.get("title") else None
             ),
-            "pub_year": record.get("pub_year", None),
             "authors": (
-                tuple(sorted(author.strip().lower() for author in record["authors"]))
+                tuple(
+                    sorted(
+                        _standardize_str(author.replace("-", " "))
+                        for author in record["authors"]
+                    )
+                )
                 if record.get("authors")
                 else None
             ),
+            "pub_year": record.get("pub_year"),
+            "journal_name": (
+                preprocessing.remove.brackets(
+                    _standardize_str(record["journal_name"]), only="round"
+                )
+                if record.get("journal_name")
+                else None
+            ),
+            "journal_volume": record.get("volume"),
+            "journal_issue_number": record.get("issue_number"),
+            "issn": record["issn"].strip().lower() if record.get("issn") else None,
             "abstract": (
-                record["abstract"].strip().lower()[:500]  # truncated for performance
+                _standardize_str(record["abstract"][:500])  # truncated for performance
                 if record.get("abstract")
                 else None
             ),
-            "doi": (_sanitize_doi(record["doi"]) if record.get("doi") else None),
         }
         # derivative fields
-        record["authors_joined"] = (
-            "; ".join(record["authors"]) if record.get("authors") else None
-        )
+        if record.get("authors"):
+            record["authors_initials"] = tuple(
+                "".join(name[0] for name in author.split())
+                for author in record["authors"]
+            )
+            record["authors_joined"] = " ".join(record["authors"])
+        else:
+            record["authors_initials"] = None
+            record["authors_joined"] = None
         return record
 
     def fit(
@@ -146,4 +213,15 @@ def _sanitize_doi(value: str) -> str:
     value = value.strip().lower()
     if value.startswith("http://") or value.startswith("https://"):
         value = urllib.parse.unquote(value)
+        value = RE_DOI_HTTP.sub("", value)
     return value
+
+
+# Shared text-standardization callable built with textacy's ``make_pipeline``.
+# Steps, as listed: remove only the punctuation marks . ? ! , ; and the em dash,
+# normalize quotation marks, normalize whitespace, then lowercase the result.
+_standardize_str = preprocessing.make_pipeline(
+    functools.partial(
+        preprocessing.remove.punctuation, only=[".", "?", "!", ",", ";", "—"]
+    ),
+    preprocessing.normalize.quotation_marks,
+    preprocessing.normalize.whitespace,
+    str.lower,
+)
diff --git a/colandr/tasks.py b/colandr/tasks.py
@@ -85,10 +85,9 @@ def deduplicate_citations(review_id: int):
         lock.release()
         return
 
-    dir_path = os.path.join(
-        current_app.config["COLANDR_APP_DIR"], "colandr_data", "dedupe-v2", "model"
+    deduper = Deduper.load(
+        current_app.config["DEDUPE_MODELS_DIR"], num_cores=1, in_memory=False
     )
-    deduper = Deduper.load(dir_path, num_cores=1, in_memory=False)
 
     # remove dedupe rows for this review
     # which we'll add back with the latest citations included