Skip to content

Commit

Permalink
Improve citation deduplication (#101)
Browse files Browse the repository at this point in the history
* feat: Improve author parsing in ris

* build: Bump dedupe min version

* feat: Improve dedupe record preproc, modeling

* feat: Update dedupe training data

* feat: Add new trained deduper data

* feat: Update dedupe model used in tasks
  • Loading branch information
bdewilde authored Mar 9, 2024
1 parent ae32e43 commit 8badac0
Show file tree
Hide file tree
Showing 10 changed files with 760 additions and 216 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ RUN mkdir -p ${COLANDR_APP_DIR}
WORKDIR ${COLANDR_APP_DIR}

RUN apt update \
&& apt install -y gcc \
&& apt install -y gcc git \
&& apt clean \
&& rm -rf /var/lib/apt/lists/* /usr/share/doc /usr/share/man

Expand Down
4 changes: 3 additions & 1 deletion colandr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,9 @@

# files-on-disk config
COLANDR_APP_DIR = os.environ.get("COLANDR_APP_DIR", "/tmp")
DEDUPE_MODELS_DIR = os.path.join(COLANDR_APP_DIR, "colandr_data", "dedupe")
DEDUPE_MODELS_DIR = os.path.join(
COLANDR_APP_DIR, "colandr_data", "dedupe-v2", "model_202403"
)
RANKING_MODELS_DIR = os.path.join(COLANDR_APP_DIR, "colandr_data", "ranking_models")
CITATIONS_DIR = os.path.join(COLANDR_APP_DIR, "colandr_data", "citations")
FULLTEXT_UPLOADS_DIR = os.path.join(COLANDR_APP_DIR, "colandr_data", "fulltexts")
Expand Down
14 changes: 14 additions & 0 deletions colandr/lib/fileio/ris.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,9 @@ def _sanitize_reference(reference: dict) -> dict:
if alt_key in reference:
reference[default_key] = reference.pop(alt_key)
break
# handle authors specified all together on one line
if "authors" in reference:
reference["authors"] = _split_up_authors(reference["authors"])
# clean notes text, which may contain html tags and markup
if "notes" in reference:
reference["notes"] = _strip_tags_from_notes(reference["notes"])
Expand All @@ -154,6 +157,17 @@ def _sanitize_reference(reference: dict) -> dict:
return reference


def _split_up_authors(authors: list[str]) -> list[str]:
if len(authors) == 1:
if authors[0].count(",") >= 2:
authors = [author.strip() for author in authors[0].split(",")]
elif authors[0].count(" ") >= 5:
# TODO: this is probably bad data (all authors in one field w/o delimiters)
# but how to reliably fix?
pass
return authors


def _strip_tags_from_notes(notes: list[str]) -> list[str]:
notes = [markupsafe.Markup(note).striptags() for note in notes]
return [note for note in notes if note]
110 changes: 94 additions & 16 deletions colandr/lib/models/deduper.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,76 @@
import functools
import logging
import pathlib
import re
import typing as t
import urllib.parse
from collections.abc import Iterable

import dedupe
from textacy import preprocessing

from .. import utils


LOGGER = logging.getLogger(__name__)

RE_DOI_HTTP = re.compile(r"^https?(://)?", flags=re.IGNORECASE)

SETTINGS_FNAME = "deduper_settings"
TRAINING_FNAME = "deduper_training.json"
VARIABLES: list[dict[str, t.Any]] = [
{"field": "type_of_reference", "type": "ShortString"},
{"field": "type_of_reference", "type": "Exact"},
{"field": "doi", "type": "String", "has missing": True},
{"field": "title", "type": "String", "variable name": "title"},
{"field": "pub_year", "type": "Exact", "variable name": "pub_year"},
{"field": "authors", "type": "Set", "has missing": True},
{"field": "authors_joined", "type": "String", "has missing": True},
{
"field": "authors_joined",
"type": "String",
"has missing": True,
"variable name": "authors_joined",
},
{
"field": "authors_initials",
"type": "Set",
"has missing": True,
"variable name": "authors_initials",
},
{
"field": "pub_year",
"type": "Exact",
"has missing": True,
"variable name": "pub_year",
},
{
"field": "journal_name",
"type": "String",
"has missing": True,
"variable name": "journal_name",
},
{
"field": "journal_volume",
"type": "Exact",
"has missing": True,
"variable name": "journal_volume",
},
{
"field": "journal_issue_number",
"type": "Exact",
"has missing": True,
"variable name": "journal_issue_number",
},
{"field": "issn", "type": "String", "has missing": True, "variable name": "issn"},
{"field": "abstract", "type": "Text", "has missing": True},
{"field": "doi", "type": "ShortString", "has missing": True},
{"type": "Interaction", "interaction variables": ["title", "pub_year"]},
{"type": "Interaction", "interaction variables": ["journal_name", "pub_year"]},
{
"type": "Interaction",
"interaction variables": [
"journal_name",
"journal_volume",
"journal_issue_number",
],
},
{"type": "Interaction", "interaction variables": ["issn", "pub_year"]},
{"type": "Interaction", "interaction variables": ["title", "authors_joined"]},
]


Expand Down Expand Up @@ -79,28 +127,47 @@ def _preprocess_record(self, record: dict[str, t.Any]) -> dict[str, t.Any]:
if record.get("type_of_reference")
else None
),
"doi": (_sanitize_doi(record["doi"]) if record.get("doi") else None),
"title": (
record["title"].strip().strip(".").lower()
if record.get("title")
else None
_standardize_str(record["title"]) if record.get("title") else None
),
"pub_year": record.get("pub_year", None),
"authors": (
tuple(sorted(author.strip().lower() for author in record["authors"]))
tuple(
sorted(
_standardize_str(author.replace("-", " "))
for author in record["authors"]
)
)
if record.get("authors")
else None
),
"pub_year": record.get("pub_year"),
"journal_name": (
preprocessing.remove.brackets(
_standardize_str(record["journal_name"]), only="round"
)
if record.get("journal_name")
else None
),
"journal_volume": record.get("volume"),
"journal_issue_number": record.get("issue_number"),
"issn": record["issn"].strip().lower() if record.get("issn") else None,
"abstract": (
record["abstract"].strip().lower()[:500] # truncated for performance
_standardize_str(record["abstract"][:500]) # truncated for performance
if record.get("abstract")
else None
),
"doi": (_sanitize_doi(record["doi"]) if record.get("doi") else None),
}
# derivative fields
record["authors_joined"] = (
"; ".join(record["authors"]) if record.get("authors") else None
)
if record.get("authors"):
record["authors_initials"] = tuple(
"".join(name[0] for name in author.split())
for author in record["authors"]
)
record["authors_joined"] = " ".join(record["authors"])
else:
record["authors_initials"] = None
record["authors_joined"] = None
return record

def fit(
Expand Down Expand Up @@ -146,4 +213,15 @@ def _sanitize_doi(value: str) -> str:
value = value.strip().lower()
if value.startswith("http://") or value.startswith("https://"):
value = urllib.parse.unquote(value)
value = RE_DOI_HTTP.sub("", value)
return value


# Text-standardization callable used by `_preprocess_record` on title, author,
# journal name, and abstract values before records are compared.
# NOTE(review): `make_pipeline` presumably applies the functions below in the order
# listed, left to right -- confirm against the textacy preprocessing docs.
_standardize_str = preprocessing.make_pipeline(
# step 1: remove only this subset of punctuation marks; other punctuation is kept
functools.partial(
preprocessing.remove.punctuation, only=[".", "?", "!", ",", ";", "—"]
),
# step 2: normalize quotation mark variants
preprocessing.normalize.quotation_marks,
# step 3: normalize whitespace (after punctuation removal, which presumably
# leaves extra spaces behind -- TODO confirm)
preprocessing.normalize.whitespace,
# step 4: lowercase, so comparisons are case-insensitive
str.lower,
)
5 changes: 2 additions & 3 deletions colandr/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,9 @@ def deduplicate_citations(review_id: int):
lock.release()
return

dir_path = os.path.join(
current_app.config["COLANDR_APP_DIR"], "colandr_data", "dedupe-v2", "model"
deduper = Deduper.load(
current_app.config["DEDUPE_MODELS_DIR"], num_cores=1, in_memory=False
)
deduper = Deduper.load(dir_path, num_cores=1, in_memory=False)

# remove dedupe rows for this review
# which we'll add back with the latest citations included
Expand Down
Loading

0 comments on commit 8badac0

Please sign in to comment.