Skip to content

Commit

Permalink
Tidy typing, bump some dependencies (#96)
Browse files Browse the repository at this point in the history
* refactor: Standardize typing usage

* docs: Update item in todos

* build: Bump pytest, v7.0 => v8.0

* build: Bump rispy, v0.8 => v0.9

* build: Bump the minimum setuptools version used to build the package

* build: Tweak ruff config
  • Loading branch information
bdewilde authored Feb 4, 2024
1 parent 8ca9d4b commit 90fb3ec
Show file tree
Hide file tree
Showing 22 changed files with 112 additions and 110 deletions.
2 changes: 1 addition & 1 deletion TODO.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
- [ ] Confirm that all email interactions (e.g. password reset) actually work
- [ ] Better handle uploaded fulltext files (really, all data artifacts saved on disk, not in the db)
- [ ] Allow for multiple review owners
- [x] Allow for multiple review owners
- [ ] Allow for assigning studies to reviewers for screening
- [ ] Integrate caching and rate limiting into API
- [ ] Add extra fields in users table, e.g. affiliation
Expand Down
5 changes: 2 additions & 3 deletions colandr/apis/auth.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import functools
from typing import Optional
import typing as t

import flask_jwt_extended as jwtext
import sqlalchemy as sa
import sqlalchemy.exc as sa_exc
from flask import current_app, render_template, url_for
from flask_restx import Namespace, Resource
from marshmallow import fields as ma_fields
Expand Down Expand Up @@ -374,7 +373,7 @@ def authenticate_user(email: str, password: str) -> User:
return user


def get_user_from_token(token: str) -> Optional[User]:
def get_user_from_token(token: str) -> t.Optional[User]:
"""
Get a ``User`` from the identity stored in an encoded, unexpired JWT token,
if it exists in the database; otherwise, return None.
Expand Down
4 changes: 2 additions & 2 deletions colandr/apis/errors.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional
import typing as t

import webargs.core
import webargs.flaskparser
Expand Down Expand Up @@ -66,7 +66,7 @@ def validation_error(error, req, schema, *, error_status_code, error_headers):


def _make_error_response(
status_code: int, message: Optional[str] = None
status_code: int, message: t.Optional[str] = None
) -> tuple[dict[str, str], int]:
data = {"error": HTTP_STATUS_CODES.get(status_code, "Unknown error")}
if message:
Expand Down
6 changes: 3 additions & 3 deletions colandr/apis/resources/exports.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import csv
import itertools
from typing import Optional
import typing as t

import flask_jwt_extended as jwtext
import sqlalchemy as sa
Expand Down Expand Up @@ -84,7 +84,7 @@ def get(self, review_id, content_type):
"fulltext_filename",
"fulltext_exclude_reasons",
]
extraction_label_types: Optional[list[tuple[str, str]]]
extraction_label_types: t.Optional[list[tuple[str, str]]]
if data_extraction_form:
extraction_label_types = [
(item["label"], item["field_type"]) for item in data_extraction_form[0]
Expand All @@ -109,7 +109,7 @@ def get(self, review_id, content_type):


def _study_to_row(
study: Study, extraction_label_types: Optional[list[tuple[str, str]]]
study: Study, extraction_label_types: t.Optional[list[tuple[str, str]]]
) -> dict:
row = {
"study_id": study.id,
Expand Down
6 changes: 3 additions & 3 deletions colandr/apis/resources/review_exports.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import collections
import csv
import itertools
from typing import Optional
import typing as t

import flask_jwt_extended as jwtext
import sqlalchemy as sa
Expand Down Expand Up @@ -204,7 +204,7 @@ def get(self, id):
"fulltext_filename",
"fulltext_exclude_reasons",
]
extraction_label_types: Optional[list[tuple[str, str]]]
extraction_label_types: t.Optional[list[tuple[str, str]]]
if data_extraction_form:
extraction_label_types = [
(item["label"], item["field_type"]) for item in data_extraction_form[0]
Expand Down Expand Up @@ -278,7 +278,7 @@ def get(self, id):


def _study_to_row(
study: Study, extraction_label_types: Optional[list[tuple[str, str]]]
study: Study, extraction_label_types: t.Optional[list[tuple[str, str]]]
) -> dict:
row = {
"study_id": study.id,
Expand Down
4 changes: 2 additions & 2 deletions colandr/app.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
import sys
from typing import Any, Optional
import typing as t

import flask
import flask.logging
Expand All @@ -9,7 +9,7 @@
from colandr.apis import api_v1


def create_app(config_overrides: Optional[dict[str, Any]] = None) -> flask.Flask:
def create_app(config_overrides: t.Optional[dict[str, t.Any]] = None) -> flask.Flask:
app = flask.Flask("colandr")
app.config.from_object(config)
if config_overrides:
Expand Down
8 changes: 4 additions & 4 deletions colandr/lib/fileio/bibtex.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import logging
import pathlib
import re
from typing import BinaryIO, Optional, Tuple
import typing as t

import bibtexparser

Expand Down Expand Up @@ -52,7 +52,7 @@
}


def read(path_or_stream: BinaryIO | pathlib.Path) -> list[dict]:
def read(path_or_stream: t.BinaryIO | pathlib.Path) -> list[dict]:
data = utils.load_from_path_or_stream(path_or_stream)
records = parse(data)
records = sanitize(records)
Expand Down Expand Up @@ -132,7 +132,7 @@ def _split_names(record: dict, field_name: str) -> dict:
return record


def _sanitize_month(value: str) -> Optional[int]:
def _sanitize_month(value: str) -> t.Optional[int]:
try:
return int(value)
except ValueError:
Expand All @@ -143,7 +143,7 @@ def _sanitize_month(value: str) -> Optional[int]:
return None


def _split_pages(value: str) -> Optional[Tuple[Optional[int], Optional[int]]]:
def _split_pages(value: str) -> t.Optional[tuple[t.Optional[int], t.Optional[int]]]:
if "--" in value:
pages = value.split("--")
if len(pages) == 2:
Expand Down
3 changes: 1 addition & 2 deletions colandr/lib/fileio/pdf.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import pathlib
from typing import Union

import fitz


def read(file_path: Union[str, pathlib.Path]) -> str:
def read(file_path: str | pathlib.Path) -> str:
"""Extract text from a PDF file and write it to a text file."""
with fitz.open(str(file_path), filetype="pdf") as doc:
text = chr(12).join(page.get_text("text", sort=True) for page in doc)
Expand Down
6 changes: 3 additions & 3 deletions colandr/lib/fileio/ris.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""
import logging
import pathlib
from typing import BinaryIO, List
import typing as t

import markupsafe
import rispy
Expand Down Expand Up @@ -77,7 +77,7 @@
}


def read(path_or_stream: BinaryIO | pathlib.Path) -> list[dict]:
def read(path_or_stream: t.BinaryIO | pathlib.Path) -> list[dict]:
data = utils.load_from_path_or_stream(path_or_stream)
records = parse(data)
records = sanitize(records)
Expand Down Expand Up @@ -154,6 +154,6 @@ def _sanitize_reference(reference: dict) -> dict:
return reference


def _strip_tags_from_notes(notes: List[str]) -> List[str]:
def _strip_tags_from_notes(notes: list[str]) -> list[str]:
notes = [markupsafe.Markup(note).striptags() for note in notes]
return [note for note in notes if note]
4 changes: 2 additions & 2 deletions colandr/lib/fileio/tabular.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import csv
import io
import itertools
import typing as t
from collections.abc import Iterable, Sequence
from typing import Any


def read(data: str, *, dialect: str = "excel", **kwargs) -> Iterable[list[str]]:
Expand All @@ -24,7 +24,7 @@ def write(

def write_stream(
cols: Sequence[str],
rows: Iterable[dict[str, Any]] | Iterable[list],
rows: Iterable[dict[str, t.Any]] | Iterable[list],
*,
dialect="excel",
**kwargs,
Expand Down
12 changes: 6 additions & 6 deletions colandr/lib/fileio/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import logging
import pathlib
import tempfile
from collections.abc import Sequence
from typing import Any, BinaryIO, Iterable, Optional
import typing as t
from collections.abc import Iterable, Sequence

from dateutil.parser import ParserError
from dateutil.parser import parse as parse_dttm
Expand All @@ -14,7 +14,7 @@


def load_from_path_or_stream(
path_or_stream: BinaryIO | pathlib.Path,
path_or_stream: t.BinaryIO | pathlib.Path,
encodings: Sequence[str] = ("utf-8", "ISO-8859-1"),
) -> str:
"""
Expand Down Expand Up @@ -58,7 +58,7 @@ def load_from_path_or_stream(
return data


def try_to_dttm(value: float | int | str) -> Optional[datetime.datetime]:
def try_to_dttm(value: float | int | str) -> t.Optional[datetime.datetime]:
"""Cast ``value`` into a dttm, as needed."""
if isinstance(value, int):
try:
Expand All @@ -75,7 +75,7 @@ def try_to_dttm(value: float | int | str) -> Optional[datetime.datetime]:
return None


def try_to_int(value: float | int | str) -> Optional[int]:
def try_to_int(value: float | int | str) -> t.Optional[int]:
"""Cast ``value`` into an int, as needed."""
if isinstance(value, int):
return value
Expand All @@ -87,7 +87,7 @@ def try_to_int(value: float | int | str) -> Optional[int]:
return None


def to_list(value: Any) -> list:
def to_list(value: t.Any) -> list:
"""Cast ``value`` into a list, as needed."""
if isinstance(value, list):
return value
Expand Down
18 changes: 9 additions & 9 deletions colandr/lib/models/deduper.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import functools
import logging
import pathlib
import typing as t
import urllib.parse
from collections.abc import Iterable
from typing import Any, Optional

import dedupe

Expand All @@ -14,7 +14,7 @@

SETTINGS_FNAME = "deduper_settings"
TRAINING_FNAME = "deduper_training.json"
VARIABLES: list[dict[str, Any]] = [
VARIABLES: list[dict[str, t.Any]] = [
{"field": "type_of_reference", "type": "ShortString"},
{"field": "title", "type": "String", "variable name": "title"},
{"field": "pub_year", "type": "Exact", "variable name": "pub_year"},
Expand All @@ -30,7 +30,7 @@ class Deduper:
def __init__(
self,
*,
settings_fpath: Optional[str | pathlib.Path] = None,
settings_fpath: t.Optional[str | pathlib.Path] = None,
num_cores: int = 1,
in_memory: bool = False,
):
Expand Down Expand Up @@ -64,14 +64,14 @@ def model(self) -> dedupe.Dedupe | dedupe.StaticDedupe:

def preprocess_data(
self,
data: Iterable[dict[str, Any]],
data: Iterable[dict[str, t.Any]],
id_key: str,
) -> dict[Any, dict[str, Any]]:
) -> dict[t.Any, dict[str, t.Any]]:
fields = [pv.field for pv in self.model.data_model.primary_variables]
LOGGER.info("preprocessing data with fields %s ...", fields)
return {record.pop(id_key): self._preprocess_record(record) for record in data}

def _preprocess_record(self, record: dict[str, Any]) -> dict[str, Any]:
def _preprocess_record(self, record: dict[str, t.Any]) -> dict[str, t.Any]:
# base fields
record = {
"type_of_reference": (
Expand Down Expand Up @@ -105,8 +105,8 @@ def _preprocess_record(self, record: dict[str, Any]) -> dict[str, Any]:

def fit(
self,
data: dict[Any, dict[str, Any]],
training_fpath: Optional[str | pathlib.Path] = None,
data: dict[t.Any, dict[str, t.Any]],
training_fpath: t.Optional[str | pathlib.Path] = None,
recall: float = 1.0,
index_predicates: bool = True,
) -> "Deduper":
Expand All @@ -126,7 +126,7 @@ def fit(
return self

def predict(
self, data: dict[Any, dict[str, Any]], threshold: float = 0.5
self, data: dict[t.Any, dict[str, t.Any]], threshold: float = 0.5
) -> list[tuple[tuple, tuple[float, ...]]]:
return self.model.partition(data, threshold=threshold) # type: ignore

Expand Down
4 changes: 2 additions & 2 deletions colandr/lib/models/ranker.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import functools
import logging
import pathlib
import typing as t
from collections.abc import Iterable, Sequence
from typing import Optional

import joblib
import numpy as np
Expand All @@ -19,7 +19,7 @@

class Ranker:
def __init__(
self, *, review_id: int, model_fpath: Optional[str | pathlib.Path] = None
self, *, review_id: int, model_fpath: t.Optional[str | pathlib.Path] = None
):
self.review_id = review_id
self.model_fpath = model_fpath
Expand Down
10 changes: 5 additions & 5 deletions colandr/lib/nlp/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import collections
import itertools
import logging
import typing as t
from collections.abc import Iterable
from operator import itemgetter
from typing import Optional

import spacy
import textacy
Expand All @@ -30,11 +30,11 @@ def get_lang_to_models() -> dict[str, list[str]]:
def process_texts_into_docs(
texts: Iterable[str],
*,
max_len: Optional[int] = 1000,
min_prob: Optional[float] = 0.5,
fallback_lang: Optional[str] = "en",
max_len: t.Optional[int] = 1000,
min_prob: t.Optional[float] = 0.5,
fallback_lang: t.Optional[str] = "en",
**kwargs,
) -> Iterable[Optional[Doc]]:
) -> Iterable[t.Optional[Doc]]:
"""
Args:
texts
Expand Down
4 changes: 2 additions & 2 deletions colandr/lib/preprocessors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
import pathlib
from typing import BinaryIO
import typing as t

from ..apis import schemas
from . import fileio, sanitizers
Expand All @@ -10,7 +10,7 @@


def preprocess_citations(
path_or_stream: BinaryIO | pathlib.Path, fname: str, review_id: int
path_or_stream: t.BinaryIO | pathlib.Path, fname: str, review_id: int
) -> list[dict]:
if fname.endswith(".bib"):
try:
Expand Down
4 changes: 2 additions & 2 deletions colandr/lib/sanitizers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from typing import Any
import typing as t

from . import constants


def sanitize_citation(record: dict[str, Any]) -> dict[str, Any]:
def sanitize_citation(record: dict[str, t.Any]) -> dict[str, t.Any]:
"""
Sanitize keys/values of a 'raw' citation record into something suitable
for insertion into the corresponding database table.
Expand Down
Loading

0 comments on commit 90fb3ec

Please sign in to comment.