Wikidata v0 with author description and infobox #9130

Merged · 7 commits · May 2, 2024
10 changes: 10 additions & 0 deletions openlibrary/core/models.py
@@ -29,6 +29,7 @@
from openlibrary.core.ratings import Ratings
from openlibrary.utils import extract_numeric_id_from_olid, dateutil
from openlibrary.utils.isbn import to_isbn_13, isbn_13_to_isbn_10, canonical
from openlibrary.core.wikidata import WikidataEntity, get_wikidata_entity

from . import cache, waitinglist

@@ -785,6 +786,15 @@ def url(self, suffix="", **params):
def get_url_suffix(self):
return self.name or "unnamed"

def wikidata(
self, bust_cache: bool = False, fetch_missing: bool = False
) -> WikidataEntity | None:
if wd_id := self.remote_ids.get("wikidata"):
return get_wikidata_entity(
qid=wd_id, bust_cache=bust_cache, fetch_missing=fetch_missing
)
return None

def __repr__(self):
return "<Author: %s>" % repr(self.key)

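To illustrate how the new accessor might be consumed elsewhere, here is a minimal hypothetical helper (not part of this diff) that returns an author's Wikidata description when a QID is linked:

from openlibrary.core.models import Author

def author_description(author: Author, lang: str = 'en') -> str | None:
    # Cache-first lookup of the linked Wikidata entity; returns None when no
    # QID is present in remote_ids or nothing is cached/fetchable.
    entity = author.wikidata(fetch_missing=True)
    return entity.get_description(lang) if entity else None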
6 changes: 6 additions & 0 deletions openlibrary/core/schema.sql
@@ -101,3 +101,9 @@ CREATE TABLE yearly_reading_goals (
updated timestamp without time zone default (current_timestamp at time zone 'utc'),
primary key (username, year)
);

CREATE TABLE wikidata (
id text not null primary key,
data json,
updated timestamp without time zone default (current_timestamp at time zone 'utc')
);
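For orientation, a hedged Python sketch (values illustrative) of what one row in this table holds: the data column stores the raw Wikidata REST payload as JSON, and updated records when Open Library fetched the entity, not when it last changed on Wikidata.

import json
from datetime import datetime

# Illustrative cached row for Douglas Adams (Q42); the payload is truncated.
row = {
    'id': 'Q42',  # the Wikidata QID doubles as the primary key
    'data': json.dumps(
        {
            'id': 'Q42',
            'type': 'item',
            'labels': {'en': 'Douglas Adams'},
            'descriptions': {'en': 'English author'},
            'aliases': {},
            'statements': {},
            'sitelinks': {},
        }
    ),
    'updated': datetime.now(),  # fetch time; the column defaults to UTC now on insert
}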
144 changes: 144 additions & 0 deletions openlibrary/core/wikidata.py
@@ -0,0 +1,144 @@
"""
The purpose of this file is to:
1. Interact with the Wikidata API
2. Store the results
3. Make the results easy to access from other files
"""

import requests
import logging
from dataclasses import dataclass
from openlibrary.core.helpers import days_since

from datetime import datetime
import json
from openlibrary.core import db

logger = logging.getLogger("core.wikidata")

WIKIDATA_API_URL = 'https://www.wikidata.org/w/rest.php/wikibase/v0/entities/items/'
WIKIDATA_CACHE_TTL_DAYS = 30


@dataclass
class WikidataEntity:
"""
This is the model of the API response from Wikidata, plus the _updated field
https://www.wikidata.org/wiki/Wikidata:REST_API
"""

id: str
type: str
labels: dict[str, str]
descriptions: dict[str, str]
aliases: dict[str, list[str]]
statements: dict[str, dict]
sitelinks: dict[str, dict]
_updated: datetime # This is when we fetched the data, not when the entity was changed in Wikidata

def get_description(self, language: str = 'en') -> str | None:
"""If a description isn't available in the requested language default to English"""
return self.descriptions.get(language) or self.descriptions.get('en')

@classmethod
def from_dict(cls, response: dict, updated: datetime):
return cls(
**response,
_updated=updated,
)

def to_wikidata_api_json_format(self) -> str:
"""
Transforms the dataclass into a JSON string like the one we get from the Wikidata API.
This is used for storing the JSON in the database.
"""
entity_dict = {
'id': self.id,
'type': self.type,
'labels': self.labels,
'descriptions': self.descriptions,
'aliases': self.aliases,
'statements': self.statements,
'sitelinks': self.sitelinks,
}
return json.dumps(entity_dict)


def _cache_expired(entity: WikidataEntity) -> bool:
return days_since(entity._updated) > WIKIDATA_CACHE_TTL_DAYS


def get_wikidata_entity(
qid: str, bust_cache: bool = False, fetch_missing: bool = False
) -> WikidataEntity | None:
"""
This only supports QIDs; if we want to support PIDs we need to use different endpoints.
By default this will only use the cache (unless the cached entry has expired),
to avoid overwhelming Wikidata's servers with a request on every visit to an author page.
bust_cache must be set to True if you want to force a fresh fetch from Wikidata.
# TODO: After bulk data imports we should set fetch_missing to True (or remove it).
"""
if bust_cache:
return _get_from_web(qid)

if entity := _get_from_cache(qid):
if _cache_expired(entity):
return _get_from_web(qid)
return entity
elif fetch_missing:
return _get_from_web(qid)

return None


def _get_from_web(id: str) -> WikidataEntity | None:
response = requests.get(f'{WIKIDATA_API_URL}{id}')
if response.status_code == 200:
entity = WikidataEntity.from_dict(
response=response.json(), updated=datetime.now()
)
_add_to_cache(entity)
return entity
else:
logger.error(f'Wikidata Response: {response.status_code}, id: {id}')
return None
# Responses documented here https://doc.wikimedia.org/Wikibase/master/js/rest-api/


def _get_from_cache_by_ids(ids: list[str]) -> list[WikidataEntity]:
response = list(
db.get_db().query(
'select * from wikidata where id IN ($ids)',
vars={'ids': ids},
)
)
return [
WikidataEntity.from_dict(response=r.data, updated=r.updated) for r in response
]


def _get_from_cache(id: str) -> WikidataEntity | None:
"""
The cache is Open Library's Postgres database; we read from it instead of calling the Wikidata API on every request.
"""
if result := _get_from_cache_by_ids([id]):
return result[0]
return None


def _add_to_cache(entity: WikidataEntity) -> None:
# TODO: after we upgrade to postgres 9.5+ we should use upsert here
oldb = db.get_db()
json_data = entity.to_wikidata_api_json_format()

if _get_from_cache(entity.id):
return oldb.update(
"wikidata",
where="id=$id",
vars={'id': entity.id},
data=json_data,
updated=entity._updated,
)
else:
# We don't provide the updated column on insert because postgres defaults to the current time
return oldb.insert("wikidata", id=entity.id, data=json_data)
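Regarding the TODO above about switching to an upsert once Postgres is 9.5+: a hedged sketch of what a single-statement version might look like using web.py's query interface (the function name is hypothetical, and this is not part of the diff):

from openlibrary.core import db
from openlibrary.core.wikidata import WikidataEntity

def _add_to_cache_with_upsert(entity: WikidataEntity) -> None:
    # One round trip: insert the row, or update data/updated if the QID already exists.
    db.get_db().query(
        """
        INSERT INTO wikidata (id, data, updated)
        VALUES ($id, $data, $updated)
        ON CONFLICT (id) DO UPDATE
        SET data = EXCLUDED.data, updated = EXCLUDED.updated
        """,
        vars={
            'id': entity.id,
            'data': entity.to_wikidata_api_json_format(),
            'updated': entity._updated,
        },
    )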
16 changes: 12 additions & 4 deletions openlibrary/i18n/messages.pot
@@ -2212,6 +2212,18 @@ msgstr ""
msgid "including <i>%(topwork)s</i>"
msgstr ""

#: authors/infobox.html
msgid "Born"
msgstr ""

#: authors/infobox.html
msgid "Died"
msgstr ""

#: authors/infobox.html type/author/edit.html
msgid "Date"
msgstr ""

#: book_providers/gutenberg_download_options.html
#: book_providers/ia_download_options.html
#: book_providers/librivox_download_options.html
@@ -5264,10 +5276,6 @@ msgid ""
"is recommended."
msgstr ""

#: type/author/edit.html
msgid "Date"
msgstr ""

#: type/author/edit.html
msgid ""
"This is a deprecated field. You can help improve this record by removing "
1 change: 1 addition & 0 deletions openlibrary/plugins/wikidata/__init__.py
@@ -0,0 +1 @@
'wikidata plugin.'
33 changes: 33 additions & 0 deletions openlibrary/templates/authors/infobox.html
@@ -0,0 +1,33 @@
$def with (page, edit_view: bool = False)

$ is_librarian = ctx.user and (ctx.user.is_librarian() or ctx.user.is_super_librarian() or ctx.user.is_admin())

$if edit_view:
$ wikidata = page.wikidata(bust_cache=True, fetch_missing=True)
$else:
$ wikidata = page.wikidata(fetch_missing=is_librarian)

$def render_infobox_row(label, itemprop, value):
$if value:
<tr>
<td><strong>$label</strong></td>
<td><span itemprop="$itemprop">$value</span></td>
</tr>

<div class="infobox">
<div class="illustration">
$:render_template("covers/author_photo", page)
$:render_template("covers/change", page, ".bookCover img")
</div>
<p class="short-description">
$if wikidata:
$wikidata.get_description(i18n.get_locale())
</p>
<table>
$if page.birth_date or page.death_date:
$:render_infobox_row(_("Born"), 'birthDate', page.birth_date)
$:render_infobox_row(_("Died"), 'deathDate', page.death_date)
$elif page.date:
$:render_infobox_row(_("Date"), '', page.date)
</table>
</div>
6 changes: 2 additions & 4 deletions openlibrary/templates/type/author/edit.html
@@ -128,10 +128,8 @@ <h1>$_("Edit Author")</h1>

</div>

<div class="illustration">
$:render_template("covers/author_photo", page)
$:render_template("covers/change", page)
</div>

$:render_template("authors/infobox", page, edit_view=True)

<div class="clearfix"></div>

29 changes: 15 additions & 14 deletions openlibrary/templates/type/author/view.html
@@ -40,18 +40,20 @@

$set_share_links(url=request.canonical_url, title=title, view_context=ctx)

$ show_librarian_extras = ctx.user and (ctx.user.is_admin() or ctx.user.is_usergroup_member('/usergroup/librarians'))
<div id="contentHead">

$:macros.databarView(page)

<h1 itemprop="name">$title</h1>
<h2 class="author collapse">
$if page.birth_date or page.death_date:
<span itemprop="birthDate">$page.birth_date</span> - <span itemprop="deathDate">$page.death_date</span>
$else:
$if page.date:
$page.date
</h2>
$if show_librarian_extras:
<h2 class="author collapse">
$if page.birth_date or page.death_date:
<span itemprop="birthDate">$page.birth_date</span> - <span itemprop="deathDate">$page.death_date</span>
$else:
$if page.date:
$page.date
</h2>

</div>

@@ -107,7 +109,9 @@ <h2 class="author collapse">
<h6 class="collapse black uppercase">$_("Location")</h6>
$page.location
</div>

<span class="mobile-only">
$:render_template("authors/infobox", page)
</span>
<div class="clearfix"></div>
<div id="works" class="section">
<h2 class="collapse">
@@ -139,7 +143,6 @@ <h2 class="collapse">

<div id="searchResults">
<ul class="list-books">
$ show_librarian_extras = ctx.user and (ctx.user.is_admin() or ctx.user.is_usergroup_member('/usergroup/librarians'))
$for doc in books.docs:
$:macros.SearchResultsWork(doc, show_librarian_extras=show_librarian_extras, include_dropper=True)
</ul>
@@ -149,11 +152,9 @@ <h2 class="collapse">
</div>
</div>
<div class="contentOnethird">
<div class="illustration">
$:render_template("covers/author_photo", page)
$:render_template("covers/change", page, ".bookCover img")
</div>

<span class="desktop-only">
$:render_template("authors/infobox", page)
</span>
$def render_subjects(label, subjects, prefix):
$if subjects:
<div class="section link-box link-box--with-header">
77 changes: 77 additions & 0 deletions openlibrary/tests/core/test_wikidata.py
@@ -0,0 +1,77 @@
import pytest
from unittest.mock import patch
from openlibrary.core import wikidata
from datetime import datetime, timedelta

EXAMPLE_WIKIDATA_DICT = {
'id': "Q42",
'type': 'str',
'labels': {'en': ''},
'descriptions': {'en': ''},
'aliases': {'en': ['']},
'statements': {'': {}},
'sitelinks': {'': {}},
}


def createWikidataEntity(
qid: str = "Q42", expired: bool = False
) -> wikidata.WikidataEntity:
merged_dict = EXAMPLE_WIKIDATA_DICT.copy()
merged_dict['id'] = qid
updated_days_ago = wikidata.WIKIDATA_CACHE_TTL_DAYS + 1 if expired else 0
return wikidata.WikidataEntity.from_dict(
merged_dict, datetime.now() - timedelta(days=updated_days_ago)
)


EXPIRED = "expired"
MISSING = "missing"
VALID_CACHE = ""


@pytest.mark.parametrize(
"bust_cache, fetch_missing, status, expected_web_call, expected_cache_call",
[
# if bust_cache, always call web, never call cache
(True, True, VALID_CACHE, True, False),
(True, False, VALID_CACHE, True, False),
# if not fetch_missing, only call web when expired
(False, False, VALID_CACHE, False, True),
(False, False, EXPIRED, True, True),
# if fetch_missing, only call web when missing or expired
(False, True, VALID_CACHE, False, True),
(False, True, MISSING, True, True),
(False, True, EXPIRED, True, True),
],
)
def test_get_wikidata_entity(
bust_cache: bool,
fetch_missing: bool,
status: str,
expected_web_call: bool,
expected_cache_call: bool,
) -> None:
with (
patch.object(wikidata, "_get_from_cache") as mock_get_from_cache,
patch.object(wikidata, "_get_from_web") as mock_get_from_web,
):
if status == EXPIRED:
mock_get_from_cache.return_value = createWikidataEntity(expired=True)
elif status == MISSING:
mock_get_from_cache.return_value = None
else:
mock_get_from_cache.return_value = createWikidataEntity()

wikidata.get_wikidata_entity(
'Q42', bust_cache=bust_cache, fetch_missing=fetch_missing
)
if expected_web_call:
mock_get_from_web.assert_called_once()
else:
mock_get_from_web.assert_not_called()

if expected_cache_call:
mock_get_from_cache.assert_called_once()
else:
mock_get_from_cache.assert_not_called()