Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

americana, catch googlebooks exceptions #1018

Merged
merged 3 commits into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 24 additions & 13 deletions core/loaders/harvest.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,10 @@ def harvesters(ebook):
yield OPENBOOKPUB.search(ebook.url), harvest_obp
yield ebook.provider == 'Transcript-Verlag', harvest_transcript
yield ebook.provider == 'ksp.kit.edu', harvest_ksp
yield ebook.provider == 'digitalis.uc.pt', harvest_digitalis
yield ebook.provider in ['digitalis.uc.pt', 'repositorio.americana.edu.co'], harvest_dspace2
yield ebook.provider in ['repositorio.americana.edu.co'], harvest_dspace2
yield ebook.provider == 'nomos-elibrary.de', harvest_nomos
yield ebook.provider == 'digitalis.uc.pt', harvest_digitalis
yield 'frontiersin.org' in ebook.provider, harvest_frontiersin
yield ebook.provider in ['Palgrave Connect', 'Springer', 'springer.com'], harvest_springerlink
yield ebook.provider == 'pulp.up.ac.za', harvest_pulp
Expand Down Expand Up @@ -537,19 +539,12 @@ def selector(doc):
return doc.select_one('p.linkForPDF a')
return harvest_one_generic(ebook, selector)


def harvest_digitalis(ebook):
doc = get_soup(ebook.url)
if doc:
obj = doc.find('meta', attrs={"name": "citation_pdf_url"})
if obj:
dl_url = urljoin(ebook.url, obj.get('content', None))
if dl_url:
return make_dl_ebook(dl_url, ebook)
else:
logger.warning('couldn\'t get dl_url for %s', ebook.url)
else:
logger.warning('couldn\'t get soup for %s', ebook.url)
return None, 0
def selector(doc):
return doc.select_one('a.item-download-button')
return harvest_one_generic(ebook, selector)


NOMOSPDF = re.compile('download_full_pdf')
def harvest_nomos(ebook):
Expand Down Expand Up @@ -715,6 +710,22 @@ def selector(doc):
return harvest_one_generic(ebook, selector)


def harvest_dspace2(ebook):
    """Harvest an ebook via the ``citation_pdf_url`` meta tag of its landing page.

    The page at ``ebook.url`` is fetched with ``get_soup``; the ``content``
    attribute of its ``<meta name="citation_pdf_url">`` element is resolved
    against ``ebook.url``, an ``http://`` scheme is upgraded to ``https://``,
    and the resulting URL is handed to ``make_dl_ebook``.

    Returns the result of ``make_dl_ebook(dl_url, ebook)`` on success, or
    ``(None, 0)`` when the page or the download link cannot be obtained.
    """
    doc = get_soup(ebook.url)
    if not doc:
        logger.warning('couldn\'t get soup for %s', ebook.url)
        return None, 0

    meta = doc.find('meta', attrs={"name": "citation_pdf_url"})
    # Validate the raw attribute *before* joining: urljoin() returns the base
    # URL unchanged when the reference is empty or None, which would make the
    # landing page itself look like a valid download link.
    href = meta.get('content', None) if meta else None
    if not href:
        logger.warning('couldn\'t get dl_url for %s', ebook.url)
        return None, 0

    dl_url = urljoin(ebook.url, href)
    # Upgrade only the scheme; an 'http://' embedded later in the URL
    # (e.g. inside a query string) must be left untouched.
    if dl_url.startswith('http://'):
        dl_url = 'https://' + dl_url[len('http://'):]
    return make_dl_ebook(dl_url, ebook)


# won't harvest page-image books
def harvest_unt(ebook):
def selector(doc):
Expand Down
7 changes: 6 additions & 1 deletion frontend/views/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from django.urls import reverse, reverse_lazy
from django.core.validators import validate_email
from django.db.models import Q, Count, Sum
from django.db.utils import IntegrityError
from django.forms import Select
from django.forms.models import inlineformset_factory
from django.http import (
Expand Down Expand Up @@ -559,14 +560,18 @@ def googlebooks(request, googlebooks_id):
return HttpResponseNotFound("failed looking up googlebooks id %s" % googlebooks_id)
try:
edition = bookloader.add_by_googlebooks_id(googlebooks_id)
if edition.new:
if edition and edition.new:
# add related editions asynchronously
tasks.populate_edition.delay(edition.isbn_13)
if request.user.is_authenticated:
request.user.profile.works.add(edition.work)
except bookloader.LookupFailure:
logger.warning("failed to load googlebooks_id %s" % googlebooks_id)
return HttpResponseNotFound("failed looking up googlebooks id %s" % googlebooks_id)
except IntegrityError:
logger.warning("duplicate (maybe) googlebooks_id %s" % googlebooks_id)
return HttpResponseNotFound("failed adding googlebooks id %s" % googlebooks_id)

if not edition:
return HttpResponseNotFound("invalid googlebooks id")
work_url = reverse('work', kwargs={'work_id': edition.work_id})
Expand Down