diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py index df7fb273..730266c2 100644 --- a/core/loaders/harvest.py +++ b/core/loaders/harvest.py @@ -5,7 +5,7 @@ import logging import re import time -from urllib.parse import urljoin, quote +from urllib.parse import quote, unquote, urljoin, urlparse, urlsplit, urlunsplit import requests @@ -405,6 +405,10 @@ def harvest_oapen(ebook): if is_bookshop_url(ebook.url): return set_bookshop(ebook) if '/bitstream/' in ebook.url: + if "%" in ebook.url: + (scheme, netloc, path, query, fragment) = urlsplit(ebook.url) + newpath = quote(unquote(path), encoding='latin1') + ebook.url = urlunsplit((scheme, netloc, newpath, query, fragment)) return make_dl_ebook(ebook.url, ebook, user_agent=settings.USER_AGENT) return None, 0 diff --git a/core/models/loader.py b/core/models/loader.py index 35eed91e..d5f0b141 100644 --- a/core/models/loader.py +++ b/core/models/loader.py @@ -2,7 +2,7 @@ import re import requests import time -from urllib.parse import urlparse +from urllib.parse import quote, unquote, urlparse, urlsplit, urlunsplit from django.apps import apps from django.conf import settings @@ -73,16 +73,41 @@ def __init__(self): self.last_call = dict() def content_type(self, url): + def handle_ude(url, ude): + # fallback for non-ascii, non-utf8 bytes in redirect location + (scheme, netloc, path, query, fragment) = urlsplit(url) + newpath = quote(unquote(path), encoding='latin1') + url = urlunsplit((scheme, netloc, newpath, query, fragment)) + try: + r = requests.get(url, allow_redirects=True) + except: + logger.error('Error processing %s after unicode error', url) + return '', '' try: - r = requests.head(url, allow_redirects=True) - if r.status_code == 405: - r = requests.get(url) - elif r.status_code == 404: - logger.error('File not found (404) for %s', url) - return '404', '' - return r.headers.get('content-type', ''), r.headers.get('content-disposition', '') + try: + r = requests.head(url, allow_redirects=True) + if r.status_code == 405: + try: + r = requests.get(url) + except UnicodeDecodeError as ude: + if 'utf-8' in str(ude): + return handle_ude(url, ude) + except UnicodeDecodeError as ude: + if 'utf-8' in str(ude): + return handle_ude(url, ude) + except requests.exceptions.SSLError: + try: + r = requests.get(url, verify=False) + except: + logger.error('Error processing %s verification off', url) + return '', '' except: + logger.error('Error processing %s', url) return '', '' + if r.status_code == 404: + logger.error('File not found (404) for %s', url) + return '404', '' + return r.headers.get('content-type', ''), r.headers.get('content-disposition', '') def calc_type(self, url): logger.info(url) diff --git a/settings/common.py b/settings/common.py index 7b9dcf3d..740092d7 100644 --- a/settings/common.py +++ b/settings/common.py @@ -333,7 +333,7 @@ LOGOUT_URL = "/accounts/logout/" LOGIN_ERROR_URL = '/accounts/login-error/' -USER_AGENT = "unglue.it.bot v0.0.1 " +USER_AGENT = "unglue.it.bot v0.0.1 (https://unglue.it)" # The amount of the transaction that Gluejar takes GLUEJAR_COMMISSION = 0.06