Gluejar · eshellman · Mar 18, 2024 · Mar 18, 2024 · Mar 18, 2024 · Mar 18, 2024
diff --git a/core/loaders/harvest.py b/core/loaders/harvest.py
@@ -5,7 +5,7 @@
 import logging
 import re
 import time
-from urllib.parse import urljoin, quote
+from urllib.parse import quote, unquote, urljoin, urlparse, urlsplit, urlunsplit
 
 import requests
 
@@ -405,6 +405,18 @@ def harvest_oapen(ebook):
     if is_bookshop_url(ebook.url):
         return set_bookshop(ebook)
     if '/bitstream/' in ebook.url:
+        if "%" in ebook.url:
+            # rewrite percent-escapes in the path so that they encode latin1
+            # bytes rather than utf-8 bytes
+            (scheme, netloc, path, query, fragment) = urlsplit(ebook.url)
+            try:
+                newpath = quote(unquote(path), encoding='latin1')
+            except UnicodeEncodeError:
+                # the path holds characters with no latin1 form, or its escapes
+                # were not valid utf-8 to begin with; leave ebook.url unchanged
+                pass
+            else:
+                ebook.url = urlunsplit((scheme, netloc, newpath, query, fragment))
         return make_dl_ebook(ebook.url, ebook, user_agent=settings.USER_AGENT)
     return None, 0
 

diff --git a/core/models/loader.py b/core/models/loader.py
@@ -2,7 +2,7 @@
 import re
 import requests
 import time
-from urllib.parse import urlparse
+from urllib.parse import quote, unquote, urlparse, urlsplit, urlunsplit
 
 from django.apps import apps
 from django.conf import settings
@@ -73,16 +73,51 @@ def __init__(self):
         self.last_call = dict()
 
     def content_type(self, url):
+        """Return (content-type, content-disposition) headers for url.
+
+        Tries HEAD first, falling back to GET when the server answers 405.
+        Returns ('404', '') for a 404 response and ('', '') when no
+        response could be obtained.
+        """
+        def handle_ude(url):
+            # Fallback for non-ascii, non-utf8 bytes in redirect location:
+            # retry with the path percent-encoded as latin1 instead of utf-8.
+            # Returns the response, or None when the retry fails as well.
+            (scheme, netloc, path, query, fragment) = urlsplit(url)
+            newpath = quote(unquote(path), encoding='latin1')
+            url = urlunsplit((scheme, netloc, newpath, query, fragment))
+            try:
+                return requests.get(url, allow_redirects=True)
+            except Exception:
+                logger.error('Error processing %s after unicode error', url)
+                return None
         try:
-            r = requests.head(url, allow_redirects=True)
-            if r.status_code == 405:
-                r =  requests.get(url)
-            elif r.status_code == 404:
-                logger.error('File not found (404) for %s', url)
-                return '404', ''
-            return r.headers.get('content-type', ''), r.headers.get('content-disposition', '')
+            try:
+                r = requests.head(url, allow_redirects=True)
+                if r.status_code == 405:
+                    r = requests.get(url)
+            except UnicodeDecodeError as ude:
+                if 'utf-8' not in str(ude):
+                    # not the decoding failure this fallback is meant for;
+                    # let the generic handler below log it
+                    raise
+                r = handle_ude(url)
+                if r is None:
+                    return '', ''
+        except requests.exceptions.SSLError:
+            # NOTE: best-effort retry with certificate verification disabled
+            try:
+                r = requests.get(url, verify=False)
+            except Exception:
+                logger.error('Error processing %s verification off', url)
+                return '', ''
-        except:
+        except Exception:
+            logger.error('Error processing %s', url)
             return '', ''
+        if r.status_code == 404:
+            logger.error('File not found (404) for %s', url)
+            return '404', ''
+        return r.headers.get('content-type', ''), r.headers.get('content-disposition', '')
 
     def calc_type(self, url):
         logger.info(url)

diff --git a/settings/common.py b/settings/common.py
@@ -333,7 +333,7 @@
 LOGOUT_URL = "/accounts/logout/"
 LOGIN_ERROR_URL    = '/accounts/login-error/'
 
-USER_AGENT = "unglue.it.bot v0.0.1 <https://unglue.it>"
+USER_AGENT = "unglue.it.bot v0.0.1 (https://unglue.it)"
 
 # The amount of the transaction that Gluejar takes 
 GLUEJAR_COMMISSION = 0.06