Skip to content

Commit

Permalink
Fix #293: processing non-latin chars in Location header
Browse files Browse the repository at this point in the history
  • Loading branch information
lorien committed May 5, 2018
1 parent f35b71b commit 18c0d35
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 3 deletions.
7 changes: 5 additions & 2 deletions grab/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,9 +174,12 @@ def parse(self, charset=None, headers=None):
responses = self.head.rsplit(b'\nHTTP/', 1)
# Cut off the 'HTTP/*' line from the last response
_, response = responses[-1].split(b'\n', 1)
response = response.decode('ascii', 'ignore')
response = response.decode('utf-8', 'ignore')
else:
response = ''
response = u''
if six.PY2:
# email.message_from_string does not work with unicode input on Python 2
response = response.encode('utf-8')
self.headers = email.message_from_string(response)

if charset is None:
Expand Down
6 changes: 6 additions & 0 deletions grab/transport/urllib3.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,9 @@ def prepare_response(self, grab):

head = ''
for key, val in self._response.getheaders().items():
if six.PY3:
key = key.encode('latin').decode('utf-8')
val = val.encode('latin').decode('utf-8')
head += '%s: %s\r\n' % (key, val)
head += '\r\n'
response.head = make_str(head, encoding='latin', errors='ignore')
Expand Down Expand Up @@ -394,6 +397,9 @@ def read_with_timeout():
import email.message
hdr = email.message.Message()
for key, val in self._response.getheaders().items():
if six.PY3:
key = key.encode('latin').decode('utf-8')
val = val.encode('latin').decode('utf-8')
hdr[key] = val
response.parse(charset=grab.config['document_charset'],
headers=hdr)
Expand Down
2 changes: 1 addition & 1 deletion requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ cssselect
feedparser
flake8
selenium
test_server>=0.0.30
test_server>=0.0.31
mock
runscript
pyprof2calltree
Expand Down
16 changes: 16 additions & 0 deletions tests/grab_redirect.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# coding: utf-8
from six.moves.urllib.parse import quote

from grab.error import GrabTooManyRedirectsError
from tests.util import BaseGrabTestCase, build_grab
Expand Down Expand Up @@ -122,3 +124,17 @@ def test_redirect_limit(self):
grab.setup(redirect_limit=20)
grab.go(self.server.get_url())
self.assertTrue(b'done' in grab.doc.body)

def test_redirect_utf_location(self):
    """Follow a 301 redirect whose Location header holds raw UTF-8 bytes.

    The one-off first response redirects to the server URL with a
    Cyrillic suffix appended (sent as UTF-8 encoded bytes, not
    percent-escaped). The test then checks that the URL of the final
    document contains the percent-encoded form of that path.
    """
    self.server.response_once['code'] = 301
    self.server.response_once['headers'] = [
        ('Location', (self.server.get_url() + u'фыва').encode('utf-8')),
    ]
    self.server.response_once['data'] = 'content-1'
    self.server.response['data'] = 'content-2'
    grab = build_grab(debug=True, follow_location=True)
    grab.go(self.server.get_url())
    # assertIn reports both the expected fragment and the actual URL on
    # failure, so no separate debug print of the URL is needed.
    expected_path = quote(u'/фыва'.encode('utf-8'), safe='/')
    self.assertIn(expected_path, grab.doc.url)

0 comments on commit 18c0d35

Please sign in to comment.