Fix #237: urllib3 transport fails without pycurl installed #239

Merged: 5 commits, Feb 22, 2017
3 changes: 3 additions & 0 deletions .flake8
@@ -0,0 +1,3 @@
+[flake8]
+ignore = E261,E265,W503
+exclude = test/files/invalid_import.py
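
Note: the ignored checks are E261 (at least two spaces before an inline comment), E265 (block comment should start with '# ') and W503 (line break before a binary operator). E265, for example, fires on commented-out entries such as the one kept in the __slots__ tuple of grab/base.py below; a minimal reproduction:

    # flake8 reports E265 on the commented-out slot because the comment
    # text starts right after the '#' with no space:
    __slots__ = (
        'request_head', 'request_body',
        #'request_log',
    )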
6 changes: 3 additions & 3 deletions .travis.yml
@@ -17,7 +17,7 @@ matrix:
   include:
     - os: linux
       python: 3.4
-      env: TOX_ENV=pylint
+      env: TOX_ENV=quality
     - os: linux
       env: TOX_ENV=py27
     - os: linux
@@ -109,8 +109,8 @@ before_script:
 
 script:
   - tox -e $TOX_ENV
-  - if [[ $TOX_ENV != pylint* ]]; then tox -e "$TOX_ENV-threaded-urllib3"; fi
-  - if [[ $TOX_ENV != pylint* ]]; then tox -e "$TOX_ENV-threaded-pycurl"; fi
+  - if [[ $TOX_ENV != quality* ]]; then tox -e "$TOX_ENV-threaded-urllib3"; fi
+  - if [[ $TOX_ENV != quality* ]]; then tox -e "$TOX_ENV-threaded-pycurl"; fi
 
 after_success:
   - coveralls
61 changes: 37 additions & 24 deletions grab/base.py
@@ -47,6 +47,7 @@
     'pycurl': 'grab.transport.curl.CurlTransport',
     'urllib3': 'grab.transport.urllib3.Urllib3Transport',
 }
+DEFAULT_TRANSPORT = 'pycurl'
 
 # pylint: disable=invalid-name
 logger = logging.getLogger('grab.base')
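
Note: DEFAULT_TRANSPORT plus TRANSPORT_ALIAS is all the data the new transport resolution needs. A standalone sketch of the lookup (resolve_transport is a hypothetical helper for illustration; the real logic lives in setup_transport below):

    TRANSPORT_ALIAS = {
        'pycurl': 'grab.transport.curl.CurlTransport',
        'urllib3': 'grab.transport.urllib3.Urllib3Transport',
    }
    DEFAULT_TRANSPORT = 'pycurl'

    def resolve_transport(name=None):
        # None falls back to the default alias; a known alias expands to a
        # dotted class path; anything else must already be a dotted path.
        if name is None:
            name = DEFAULT_TRANSPORT
        name = TRANSPORT_ALIAS.get(name, name)
        if '.' not in name:
            raise ValueError('Unknown transport: %s' % name)
        return name

    print(resolve_transport())           # grab.transport.curl.CurlTransport
    print(resolve_transport('urllib3'))  # grab.transport.urllib3.Urllib3Transport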
@@ -182,18 +183,19 @@ def default_config():
 
 class Grab(DeprecatedThings):
 
-    __slots__ = ('request_head', 'request_body',
-                 #'request_log',
-                 'proxylist', 'config',
-                 'transport',
-                 'transport_param', 'request_method', 'request_counter',
-                 '__weakref__', 'cookies',
-                 'meta',
-
-                 # Dirty hack to make it possible to inherit Grab from
-                 # multiple base classes with __slots__
-                 '_doc',
-                 )
+    __slots__ = (
+        'request_head', 'request_body',
+        #'request_log',
+        'proxylist', 'config',
+        'transport',
+        'transport_param', 'request_method', 'request_counter',
+        '__weakref__', 'cookies',
+        'meta',
+
+        # Dirty hack to make it possible to inherit Grab from
+        # multiple base classes with __slots__
+        '_doc',
+    )
 
     # Attributes which should be processed when clone
     # of Grab instance is creating
@@ -209,7 +211,7 @@ class Grab(DeprecatedThings):
     #
 
     def __init__(self, document_body=None,
-                 transport='pycurl', **kwargs):
+                 transport=None, **kwargs):
         """
         Create Grab instance
         """
@@ -226,8 +228,9 @@ def __init__(self, document_body=None,
         self.request_head = None
         self.request_body = None
         self.request_method = None
+        self.transport_param = transport
+        self.transport = None
 
-        self.setup_transport(transport)
         self.reset()
         if kwargs:
             self.setup(**kwargs)
@@ -244,12 +247,18 @@ def _set_doc(self, obj):
 
     doc = property(_get_doc, _set_doc)
 
-    def setup_transport(self, transport_param):
-        self.transport_param = transport_param
+    def setup_transport(self, transport_param, reset=False):
+        if self.transport is not None and not reset:
+            raise error.GrabMisuseError(
+                'Transport is already set up. Use'
+                ' setup_transport(..., reset=True) to explicitly setup'
+                ' new transport')
+        if transport_param is None:
+            transport_param = DEFAULT_TRANSPORT
         if isinstance(transport_param, six.string_types):
             if transport_param in TRANSPORT_ALIAS:
                 transport_param = TRANSPORT_ALIAS[transport_param]
-            if not '.' in transport_param:
+            if '.' not in transport_param:
                 raise error.GrabMisuseError('Unknown transport: %s'
                                             % transport_param)
         else:
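
Note: with the guard above, configuring a transport twice is an explicit error unless reset=True is passed. A usage sketch (assumes grab and both transport backends are installed):

    from grab import Grab
    from grab.error import GrabMisuseError

    g = Grab()
    g.setup_transport('urllib3')       # resolves the alias, builds the transport
    try:
        g.setup_transport('pycurl')    # second call without reset=True
    except GrabMisuseError:
        print('transport is already set up')
    g.setup_transport('pycurl', reset=True)  # explicit replacement is fine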
@@ -281,7 +290,8 @@ def reset(self):
         self.request_body = None
         self.request_method = None
         self.request_counter = None
-        self.transport.reset()
+        if self.transport:
+            self.transport.reset()
 
     def clone(self, **kwargs):
         """
@@ -397,6 +407,8 @@ def prepare_request(self, **kwargs):
         transport extension.
         """
 
+        if self.transport is None:
+            self.setup_transport(self.transport_param)
         self.reset()
         self.request_counter = next(REQUEST_COUNTER)
         if kwargs:
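
Note: this is the core of the fix for #237: transport resolution now happens on first request instead of in __init__, so merely constructing Grab no longer imports pycurl. Roughly (placeholder URL):

    from grab import Grab

    # Works on a machine without pycurl: no transport module is imported yet.
    g = Grab(transport='urllib3')

    # The urllib3 transport is only set up here, inside prepare_request().
    g.go('http://example.com')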
@@ -430,10 +442,13 @@ def log_request(self, extra=''):
             proxy_info = ''
         if extra:
             extra = '[%s] ' % extra
-        logger_network.debug('[%02d%s] %s%s %s%s',
-                             self.request_counter, thread_name,
-                             extra, self.request_method or 'GET',
-                             self.config['url'], proxy_info)
+        logger_network.debug(
+            '[%s%s] %s%s %s%s',
+            ('%02d' % self.request_counter
+             if self.request_counter is not None else 'NA'),
+            thread_name,
+            extra, self.request_method or 'GET',
+            self.config['url'], proxy_info)
 
     def request(self, **kwargs):
         """
@@ -445,7 +460,6 @@
         Returns: ``Document`` objects.
         """
 
-
         self.prepare_request(**kwargs)
         refresh_count = 0
 
@@ -638,7 +652,6 @@ def change_proxy(self, random=True):
         else:
             logger.debug('Proxy list is empty')
 
-
     #
     # Private methods
     #
47 changes: 25 additions & 22 deletions grab/document.py
@@ -84,16 +84,17 @@ class Document(object):
     i.e. result of network request)
     """
 
-    __slots__ = ('status', 'code', 'head', '_bytes_body',
-                 'body_path', 'headers', 'url', 'cookies',
-                 'charset', '_unicode_body',
-                 'bom', 'timestamp',
-                 'name_lookup_time', 'connect_time', 'total_time',
-                 'download_size', 'upload_size', 'download_speed',
-                 'error_code', 'error_msg', 'grab', 'remote_ip',
-                 '_lxml_tree', '_strict_lxml_tree', '_pyquery',
-                 '_lxml_form', '_file_fields', 'from_cache',
-                 )
+    __slots__ = (
+        'status', 'code', 'head', '_bytes_body',
+        'body_path', 'headers', 'url', 'cookies',
+        'charset', '_unicode_body',
+        'bom', 'timestamp',
+        'name_lookup_time', 'connect_time', 'total_time',
+        'download_size', 'upload_size', 'download_speed',
+        'error_code', 'error_msg', 'grab', 'remote_ip',
+        '_lxml_tree', '_strict_lxml_tree', '_pyquery',
+        '_lxml_form', '_file_fields', 'from_cache',
+    )
 
     def __init__(self, grab=None):
         if grab is None:
@@ -452,6 +453,7 @@ def rex_text(self, regexp, flags=0, byte=False, default=NULL):
         `response.unicode_body()` else the rex is searched in `response.body`.
         """
 
+        # pylint: disable=no-member
         try:
             match = self.rex_search(regexp, flags=flags, byte=byte)
         except DataNotFound:
@@ -460,7 +462,7 @@
             else:
                 return default
         else:
-            return normalize_space(decode_entities(match.group(1))) # pylint: disable=no-member
+            return normalize_space(decode_entities(match.group(1)))
 
     def rex_search(self, regexp, flags=0, byte=False, default=NULL):
         """
@@ -582,7 +584,6 @@ def _write_body(self, body):
 
     body = property(_read_body, _write_body)
 
-
     # DomTreeExtension methods
 
     @property
@@ -813,7 +814,8 @@ def set_input_by_id(self, _id, value):
             self.choose_form_by_element(xpath)
         sel = XpathSelector(self.form)
         elem = sel.select(xpath).node()
-        return self.set_input(elem.get('name'), value) # pylint: disable=no-member
+        # pylint: disable=no-member
+        return self.set_input(elem.get('name'), value)
 
     def set_input_by_number(self, number, value):
         """
@@ -847,7 +849,8 @@ def set_input_by_xpath(self, xpath, value):
                     self._lxml_form = parent
                     break
 
-        return self.set_input(elem.get('name'), value) # pylint: disable=no-member
+        # pylint: disable=no-member
+        return self.set_input(elem.get('name'), value)
 
     # FIXME:
     # * Remove set_input_by_id
@@ -900,14 +903,13 @@ def submit(self, submit_name=None, make_request=True,
             g.submit()
         """
 
         # TODO: add .x and .y items
         # if submit element is image
+        # pylint: disable=no-member
 
         post = self.form_fields()
 
         # Build list of submit buttons which have a name
         submit_controls = {}
-        for elem in self.form.inputs: # pylint: disable=no-member
+        for elem in self.form.inputs:
             if (elem.tag == 'input' and elem.type == 'submit' and
                     elem.get('name') is not None):
                 submit_controls[elem.name] = elem
@@ -933,14 +935,15 @@
         if url:
             action_url = urljoin(self.url, url)
         else:
-            action_url = urljoin(self.url, self.form.action) # pylint: disable=no-member
+            action_url = urljoin(self.url,
+                                 self.form.action)
 
         # Values from `extra_post` should override values in form
         # `extra_post` allows multiple value of one key
 
         # Process saved values of file fields
-        if self.form.method == 'POST': # pylint: disable=no-member
-            if 'multipart' in self.form.get('enctype', ''): # pylint: disable=no-member
+        if self.form.method == 'POST':
+            if 'multipart' in self.form.get('enctype', ''):
                 for key, obj in self._file_fields.items():
                     post[key] = obj
 
@@ -965,8 +968,8 @@
         post_items = [(x, y) for x, y in post_items
                       if x not in remove_from_post]
 
-        if self.form.method == 'POST': # pylint: disable=no-member
-            if 'multipart' in self.form.get('enctype', ''): # pylint: disable=no-member
+        if self.form.method == 'POST':
+            if 'multipart' in self.form.get('enctype', ''):
                 self.grab.setup(multipart_post=post_items)
         else:
             self.grab.setup(post=post_items)
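
Note: for context, the form helpers touched above are normally driven through a Grab instance. A hedged sketch (URL and field name are placeholders; assumes the fetched page contains a form):

    from grab import Grab

    g = Grab(transport='urllib3')
    g.go('http://example.com/login')         # placeholder URL
    g.doc.set_input('username', 'someuser')  # fill a field by its name attribute
    g.doc.submit()                           # build the POST and send it via Grab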
13 changes: 9 additions & 4 deletions grab/proxylist.py
@@ -3,10 +3,12 @@
 import logging
 from random import randint
 from collections import namedtuple
+from six.moves.urllib.request import urlopen
+from six.moves.urllib.error import URLError
 
 import six
 
-from grab.error import GrabError, GrabNetworkError
+from grab.error import GrabError
 
 RE_SIMPLE_PROXY = re.compile(r'^([^:]+):([^:]+)$')
 RE_AUTH_PROXY = re.compile(r'^([^:]+):([^:]+):([^:]+):([^:]+)$')
@@ -100,14 +102,17 @@ def __init__(self, url, **kwargs):
         super(WebProxySource, self).__init__(**kwargs)
 
     def load_raw_data(self):
-        from grab import Grab
         limit = 3
         for count in range(limit):
            try:
-                return Grab().go(url=self.url).unicode_body()
-            except GrabNetworkError:
+                data = urlopen(self.url, timeout=3).read()
+                return data.decode('utf-8', 'ignore')
+            except URLError:
                 if count >= (limit - 1):
                     raise
                 else:
                     logger.debug('Failed to retreive proxy list from %s.'
                                  ' Retrying.', self.url)
 
 
 class ListProxySource(BaseProxySource):
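
Note: the net effect is that fetching a proxy list no longer constructs a Grab instance (which would itself require a working transport); the stdlib opener is used with the same retry shape. Restated as a standalone function (the name fetch_with_retries is invented here, not part of the PR):

    import logging
    from six.moves.urllib.request import urlopen
    from six.moves.urllib.error import URLError

    logger = logging.getLogger(__name__)

    def fetch_with_retries(url, limit=3):
        # Mirrors WebProxySource.load_raw_data: try a few times, re-raise
        # only on the final failed attempt.
        for count in range(limit):
            try:
                data = urlopen(url, timeout=3).read()
                return data.decode('utf-8', 'ignore')
            except URLError:
                if count >= (limit - 1):
                    raise
                logger.debug('Failed to retrieve proxy list from %s.'
                             ' Retrying.', url)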