From 9960ea8941f2fbba3a51930cb65845d643cafedf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 1 Oct 2019 18:00:31 +0200 Subject: [PATCH 1/2] Instantiate parsers only once --- parsel/parser/__init__.py | 36 +++++++++++++++++++++++++++++++++++ parsel/parser/html.py | 3 +++ parsel/parser/xml.py | 3 +++ parsel/selector.py | 39 +++++++++++++++++++++++++++++++++----- tests/test_deprecations.py | 21 ++++++++++++++++++++ tests/test_selector.py | 22 +++++++++++++++++++++ 6 files changed, 119 insertions(+), 5 deletions(-) create mode 100644 parsel/parser/__init__.py create mode 100644 parsel/parser/html.py create mode 100644 parsel/parser/xml.py create mode 100644 tests/test_deprecations.py diff --git a/parsel/parser/__init__.py b/parsel/parser/__init__.py new file mode 100644 index 00000000..f0bb4574 --- /dev/null +++ b/parsel/parser/__init__.py @@ -0,0 +1,36 @@ +from lxml import etree +from lxml.etree import XMLParser as _UnsafeXMLParser +from lxml.html import HTMLParser as _HTMLParser + + +class _LXMLBaseParser(object): + + def __init__(self, parser_cls): + self._parser = parser_cls(recover=True, encoding='utf8') + + def parse(self, text, base_url): + body = text.strip().replace('\x00', '').encode('utf8') or b'' + root = etree.fromstring(body, parser=self._parser, base_url=base_url) + if root is None: + root = etree.fromstring(b'', parser=self._parser, + base_url=base_url) + return root + + +class HTMLParser(_LXMLBaseParser): + + def __init__(self): + super(HTMLParser, self).__init__(_HTMLParser) + + +class _XMLParser(_UnsafeXMLParser): + + def __init__(self, *args, **kwargs): + kwargs.setdefault('resolve_entities', False) + super(_XMLParser, self).__init__(*args, **kwargs) + + +class XMLParser(_LXMLBaseParser): + + def __init__(self): + super(XMLParser, self).__init__(_XMLParser) diff --git a/parsel/parser/html.py b/parsel/parser/html.py new file mode 100644 index 00000000..305fd6d1 --- /dev/null +++ b/parsel/parser/html.py @@ -0,0 +1,3 @@ +from parsel.parser import HTMLParser + +HTML_PARSER = HTMLParser() diff --git a/parsel/parser/xml.py b/parsel/parser/xml.py new file mode 100644 index 00000000..f54bb55e --- /dev/null +++ b/parsel/parser/xml.py @@ -0,0 +1,3 @@ +from parsel.parser import XMLParser + +XML_PARSER = XMLParser() diff --git a/parsel/selector.py b/parsel/selector.py index 41315cbc..da8381e1 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -3,14 +3,39 @@ """ import sys +from importlib import import_module +from warnings import warn import six -from lxml import etree, html +from lxml import etree from .utils import flatten, iflatten, extract_regex, shorten from .csstranslator import HTMLTranslator, GenericTranslator +def _load_object(path): + """Load an object given its absolute object path, and return it. + + `path` can point to a class, function, variable or a class instance. For + example: ``'parsel.parser.html.HTML_PARSER'``. + """ + + try: + dot = path.rindex('.') + except ValueError: + raise ValueError("Error loading object '%s': not a full path" % path) + + module, name = path[:dot], path[dot+1:] + mod = import_module(module) + + try: + obj = getattr(mod, name) + except AttributeError: + raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name)) + + return obj + + class CannotRemoveElementWithoutRoot(Exception): pass @@ -21,14 +46,16 @@ class CannotRemoveElementWithoutParent(Exception): class SafeXMLParser(etree.XMLParser): def __init__(self, *args, **kwargs): + warn('parsel.selector.SafeXMLParser is deprecated', + DeprecationWarning, stacklevel=2) kwargs.setdefault('resolve_entities', False) super(SafeXMLParser, self).__init__(*args, **kwargs) _ctgroup = { - 'html': {'_parser': html.HTMLParser, + 'html': {'_parser': 'parsel.parser.html.HTML_PARSER', '_csstranslator': HTMLTranslator(), '_tostring_method': 'html'}, - 'xml': {'_parser': SafeXMLParser, + 'xml': {'_parser': 'parsel.parser.xml.XML_PARSER', '_csstranslator': GenericTranslator(), '_tostring_method': 'xml'}, } @@ -46,6 +73,8 @@ def _st(st): def create_root_node(text, parser_cls, base_url=None): """Create root node for text using given parser class. """ + warn('parsel.selector.create_root_node is deprecated', + DeprecationWarning, stacklevel=2) body = text.strip().replace('\x00', '').encode('utf8') or b'' parser = parser_cls(recover=True, encoding='utf8') root = etree.fromstring(body, parser=parser, base_url=base_url) @@ -195,7 +224,7 @@ class Selector(object): def __init__(self, text=None, type=None, namespaces=None, root=None, base_url=None, _expr=None): self.type = st = _st(type or self._default_type) - self._parser = _ctgroup[st]['_parser'] + self._parser = _load_object(_ctgroup[st]['_parser']) self._csstranslator = _ctgroup[st]['_csstranslator'] self._tostring_method = _ctgroup[st]['_tostring_method'] @@ -218,7 +247,7 @@ def __getstate__(self): raise TypeError("can't pickle Selector objects") def _get_root(self, text, base_url=None): - return create_root_node(text, self._parser, base_url=base_url) + return self._parser.parse(text=text, base_url=base_url) def xpath(self, query, namespaces=None, **kwargs): """ diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py new file mode 100644 index 00000000..74736c2f --- /dev/null +++ b/tests/test_deprecations.py @@ -0,0 +1,21 @@ +# -*- coding:utf-8 -*- + + +from unittest import TestCase +from warnings import catch_warnings + +from parsel.selector import create_root_node, SafeXMLParser +from lxml.html import HTMLParser + + +class TestDeprecations(TestCase): + + def test_create_root_node(self): + with catch_warnings(record=True) as warnings: + create_root_node(u'…', HTMLParser) + self.assertEqual(len(warnings), 1) + + def test_SafeXMLParser(self): + with catch_warnings(record=True) as warnings: + parser = SafeXMLParser() + self.assertEqual(len(warnings), 1) diff --git a/tests/test_selector.py b/tests/test_selector.py index 376b0f71..077c0991 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -7,6 +7,7 @@ from parsel import Selector from parsel.selector import ( + _load_object, CannotRemoveElementWithoutRoot, CannotRemoveElementWithoutParent, ) @@ -913,3 +914,24 @@ def test_set(self): //div[@itemtype="http://schema.org/Event"] //*[@itemscope]/*/@itemprop)''').extract(), [u'url', u'name', u'startDate', u'location', u'offers']) + + +try: + ModuleNotFoundError +except NameError: + ModuleNotFoundError = ImportError + + +class LoadObjectTestCase(unittest.TestCase): + + def test_incomplete_path(self): + with self.assertRaises(ValueError): + object = _load_object('parsel') + + def test_inexistent_module(self): + with self.assertRaises(ModuleNotFoundError): + object = _load_object('parsel.inexistent.inexistent') + + def test_inexistent_object(self): + with self.assertRaises(NameError): + object = _load_object('parsel.parser.inexistent') From 0e5f2aab85df4dc1d8047d3d2ce14a2a33356588 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 29 Oct 2019 11:42:31 +0100 Subject: [PATCH 2/2] Remove create_root_node from the documentation --- docs/parsel.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/parsel.rst b/docs/parsel.rst index 859e4ba6..382fd230 100644 --- a/docs/parsel.rst +++ b/docs/parsel.rst @@ -19,6 +19,7 @@ parsel.selector :members: :undoc-members: :show-inheritance: + :exclude-members: create_root_node parsel.utils