From 1913fb79987bc994057cd91bfeabd411369edc5e Mon Sep 17 00:00:00 2001 From: Laerte Pereira <5853172+Laerte@users.noreply.github.com> Date: Fri, 28 Oct 2022 05:29:17 -0300 Subject: [PATCH] Selector.drop and SelectorList.drop methods (#247) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: Don't remove text after deleted element * Update test_selector.py * fix linter * feat: Add `drop` method, revert changes to `remove` and deprecate it * chore: Rename remove → drop * fix: linter * chore: Inherit from `CannotRemoveElementWithoutParent` exception * chore(docs): Switch `.remove` to `drop` * chore: Change tests to use `.drop()` method --- docs/usage.rst | 4 ++-- parsel/selector.py | 52 +++++++++++++++++++++++++++++++++++++++--- tests/test_selector.py | 24 +++++++++++++------ 3 files changed, 68 insertions(+), 12 deletions(-) diff --git a/docs/usage.rst b/docs/usage.rst index d0a6fb0b..dcef13db 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -400,7 +400,7 @@ Removing elements ----------------- If for any reason you need to remove elements based on a Selector or -a SelectorList, you can do it with the ``remove()`` method, available for both +a SelectorList, you can do it with the ``drop()`` method, available for both classes. .. warning:: this is a destructive action and cannot be undone. The original @@ -425,7 +425,7 @@ Example removing an ad from a blog post: >>> sel = Selector(text=doc) >>> sel.xpath('//div/text()').getall() ['Content paragraph...', '\n ', '\n Ad content...\n ', '\n ', '\n ', 'More content...'] - >>> sel.xpath('//div[@class="ad"]').remove() + >>> sel.xpath('//div[@class="ad"]').drop() >>> sel.xpath('//div//text()').getall() ['Content paragraph...', 'More content...'] diff --git a/parsel/selector.py b/parsel/selector.py index e0d5a404..b84b0308 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -4,13 +4,14 @@ import typing import warnings -from typing import Any, Dict, List, Optional, Mapping, Pattern, Union +from typing import Any, Dict, List, Mapping, Optional, Pattern, Union +from warnings import warn from lxml import etree, html from pkg_resources import parse_version -from .utils import flatten, iflatten, extract_regex, shorten -from .csstranslator import HTMLTranslator, GenericTranslator +from .csstranslator import GenericTranslator, HTMLTranslator +from .utils import extract_regex, flatten, iflatten, shorten _SelectorType = typing.TypeVar("_SelectorType", bound="Selector") @@ -27,6 +28,10 @@ class CannotRemoveElementWithoutParent(Exception): pass +class CannotDropElementWithoutParent(CannotRemoveElementWithoutParent): + pass + + class SafeXMLParser(etree.XMLParser): def __init__(self, *args, **kwargs) -> None: kwargs.setdefault("resolve_entities", False) @@ -236,9 +241,21 @@ def remove(self) -> None: """ Remove matched nodes from the parent for each element in this list. """ + warn( + "Method parsel.selector.SelectorList.remove is deprecated, please use parsel.selector.SelectorList.drop method instead", + category=DeprecationWarning, + stacklevel=2, + ) for x in self: x.remove() + def drop(self) -> None: + """ + Drop matched nodes from the parent for each element in this list. + """ + for x in self: + x.drop() + class Selector: """ @@ -503,6 +520,11 @@ def remove(self) -> None: """ Remove matched nodes from the parent element. """ + warn( + "Method parsel.selector.Selector.remove is deprecated, please use parsel.selector.Selector.drop method instead", + category=DeprecationWarning, + stacklevel=2, + ) try: parent = self.root.getparent() except AttributeError: @@ -523,6 +545,30 @@ def remove(self) -> None: "are you trying to remove a root element?" ) + def drop(self): + """ + Drop matched nodes from the parent element. + """ + try: + self.root.getparent() + except AttributeError: + # 'str' object has no attribute 'getparent' + raise CannotRemoveElementWithoutRoot( + "The node you're trying to drop has no root, " + "are you trying to drop a pseudo-element? " + "Try to use 'li' as a selector instead of 'li::text' or " + "'//li' instead of '//li/text()', for example." + ) + + try: + self.root.drop_tree() + except (AttributeError, AssertionError): + # 'NoneType' object has no attribute 'drop' + raise CannotDropElementWithoutParent( + "The node you're trying to remove has no parent, " + "are you trying to remove a root element?" + ) + @property def attrib(self) -> Dict[str, str]: """Return the attributes dictionary for underlying element.""" diff --git a/tests/test_selector.py b/tests/test_selector.py index 99d9a552..d0bb2816 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1050,7 +1050,7 @@ def test_remove_selector_list(self) -> None: text="" ) sel_list = sel.css("li") - sel_list.remove() + sel_list.drop() self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li"), []) @@ -1059,7 +1059,7 @@ def test_remove_selector(self) -> None: text="" ) sel_list = sel.css("li") - sel_list[0].remove() + sel_list[0].drop() self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["2", "3"]) @@ -1070,7 +1070,7 @@ def test_remove_pseudo_element_selector_list(self) -> None: sel_list = sel.css("li::text") self.assertEqual(sel_list.getall(), ["1", "2", "3"]) with self.assertRaises(CannotRemoveElementWithoutRoot): - sel_list.remove() + sel_list.drop() self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["1", "2", "3"]) @@ -1082,7 +1082,7 @@ def test_remove_pseudo_element_selector(self) -> None: sel_list = sel.css("li::text") self.assertEqual(sel_list.getall(), ["1", "2", "3"]) with self.assertRaises(CannotRemoveElementWithoutRoot): - sel_list[0].remove() + sel_list[0].drop() self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["1", "2", "3"]) @@ -1094,15 +1094,15 @@ def test_remove_root_element_selector(self) -> None: sel_list = sel.css("li::text") self.assertEqual(sel_list.getall(), ["1", "2", "3"]) with self.assertRaises(CannotRemoveElementWithoutParent): - sel.remove() + sel.drop() with self.assertRaises(CannotRemoveElementWithoutParent): - sel.css("html").remove() + sel.css("html").drop() self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["1", "2", "3"]) - sel.css("body").remove() + sel.css("body").drop() self.assertEqual(sel.get(), "") def test_deep_nesting(self): @@ -1316,3 +1316,13 @@ def test_set(self) -> None: ).extract(), ["url", "name", "startDate", "location", "offers"], ) + + def test_dont_remove_text_after_deleted_element(self) -> None: + sel = self.sscls( + text="""Text before.Text in. Text after. + """ + ) + sel.css("span").drop() + self.assertEqual( + sel.get(), "Text before. Text after." + )