diff --git a/parsel/selector.py b/parsel/selector.py index 2027599..a5db974 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -423,6 +423,7 @@ class Selector: "_huge_tree", "root", "_text", + "_text_lazy_html_root", "body", "__weakref__", ] @@ -507,6 +508,7 @@ def __init__( self._expr = _expr self._huge_tree = huge_tree self._text = text + self._text_lazy_html_root: Optional[etree._Element] = None def __getstate__(self) -> Any: raise TypeError("can't pickle Selector objects") @@ -606,7 +608,12 @@ def xpath( ) else: try: - xpathev = self._get_root(self._text or "", type="html").xpath + if self._text_lazy_html_root is None: + self._text_lazy_html_root = self._get_root( + self.root or "", type="html" + ) + if self._text_lazy_html_root is not None: + xpathev = self._text_lazy_html_root.xpath except AttributeError: return typing.cast( SelectorList[_SelectorType], self.selectorlist_cls([]) @@ -722,25 +729,30 @@ def get(self) -> Any: For HTML and XML, the result is always a string, and percent-encoded content is unquoted. """ - if self.type in ("text", "json"): + if self.type in ("json", "text"): + if self.type == "text" and self._text_lazy_html_root is not None: + return etree.tostring( + self._text_lazy_html_root, encoding="unicode", with_tail=False + ) return self.root - try: - return typing.cast( - str, - etree.tostring( - self.root, - method=_ctgroup[self.type]["_tostring_method"], - encoding="unicode", - with_tail=False, - ), - ) - except (AttributeError, TypeError): - if self.root is True: - return "1" - elif self.root is False: - return "0" - else: - return str(self.root) + else: + try: + return typing.cast( + str, + etree.tostring( + self.root, + method=_ctgroup[self.type]["_tostring_method"], + encoding="unicode", + with_tail=False, + ), + ) + except (AttributeError, TypeError): + if self.root is True: + return "1" + elif self.root is False: + return "0" + else: + return str(self.root) extract = get diff --git a/tests/test_selector.py b/tests/test_selector.py index 96713f9..902f82b 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1007,6 +1007,35 @@ def test_remove_selector(self) -> None: self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["2", "3"]) + def test_remove_selector_from_html_in_text(self) -> None: + html = ( + "

hello world

" + ) + expect_result = "

hello world

" + sel = self.sscls(text=html, type="text") + self.assertEqual(sel.type, "text") + li_sel_list = sel.css("style") + li_sel_list.drop() + self.assertEqual(sel.get(), expect_result) + # The type of the parent selector should not change + self.assertEqual(sel.type, "text") + + def test_remove_selector_from_html_in_json(self) -> None: + json_str = """{ + "title": "hello world", + "body": "

hello world

" + } + """ + expect_result = "

hello world

" + sel = self.sscls(text=json_str) + html_sel = sel.jmespath("body")[0] + self.assertEqual(html_sel.type, "text") + li_sel_list = html_sel.css("style") + li_sel_list.drop() + self.assertEqual(html_sel.get(), expect_result) + # The type of the parent selector should not change + self.assertEqual(html_sel.type, "text") + def test_remove_pseudo_element_selector_list(self) -> None: sel = self.sscls( text=""