From 8259dd4088b7b01c482f6d4db84c133f041994a6 Mon Sep 17 00:00:00 2001 From: dream2333 Date: Thu, 13 Jun 2024 20:27:20 +0800 Subject: [PATCH 1/5] Fix drop html element from a text type Selector --- parsel/selector.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index 2027599..ebdcaa2 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -423,6 +423,7 @@ class Selector: "_huge_tree", "root", "_text", + "_text_lazy_html_root", "body", "__weakref__", ] @@ -507,6 +508,7 @@ def __init__( self._expr = _expr self._huge_tree = huge_tree self._text = text + self._text_lazy_html_root = None def __getstate__(self) -> Any: raise TypeError("can't pickle Selector objects") @@ -606,7 +608,9 @@ def xpath( ) else: try: - xpathev = self._get_root(self._text or "", type="html").xpath + if self._text_lazy_html_root is None: + self._text_lazy_html_root = self._get_root(self.root or "", type="html") + xpathev = self._text_lazy_html_root.xpath except AttributeError: return typing.cast( SelectorList[_SelectorType], self.selectorlist_cls([]) @@ -722,8 +726,12 @@ def get(self) -> Any: For HTML and XML, the result is always a string, and percent-encoded content is unquoted. """ - if self.type in ("text", "json"): + if self.type == "json": return self.root + elif self.type == "text": + if self._text_lazy_html_root is None: + return self.root + return typing.cast(str, etree.tostring(self._text_lazy_html_root, encoding="unicode", with_tail=False)) try: return typing.cast( str, From 70aca9bc9d5461266eca1f1262426b5420272eeb Mon Sep 17 00:00:00 2001 From: dream2333 Date: Fri, 14 Jun 2024 01:24:39 +0800 Subject: [PATCH 2/5] Add testcases for drop html node --- tests/test_selector.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_selector.py b/tests/test_selector.py index 96713f9..754822e 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1007,6 +1007,34 @@ def test_remove_selector(self) -> None: self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["2", "3"]) + def test_remove_selector_from_html_in_text(self) -> None: + html = "

hello world

" + expect_result = "

hello world

" + sel = self.sscls(text=html, type="text") + self.assertEqual(sel.type, "text") + li_sel_list = sel.css("style") + li_sel_list.drop() + self.assertEqual(sel.get(), expect_result) + # The type of the parent selector should not change + self.assertEqual(sel.type, "text") + + def test_remove_selector_from_html_in_json(self) -> None: + json_str = """{ + "title": "hello world", + "body": "

hello world

" + } + """ + expect_result = "

hello world

" + sel = self.sscls(text=json_str) + html_sel = sel.jmespath("body")[0] + self.assertEqual(html_sel.type, "text") + li_sel_list = html_sel.css("style") + li_sel_list.drop() + self.assertEqual(html_sel.get(), expect_result) + # The type of the parent selector should not change + self.assertEqual(html_sel.type, "text") + + def test_remove_pseudo_element_selector_list(self) -> None: sel = self.sscls( text="" From 0c2b57a20cf1d173d78593c64a9bfa57a7f7bc79 Mon Sep 17 00:00:00 2001 From: dream2333 Date: Fri, 14 Jun 2024 01:52:42 +0800 Subject: [PATCH 3/5] Add type hint --- parsel/selector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index ebdcaa2..cf33aa7 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -508,7 +508,7 @@ def __init__( self._expr = _expr self._huge_tree = huge_tree self._text = text - self._text_lazy_html_root = None + self._text_lazy_html_root: Optional[etree._Element] = None def __getstate__(self) -> Any: raise TypeError("can't pickle Selector objects") @@ -610,7 +610,8 @@ def xpath( try: if self._text_lazy_html_root is None: self._text_lazy_html_root = self._get_root(self.root or "", type="html") - xpathev = self._text_lazy_html_root.xpath + if self._text_lazy_html_root is not None: + xpathev = self._text_lazy_html_root.xpath except AttributeError: return typing.cast( SelectorList[_SelectorType], self.selectorlist_cls([]) From 9c8869a11efeb0b1f7530e8f86f9f170408a87ae Mon Sep 17 00:00:00 2001 From: dream2333 Date: Fri, 14 Jun 2024 01:24:39 +0800 Subject: [PATCH 4/5] Add testcases for drop html node --- tests/test_selector.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_selector.py b/tests/test_selector.py index 96713f9..902f82b 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1007,6 +1007,35 @@ def test_remove_selector(self) -> None: self.assertIsSelectorList(sel.css("li")) self.assertEqual(sel.css("li::text").getall(), ["2", "3"]) + def test_remove_selector_from_html_in_text(self) -> None: + html = ( + "

hello world

" + ) + expect_result = "

hello world

" + sel = self.sscls(text=html, type="text") + self.assertEqual(sel.type, "text") + li_sel_list = sel.css("style") + li_sel_list.drop() + self.assertEqual(sel.get(), expect_result) + # The type of the parent selector should not change + self.assertEqual(sel.type, "text") + + def test_remove_selector_from_html_in_json(self) -> None: + json_str = """{ + "title": "hello world", + "body": "

hello world

" + } + """ + expect_result = "

hello world

" + sel = self.sscls(text=json_str) + html_sel = sel.jmespath("body")[0] + self.assertEqual(html_sel.type, "text") + li_sel_list = html_sel.css("style") + li_sel_list.drop() + self.assertEqual(html_sel.get(), expect_result) + # The type of the parent selector should not change + self.assertEqual(html_sel.type, "text") + def test_remove_pseudo_element_selector_list(self) -> None: sel = self.sscls( text="
  • 1
  • 2
  • 3
" From 955abd900fc30a612ade74b19c0b240858a901fa Mon Sep 17 00:00:00 2001 From: dream2333 Date: Fri, 14 Jun 2024 02:34:17 +0800 Subject: [PATCH 5/5] Fix drop html element from a text type Selector --- parsel/selector.py | 54 +++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index ebdcaa2..a5db974 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -508,7 +508,7 @@ def __init__( self._expr = _expr self._huge_tree = huge_tree self._text = text - self._text_lazy_html_root = None + self._text_lazy_html_root: Optional[etree._Element] = None def __getstate__(self) -> Any: raise TypeError("can't pickle Selector objects") @@ -609,8 +609,11 @@ def xpath( else: try: if self._text_lazy_html_root is None: - self._text_lazy_html_root = self._get_root(self.root or "", type="html") - xpathev = self._text_lazy_html_root.xpath + self._text_lazy_html_root = self._get_root( + self.root or "", type="html" + ) + if self._text_lazy_html_root is not None: + xpathev = self._text_lazy_html_root.xpath except AttributeError: return typing.cast( SelectorList[_SelectorType], self.selectorlist_cls([]) @@ -726,29 +729,30 @@ def get(self) -> Any: For HTML and XML, the result is always a string, and percent-encoded content is unquoted. """ - if self.type == "json": + if self.type in ("json", "text"): + if self.type == "text" and self._text_lazy_html_root is not None: + return etree.tostring( + self._text_lazy_html_root, encoding="unicode", with_tail=False + ) return self.root - elif self.type == "text": - if self._text_lazy_html_root is None: - return self.root - return typing.cast(str, etree.tostring(self._text_lazy_html_root, encoding="unicode", with_tail=False)) - try: - return typing.cast( - str, - etree.tostring( - self.root, - method=_ctgroup[self.type]["_tostring_method"], - encoding="unicode", - with_tail=False, - ), - ) - except (AttributeError, TypeError): - if self.root is True: - return "1" - elif self.root is False: - return "0" - else: - return str(self.root) + else: + try: + return typing.cast( + str, + etree.tostring( + self.root, + method=_ctgroup[self.type]["_tostring_method"], + encoding="unicode", + with_tail=False, + ), + ) + except (AttributeError, TypeError): + if self.root is True: + return "1" + elif self.root is False: + return "0" + else: + return str(self.root) extract = get