From 3c471b8cf1d5aba6d2c2a5ed9a684a7d5bc2761c Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 2 Nov 2018 14:03:26 +0500 Subject: [PATCH 01/21] [tmp] Selector.text and SelectorList.text methods --- parsel/selector.py | 25 ++++++++++++++++++++++++- setup.py | 3 ++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index f9292a4f..73725bc6 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -150,6 +150,15 @@ def attrib(self): else: return {} + def text(self, clean_html=True, guess_punct_space=True, guess_layout=True, + sep='\n'): + return sep.join( + x.text(clean_html=clean_html, + guess_punct_space=guess_punct_space, + guess_layout=guess_layout) + for x in self + ) + class Selector(object): """ @@ -162,7 +171,7 @@ class Selector(object): If ``type`` is ``None``, the selector defaults to ``"html"``. """ - __slots__ = ['text', 'namespaces', 'type', '_expr', 'root', + __slots__ = ['namespaces', 'type', '_expr', 'root', '__weakref__', '_parser', '_csstranslator', '_tostring_method'] _default_type = None @@ -346,6 +355,20 @@ def attrib(self): """ return dict(self.root.attrib) + def text(self, clean_html=True, guess_punct_space=True, guess_layout=True): + from html_text.html_text import _clean_html, _html_to_text + tree = _clean_html(self.root) if clean_html else self.root + return _html_to_text(tree, + guess_punct_space=guess_punct_space, + guess_layout=guess_layout) + + # def cleaned(self): + # from html_text.html_text import _clean_html + # root = _clean_html(self.root) + # return self.__class__(root=root, _expr=self._expr, + # namespaces=self.namespaces, + # type=self.type) + def __bool__(self): """ Return ``True`` if there is any real content selected or ``False`` diff --git a/setup.py b/setup.py index 53f6a1c4..575cf40b 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,8 @@ def has_environment_marker_platform_impl_support(): 'w3lib>=1.19.0', 'lxml>=2.3', 'six>=1.5.2', - 'cssselect>=0.9' + 
'cssselect>=0.9', + 'html-text>=0.4.1', ] extras_require = {} From 8dea4cefe9b581f8d3da747126228eb7bce7c3f5 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Sat, 17 Nov 2018 15:40:55 +0500 Subject: [PATCH 02/21] [wip] move converting to text to .get method, add getall support, .cleaned --- parsel/selector.py | 157 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 120 insertions(+), 37 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index 73725bc6..d3877a04 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -6,6 +6,8 @@ import six from lxml import etree, html +from lxml.html.clean import Cleaner +import html_text from .utils import flatten, iflatten, extract_regex from .csstranslator import HTMLTranslator, GenericTranslator @@ -121,21 +123,42 @@ def re_first(self, regex, default=None, replace_entities=True): else: return default - def getall(self): + def getall(self, text=False, cleaner='auto', + guess_punct_space=True, guess_layout=True): """ Call the ``.get()`` method for each element is this list and return their results flattened, as a list of unicode strings. - """ - return [x.get() for x in self] + + ``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout`` + options are passed to :meth:`~.Selector.get`; see + :meth:`~.Selector.get` for more details. + """ + return [ + x.get( + text=text, + cleaner=cleaner, + guess_punct_space=guess_punct_space, + guess_layout=guess_layout + ) + for x in self + ] extract = getall - def get(self, default=None): + def get(self, default=None, text=False, cleaner='auto', + guess_punct_space=True, guess_layout=True): """ Return the result of ``.get()`` for the first element in this list. - If the list is empty, return the default value. + If the list is empty, return the ``default`` value. + + ``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout`` + options are passed to :meth:`Selector.get`; see :meth:`~.Selector.get` + for more details. 
""" for x in self: - return x.get() + return x.get(text=text, + cleaner=cleaner, + guess_punct_space=guess_punct_space, + guess_layout=guess_layout) else: return default extract_first = get @@ -150,15 +173,6 @@ def attrib(self): else: return {} - def text(self, clean_html=True, guess_punct_space=True, guess_layout=True, - sep='\n'): - return sep.join( - x.text(clean_html=clean_html, - guess_punct_space=guess_punct_space, - guess_layout=guess_layout) - for x in self - ) - class Selector(object): """ @@ -188,6 +202,8 @@ class Selector(object): } _lxml_smart_strings = False selectorlist_cls = SelectorList + _text_cleaner = html_text.cleaner + _html_cleaner = Cleaner() def __init__(self, text=None, type=None, namespaces=None, root=None, base_url=None, _expr=None): @@ -301,30 +317,87 @@ def re_first(self, regex, default=None, replace_entities=True): """ return next(iflatten(self.re(regex, replace_entities=replace_entities)), default) - def get(self): + def get(self, text=False, cleaner='auto', + guess_punct_space=True, guess_layout=True): """ Serialize and return the matched nodes in a single unicode string. Percent encoded content is unquoted. - """ + + When ``text`` is False (default), HTML or XML is extracted. Pass + ``text=True`` to extract text content (html-text library is used). + Text extraction algorithm assumes that the document is an HTML + document, and uses HTML-specific rules. + + ``cleaner`` argument allows to clean HTML before extracting the + content. Allowed values: + + * "auto" (default) - don't clean when text=False, clean with + options tuned for text extraction when text=True; + * "text" - clean with options tuned for text extraction: elements + like ``" "

hello

" + sel = Selector(div_html) + assert sel.css("script").getall() == [""] + assert sel.cleaned().css("script").getall() == [] + + assert len(sel.css("script")) == 1 + assert len(sel.css("style")) == 1 + assert len(sel.css("p")) == 1 + + assert len(sel.cleaned().css("script")) == 0 + assert len(sel.cleaned().css("style")) == 1 + assert len(sel.cleaned().css("p")) == 1 + + +def test_cleaned_options() -> None: + div_html = "
" "

hello

" + sel = Selector(div_html) + assert len(sel.css("script")) == 1 + assert len(sel.css("style")) == 1 + assert len(sel.css("p")) == 1 + + assert len(sel.cleaned().css("script")) == 0 + assert len(sel.cleaned().css("style")) == 1 + assert len(sel.cleaned().css("p")) == 1 + + assert len(sel.cleaned("html").css("script")) == 0 + assert len(sel.cleaned("html").css("style")) == 1 + assert len(sel.cleaned("html").css("p")) == 1 + + assert len(sel.cleaned("text").css("script")) == 0 + assert len(sel.cleaned("text").css("style")) == 0 + assert len(sel.cleaned("text").css("p")) == 1 + + cleaner = Cleaner(kill_tags=["p"], scripts=False, style=False) + assert len(sel.cleaned(cleaner).css("script")) == 1 + assert len(sel.cleaned(cleaner).css("style")) == 1 + assert len(sel.cleaned(cleaner).css("p")) == 0 + + +def test_get_cleaner() -> None: + div_html = "

P

" + sel = Selector(div_html) + cleaner = Cleaner(kill_tags=["p"], scripts=False, style=False) + + assert sel.get(text=True) == "P" + assert sel.get(text=True, cleaner=None) == "SCRIPT STYLE\n\nP" + assert sel.get(text=True, cleaner="html") == "STYLE\n\nP" + assert sel.get(text=True, cleaner="text") == "P" + assert sel.get(text=True, cleaner=cleaner) == "SCRIPT STYLE" + + div = sel.css("div") + assert div.get() == div_html + assert div.get(cleaner=None) == div_html + assert div.get(cleaner="html") == "

P

" + assert div.get(cleaner="text") == "

P

" + assert ( + div.get(cleaner=cleaner) + == "
" + ) + + +def test_guess_punct_space() -> None: + sel = Selector('

hello"Folks"

') + assert sel.get(text=True, guess_punct_space=False) == 'hello "Folks"' + assert sel.get(text=True, guess_punct_space=True) == 'hello"Folks"' + + assert sel.getall(text=True, guess_punct_space=False) == ['hello "Folks"'] + assert sel.getall(text=True, guess_punct_space=True) == ['hello"Folks"'] + + +def test_guess_layout() -> None: + sel = Selector("
  • option1
  • option2
") + assert sel.get(text=True, guess_layout=False) == "option1 option2" + assert sel.get(text=True, guess_layout=True) == "option1\noption2" + + assert sel.getall(text=True, guess_layout=False) == ["option1 option2"] + assert sel.getall(text=True, guess_layout=True) == ["option1\noption2"] From 4eea4fa0c5cdacf58820d115b77eb2dc3f7741f7 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Wed, 8 May 2024 21:39:35 +0500 Subject: [PATCH 19/21] fixed default .cleaned cleaner value --- parsel/selector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsel/selector.py b/parsel/selector.py index aa052269..104db9d4 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -949,7 +949,7 @@ def attrib(self) -> Dict[str, str]: return dict(self.root.attrib) def cleaned( - self: _SelectorType, cleaner: Union[str, Cleaner] = "auto" + self: _SelectorType, cleaner: Union[str, Cleaner] = "html" ) -> _SelectorType: """ Return a copy of a Selector, with underlying subtree cleaned. From 27c9919a67eeef11231e43a93f27cf5c60c8516d Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Wed, 8 May 2024 22:47:54 +0500 Subject: [PATCH 20/21] fixed black formatting went wrong --- tests/test_text_and_cleaning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_text_and_cleaning.py b/tests/test_text_and_cleaning.py index 8b664cd9..a4b6ea75 100644 --- a/tests/test_text_and_cleaning.py +++ b/tests/test_text_and_cleaning.py @@ -30,7 +30,7 @@ def test_text_getall() -> None: def test_cleaned() -> None: - div_html = "
" "

hello

" + div_html = "

hello

" sel = Selector(div_html) assert sel.css("script").getall() == [""] assert sel.cleaned().css("script").getall() == [] @@ -45,7 +45,7 @@ def test_cleaned() -> None: def test_cleaned_options() -> None: - div_html = "
" "

hello

" + div_html = "

hello

" sel = Selector(div_html) assert len(sel.css("script")) == 1 assert len(sel.css("style")) == 1 From 852bbef86308515b0b873b3eaca4e26523dff6ee Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Wed, 8 May 2024 22:48:13 +0500 Subject: [PATCH 21/21] fix docs references --- docs/usage.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/usage.rst b/docs/usage.rst index 7cfa2fce..0b97d8b2 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -143,13 +143,14 @@ pseudo-elements:: To extract all text of one or more element and all their child elements, formatted as plain text taking into account HTML tags (e.g. ``<br>`` is translated as a line break), set ``text=True`` in your call to -:meth:`~Selector.get` or :meth:`~Selector.getall` instead of including +:meth:`~parsel.selector.Selector.get` or +:meth:`~parsel.selector.Selector.getall` instead of including ``::text`` (CSS) or ``/text()`` (XPath) in your query:: >>> selector.css('#images').get(text=True) 'Name: My image 1\nName: My image 2\nName: My image 3\nName: My image 4\nName: My image 5' -See :meth:`Selector.get` for additional parameters that you can use to change +See :meth:`Selector.get` for additional parameters that you can use to change how the extracted plain text is formatted. As you can see, ``.xpath()`` and ``.css()`` methods return a