[WIP] text extraction in Selector and SelectorList #127

Open

wants to merge 27 commits into base: master

Commits (27)
3c471b8
[tmp] Selector.text and SelectorList.text methods
kmike Nov 2, 2018
8dea4ce
[wip] move converting to text to .get method, add getall support, .cl…
kmike Nov 17, 2018
da7bb80
bump html-text required version number
kmike May 30, 2019
859044c
Merge branch 'master' into selector-text
kmike Feb 9, 2022
7bae279
selector text unit tests
shahidkarimi Mar 11, 2022
e4733ee
code formtting
shahidkarimi Mar 11, 2022
857ca72
code formatting improvements
shahidkarimi Mar 11, 2022
7941093
removed unwated tests
shahidkarimi Apr 4, 2022
102f2e3
Merge pull request #236 from shahidkarimi/selector-text-tests
kmike May 20, 2022
1f917bb
Merge branch 'master' into selector-text
kmike Jun 28, 2022
d87982d
apply black
kmike Jun 28, 2022
14dadbd
fixed failing test
kmike Jun 28, 2022
af0d28a
Make new arguments keyword-only
kmike Jun 28, 2022
1737f83
documentation for selector .get() text
shahidkarimi Aug 12, 2022
17ae5e0
suggested changes in the PR fixed
shahidkarimi Aug 26, 2022
f8f1c66
Merge branch 'master' into selector-text
kmike Nov 10, 2022
c6580cc
Update docs/usage.rst
kmike Nov 13, 2022
419af4b
Merge pull request #248 from shahidkarimi/selector-text-doc
kmike Nov 13, 2022
b8d0352
Merge branch 'master' into selector-text
kmike Apr 24, 2024
ee3e734
fixed typing
kmike May 1, 2024
69456c1
fixed a refactoring issue
kmike May 1, 2024
a492278
document O(N^2) gotcha
kmike May 8, 2024
8b4ae25
make flake8 config compatible with black
kmike May 8, 2024
ccaaa5b
refactor text and cleaning tests; add more of them
kmike May 8, 2024
4eea4fa
fixed default .cleaned cleaner value
kmike May 8, 2024
27c9919
fixed black formatting went wrong
kmike May 8, 2024
852bbef
fix docs references
kmike May 8, 2024
3 changes: 2 additions & 1 deletion .flake8
@@ -1,5 +1,6 @@
[flake8]
ignore = E203,W503
max-line-length = 88
ignore = E203,W503,E701
per-file-ignores =
docs/conftest.py:E501
parsel/csstranslator.py:E501
1 change: 1 addition & 0 deletions .gitignore
@@ -29,6 +29,7 @@ pip-log.txt
nosetests.xml
htmlcov
.pytest_cache
coverage.xml

# Translations
*.mo
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
@@ -4,14 +4,14 @@ repos:
hooks:
- id: bandit
args: [-r, -c, .bandit.yml]
- repo: https://github.com/PyCQA/flake8
rev: 7.0.0
hooks:
- id: flake8
- repo: https://github.com/psf/black.git
rev: 24.2.0
hooks:
- id: black
- repo: https://github.com/PyCQA/flake8
rev: 7.0.0
hooks:
- id: flake8
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
13 changes: 13 additions & 0 deletions docs/usage.rst
@@ -140,6 +140,19 @@ pseudo-elements::
>>> selector.css('title::text').get()
'Example website'

To extract all text of one or more elements and all their child elements,
formatted as plain text taking HTML tags into account (e.g. ``<br/>`` is
translated as a line break), set ``text=True`` in your call to
:meth:`~parsel.selector.Selector.get` or
:meth:`~parsel.selector.Selector.getall` instead of including
``::text`` (CSS) or ``/text()`` (XPath) in your query::

>>> selector.css('#images').get(text=True)
'Name: My image 1\nName: My image 2\nName: My image 3\nName: My image 4\nName: My image 5'

See :meth:`Selector.get` for additional parameters that you can use to change
how the extracted plain text is formatted.

Comment on lines +143 to +155 (Member Author):

It looks like for many use cases .get(text=True) could provide more reasonable behavior than /text() or ::text in a selector. From this point of view, I wonder if we should make it one of the first examples, and review many other examples as well. But it seems we can also do it separately, not as a part of this PR, so I'm not working on it.

As you can see, ``.xpath()`` and ``.css()`` methods return a
:class:`~parsel.selector.SelectorList` instance, which is a list of new
selectors. This API can be used for quickly selecting nested data::
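
For reference, a small sketch (not part of the diff) of how the new ``text=True``
option compares to ``::text``. It assumes this PR branch is installed and uses a
made-up HTML snippet; the exact plain-text output depends on html-text's layout rules.

from parsel import Selector

html = """
<html><body>
  <div id="images">
    <a href="image1.html">Name: My image 1<br/><img src="image1_thumb.jpg"/></a>
    <a href="image2.html">Name: My image 2<br/><img src="image2_thumb.jpg"/></a>
  </div>
</body></html>
"""
selector = Selector(text=html)

# ``::text`` returns raw text nodes, one string per node:
print(selector.css("#images ::text").getall())

# ``text=True`` renders the whole matched subtree as plain text,
# so ``<br/>`` becomes a line break:
print(selector.css("#images").get(text=True))

# ``guess_layout=False`` keeps the output on a single line:
print(selector.css("#images").get(text=True, guess_layout=False))
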
202 changes: 174 additions & 28 deletions parsel/selector.py
@@ -22,8 +22,10 @@
)
from warnings import warn

import html_text # type: ignore[import-untyped]
import jmespath
from lxml import etree, html
from lxml.html.clean import Cleaner # pylint: disable=no-name-in-module
from packaging.version import Version

from .csstranslator import GenericTranslator, HTMLTranslator
@@ -245,30 +247,68 @@ def re_first(
return typing.cast(str, el)
return default

def getall(self) -> List[str]:
def getall(
self,
*,
text: bool = False,
cleaner: Union[str, None, Cleaner] = "auto",
guess_punct_space: bool = True,
guess_layout: bool = True,
) -> List[str]:
"""
Call the ``.get()`` method for each element in this list and return
their results flattened, as a list of strings.
"""
return [x.get() for x in self]

extract = getall

@typing.overload
def get(self, default: None = None) -> Optional[str]:
pass
``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout``
options are passed to :meth:`~.Selector.get`; see
:meth:`~.Selector.get` for more details.

.. note::

When either text extraction or cleaning is requested, they're
performed on each element in the list individually. So, if you match
nested elements (i.e. both parent and descendant), cleaning or
text extraction could be run multiple times on the same part
of the tree. For example, ``selector.xpath("*").getall(text=True)``
has O(N^2) complexity regarding the number of nodes in the tree,
not O(N).
"""
return [
x.get(
text=text,
cleaner=cleaner,
guess_punct_space=guess_punct_space,
guess_layout=guess_layout,
)
for x in self
]

@typing.overload
def get(self, default: str) -> str:
pass
extract = getall

def get(self, default: Optional[str] = None) -> Any:
def get(
self,
default: Optional[str] = None,
*,
text: bool = False,
cleaner: Union[str, None, Cleaner] = "auto",
guess_punct_space: bool = True,
guess_layout: bool = True,
) -> Any:
"""
Return the result of ``.get()`` for the first element in this list.
If the list is empty, return the default value.
If the list is empty, return the ``default`` value.

``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout``
options are passed to :meth:`Selector.get`; see :meth:`~.Selector.get`
for more details.
"""
for x in self:
return x.get()
return x.get(
text=text,
cleaner=cleaner,
guess_punct_space=guess_punct_space,
guess_layout=guess_layout,
)
return default

extract_first = get
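
To make the O(N^2) note above concrete, here is a small sketch (not part of the
diff, assuming this PR branch): every matched ancestor triggers its own text
extraction over the same subtree, so matching only the elements you actually
need avoids the duplicated work.

from parsel import Selector

sel = Selector(text="<html><body><p>first</p><p>second</p></body></html>")

# Matches html, body and both p elements; text is extracted once per
# matched element, so the paragraph text is processed repeatedly:
print(sel.xpath("//*").getall(text=True))

# Matching only the leaf elements avoids re-extracting ancestor subtrees:
print(sel.xpath("//p").getall(text=True))
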
@@ -439,6 +479,8 @@ class Selector:
}
_lxml_smart_strings = False
selectorlist_cls = SelectorList["Selector"]
_text_cleaner = html_text.cleaner
_html_cleaner = Cleaner()

def __init__(
self,
@@ -715,40 +757,110 @@ def re_first(
default,
)

def get(self) -> Any:
def get(
self,
*,
text: bool = False,
cleaner: Union[str, None, Cleaner] = "auto",
guess_punct_space: bool = True,
guess_layout: bool = True,
) -> Any:
"""
Serialize and return the matched nodes.

For HTML and XML, the result is always a string, and percent-encoded
content is unquoted.

When ``text`` is False (default), HTML or XML is extracted. Pass
``text=True`` to extract text content instead (the html-text library
is used). The text extraction algorithm assumes that the document is
an HTML document, and uses HTML-specific rules.

The ``cleaner`` argument allows cleaning the HTML before extracting the
content. Allowed values:

* "auto" (default) - don't clean when text=False, clean with
options tuned for text extraction when text=True;
* "text" - clean with options tuned for text extraction: elements
like ``<script>`` and ``<style>`` are removed, cleaning options
are tuned for speed, assuming text extraction is the end goal;
* "html" - use default ``lxml.html.clean.Cleaner``. This is useful
if you want to make .get() output more human-readable, but still
preserve HTML tags.
* None - don't clean, even when ``text=True``. Useful if you have
an already cleaned tree, e.g. after calling :meth:`Selector.cleaned`.
* custom ``lxml.html.clean.Cleaner`` objects are also supported.

The ``guess_punct_space`` and ``guess_layout`` options allow customizing
the text extraction algorithm. By default, when ``text=True``,
parsel tries to insert newlines and blank lines as appropriate,
and to be smart about whitespace around inline tags,
so that the text output looks similar to a browser's.

Pass ``guess_punct_space=False`` to disable punctuation handling.
This option has no effect when ``text=False``.

Use ``guess_layout=False`` to avoid adding newlines; the content will
be a single line of text, using whitespace as separators.
This option has no effect when ``text=False``.
"""
if self.type in ("text", "json"):
# TODO: what should be the behavior with text=True?
return self.root

sel = self
if cleaner == "auto":
if text:
sel = self.cleaned("text")
elif cleaner is not None:
sel = self.cleaned(cleaner)
tree = sel.root

if text:
return html_text.etree_to_text(
tree,
guess_punct_space=guess_punct_space,
guess_layout=guess_layout,
)

try:
return typing.cast(
str,
etree.tostring(
self.root,
method=_ctgroup[self.type]["_tostring_method"],
encoding="unicode",
with_tail=False,
),
return etree.tostring(
tree,
method=_ctgroup[self.type]["_tostring_method"],
encoding="unicode",
with_tail=False,
)
except (AttributeError, TypeError):
if self.root is True:
if tree is True:
return "1"
elif self.root is False:
elif tree is False:
return "0"
else:
return str(self.root)
return str(tree)

extract = get
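
A hedged sketch of the ``cleaner`` options described in the docstring above
(not part of the diff; the HTML is made up and the exact outputs depend on the
cleaner settings and on html-text):

from lxml.html.clean import Cleaner
from parsel import Selector

sel = Selector(
    text="<body><script>alert(1)</script><p>Hello <b>world</b></p></body>"
)

# Default cleaner="auto" with text=True cleans with text-oriented settings,
# so <script> content is removed before text extraction:
print(sel.css("body").get(text=True))

# cleaner=None skips cleaning, so the script text is not stripped beforehand:
print(sel.css("body").get(text=True, cleaner=None))

# cleaner="html" keeps markup but runs the default lxml Cleaner on it first:
print(sel.css("body").get(cleaner="html"))

# A custom lxml Cleaner instance is also accepted:
print(sel.css("body").get(cleaner=Cleaner(style=True)))
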

def getall(self) -> List[str]:
def getall(
self,
*,
text: bool = False,
cleaner: Union[str, None, Cleaner] = "auto",
guess_punct_space: bool = True,
guess_layout: bool = True,
) -> List[str]:
"""
Serialize and return the matched node in a 1-element list of strings.

See :meth:`~.Selector.get` for options.
"""
return [self.get()]
return [
self.get(
text=text,
cleaner=cleaner,
guess_punct_space=guess_punct_space,
guess_layout=guess_layout,
)
]

def register_namespace(self, prefix: str, uri: str) -> None:
"""
@@ -836,6 +948,40 @@ def attrib(self) -> Dict[str, str]:
"""Return the attributes dictionary for underlying element."""
return dict(self.root.attrib)

def cleaned(
self: _SelectorType, cleaner: Union[str, Cleaner] = "html"
) -> _SelectorType:
"""
Return a copy of the Selector, with the underlying subtree cleaned.
Allowed values of ``cleaner`` argument:

* "html" (default) - use default ``lxml.html.clean.Cleaner``;
* "text" - clean with options tuned for text extraction: elements
like ``<script>`` and ``<style>`` are removed, cleaning options
are tuned for speed, assuming text extraction is the end goal;
* custom ``lxml.html.clean.Cleaner`` objects are also supported.
"""
if isinstance(cleaner, str):
if cleaner not in {"html", "text"}:
raise ValueError(
"cleaner must be 'html', 'text' or "
"an lxml.html.clean.Cleaner instance"
)
if cleaner == "html":
cleaner_obj = self._html_cleaner
elif cleaner == "text":
cleaner_obj = self._text_cleaner
else:
cleaner_obj = cleaner

root = cleaner_obj.clean_html(self.root)
return self.__class__(
root=root,
_expr=self._expr,
namespaces=self.namespaces,
type=self.type,
)
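
A short sketch of how ``cleaned()`` might be combined with ``cleaner=None``
(as suggested in the ``get()`` docstring) so an already-cleaned tree is not
cleaned again for every matched element (hypothetical HTML, assuming this PR
branch):

from parsel import Selector

sel = Selector(
    text="<body><style>p {color: red}</style><p>First</p><p>Second</p></body>"
)

# Clean the whole tree once, with text-extraction-oriented settings:
cleaned_sel = sel.cleaned("text")

# Extract text per paragraph without re-cleaning each subtree:
print(cleaned_sel.css("p").getall(text=True, cleaner=None))
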

def __bool__(self) -> bool:
"""
Return ``True`` if there is any real content selected or ``False``
1 change: 1 addition & 0 deletions setup.py
@@ -30,6 +30,7 @@
"lxml",
"packaging",
"w3lib>=1.19.0",
"html-text>=0.5.2",
],
python_requires=">=3.8",
license="BSD",
37 changes: 18 additions & 19 deletions tests/test_selector.py
@@ -1,6 +1,5 @@
import pickle
import re
import typing
import unittest
import warnings
import weakref
@@ -685,7 +684,7 @@ def test_namespaces_multiple_adhoc(self) -> None:
def test_make_links_absolute(self) -> None:
text = '<a href="file.html">link to file</a>'
sel = Selector(text=text, base_url="http://example.com")
typing.cast(HtmlElement, sel.root).make_links_absolute()
cast(HtmlElement, sel.root).make_links_absolute()
self.assertEqual(
"http://example.com/file.html",
sel.xpath("//a/@href").extract_first(),
@@ -1049,6 +1048,23 @@ def test_remove_root_element_selector(self) -> None:
sel.css("body").drop()
self.assertEqual(sel.get(), "<html></html>")

def test_dont_remove_text_after_deleted_element(self) -> None:
sel = self.sscls(
text="""<html><body>Text before.<span>Text in.</span> Text after.</body></html>
"""
)
sel.css("span").drop()
self.assertEqual(
sel.get(), "<html><body>Text before. Text after.</body></html>"
)

def test_drop_with_xml_type(self) -> None:
sel = self.sscls(text="<a><b></b><c/></a>", type="xml")
el = sel.xpath("//b")[0]
assert el.root.getparent() is not None
el.drop()
assert sel.get() == "<a><c/></a>"

def test_deep_nesting(self) -> None:
lxml_version = Version(etree.__version__)
lxml_huge_tree_version = Version("4.2")
@@ -1322,23 +1338,6 @@ def test_set(self) -> None:
["url", "name", "startDate", "location", "offers"],
)

def test_dont_remove_text_after_deleted_element(self) -> None:
sel = self.sscls(
text="""<html><body>Text before.<span>Text in.</span> Text after.</body></html>
"""
)
sel.css("span").drop()
self.assertEqual(
sel.get(), "<html><body>Text before. Text after.</body></html>"
)

def test_drop_with_xml_type(self) -> None:
sel = self.sscls(text="<a><b></b><c/></a>", type="xml")
el = sel.xpath("//b")[0]
assert el.root.getparent() is not None
el.drop()
assert sel.get() == "<a><c/></a>"


class SelectorBytesInput(Selector):
def __init__(