[WIP] text extraction in Selector and SelectorList #127

Status: Open

Wants to merge 27 commits into the base branch (master).

Changes shown below are from 2 of the 27 commits.

Commits (27)
3c471b8
[tmp] Selector.text and SelectorList.text methods
kmike Nov 2, 2018
8dea4ce
[wip] move converting to text to .get method, add getall support, .cl…
kmike Nov 17, 2018
da7bb80
bump html-text required version number
kmike May 30, 2019
859044c
Merge branch 'master' into selector-text
kmike Feb 9, 2022
7bae279
selector text unit tests
shahidkarimi Mar 11, 2022
e4733ee
code formtting
shahidkarimi Mar 11, 2022
857ca72
code formatting improvements
shahidkarimi Mar 11, 2022
7941093
removed unwated tests
shahidkarimi Apr 4, 2022
102f2e3
Merge pull request #236 from shahidkarimi/selector-text-tests
kmike May 20, 2022
1f917bb
Merge branch 'master' into selector-text
kmike Jun 28, 2022
d87982d
apply black
kmike Jun 28, 2022
14dadbd
fixed failing test
kmike Jun 28, 2022
af0d28a
Make new arguments keyword-only
kmike Jun 28, 2022
1737f83
documentation for selector .get() text
shahidkarimi Aug 12, 2022
17ae5e0
suggested changes in the PR fixed
shahidkarimi Aug 26, 2022
f8f1c66
Merge branch 'master' into selector-text
kmike Nov 10, 2022
c6580cc
Update docs/usage.rst
kmike Nov 13, 2022
419af4b
Merge pull request #248 from shahidkarimi/selector-text-doc
kmike Nov 13, 2022
b8d0352
Merge branch 'master' into selector-text
kmike Apr 24, 2024
ee3e734
fixed typing
kmike May 1, 2024
69456c1
fixed a refactoring issue
kmike May 1, 2024
a492278
document O(N^2) gotcha
kmike May 8, 2024
8b4ae25
make flake8 config compatible with black
kmike May 8, 2024
ccaaa5b
refactor text and cleaning tests; add more of them
kmike May 8, 2024
4eea4fa
fixed default .cleaned cleaner value
kmike May 8, 2024
27c9919
fixed black formatting went wrong
kmike May 8, 2024
852bbef
fix docs references
kmike May 8, 2024
parsel/selector.py (122 additions, 16 deletions)

@@ -6,6 +6,8 @@

 import six
 from lxml import etree, html
+from lxml.html.clean import Cleaner
+import html_text

 from .utils import flatten, iflatten, extract_regex
 from .csstranslator import HTMLTranslator, GenericTranslator
@@ -121,21 +123,42 @@ def re_first(self, regex, default=None, replace_entities=True):
         else:
             return default

-    def getall(self):
+    def getall(self, text=False, cleaner='auto',
+               guess_punct_space=True, guess_layout=True):
         """
         Call the ``.get()`` method for each element is this list and return
         their results flattened, as a list of unicode strings.
-        """
-        return [x.get() for x in self]
+
+        ``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout``
+        options are passed to :meth:`~.Selector.get`; see
+        :meth:`~.Selector.get` for more details.
+        """
+        return [
+            x.get(
+                text=text,
+                cleaner=cleaner,
+                guess_punct_space=guess_punct_space,
+                guess_layout=guess_layout
+            )
+            for x in self
+        ]
     extract = getall

-    def get(self, default=None):
+    def get(self, default=None, text=False, cleaner='auto',
+            guess_punct_space=True, guess_layout=True):
         """
         Return the result of ``.get()`` for the first element in this list.
-        If the list is empty, return the default value.
+        If the list is empty, return the ``default`` value.
+
+        ``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout``
+        options are passed to :meth:`Selector.get`; see :meth:`~.Selector.get`
+        for more details.
         """
         for x in self:
-            return x.get()
+            return x.get(text=text,
+                         cleaner=cleaner,
+                         guess_punct_space=guess_punct_space,
+                         guess_layout=guess_layout)
         else:
             return default
     extract_first = get
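A short usage sketch of the SelectorList methods above. It is illustrative only: it assumes this branch is installed, and the text output shown in comments depends on html-text's layout rules.

from parsel import Selector

sel = Selector(text="<p>Hello, <b>world</b>!</p><p>Bye.</p>")

# Default behaviour is unchanged: serialized HTML is returned.
sel.css("p").getall()
# ['<p>Hello, <b>world</b>!</p>', '<p>Bye.</p>']

# text=True converts each selected element to text via html-text.
sel.css("p").getall(text=True)
# roughly ['Hello, world!', 'Bye.']

# .get() applies the same options to the first element only.
sel.css("p").get(text=True)
# roughly 'Hello, world!'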
@@ -162,7 +185,7 @@ class Selector(object):
     If ``type`` is ``None``, the selector defaults to ``"html"``.
     """

-    __slots__ = ['text', 'namespaces', 'type', '_expr', 'root',
+    __slots__ = ['namespaces', 'type', '_expr', 'root',
                  '__weakref__', '_parser', '_csstranslator', '_tostring_method']

     _default_type = None
Expand All @@ -179,6 +202,8 @@ class Selector(object):
}
_lxml_smart_strings = False
selectorlist_cls = SelectorList
_text_cleaner = html_text.cleaner
_html_cleaner = Cleaner()

def __init__(self, text=None, type=None, namespaces=None, root=None,
base_url=None, _expr=None):
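Because the cleaners are class attributes, cleaning behaviour can in principle be swapped per Selector subclass. A minimal sketch of that idea (hypothetical subclass, not part of this diff, and assuming the private attribute names stay as above):

from lxml.html.clean import Cleaner
import parsel

class MySelector(parsel.Selector):
    # Hypothetical: override the cleaner used by cleaner='html',
    # e.g. to also strip <script>, <style> and comments.
    _html_cleaner = Cleaner(scripts=True, style=True, comments=True)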
@@ -292,30 +317,87 @@ def re_first(self, regex, default=None, replace_entities=True):
         """
         return next(iflatten(self.re(regex, replace_entities=replace_entities)), default)

-    def get(self):
+    def get(self, text=False, cleaner='auto',
+            guess_punct_space=True, guess_layout=True):
         """
         Serialize and return the matched nodes in a single unicode string.
         Percent encoded content is unquoted.
-        """
+
+        When ``text`` is False (default), HTML or XML is extracted. Pass
+        ``text=True`` to extract text content (the html-text library is used).
+        The text extraction algorithm assumes that the document is an HTML
+        document, and uses HTML-specific rules.
+
+        The ``cleaner`` argument allows cleaning the HTML before extracting
+        the content. Allowed values:
+
+        * "auto" (default) - don't clean when text=False, clean with
+          options tuned for text extraction when text=True;
+        * "text" - clean with options tuned for text extraction: elements
+          like ``<script>`` and ``<style>`` are removed, cleaning options
+          are tuned for speed, assuming text extraction is the end goal;
+        * "html" - use the default ``lxml.html.clean.Cleaner``. This is useful
+          if you want to make .get() output more human-readable, but still
+          preserve HTML tags;
+        * None - don't clean, even when ``text=True``. Useful if you have
+          an already cleaned tree, e.g. after calling :meth:`Selector.cleaned`;
+        * custom ``lxml.html.clean.Cleaner`` objects are also supported.
+
+        The ``guess_punct_space`` and ``guess_layout`` options customize the
+        text extraction algorithm. By default, when ``text=True``,
+        parsel tries to insert newlines and blank lines as appropriate,
+        and to be smart about whitespace around inline tags,
+        so that the text output looks similar to a browser's.
+
+        Pass ``guess_punct_space=False`` to disable punctuation handling.
+        This option has no effect when ``text=False``.
+
+        Use ``guess_layout=False`` to avoid adding newlines - the content will
+        be a single line of text, with whitespace as separators.
+        This option has no effect when ``text=False``.
+        """
+        sel = self
+        if cleaner == 'auto':
+            if text:
+                sel = self.cleaned('text')
+        elif cleaner is not None:
+            sel = self.cleaned(cleaner)
+        tree = sel.root
+
+        if text:
+            return html_text.etree_to_text(tree,
+                                           guess_punct_space=guess_punct_space,
+                                           guess_layout=guess_layout)
+
         try:
-            return etree.tostring(self.root,
+            return etree.tostring(tree,
                                   method=self._tostring_method,
                                   encoding='unicode',
                                   with_tail=False)
         except (AttributeError, TypeError):
-            if self.root is True:
+            if tree is True:
                 return u'1'
-            elif self.root is False:
+            elif tree is False:
                 return u'0'
             else:
-                return six.text_type(self.root)
+                return six.text_type(tree)
     extract = get

-    def getall(self):
+    def getall(self, text=False, cleaner='auto',
+               guess_punct_space=True, guess_layout=True):
         """
-        Serialize and return the matched node in a 1-element list of unicode strings.
+        Serialize and return the matched node in a 1-element list of unicode
+        strings.
+
+        See :meth:`~.Selector.get` for options.
         """
-        return [self.get()]
+        return [self.get(
+            text=text,
+            cleaner=cleaner,
+            guess_punct_space=guess_punct_space,
+            guess_layout=guess_layout,
+        )]

     def register_namespace(self, prefix, uri):
         """
@@ -346,6 +428,30 @@ def attrib(self):
         """
         return dict(self.root.attrib)

+    def cleaned(self, cleaner='html'):
+        """
+        Return a copy of the Selector, with the underlying subtree cleaned.
+        Allowed values of the ``cleaner`` argument:
+
+        * "html" (default) - use the default ``lxml.html.clean.Cleaner``;
+        * "text" - clean with options tuned for text extraction: elements
+          like ``<script>`` and ``<style>`` are removed, cleaning options
+          are tuned for speed, assuming text extraction is the end goal;
+        * custom ``lxml.html.clean.Cleaner`` objects are also supported.
+        """
+        if isinstance(cleaner, six.string_types):
+            if cleaner not in {'html', 'text'}:
+                raise ValueError("cleaner must be 'html', 'text' or "

    [Inline review comment (Member, Author): There is one gotcha: this
    exception is raised in .get as well, but in .get there are two more
    accepted values: "auto" and None. Is it worth fixing?]

+                                 "an lxml.html.clean.Cleaner instance")
+            if cleaner == 'html':
+                cleaner = self._html_cleaner
+            elif cleaner == 'text':
+                cleaner = self._text_cleaner

    [Inline review comment (Member, Author): An alternative is to make these
    attributes public, and ask users to pass them: sel.cleaned(sel.TEXT_CLEANER)
    instead of sel.cleaned('text').]

+        root = cleaner.clean_html(self.root)
+        return self.__class__(root=root, _expr=self._expr,
+                              namespaces=self.namespaces,
+                              type=self.type)
+
     def __bool__(self):
         """
         Return ``True`` if there is any real content selected or ``False``
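Because cleaning happens inside .get() by default, calling .get(text=True) on many sub-selectors cleans each subtree separately, which can get expensive on large documents (this appears to be what the "document O(N^2) gotcha" commit above refers to). A hedged sketch of the pattern that ``cleaned()`` and ``cleaner=None`` enable, with a hypothetical input file name:

from lxml.html.clean import Cleaner
from parsel import Selector

with open("page.html") as f:          # hypothetical input file
    sel = Selector(text=f.read())

# Clean the whole tree once, with options tuned for text extraction...
cleaned = sel.cleaned('text')

# ...then extract text from many nodes without re-cleaning each subtree.
paragraphs = [p.get(text=True, cleaner=None) for p in cleaned.css('p')]

# A custom lxml Cleaner can be passed instead of 'html' or 'text'.
custom = Cleaner(scripts=True, style=True, comments=True)
no_cruft = sel.cleaned(custom)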
setup.py (2 additions, 1 deletion)

@@ -29,7 +29,8 @@ def has_environment_marker_platform_impl_support():
     'w3lib>=1.19.0',
     'lxml>=2.3',
     'six>=1.5.2',
-    'cssselect>=0.9'
+    'cssselect>=0.9',
+    'html-text>=0.4.1',
 ]
 extras_require = {}
