Merge pull request #240 from TeamMsgExtractor/next-release

v0.30.7 (ignore the name of the commit before, that was a typo lol)
TeamMsgExtractor · Jan 31, 2022 · 6b5e458 · 6b5e458
2 parents d8ef417 + 4b86664
commit 6b5e458
Show file tree

Hide file tree

Showing 8 changed files with 115 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,11 @@
+**v0.30.7**
+* [[TeamMsgExtractor #239](https://github.com/TeamMsgExtractor/msg-extractor/issues/239)] Fixed msg.py not having `import pathlib`.
+* After going through the details of the example MSG files provided with the module, specifically unicode.msg, I am now glad I decided to put some fail-safes in the HTML body processing. One of the example files does not have an `<html>`, `<head>`, or `<body>` tag, and so would have caused an error. This would also prevent the header from being injected properly, so a bit of validation beforehand was necessary to ensure that saving the HTML would still work.
+* Added new exception `BadHtmlError`.
+* Added new function `utils.validateHtml`.
+* Updated README credits.
+* Changed header logic to generate the header manually if the header data has been stripped (filled with null bytes), and not just if the stream does not exist.
+
 **v0.30.6**
 * Small adjustments to internal code to make it a bit better.
 * Added `Message.getSaveBody`, `Message.getSaveHtmlBody`, and `Message.getSaveRtfBody`. These three functions generate their respective bodies that will be used when saving the file, allowing you to retrieve the final products without having to write them to the disk first. All arguments that are passed to `Message.save` that would influence the respective bodies are passed to their respective functions.

diff --git a/README.rst b/README.rst
@@ -199,15 +199,17 @@ Credits
 
 `Dean Malmgren`_ - First implementation of the setup.py script
 
+`Seamus Tuohy`_ - Developer of the Python RTFDE module. Gave first examples of how to use the module.
+
 `Liam`_ - Significant reorganization and transfer of data.
 
 And thank you to everyone who has opened an issue and helped us track down those pesky bugs.
 
 .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg
    :target: LICENSE.txt
 
-.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.30.6-blue.svg
-   :target: https://pypi.org/project/extract-msg/0.30.6/
+.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.30.7-blue.svg
+   :target: https://pypi.org/project/extract-msg/0.30.7/
 
 .. |PyPI2| image:: https://img.shields.io/badge/python-3.6+-brightgreen.svg
    :target: https://www.python.org/downloads/release/python-367/
@@ -218,4 +220,5 @@ And thank you to everyone who has opened an issue and helped us track down those
 .. _Dean Malmgren: https://github.com/deanmalmgren
 .. _Joel Kaufman: https://github.com/joelkaufman
 .. _Liam: https://github.com/LiamPM5
+.. _Seamus Tuohy: https://github.com/seamustuohy
 .. _Discord: https://discord.com/invite/B77McRmzdc
diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py
@@ -27,8 +27,8 @@
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 __author__ = 'Destiny Peterson & Matthew Walker'
-__date__ = '2022-01-30'
-__version__ = '0.30.6'
+__date__ = '2022-01-31'
+__version__ = '0.30.7'
 
 import logging
 

diff --git a/extract_msg/exceptions.py b/extract_msg/exceptions.py
@@ -12,6 +12,12 @@
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
 
+class BadHtmlError(ValueError):
+    """
+    HTML failed to pass validation.
+    """
+    pass
+
 class ConversionError(Exception):
     """
     An error occurred during type conversion.

diff --git a/extract_msg/message.py b/extract_msg/message.py
@@ -310,6 +310,9 @@ def getSaveHtmlBody(self, preparedHtml : bool = False, charset : str = 'utf-8',
             `None` or an empty string to not insert the tag (Default: 'utf-8').
         :param **kwargs: Used to allow kwargs expansion in the save function.
             Arguments absorbed by this are simply ignored.
+
+        :raises BadHtmlError: if :param preparedHtml: is False and the HTML
+            fails to validate.
         """
         if self.htmlBody:
             # Inject the header into the data.
@@ -328,15 +331,15 @@ def getSaveHtmlBody(self, preparedHtml : bool = False, charset : str = 'utf-8',
                     tag = bs4.Tag(parser = bs, name = 'meta', attrs = tagAttrs, can_be_empty_element = True)
                     # Add the tag to the head section.
                     if bs.find('head'):
-                        bs.find('head').insert(1, tag)
+                        bs.find('head').insert(0, tag)
                     else:
                         # If we are here, the head doesn't exist, so let's add
                         # it.
                         if bs.find('html'):
                             # This should always be true, but I want to be safe.
                             head = bs4.Tag(parser = bs, name = 'head')
-                            head.insert(1, tag)
-                            bs.find('html').insert(1, head)
+                            head.insert(0, tag)
+                            bs.find('html').insert(0, head)
 
                     data = bs.prettify('utf-8')
 

diff --git a/extract_msg/message_base.py b/extract_msg/message_base.py
@@ -327,7 +327,7 @@ def header(self):
             return self._header
         except AttributeError:
             headerText = self._getStringStream('__substg1.0_007D')
-            if headerText is not None:
+            if headerText:
                 self._header = EmailParser().parsestr(headerText)
                 self._header['date'] = self.date
             else:

diff --git a/extract_msg/msg.py b/extract_msg/msg.py
@@ -2,6 +2,7 @@
 import copy
 import logging
 import os
+import pathlib
 import zipfile
 
 import olefile

diff --git a/extract_msg/utils.py b/extract_msg/utils.py
@@ -16,12 +16,13 @@
 import sys
 import zipfile
 
+import bs4
 import tzlocal
 
 from html import escape as htmlEscape
 
 from . import constants
-from .exceptions import ConversionError, IncompatibleOptionsError, InvaildPropertyIdError, UnknownCodepageError, UnknownTypeError, UnrecognizedMSGTypeError, UnsupportedMSGTypeError
+from .exceptions import BadHtmlError, ConversionError, IncompatibleOptionsError, InvaildPropertyIdError, UnknownCodepageError, UnknownTypeError, UnrecognizedMSGTypeError, UnsupportedMSGTypeError
 
 
 logger = logging.getLogger(__name__)
@@ -282,9 +283,11 @@ def injectHtmlHeader(msgFile, prepared : bool = False) -> bytes:
     the HTML header injected into it.
 
     :param prepared: Determines whether to be using the standard HTML (False) or
-                     the prepared HTML (True) body (Default: False).
+        the prepared HTML (True) body (Default: False).
 
     :raises AttributeError: if the correct HTML body cannot be acquired.
+    :raises BadHtmlError: if :param prepared: is False and the HTML fails to
+        validate.
     """
     if not hasattr(msgFile, 'htmlBody') or not msgFile.htmlBody:
         raise AttributeError('Cannot inject the HTML header without an HTML body attribute.')
@@ -303,6 +306,77 @@ def injectHtmlHeader(msgFile, prepared : bool = False) -> bytes:
     else:
         body = msgFile.htmlBody
 
+    # Validate the HTML.
+    if not validateHtml(body):
+        # If we are not preparing the HTML body, then raise an
+        # exception.
+        if not prepared:
+            raise BadHtmlError('HTML body failed to pass validation.')
+
+        # If we are here, then we need to do what we can to fix the HTML body.
+        # Unfortunately this gets complicated because of the various ways the
+        # body could be wrong. If only the <body> tag is missing, then we just
+        # need to insert it at the end and be done. If both the <html> and
+        # <body> tag are missing, we determine where to put the body tag (around
+        # everything if there is no <head> tag, otherwise at the end) and then
+        # wrap it all in the <html> tag.
+        parser = bs4.BeautifulSoup(body)
+        if not parser.find('html') and not parser.find('body'):
+            if parser.find('head') or parser.find('footer'):
+                # Create the parser we will be using for the corrections.
+                correctedHtml = bs4.BeautifulSoup(b'<html></html>', features = 'html.parser')
+                htmlTag = correctedHtml.find('html')
+
+                # Iterate over each of the direct descendants of the parser and
+                # add each to a new tag if they are not the head or footer.
+                bodyTag = parser.new_tag('body')
+                # What we are going to be doing will be causing some of the tags
+                # to be moved out of the parser, and so the iterator will end up
+                # pointing to the wrong place after that. To compensate we first
+                # create a tuple and iterate over that.
+                for tag in tuple(parser.children):
+                    if tag.name.lower() in ('head', 'footer'):
+                        correctedHtml.append(tag)
+                    else:
+                        bodyTag.append(tag)
+
+                # All the tags should now be properly in the body, so let's
+                # insert it.
+                if correctedHtml.find('head'):
+                    correctedHtml.find('head').insert_after(bodyTag)
+                else:
+                    correctedHtml.find('footer').insert_before(bodyTag)
+            else:
+                # If there is no <html>, <head>, <footer>, or <body> tag, then
+                # we just add the tags to the beginning and end of the data and
+                # move on.
+                body = b'<html><body>' + body + b'</body></html>'
+        elif parser.find('html'):
+            # Found <html> but not <body>.
+            # Iterate over each of the direct descendants of the parser and
+            # add each to a new tag if they are not the head or footer.
+            bodyTag = parser.new_tag('body')
+            # What we are going to be doing will be causing some of the tags
+            # to be moved out of the parser, and so the iterator will end up
+            # pointing to the wrong place after that. To compensate we first
+            # create a tuple and iterate over that.
+            for tag in tuple(parser.find('html').children):
+                if tag.name.lower() not in ('head', 'footer'):
+                    bodyTag.append(tag)
+
+            # All the tags should now be properly in the body, so let's
+            # insert it.
+            if parser.find('head'):
+                parser.find('head').insert_after(bodyTag)
+            elif parser.find('footer'):
+                parser.find('footer').insert_before(bodyTag)
+            else:
+                parser.find('html').insert(0, bodyTag)
+        else:
+            # Found <body> but not <html>. Just wrap everything in the <html>
+            # tags.
+            body = b'<html>' + body + b'</html>'
+
     def replace(bodyMarker):
         """
         Internal function to replace the body tag with itself plus the header.
@@ -836,6 +910,16 @@ def setupLogging(defaultPath = None, defaultLevel = logging.WARN, logfile = None
     logging.getLogger().setLevel(defaultLevel)
     return True
 
+def validateHtml(html : bytes) -> bool:
+    """
+    Checks whether the HTML is considered valid. To be valid, the HTML must, at
+    minimum, contain both an <html> tag and a <body> tag.
+    """
+    bs = bs4.BeautifulSoup(html, 'html.parser')
+    if not bs.find('html') or not bs.find('body'):
+        return False
+    return True
+
 def verifyPropertyId(id : str) -> None:
     """
     Determines whether a property ID is valid for certain functions. Property