diff --git a/CHANGELOG.md b/CHANGELOG.md index 05feef69..d018db6c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +**v0.30.7** +* [[TeamMsgExtractor #239](https://github.com/TeamMsgExtractor/msg-extractor/issues/239)] Fixed msg.py not having `import pathlib`. +* After going through the details of the example MSG files provided with the module, specifically unicode.msg, I now am glad I decided to put in some fail-safes in the HTML body processing. One of them does not have an `<html>`, `<head>`, nor `<body>` tag, and so would have had an error. This will actually prevent the header from injecting properly as well, so a bit of validation before was made necessary to ensure the HTML saving would still work. +* Added new exception `BadHtmlError`. +* Added new function `utils.validateHtml`. +* Updated README credits. +* Changed header logic to generate manually if the header data has been stripped (filled with null bytes) and not just if the stream does not exist. + **v0.30.6** * Small adjustments to internal code to make it a bit better. * Added `Message.getSaveBody`, `Message.getSaveHtmlBody`, and `Message.getSaveRtfBody`. These three functions generate their respective bodies that will be used when saving the file, allowing you to retrieve the final products without having to write them to the disk first. All arguments that are passed to `Message.save` that would influence the respective bodies are passed to their respective functions. diff --git a/README.rst b/README.rst index 6f1b11d8..fd607750 100644 --- a/README.rst +++ b/README.rst @@ -199,6 +199,8 @@ Credits `Dean Malmgren`_ - First implementation of the setup.py script +`Seamus Tuohy`_ - Developer of the Python RTFDE module. Gave first examples of how to use the module. + `Liam`_ - Significant reorganization and transfer of data. And thank you to everyone who has opened an issue and helped us track down those pesky bugs. 
@@ -206,8 +208,8 @@ And thank you to everyone who has opened an issue and helped us track down those .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg :target: LICENSE.txt -.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.30.6-blue.svg - :target: https://pypi.org/project/extract-msg/0.30.6/ +.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.30.7-blue.svg + :target: https://pypi.org/project/extract-msg/0.30.7/ .. |PyPI2| image:: https://img.shields.io/badge/python-3.6+-brightgreen.svg :target: https://www.python.org/downloads/release/python-367/ @@ -218,4 +220,5 @@ And thank you to everyone who has opened an issue and helped us track down those .. _Dean Malmgren: https://github.com/deanmalmgren .. _Joel Kaufman: https://github.com/joelkaufman .. _Liam: https://github.com/LiamPM5 +.. _Seamus Tuohy: https://github.com/seamustuohy .. _Discord: https://discord.com/invite/B77McRmzdc diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py index 8ed8a24e..1f182076 100644 --- a/extract_msg/__init__.py +++ b/extract_msg/__init__.py @@ -27,8 +27,8 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. __author__ = 'Destiny Peterson & Matthew Walker' -__date__ = '2022-01-30' -__version__ = '0.30.6' +__date__ = '2022-01-31' +__version__ = '0.30.7' import logging diff --git a/extract_msg/exceptions.py b/extract_msg/exceptions.py index a3d26153..f1fba67c 100644 --- a/extract_msg/exceptions.py +++ b/extract_msg/exceptions.py @@ -12,6 +12,12 @@ logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) +class BadHtmlError(ValueError): + """ + HTML failed to pass validation. + """ + pass + class ConversionError(Exception): """ An error occured during type conversion. 
diff --git a/extract_msg/message.py b/extract_msg/message.py index 8dd35bda..da0a3e59 100644 --- a/extract_msg/message.py +++ b/extract_msg/message.py @@ -310,6 +310,9 @@ def getSaveHtmlBody(self, preparedHtml : bool = False, charset : str = 'utf-8', `None` or an empty string to not insert the tag (Default: 'utf-8'). :param **kwargs: Used to allow kwargs expansion in the save function. Arguments absorbed by this are simply ignored. + + :raises BadHtmlError: if :param preparedHtml: is False and the HTML + fails to validate. """ if self.htmlBody: # Inject the header into the data. @@ -328,15 +331,15 @@ def getSaveHtmlBody(self, preparedHtml : bool = False, charset : str = 'utf-8', tag = bs4.Tag(parser = bs, name = 'meta', attrs = tagAttrs, can_be_empty_element = True) # Add the tag to the head section. if bs.find('head'): - bs.find('head').insert(1, tag) + bs.find('head').insert(0, tag) else: # If we are here, the head doesn't exist, so let's add # it. if bs.find('html'): # This should always be true, but I want to be safe. 
head = bs4.Tag(parser = bs, name = 'head') - head.insert(1, tag) - bs.find('html').insert(1, head) + head.insert(0, tag) + bs.find('html').insert(0, head) data = bs.prettify('utf-8') diff --git a/extract_msg/message_base.py b/extract_msg/message_base.py index 7589d62f..cdb31868 100644 --- a/extract_msg/message_base.py +++ b/extract_msg/message_base.py @@ -327,7 +327,7 @@ def header(self): return self._header except AttributeError: headerText = self._getStringStream('__substg1.0_007D') - if headerText is not None: + if headerText: self._header = EmailParser().parsestr(headerText) self._header['date'] = self.date else: diff --git a/extract_msg/msg.py b/extract_msg/msg.py index e9582ecc..12de36ee 100644 --- a/extract_msg/msg.py +++ b/extract_msg/msg.py @@ -2,6 +2,7 @@ import copy import logging import os +import pathlib import zipfile import olefile diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 15946d8c..496b121d 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -16,12 +16,13 @@ import sys import zipfile +import bs4 import tzlocal from html import escape as htmlEscape from . import constants -from .exceptions import ConversionError, IncompatibleOptionsError, InvaildPropertyIdError, UnknownCodepageError, UnknownTypeError, UnrecognizedMSGTypeError, UnsupportedMSGTypeError +from .exceptions import BadHtmlError, ConversionError, IncompatibleOptionsError, InvaildPropertyIdError, UnknownCodepageError, UnknownTypeError, UnrecognizedMSGTypeError, UnsupportedMSGTypeError logger = logging.getLogger(__name__) @@ -282,9 +283,11 @@ def injectHtmlHeader(msgFile, prepared : bool = False) -> bytes: the HTML header injected into it. :param prepared: Determines whether to be using the standard HTML (False) or - the prepared HTML (True) body (Default: False). + the prepared HTML (True) body (Default: False). :raises AttributeError: if the correct HTML body cannot be acquired. 
+ :raises BadHtmlError: if :param prepared: is False and the HTML fails to + validate. """ if not hasattr(msgFile, 'htmlBody') or not msgFile.htmlBody: raise AttributeError('Cannot inject the HTML header without an HTML body attribute.') @@ -303,6 +306,77 @@ def injectHtmlHeader(msgFile, prepared : bool = False) -> bytes: else: body = msgFile.htmlBody + # Validate the HTML. + if not validateHtml(body): + # If we are not preparing the HTML body, then raise an + # exception. + if not prepared: + raise BadHtmlError('HTML body failed to pass validation.') + + # If we are here, then we need to do what we can to fix the HTML body. + # Unfortunately this gets complicated because of the various ways the + # body could be wrong. If only the <body> tag is missing, then we just + # need to insert it at the end and be done. If both the <html> and + # <body> tag are missing, we determine where to put the body tag (around + # everything if there is no <head> tag, otherwise at the end) and then + # wrap it all in the <html> tag. + parser = bs4.BeautifulSoup(body) + if not parser.find('html') and not parser.find('body'): + if parser.find('head') or parser.find('footer'): + # Create the parser we will be using for the corrections. + correctedHtml = bs4.BeautifulSoup(b'', features = 'html.parser') + htmlTag = correctedHtml.find('html') + + # Iterate over each of the direct descendants of the parser and + # add each to a new tag if they are not the head or footer. + bodyTag = parser.new_tag('body') + # What we are going to be doing will be causing some of the tags + # to be moved out of the parser, and so the iterator will end up + # pointing to the wrong place after that. To compensate we first + # create a tuple and iterate over that. + for tag in tuple(parser.children): + if tag.name.lower() in ('head', 'footer'): + correctedHtml.append(tag) + else: + bodyTag.append(tag) + + # All the tags should now be properly in the body, so let's + # insert it. 
+ if correctedHtml.find('head'): + correctedHtml.find('head').insert_after(bodyTag) + else: + correctedHtml.find('footer').insert_before(bodyTag) + else: + # If there is no , ,