diff --git a/CHANGELOG.md b/CHANGELOG.md index 42f63a3e..00bb4278 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +**v0.41.3** +* [[TeamMsgExtractor #365](https://github.com/TeamMsgExtractor/msg-extractor/issues/365)] Fixed an issue that would cause certain values retrieved from the header to not be decoded properly. It does this when retrieving the values, so nothing about the header has been changed. +* Added new property `MessageBase.headerText` which is the text content of the header stream. Adjusted other things to use this instead of trying to retrieve the stream directly in multiple places. +* Added typing to `MessageBase.header`. + **v0.41.2** * Updated annotations on `MessageBase.save`. * Added new enum `BodyTypes`. diff --git a/README.rst b/README.rst index a83f9b45..5516c8a7 100644 --- a/README.rst +++ b/README.rst @@ -250,8 +250,8 @@ your access to the newest major version of extract-msg. .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg :target: LICENSE.txt -.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.41.2-blue.svg - :target: https://pypi.org/project/extract-msg/0.41.2/ +.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.41.3-blue.svg + :target: https://pypi.org/project/extract-msg/0.41.3/ .. |PyPI2| image:: https://img.shields.io/badge/python-3.8+-brightgreen.svg :target: https://www.python.org/downloads/release/python-3816/ diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py index d99c9906..0021ce89 100644 --- a/extract_msg/__init__.py +++ b/extract_msg/__init__.py @@ -27,8 +27,8 @@ # along with this program. If not, see . 
__author__ = 'Destiny Peterson & Matthew Walker' -__date__ = '2023-05-24' -__version__ = '0.41.2' +__date__ = '2023-06-10' +__version__ = '0.41.3' __all__ = [ # Modules: diff --git a/extract_msg/message_base.py b/extract_msg/message_base.py index 4dfbf785..e9014bca 100644 --- a/extract_msg/message_base.py +++ b/extract_msg/message_base.py @@ -39,9 +39,9 @@ from .structures.report_tag import ReportTag from .recipient import Recipient from .utils import ( - addNumToDir, addNumToZipDir, createZipOpen, findWk, htmlSanitize, - inputToBytes, inputToString, isEncapsulatedRtf, prepareFilename, - rtfSanitizeHtml, rtfSanitizePlain, validateHtml + addNumToDir, addNumToZipDir, createZipOpen, decodeRfc2047, findWk, + htmlSanitize, inputToBytes, inputToString, isEncapsulatedRtf, + prepareFilename, rtfSanitizeHtml, rtfSanitizePlain, validateHtml ) from imapclient.imapclient import decode_utf7 @@ -135,6 +135,7 @@ def _genRecipient(self, recipientType, recipientInt : RecipientType) -> Optional if self.headerInit(): value = self.header[recipientType] if value: + value = decodeRfc2047(value) value = value.replace(',', self.__recipientSeparator) # If the header had a blank field or didn't have the field, generate @@ -792,7 +793,7 @@ def save(self, **kwargs) -> MessageBase: # If the user has requested the headers for this file, save it now. if kwargs.get('saveHeader', False): - headerText = self._getStringStream('__substg1.0_007D') + headerText = self.headerText if not headerText: headerText = constants.HEADER_FORMAT.format(subject = self.subject, **self.header) @@ -1047,7 +1048,7 @@ def detectedBodies(self) -> BodyTypes: return bodies @property - def header(self): + def header(self) -> email.message.Message: """ Returns the message header, if it exists. Otherwise it will generate one. 
def decodeRfc2047(encoded : str) -> str:
    """
    Decodes text encoded using the method specified in RFC 2047.

    :param encoded: The header value to decode. May contain any mix of RFC
        2047 encoded words and plain text, or no encoded words at all.

    :returns: The decoded text, with all parts joined together in order.
    """
    # decode_header returns a list of (value, charset) tuples, so we decode
    # each one and join them together.
    #
    # If the input contains no encoded words, decode_header returns the value
    # as a string instead of bytes, so that value must be passed through
    # unchanged (taking care to use the value itself and not the whole tuple).
    #
    # Parts with no charset are the unencoded sections of the header, which
    # are decoded as ascii.
    return ''.join(
        value.decode(charset or 'ascii') if isinstance(value, bytes) else value
        for value, charset in email.header.decode_header(encoded)
    )