From 761001c1332170239b6f6367bd5d54c9a96f9589 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 16 Jan 2022 15:20:47 -0800 Subject: [PATCH 01/17] Started process of removing Python 2 compatability --- extract_msg/__main__.py | 4 ++-- extract_msg/attachment.py | 4 ++-- extract_msg/dev.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/extract_msg/__main__.py b/extract_msg/__main__.py index 57f7e226..f587a59d 100644 --- a/extract_msg/__main__.py +++ b/extract_msg/__main__.py @@ -1,9 +1,9 @@ import logging +import os import sys import traceback from extract_msg import __doc__, utils -from extract_msg.compat import os_ as os from extract_msg.message import Message def main(): @@ -12,7 +12,7 @@ def main(): args = utils.getCommandArgs(sys.argv[1:]) level = logging.INFO if args.verbose else logging.WARNING - currentdir = os.getcwdu() # Store this just in case the paths that have been given are relative + currentdir = os.getcwd() # Store this just in case the paths that have been given are relative if args.out_path: if not os.path.exists(args.out_path): os.makedirs(args.out_path) diff --git a/extract_msg/attachment.py b/extract_msg/attachment.py index 7d930872..4d1a81c1 100644 --- a/extract_msg/attachment.py +++ b/extract_msg/attachment.py @@ -1,11 +1,11 @@ import logging +import os import random import string import zipfile from . import constants from .attachment_base import AttachmentBase -from .compat import os_ as os from .named import NamedAttachmentProperties from .prop import FixedLengthProp, VariableLengthProp from .properties import Properties @@ -150,7 +150,7 @@ def save(self, **kwargs): # Zip files use w for writing in binary. mode = 'w' else: - customPath = os.path.abspath(kwargs.get('customPath', os.getcwdu())).replace('\\', '/') + customPath = os.path.abspath(kwargs.get('customPath', os.getcwd())).replace('\\', '/') # Prepare the path. 
customPath += '' if customPath.endswith('/') else '/' mode = 'wb' diff --git a/extract_msg/dev.py b/extract_msg/dev.py index 5973fa16..783570cc 100644 --- a/extract_msg/dev.py +++ b/extract_msg/dev.py @@ -10,10 +10,10 @@ import logging +import os from . import dev_classes from . import utils -from .compat import os_ as os from .message import Message From 2f03095a5384cd50ad3b8abe561e38409fa90fdc Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 16 Jan 2022 15:34:46 -0800 Subject: [PATCH 02/17] Progress on removing compatability (untested) --- .travis.yml | 3 +- README.rst | 19 ++----- extract_msg/__init__.py | 4 +- extract_msg/__main__.py | 2 +- extract_msg/attachment.py | 2 +- extract_msg/compat/__init__.py | 0 extract_msg/compat/os_.py | 9 ---- extract_msg/constants.py | 8 --- extract_msg/dev.py | 2 +- extract_msg/message.py | 10 ++-- extract_msg/message_base.py | 2 +- extract_msg/msg.py | 6 +-- extract_msg/utils.py | 97 +++++++++++----------------------- 13 files changed, 49 insertions(+), 115 deletions(-) delete mode 100644 extract_msg/compat/__init__.py delete mode 100644 extract_msg/compat/os_.py diff --git a/.travis.yml b/.travis.yml index 41ba8bf1..2293c0f9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,6 @@ language: python python: - - "2.7" - - "3.5" + - "3.6" install: - python setup.py install script: diff --git a/README.rst b/README.rst index 02bf1848..ce44b51d 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -|License: GPL v3| |PyPI3| |PyPI1| |PyPI2| +|License: GPL v3| |PyPI3| |PyPI2| msg-extractor ============= @@ -10,11 +10,6 @@ data (from, to, cc, date, subject, body) and the email's attachments. NOTICE ====== -0.29.* will be the last versions that will support Python 2. While we want to -continue to support it, it would just be too much work to do so. We are -providing notice ahead of time of this change so that you may sort out your -Python environments ahead of time. 
- This module has a Discord server for general discussion. You can find it here: `Discord`_ @@ -40,8 +35,7 @@ attachments. The script uses Philippe Lagadec's Python module that reads Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format). This is the underlying format of -Outlook's .msg files. This library currently supports up to Python 2.7 -and 3.4. +Outlook's .msg files. This library currently supports Python 3.6 and above. The script was built using Peter Fiskerstrand's documentation of the .msg format. Redemption's discussion of the different property types @@ -116,11 +110,8 @@ where ``CustomAttachmentClass`` is your custom class. #TODO: Finish this section -If you have any questions feel free to contact me, Matthew Walker, at -mattgwwalker at gmail.com. NOTE: Due to time constraints, The Elemental -of Destruction has been added as a contributor to help manage the project. -As such, it may be helpful to send emails to arceusthe@gmail.com as -well. +If you have any questions feel free to contact me, Destiny, as arceusthe [at] +gmail [dot] com. I am the co-owner and main developer of the project. If you have issues, it would be best to get help for them by opening a new github issue. @@ -200,8 +191,6 @@ And thank you to everyone who has opened an issue and helped us track down those .. |PyPI3| image:: https://img.shields.io/badge/pypi-0.29.0-blue.svg :target: https://pypi.org/project/extract-msg/0.29.0/ -.. |PyPI1| image:: https://img.shields.io/badge/python-2.7+-brightgreen.svg - :target: https://www.python.org/downloads/release/python-2715/ .. |PyPI2| image:: https://img.shields.io/badge/python-3.6+-brightgreen.svg :target: https://www.python.org/downloads/release/python-367/ .. 
_Matthew Walker: https://github.com/mattgwwalker diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py index 2e35062f..8de35773 100644 --- a/extract_msg/__init__.py +++ b/extract_msg/__init__.py @@ -27,8 +27,8 @@ # along with this program. If not, see . __author__ = 'Destiny Peterson & Matthew Walker' -__date__ = '2022-01-13' -__version__ = '0.29.0' +__date__ = '2022-01-16' +__version__ = '0.30.0' import logging diff --git a/extract_msg/__main__.py b/extract_msg/__main__.py index f587a59d..229e8ba1 100644 --- a/extract_msg/__main__.py +++ b/extract_msg/__main__.py @@ -36,7 +36,7 @@ def main(): print('These results have been saved to {}'.format(filename)) with open(filename, 'w') as fil: fil.write(json.dumps(valResults)) - utils.getInput('Press enter to exit...') + input('Press enter to exit...') else: if not args.dump_stdout: utils.setupLogging(args.config_path, level, args.log, args.file_logging) diff --git a/extract_msg/attachment.py b/extract_msg/attachment.py index 4d1a81c1..82e891d7 100644 --- a/extract_msg/attachment.py +++ b/extract_msg/attachment.py @@ -135,7 +135,7 @@ def save(self, **kwargs): # ZipFile handling. if zip: # If we are doing a zip file, first check that we have been given a path. - if isinstance(zip, constants.STRING): + if isinstance(zip, str): # If we have a path then we use the zip file. 
zip = zipfile.ZipFile(zip, 'a', zipfile.ZIP_DEFLATED) kwargs['zip'] = zip diff --git a/extract_msg/compat/__init__.py b/extract_msg/compat/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/extract_msg/compat/os_.py b/extract_msg/compat/os_.py deleted file mode 100644 index 77a4b274..00000000 --- a/extract_msg/compat/os_.py +++ /dev/null @@ -1,9 +0,0 @@ -""" -Compatibility module to ensure that certain functions exist across python versions -""" - -from os import * -import sys - -if sys.version_info[0] >= 3: - getcwdu = getcwd diff --git a/extract_msg/constants.py b/extract_msg/constants.py index e99244b8..c2af094b 100644 --- a/extract_msg/constants.py +++ b/extract_msg/constants.py @@ -12,14 +12,6 @@ import ebcdic -if sys.version_info[0] >= 3: - BYTES = bytes - STRING = str -else: - BYTES = str - STRING = unicode - - # DEFINE CONSTANTS # WARNING DO NOT CHANGE ANY OF THESE VALUES UNLESS YOU KNOW # WHAT YOU ARE DOING! FAILURE TO FOLLOW THIS INSTRUCTION diff --git a/extract_msg/dev.py b/extract_msg/dev.py index 783570cc..5534f245 100644 --- a/extract_msg/dev.py +++ b/extract_msg/dev.py @@ -34,7 +34,7 @@ def main(args, argv): function. """ setupDevLogger(args.config_path, args.log) - currentdir = os.getcwdu() # Store this just in case the paths that have been given are relative + currentdir = os.getcwd() # Store this just in case the paths that have been given are relative if args.out_path: if not os.path.exists(args.out_path): os.makedirs(args.out_path) diff --git a/extract_msg/message.py b/extract_msg/message.py index 02fd323a..fecb766c 100644 --- a/extract_msg/message.py +++ b/extract_msg/message.py @@ -1,15 +1,15 @@ import json import logging +import os import zipfile from imapclient.imapclient import decode_utf7 from . 
import constants from .attachment import Attachment -from .compat import os_ as os from .exceptions import DataNotFoundError, IncompatibleOptionsError from .message_base import MessageBase -from .utils import addNumToDir, addNumToZipDir, injectHtmlHeader, injectRtfHeader, inputToBytes, inputToString, makeDirs, prepareFilename +from .utils import addNumToDir, addNumToZipDir, injectHtmlHeader, injectRtfHeader, inputToBytes, inputToString, prepareFilename logger = logging.getLogger(__name__) @@ -116,7 +116,7 @@ def save(self, **kwargs): if raw: raise IncompatibleOptionsError('The options `raw` and `zip` are incompatible.') # If we are doing a zip file, first check that we have been given a path. - if isinstance(_zip, constants.STRING): + if isinstance(_zip, str): # If we have a path then we use the zip file. _zip = zipfile.ZipFile(_zip, 'a', zipfile.ZIP_DEFLATED) kwargs['zip'] = _zip @@ -131,7 +131,7 @@ def save(self, **kwargs): # Zip files use w for writing in binary. mode = 'w' else: - path = os.path.abspath(kwargs.get('customPath', os.getcwdu())).replace('\\', '/') + path = os.path.abspath(kwargs.get('customPath', os.getcwd())).replace('\\', '/') # Prepare the path. path += '/' if path[-1] != '/' else '' mode = 'wb' @@ -177,7 +177,7 @@ def save(self, **kwargs): # Create the folders. if not zip: try: - makeDirs(path) + os.makedirs(path) except Exception: newDirName = addNumToDir(path) if newDirName: diff --git a/extract_msg/message_base.py b/extract_msg/message_base.py index f05aa8e1..ba3a9e78 100644 --- a/extract_msg/message_base.py +++ b/extract_msg/message_base.py @@ -1,12 +1,12 @@ import email.utils import logging +import os import re import compressed_rtf from . 
import constants from .attachment import Attachment, BrokenAttachment, UnsupportedAttachment -from .compat import os_ as os from .exceptions import UnrecognizedMSGTypeError from .msg import MSGFile from .recipient import Recipient diff --git a/extract_msg/msg.py b/extract_msg/msg.py index 477d6b95..312a262a 100644 --- a/extract_msg/msg.py +++ b/extract_msg/msg.py @@ -1,6 +1,7 @@ import codecs import copy import logging +import os import sys import zipfile @@ -8,11 +9,10 @@ from . import constants from .attachment import Attachment -from .compat import os_ as os from .named import Named from .prop import FixedLengthProp, VariableLengthProp from .properties import Properties -from .utils import divide, getEncodingName, hasLen, inputToMsgpath, inputToString, makeDirs, msgpathToString, parseType, properHex, verifyPropertyId, verifyType, windowsUnicode +from .utils import divide, getEncodingName, hasLen, inputToMsgpath, inputToString, msgpathToString, parseType, properHex, verifyPropertyId, verifyType, windowsUnicode from .exceptions import InvalidFileFormatError, MissingEncodingError @@ -381,7 +381,7 @@ def saveRaw(self, path): path = path.replace('\\', '/') path += '/' if path[-1] != '/' else '' # Make the location - makeDirs(path, exist_ok = True) + os.makedirs(path, exist_ok = True) # Create the zipfile path += 'raw.zip' if os.path.exists(path): diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 6e325937..cfc14098 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -9,77 +9,22 @@ import json import logging import logging.config +import os import struct import sys import tzlocal +from html import escape as htmlEscape + from . 
import constants -from .compat import os_ as os from .exceptions import ConversionError, IncompatibleOptionsError, InvaildPropertyIdError, UnknownCodepageError, UnknownTypeError, UnrecognizedMSGTypeError, UnsupportedMSGTypeError + logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) logging.addLevelName(5, 'DEVELOPER') -if sys.version_info[0] >= 3: # Python 3 - getInput = input - - makeDirs = os.makedirs - - def properHex(inp, length = 0): - """ - Taken (with permission) from https://github.com/TheElementalOfDestruction/creatorUtils - """ - a = '' - if isinstance(inp, str): - a = ''.join([hex(ord(inp[x]))[2:].rjust(2, '0') for x in range(len(inp))]) - elif isinstance(inp, bytes): - a = inp.hex() - elif isinstance(inp, int): - a = hex(inp)[2:] - if len(a) % 2 != 0: - a = '0' + a - return a.rjust(length, '0').upper() - - def windowsUnicode(string): - return str(string, 'utf-16-le') if string is not None else None - - from html import escape as htmlEscape - -else: # Python 2 - getInput = raw_input - - def makeDirs(name, mode = 0o0777, exist_ok = False): - try: - os.makedirs(name, mode) - except WindowsError as e: - if exist_ok and e.winerror == 183: # Path exists. - return - raise - - def properHex(inp, length = 0): - """ - Converts the input into a hexadecimal string without the beginning "0x". The string - will also always have a length that is a multiple of 2 (unless :param length: has - been specified). :param length: only specifies the MINIMUM length that the string - will use. 
- """ - a = '' - if isinstance(inp, (str, unicode)): - a = ''.join([hex(ord(inp[x]))[2:].rjust(2, '0') for x in range(len(inp))]) - elif isinstance(inp, int): - a = hex(inp)[2:] - elif isinstance(inp, long): - a = hex(inp)[2:-1] - if len(a) % 2 != 0: - a = '0' + a - return a.rjust(length, '0').upper() - - def windowsUnicode(string): - return unicode(string, 'utf-16-le') if string is not None else None - - from cgi import escape as htmlEscape def addNumToDir(dirName): """ @@ -88,7 +33,7 @@ def addNumToDir(dirName): for i in range(2, 100): try: newDirName = dirName + ' (' + str(i) + ')' - makeDirs(newDirName) + os.makedirs(newDirName) return newDirName except Exception as e: pass @@ -440,14 +385,14 @@ def replace(bodyMarker): raise Exception('All injection attempts failed.') def inputToBytes(stringInputVar, encoding): - if isinstance(stringInputVar, constants.BYTES): + if isinstance(stringInputVar, bytes): return stringInputVar - elif isinstance(stringInputVar, constants.STRING): + elif isinstance(stringInputVar, str): return stringInputVar.encode(encoding) elif stringInputVar is None: return b'' else: - raise ConversionError('Cannot convert to BYTES type') + raise ConversionError('Cannot convert to bytes.') def inputToMsgpath(inp): """ @@ -459,14 +404,14 @@ def inputToMsgpath(inp): return ret if ret[0] != '' else [] def inputToString(bytesInputVar, encoding): - if isinstance(bytesInputVar, constants.STRING): + if isinstance(bytesInputVar, str): return bytesInputVar - elif isinstance(bytesInputVar, constants.BYTES): + elif isinstance(bytesInputVar, bytes): return bytesInputVar.decode(encoding) elif bytesInputVar is None: return '' else: - raise ConversionError('Cannot convert to STRING type') + raise ConversionError('Cannot convert to str type.') def isEncapsulatedRtf(inp): """ @@ -702,6 +647,21 @@ def prepareFilename(filename): # I would use re here, but it tested to be slightly slower than this. 
return ''.join(i for i in filename if i not in r'\/:*?"<>|' + '\x00') +def properHex(inp, length = 0): + """ + Taken (with permission) from https://github.com/TheElementalOfDestruction/creatorUtils + """ + a = '' + if isinstance(inp, str): + a = ''.join([hex(ord(inp[x]))[2:].rjust(2, '0') for x in range(len(inp))]) + elif isinstance(inp, bytes): + a = inp.hex() + elif isinstance(inp, int): + a = hex(inp)[2:] + if len(a) % 2 != 0: + a = '0' + a + return a.rjust(length, '0').upper() + def roundUp(inp, mult): """ Rounds :param inp: up to the nearest multiple of :param mult:. @@ -772,7 +732,7 @@ def setupLogging(defaultPath=None, defaultLevel=logging.WARN, logfile=None, enab os.path.expandvars(logfile if logfile else config['handlers'][x]['filename'])) tmp = getContFileDir(tmp) if not os.path.exists(tmp): - makeDirs(tmp) + os.makedirs(tmp) else: config['handlers'][x]['filename'] = null @@ -804,3 +764,6 @@ def verifyType(_type): if _type is not None: if (_type not in constants.VARIABLE_LENGTH_PROPS_STRING) and (_type not in constants.FIXED_LENGTH_PROPS_STRING): raise UnknownTypeError('Unknown type {}'.format(_type)) + +def windowsUnicode(string): + return str(string, 'utf-16-le') if string is not None else None From 11120bcd4f363c681bb897909d2eda1eaa13ddad Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 16 Jan 2022 16:38:01 -0800 Subject: [PATCH 03/17] Update to match changes to 0.29.1 --- CHANGELOG.md | 7 +++++++ README.rst | 4 ++-- extract_msg/attachment.py | 2 +- extract_msg/constants.py | 11 +++++++++++ extract_msg/message.py | 12 +++++++++++- 5 files changed, 32 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef69de0a..6e77b7d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +**v0.30.0** +* Removed all support for Python 2. This caused a lot of things to be moved around and changed from indirect references to direct references, so it's possible something fell through the cracks. 
I'm doing my best to test it, but let me know if you have an issue. +* Converted much of the path nonsense to use `pathlib` so the code is smaller and more reliable. This does have a slight speed penalty, but given that the logic is right next to read/write operations the penalty is negligable. + +**v0.29.1** +* [[TeamMsgExtractor #198](https://github.com/TeamMsgExtractor/msg-extractor/issues/198)] Added a feature to save the header in it's own file (prefers the full raw header if it can find it, otherwise puts in a generated one) that was actually supposed to be in v0.29.0 but I forgot, lol. + **v0.29.0** * [[TeamMsgExtractor #207](https://github.com/TeamMsgExtractor/msg-extractor/issues/207)] Made it so that unspecified dates are handled properly. For clarification, an unspecified date is a custom value in MSG files for dates that means that the date is unspecified. It is distinctly different from a property not existing, which will still return None. For unspecified dates, `datetime.datetime.max` is returned. While perhaps not the best solution, it will have to do for now. * Fixed an issue where `utils.parseType` was returning a string for the date when it makes more sense to return an actual datetime instance. diff --git a/README.rst b/README.rst index ce44b51d..583a44f7 100644 --- a/README.rst +++ b/README.rst @@ -188,8 +188,8 @@ And thank you to everyone who has opened an issue and helped us track down those .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg :target: LICENSE.txt -.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.29.0-blue.svg - :target: https://pypi.org/project/extract-msg/0.29.0/ +.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.30.0-blue.svg + :target: https://pypi.org/project/extract-msg/0.30.0/ .. 
|PyPI2| image:: https://img.shields.io/badge/python-3.6+-brightgreen.svg :target: https://www.python.org/downloads/release/python-367/ diff --git a/extract_msg/attachment.py b/extract_msg/attachment.py index 82e891d7..9b4c6947 100644 --- a/extract_msg/attachment.py +++ b/extract_msg/attachment.py @@ -135,7 +135,7 @@ def save(self, **kwargs): # ZipFile handling. if zip: # If we are doing a zip file, first check that we have been given a path. - if isinstance(zip, str): + if isinstance(zip, (str, pathlib.Path)): # If we have a path then we use the zip file. zip = zipfile.ZipFile(zip, 'a', zipfile.ZIP_DEFLATED) kwargs['zip'] = zip diff --git a/extract_msg/constants.py b/extract_msg/constants.py index c2af094b..b6b61c67 100644 --- a/extract_msg/constants.py +++ b/extract_msg/constants.py @@ -289,6 +289,17 @@ """.replace(' ', '').replace('\r', '').replace('\n', '') +# Used to format the header for saving only the header. +HEADER_FORMAT = """From: {From} +To: {To} +CC: {Cc} +Bcc: {Bcc} +Subject: {subject} +Date: {Date} +Message-ID: {Message-Id} +""" + + KNOWN_CLASS_TYPES = ( 'ipm.activity', 'ipm.appointment', diff --git a/extract_msg/message.py b/extract_msg/message.py index fecb766c..9f0259d2 100644 --- a/extract_msg/message.py +++ b/extract_msg/message.py @@ -1,6 +1,7 @@ import json import logging import os +import pathlib import zipfile from imapclient.imapclient import decode_utf7 @@ -116,7 +117,7 @@ def save(self, **kwargs): if raw: raise IncompatibleOptionsError('The options `raw` and `zip` are incompatible.') # If we are doing a zip file, first check that we have been given a path. - if isinstance(_zip, str): + if isinstance(_zip, (str, pathlib.Path)): # If we have a path then we use the zip file. _zip = zipfile.ZipFile(_zip, 'a', zipfile.ZIP_DEFLATED) kwargs['zip'] = _zip @@ -203,6 +204,15 @@ def save(self, **kwargs): self.saveRaw(path) return self + # If the user has requested the headers for this file, save it now. 
+ if kwargs.get('saveHeader', False): + headerText = self._getStringStream('__substg1.0_007D') + if not headerText: + headerText = constants.HEADER_FORMAT.format(subject = self.subject, **self.header) + + with _open(path + 'header.txt', mode) as f: + f.write(headerText.encode('utf-8')) + try: # Check whether we should be using HTML or RTF. fext = 'txt' From ee2d7cf7a6ba427ab59d9cc38bb473b474dc4440 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 16 Jan 2022 16:55:33 -0800 Subject: [PATCH 04/17] More compatability removal and changes to better syntax --- CHANGELOG.md | 2 ++ extract_msg/appointment.py | 2 +- extract_msg/attachment.py | 2 +- extract_msg/attachment_base.py | 3 +-- extract_msg/contact.py | 2 +- extract_msg/data.py | 3 +-- extract_msg/dev_classes/attachment.py | 3 +-- extract_msg/dev_classes/message.py | 2 +- extract_msg/message.py | 2 +- extract_msg/message_base.py | 4 ++-- extract_msg/msg.py | 2 +- extract_msg/named.py | 11 ++++------- extract_msg/prop.py | 3 +-- extract_msg/properties.py | 3 +-- extract_msg/recipient.py | 3 +-- 15 files changed, 20 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e77b7d6..080c7bca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ **v0.30.0** * Removed all support for Python 2. This caused a lot of things to be moved around and changed from indirect references to direct references, so it's possible something fell through the cracks. I'm doing my best to test it, but let me know if you have an issue. * Converted much of the path nonsense to use `pathlib` so the code is smaller and more reliable. This does have a slight speed penalty, but given that the logic is right next to read/write operations the penalty is negligable. +* Changed classes to now prefer super() over direct superclass initalization. +* Removed explicit object subclassing (it's implicit in Python 3 so we don't need it anymore.) 
**v0.29.1** * [[TeamMsgExtractor #198](https://github.com/TeamMsgExtractor/msg-extractor/issues/198)] Added a feature to save the header in it's own file (prefers the full raw header if it can find it, otherwise puts in a generated one) that was actually supposed to be in v0.29.0 but I forgot, lol. diff --git a/extract_msg/appointment.py b/extract_msg/appointment.py index 21106ded..c1e03814 100644 --- a/extract_msg/appointment.py +++ b/extract_msg/appointment.py @@ -9,7 +9,7 @@ class Appointment(MessageBase): """ def __init__(self, path, prefix = '', attachmentClass = Attachment, filename = None, delayAttachments = False, overrideEncoding = None, attachmentErrorBehavior = constants.ATTACHMENT_ERROR_THROW, recipientSeparator = ';'): - MessageBase.__init__(self, path, prefix, attachmentClass, filename, delayAttachments, overrideEncoding, attachmentErrorBehavior, recipientSeparator) + super().__init__(path, prefix, attachmentClass, filename, delayAttachments, overrideEncoding, attachmentErrorBehavior, recipientSeparator) @property def appointmentClassType(self): diff --git a/extract_msg/attachment.py b/extract_msg/attachment.py index 9b4c6947..8aca8262 100644 --- a/extract_msg/attachment.py +++ b/extract_msg/attachment.py @@ -28,7 +28,7 @@ def __init__(self, msg, dir_): :param msg: the Message instance that the attachment belongs to. :param dir_: the directory inside the msg file where the attachment is located. """ - AttachmentBase.__init__(self, msg, dir_) + super().__init__(msg, dir_) # Get attachment data if self.exists('__substg1.0_37010102'): diff --git a/extract_msg/attachment_base.py b/extract_msg/attachment_base.py index c839011c..93adfc5b 100644 --- a/extract_msg/attachment_base.py +++ b/extract_msg/attachment_base.py @@ -10,7 +10,7 @@ logger.addHandler(logging.NullHandler()) -class AttachmentBase(object): +class AttachmentBase: """ Stores the attachment data of a Message instance. 
Should the attachment be an embeded message, the @@ -23,7 +23,6 @@ def __init__(self, msg, dir_): :param msg: the Message instance that the attachment belongs to. :param dir_: the directory inside the msg file where the attachment is located. """ - object.__init__(self) self.__msg = msg self.__dir = dir_ self.__props = Properties(self._getStream('__properties_version1.0'), constants.TYPE_ATTACHMENT) diff --git a/extract_msg/contact.py b/extract_msg/contact.py index 5b4e776f..5bb95367 100644 --- a/extract_msg/contact.py +++ b/extract_msg/contact.py @@ -9,7 +9,7 @@ class Contact(MSGFile): """ def __init__(self, path, prefix = '', attachmentClass = Attachment, filename = None, overrideEncoding = None, attachmentErrorBehavior = constants.ATTACHMENT_ERROR_THROW): - MSGFile.__init__(self, path, prefix, attachmentClass, filename, overrideEncoding, attachmentErrorBehavior) + super().__init__(path, prefix, attachmentClass, filename, overrideEncoding, attachmentErrorBehavior) self.named @property diff --git a/extract_msg/data.py b/extract_msg/data.py index c161f023..bcbd73b5 100644 --- a/extract_msg/data.py +++ b/extract_msg/data.py @@ -5,9 +5,8 @@ from . import constants -class PermanentEntryID(object): +class PermanentEntryID: def __init__(self, data): - super(PermanentEntryID, self).__init__() self.__data = data unpacked = constants.STPEID.unpack(data[:28]) if unpacked[0] != 0: diff --git a/extract_msg/dev_classes/attachment.py b/extract_msg/dev_classes/attachment.py index 92bf984d..73d17923 100644 --- a/extract_msg/dev_classes/attachment.py +++ b/extract_msg/dev_classes/attachment.py @@ -8,7 +8,7 @@ logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) -class Attachment(object): +class Attachment: """ Developer version of the `extract_msg.attachment.Attachment` class. """ @@ -17,7 +17,6 @@ def __init__(self, msg, dir_): :param msg: the Message instance that the attachment belongs to. 
:param dir_: the directory inside the msg file where the attachment is located. """ - object.__init__(self) self.__msg = msg self.__dir = dir_ self.__props = Properties( diff --git a/extract_msg/dev_classes/message.py b/extract_msg/dev_classes/message.py index c400ba51..f705e9b9 100644 --- a/extract_msg/dev_classes/message.py +++ b/extract_msg/dev_classes/message.py @@ -27,7 +27,7 @@ def __init__(self, path, prefix='', filename=None): """ logger.log(5, 'prefix: {}'.format(prefix)) self.__path = path - olefile.OleFileIO.__init__(self, path) + super().__init__(path) prefixl = [] tmp_condition = prefix != '' if tmp_condition: diff --git a/extract_msg/message.py b/extract_msg/message.py index 9f0259d2..4c01d894 100644 --- a/extract_msg/message.py +++ b/extract_msg/message.py @@ -21,7 +21,7 @@ class Message(MessageBase): Parser for Microsoft Outlook message files. """ def __init__(self, path, prefix = '', attachmentClass = Attachment, filename = None, delayAttachments = False, overrideEncoding = None, attachmentErrorBehavior = constants.ATTACHMENT_ERROR_THROW, recipientSeparator = ';'): - MessageBase.__init__(self, path, prefix, attachmentClass, filename, delayAttachments, overrideEncoding, attachmentErrorBehavior, recipientSeparator) + super().__init__(path, prefix, attachmentClass, filename, delayAttachments, overrideEncoding, attachmentErrorBehavior, recipientSeparator) def dump(self): """ diff --git a/extract_msg/message_base.py b/extract_msg/message_base.py index ba3a9e78..fbaa5574 100644 --- a/extract_msg/message_base.py +++ b/extract_msg/message_base.py @@ -47,7 +47,7 @@ def __init__(self, path, prefix = '', attachmentClass = Attachment, filename = N :param recipientSeparator: Optional, Separator string to use between recipients. 
""" - MSGFile.__init__(self, path, prefix, attachmentClass, filename, overrideEncoding, attachmentErrorBehavior) + super().__init__(path, prefix, attachmentClass, filename, overrideEncoding, attachmentErrorBehavior) self.__attachmentsDelayed = delayAttachments self.__attachmentsReady = False self.__recipientSeparator = recipientSeparator @@ -128,7 +128,7 @@ def close(self): attachment.data.close() except AttributeError: pass - MSGFile.close(self) + super().close() def headerInit(self): """ diff --git a/extract_msg/msg.py b/extract_msg/msg.py index 312a262a..8c8cbfc9 100644 --- a/extract_msg/msg.py +++ b/extract_msg/msg.py @@ -50,7 +50,7 @@ def __init__(self, path, prefix = '', attachmentClass = Attachment, filename = N self.__overrideEncoding = overrideEncoding try: - olefile.OleFileIO.__init__(self, path) + super().__init__(path) except IOError as e: # py2 and py3 compatible logger.error(e) if str(e) == 'not an OLE2 structured storage file': diff --git a/extract_msg/named.py b/extract_msg/named.py index 7cdf0a2c..11319eba 100644 --- a/extract_msg/named.py +++ b/extract_msg/named.py @@ -9,10 +9,9 @@ logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) -class Named(object): +class Named: __dir = '__nameid_version1.0' def __init__(self, msg): - super(Named, self).__init__() self.__msg = msg guidStream = self._getStream('__substg1.0_00020102') or self._getStream('__substg1.0_00020102', False) entryStream = self._getStream('__substg1.0_00030102') or self._getStream('__substg1.0_00030102', False) @@ -133,7 +132,7 @@ def namedProperties(self): -class NamedAttachmentProperties(object): +class NamedAttachmentProperties: """ The named properties associated with a specific attachment. 
""" @@ -169,9 +168,8 @@ def namedProperties(self): -class StringNamedProperty(object): +class StringNamedProperty: def __init__(self, entry, name, data): - super(StringNamedProperty, self).__init__() self.__entry = entry self.__name = name self.__guidIndex = entry['guid_index'] @@ -257,9 +255,8 @@ def type(self): -class NumericalNamedProperty(object): +class NumericalNamedProperty: def __init__(self, entry, data): - super(NumericalNamedProperty, self).__init__() self.__propertyID = properHex(entry['id'], 4).upper() self.__guidIndex = entry['guid_index'] self.__namedPropertyID = entry['pid'] diff --git a/extract_msg/prop.py b/extract_msg/prop.py index fdb87f08..96e47fd9 100644 --- a/extract_msg/prop.py +++ b/extract_msg/prop.py @@ -19,13 +19,12 @@ def createProp(string): return VariableLengthProp(string) -class PropBase(object): +class PropBase: """ Base class for Prop instances. """ def __init__(self, string): - super(PropBase, self).__init__() self.__raw = string self.__name = properHex(string[3::-1]).upper() self.__type, self.__flags = constants.ST2.unpack(string) diff --git a/extract_msg/properties.py b/extract_msg/properties.py index e8932e48..d6a2b0b3 100644 --- a/extract_msg/properties.py +++ b/extract_msg/properties.py @@ -10,13 +10,12 @@ logger.addHandler(logging.NullHandler()) -class Properties(object): +class Properties: """ Parser for msg properties files. """ def __init__(self, stream, type=None, skip=None): - object.__init__(self) self.__stream = stream self.__pos = 0 self.__len = len(stream) diff --git a/extract_msg/recipient.py b/extract_msg/recipient.py index 32b90082..56f3baea 100644 --- a/extract_msg/recipient.py +++ b/extract_msg/recipient.py @@ -10,12 +10,11 @@ logger.addHandler(logging.NullHandler()) -class Recipient(object): +class Recipient: """ Contains the data of one of the recipients in an msg file. 
""" def __init__(self, _dir, msg): - object.__init__(self) self.__msg = msg # Allows calls to original msg file self.__dir = _dir self.__props = Properties(self._getStream('__properties_version1.0'), constants.TYPE_RECIPIENT) From 7371a1f868f53237446f03b5deca215218182bc9 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 16 Jan 2022 19:04:15 -0800 Subject: [PATCH 05/17] Further updates and matches to 0.29.2 --- CHANGELOG.md | 6 ++- README.rst | 3 ++ extract_msg/attachment.py | 24 +++++----- extract_msg/constants.py | 1 - extract_msg/message.py | 8 ++-- extract_msg/msg.py | 18 ++----- extract_msg/utils.py | 99 ++++++++++++++++++++++----------------- 7 files changed, 84 insertions(+), 75 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 080c7bca..c858bbf9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,11 @@ * Removed all support for Python 2. This caused a lot of things to be moved around and changed from indirect references to direct references, so it's possible something fell through the cracks. I'm doing my best to test it, but let me know if you have an issue. * Converted much of the path nonsense to use `pathlib` so the code is smaller and more reliable. This does have a slight speed penalty, but given that the logic is right next to read/write operations the penalty is negligable. * Changed classes to now prefer super() over direct superclass initalization. -* Removed explicit object subclassing (it's implicit in Python 3 so we don't need it anymore.) +* Removed explicit object subclassing (it's implicit in Python 3 so we don't need it anymore). + +**v0.29.2** +* Fixed issue where the RTF injection was accidentally doing HTML escapes for non-encapsulated streams and *not* doing escapes for encapsulated streams. +* Fixed name error in `Message.save` causing bad logic. For context, the internal variable `zip` was renamed to `_zip` to avoid a name conflict with the built-in function. Some instances of it were missed. 
**v0.29.1** * [[TeamMsgExtractor #198](https://github.com/TeamMsgExtractor/msg-extractor/issues/198)] Added a feature to save the header in it's own file (prefers the full raw header if it can find it, otherwise puts in a generated one) that was actually supposed to be in v0.29.0 but I forgot, lol. diff --git a/README.rst b/README.rst index 583a44f7..8ec11922 100644 --- a/README.rst +++ b/README.rst @@ -10,6 +10,9 @@ data (from, to, cc, date, subject, body) and the email's attachments. NOTICE ====== +0.29.* is the branch that supports both Python 2 and Python 3. It is now only receiving bug fixes +and will not be receiving feature updates. + This module has a Discord server for general discussion. You can find it here: `Discord`_ diff --git a/extract_msg/attachment.py b/extract_msg/attachment.py index 8aca8262..da496982 100644 --- a/extract_msg/attachment.py +++ b/extract_msg/attachment.py @@ -129,16 +129,16 @@ def save(self, **kwargs): filename = name[:maxNameLength - len(ext)] + ext # Check if we are doing a zip file. - zip = kwargs.get('zip') + _zip = kwargs.get('zip') # ZipFile handling. - if zip: + if _zip: # If we are doing a zip file, first check that we have been given a path. - if isinstance(zip, (str, pathlib.Path)): + if isinstance(_zip, (str, pathlib.Path)): # If we have a path then we use the zip file. - zip = zipfile.ZipFile(zip, 'a', zipfile.ZIP_DEFLATED) - kwargs['zip'] = zip + _zip = zipfile.ZipFile(_zip, 'a', zipfile.ZIP_DEFLATED) + kwargs['zip'] = _zip createdZip = True else: createdZip = False @@ -146,7 +146,7 @@ def save(self, **kwargs): customPath = kwargs.get('customPath', '').replace('\\', '/') customPath += '/' if customPath and customPath[-1] != '/' else '' # Set the open command to be that of the zip file. - _open = zip.open + _open = _zip.open # Zip files use w for writing in binary. 
mode = 'w' else: @@ -159,9 +159,9 @@ def save(self, **kwargs): fullFilename = customPath + filename if self.__type == 'data': - if zip: + if _zip: name, ext = os.path.splitext(filename) - nameList = zip.namelist() + nameList = _zip.namelist() if fullFilename in nameList: for i in range(2, 100): testName = customPath + name + ' (' + str(i) + ')' + ext @@ -189,16 +189,16 @@ def save(self, **kwargs): f.write(self.__data) # Close the ZipFile if this function created it. - if zip and createdZip: - zip.close() + if _zip and createdZip: + _zip.close() return fullFilename else: self.saveEmbededMessage(**kwargs) # Close the ZipFile if this function created it. - if zip and createdZip: - zip.close() + if _zip and createdZip: + _zip.close() return self.msg diff --git a/extract_msg/constants.py b/extract_msg/constants.py index b6b61c67..935898f7 100644 --- a/extract_msg/constants.py +++ b/extract_msg/constants.py @@ -7,7 +7,6 @@ import datetime import re import struct -import sys import ebcdic diff --git a/extract_msg/message.py b/extract_msg/message.py index 4c01d894..02563692 100644 --- a/extract_msg/message.py +++ b/extract_msg/message.py @@ -176,7 +176,7 @@ def save(self, **kwargs): path += self.defaultFolderName[:maxNameLength] # Create the folders. - if not zip: + if not _zip: try: os.makedirs(path) except Exception: @@ -267,13 +267,13 @@ def save(self, **kwargs): f.write(inputToBytes(self.body, 'utf-8')) except Exception: - if not zip: + if not _zip: self.saveRaw(path) raise finally: # Close the ZipFile if this function created it. - if zip and createdZip: - zip.close() + if _zip and createdZip: + _zip.close() # Return the instance so that functions can easily be chained. 
return self diff --git a/extract_msg/msg.py b/extract_msg/msg.py index 8c8cbfc9..93844e5c 100644 --- a/extract_msg/msg.py +++ b/extract_msg/msg.py @@ -2,7 +2,6 @@ import copy import logging import os -import sys import zipfile import olefile @@ -401,20 +400,13 @@ def saveRaw(self, path): filename = 'contents.bin' # Save contents of directory - if sys.version_info[0] < 3: - # Python 2 zip files don't seem to actually match the docs, and `open` simply opens in read mode, even though it should be able to open in write mode. + with zfile.open(sysdir + '/' + filename, 'w') as f: data = self._getStream(dir_) + # Specifically check for None. If this is bytes we still want to do this line. + # There was actually this weird issue where for some reason data would be bytes + # but then also simultaneously register as None? if data is not None: - zfile.writestr(sysdir + '/' + filename, data, zipfile.ZIP_DEFLATED) - - else: - with zfile.open(sysdir + '/' + filename, 'w') as f: - data = self._getStream(dir_) - # Specifically check for None. If this is bytes we still want to do this line. - # There was actually this weird issue where for some reason data would be bytes - # but then also simultaneously register as None? - if data is not None: - f.write(data) + f.write(data) @property def areStringsUnicode(self): diff --git a/extract_msg/utils.py b/extract_msg/utils.py index cfc14098..5c2ad0bd 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -11,6 +11,7 @@ import logging.config import os import struct +# Not actually sure if this needs to be here for the logging, so just in case. import sys import tzlocal @@ -277,52 +278,10 @@ def injectRtfHeader(msgFile): # rtf. if isEncapsulatedRtf(msgFile.rtfBody): injectableHeader = constants.RTF_ENC_INJECTABLE_HEADER - def rtfSanitize(inp): - if not inp: - return '' - output = '' - for char in inp: - # Check if it is in the right range to be printed directly. 
- if 32 <= ord(char) < 128: - if char in ('\\', '{', '}'): - output += '\\' - output += char - elif ord(char) < 32 or 128 <= ord(char) <= 255: - # Otherwise, see if it is just a small escape. - output += "\\'" + properHex(char, 2) - else: - # Handle Unicode characters. - output += '\\u' + str(ord(char)) + '?' - - return output + rtfSanitize = rtfSanitizeHtml else: injectableHeader = constants.RTF_PLAIN_INJECTABLE_HEADER - def rtfSanitize(inp): - if not inp: - return '' - output = '' - for char in inp: - # Check if it is in the right range to be printed directly. - if 32 <= ord(char) < 128: - # Quick check for handling the HTML escapes. Will eventually - # upgrade this code to actually handle all the HTML escapes - # but this will do for now. - if char == '<': - output += r'{\*\htmltag84 <}\htmlrtf <\htmlrtf0 ' - elif char == '>': - output += r'{\*\htmltag84 >}\htmlrtf >\htmlrtf0' - else: - if char in ('\\', '{', '}'): - output += '\\' - output += char - elif ord(char) < 32 or 128 <= ord(char) <= 255: - # Otherwise, see if it is just a small escape. - output += "\\'" + properHex(char, 2) - else: - # Handle Unicode characters. - output += '\\u' + str(ord(char)) + '?' - - return output + rtfSanitize = rtfSanitizePlain def replace(bodyMarker): """ @@ -668,6 +627,58 @@ def roundUp(inp, mult): """ return inp + (mult - inp) % mult +def rtfSanitizeHtml(inp): + """ + Sanitizes input to an RTF stream that has encapsulated HTML. + """ + if not inp: + return '' + output = '' + for char in inp: + # Check if it is in the right range to be printed directly. + if 32 <= ord(char) < 128: + # Quick check for handling the HTML escapes. Will eventually + # upgrade this code to actually handle all the HTML escapes + # but this will do for now. 
+ if char == '<': + output += r'{\*\htmltag84 <}\htmlrtf <\htmlrtf0 ' + elif char == '>': + output += r'{\*\htmltag84 >}\htmlrtf >\htmlrtf0' + else: + if char in ('\\', '{', '}'): + output += '\\' + output += char + elif ord(char) < 32 or 128 <= ord(char) <= 255: + # Otherwise, see if it is just a small escape. + output += "\\'" + properHex(char, 2) + else: + # Handle Unicode characters. + output += '\\u' + str(ord(char)) + '?' + + return output + +def rtfSanitizePlain(inp): + """ + Sanitizes input to a plain RTF stream. + """ + if not inp: + return '' + output = '' + for char in inp: + # Check if it is in the right range to be printed directly. + if 32 <= ord(char) < 128: + if char in ('\\', '{', '}'): + output += '\\' + output += char + elif ord(char) < 32 or 128 <= ord(char) <= 255: + # Otherwise, see if it is just a small escape. + output += "\\'" + properHex(char, 2) + else: + # Handle Unicode characters. + output += '\\u' + str(ord(char)) + '?' + + return output + def setupLogging(defaultPath=None, defaultLevel=logging.WARN, logfile=None, enableFileLogging=False, env_key='EXTRACT_MSG_LOG_CFG'): """ From 4ff01aca17a5b570efc853a5e23ead2c85025621 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 16 Jan 2022 22:02:17 -0800 Subject: [PATCH 06/17] Progress report: Message now uses pathlib in saving. --- CHANGELOG.md | 2 +- extract_msg/__main__.py | 4 +-- extract_msg/attachment.py | 4 +-- extract_msg/data.py | 2 +- extract_msg/dev.py | 6 ++--- extract_msg/dev_classes/attachment.py | 2 +- extract_msg/dev_classes/message.py | 37 ++++++++++++++++----------- extract_msg/message.py | 36 ++++++++++++-------------- extract_msg/message_base.py | 8 +++--- extract_msg/msg.py | 7 +++-- extract_msg/utils.py | 8 +++--- 11 files changed, 61 insertions(+), 55 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c858bbf9..69f2bc38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ **v0.30.0** * Removed all support for Python 2. 
This caused a lot of things to be moved around and changed from indirect references to direct references, so it's possible something fell through the cracks. I'm doing my best to test it, but let me know if you have an issue. -* Converted much of the path nonsense to use `pathlib` so the code is smaller and more reliable. This does have a slight speed penalty, but given that the logic is right next to read/write operations the penalty is negligable. * Changed classes to now prefer super() over direct superclass initalization. * Removed explicit object subclassing (it's implicit in Python 3 so we don't need it anymore). +* Converted most `.format`s into f strings. **v0.29.2** * Fixed issue where the RTF injection was accidentally doing HTML escapes for non-encapsulated streams and *not* doing escapes for encapsulated streams. diff --git a/extract_msg/__main__.py b/extract_msg/__main__.py index 229e8ba1..573bca6e 100644 --- a/extract_msg/__main__.py +++ b/extract_msg/__main__.py @@ -30,10 +30,10 @@ def main(): from extract_msg import validation valResults = {x[0]: validation.validate(x[0]) for x in args.msgs} - filename = 'validation {}.json'.format(int(time.time())) + filename = f'validation {int(time.time())}.json' print('Validation Results:') pprint.pprint(valResults) - print('These results have been saved to {}'.format(filename)) + print(f'These results have been saved to {filename}') with open(filename, 'w') as fil: fil.write(json.dumps(valResults)) input('Press enter to exit...') diff --git a/extract_msg/attachment.py b/extract_msg/attachment.py index da496982..ac9db2e8 100644 --- a/extract_msg/attachment.py +++ b/extract_msg/attachment.py @@ -170,7 +170,7 @@ def save(self, **kwargs): break else: # If we couldn't find one that didn't exist. 
- raise FileExistsError('Could not create the specified file because it already exists ("{}").'.format(fullFilename)) + raise FileExistsError(f'Could not create the specified file because it already exists ("{fullFilename}").') else: if os.path.exists(fullFilename): # Try to split the filename into a name and extention. @@ -183,7 +183,7 @@ def save(self, **kwargs): break else: # If we couldn't find one that didn't exist. - raise FileExistsError('Could not create the specified file because it already exists ("{}").'.format(fullFilename)) + raise FileExistsError(f'Could not create the specified file because it already exists ("{fullFilename}").') with _open(fullFilename, mode) as f: f.write(self.__data) diff --git a/extract_msg/data.py b/extract_msg/data.py index bcbd73b5..1ed729d6 100644 --- a/extract_msg/data.py +++ b/extract_msg/data.py @@ -10,7 +10,7 @@ def __init__(self, data): self.__data = data unpacked = constants.STPEID.unpack(data[:28]) if unpacked[0] != 0: - raise TypeError('Not a PermanentEntryID (expected 0, got {}).'.format(unpacked[0])) + raise TypeError(f'Not a PermanentEntryID (expected 0, got {unpacked[0]}).') self.__providerUID = unpacked[1] self.__displayTypeString = unpacked[2] self.__distinguishedName = data[28:-1].decode('ascii') # Cut off the null character at the end and decode the data as ascii diff --git a/extract_msg/dev.py b/extract_msg/dev.py index 5534f245..448c35fd 100644 --- a/extract_msg/dev.py +++ b/extract_msg/dev.py @@ -41,9 +41,9 @@ def main(args, argv): out = args.out_path else: out = currentdir - logger.log(5, 'ARGV: {}'.format(argv)) + logger.log(5, f'ARGV: {argv}') for y, x in enumerate(args.msgs): - logger.log(5, '---- RUNNING DEVELOPER MODE ON FILE {} ----'.format(x[0])) + logger.log(5, f'---- RUNNING DEVELOPER MODE ON FILE {x[0]} ----') logger.log(5, 'EXCEPTION CHECK:') try: with Message(x[0]) as msg: @@ -64,4 +64,4 @@ def main(args, argv): logpath = x.baseFilename except AttributeError: pass; - print('Logging complete. 
Log has been saved to {}'.format(logpath)) + print(g'Logging complete. Log has been saved to {logpath}') diff --git a/extract_msg/dev_classes/attachment.py b/extract_msg/dev_classes/attachment.py index 73d17923..354b4ed8 100644 --- a/extract_msg/dev_classes/attachment.py +++ b/extract_msg/dev_classes/attachment.py @@ -30,7 +30,7 @@ def __init__(self, msg, dir_): elif msg.exists([dir_, '__substg1.0_3701000D']): if (self.__props['37050003'].value & 0x7) != 0x5: logger.log(5, 'Printing details of NotImplementedError...') - logger.log(5, 'dir_ = {}'.format(dir_)) + logger.log(5, f'dir_ = {dir_}') logger.log(5, 'Writing properties stream to output:') logger.log(5, '--------Start-Properties-Stream--------\n' + properHex(self.__props.stream) + diff --git a/extract_msg/dev_classes/message.py b/extract_msg/dev_classes/message.py index f705e9b9..40d13e5e 100644 --- a/extract_msg/dev_classes/message.py +++ b/extract_msg/dev_classes/message.py @@ -25,7 +25,7 @@ def __init__(self, path, prefix='', filename=None): inside the main one. Do not set manually unless you know what you are doing. 
""" - logger.log(5, 'prefix: {}'.format(prefix)) + logger.log(5, f'Prefix: {prefix}') self.__path = path super().__init__(path) prefixl = [] @@ -35,7 +35,7 @@ def __init__(self, path, prefix='', filename=None): try: prefix = '/'.join(prefix) except: - raise TypeError('Invalid prefix type: ' + str(type(prefix)) + + raise TypeError(f'Invalid prefix type: {type(prefix)}' + '\n(This was probably caused by you setting it manually).') prefix = prefix.replace('\\', '/') g = prefix.split('/') @@ -52,13 +52,13 @@ def __init__(self, path, prefix='', filename=None): if filename is not None: self.filename = filename else: - logger.log(5, ':param path: has __len__ attribute?: {}'.format(has_len(path))) + logger.log(5, f':param path: has __len__ attribute?: {has_len(path)}') if has_len(path): if len(path) < 1536: self.filename = path - logger.log(5, ':param path: length is {}; Using :param path: as file path'.format(len(path))) + logger.log(5, f':param path: length is {len(path)}; Using :param path: as file path') else: - logger.log(5, ':param path: length is {}; Using :param path: as raw msg stream'.format(len(path))) + logger.log(5, f':param path: length is {len(path)}; Using :param path: as raw msg stream') self.filename = None else: self.filename = None @@ -81,7 +81,7 @@ def _getStream(self, filename, prefix=True): stream = self.openstream(filename) return stream.read() else: - logger.info('Stream "{}" was requested but could not be found. Returning `None`.'.format(filename)) + logger.info(f'Stream "{filename}" was requested but could not be found. Returning `None`.') return None def _getStringStream(self, filename, prefer='unicode', prefix=True): @@ -127,23 +127,30 @@ def listDir(self, streams=True, storages=False): """ Replacement for OleFileIO.listdir that runs at the current prefix directory. 
""" - temp = self.listdir(streams, storages) - if self.__prefix == '': - return temp + entries = self.listdir(streams, storages) + # If we are in the top level MSG already, we can just return. + if not self.__prefix: + return entries + + # Get a list for the prefix. prefix = self.__prefix.split('/') if prefix[-1] == '': prefix.pop() + out = [] - for x in temp: + for pathEntry in entries: good = True + # If the entry we are looking at is not longer then the prefix, it's not good. if len(x) <= len(prefix): - good = False - if good: - for y in range(len(prefix)): - if x[y] != prefix[y]: - good = False + continue + + for index, entry in enumerate(prefix): + if x[y] != entry: + good = False + if good: out.append(x) + return out @property diff --git a/extract_msg/message.py b/extract_msg/message.py index 02563692..80721102 100644 --- a/extract_msg/message.py +++ b/extract_msg/message.py @@ -125,16 +125,13 @@ def save(self, **kwargs): else: createdZip = False # Path needs to be done in a special way if we are in a zip file. - path = kwargs.get('customPath', '').replace('\\', '/') - path += '/' if path and path[-1] != '/' else '' + path = pathlib.Path(kwargs.get('customPath', '')) # Set the open command to be that of the zip file. _open = _zip.open # Zip files use w for writing in binary. mode = 'w' else: - path = os.path.abspath(kwargs.get('customPath', os.getcwd())).replace('\\', '/') - # Prepare the path. - path += '/' if path[-1] != '/' else '' + path = pathlib.Path(kwargs.get('customPath', '.')).absolute() mode = 'wb' _open = open @@ -154,7 +151,7 @@ def save(self, **kwargs): # First we need to validate it. If there are invalid characters, this will detect it. if constants.RE_INVALID_FILENAME_CHARACTERS.search(customFilename): raise ValueError('Invalid character found in customFilename. 
Must not contain any of the following characters: \\/:*?"<>|') - path += customFilename[:maxNameLength] + path /= customFilename[:maxNameLength] elif useMsgFilename: if not self.filename: raise ValueError(':param useMsgFilename: is only available if you are using an msg file on the disk or have provided a filename.') @@ -168,12 +165,12 @@ def save(self, **kwargs): filename = filename[:maxNameLength] # Check to make sure we actually have a filename to use. if not filename: - raise ValueError('Invalid filename found in self.filename: "{}"'.format(self.filename)) + raise ValueError(f'Invalid filename found in self.filename: "{self.filename}"') # Add the file name to the path. - path += filename[:maxNameLength] + path /= filename[:maxNameLength] else: - path += self.defaultFolderName[:maxNameLength] + path /= self.defaultFolderName[:maxNameLength] # Create the folders. if not _zip: @@ -184,18 +181,17 @@ def save(self, **kwargs): if newDirName: path = newDirName else: - raise Exception( - 'Failed to create directory "%s". Does it already exist?' % - path - ) + raise Exception(f'Failed to create directory "{path}". Does it already exist?') else: # In my testing I ended up with multiple files in a zip at the same # location so let's try to handle that. - if any(x.startswith(path.rstrip('/') + '/') for x in _zip.namelist()): - path = newDirName = addNumToZipDir(path, _zip) - - # Prepare the path one last time. - path += '/' if path[-1] != '/' else '' + pathCompare = str(path).rstrip('/') + '/' + if any(x.startswith(pathCompare) for x in _zip.namelist()): + newDirName = addNumToZipDir(path, _zip) + if newDirName: + path = newDireName + else: + raise Exception(f'Failed to create directory "{path}". Does it already exist?') # Update the kwargs. 
kwargs['customPath'] = path @@ -210,7 +206,7 @@ def save(self, **kwargs): if not headerText: headerText = constants.HEADER_FORMAT.format(subject = self.subject, **self.header) - with _open(path + 'header.txt', mode) as f: + with _open(str(path / 'header.txt'), mode) as f: f.write(headerText.encode('utf-8')) try: @@ -239,7 +235,7 @@ def save(self, **kwargs): # Determine the extension to use for the body. fext = 'json' if _json else fext - with _open(path + 'message.' + fext, mode) as f: + with _open(str(path / ('message.' + fext)), mode) as f: if _json: emailObj = json.loads(self.getJson()) emailObj['attachments'] = attachmentNames diff --git a/extract_msg/message_base.py b/extract_msg/message_base.py index fbaa5574..21696334 100644 --- a/extract_msg/message_base.py +++ b/extract_msg/message_base.py @@ -86,7 +86,7 @@ def _genRecipient(self, recipientType, recipientInt): if not value: # Check if the header has initialized. if self.headerInit(): - logger.info('Header found, but "{}" is not included. Will be generated from other streams.'.format(recipientType)) + logger.info(f'Header found, but "{recipientType}" is not included. Will be generated from other streams.') # Get a list of the recipients of the specified type. foundRecipients = tuple(recipient.formatted for recipient in self.recipients if recipient.type & 0x0000000f == recipientInt) @@ -119,6 +119,8 @@ def _registerNamedProperty(self, entry, _type, name = None): for attachment in self.attachments: attachment._registerNamedProperty(entry, _type, name) + super()._registerNamedProperty(entry, _type, name) + def close(self): try: # If this throws an AttributeError then we have not loaded the attachments. 
@@ -170,14 +172,14 @@ def attachments(self): self._attachments.append(self.attachmentClass(self, attachmentDir)) except (NotImplementedError, UnrecognizedMSGTypeError) as e: if self.attachmentErrorBehavior > constants.ATTACHMENT_ERROR_THROW: - logger.error('Error processing attachment at {}'.format(attachmentDir)) + logger.error(f'Error processing attachment at {attachmentDir}') logger.exception(e) self._attachments.append(UnsupportedAttachment(self, attachmentDir)) else: raise except Exception as e: if self.attachmentErrorBehavior == constants.ATTACHMENT_ERROR_BROKEN: - logger.error('Error processing attachment at {}'.format(attachmentDir)) + logger.error(f'Error processing attachment at {attachmentDir}') logger.exception(e) self._attachments.append(BrokenAttachment(self, attachmentDir)) else: diff --git a/extract_msg/msg.py b/extract_msg/msg.py index 93844e5c..88c3b4de 100644 --- a/extract_msg/msg.py +++ b/extract_msg/msg.py @@ -353,18 +353,17 @@ def listDir(self, streams = True, storages = False): try: return self.__listDirRes except AttributeError: - temp = self.listdir(streams, storages) + entries = self.listdir(streams, storages) if not self.__prefix: - return temp + return entries prefix = self.__prefix.split('/') if prefix[-1] == '': prefix.pop() prefixLength = self.__prefixLen - self.__listDirRes = [x for x in temp if len(x) > prefixLength and x[:prefixLength] == prefix] + self.__listDirRes = [x for x in entries if len(x) > prefixLength and x[:prefixLength] == prefix] return self.__listDirRes - def slistDir(self, streams = True, storages = False): """ Replacement for OleFileIO.listdir that runs at the current prefix directory. diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 5c2ad0bd..d7b09de9 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -10,6 +10,7 @@ import logging import logging.config import os +import pathlib import struct # Not actually sure if this needs to be here for the logging, so just in case. 
import sys @@ -40,13 +41,14 @@ def addNumToDir(dirName): pass return None -def addNumToZipDir(dirName, _zip): +def addNumToZipDir(dirName : pathlib.Path, _zip): """ Attempt to create the directory with a '(n)' appended. """ for i in range(2, 100): - newDirName = dirName + ' (' + str(i) + ')' - if not any(x.startswith(newDirName.rstrip('/') + '/') for x in _zip.namelist()): + newDirName = dirName / f' ({i})' + pathCompare = str(newDirName).rstrip('/') + '/' + if not any(x.startswith(pathCompare) for x in _zip.namelist()): return newDirName return None From 55c2e73b5d6d6f18c9d0c3bf5ecd431f8b5d3cc9 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 16 Jan 2022 22:33:17 -0800 Subject: [PATCH 07/17] Pathlib stuff for saving should now all be functional --- extract_msg/attachment.py | 27 +++++++++++++-------------- extract_msg/attachment_base.py | 11 +++++++---- extract_msg/utils.py | 4 ++-- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/extract_msg/attachment.py b/extract_msg/attachment.py index ac9db2e8..58ba6036 100644 --- a/extract_msg/attachment.py +++ b/extract_msg/attachment.py @@ -1,5 +1,6 @@ import logging import os +import pathlib import random import string import zipfile @@ -63,6 +64,7 @@ def getFilename(self, **kwargs): filename = None customFilename = kwargs.get('customFilename') if customFilename: + customFilename = str(customFilename) # First we need to validate it. If there are invalid characters, this will detect it. if constants.RE_INVALID_FILENAME_CHARACTERS.search(customFilename): raise ValueError('Invalid character found in customFilename. 
Must not contain any of the following characters: \\/:*?"<>|') @@ -73,13 +75,13 @@ def getFilename(self, **kwargs): if kwargs.get('contentId', False): filename = self.cid # If filename is None at this point, use long filename as first preference - if filename is None: + if not filename: filename = self.longFilename # Otherwise use the short filename - if filename is None: + if not filename: filename = self.shortFilename # Otherwise just make something up! - if filename is None: + if not filename: return self.randomFilename return filename @@ -143,20 +145,17 @@ def save(self, **kwargs): else: createdZip = False # Path needs to be done in a special way if we are in a zip file. - customPath = kwargs.get('customPath', '').replace('\\', '/') - customPath += '/' if customPath and customPath[-1] != '/' else '' + customPath = pathlib.Path(kwargs.get('customPath', '')) # Set the open command to be that of the zip file. _open = _zip.open # Zip files use w for writing in binary. mode = 'w' else: - customPath = os.path.abspath(kwargs.get('customPath', os.getcwd())).replace('\\', '/') - # Prepare the path. - customPath += '' if customPath.endswith('/') else '/' + customPath = pathlib.Path(kwargs.get('customPath', '.')).absolute() mode = 'wb' _open = open - fullFilename = customPath + filename + fullFilename = customPath / filename if self.__type == 'data': if _zip: @@ -164,7 +163,7 @@ def save(self, **kwargs): nameList = _zip.namelist() if fullFilename in nameList: for i in range(2, 100): - testName = customPath + name + ' (' + str(i) + ')' + ext + testName = customPath / f'{name} ({i}){ext}' if testName not in nameList: fullFilename = testName break @@ -172,20 +171,20 @@ def save(self, **kwargs): # If we couldn't find one that didn't exist. raise FileExistsError(f'Could not create the specified file because it already exists ("{fullFilename}").') else: - if os.path.exists(fullFilename): + if fullFilename.exists(): # Try to split the filename into a name and extention. 
name, ext = os.path.splitext(filename) # Try to add a number to it so that we can save without overwriting. for i in range(2, 100): - testName = customPath + name + ' (' + str(i) + ')' + ext - if not os.path.exists(testName): + testName = customPath / f'{name} ({i}){ext}' + if not testName.exists(): fullFilename = testName break else: # If we couldn't find one that didn't exist. raise FileExistsError(f'Could not create the specified file because it already exists ("{fullFilename}").') - with _open(fullFilename, mode) as f: + with _open(str(fullFilename), mode) as f: f.write(self.__data) # Close the ZipFile if this function created it. diff --git a/extract_msg/attachment_base.py b/extract_msg/attachment_base.py index 93adfc5b..89f82a71 100644 --- a/extract_msg/attachment_base.py +++ b/extract_msg/attachment_base.py @@ -75,7 +75,9 @@ def _ensureSetProperty(self, variable, propertyName): def _ensureSetTyped(self, variable, _id): """ - Like the other ensure set functions, but designed for when something could be multiple types (where only one will be present). This way you have no need to set the type, it will be handled for you. + Like the other ensure set functions, but designed for when something + could be multiple types (where only one will be present). This way you + have no need to set the type, it will be handled for you. """ try: return getattr(self, variable) @@ -178,15 +180,16 @@ def sExists(self, filename): def existsTypedProperty(self, id, _type = None): """ Determines if the stream with the provided id exists. The return of this - function is 2 values, the first being a boolean for if anything was found, - and the second being how many were found. + function is 2 values, the first being a boolean for if anything was + found, and the second being how many were found. """ return self.__msg.existsTypedProperty(id, self.__dir, _type, True, self.__props) @property def dir(self): """ - Returns the directory inside the msg file where the attachment is located. 
+ Returns the directory inside the msg file where the attachment is + located. """ return self.__dir diff --git a/extract_msg/utils.py b/extract_msg/utils.py index d7b09de9..70c3d58e 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -34,7 +34,7 @@ def addNumToDir(dirName): """ for i in range(2, 100): try: - newDirName = dirName + ' (' + str(i) + ')' + newDirName = dirName.with_name(dirName.name + f' ({i})') os.makedirs(newDirName) return newDirName except Exception as e: @@ -46,7 +46,7 @@ def addNumToZipDir(dirName : pathlib.Path, _zip): Attempt to create the directory with a '(n)' appended. """ for i in range(2, 100): - newDirName = dirName / f' ({i})' + newDirName = dirName.with_name(dirName.name + f' ({i})') pathCompare = str(newDirName).rstrip('/') + '/' if not any(x.startswith(pathCompare) for x in _zip.namelist()): return newDirName From e2f4e2085eebf0a6644a6d26dc6bf6644a167ea0 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 16 Jan 2022 22:54:01 -0800 Subject: [PATCH 08/17] Further progress on the update. --- CHANGELOG.md | 1 + extract_msg/message_base.py | 10 ++- extract_msg/msg.py | 152 ++++++++++++++++++------------------ extract_msg/prop.py | 9 +-- extract_msg/recipient.py | 86 ++++++++++---------- extract_msg/utils.py | 23 +++--- 6 files changed, 138 insertions(+), 143 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 69f2bc38..a71b6726 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ * Changed classes to now prefer super() over direct superclass initalization. * Removed explicit object subclassing (it's implicit in Python 3 so we don't need it anymore). * Converted most `.format`s into f strings. +* Improved consistency of docstrings. It's not perfect, but it should at least be better. **v0.29.2** * Fixed issue where the RTF injection was accidentally doing HTML escapes for non-encapsulated streams and *not* doing escapes for encapsulated streams. 
diff --git a/extract_msg/message_base.py b/extract_msg/message_base.py index 21696334..d0904f68 100644 --- a/extract_msg/message_base.py +++ b/extract_msg/message_base.py @@ -42,8 +42,8 @@ def __init__(self, path, prefix = '', attachmentClass = Attachment, filename = N :param overrideEncoding: optional, an encoding to use instead of the one specified by the msg file. Do not report encoding errors caused by this. - :param attachmentErrorBehavior: Optional, the behaviour to use in the event - of an error when parsing the attachments. + :param attachmentErrorBehavior: Optional, the behaviour to use in the + event of an error when parsing the attachments. :param recipientSeparator: Optional, Separator string to use between recipients. """ @@ -252,7 +252,8 @@ def compressedRtf(self): @property def crlf(self): """ - Returns the value of self.__crlf, should you need it for whatever reason. + Returns the value of self.__crlf, should you need it for whatever + reason. """ self.body return self.__crlf @@ -287,7 +288,8 @@ def defaultFolderName(self): @property def header(self): """ - Returns the message header, if it exists. Otherwise it will generate one. + Returns the message header, if it exists. Otherwise it will generate + one. """ try: return self._header diff --git a/extract_msg/msg.py b/extract_msg/msg.py index 88c3b4de..02cb8f66 100644 --- a/extract_msg/msg.py +++ b/extract_msg/msg.py @@ -32,9 +32,11 @@ def __init__(self, path, prefix = '', attachmentClass = Attachment, filename = N will use for attachments. You probably should not change this value unless you know what you are doing. - :param filename: optional, the filename to be used by default when saving. + :param filename: optional, the filename to be used by default when + saving. :param overrideEncoding: optional, an encoding to use instead of the one - specified by the msg file. Do not report encoding errors caused by this. + specified by the msg file. Do not report encoding errors caused by + this. 
""" # WARNING DO NOT MANUALLY MODIFY PREFIX. Let the program set it. self.__path = path @@ -91,9 +93,11 @@ def __init__(self, path, prefix = '', attachmentClass = Attachment, filename = N def _ensureSet(self, variable, streamID, stringStream = True): """ - Ensures that the variable exists, otherwise will set it using the specified stream. - After that, return said variable. - If the specified stream is not a string stream, make sure to set :param string stream: to False. + Ensures that the variable exists, otherwise will set it using the + specified stream. After that, return said variable. + + If the specified stream is not a string stream, make sure to set + :param string stream: to False. """ try: return getattr(self, variable) @@ -107,8 +111,8 @@ def _ensureSet(self, variable, streamID, stringStream = True): def _ensureSetNamed(self, variable, propertyName): """ - Ensures that the variable exists, otherwise will set it using the named property. - After that, return said variable. + Ensures that the variable exists, otherwise will set it using the named + property. After that, return said variable. """ try: return getattr(self, variable) @@ -119,8 +123,8 @@ def _ensureSetNamed(self, variable, propertyName): def _ensureSetProperty(self, variable, propertyName): """ - Ensures that the variable exists, otherwise will set it using the property. - After that, return said variable. + Ensures that the variable exists, otherwise will set it using the + property. After that, return said variable. """ try: return getattr(self, variable) @@ -134,7 +138,9 @@ def _ensureSetProperty(self, variable, propertyName): def _ensureSetTyped(self, variable, _id): """ - Like the other ensure set functions, but designed for when something could be multiple types (where only one will be present). This way you have no need to set the type, it will be handled for you. 
+ Like the other ensure set functions, but designed for when something + could be multiple types (where only one will be present). This way you + have no need to set the type, it will be handled for you. """ try: return getattr(self, variable) @@ -147,26 +153,25 @@ def _getStream(self, filename, prefix = True): """ Gets a binary representation of the requested filename. - This should ALWAYS return a bytes object (string in python 2) + This should ALWAYS return a bytes object. """ filename = self.fixPath(filename, prefix) if self.exists(filename, False): with self.openstream(filename) as stream: return stream.read() or b'' else: - logger.info('Stream "{}" was requested but could not be found. Returning `None`.'.format(filename)) + logger.info(f'Stream "{filename}" was requested but could not be found. Returning `None`.') return None def _getStringStream(self, filename, prefix = True): """ Gets a string representation of the requested filename. - Rather than the full filename, you should only feed this - function the filename sans the type. So if the full name - is "__substg1.0_001A001F", the filename this function - should receive should be "__substg1.0_001A". + Rather than the full filename, you should only feed this function the + filename sans the type. So if the full name is "__substg1.0_001A001F", + the filename this function should receive should be "__substg1.0_001A". - This should ALWAYS return a string (Unicode in python 2) + This should ALWAYS return a string. """ filename = self.fixPath(filename, prefix) @@ -178,14 +183,12 @@ def _getStringStream(self, filename, prefix = True): def _getTypedData(self, _id, _type = None, prefix = True): """ - Gets the data for the specified id as the type that it is - supposed to be. :param id: MUST be a 4 digit hexadecimal - string. + Gets the data for the specified id as the type that it is supposed to + be. :param id: MUST be a 4 digit hexadecimal string. 
- If you know for sure what type the data is before hand, - you can specify it as being one of the strings in the - constant FIXED_LENGTH_PROPS_STRING or - VARIABLE_LENGTH_PROPS_STRING. + If you know for sure what type the data is before hand, you can specify + it as being one of the strings in the constant FIXED_LENGTH_PROPS_STRING + or VARIABLE_LENGTH_PROPS_STRING. """ verifyPropertyId(_id) _id = _id.upper() @@ -198,14 +201,12 @@ def _getTypedData(self, _id, _type = None, prefix = True): def _getTypedProperty(self, propertyID, _type = None): """ - Gets the property with the specified id as the type that it - is supposed to be. :param id: MUST be a 4 digit hexadecimal - string. + Gets the property with the specified id as the type that it is supposed + to be. :param id: MUST be a 4 digit hexadecimal string. - If you know for sure what type the property is before hand, - you can specify it as being one of the strings in the - constant FIXED_LENGTH_PROPS_STRING or - VARIABLE_LENGTH_PROPS_STRING. + If you know for sure what type the property is before hand, you can + specify it as being one of the strings in the constant + FIXED_LENGTH_PROPS_STRING or VARIABLE_LENGTH_PROPS_STRING. """ verifyPropertyId(propertyID) verifyType(_type) @@ -218,24 +219,21 @@ def _getTypedProperty(self, propertyID, _type = None): def _getTypedStream(self, filename, prefix = True, _type = None): """ - Gets the contents of the specified stream as the type that - it is supposed to be. + Gets the contents of the specified stream as the type that it is + supposed to be. - Rather than the full filename, you should only feed this - function the filename sans the type. So if the full name - is "__substg1.0_001A001F", the filename this function - should receive should be "__substg1.0_001A". + Rather than the full filename, you should only feed this function the + filename sans the type. 
So if the full name is "__substg1.0_001A001F", + the filename this function should receive should be "__substg1.0_001A". - If you know for sure what type the stream is before hand, - you can specify it as being one of the strings in the - constant FIXED_LENGTH_PROPS_STRING or - VARIABLE_LENGTH_PROPS_STRING. + If you know for sure what type the stream is before hand, you can + specify it as being one of the strings in the constant + FIXED_LENGTH_PROPS_STRING or VARIABLE_LENGTH_PROPS_STRING. - If you have not specified the type, the type this function - returns in many cases cannot be predicted. As such, when - using this function it is best for you to check the type - that it returns. If the function returns None, that means - it could not find the stream specified. + If you have not specified the type, the type this function returns in + many cases cannot be predicted. As such, when using this function it is + best for you to check the type that it returns. If the function returns + None, that means it could not find the stream specified. """ verifyType(_type) filename = self.fixPath(filename, prefix) @@ -255,10 +253,10 @@ def _getTypedStream(self, filename, prefix = True, _type = None): try: streams = self.mainProperties[x[-8:]].realLength except: - logger.error('Could not find matching VariableLengthProp for stream {}'.format(x)) + logger.error(f'Could not find matching VariableLengthProp for stream {x}') streams = len(contents) // (2 if _type in constants.MULTIPLE_2_BYTES else 4 if _type in constants.MULTIPLE_4_BYTES else 8 if _type in constants.MULTIPLE_8_BYTES else 16) else: - raise NotImplementedError('The stream specified is of type {}. We don\'t currently understand exactly how this type works. If it is mandatory that you have the contents of this stream, please create an issue labled "NotImplementedError: _getTypedStream {}".'.format(_type, _type)) + raise NotImplementedError(f'The stream specified is of type {_type}. 
We don\'t currently understand exactly how this type works. If it is mandatory that you have the contents of this stream, please create an issue labled "NotImplementedError: _getTypedStream {_type}".') if _type in ('101F', '101E', '1102'): if self.exists(x + '-00000000', False): for y in range(streams): @@ -274,7 +272,8 @@ def _registerNamedProperty(self, entry, _type, name = None): """ FOR INTERNAL USE ONLY! DO NOT CALL MANUALLY! - Function to allow things like attachments in subclasses to have their own named properties. + Function to allow things like attachments in subclasses to have their + own named properties. """ pass @@ -282,7 +281,7 @@ def debug(self): for dir_ in self.listDir(): if dir_[-1].endswith('001E') or dir_[-1].endswith('001F'): print('Directory: ' + str(dir_[:-1])) - print('Contents: {}'.format(self._getStream(dir_))) + print(f'Contents: {self._getStream(dir_)}') def exists(self, inp, prefix = True): """ @@ -300,13 +299,14 @@ def sExists(self, inp, prefix = True): def existsTypedProperty(self, _id, location = None, _type = None, prefix = True, propertiesInstance = None): """ - Determines if the stream with the provided id exists in the location specified. - If no location is specified, the root directory is searched. The return of this - function is 2 values, the first being a boolean for if anything was found, and - the second being how many were found. + Determines if the stream with the provided id exists in the location + specified. If no location is specified, the root directory is searched. + The return of this function is 2 values, the first being a boolean for + if anything was found, and the second being how many were found. Because of how this function works, any folder that contains it's own - "__properties_version1.0" file should have this function called from it's class. + "__properties_version1.0" file should have this function called from + it's class. 
""" verifyPropertyId(_id) verifyType(_type) @@ -336,9 +336,8 @@ def existsTypedProperty(self, _id, location = None, _type = None, prefix = True, def fixPath(self, inp, prefix = True): """ - Changes paths so that they have the proper - prefix (should :param prefix: be True) and - are strings rather than lists or tuples. + Changes paths so that they have the proper prefix (should :param prefix: + be True) and are strings rather than lists or tuples. """ inp = msgpathToString(inp) if prefix: @@ -347,7 +346,8 @@ def fixPath(self, inp, prefix = True): def listDir(self, streams = True, storages = False): """ - Replacement for OleFileIO.listdir that runs at the current prefix directory. + Replacement for OleFileIO.listdir that runs at the current prefix + directory. """ # Get the items from OleFileIO. try: @@ -366,24 +366,23 @@ def listDir(self, streams = True, storages = False): def slistDir(self, streams = True, storages = False): """ - Replacement for OleFileIO.listdir that runs at the current prefix directory. - Returns a list of strings instead of lists. + Replacement for OleFileIO.listdir that runs at the current prefix + directory. Returns a list of strings instead of lists. 
""" return [msgpathToString(x) for x in self.listDir(streams, storages)] def save(self, *args, **kwargs): - raise NotImplementedError('Saving is not yet supported for the {} class'.format(self.__class__.__name__)) + raise NotImplementedError(f'Saving is not yet supported for the {self.__class__.__name__} class.') def saveRaw(self, path): # Create a 'raw' folder - path = path.replace('\\', '/') - path += '/' if path[-1] != '/' else '' + path = pathlib.Path(path) # Make the location os.makedirs(path, exist_ok = True) # Create the zipfile - path += 'raw.zip' - if os.path.exists(path): - raise FileExistsError('File "{}" already exists.'.format(path)) + path /= 'raw.zip' + if path.exists(): + raise FileExistsError(f'File "{path}" already exists.') with zipfile.ZipFile(path, 'w', zipfile.ZIP_DEFLATED) as zfile: # Loop through all the directories for dir_ in self.listdir(): @@ -425,15 +424,16 @@ def areStringsUnicode(self): @property def attachmentClass(self): """ - Returns the Attachment class being used, should you need to use it externally for whatever reason. + Returns the Attachment class being used, should you need to use it + externally for whatever reason. """ return self.__attachmentClass @property def attachmentErrorBehavior(self): """ - The behavior to follow when an attachment raises an exception. Will be one - of the following values: + The behavior to follow when an attachment raises an exception. Will be + one of the following values: ATTACHMENT_ERROR_THROW: Don't catch exceptions. ATTACHMENT_ERROR_NOT_IMPLEMENTED: Catch NotImplementedError exceptions. ATTACHMENT_ERROR_BROKEN: Catch all exceptions. @@ -488,17 +488,15 @@ def overrideEncoding(self): @property def path(self): """ - Returns the message path if generated from a file, - otherwise returns the data used to generate the - Message instance. + Returns the message path if generated from a file, otherwise returns the + data used to generate the Message instance. 
""" return self.__path @property def prefix(self): """ - Returns the prefix of the Message instance. - Intended for developer use. + Returns the prefix of the Message instance. Intended for developer use. """ return self.__prefix @@ -512,8 +510,8 @@ def prefixLen(self): @property def prefixList(self): """ - Returns the prefix list of the Message instance. - Intended for developer use. + Returns the prefix list of the Message instance. Intended for developer + use. """ return copy.deepcopy(self.__prefixList) diff --git a/extract_msg/prop.py b/extract_msg/prop.py index 96e47fd9..8e0e8a9b 100644 --- a/extract_msg/prop.py +++ b/extract_msg/prop.py @@ -15,7 +15,7 @@ def createProp(string): else: if temp not in constants.VARIABLE_LENGTH_PROPS: # DEBUG - logger.warning('Unknown property type: {}'.format(properHex(temp))) + logger.warning(f'Unknown property type: {properHex(temp)}') return VariableLengthProp(string) @@ -95,9 +95,8 @@ def __init__(self, string): def parseType(self, _type, stream): """ - Converts the data in :param stream: to a - much more accurate type, specified by - :param _type:, if possible. + Converts the data in :param stream: to a much more accurate type, + specified by :param _type:, if possible. :param stream: #TODO what is stream for? WARNING: Not done. @@ -142,7 +141,7 @@ def parseType(self, _type, stream): value = datetime.datetime.max except Exception as e: logger.exception(e) - logger.error('Timestamp value of {} caused an exception. This was probably caused by the time stamp being too far in the future.'.format(msgEpoch(constants.ST3.unpack(value)[0]))) + logger.error(f'Timestamp value of {msgEpoch(constants.ST3.unpack(value)[0])} caused an exception. 
This was probably caused by the time stamp being too far in the future.') logger.error(self.raw) elif _type == 0x0048: # PtypGuid # TODO parsing for this diff --git a/extract_msg/recipient.py b/extract_msg/recipient.py index 56f3baea..ad1e6df7 100644 --- a/extract_msg/recipient.py +++ b/extract_msg/recipient.py @@ -23,13 +23,15 @@ def __init__(self, _dir, msg): self.__email = self._getStringStream('__substg1.0_3003') self.__name = self._getStringStream('__substg1.0_3001') self.__type = self.__props.get('0C150003').value - self.__formatted = u'{0} <{1}>'.format(self.__name, self.__email) + self.__formatted = uf'{self.__name} <{self.__email}>' def _ensureSet(self, variable, streamID, stringStream = True): """ - Ensures that the variable exists, otherwise will set it using the specified stream. - After that, return said variable. - If the specified stream is not a string stream, make sure to set :param string stream: to False. + Ensures that the variable exists, otherwise will set it using the + specified stream. After that, return said variable. + + If the specified stream is not a string stream, make sure to set + :param string stream: to False. """ try: return getattr(self, variable) @@ -43,8 +45,8 @@ def _ensureSet(self, variable, streamID, stringStream = True): def _ensureSetNamed(self, variable, propertyName): """ - Ensures that the variable exists, otherwise will set it using the named property. - After that, return said variable. + Ensures that the variable exists, otherwise will set it using the named + property. After that, return said variable. """ try: return getattr(self, variable) @@ -55,8 +57,8 @@ def _ensureSetNamed(self, variable, propertyName): def _ensureSetProperty(self, variable, propertyName): """ - Ensures that the variable exists, otherwise will set it using the property. - After that, return said variable. + Ensures that the variable exists, otherwise will set it using the + property. After that, return said variable. 
""" try: return getattr(self, variable) @@ -70,7 +72,9 @@ def _ensureSetProperty(self, variable, propertyName): def _ensureSetTyped(self, variable, _id): """ - Like the other ensure set functions, but designed for when something could be multiple types (where only one will be present). This way you have no need to set the type, it will be handled for you. + Like the other ensure set functions, but designed for when something + could be multiple types (where only one will be present). This way you + have no need to set the type, it will be handled for you. """ try: return getattr(self, variable) @@ -84,24 +88,21 @@ def _getStream(self, filename): def _getStringStream(self, filename): """ - Gets a string representation of the requested filename. - Checks for both ASCII and Unicode representations and returns - a value if possible. If there are both ASCII and Unicode - versions, then :param prefer: specifies which will be - returned. + Gets a string representation of the requested filename. Checks for both + Unicode and Non-Unicode representations and returns a value if possible. + If there are both Unicode and Non-Unicode versions, then :param prefer: + specifies which will be returned. """ return self.__msg._getStringStream([self.__dir, filename]) def _getTypedData(self, id, _type = None): """ - Gets the data for the specified id as the type that it is - supposed to be. :param id: MUST be a 4 digit hexadecimal - string. + Gets the data for the specified id as the type that it is supposed to + be. :param id: MUST be a 4 digit hexadecimal string. - If you know for sure what type the data is before hand, - you can specify it as being one of the strings in the - constant FIXED_LENGTH_PROPS_STRING or - VARIABLE_LENGTH_PROPS_STRING. + If you know for sure what type the data is before hand, you can specify + it as being one of the strings in the constant FIXED_LENGTH_PROPS_STRING + or VARIABLE_LENGTH_PROPS_STRING. 
""" verifyPropertyId(id) id = id.upper() @@ -114,14 +115,12 @@ def _getTypedData(self, id, _type = None): def _getTypedProperty(self, propertyID, _type = None): """ - Gets the property with the specified id as the type that it - is supposed to be. :param id: MUST be a 4 digit hexadecimal - string. + Gets the property with the specified id as the type that it is supposed + to be. :param id: MUST be a 4 digit hexadecimal string. - If you know for sure what type the property is before hand, - you can specify it as being one of the strings in the - constant FIXED_LENGTH_PROPS_STRING or - VARIABLE_LENGTH_PROPS_STRING. + If you know for sure what type the property is before hand, you can + specify it as being one of the strings in the constant + FIXED_LENGTH_PROPS_STRING or VARIABLE_LENGTH_PROPS_STRING. """ verifyPropertyId(propertyID) verifyType(_type) @@ -134,24 +133,21 @@ def _getTypedProperty(self, propertyID, _type = None): def _getTypedStream(self, filename, _type = None): """ - Gets the contents of the specified stream as the type that - it is supposed to be. + Gets the contents of the specified stream as the type that it is + supposed to be. - Rather than the full filename, you should only feed this - function the filename sans the type. So if the full name - is "__substg1.0_001A001F", the filename this function - should receive should be "__substg1.0_001A". + Rather than the full filename, you should only feed this function the + filename sans the type. So if the full name is "__substg1.0_001A001F", + the filename this function should receive should be "__substg1.0_001A". - If you know for sure what type the stream is before hand, - you can specify it as being one of the strings in the - constant FIXED_LENGTH_PROPS_STRING or - VARIABLE_LENGTH_PROPS_STRING. + If you know for sure what type the stream is before hand, you can + specify it as being one of the strings in the constant + FIXED_LENGTH_PROPS_STRING or VARIABLE_LENGTH_PROPS_STRING. 
- If you have not specified the type, the type this function - returns in many cases cannot be predicted. As such, when - using this function it is best for you to check the type - that it returns. If the function returns None, that means - it could not find the stream specified. + If you have not specified the type, the type this function returns in + many cases cannot be predicted. As such, when using this function it is + best for you to check the type that it returns. If the function returns + None, that means it could not find the stream specified. """ self.__msg._getTypedStream(self, [self.__dir, filename], True, _type) @@ -170,8 +166,8 @@ def sExists(self, filename): def existsTypedProperty(self, id, _type = None): """ Determines if the stream with the provided id exists. The return of this - function is 2 values, the first being a boolean for if anything was found, - and the second being how many were found. + function is 2 values, the first being a boolean for if anything was + found, and the second being how many were found. """ return self.__msg.existsTypedProperty(id, self.__dir, _type, True, self.__props) diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 70c3d58e..4606d25e 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -232,7 +232,7 @@ def getEncodingName(codepage): codecs.lookup(constants.CODE_PAGES[codepage]) return constants.CODE_PAGES[codepage] except LookupError: - raise UnsupportedEncodingError('The codepage {} ({}) is not currently supported by your version of Python.'.format(codepage, constants.CODE_PAGES[codepage])) + raise UnsupportedEncodingError(f'The codepage {codepage} ({constants.CODE_PAGES[codepage]}) is not currently supported by your version of Python.') def getFullClassName(inp): return inp.__class__.__module__ + '.' 
+ inp.__class__.__name__ @@ -425,13 +425,11 @@ def openMsg(path, prefix = '', attachmentClass = None, filename = None, delayAtt Function to automatically open an MSG file and detect what type it is. :param path: Path to the msg file in the system or is the raw msg file. - :param prefix: Used for extracting embeded msg files - inside the main one. Do not set manually unless - you know what you are doing. - :param attachmentClass: Optional, the class the Message object - will use for attachments. You probably should - not change this value unless you know what you - are doing. + :param prefix: Used for extracting embeded msg files inside the main one. + Do not set manually unless you know what you are doing. + :param attachmentClass: Optional, the class the Message object will use for + attachments. You probably should not change this value unless you know + what you are doing. :param filename: Optional, the filename to be used by default when saving. :param delayAttachments: Optional, delays the initialization of attachments until the user attempts to retrieve them. Allows MSG files with bad @@ -483,13 +481,14 @@ def openMsg(path, prefix = '', attachmentClass = None, filename = None, delayAtt def parseType(_type, stream, encoding, extras): """ - Converts the data in :param stream: to a - much more accurate type, specified by + Converts the data in :param stream: to a much more accurate type, specified + by :param _type:. :param _type: the data's type. :param stream: is the data to be converted. :param encoding: is the encoding to be used for regular strings. :param extras: is used in the case of types like PtypMultipleString. - For that example, extras should be a list of the bytes from rest of the streams. + For that example, extras should be a list of the bytes from rest of the + streams. WARNING: Not done. 
Do not try to implement anywhere where it is not already implemented """ @@ -776,7 +775,7 @@ def verifyPropertyId(id): def verifyType(_type): if _type is not None: if (_type not in constants.VARIABLE_LENGTH_PROPS_STRING) and (_type not in constants.FIXED_LENGTH_PROPS_STRING): - raise UnknownTypeError('Unknown type {}'.format(_type)) + raise UnknownTypeError(f'Unknown type {_type}') def windowsUnicode(string): return str(string, 'utf-16-le') if string is not None else None From 4aac5a506590a85c8b00fd2de875f61145c3c274 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 16 Jan 2022 23:01:13 -0800 Subject: [PATCH 09/17] Finished with f strings. --- extract_msg/utils.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 4606d25e..6c623eef 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -473,10 +473,10 @@ def openMsg(path, prefix = '', attachmentClass = None, filename = None, delayAtt ct = msg.classType msg.close() if knownMsgClass(classType): - raise UnsupportedMSGTypeError('MSG type "{}" currently is not supported by the module. If you would like support, please make a feature request.'.format(ct)) - raise UnrecognizedMSGTypeError('Could not recognize msg class type "{}".'.format(ct)) + raise UnsupportedMSGTypeError(f'MSG type "{ct}" currently is not supported by the module. If you would like support, please make a feature request.') + raise UnrecognizedMSGTypeError(f'Could not recognize msg class type "{ct}".') else: - logger.error('Could not recognize msg class type "{}". This most likely means it hasn\'t been implemented yet, and you should ask the developers to add support for it.'.format(msg.classType)) + logger.error(f'Could not recognize msg class type "{msg.classType}". 
This most likely means it hasn\'t been implemented yet, and you should ask the developers to add support for it.') return msg def parseType(_type, stream, encoding, extras): @@ -557,27 +557,27 @@ def parseType(_type, stream, encoding, extras): # TODO parsing for `multiple` types if _type in (0x101F, 0x101E): ret = [x.decode(encoding) for x in extras] - lengths = struct.unpack('<{}i'.format(len(ret)), stream) + lengths = struct.unpack(f'<{len(ret)}i', stream) lengthLengths = len(lengths) if lengthLengths > lengthExtras: - logger.warning('Error while parsing multiple type. Expected {} stream{}, got {}. Ignoring.'.format(lengthLengths, 's' if lengthLengths != 1 else '', lengthExtras)) + logger.warning(f'Error while parsing multiple type. Expected {lengthLengths} stream{"s" if lengthLengths != 1 else ""}, got {lengthExtras}. Ignoring.') for x, y in enumerate(extras): if lengths[x] != len(y): - logger.warning('Error while parsing multiple type. Expected length {}, got {}. Ignoring.'.format(lengths[x], len(y))) + logger.warning(f'Error while parsing multiple type. Expected length {lengths[x]}, got {len(y)}. Ignoring.') return ret elif _type == 0x1102: ret = copy.deepcopy(extras) lengths = tuple(constants.STUI32.unpack(stream[pos*8:(pos+1)*8])[0] for pos in range(len(stream) // 8)) lengthLengths = len(lengths) if lengthLengths > lengthExtras: - logger.warning('Error while parsing multiple type. Expected {} stream{}, got {}. Ignoring.'.format(lengthLengths, 's' if lengthLengths != 1 else '', lengthExtras)) + logger.warning(f'Error while parsing multiple type. Expected {lengthLengths} stream{"s" if lengthLengths != 1 else ""}, got {lengthExtras}. Ignoring.') for x, y in enumerate(extras): if lengths[x] != len(y): - logger.warning('Error while parsing multiple type. Expected length {}, got {}. Ignoring.'.format(lengths[x], len(y))) + logger.warning(f'Error while parsing multiple type. Expected length {lengths[x]}, got {len(y)}. 
Ignoring.') return ret elif _type in (0x1002, 0x1003, 0x1004, 0x1005, 0x1007, 0x1014, 0x1040, 0x1048): if stream != len(extras): - logger.warning('Error while parsing multiple type. Expected {} entr{}, got {}. Ignoring.'.format(stream, ('y' if stream == 1 else 'ies'), len(extras))) + logger.warning(f'Error while parsing multiple type. Expected {stream} entr{"y" if stream == 1 else "ies"}, got {len(extras)}. Ignoring.') if _type == 0x1002: return tuple(constants.STMI16.unpack(x)[0] for x in extras) if _type == 0x1003: @@ -596,7 +596,7 @@ def parseType(_type, stream, encoding, extras): if _type == 0x1048: return tuple(bytesToGuid(x) for x in extras) else: - raise NotImplementedError('Parsing for type {} has not yet been implmented. If you need this type, please create a new issue labeled "NotImplementedError: parseType {}"'.format(_type, _type)) + raise NotImplementedError(f'Parsing for type {_type} has not yet been implmented. If you need this type, please create a new issue labeled "NotImplementedError: parseType {_type}"') return value def prepareFilename(filename): @@ -609,7 +609,8 @@ def prepareFilename(filename): def properHex(inp, length = 0): """ - Taken (with permission) from https://github.com/TheElementalOfDestruction/creatorUtils + Taken (with permission) from + https://github.com/TheElementalOfDestruction/creatorUtils """ a = '' if isinstance(inp, str): @@ -731,7 +732,7 @@ def setupLogging(defaultPath=None, defaultLevel=logging.WARN, logfile=None, enab print(str(paths[1:])) logging.basicConfig(level=defaultLevel) logging.warning('The extract_msg logging configuration was not found - using a basic configuration.' 
- 'Please check the extract_msg installation directory for "logging-{}.json".'.format(os.name)) + f'Please check the extract_msg installation directory for "logging-{os.name}.json".') return False with open(path, 'rt') as f: From 9009058270c5eae9d2bd61bb0d511dd2098555c6 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 16 Jan 2022 23:03:01 -0800 Subject: [PATCH 10/17] Fixed one f string --- extract_msg/recipient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract_msg/recipient.py b/extract_msg/recipient.py index ad1e6df7..64e41419 100644 --- a/extract_msg/recipient.py +++ b/extract_msg/recipient.py @@ -23,7 +23,7 @@ def __init__(self, _dir, msg): self.__email = self._getStringStream('__substg1.0_3003') self.__name = self._getStringStream('__substg1.0_3001') self.__type = self.__props.get('0C150003').value - self.__formatted = uf'{self.__name} <{self.__email}>' + self.__formatted = f'{self.__name} <{self.__email}>' def _ensureSet(self, variable, streamID, stringStream = True): """ From 495f71c5eb770f64062b0a8dd5f84d9eee7075ab Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 16 Jan 2022 23:11:22 -0800 Subject: [PATCH 11/17] language updates and logic updates for pathlib. --- extract_msg/dev_classes/message.py | 4 ++-- extract_msg/msg.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/extract_msg/dev_classes/message.py b/extract_msg/dev_classes/message.py index 40d13e5e..50616149 100644 --- a/extract_msg/dev_classes/message.py +++ b/extract_msg/dev_classes/message.py @@ -87,7 +87,7 @@ def _getStream(self, filename, prefix=True): def _getStringStream(self, filename, prefer='unicode', prefix=True): """ Gets a string representation of the requested filename. - This should ALWAYS return a string (Unicode in python 2) + This should ALWAYS return a string. 
""" filename = self.fix_path(filename, prefix) @@ -150,7 +150,7 @@ def listDir(self, streams=True, storages=False): if good: out.append(x) - + return out @property diff --git a/extract_msg/msg.py b/extract_msg/msg.py index 02cb8f66..08412f9c 100644 --- a/extract_msg/msg.py +++ b/extract_msg/msg.py @@ -42,7 +42,7 @@ def __init__(self, path, prefix = '', attachmentClass = Attachment, filename = N self.__path = path self.__attachmentClass = attachmentClass if not (constants.ATTACHMENT_ERROR_THROW <= attachmentErrorBehavior <= constants.ATTACHMENT_ERROR_BROKEN): - raise ValueError("`attachmentErrorBehavior` must be ATTACHMENT_ERROR_THROW, ATTACHMENT_ERROR_NOT_IMPLEMENTED, or ATTACHMENT_ERROR_BROKEN.") + raise ValueError(":param attachmentErrorBehavior: must be ATTACHMENT_ERROR_THROW, ATTACHMENT_ERROR_NOT_IMPLEMENTED, or ATTACHMENT_ERROR_BROKEN.") self.__attachmentErrorBehavior = attachmentErrorBehavior if overrideEncoding is not None: codecs.lookup(overrideEncoding) @@ -85,9 +85,11 @@ def __init__(self, path, prefix = '', attachmentClass = Attachment, filename = N self.filename = filename elif hasLen(path): if len(path) < 1536: - self.filename = path + self.filename = str(path) else: self.filename = None + elif isinstance(path, pathlib.Path): + self.filename = str(path) else: self.filename = None @@ -97,7 +99,7 @@ def _ensureSet(self, variable, streamID, stringStream = True): specified stream. After that, return said variable. If the specified stream is not a string stream, make sure to set - :param string stream: to False. + :param stringStream: to False. 
""" try: return getattr(self, variable) From 02252d2f7fadd39333dd96d683556b8fb0b1294f Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 16 Jan 2022 23:18:31 -0800 Subject: [PATCH 12/17] Update readme with new usage doc --- README.rst | 65 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/README.rst b/README.rst index 8ec11922..266b2674 100644 --- a/README.rst +++ b/README.rst @@ -10,8 +10,8 @@ data (from, to, cc, date, subject, body) and the email's attachments. NOTICE ====== -0.29.* is the branch that supports both Python 2 and Python 3. It is now only receiving bug fixes -and will not be receiving feature updates. +0.29.* is the branch that supports both Python 2 and Python 3. It is now only +receiving bug fixes and will not be receiving feature updates. This module has a Discord server for general discussion. You can find it here: `Discord`_ @@ -52,35 +52,48 @@ refer to the usage information provided from the program's help dialog: :: usage: extract_msg [-h] [--use-content-id] [--dev] [--validate] [--json] - [--file-logging] [--verbose] [--log LOG] - [--config CONFIG_PATH] [--out OUT_PATH] [--use-filename] - msg [msg ...] + [--file-logging] [--verbose] [--log LOG] + [--config CONFIG_PATH] [--out OUT_PATH] [--use-filename] + [--dump-stdout] [--html] [--raw] [--rtf] + [--allow-fallback] [--out-name OUT_NAME] msg [msg ...] - extract_msg: Extracts emails and attachments saved in Microsoft Outlook's .msg - files. https://github.com/mattgwwalker/msg-extractor + extract_msg: Extracts emails and attachments saved in Microsoft Outlook's + .msg files. https://github.com/TeamMsgExtractor/msg-extractor positional arguments: - msg An msg file to be parsed + msg An msg file to be parsed optional arguments: - -h, --help show this help message and exit - --use-content-id, --cid - Save attachments by their Content ID, if they have - one. Useful when working with the HTML body. 
- --dev Changes to use developer mode. Automatically enables - the --verbose flag. Takes precedence over the - --validate flag. - --validate Turns on file validation mode. Turns off regular file - output. - --json Changes to write output files as json. - --file-logging Enables file logging. Implies --verbose - --verbose Turns on console logging. - --log LOG Set the path to write the file log to. - --config CONFIG_PATH Set the path to load the logging config from. - --out OUT_PATH Set the folder to use for the program output. - (Default: Current directory) - --use-filename Sets whether the name of each output is based on the - msg filename. + -h, --help show this help message and exit + --use-content-id, --cid + Save attachments by their Content ID, if they have + one. Useful when working with the HTML body. + --dev Changes to use developer mode. Automatically + enables the --verbose flag. Takes precedence over + the --validate flag. + --validate Turns on file validation mode. Turns off regular + file output. + --json Changes to write output files as json. + --file-logging Enables file logging. Implies --verbose. + --verbose Turns on console logging. + --log LOG Set the path to write the file log to. + --config CONFIG_PATH Set the path to load the logging config from. + --out OUT_PATH Set the folder to use for the program output. + (Default: Current directory) + --use-filename Sets whether the name of each output is based on + the msg filename. + --dump-stdout Tells the program to dump the message body (plain + text) to stdout. Overrides saving arguments. + --html Sets whether the output should be html. If this is + not possible, will error. + --raw Sets whether the output should be html. If this is + not possible, will error. + --rtf Sets whether the output should be rtf. If this is + not possible, will error. + --allow-fallback Tells the program to fallback to a different save + type if the selected one is not possible. 
+ --out-name OUT_NAME Name to be used with saving the file output. + Should come immediately after the file name. **To use this in your own script**, start by using: From 17026ded409c538f0e88c93cd19b7b7c719fc4ba Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Tue, 18 Jan 2022 18:56:34 -0800 Subject: [PATCH 13/17] typehints, documentation improvement, code improvement --- CHANGELOG.md | 5 +- README.rst | 9 ++-- extract_msg/constants.py | 4 +- extract_msg/data.py | 8 +-- extract_msg/message_base.py | 25 ++++----- extract_msg/msg.py | 42 ++++++++------- extract_msg/prop.py | 6 +-- extract_msg/properties.py | 2 +- extract_msg/utils.py | 105 ++++++++++++++++++++---------------- 9 files changed, 115 insertions(+), 91 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a71b6726..df1fad59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,12 @@ **v0.30.0** * Removed all support for Python 2. This caused a lot of things to be moved around and changed from indirect references to direct references, so it's possible something fell through the cracks. I'm doing my best to test it, but let me know if you have an issue. -* Changed classes to now prefer super() over direct superclass initalization. +* Changed classes to now prefer super() over direct superclass initialization. * Removed explicit object subclassing (it's implicit in Python 3 so we don't need it anymore). * Converted most `.format`s into f strings. * Improved consistency of docstrings. It's not perfect, but it should at least be better. +* Started the addition of type hints to functions and methods. +* Updated `utils.bytesToGuid` to make it faster and more efficient. +* Renamed `utils.msgEpoch` to `utils.filetimeToUtc` to be more descriptive. **v0.29.2** * Fixed issue where the RTF injection was accidentally doing HTML escapes for non-encapsulated streams and *not* doing escapes for encapsulated streams. 
diff --git a/README.rst b/README.rst index 266b2674..f0a2d9fa 100644 --- a/README.rst +++ b/README.rst @@ -40,10 +40,11 @@ OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format). This is the underlying format of Outlook's .msg files. This library currently supports Python 3.6 and above. -The script was built using Peter Fiskerstrand's documentation of the -.msg format. Redemption's discussion of the different property types -used within Extended MAPI was also useful. For future reference, I note -that Microsoft have opened up their documentation of the file format. +The script was originally built using Peter Fiskerstrand's documentation of the +.msg format. Redemption's discussion of the different property types used within +Extended MAPI was also useful. For future reference, I note that Microsoft have +opened up their documentation of the file format, which is what is currently +being used for development. #########REWRITE COMMAND LINE USAGE############# diff --git a/extract_msg/constants.py b/extract_msg/constants.py index 935898f7..b7706012 100644 --- a/extract_msg/constants.py +++ b/extract_msg/constants.py @@ -619,6 +619,8 @@ ST1 = struct.Struct('<8x4I') ST2 = struct.Struct(' bytes: """ Returns the raw data used to generate this instance. """ return self.__data @property - def displayTypeString(self): + def displayTypeString(self) -> int: """ Returns the display type string. This will be one of the display type constants. """ return self.__displayTypeString @property - def distinguishedName(self): + def distinguishedName(self) -> str: """ Returns the distinguished name. """ diff --git a/extract_msg/message_base.py b/extract_msg/message_base.py index d0904f68..549020c4 100644 --- a/extract_msg/message_base.py +++ b/extract_msg/message_base.py @@ -17,6 +17,7 @@ logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) + class MessageBase(MSGFile): """ Base class for Message like msg files. 
@@ -121,7 +122,7 @@ def _registerNamedProperty(self, entry, _type, name = None): super()._registerNamedProperty(entry, _type, name) - def close(self): + def close(self) -> None: try: # If this throws an AttributeError then we have not loaded the attachments. self._attachments @@ -132,7 +133,7 @@ def close(self): pass super().close() - def headerInit(self): + def headerInit(self) -> bool: """ Checks whether the header has been initialized. """ @@ -142,7 +143,7 @@ def headerInit(self): except AttributeError: return False - def saveAttachments(self, **kwargs): + def saveAttachments(self, **kwargs) -> None: """ Saves only attachments in the same folder. """ @@ -270,7 +271,7 @@ def date(self): return self._date @property - def defaultFolderName(self): + def defaultFolderName(self) -> str: """ Generates the default name of the save folder. """ @@ -313,7 +314,7 @@ def header(self): return self._header @property - def headerDict(self): + def headerDict(self) -> dict: """ Returns a dictionary of the entries in the header """ @@ -328,21 +329,21 @@ def headerDict(self): return self._headerDict @property - def htmlBody(self): + def htmlBody(self) -> bytes: """ Returns the html body, if it exists. """ return self._ensureSet('_htmlBody', '__substg1.0_10130102', False) @property - def inReplyTo(self): + def inReplyTo(self) -> str: """ Returns the message id that this message is in reply to. """ return self._ensureSet('_in_reply_to', '__substg1.0_1042') @property - def isRead(self): + def isRead(self) -> bool: """ Returns if this email has been marked as read. """ @@ -369,11 +370,11 @@ def parsedDate(self): return email.utils.parsedate(self.date) @property - def recipientSeparator(self): + def recipientSeparator(self) -> str: return self.__recipientSeparator @property - def recipients(self): + def recipients(self) -> list: """ Returns a list of all recipients. 
""" @@ -396,7 +397,7 @@ def recipients(self): return self._recipients @property - def rtfBody(self): + def rtfBody(self) -> bytes: """ Returns the decompressed Rtf body from the message. """ @@ -407,7 +408,7 @@ def rtfBody(self): return self._rtfBody @property - def sender(self): + def sender(self) -> str: """ Returns the message sender, if it exists. """ diff --git a/extract_msg/msg.py b/extract_msg/msg.py index 08412f9c..e9582ecc 100644 --- a/extract_msg/msg.py +++ b/extract_msg/msg.py @@ -93,7 +93,7 @@ def __init__(self, path, prefix = '', attachmentClass = Attachment, filename = N else: self.filename = None - def _ensureSet(self, variable, streamID, stringStream = True): + def _ensureSet(self, variable : str, streamID, stringStream : bool = True): """ Ensures that the variable exists, otherwise will set it using the specified stream. After that, return said variable. @@ -111,7 +111,7 @@ def _ensureSet(self, variable, streamID, stringStream = True): setattr(self, variable, value) return value - def _ensureSetNamed(self, variable, propertyName): + def _ensureSetNamed(self, variable : str, propertyName): """ Ensures that the variable exists, otherwise will set it using the named property. After that, return said variable. @@ -123,7 +123,7 @@ def _ensureSetNamed(self, variable, propertyName): setattr(self, variable, value) return value - def _ensureSetProperty(self, variable, propertyName): + def _ensureSetProperty(self, variable : str, propertyName): """ Ensures that the variable exists, otherwise will set it using the property. After that, return said variable. @@ -138,7 +138,7 @@ def _ensureSetProperty(self, variable, propertyName): setattr(self, variable, value) return value - def _ensureSetTyped(self, variable, _id): + def _ensureSetTyped(self, variable : str, _id): """ Like the other ensure set functions, but designed for when something could be multiple types (where only one will be present). 
This way you @@ -151,11 +151,12 @@ def _ensureSetTyped(self, variable, _id): setattr(self, variable, value) return value - def _getStream(self, filename, prefix = True): + def _getStream(self, filename, prefix : bool = True) -> bytes: """ Gets a binary representation of the requested filename. - This should ALWAYS return a bytes object. + This should ALWAYS return a bytes object if it was found, otherwise + returns None. """ filename = self.fixPath(filename, prefix) if self.exists(filename, False): @@ -165,7 +166,7 @@ def _getStream(self, filename, prefix = True): logger.info(f'Stream "{filename}" was requested but could not be found. Returning `None`.') return None - def _getStringStream(self, filename, prefix = True): + def _getStringStream(self, filename, prefix : bool = True) -> str: """ Gets a string representation of the requested filename. @@ -173,7 +174,8 @@ def _getStringStream(self, filename, prefix = True): filename sans the type. So if the full name is "__substg1.0_001A001F", the filename this function should receive should be "__substg1.0_001A". - This should ALWAYS return a string. + This should ALWAYS return a string if it was found, otherwise returns + None. """ filename = self.fixPath(filename, prefix) @@ -183,7 +185,7 @@ def _getStringStream(self, filename, prefix = True): tmp = self._getStream(filename + '001E', prefix = False) return None if tmp is None else tmp.decode(self.stringEncoding) - def _getTypedData(self, _id, _type = None, prefix = True): + def _getTypedData(self, _id : str, _type = None, prefix : bool = True): """ Gets the data for the specified id as the type that it is supposed to be. :param id: MUST be a 4 digit hexadecimal string. 
@@ -201,7 +203,7 @@ def _getTypedData(self, _id, _type = None, prefix = True): found, result = self._getTypedProperty(_id, _type) return result if found else None - def _getTypedProperty(self, propertyID, _type = None): + def _getTypedProperty(self, propertyID : str, _type = None): """ Gets the property with the specified id as the type that it is supposed to be. :param id: MUST be a 4 digit hexadecimal string. @@ -219,7 +221,7 @@ def _getTypedProperty(self, propertyID, _type = None): return True, (prop.value if isinstance(prop, FixedLengthProp) else prop) return False, None - def _getTypedStream(self, filename, prefix = True, _type = None): + def _getTypedStream(self, filename, prefix : bool = True, _type = None): """ Gets the contents of the specified stream as the type that it is supposed to be. @@ -279,20 +281,20 @@ def _registerNamedProperty(self, entry, _type, name = None): """ pass - def debug(self): + def debug(self) -> None: for dir_ in self.listDir(): if dir_[-1].endswith('001E') or dir_[-1].endswith('001F'): print('Directory: ' + str(dir_[:-1])) print(f'Contents: {self._getStream(dir_)}') - def exists(self, inp, prefix = True): + def exists(self, inp, prefix : bool = True) -> bool: """ Checks if :param inp: exists in the msg file. """ inp = self.fixPath(inp, prefix) return olefile.OleFileIO.exists(self, inp) - def sExists(self, inp, prefix = True): + def sExists(self, inp, prefix : bool = True) -> bool: """ Checks if string stream :param inp: exists in the msg file. """ @@ -336,7 +338,7 @@ def existsTypedProperty(self, _id, location = None, _type = None, prefix = True, foundNumber += 1 return (foundNumber > 0), foundNumber - def fixPath(self, inp, prefix = True): + def fixPath(self, inp, prefix : bool = True): """ Changes paths so that they have the proper prefix (should :param prefix: be True) and are strings rather than lists or tuples. 
@@ -346,7 +348,7 @@ def fixPath(self, inp, prefix = True): inp = self.__prefix + inp return inp - def listDir(self, streams = True, storages = False): + def listDir(self, streams : bool = True, storages : bool = False): """ Replacement for OleFileIO.listdir that runs at the current prefix directory. @@ -366,7 +368,7 @@ def listDir(self, streams = True, storages = False): self.__listDirRes = [x for x in entries if len(x) > prefixLength and x[:prefixLength] == prefix] return self.__listDirRes - def slistDir(self, streams = True, storages = False): + def slistDir(self, streams : bool = True, storages : bool = False): """ Replacement for OleFileIO.listdir that runs at the current prefix directory. Returns a list of strings instead of lists. @@ -409,7 +411,7 @@ def saveRaw(self, path): f.write(data) @property - def areStringsUnicode(self): + def areStringsUnicode(self) -> bool: """ Returns a boolean telling if the strings are unicode encoded. """ @@ -457,7 +459,7 @@ def importance(self): return self._ensureSetProperty('_importance', '00170003') @property - def mainProperties(self): + def mainProperties(self) -> Properties: """ Returns the Properties instance used by the MSGFile instance. """ @@ -469,7 +471,7 @@ def mainProperties(self): return self._prop @property - def named(self): + def named(self) -> Named: """ The main named properties instance for this file. """ diff --git a/extract_msg/prop.py b/extract_msg/prop.py index 8e0e8a9b..44462046 100644 --- a/extract_msg/prop.py +++ b/extract_msg/prop.py @@ -2,7 +2,7 @@ import logging from . 
import constants -from .utils import fromTimeStamp, msgEpoch, properHex +from .utils import fromTimeStamp, filetimeToUtc, properHex logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) @@ -135,13 +135,13 @@ def parseType(self, _type, stream): try: rawtime = constants.ST3.unpack(value)[0] if rawtime != 915151392000000000: - value = fromTimeStamp(msgEpoch(rawtime)) + value = fromTimeStamp(filetimeToUtc(rawtime)) else: # Temporarily just set to max time to signify a null date. value = datetime.datetime.max except Exception as e: logger.exception(e) - logger.error(f'Timestamp value of {msgEpoch(constants.ST3.unpack(value)[0])} caused an exception. This was probably caused by the time stamp being too far in the future.') + logger.error(f'Timestamp value of {filetimeToUtc(constants.ST3.unpack(value)[0])} caused an exception. This was probably caused by the time stamp being too far in the future.') logger.error(self.raw) elif _type == 0x0048: # PtypGuid # TODO parsing for this diff --git a/extract_msg/properties.py b/extract_msg/properties.py index d6a2b0b3..20ab8325 100644 --- a/extract_msg/properties.py +++ b/extract_msg/properties.py @@ -4,7 +4,7 @@ from . import constants from .prop import createProp -from .utils import divide, fromTimeStamp, msgEpoch, properHex +from .utils import divide, fromTimeStamp, filetimeToUtc, properHex logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 6c623eef..c92e0dac 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -28,7 +28,7 @@ logging.addLevelName(5, 'DEVELOPER') -def addNumToDir(dirName): +def addNumToDir(dirName : pathlib.Path) -> pathlib.Path: """ Attempt to create the directory with a '(n)' appended. 
""" @@ -52,15 +52,15 @@ def addNumToZipDir(dirName : pathlib.Path, _zip): return newDirName return None -def bitwiseAdjust(inp, mask): +def bitwiseAdjust(inp : int, mask : int) -> int: """ Uses a given mask to adjust the location of bits after an operation like bitwise AND. This is useful for things like flags where you are trying to get a small portion of a larger number. Say for example, you had the number 0xED (0b11101101) and you needed the adjusted result of the AND operation - with 0x70 (0b01110000). The result of the and operation (0b01100000) and the - mask used to get it (0x70) are give and the output gets adjusted to be 0x6 - (0b110). + with 0x70 (0b01110000). The result of the AND operation (0b01100000) and the + mask used to get it (0x70) are given to this function and the adjustment + will be done automatically. :param mask: MUST be greater than 0. """ @@ -68,7 +68,7 @@ def bitwiseAdjust(inp, mask): raise ValueError('Mask MUST be greater than 0') return inp >> bin(mask)[::-1].index('1') -def bitwiseAdjustedAnd(inp, mask): +def bitwiseAdjustedAnd(inp : int, mask : int) -> int: """ Preforms the bitwise AND operation between :param inp: and :param mask: and adjusts the results based on the rules of the bitwiseAdjust function. @@ -77,12 +77,14 @@ def bitwiseAdjustedAnd(inp, mask): raise ValueError('Mask MUST be greater than 0') return (inp & mask) >> bin(mask)[::-1].index('1') -def bytesToGuid(bytes_input): - hexinput = [properHex(byte) for byte in bytes_input] - hexs = [hexinput[3] + hexinput[2] + hexinput[1] + hexinput[0], hexinput[5] + hexinput[4], hexinput[7] + hexinput[6], hexinput[8] + hexinput[9], ''.join(hexinput[10:16])] - return '{{{}-{}-{}-{}-{}}}'.format(*hexs).upper() +def bytesToGuid(bytesInput : bytes) -> str: + """ + Converts a bytes instance to a GUID. 
+ """ + guidVals = constants.ST_GUID.unpack(bytesInput) + return f'{{{guidVals[0]:08X}-{guidVals[1]:04X}-{guidVals[2]:04X}-{guidVals[3][:2].hex().upper()}-{guidVals[3][2:].hex().upper()}}}' -def ceilDiv(n, d): +def ceilDiv(n : int, d : int) -> int: """ Returns the int from the ceil division of n / d. ONLY use ints as inputs to this function. @@ -92,14 +94,11 @@ def ceilDiv(n, d): """ return -(n // -d) -def divide(string, length): +def divide(string, length : int) -> list: """ - Taken (with permission) from https://github.com/TheElementalOfDestruction/creatorUtils - - Divides a string into multiple substrings of equal length. - If there is not enough for the last substring to be equal, - it will simply use the rest of the string. - Can also be used for things like lists and tuples. + Divides a string into multiple substrings of equal length. If there is not + enough for the last substring to be equal, it will simply use the rest of + the string. Can also be used for things like lists and tuples. :param string: string to be divided. :param length: length of each division. @@ -115,12 +114,15 @@ def divide(string, length): """ return [string[length * x:length * (x + 1)] for x in range(int(ceilDiv(len(string), length)))] -def fromTimeStamp(stamp): +def fromTimeStamp(stamp) -> datetime.datetime: + """ + Returns a datetime from the UTC timestamp given the current timezone. + """ return datetime.datetime.fromtimestamp(stamp, tzlocal.get_localzone()) def getCommandArgs(args): """ - Parse command-line arguments + Parse command-line arguments. """ parser = argparse.ArgumentParser(description=constants.MAINDOC, prog='extract_msg') # --use-content-id, --cid @@ -222,7 +224,7 @@ def getContFileDir(_file_): """ return '/'.join(_file_.replace('\\', '/').split('/')[:-1]) -def getEncodingName(codepage): +def getEncodingName(codepage : int) -> str: """ Returns the name of the encoding with the specified codepage. 
""" @@ -237,13 +239,13 @@ def getEncodingName(codepage): def getFullClassName(inp): return inp.__class__.__module__ + '.' + inp.__class__.__name__ -def hasLen(obj): +def hasLen(obj) -> bool: """ Checks if :param obj: has a __len__ attribute. """ return hasattr(obj, '__len__') -def injectHtmlHeader(msgFile): +def injectHtmlHeader(msgFile) -> bytes: """ Returns the HTML body from the MSG file (will check that it has one) with the HTML header injected into it. @@ -268,7 +270,7 @@ def replace(bodyMarker): # Use the previously defined function to inject the HTML header. return constants.RE_HTML_BODY_START.sub(replace, msgFile.htmlBody, 1) -def injectRtfHeader(msgFile): +def injectRtfHeader(msgFile) -> bytes: """ Returns the RTF body from the MSG file (will check that it has one) with the RTF header injected into it. @@ -345,7 +347,12 @@ def replace(bodyMarker): raise Exception('All injection attempts failed.') -def inputToBytes(stringInputVar, encoding): +def inputToBytes(stringInputVar, encoding) -> bytes: + """ + Converts the input into bytes. + + :raises ConversionError: if the input cannot be converted. + """ if isinstance(stringInputVar, bytes): return stringInputVar elif isinstance(stringInputVar, str): @@ -355,7 +362,7 @@ def inputToBytes(stringInputVar, encoding): else: raise ConversionError('Cannot convert to bytes.') -def inputToMsgpath(inp): +def inputToMsgpath(inp) -> list: """ Converts the input into an msg path. """ @@ -364,7 +371,12 @@ def inputToMsgpath(inp): ret = inputToString(inp, 'utf-8').replace('\\', '/').split('/') return ret if ret[0] != '' else [] -def inputToString(bytesInputVar, encoding): +def inputToString(bytesInputVar, encoding) -> str: + """ + Converts the input into a string. + + :raises ConversionError: if the input cannot be converted. 
+ """ if isinstance(bytesInputVar, str): return bytesInputVar elif isinstance(bytesInputVar, bytes): @@ -374,7 +386,7 @@ def inputToString(bytesInputVar, encoding): else: raise ConversionError('Cannot convert to str type.') -def isEncapsulatedRtf(inp): +def isEncapsulatedRtf(inp : bytes) -> bool: """ Currently the destection is made to be *extremly* basic, but this will work for now. In the future this will be fixed to that literal text in the body @@ -382,13 +394,13 @@ def isEncapsulatedRtf(inp): """ return b'\\fromhtml' in inp -def isEmptyString(inp): +def isEmptyString(inp : str) -> bool: """ Returns true if the input is None or is an Empty string. """ return (inp == '' or inp is None) -def knownMsgClass(classType): +def knownMsgClass(classType : str) -> bool: """ Checks if the specified class type is recognized by the module. Usually used for checking if a type is simply unsupported rather than unknown. @@ -403,15 +415,16 @@ def knownMsgClass(classType): return False -def msgEpoch(inp): +def filetimeToUtc(inp : int) -> float: """ - Taken (with permission) from https://github.com/TheElementalOfDestruction/creatorUtils + Converts a FILETIME into a unix timestamp. """ return (inp - 116444736000000000) / 10000000.0 -def msgpathToString(inp): +def msgpathToString(inp) -> str: """ - Converts an msgpath (one of the internal paths inside an msg file) into a string. + Converts an msgpath (one of the internal paths inside an msg file) into a + string. """ if inp is None: return None @@ -445,7 +458,8 @@ def openMsg(path, prefix = '', attachmentClass = None, filename = None, delayAtt when it cannot identify what MSGFile derivitive to use. Otherwise, it will log the error and return a basic MSGFile instance. - Raises UnsupportedMSGTypeError and UnrecognizedMSGTypeError. + :raises UnsupportedMSGTypeError: if the type is recognized but not suppoted. + :raises UnrecognizedMSGTypeError: if the type is not recognized. 
""" from .appointment import Appointment from .attachment import Attachment @@ -470,6 +484,7 @@ def openMsg(path, prefix = '', attachmentClass = None, filename = None, delayAtt elif classType == 'ipm': # Unspecified format. It should be equal to this and not just start with it. return msg elif strict: + # Because we are closing it, we need to store it in a variable first. ct = msg.classType msg.close() if knownMsgClass(classType): @@ -479,7 +494,7 @@ def openMsg(path, prefix = '', attachmentClass = None, filename = None, delayAtt logger.error(f'Could not recognize msg class type "{msg.classType}". This most likely means it hasn\'t been implemented yet, and you should ask the developers to add support for it.') return msg -def parseType(_type, stream, encoding, extras): +def parseType(_type : int, stream, encoding, extras): """ Converts the data in :param stream: to a much more accurate type, specified by :param _type:. @@ -535,7 +550,7 @@ def parseType(_type, stream, encoding, extras): elif _type == 0x0040: # PtypTime rawtime = constants.ST3.unpack(value)[0] if rawtime != 915151392000000000: - value = fromTimeStamp(msgEpoch(rawtime)) + value = fromTimeStamp(filetimeToUtc(rawtime)) else: # Temporarily just set to max time to signify a null date. value = datetime.datetime.max @@ -592,14 +607,14 @@ def parseType(_type, stream, encoding, extras): if _type == 0x1014: return tuple(constants.STMI64.unpack(x)[0] for x in extras) if _type == 0x1040: - return tuple(msgEpoch(constants.ST3.unpack(x)[0]) for x in extras) + return tuple(filetimeToUtc(constants.ST3.unpack(x)[0]) for x in extras) if _type == 0x1048: return tuple(bytesToGuid(x) for x in extras) else: raise NotImplementedError(f'Parsing for type {_type} has not yet been implmented. 
If you need this type, please create a new issue labeled "NotImplementedError: parseType {_type}"') return value -def prepareFilename(filename): +def prepareFilename(filename) -> str: """ Adjusts :param filename: so that it can succesfully be used as an actual file name. @@ -607,10 +622,10 @@ def prepareFilename(filename): # I would use re here, but it tested to be slightly slower than this. return ''.join(i for i in filename if i not in r'\/:*?"<>|' + '\x00') -def properHex(inp, length = 0): +def properHex(inp, length : int = 0) -> str: """ - Taken (with permission) from - https://github.com/TheElementalOfDestruction/creatorUtils + Takes in various input types and converts them into a hex string whose + length will always be even. """ a = '' if isinstance(inp, str): @@ -623,13 +638,13 @@ def properHex(inp, length = 0): a = '0' + a return a.rjust(length, '0').upper() -def roundUp(inp, mult): +def roundUp(inp : int, mult : int) -> int: """ Rounds :param inp: up to the nearest multiple of :param mult:. """ return inp + (mult - inp) % mult -def rtfSanitizeHtml(inp): +def rtfSanitizeHtml(inp : str) -> str: """ Sanitizes input to an RTF stream that has encapsulated HTML. """ @@ -659,7 +674,7 @@ def rtfSanitizeHtml(inp): return output -def rtfSanitizePlain(inp): +def rtfSanitizePlain(inp : str) -> str: """ Sanitizes input to a plain RTF stream. 
""" From 37406ae14a7a1f3944379847f5240a667a07a7a5 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Wed, 19 Jan 2022 11:52:04 -0800 Subject: [PATCH 14/17] type hints, update to match 0.29.3 --- CHANGELOG.md | 7 +++++++ extract_msg/__init__.py | 2 +- extract_msg/__main__.py | 4 +--- extract_msg/message.py | 4 ++-- extract_msg/recipient.py | 28 ++++++++++++++-------------- extract_msg/utils.py | 2 +- 6 files changed, 26 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index df1fad59..6005480a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,13 @@ * Updated `utils.bytesToGuid` to make it faster and more efficient. * Renamed `utils.msgEpoch` to `utils.filetimeToUtc` to be more descriptive. +**v0.29.3** +* [[TeamMsgExtractor #226](https://github.com/TeamMsgExtractor/msg-extractor/issues/198)] Fix typo in command parsing that prevented the usage of `allowFallback`. +* Fixed main still manually navigating to a new directory with os.chdir instead of using `customPath`. +* Fixed issue in main where the `--html` option was being using for both html *and* rtf. This meant if you wanted rtf it would not have used it, and if you wanted html it would have thrown an error. +* Fixed `--out-name` having no effect. +* Fixed `--out` having no effect. + **v0.29.2** * Fixed issue where the RTF injection was accidentally doing HTML escapes for non-encapsulated streams and *not* doing escapes for encapsulated streams. * Fixed name error in `Message.save` causing bad logic. For context, the internal variable `zip` was renamed to `_zip` to avoid a name conflict with the built-in function. Some instances of it were missed. diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py index 8de35773..05e95446 100644 --- a/extract_msg/__init__.py +++ b/extract_msg/__init__.py @@ -27,7 +27,7 @@ # along with this program. If not, see . 
__author__ = 'Destiny Peterson & Matthew Walker' -__date__ = '2022-01-16' +__date__ = '2022-01-19' __version__ = '0.30.0' import logging diff --git a/extract_msg/__main__.py b/extract_msg/__main__.py index 573bca6e..932c99e0 100644 --- a/extract_msg/__main__.py +++ b/extract_msg/__main__.py @@ -47,12 +47,10 @@ def main(): if args.dump_stdout: print(msg.body) else: - os.chdir(out) - msg.save(json = args.json, useMsgFilename = args.use_filename, contentId = args.cid, html = args.html, rtf = args.html, allowFallback = args.allowFallback) + msg.save(customPath = out, customFilename = args.out_name, json = args.json, useMsgFilename = args.use_filename, contentId = args.cid, html = args.html, rtf = args.rtf, allowFallback = args.allowFallback) except Exception as e: print("Error with file '" + x[0] + "': " + traceback.format_exc()) - os.chdir(currentdir) if __name__ == '__main__': main() diff --git a/extract_msg/message.py b/extract_msg/message.py index 80721102..500c06dd 100644 --- a/extract_msg/message.py +++ b/extract_msg/message.py @@ -23,7 +23,7 @@ class Message(MessageBase): def __init__(self, path, prefix = '', attachmentClass = Attachment, filename = None, delayAttachments = False, overrideEncoding = None, attachmentErrorBehavior = constants.ATTACHMENT_ERROR_THROW, recipientSeparator = ';'): super().__init__(path, prefix, attachmentClass, filename, delayAttachments, overrideEncoding, attachmentErrorBehavior, recipientSeparator) - def dump(self): + def dump(self) -> None: """ Prints out a summary of the message """ @@ -33,7 +33,7 @@ def dump(self): print('Body:') print(self.body) - def getJson(self): + def getJson(self) -> str: """ Returns the JSON representation of the Message. 
""" diff --git a/extract_msg/recipient.py b/extract_msg/recipient.py index 64e41419..2ff49319 100644 --- a/extract_msg/recipient.py +++ b/extract_msg/recipient.py @@ -25,7 +25,7 @@ def __init__(self, _dir, msg): self.__type = self.__props.get('0C150003').value self.__formatted = f'{self.__name} <{self.__email}>' - def _ensureSet(self, variable, streamID, stringStream = True): + def _ensureSet(self, variable, streamID, stringStream : bool = True): """ Ensures that the variable exists, otherwise will set it using the specified stream. After that, return said variable. @@ -43,7 +43,7 @@ def _ensureSet(self, variable, streamID, stringStream = True): setattr(self, variable, value) return value - def _ensureSetNamed(self, variable, propertyName): + def _ensureSetNamed(self, variable : str, propertyName : str): """ Ensures that the variable exists, otherwise will set it using the named property. After that, return said variable. @@ -55,7 +55,7 @@ def _ensureSetNamed(self, variable, propertyName): setattr(self, variable, value) return value - def _ensureSetProperty(self, variable, propertyName): + def _ensureSetProperty(self, variable : str, propertyName : str): """ Ensures that the variable exists, otherwise will set it using the property. After that, return said variable. @@ -70,7 +70,7 @@ def _ensureSetProperty(self, variable, propertyName): setattr(self, variable, value) return value - def _ensureSetTyped(self, variable, _id): + def _ensureSetTyped(self, variable : str, _id): """ Like the other ensure set functions, but designed for when something could be multiple types (where only one will be present). This way you @@ -95,7 +95,7 @@ def _getStringStream(self, filename): """ return self.__msg._getStringStream([self.__dir, filename]) - def _getTypedData(self, id, _type = None): + def _getTypedData(self, _id, _type = None): """ Gets the data for the specified id as the type that it is supposed to be. :param id: MUST be a 4 digit hexadecimal string. 
@@ -105,15 +105,15 @@ def _getTypedData(self, id, _type = None): or VARIABLE_LENGTH_PROPS_STRING. """ verifyPropertyId(id) - id = id.upper() - found, result = self._getTypedStream('__substg1.0_' + id, _type) + _id = _id.upper() + found, result = self._getTypedStream('__substg1.0_' + _id, _type) if found: return result else: - found, result = self._getTypedProperty(id, _type) + found, result = self._getTypedProperty(_id, _type) return result if found else None - def _getTypedProperty(self, propertyID, _type = None): + def _getTypedProperty(self, propertyID : str, _type = None): """ Gets the property with the specified id as the type that it is supposed to be. :param id: MUST be a 4 digit hexadecimal string. @@ -151,19 +151,19 @@ def _getTypedStream(self, filename, _type = None): """ self.__msg._getTypedStream(self, [self.__dir, filename], True, _type) - def exists(self, filename): + def exists(self, filename) -> bool: """ Checks if stream exists inside the recipient folder. """ return self.__msg.exists([self.__dir, filename]) - def sExists(self, filename): + def sExists(self, filename) -> bool: """ Checks if the string stream exists inside the recipient folder. """ return self.__msg.sExists([self.__dir, filename]) - def existsTypedProperty(self, id, _type = None): + def existsTypedProperty(self, id, _type = None) -> bool: """ Determines if the stream with the provided id exists. The return of this function is 2 values, the first being a boolean for if anything was @@ -197,7 +197,7 @@ def entryID(self): return self.__entryID @property - def formatted(self): + def formatted(self) -> str: """ Returns the formatted recipient string. """ @@ -218,7 +218,7 @@ def name(self): return self.__name @property - def props(self): + def props(self) -> Properties: """ Returns the Properties instance of the recipient. 
""" diff --git a/extract_msg/utils.py b/extract_msg/utils.py index c92e0dac..4acad2d6 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -168,7 +168,7 @@ def getCommandArgs(args): parser.add_argument('--rtf', dest='rtf', action='store_true', help='Sets whether the output should be rtf. If this is not possible, will error.') # --allow-fallback - parser.add_argument('--allow-fallback', dest='allowFallbac', action='store_true', + parser.add_argument('--allow-fallback', dest='allowFallback', action='store_true', help='Tells the program to fallback to a different save type if the selected one is not possible.') # --out-name NAME parser.add_argument('--out-name', dest = 'out_name', From 08d514c7b226cc59458e3f62d7f4a334f66ceaae Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Wed, 19 Jan 2022 12:53:45 -0800 Subject: [PATCH 15/17] Forgot some of the changes from 0.29.3 --- extract_msg/__main__.py | 2 +- extract_msg/utils.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/extract_msg/__main__.py b/extract_msg/__main__.py index 932c99e0..daafa853 100644 --- a/extract_msg/__main__.py +++ b/extract_msg/__main__.py @@ -42,7 +42,7 @@ def main(): utils.setupLogging(args.config_path, level, args.log, args.file_logging) for x in args.msgs: try: - with Message(x[0]) as msg: + with utils.openMsg(x[0]) as msg: # Right here we should still be in the path in currentdir if args.dump_stdout: print(msg.body) diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 4acad2d6..7f713ddb 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -178,6 +178,7 @@ def getCommandArgs(args): help='An msg file to be parsed') options = parser.parse_args(args) + # Check if more than one of the following arguments has been specified if options.html + options.rtf + options.json > 1: raise IncompatibleOptionsError('Only one of these options may be selected at a time: --html, --json, --raw, --rtf') From 9a3380911cb8bdd16bd3a4be80f7a9dc5bfbfe70 Mon 
Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Wed, 19 Jan 2022 22:00:52 -0800 Subject: [PATCH 16/17] As far as my tests have gone, this version should be ready. Let's do it! --- CHANGELOG.md | 2 ++ extract_msg/__main__.py | 33 ++++++++++++++++++++++++--------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6005480a..bd1d318a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ * Started the addition of type hints to functions and methods. * Updated `utils.bytesToGuid` to make it faster and more efficient. * Renamed `utils.msgEpoch` to `utils.filetimeToUtc` to be more descriptive. +* Updated internal variable names to be more consistent. +* Improvements to the way `__main__` works. This does not affect the output it will generate, only the efficiency and readability. **v0.29.3** * [[TeamMsgExtractor #226](https://github.com/TeamMsgExtractor/msg-extractor/issues/198)] Fix typo in command parsing that prevented the usage of `allowFallback`. diff --git a/extract_msg/__main__.py b/extract_msg/__main__.py index daafa853..470c0040 100644 --- a/extract_msg/__main__.py +++ b/extract_msg/__main__.py @@ -6,19 +6,22 @@ from extract_msg import __doc__, utils from extract_msg.message import Message -def main(): + +def main() -> None: # Setup logging to stdout, indicate running from cli CLI_LOGGING = 'extract_msg_cli' - args = utils.getCommandArgs(sys.argv[1:]) level = logging.INFO if args.verbose else logging.WARNING - currentdir = os.getcwd() # Store this just in case the paths that have been given are relative + + # Determine where to save the files to. + currentDir = os.getcwd() # Store this in case the path changes.
if args.out_path: if not os.path.exists(args.out_path): os.makedirs(args.out_path) out = args.out_path else: - out = currentdir + out = currentDir + if args.dev: import extract_msg.dev extract_msg.dev.main(args, sys.argv[1:]) @@ -35,22 +38,34 @@ def main(): pprint.pprint(valResults) print(f'These results have been saved to {filename}') with open(filename, 'w') as fil: - fil.write(json.dumps(valResults)) + json.dump(valResults, fil) input('Press enter to exit...') else: if not args.dump_stdout: utils.setupLogging(args.config_path, level, args.log, args.file_logging) + + # Quickly make a dictionary for the keyword arguments. + kwargs = { + 'customPath': out, + 'customFilename': args.out_name, + 'json': args.json, + 'useMsgFilename': args.use_filename, + 'contentId': args.cid, + 'html': args.html, + 'rtf': args.rtf, + 'allowFallback': args.allowFallback, + } + for x in args.msgs: try: with utils.openMsg(x[0]) as msg: - # Right here we should still be in the path in currentdir if args.dump_stdout: print(msg.body) else: - msg.save(customPath = out, customFilename = args.out_name, json = args.json, useMsgFilename = args.use_filename, contentId = args.cid, html = args.html, rtf = args.rtf, allowFallback = args.allowFallback) + msg.save(**kwargs) except Exception as e: - print("Error with file '" + x[0] + "': " + - traceback.format_exc()) + print(f'Error with file "{x[0]}": {traceback.format_exc()}') + if __name__ == '__main__': main() From 0939ae0f4e3dd6deb319b119b84d99369bda0e9b Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Wed, 19 Jan 2022 22:02:20 -0800 Subject: [PATCH 17/17] Fix for extra character in changelog. --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd1d318a..cf0c83cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ * Updated `utils.bytesToGuid` to make it faster and more efficient. * Renamed `utils.msgEpoch` to `utils.filetimeToUtc` to be more descriptive. 
* Updated internal variable names to be more consistent. -* Improvements to the way `__main__` works. This does not affect the output it will generate, only the efficiency and readability. +* Improvements to the way `__main__` works. This does not affect the output it will generate, only the efficiency and readability. **v0.29.3** * [[TeamMsgExtractor #226](https://github.com/TeamMsgExtractor/msg-extractor/issues/198)] Fix typo in command parsing that prevented the usage of `allowFallback`.