diff --git a/CHANGELOG.md b/CHANGELOG.md index bf4de4e8..ec246fe1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +**v0.22.0** +* [[Syncurity #30](https://github.com/Syncurity/msg-extractor/issues/30)] Added `--validate` option. +* [[Syncurity #24](https://github.com/Syncurity/msg-extractor/issues/24)] Moved all dev code into its own scripts. Use `--dev` to use from the command line. +* [[mattgwwalker #67](https://github.com/Syncurity/msg-extractor/issues/67)] Added compatibility module to enforce unicode os functions. +* Added new function to `Message` class: `Message.sExists`. This function checks if a string stream exists. Its input should be formatted identically to that of `Message._getStringStream`. +* Added new function to `Message` class: `Message.fix_path`. This function will add the proper prefix to the path (if the `prefix` parameter is true) and adjust the path to be a string rather than a list or tuple. +* Added new function to `utils.py`: `get_full_class_name`. This function returns a string containing the module name and the class name of any instance of any class. It is returned in the format of `{module}.{class}`. +* Added a sort of alias of `Message._getStream`, `Message._getStringStream`, `Message.Exists`, and `Message.sExists` to `Attachment` and `Recipient`. These functions run inside the associated attachment directory or recipient directory, respectively. +* Added a fix to an issue introduced in an earlier version caused by accidentally deleting a letter in the code. + **v0.21.0** * [[Syncurity #7](https://github.com/Syncurity/msg-extractor/issues/7)] Changed debug code to use logging module. * [[Syncurity #26](https://github.com/Syncurity/msg-extractor/issues/26)] Fixed Attachment class using wrong properties file location in embedded msg files. 
diff --git a/README.rst b/README.rst index cf6c1fab..75d4d31e 100644 --- a/README.rst +++ b/README.rst @@ -152,7 +152,6 @@ Here is a list of things that are currently on our todo list: * Tests (ie. unittest) * Finish writing a usage guide * Improve the intelligence of the saving functions -* Create a Pypi package * Provide a way to save attachments and messages into a custom location under a custom name * Implement better property handling that will convert each type into a python equivalent if possible * Implement handling of named properties @@ -161,8 +160,8 @@ Here is a list of things that are currently on our todo list: .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg :target: LICENSE.txt -.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.21.0-blue.svg - :target: https://pypi.org/project/extract-msg/0.21.0/ +.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.22.0-blue.svg + :target: https://pypi.org/project/extract-msg/0.22.0/ .. |PyPI1| image:: https://img.shields.io/badge/python-2.7+-brightgreen.svg :target: https://www.python.org/downloads/release/python-2715/ .. 
|PyPI2| image:: https://img.shields.io/badge/python-3.6+-brightgreen.svg diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py index 27ac9912..edd86120 100644 --- a/extract_msg/__init__.py +++ b/extract_msg/__init__.py @@ -28,7 +28,7 @@ __author__ = 'Matthew Walker & The Elemental of Creation' __date__ = '2018-12-05' -__version__ = '0.21.0' +__version__ = '0.22.0' from extract_msg import constants from extract_msg.attachment import Attachment diff --git a/extract_msg/__main__.py b/extract_msg/__main__.py index e04bfbb4..aa041399 100644 --- a/extract_msg/__main__.py +++ b/extract_msg/__main__.py @@ -1,32 +1,51 @@ import logging -import os +import sys import traceback from extract_msg import __doc__, utils +from extract_msg.compat import os_ as os from extract_msg.message import Message if __name__ == '__main__': # Setup logging to stdout, indicate running from cli CLI_LOGGING = 'extract_msg_cli' - args = utils.get_command_args() + args = utils.get_command_args(sys.argv[1:]) level = logging.INFO if args.verbose else logging.WARNING - utils.setup_logging(args.config_path, level, args.log, args.file_logging) - currentdir = os.getcwd() # Store this just in case the paths that have been given are relative + currentdir = os.getcwdu() # Store this just in case the paths that have been given are relative if args.out_path: if not os.path.exists(args.out_path): os.makedirs(args.out_path) out = args.out_path else: out = currentdir + if args.dev: + import extract_msg.dev + extract_msg.dev.main(args, sys.argv[1:]) + elif args.validate: + import json + import pprint + import time - for x in args.msgs: - try: - with Message(x[0]) as msg: - #Right here we should still be in the path in currentdir - os.chdir(out) - msg.save(toJson = args.json, useFileName = args.use_filename, ContentId = args.cid) - except Exception as e: - print("Error with file '" + filename + "': " + - traceback.format_exc()) - os.chdir(currentdir) + from extract_msg import validation + + val_results = 
{x[0]: validation.validate(x[0]) for x in args.msgs} + filename = 'validation {}.json'.format(int(time.time())) + print('Validation Results:') + pprint.pprint(val_results) + print('These results have been saved to {}'.format(filename)) + with open(filename, 'w') as fil: + fil.write(json.dumps(val_results)) + utils.get_input('Press enter to exit...') + else: + utils.setup_logging(args.config_path, level, args.log, args.file_logging) + for x in args.msgs: + try: + with Message(x[0]) as msg: + # Right here we should still be in the path in currentdir + os.chdir(out) + msg.save(toJson = args.json, useFileName = args.use_filename, ContentId = args.cid) + except Exception as e: + print("Error with file '" + x[0] + "': " + + traceback.format_exc()) + os.chdir(currentdir) diff --git a/extract_msg/attachment.py b/extract_msg/attachment.py index 10341bf9..3983f05a 100644 --- a/extract_msg/attachment.py +++ b/extract_msg/attachment.py @@ -26,47 +26,59 @@ def __init__(self, msg, dir_): object.__init__(self) self.__msg = msg self.__dir = dir_ - self.__props = Properties( - self.msg._getStream([self.__dir, '__properties_version1.0']), + self.__props = Properties(self._getStream('__properties_version1.0'), constants.TYPE_ATTACHMENT) # Get long filename - self.__longFilename = msg._getStringStream([dir_, '__substg1.0_3707']) + self.__longFilename = self._getStringStream('__substg1.0_3707') # Get short filename - self.__shortFilename = msg._getStringStream([dir_, '__substg1.0_3704']) + self.__shortFilename = self._getStringStream('__substg1.0_3704') # Get Content-ID - self.__cid = msg._getStringStream([dir_, '__substg1.0_3712']) + self.__cid = self._getStringStream('__substg1.0_3712') # Get attachment data - if msg.Exists([dir_, '__substg1.0_37010102']): + if self.Exists('__substg1.0_37010102'): self.__type = 'data' - self.__data = msg._getStream([dir_, '__substg1.0_37010102']) - elif msg.Exists([dir_, '__substg1.0_3701000D']): - if (self.props['37050003'].value & 0x7) != 0x5: - if 
not debug: - raise NotImplementedError( - 'Current version of extract_msg does not support extraction of containers that are not embeded msg files.') - # TODO add implementation - else: - # DEBUG - logger.debug('Debugging is true, ignoring NotImplementedError and printing debug info...') - logger.debug('dir_ = {}'.format(dir_)) - logger.debug('Writing properties stream to output:') - logger.debug('--------Start-Properties-Stream--------\n' + - properHex(self.props.stream) + - '\n---------End-Properties-Stream---------') - logger.debug('Writing directory contents to output:') - logger.debug('--------Start-Directory-Content--------') - logger.debug('\n'.join([repr(x) for x in msg.listDir(True, True)])) - logger.debug('---------End-Directory-Content---------') + self.__data = self._getStream('__substg1.0_37010102') + elif self.Exists('__substg1.0_3701000D'): + if (self.__props['37050003'].value & 0x7) != 0x5: + raise NotImplementedError( + 'Current version of extract_msg does not support extraction of containers that are not embedded msg files.') + # TODO add implementation else: self.__prefix = msg.prefixList + [dir_, '__substg1.0_3701000D'] self.__type = 'msg' self.__data = msg.__class__(self.msg.path, self.__prefix, self.__class__) else: + # TODO Handling for special attacment types (like 0x00000007) raise TypeError('Unknown attachment type.') + def _getStream(self, filename): + return self.__msg._getStream([self.__dir, filename]) + + def _getStringStream(self, filename): + """ + Gets a string representation of the requested filename. + Checks for both ASCII and Unicode representations and returns + a value if possible. If there are both ASCII and Unicode + versions, then :param prefer: specifies which will be + returned. + """ + return self.__msg._getStringStream([self.__dir, filename]) + + def Exists(self, filename): + """ + Checks if stream exists inside the attachment folder. 
+ """ + return self.__msg.Exists([self.__dir, filename]) + + def sExists(self, filename): + """ + Checks if the string stream exists inside the attachment folder. + """ + return self.__msg.sExists([self.__dir, filename]) + def save(self, contentId=False, json=False, useFileName=False, raw=False, customPath=None, customFilename=None): # Check if the user has specified a custom filename filename = None diff --git a/extract_msg/compat/__init__.py b/extract_msg/compat/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/extract_msg/compat/os_.py b/extract_msg/compat/os_.py new file mode 100644 index 00000000..77a4b274 --- /dev/null +++ b/extract_msg/compat/os_.py @@ -0,0 +1,9 @@ +""" +Compatibility module to ensure that certain functions exist across python versions +""" + +from os import * +import sys + +if sys.version_info[0] >= 3: + getcwdu = getcwd diff --git a/extract_msg/constants.py b/extract_msg/constants.py index 7dd38992..d2a6aa03 100644 --- a/extract_msg/constants.py +++ b/extract_msg/constants.py @@ -87,7 +87,8 @@ NEEDS_ARG = [ '--out-name', ] -MAINDOC = "extract_msg:\n\tExtracts emails and attachments saved in Microsoft Outlook's .msg files.\n\nhttps://github.com/mattgwwalker/msg-extractor" +MAINDOC = "extract_msg:\n\tExtracts emails and attachments saved in Microsoft Outlook's .msg files.\n\n" \ + "https://github.com/mattgwwalker/msg-extractor" # Define pre-compiled structs to make unpacking slightly faster # General structs @@ -142,7 +143,7 @@ # This property information was sourced from # http://www.fileformat.info/format/outlookmsg/index.htm # on 2013-07-22. 
-# It was extened by The Elemental of Creation on 2018-10-12 +# It was extended by The Elemental of Creation on 2018-10-12 PROPERTIES = { '00010102': 'Template data', '0002000B': 'Alternate recipient allowed', diff --git a/extract_msg/dev.py b/extract_msg/dev.py new file mode 100644 index 00000000..c5cf52b8 --- /dev/null +++ b/extract_msg/dev.py @@ -0,0 +1,67 @@ +""" +Module for collecting data to be sent to the developer. +""" + +# NOTE: Order of tasks: +# 1. Check for exceptions: +# * Check the entire process for exceptions raised by a specific file and log them. If none occur, +# log something like "No exceptions were detected." +# 2. Run the file through the developer versions of the classes + + +import logging + +from extract_msg import dev_classes +from extract_msg import utils +from extract_msg.compat import os_ as os +from extract_msg.message import Message + + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) + + +def setup_dev_logger(default_path=None, logfile = None, env_key='EXTRACT_MSG_LOG_CFG'): + utils.setup_logging(default_path, 5, logfile, True, env_key) + + +def main(args, argv): + """ + Please only run this from the command line. Attempting to use this + otherwise is likely to fail. :param args: is the class instance + returned by `extract_msg.utils.get_command_args`. :param argv: is + the list of arguments that were the input to the aforementioned + function. 
+ """ + setup_dev_logger(args.config_path, args.log) + currentdir = os.getcwdu() # Store this just in case the paths that have been given are relative + if args.out_path: + if not os.path.exists(args.out_path): + os.makedirs(args.out_path) + out = args.out_path + else: + out = currentdir + logger.log(5, 'ARGV: {}'.format(argv)) + for y, x in enumerate(args.msgs): + logger.log(5, '---- RUNNING DEVELOPER MODE ON FILE {} ----'.format(x[0])) + logger.log(5, 'EXCEPTION CHECK:') + try: + with Message(x[0]) as msg: + # Right here we should still be in the path in currentdir + os.chdir(out) + msg.save(toJson = args.json, useFileName = args.use_filename, ContentId = args.cid) + except Exception as e: + logger.exception(e) + else: + logger.log(5, 'No exceptions raised.') + logger.log(5, 'DEVELOPER CLASS OUTPUT:') + os.chdir(currentdir) + dev_classes.Message(x[0]) + logger.log(5, '---- END OF DEVELOPER LOG ----') + logpath = None; + for x in logging.root.handlers: + try: + logpath = x.baseFilename + except AttributeError: + pass; + print('Logging complete. Log has been saved to {}'.format(logpath)) diff --git a/extract_msg/dev_classes/__init__.py b/extract_msg/dev_classes/__init__.py new file mode 100644 index 00000000..e1e11c00 --- /dev/null +++ b/extract_msg/dev_classes/__init__.py @@ -0,0 +1,2 @@ +from extract_msg.dev_classes.attachment import Attachment +from extract_msg.dev_classes.message import Message diff --git a/extract_msg/dev_classes/attachment.py b/extract_msg/dev_classes/attachment.py new file mode 100644 index 00000000..1ee5005c --- /dev/null +++ b/extract_msg/dev_classes/attachment.py @@ -0,0 +1,84 @@ +import logging + +from extract_msg import constants +from extract_msg.properties import Properties +from extract_msg.utils import properHex + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) + + +class Attachment(object): + """ + Developer version of the `extract_msg.attachment.Attachment` class. 
+ """ + def __init__(self, msg, dir_): + """ + :param msg: the Message instance that the attachment belongs to. + :param dir_: the directory inside the msg file where the attachment is located. + """ + object.__init__(self) + self.__msg = msg + self.__dir = dir_ + self.__props = Properties( + msg._getStream([self.__dir, '__properties_version1.0']), + constants.TYPE_ATTACHMENT) + + # Get attachment data + if msg.Exists([dir_, '__substg1.0_37010102']): + self.__type = 'data' + self.__data = msg._getStream([dir_, '__substg1.0_37010102']) + elif msg.Exists([dir_, '__substg1.0_3701000D']): + if (self.__props['37050003'].value & 0x7) != 0x5: + logger.log(5, 'Printing details of NotImplementedError...') + logger.log(5, 'dir_ = {}'.format(dir_)) + logger.log(5, 'Writing properties stream to output:') + logger.log(5, '--------Start-Properties-Stream--------\n' + + properHex(self.__props.stream) + + '\n---------End-Properties-Stream---------') + logger.log(5, 'Writing directory contents to output:') + logger.log(5, '--------Start-Directory-Content--------\n' + + '\n'.join([repr(x) for x in msg.listDir(True, True)])) + logger.log(5, '---------End-Directory-Content---------') + logger.log(5, 'End of NotImplementedError details') + else: + self.__prefix = msg.prefixList + [dir_, '__substg1.0_3701000D'] + self.__type = 'msg' + self.__data = msg.__class__(msg.path, self.__prefix) + else: + raise TypeError('Unknown attachment type.') + + @property + def data(self): + """ + Returns the attachment data. + """ + return self.__data + + @property + def dir(self): + """ + Returns the directory inside the msg file where the attachment is located. + """ + return self.__dir + + @property + def msg(self): + """ + Returns the Message instance the attachment belongs to. + """ + return self.__msg + + @property + def props(self): + """ + Returns the Properties instance of the attachment. + """ + return self.__props + + @property + def type(self): + """ + Returns the type of the data. 
+ """ + return self.__type diff --git a/extract_msg/dev_classes/message.py b/extract_msg/dev_classes/message.py new file mode 100644 index 00000000..3b201f94 --- /dev/null +++ b/extract_msg/dev_classes/message.py @@ -0,0 +1,234 @@ +import copy +import logging +import olefile + +from extract_msg import constants +from extract_msg.dev_classes.attachment import Attachment +from extract_msg.properties import Properties +from extract_msg.recipient import Recipient +from extract_msg.utils import encode, has_len, stri, windowsUnicode + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) + + +class Message(olefile.OleFileIO): + """ + Developer version of the `extract_msg.message.Message` class. + """ + + def __init__(self, path, prefix=''): + """ + :param path: path to the msg file in the system or is the raw msg file. + :param prefix: used for extracting embedded msg files + inside the main one. Do not set manually unless + you know what you are doing. + """ + logger.log(5, 'prefix: {}'.format(prefix)) + self.__path = path + olefile.OleFileIO.__init__(self, path) + prefixl = [] + if prefix != '': + if not isinstance(prefix, stri): + try: + prefix = '/'.join(prefix) + except: + raise TypeError('Invalid prefix type: ' + str(type(prefix)) + + '\n(This was probably caused by you setting it manually).') + prefix = prefix.replace('\\', '/') + g = prefix.split("/") + if g[-1] == '': + g.pop() + prefixl = g + if prefix[-1] != '/': + prefix += '/' + filename = self._getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix=False) + self.__prefix = prefix + self.__prefixList = prefixl + + logger.log(5, ':param path: has __len__ attribute?: {}'.format(has_len(path))) + if has_len(path): + if len(path) < 1536: + self.filename = path + logger.log(5, ':param path: length is {}; Using :param path: as file path'.format(len(path))) + else: + logger.log(5, ':param path: length is {}; Using :param path: as raw msg stream'.format(len(path))) + self.filename = 
None + + self.mainProperties + recipientDirs = [] + + for dir_ in self.listDir(): + if dir_[len(self.__prefixList)].startswith('__recip') and\ + dir_[len(self.__prefixList)] not in recipientDirs: + recipientDirs.append(dir_[len(self.__prefixList)]) + + self.recipients + self.attachments + self.date + + def listDir(self, streams=True, storages=False): + """ + Replacement for OleFileIO.listdir that runs at the current prefix directory. + """ + temp = self.listdir(streams, storages) + if self.__prefix == '': + return temp + prefix = self.__prefix.split('/') + if prefix[-1] == '': + prefix.pop() + out = [] + for x in temp: + good = True + if len(x) <= len(prefix): + good = False + if good: + for y in range(len(prefix)): + if x[y] != prefix[y]: + good = False + if good: + out.append(x) + return out + + def Exists(self, inp): + """ + Checks if :param inp: exists in the msg file. + """ + if isinstance(inp, list): + inp = self.__prefixList + inp + else: + inp = self.__prefix + inp + return self.exists(inp) + + def _getStream(self, filename, prefix=True): + if isinstance(filename, list): + filename = '/'.join(filename) + if prefix: + filename = self.__prefix + filename + if self.exists(filename): + stream = self.openstream(filename) + return stream.read() + else: + logger.info('Stream "{}" was requested but could not be found. Returning `None`.'.format(filename)) + return None + + def _getStringStream(self, filename, prefer='unicode', prefix=True): + """ + Gets a string representation of the requested filename. + Checks for both ASCII and Unicode representations and returns + a value if possible. If there are both ASCII and Unicode + versions, then :param prefer: specifies which will be + returned. 
+ """ + + if isinstance(filename, list): + # Join with slashes to make it easier to append the type + filename = '/'.join(filename) + + asciiVersion = self._getStream(filename + '001E', prefix) + unicodeVersion = windowsUnicode(self._getStream(filename + '001F', prefix)) + logger.log(5, '_getStringStream called for {}. Ascii version found: {}. Unicode version found: {}.'.format( + filename, asciiVersion is not None, unicodeVersion is not None)) + if asciiVersion is None: + return unicodeVersion + elif unicodeVersion is None: + return asciiVersion + else: + if prefer == 'unicode': + return unicodeVersion + else: + return asciiVersion + + @property + def path(self): + """ + Returns the message path if generated from a file, + otherwise returns the data used to generate the + Message instance. + """ + return self.__path + + @property + def prefix(self): + """ + Returns the prefix of the Message instance. + Intended for developer use. + """ + return self.__prefix + + @property + def prefixList(self): + """ + Returns the prefix list of the Message instance. + Intended for developer use. + """ + return copy.deepcopy(self.__prefixList) + + @property + def mainProperties(self): + """ + Returns the Properties instance used by the Message instance. + """ + try: + return self._prop + except AttributeError: + self._prop = Properties(self._getStream('__properties_version1.0'), + constants.TYPE_MESSAGE if self.__prefix == '' else constants.TYPE_MESSAGE_EMBED) + return self._prop + + @property + def date(self): + """ + Returns the send date, if it exists. + """ + try: + return self._date + except AttributeError: + self._date = self._prop.date + return self._date + + @property + def attachments(self): + """ + Returns a list of all attachments. 
+ """ + try: + return self._attachments + except AttributeError: + # Get the attachments + attachmentDirs = [] + + for dir_ in self.listDir(): + if dir_[len(self.__prefixList)].startswith('__attach') and\ + dir_[len(self.__prefixList)] not in attachmentDirs: + attachmentDirs.append(dir_[len(self.__prefixList)]) + + self._attachments = [] + + for attachmentDir in attachmentDirs: + self._attachments.append(Attachment(self, attachmentDir)) + + return self._attachments + + @property + def recipients(self): + """ + Returns a list of all recipients. + """ + try: + return self._recipients + except AttributeError: + # Get the recipients + recipientDirs = [] + + for dir_ in self.listDir(): + if dir_[len(self.__prefixList)].startswith('__recip') and\ + dir_[len(self.__prefixList)] not in recipientDirs: + recipientDirs.append(dir_[len(self.__prefixList)]) + + self._recipients = [] + + for recipientDir in recipientDirs: + self._recipients.append(Recipient(recipientDir, self)) + + return self._recipients diff --git a/extract_msg/logging-config/logging-nt.json b/extract_msg/logging-config/logging-nt.json index 26bc7a9a..86173d9b 100644 --- a/extract_msg/logging-config/logging-nt.json +++ b/extract_msg/logging-config/logging-nt.json @@ -39,6 +39,15 @@ "maxBytes": 10485760, "backupCount": 20, "encoding": "utf8" + }, + "developer_file_handler": { + "class": "logging.handlers.RotatingFileHandler", + "level": "DEVELOPER", + "formatter": "simple", + "filename": "%LOCALAPPDATA%/extract_msg/extract_msg.log", + "maxBytes": 10485760, + "backupCount": 20, + "encoding": "utf8" } }, "loggers": { @@ -49,7 +58,7 @@ } }, "root": { - "level": "DEBUG", - "handlers": ["console", "info_file_handler", "error_file_handler", "warning_file_handler"] + "level": "DEVELOPER", + "handlers": ["console", "info_file_handler", "error_file_handler", "warning_file_handler", "developer_file_handler"] } } diff --git a/extract_msg/logging-config/logging-posix.json b/extract_msg/logging-config/logging-posix.json 
index 29a8b3cb..3de29d06 100644 --- a/extract_msg/logging-config/logging-posix.json +++ b/extract_msg/logging-config/logging-posix.json @@ -39,6 +39,15 @@ "maxBytes": 10485760, "backupCount": 20, "encoding": "utf8" + }, + "developer_file_handler": { + "class": "logging.handlers.RotatingFileHandler", + "level": "DEVELOPER", + "formatter": "simple", + "filename": "/var/log/extract_msg/extract_msg.log", + "maxBytes": 10485760, + "backupCount": 20, + "encoding": "utf8" } }, "loggers": { @@ -49,7 +58,7 @@ } }, "root": { - "level": "DEBUG", - "handlers": ["console", "info_file_handler", "error_file_handler", "warning_file_handler"] + "level": "DEVELOPER", + "handlers": ["console", "info_file_handler", "error_file_handler", "warning_file_handler", "developer_file_handler"] } } diff --git a/extract_msg/message.py b/extract_msg/message.py index e4d216a0..540f3698 100644 --- a/extract_msg/message.py +++ b/extract_msg/message.py @@ -2,7 +2,6 @@ import email.utils import json import logging -import os import re from imapclient.imapclient import decode_utf7 @@ -11,6 +10,7 @@ from email.parser import Parser as EmailParser from extract_msg import constants from extract_msg.attachment import Attachment +from extract_msg.compat import os_ as os from extract_msg.properties import Properties from extract_msg.recipient import Recipient from extract_msg.utils import addNumToDir, encode, has_len, stri, windowsUnicode, xstr @@ -62,7 +62,7 @@ def __init__(self, path, prefix='', attachmentClass=Attachment, filename=None): g = prefix.split("/") if g[-1] == '': g.pop() - prefix = g + prefixl = g if prefix[-1] != '/': prefix += '/' filename = self._getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix=False) @@ -119,20 +119,33 @@ def Exists(self, inp): """ Checks if :param inp: exists in the msg file. 
""" - if isinstance(inp, list): - inp = self.__prefixList + inp - else: - inp = self.__prefix + inp + inp = self.fix_path(inp) return self.exists(inp) - def _getStream(self, filename, prefix=True): - if isinstance(filename, list): - filename = '/'.join(filename) + def sExists(self, inp): + """ + Checks if string stream :param inp: exists in the msg file. + """ + inp = self.fix_path(inp) + return self.exists(inp + '001F') or self.exists(inp + '001E') + + def fix_path(self, inp, prefix=True): + """ + Changes paths so that they have the proper + prefix (should :param prefix: be True) and + are strings rather than lists or tuples. + """ + if isinstance(inp, (list, tuple)): + inp = '/'.join(inp) if prefix: - filename = self.__prefix + filename + inp = self.__prefix + inp + return inp + + def _getStream(self, filename, prefix=True): + filename = self.fix_path(filename, prefix) if self.exists(filename): - stream = self.openstream(filename) - return stream.read() + with self.openstream(filename) as stream: + return stream.read() else: logger.info('Stream "{}" was requested but could not be found. Returning `None`.'.format(filename)) return None @@ -146,13 +159,11 @@ def _getStringStream(self, filename, prefer='unicode', prefix=True): returned. """ - if isinstance(filename, list): - # Join with slashes to make it easier to append the type - filename = '/'.join(filename) + filename = self.fix_path(filename, prefix) - asciiVersion = self._getStream(filename + '001E', prefix) - unicodeVersion = windowsUnicode(self._getStream(filename + '001F', prefix)) - logger.debug('_getStringSteam called for {}. Ascii version found: {}. Unicode version found: {}.'.format( + asciiVersion = self._getStream(filename + '001E', prefix = False) + unicodeVersion = windowsUnicode(self._getStream(filename + '001F', prefix = False)) + logger.debug('_getStringStream called for {}. Ascii version found: {}. 
Unicode version found: {}.'.format( filename, asciiVersion is not None, unicodeVersion is not None)) if asciiVersion is None: return unicodeVersion @@ -421,12 +432,13 @@ def body(self): try: return self._body except AttributeError: - self._body = encode(self._getStringStream('__substg1.0_1000')) + self._body = self._getStringStream('__substg1.0_1000') if self._body: + self._body = encode(self._body) a = re.search('\n', self._body) - if a is not None: - if re.search('\r\n', self._body) is not None: - self.__crlf = '\r\n' + if a is not None: + if re.search('\r\n', self._body) is not None: + self.__crlf = '\r\n' return self._body @property @@ -456,8 +468,8 @@ def attachments(self): attachmentDirs = [] for dir_ in self.listDir(): - if dir_[len(self.__prefixList)].startswith('__attach') and dir_[ - len(self.__prefixList)] not in attachmentDirs: + if dir_[len(self.__prefixList)].startswith('__attach') and\ + dir_[len(self.__prefixList)] not in attachmentDirs: attachmentDirs.append(dir_[len(self.__prefixList)]) self._attachments = [] @@ -479,8 +491,8 @@ def recipients(self): recipientDirs = [] for dir_ in self.listDir(): - if dir_[len(self.__prefixList)].startswith('__recip') and dir_[ - len(self.__prefixList)] not in recipientDirs: + if dir_[len(self.__prefixList)].startswith('__recip') and\ + dir_[len(self.__prefixList)] not in recipientDirs: recipientDirs.append(dir_[len(self.__prefixList)]) self._recipients = [] @@ -544,7 +556,7 @@ def save(self, toJson=False, useFileName=False, raw=False, ContentId=False, cust dirName ) - oldDir = os.getcwd() + oldDir = os.getcwdu() try: os.chdir(dirName) @@ -590,12 +602,12 @@ def save(self, toJson=False, useFileName=False, raw=False, ContentId=False, cust def saveRaw(self): # Create a 'raw' folder - oldDir = os.getcwd() + oldDir = os.getcwdu() try: rawDir = 'raw' os.makedirs(rawDir) os.chdir(rawDir) - sysRawDir = os.getcwd() + sysRawDir = os.getcwdu() # Loop through all the directories for dir_ in self.listdir(): diff --git 
a/extract_msg/named.py b/extract_msg/named.py deleted file mode 100644 index 3ac9d40b..00000000 --- a/extract_msg/named.py +++ /dev/null @@ -1,54 +0,0 @@ -import logging -import struct - -from extract_msg import constants -from extract_msg.utils import divide # , round_up - -logger = logging.getLogger(__name__) -logger.addHandler(logging.NullHandler()) - - -# TODO move this function to utils.py: -def round_up(inp, mult): - """ - Rounds :param inp: up to the nearest multiple of :param mult:. - """ - return inp + (mult - inp) % mult - - -# Temporary class code to make references like `constants.CONSTANT` work: -class constants(object): - # Structs used by named.py - STNP_NAM = struct.Struct('> 1, - 'pkind': tmp[1] & 1, - }) - names = [] - pos = 0 - while pos < nl: - l = constants.STNP_NAM.unpack(names_stream[pos:pos + 4])[0] - pos += 4 - names.append(names_stream[pos:pos + l].decode('utf_16_le')) - pos += round_up(l, 4) diff --git a/extract_msg/recipient.py b/extract_msg/recipient.py index 1a0c7f8e..db584f08 100644 --- a/extract_msg/recipient.py +++ b/extract_msg/recipient.py @@ -17,14 +17,39 @@ def __init__(self, _dir, msg): object.__init__(self) self.__msg = msg # Allows calls to original msg file self.__dir = _dir - self.__props = Properties(msg._getStream(self.__dir + '/__properties_version1.0'), constants.TYPE_RECIPIENT) - self.__email = msg._getStringStream(self.__dir + '/__substg1.0_39FE') + self.__props = Properties(self._getStream('__properties_version1.0'), constants.TYPE_RECIPIENT) + self.__email = self._getStringStream('__substg1.0_39FE') if not self.__email: - self.__email = msg._getStringStream(self.__dir + '/__substg1.0_3003') - self.__name = msg._getStringStream(self.__dir + '/__substg1.0_3001') + self.__email = self._getStringStream('__substg1.0_3003') + self.__name = self._getStringStream('__substg1.0_3001') self.__type = self.__props.get('0C150003').value self.__formatted = u'{0} <{1}>'.format(self.__name, self.__email) + def _getStream(self, 
filename): + return self.__msg._getStream([self.__dir, filename]) + + def _getStringStream(self, filename): + """ + Gets a string representation of the requested filename. + Checks for both ASCII and Unicode representations and returns + a value if possible. If there are both ASCII and Unicode + versions, then :param prefer: specifies which will be + returned. + """ + return self.__msg._getStringStream([self.__dir, filename]) + + def Exists(self, filename): + """ + Checks if stream exists inside the recipient folder. + """ + return self.__msg.Exists([self.__dir, filename]) + + def sExists(self, filename): + """ + Checks if the string stream exists inside the recipient folder. + """ + return self.__msg.sExists([self.__dir, filename]) + @property def email(self): """ diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 53963ab0..c03abe6e 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -4,22 +4,25 @@ import argparse import datetime +import json import logging import logging.config -import json -import os import sys import tzlocal from extract_msg import constants +from extract_msg.compat import os_ as os logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) +logging.addLevelName(5, 'DEVELOPER') if sys.version_info[0] >= 3: # Python 3 stri = (str,) + get_input = input + def encode(inp): return inp @@ -53,9 +56,11 @@ def xstr(s): else: # Python 2 stri = (str, unicode) + get_input = raw_input + def encode(inp): - return inp.encode('utf-8') + return inp.encode('utf-8') if inp is not None else None def properHex(inp): @@ -122,53 +127,58 @@ def fromTimeStamp(stamp): return datetime.datetime.fromtimestamp(stamp, tzlocal.get_localzone()) -def get_command_args(): +def get_command_args(args): """ Parse command-line arguments """ - parser = argparse.ArgumentParser(description = constants.MAINDOC, prog = 'extract_msg') + parser = argparse.ArgumentParser(description=constants.MAINDOC, prog='extract_msg') # --use-content-id, --cid - 
parser.add_argument('--use-content-id', '--cid', dest = 'cid', action = 'store_true', - help = 'Save attachments by their Content ID, if they have one. Useful when working with the HTML body.') + parser.add_argument('--use-content-id', '--cid', dest='cid', action='store_true', + help='Save attachments by their Content ID, if they have one. Useful when working with the HTML body.') # --dev - parser.add_argument('--dev', dest = 'dev', action = 'store_true', - help = 'Changes to use developer mode. Automatically enables the --verbose flag.') + parser.add_argument('--dev', dest='dev', action='store_true', + help='Changes to use developer mode. Automatically enables the --verbose flag. Takes precedence over the --validate flag.') + # --validate + parser.add_argument('--validate', dest='validate', action='store_true', + help='Turns on file validation mode. Turns off regular file output.') # --json - parser.add_argument('--json', dest = 'json', action = 'store_true', - help = 'Changes to write output files as json.') + parser.add_argument('--json', dest='json', action='store_true', + help='Changes to write output files as json.') # --file-logging - parser.add_argument('--file-logging', dest = 'file_logging', action = 'store_true', - help = 'Enables file logging.') + parser.add_argument('--file-logging', dest='file_logging', action='store_true', + help='Enables file logging.') # --verbose - parser.add_argument('--verbose', dest = 'verbose', action = 'store_true', - help = 'Turns on console logging. Implies --verbose') + parser.add_argument('--verbose', dest='verbose', action='store_true', + help='Turns on console logging. 
Implies --verbose') # --log PATH - parser.add_argument('--log', dest = 'log', - help = 'Set the path to write the file log to.') + parser.add_argument('--log', dest='log', + help='Set the path to write the file log to.') # --config PATH - parser.add_argument('--config', dest = 'config_path', - help = 'Set the path to load the logging config from.') + parser.add_argument('--config', dest='config_path', + help='Set the path to load the logging config from.') # --out PATH - parser.add_argument('--out', dest = 'out_path', - help = 'Set the folder to use for the program output. (Default: Current directory)') + parser.add_argument('--out', dest='out_path', + help='Set the folder to use for the program output. (Default: Current directory)') # --use-filename - parser.add_argument('--use-filename', dest = 'use_filename', action = 'store_true', - help = 'Sets whether the name of each output is based on the msg filename.') + parser.add_argument('--use-filename', dest='use_filename', action='store_true', + help='Sets whether the name of each output is based on the msg filename.') # --out-name NAME # parser.add_argument('--out-name', dest = 'out_name', # help = 'Name to be used with saving the file output. Should come immediately after the file name') # [msg files] - parser.add_argument('msgs', metavar = 'msg', nargs = '+', - help = 'An msg file to be parsed') + parser.add_argument('msgs', metavar='msg', nargs='+', + help='An msg file to be parsed') - options = parser.parse_args() + options = parser.parse_args(args) if options.dev or options.file_logging: options.verbose = True file_args = options.msgs - file_tables = [] # This is where we will store the separated files and their arguments - temp_table = [] # temp_table will store each table while it is still being built. - need_arg = True # This tells us if the last argument was something like --out-name which requires a string name after it. 
We start on true to make it so that we use don't have to have something checking if we are on the first table. - for x in file_args: # Iterate through each + file_tables = [] # This is where we will store the separated files and their arguments + temp_table = [] # temp_table will store each table while it is still being built. + need_arg = True # This tells us if the last argument was something like + # --out-name which requires a string name after it. + # We start on true to make it so that we use don't have to have something checking if we are on the first table. + for x in file_args: # Iterate through each if need_arg: temp_table.append(x) need_arg = False @@ -184,6 +194,7 @@ def get_command_args(): options.msgs = file_tables return options + def has_len(obj): """ Checks if :param obj: has a __len__ attribute. @@ -275,13 +286,16 @@ def parse_type(_type, stream): pass return value + def getContFileDir(_file_): """ Takes in the path to a file and tries to return the containing folder. """ return '/'.join(_file_.replace('\\', '/').split('/')[:-1]) -def setup_logging(default_path=None, default_level=logging.WARN, logfile = None, enable_file_logging = False, env_key='EXTRACT_MSG_LOG_CFG'): + +def setup_logging(default_path=None, default_level=logging.WARN, logfile=None, enable_file_logging=False, + env_key='EXTRACT_MSG_LOG_CFG'): """ Setup logging configuration @@ -330,8 +344,8 @@ def setup_logging(default_path=None, default_level=logging.WARN, logfile = None, 'of the following file-paths:') print(str(paths[1:])) logging.basicConfig(level=default_level) - logger.warning('The extract_msg logging configuration was not found - using a basic configuration.' - 'Please check the extract_msg installation directory for "logging-{}.json".'.format(os.name)) + logging.warning('The extract_msg logging configuration was not found - using a basic configuration.' 
+ 'Please check the extract_msg installation directory for "logging-{}.json".'.format(os.name)) return False with open(path, 'rt') as f: @@ -340,7 +354,8 @@ def setup_logging(default_path=None, default_level=logging.WARN, logfile = None, for x in config['handlers']: if 'filename' in config['handlers'][x]: if enable_file_logging: - config['handlers'][x]['filename'] = tmp = os.path.expanduser(os.path.expandvars(logfile if logfile else config['handlers'][x]['filename'])) + config['handlers'][x]['filename'] = tmp = os.path.expanduser( + os.path.expandvars(logfile if logfile else config['handlers'][x]['filename'])) tmp = getContFileDir(tmp) if not os.path.exists(tmp): os.makedirs(tmp) @@ -355,3 +370,7 @@ def setup_logging(default_path=None, default_level=logging.WARN, logfile = None, logging.getLogger().setLevel(default_level) return True + + +def get_full_class_name(inp): + return inp.__class__.__module__ + '.' + inp.__class__.__name__ diff --git a/extract_msg/validation.py b/extract_msg/validation.py new file mode 100644 index 00000000..6f9df46c --- /dev/null +++ b/extract_msg/validation.py @@ -0,0 +1,100 @@ +import copy + +import olefile + +from extract_msg.message import Message +from extract_msg.utils import get_full_class_name, has_len + + +def get_string_details(instance, stream): + return { + 'exists': instance.sExists(stream), + 'not empty': False if not instance.sExists(stream) else len(instance._getStringStream(stream)) > 0, + } + + +def get_stream_details(instance, stream): + return { + 'exists': instance.Exists(stream), + 'not empty': False if not instance.Exists(stream) else len(instance._getStream(stream)) > 0, + } + + +def get_email_details(instance, stream): + return { + 'exists': instance.sExists(stream), + 'not empty': False if not instance.sExists(stream) else len(instance._getStringStream(stream)) > 0, + 'valid email address': False if not instance.sExists(stream) else u'@' in instance._getStringStream(stream), + } + + +def string_FE(instance): + 
temp = '001E' + if instance.mainProperties.has_key('340D0003'): + temp = '001F' if instance.mainProperties['340D0003'].value & 0x40000 else '001E' + tempnot = '001F' if temp == '001E' else '001E' + confirmation = all(not x[-1].endswith(tempnot) for x in instance.listDir()) + if confirmation: + temp += ', but ' + tempnot + ' was detected.' + return temp + + +def validate_msg(instance): + return { + '001F/001E': string_FE(instance), + 'header': get_string_details(instance, '__substg1.0_007D'), + 'body': get_string_details(instance, '__substg1.0_1000'), + 'html body': get_stream_details(instance, '__substg1.0_10130102'), + 'rtf body': get_stream_details(instance, '__substg1.0_10090102'), + 'date': instance.date, + 'attachments': {x: validate_attachment(y) for x, y in enumerate(instance.attachments)}, + 'recipients': {x: validate_recipient(y) for x, y in enumerate(instance.recipients)}, + } + + +def validate_attachment(instance): + temp = { + 'long filename': get_string_details(instance, '__substg1.0_3707'), + 'short filename': get_string_details(instance, '__substg1.0_3704'), + 'content id': get_string_details(instance, '__substg1.0_3712'), + 'type': instance.type, + } + if temp['type'] == 'msg': + temp['msg'] = validate_msg(instance.data) + return temp + + +def validate_recipient(instance): + return { + 'type': instance.type, + 'stream 3003': get_email_details(instance, '__substg1.0_3003'), + 'stream 39FE': get_email_details(instance, '__substg1.0_39FE'), + } + + +def validate(msg): + validation_dict = { + 'input': { + 'class': get_full_class_name(msg), # Get the full name of the class + 'has_len': has_len(msg), # Does the input have a __len__ attribute? 
+ 'len': len(msg) if has_len(msg) else None, # If input has __len__, put the value here + }, + 'olefile': { + 'valid': olefile.isOleFile(msg), + }, + } + if validation_dict['olefile']['valid']: + validation_dict['message'] = { + 'initializes': False, + } + try: + msg_instance = Message(msg) + except NotImplementedError: + # Should we have a special procedure for handling it if we get "not implemented"? + pass + except: + pass + else: + validation_dict['message']['initializes'] = True + validation_dict['message']['msg'] = validate_msg(msg_instance) + return validation_dict