diff --git a/CHANGELOG.md b/CHANGELOG.md index 27091c84..db7d290f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,11 @@ +**v0.23.4** +* [[mattgwwalker #112](https://github.com/mattgwwalker/msg-extractor/issues/112)] Changed method used to get the message from an exception to make it compatible with Python 2 and 3 +* [[Syncurity #51](https://github.com/Syncurity/msg-extractor/issues/51)] General cleanup and all around improvements of the code. **v0.23.3** * Fixed issues in readme. * [[Syncurity #50](https://github.com/Syncurity/msg-extractor/issues/50)] Updated `dev_classes.Message` to better match the current `Message` class. * Fixed bad links in changelog. -* [[mattgwwalker #95](https://github.com/mattgwwalker/msg-extractor/issues/95)] Added falback encoding as well as manual encoding change to `dev_classes.Message`. +* [[mattgwwalker #95](https://github.com/mattgwwalker/msg-extractor/issues/95)] Added fallback encoding as well as manual encoding change to `dev_classes.Message`. **v0.23.1** * Fixed issue with embedded msg files caused by the changes in v0.23.0. diff --git a/README.rst b/README.rst index 0f17142e..6d481952 100644 --- a/README.rst +++ b/README.rst @@ -180,8 +180,8 @@ Joel Kaufman - First implementations of the json and filename flags .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg :target: LICENSE.txt -.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.23.3-blue.svg - :target: https://pypi.org/project/extract-msg/0.23.3/ +.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.23.4-blue.svg + :target: https://pypi.org/project/extract-msg/0.23.4/ .. |PyPI1| image:: https://img.shields.io/badge/python-2.7+-brightgreen.svg :target: https://www.python.org/downloads/release/python-2715/ diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py index 76ec11a0..6a072df6 100644 --- a/extract_msg/__init__.py +++ b/extract_msg/__init__.py @@ -27,8 +27,8 @@ # along with this program. If not, see . __author__ = 'Matthew Walker & The Elemental of Creation' -__date__ = '2019-04-20' -__version__ = '0.23.3' +__date__ = '2020-04-27' +__version__ = '0.23.4' from extract_msg import constants from extract_msg.attachment import Attachment diff --git a/extract_msg/attachment.py b/extract_msg/attachment.py index 3983f05a..069c3a33 100644 --- a/extract_msg/attachment.py +++ b/extract_msg/attachment.py @@ -28,14 +28,6 @@ def __init__(self, msg, dir_): self.__dir = dir_ self.__props = Properties(self._getStream('__properties_version1.0'), constants.TYPE_ATTACHMENT) - # Get long filename - self.__longFilename = self._getStringStream('__substg1.0_3707') - - # Get short filename - self.__shortFilename = self._getStringStream('__substg1.0_3704') - - # Get Content-ID - self.__cid = self._getStringStream('__substg1.0_3712') # Get attachment data if self.Exists('__substg1.0_37010102'): @@ -67,6 +59,23 @@ def _getStringStream(self, filename): """ return self.__msg._getStringStream([self.__dir, filename]) + def _ensureSet(self, variable, streamID, stingstream = True): + """ + Ensures that the variable exists, otherwise will set it using the specified stream. + After that, return said variable. + + If the specified stream is not a string stream, make sure to set :param string stream: to False. + """ + try: + return getattr(self, variable) + except AttributeError: + if stringStream: + value = self._getStringStream(streamID) + else: + value = self._getStream(streamID) + setattr(self, variable, value) + return value + def Exists(self, filename): """ Checks if stream exists inside the attachment folder. @@ -88,13 +97,13 @@ def save(self, contentId=False, json=False, useFileName=False, raw=False, custom # If not... # Check if user wants to save the file under the Content-id if contentId: - filename = self.__cid + filename = self.cid # If filename is None at this point, use long filename as first preference if filename is None: - filename = self.__longFilename + filename = self.longFilename # Otherwise use the short filename if filename is None: - filename = self.__shortFilename + filename = self.shortFilename # Otherwise just make something up! if filename is None: filename = 'UnknownFilename ' + \ @@ -124,9 +133,9 @@ def saveEmbededMessage(self, contentId=False, json=False, useFileName=False, raw @property def cid(self): """ - Returns the content ID of the attachment, if it exists. + Returns the Content ID of the attachment, if it exists. """ - return self.__cid + return self._ensureSet('_cid', '__substg1.0_3712') contend_id = cid @@ -149,7 +158,7 @@ def longFilename(self): """ Returns the long file name of the attachment, if it exists. """ - return self.__longFilename + return self._ensureSet('_longFilename', '__substg1.0_3707') @property def msg(self): @@ -170,11 +179,11 @@ def shortFilename(self): """ Returns the short file name of the attachment, if it exists. """ - return self.__shortFilename + return self._ensureSet('_shortFilename', '__substg1.0_3704') @property def type(self): """ - Returns the type of the data. + Returns the (internally used) type of the data. """ return self.__type diff --git a/extract_msg/constants.py b/extract_msg/constants.py index d2a6aa03..2b47dc78 100644 --- a/extract_msg/constants.py +++ b/extract_msg/constants.py @@ -359,13 +359,6 @@ # END CONSTANTS -def int_to_recipient_type(integer): - """ - Returns the name of the recipient type constant that has the value of :param integer: - """ - return RECIPIENT_DICT[integer] - - def int_to_data_type(integer): """ Returns the name of the data type constant that has the value of :param integer: @@ -378,3 +371,9 @@ def int_to_intelligence(integer): Returns the name of the intelligence level constant that has the value of :param integer: """ return INTELLIGENCE_DICT[integer] + +def int_to_recipient_type(integer): + """ + Returns the name of the recipient type constant that has the value of :param integer: + """ + return RECIPIENT_DICT[integer] diff --git a/extract_msg/dev_classes/message.py b/extract_msg/dev_classes/message.py index 541f84c5..ffc7f3a2 100644 --- a/extract_msg/dev_classes/message.py +++ b/extract_msg/dev_classes/message.py @@ -75,55 +75,6 @@ def __init__(self, path, prefix='', filename=None): self.attachments self.date - def listDir(self, streams=True, storages=False): - """ - Replacement for OleFileIO.listdir that runs at the current prefix directory. - """ - temp = self.listdir(streams, storages) - if self.__prefix == '': - return temp - prefix = self.__prefix.split('/') - if prefix[-1] == '': - prefix.pop() - out = [] - for x in temp: - good = True - if len(x) <= len(prefix): - good = False - if good: - for y in range(len(prefix)): - if x[y] != prefix[y]: - good = False - if good: - out.append(x) - return out - - def Exists(self, filename): - """ - Checks if :param filename: exists in the msg file. - """ - filename = self.fix_path(filename) - return self.exists(filename) - - def sExists(self, filename): - """ - Checks if string stream :param filename: exists in the msg file. - """ - filename = self.fix_path(filename) - return self.exists(filename + '001F') or self.exists(filename + '001E') - - def fix_path(self, filename, prefix=True): - """ - Changes paths so that they have the proper - prefix (should :param prefix: be True) and - are strings rather than lists or tuples. - """ - if isinstance(filename, (list, tuple)): - filename = '/'.join(filename) - if prefix: - filename = self.__prefix + filename - return filename - def _getStream(self, filename, prefix=True): filename = self.fix_path(filename, prefix) if self.exists(filename): @@ -146,69 +97,55 @@ def _getStringStream(self, filename, prefer='unicode', prefix=True): tmp = self._getStream(filename + '001E', prefix = False) return None if tmp is None else tmp.decode(self.stringEncoding) - @property - def path(self): + def Exists(self, filename): """ - Returns the message path if generated from a file, - otherwise returns the data used to generate the - Message instance. + Checks if :param filename: exists in the msg file. """ - return self.__path - - @property - def prefix(self): + filename = self.fix_path(filename) + return self.exists(filename) + + def sExists(self, filename): """ - Returns the prefix of the Message instance. - Intended for developer use. + Checks if string stream :param filename: exists in the msg file. """ - return self.__prefix - - @property - def prefixList(self): + filename = self.fix_path(filename) + return self.exists(filename + '001F') or self.exists(filename + '001E') + + def fix_path(self, filename, prefix=True): """ - Returns the prefix list of the Message instance. - Intended for developer use. + Changes paths so that they have the proper + prefix (should :param prefix: be True) and + are strings rather than lists or tuples. """ - return copy.deepcopy(self.__prefixList) + if isinstance(filename, (list, tuple)): + filename = '/'.join(filename) + if prefix: + filename = self.__prefix + filename + return filename - @property - def mainProperties(self): + def listDir(self, streams=True, storages=False): """ - Returns the Properties instance used by the Message instance. + Replacement for OleFileIO.listdir that runs at the current prefix directory. """ - try: - return self._prop - except AttributeError: - self._prop = Properties(self._getStream('__properties_version1.0'), - constants.TYPE_MESSAGE if self.__prefix == '' else constants.TYPE_MESSAGE_EMBED) - return self._prop + temp = self.listdir(streams, storages) + if self.__prefix == '': + return temp + prefix = self.__prefix.split('/') + if prefix[-1] == '': + prefix.pop() + out = [] + for x in temp: + good = True + if len(x) <= len(prefix): + good = False + if good: + for y in range(len(prefix)): + if x[y] != prefix[y]: + good = False + if good: + out.append(x) + return out - @property - def stringEncoding(self): - try: - return self.__stringEncoding - except AttributeError: - # We need to calculate the encoding - # Let's first check if the encoding will be unicode: - if self.areStringsUnicode: - self.__stringEncoding = "utf-16-le" - return self.__stringEncoding - else: - # Well, it's not unicode. Now we have to figure out what it IS. - if not self.mainProperties.has_key('3FFD0003'): - logger.error("String encoding is not unicode, but was also not specified. Malformed MSG file detected. Defaulting to utf-8") - self.__stringEncoding = 'utf-8' - return self.__stringEncoding - enc = self.mainProperties['3FFD0003'].value - # Now we just need to translate that value - # Now, this next line SHOULD work, but it is possible that it might not... - self.__stringEncoding = str(enc) - return self.__stringEncoding - - @stringEncoding.setter - def stringEncoding(self, enc): - self.__stringEncoding = enc - @property def areStringsUnicode(self): """ @@ -224,17 +161,6 @@ def areStringsUnicode(self): self.__bStringsUnicode = False return self.__bStringsUnicode - @property - def date(self): - """ - Returns the send date, if it exists. - """ - try: - return self._date - except AttributeError: - self._date = self._prop.date - return self._date - @property def attachments(self): """ @@ -258,6 +184,54 @@ def attachments(self): return self._attachments + @property + def date(self): + """ + Returns the send date, if it exists. + """ + try: + return self._date + except AttributeError: + self._date = self._prop.date + return self._date + + @property + def mainProperties(self): + """ + Returns the Properties instance used by the Message instance. + """ + try: + return self._prop + except AttributeError: + self._prop = Properties(self._getStream('__properties_version1.0'), + constants.TYPE_MESSAGE if self.__prefix == '' else constants.TYPE_MESSAGE_EMBED) + return self._prop + + @property + def path(self): + """ + Returns the message path if generated from a file, + otherwise returns the data used to generate the + Message instance. + """ + return self.__path + + @property + def prefix(self): + """ + Returns the prefix of the Message instance. + Intended for developer use. + """ + return self.__prefix + + @property + def prefixList(self): + """ + Returns the prefix list of the Message instance. + Intended for developer use. + """ + return copy.deepcopy(self.__prefixList) + @property def recipients(self): """ @@ -280,3 +254,29 @@ def recipients(self): self._recipients.append(Recipient(recipientDir, self)) return self._recipients + + @property + def stringEncoding(self): + try: + return self.__stringEncoding + except AttributeError: + # We need to calculate the encoding + # Let's first check if the encoding will be unicode: + if self.areStringsUnicode: + self.__stringEncoding = "utf-16-le" + return self.__stringEncoding + else: + # Well, it's not unicode. Now we have to figure out what it IS. + if not self.mainProperties.has_key('3FFD0003'): + logger.error("String encoding is not unicode, but was also not specified. Malformed MSG file detected. Defaulting to utf-8") + self.__stringEncoding = 'utf-8' + return self.__stringEncoding + enc = self.mainProperties['3FFD0003'].value + # Now we just need to translate that value + # Now, this next line SHOULD work, but it is possible that it might not... + self.__stringEncoding = str(enc) + return self.__stringEncoding + + @stringEncoding.setter + def stringEncoding(self, enc): + self.__stringEncoding = enc diff --git a/extract_msg/exceptions.py b/extract_msg/exceptions.py index e9a51669..58bfde55 100644 --- a/extract_msg/exceptions.py +++ b/extract_msg/exceptions.py @@ -15,6 +15,6 @@ class InvalidFileFormat(OSError): """ - A Invalid File Format Error occurred + An Invalid File Format Error occurred """ pass diff --git a/extract_msg/message.py b/extract_msg/message.py index aa5019d9..1a3ef114 100644 --- a/extract_msg/message.py +++ b/extract_msg/message.py @@ -19,7 +19,6 @@ logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) - class Message(olefile.OleFileIO): """ Parser for Microsoft Outlook message files. @@ -45,7 +44,7 @@ def __init__(self, path, prefix='', attachmentClass=Attachment, filename=None): olefile.OleFileIO.__init__(self, path) except IOError as e: # py2 and py3 compatible logger.error(e) - if e.message == 'not an OLE2 structured storage file': + if str(e) == 'not an OLE2 structured storage file': raise InvalidFileFormat(e) else: raise @@ -93,55 +92,53 @@ def __init__(self, path, prefix='', attachmentClass=Attachment, filename=None): self.date self.__crlf = '\n' # This variable keeps track of what the new line character should be self.body - - def listDir(self, streams=True, storages=False): - """ - Replacement for OleFileIO.listdir that runs at the current prefix directory. - """ - temp = self.listdir(streams, storages) - if self.__prefix == '': - return temp - prefix = self.__prefix.split('/') - if prefix[-1] == '': - prefix.pop() - out = [] - for x in temp: - good = True - if len(x) <= len(prefix): - good = False - if good: - for y in range(len(prefix)): - if x[y] != prefix[y]: - good = False - if good: - out.append(x) - return out - - def Exists(self, inp): - """ - Checks if :param inp: exists in the msg file. - """ - inp = self.fix_path(inp) - return self.exists(inp) - - def sExists(self, inp): + + def _ensureSet(self, variable, streamID, stingstream = True): """ - Checks if string stream :param inp: exists in the msg file. + Ensures that the variable exists, otherwise will set it using the specified stream. + After that, return said variable. + If the specified stream is not a string stream, make sure to set :param string stream: to False. """ - inp = self.fix_path(inp) - return self.exists(inp + '001F') or self.exists(inp + '001E') + try: + return getattr(self, variable) + except AttributeError: + if stringStream: + value = self._getStringStream(streamID) + else: + value = self._getStream(streamID) + setattr(self, variable, value) + return value - def fix_path(self, inp, prefix=True): + def _genRecipient(self, recipientType, recipientInt): """ - Changes paths so that they have the proper - prefix (should :param prefix: be True) and - are strings rather than lists or tuples. + Returns the specified recipient field """ - if isinstance(inp, (list, tuple)): - inp = '/'.join(inp) - if prefix: - inp = self.__prefix + inp - return inp + private = '_' + recipientType + try: + return getattr(self, private) + except AttributeError: + # Check header first + headerResult = None + if self.headerInit(): + headerResult = self.header[recipientType] + if headerResult is not None: + setattr(self, private, headerResult) + else: + if self.headerInit(): + logger.info('Header found, but "{}" is not included. Will be generated from other streams.'.format(recipientType)) + f = [] + for x in self.recipients: + if x.type & 0x0000000f == recipientInt: + f.append(x.formatted) + if len(f) > 0: + st = f[0] + if len(f) > 1: + for x in range(1, len(f)): + st += ', {0}'.format(f[x]) + self._cc = st + else: + setattr(self, private, None) + return getattr(self, private) def _getStream(self, filename, prefix=True): filename = self.fix_path(filename, prefix) @@ -165,79 +162,47 @@ def _getStringStream(self, filename, prefix=True): tmp = self._getStream(filename + '001E', prefix = False) return None if tmp is None else tmp.decode(self.stringEncoding) - @property - def path(self): - """ - Returns the message path if generated from a file, - otherwise returns the data used to generate the - Message instance. - """ - return self.__path - - @property - def prefix(self): - """ - Returns the prefix of the Message instance. - Intended for developer use. - """ - return self.__prefix + def debug(self): + for dir_ in self.listDir(): + if dir_[-1].endswith('001E') or dir_[-1].endswith('001F'): + print('Directory: ' + str(dir_[:-1])) + print('Contents: {}'.format(self._getStream(dir_))) - @property - def prefixList(self): + def dump(self): """ - Returns the prefix list of the Message instance. - Intended for developer use. + Prints out a summary of the message """ - return copy.deepcopy(self.__prefixList) - - @property - def subject(self): + print('Message') + print('Subject:', self.subject) + print('Date:', self.date) + print('Body:') + print(self.body) + + def Exists(self, inp): """ - Returns the message subject, if it exists. + Checks if :param inp: exists in the msg file. Does not always go to the top, starts at specified point """ - try: - return self._subject - except AttributeError: - self._subject = encode(self._getStringStream('__substg1.0_0037')) - return self._subject - - @property - def header(self): + inp = self.fix_path(inp) + return self.exists(inp) + + def sExists(self, inp): """ - Returns the message header, if it exists. Otherwise it will generate one. + Checks if string stream :param inp: exists in the msg file. """ - try: - return self._header - except AttributeError: - headerText = self._getStringStream('__substg1.0_007D') - if headerText is not None: - self._header = EmailParser().parsestr(headerText) - self._header['date'] = self.date - else: - logger.info('Header is empty or was not found. Header will be generated from other streams.') - header = EmailParser().parsestr('') - header.add_header('Date', self.date) - header.add_header('From', self.sender) - header.add_header('To', self.to) - header.add_header('Cc', self.cc) - header.add_header('Message-Id', self.message_id) - # TODO find authentication results outside of header - header.add_header('Authentication-Results', None) - - self._header = header - return self._header + inp = self.fix_path(inp) + return self.exists(inp + '001F') or self.exists(inp + '001E') - @property - def header_dict(self): + def fix_path(self, inp, prefix=True): """ - Returns a dictionary of the entries in the header + Changes paths so that they have the proper + prefix (should :param prefix: be True) and + are strings rather than lists or tuples. """ - try: - return self._header_dict - except AttributeError: - self._header_dict = dict(self.header._header) - self._header_dict.pop('Received') - return self._header_dict + if isinstance(inp, (list, tuple)): + inp = '/'.join(inp) + if prefix: + inp = self.__prefix + inp + return inp def headerInit(self): """ @@ -249,54 +214,168 @@ def headerInit(self): except AttributeError: return False - @property - def mainProperties(self): + def listDir(self, streams=True, storages=False): """ - Returns the Properties instance used by the Message instance. + Replacement for OleFileIO.listdir that runs at the current prefix directory. """ - try: - return self._prop - except AttributeError: - self._prop = Properties(self._getStream('__properties_version1.0'), - constants.TYPE_MESSAGE if self.__prefix == '' else constants.TYPE_MESSAGE_EMBED) - return self._prop + temp = self.listdir(streams, storages) + if self.__prefix == '': + return temp + prefix = self.__prefix.split('/') + if prefix[-1] == '': + prefix.pop() + out = [] + for x in temp: + good = True + if len(x) <= len(prefix): + good = False + if good: + for y in range(len(prefix)): + if x[y] != prefix[y]: + good = False + if good: + out.append(x) + return out - @property - def date(self): + def save(self, toJson=False, useFileName=False, raw=False, ContentId=False, customPath=None, customFilename=None): """ - Returns the send date, if it exists. + Saves the message body and attachments found in the message. Setting toJson + to true will output the message body as JSON-formatted text. The body and + attachments are stored in a folder. Setting useFileName to true will mean that + the filename is used as the name of the folder; otherwise, the message's date + and subject are used as the folder name. + Here is the absolute order of prioity for the name of the folder: + 1. customFilename + 2. self.filename if useFileName + 3. {date} {subject} """ - try: - return self._date - except AttributeError: - self._date = self._prop.date - return self._date + if customFilename != None and customFilename != '': + dirName = customFilename + else: + if useFileName: + # strip out the extension + if self.filename is not None: + dirName = self.filename.split('/').pop().split('.')[0] + else: + ValueError( + 'Filename must be specified, or path must have been an actual path, to save using filename') + else: + # Create a directory based on the date and subject of the message + d = self.parsedDate + if d is not None: + dirName = '{0:02d}-{1:02d}-{2:02d}_{3:02d}{4:02d}'.format(*d) + else: + dirName = 'UnknownDate' - @property - def parsedDate(self): - return email.utils.parsedate(self.date) + if self.subject is None: + subject = '[No subject]' + else: + subject = ''.join(i for i in self.subject if i not in r'\/:*?"<>|') - @property - def stringEncoding(self): + dirName = dirName + ' ' + subject + + if customPath != None and customPath != '': + if customPath[-1] != '/' or customPath[-1] != '\\': + customPath += '/' + dirName = customPath + dirName try: - return self.__stringEncoding - except AttributeError: - # We need to calculate the encoding - # Let's first check if the encoding will be unicode: - if self.areStringsUnicode: - self.__stringEncoding = "utf-16-le" - return self.__stringEncoding + os.makedirs(dirName) + except Exception: + newDirName = addNumToDir(dirName) + if newDirName is not None: + dirName = newDirName else: - # Well, it's not unicode. Now we have to figure out what it IS. - if not self.mainProperties.has_key('3FFD0003'): - raise Exception('Encoding property not found') - enc = self.mainProperties['3FFD0003'].value - # Now we just need to translate that value - # Now, this next line SHOULD work, but it is possible that it might not... - self.__stringEncoding = str(enc) - return self.__stringEncoding + raise Exception( + "Failed to create directory '%s'. Does it already exist?" % + dirName + ) - @property + oldDir = os.getcwdu() + try: + os.chdir(dirName) + + # Save the message body + fext = 'json' if toJson else 'text' + f = open('message.' + fext, 'w') + # From, to , cc, subject, date + + attachmentNames = [] + # Save the attachments + for attachment in self.attachments: + attachmentNames.append(attachment.save(ContentId, toJson)) + + if toJson: + + emailObj = {'from': xstr(self.sender), + 'to': xstr(self.to), + 'cc': xstr(self.cc), + 'subject': xstr(self.subject), + 'date': xstr(self.date), + 'attachments': attachmentNames, + 'body': decode_utf7(self.body)} + + f.write(json.dumps(emailObj, ensure_ascii=True)) + else: + f.write('From: ' + xstr(self.sender) + self.__crlf) + f.write('To: ' + xstr(self.to) + self.__crlf) + f.write('CC: ' + xstr(self.cc) + self.__crlf) + f.write('Subject: ' + xstr(self.subject) + self.__crlf) + f.write('Date: ' + xstr(self.date) + self.__crlf) + f.write('-----------------' + self.__crlf + self.__crlf) + f.write(self.body) + + f.close() + + except Exception as e: + self.saveRaw() + raise + + finally: + # Return to previous directory + os.chdir(oldDir) + + def save_attachments(self, contentId=False, json=False, useFileName=False, raw=False, customPath=None): + """ + Saves only attachments in the same folder. + """ + for attachment in self.attachments: + attachment.save(contentId, json, useFileName, raw, customPath) + + def saveRaw(self): + # Create a 'raw' folder + oldDir = os.getcwdu() + try: + rawDir = 'raw' + os.makedirs(rawDir) + os.chdir(rawDir) + sysRawDir = os.getcwdu() + + # Loop through all the directories + for dir_ in self.listdir(): + sysdir = '/'.join(dir_) + code = dir_[-1][-8:] + if code in constants.PROPERTIES: + sysdir = sysdir + ' - ' + constants.PROPERTIES[code] + os.makedirs(sysdir) + os.chdir(sysdir) + + # Generate appropriate filename + if dir_[-1].endswith('001E'): + filename = 'contents.txt' + else: + filename = 'contents' + + # Save contents of directory + with open(filename, 'wb') as f: + f.write(self._getStream(dir_)) + + # Return to base directory + os.chdir(sysRawDir) + + finally: + os.chdir(oldDir) + + @property def areStringsUnicode(self): """ Returns a boolean telling if the strings are unicode encoded. @@ -312,118 +391,147 @@ def areStringsUnicode(self): return self.__bStringsUnicode @property - def sender(self): + def attachmentClass(self): """ - Returns the message sender, if it exists. + Returns the Attachment class being used, should you need to use it externally for whatever reason. + """ + return self.__attachmentClass + + @property + def attachments(self): + """ + Returns a list of all attachments. """ try: - return self._sender + return self._attachments except AttributeError: - # Check header first - if self.headerInit(): - headerResult = self.header['from'] - if headerResult is not None: - self._sender = headerResult - return headerResult - logger.info('Header found, but "sender" is not included. Will be generated from other streams.') - # Extract from other fields - text = self._getStringStream('__substg1.0_0C1A') - email = self._getStringStream('__substg1.0_5D01') - # Will not give an email address sometimes. Seems to exclude the email address if YOU are the sender. - result = None - if text is None: - result = email - else: - result = text - if email is not None: - result += ' <' + email + '>' + # Get the attachments + attachmentDirs = [] - self._sender = result - return result + for dir_ in self.listDir(): + if dir_[len(self.__prefixList)].startswith('__attach') and\ + dir_[len(self.__prefixList)] not in attachmentDirs: + attachmentDirs.append(dir_[len(self.__prefixList)]) + + self._attachments = [] + + for attachmentDir in attachmentDirs: + self._attachments.append(self.__attachmentClass(self, attachmentDir)) + + return self._attachments @property - def to(self): + def body(self): """ - Returns the to field, if it exists. + Returns the message body, if it exists. """ try: - return self._to + return self._body except AttributeError: - # Check header first - headerResult = None - if self.headerInit(): - headerResult = self.header['to'] - if headerResult is not None: - self._to = headerResult - else: - if self.headerInit(): - logger.info('Header found, but "to" is not included. Will be generated from other streams.') - f = [] - for x in self.recipients: - if x.type & 0x0000000f == 1: - f.append(x.formatted) - if len(f) > 0: - st = f[0] - if len(f) > 1: - for x in range(1, len(f)): - st += '; {0}'.format(f[x]) - self._to = st - else: - self._to = None - return self._to + self._body = self._getStringStream('__substg1.0_1000') + if self._body: + self._body = encode(self._body) + a = re.search('\n', self._body) + if a is not None: + if re.search('\r\n', self._body) is not None: + self.__crlf = '\r\n' + return self._body + + @property + def cc(self): + """ + Returns the cc field, if it exists. + """ + return self._genRecipient('cc', 2) @property def compressedRtf(self): """ Returns the compressed RTF stream, if it exists. """ + return self._ensureSet('_compressedRtf', '__substg1.0_10090102', False) + + @property + def crlf(self): + """ + Returns the value of self.__crlf, should you need it for whatever reason. + """ + self.body + return self.__crlf + + @property + def date(self): + """ + Returns the send date, if it exists. + """ try: - return self._compressedRtf + return self._date except AttributeError: - self._compressedRtf = self._getStream('__substg1.0_10090102') - return self._compressedRtf + self._date = self._prop.date + return self._date @property - def htmlBody(self): + def header(self): """ - Returns the html body, if it exists. + Returns the message header, if it exists. Otherwise it will generate one. """ try: - return self._htmlBody + return self._header except AttributeError: - self._htmlBody = self._getStream('__substg1.0_10130102') - return self._htmlBody + headerText = self._getStringStream('__substg1.0_007D') + if headerText is not None: + self._header = EmailParser().parsestr(headerText) + self._header['date'] = self.date + else: + logger.info('Header is empty or was not found. Header will be generated from other streams.') + header = EmailParser().parsestr('') + header.add_header('Date', self.date) + header.add_header('From', self.sender) + header.add_header('To', self.to) + header.add_header('Cc', self.cc) + header.add_header('Message-Id', self.message_id) + # TODO find authentication results outside of header + header.add_header('Authentication-Results', None) + + self._header = header + return self._header @property - def cc(self): + def header_dict(self): """ - Returns the cc field, if it exists. + Returns a dictionary of the entries in the header """ try: - return self._cc + return self._header_dict except AttributeError: - # Check header first - headerResult = None - if self.headerInit(): - headerResult = self.header['cc'] - if headerResult is not None: - self._cc = headerResult - else: - if self.headerInit(): - logger.info('Header found, but "cc" is not included. Will be generated from other streams.') - f = [] - for x in self.recipients: - if x.type & 0x0000000f == 2: - f.append(x.formatted) - if len(f) > 0: - st = f[0] - if len(f) > 1: - for x in range(1, len(f)): - st += '; {0}'.format(f[x]) - self._cc = st - else: - self._cc = None - return self._cc + self._header_dict = dict(self.header._header) + self._header_dict.pop('Received') + return self._header_dict + + @property + def htmlBody(self): + """ + Returns the html body, if it exists. + """ + return self._ensureSet('_htmlBody', '__substg1.0_10130102', False) + + @property + def inReplyTo(self): + """ + """ + return self._ensureSet('_in_reply_to', '__substg1.0_1042') + + @property + def mainProperties(self): + """ + Returns the Properties instance used by the Message instance. + """ + try: + return self._prop + except AttributeError: + self._prop = Properties(self._getStream('__properties_version1.0'), + constants.TYPE_MESSAGE if self.__prefix == '' else constants.TYPE_MESSAGE_EMBED) + return self._prop @property def message_id(self): @@ -442,67 +550,33 @@ def message_id(self): return self._message_id @property - def reply_to(self): - try: - return self._reply_to - except AttributeError: - self._reply_to = self._getStringStream('__substg1.0_1042') - return self._reply_to - - @property - def body(self): - """ - Returns the message body, if it exists. - """ - try: - return self._body - except AttributeError: - self._body = self._getStringStream('__substg1.0_1000') - if self._body: - self._body = encode(self._body) - a = re.search('\n', self._body) - if a is not None: - if re.search('\r\n', self._body) is not None: - self.__crlf = '\r\n' - return self._body + def parsedDate(self): + return email.utils.parsedate(self.date) @property - def crlf(self): + def path(self): """ - Returns the value of self.__crlf, should you need it for whatever reason. + Returns the message path if generated from a file, + otherwise returns the data used to generate the + Message instance. """ - self.body - return self.__crlf + return self.__path @property - def attachmentClass(self): + def prefix(self): """ - Returns the Attachment class being used, should you need to use it externally for whatever reason. + Returns the prefix of the Message instance. + Intended for developer use. """ - return self.__attachmentClass + return self.__prefix @property - def attachments(self): + def prefixList(self): """ - Returns a list of all attachments. + Returns the prefix list of the Message instance. + Intended for developer use. """ - try: - return self._attachments - except AttributeError: - # Get the attachments - attachmentDirs = [] - - for dir_ in self.listDir(): - if dir_[len(self.__prefixList)].startswith('__attach') and\ - dir_[len(self.__prefixList)] not in attachmentDirs: - attachmentDirs.append(dir_[len(self.__prefixList)]) - - self._attachments = [] - - for attachmentDir in attachmentDirs: - self._attachments.append(self.__attachmentClass(self, attachmentDir)) - - return self._attachments + return copy.deepcopy(self.__prefixList) @property def recipients(self): @@ -527,157 +601,66 @@ def recipients(self): return self._recipients - def save(self, toJson=False, useFileName=False, raw=False, ContentId=False, customPath=None, customFilename=None): + @property + def sender(self): """ - Saves the message body and attachments found in the message. Setting toJson - to true will output the message body as JSON-formatted text. The body and - attachments are stored in a folder. Setting useFileName to true will mean that - the filename is used as the name of the folder; otherwise, the message's date - and subject are used as the folder name. - - Here is the absolute order of prioity for the name of the folder: - 1. customFilename - 2. self.filename if useFileName - 3. {date} {subject} + Returns the message sender, if it exists. """ - if customFilename != None and customFilename != '': - dirName = customFilename - else: - if useFileName: - # strip out the extension - if self.filename is not None: - dirName = self.filename.split('/').pop().split('.')[0] - else: - ValueError( - 'Filename must be specified, or path must have been an actual path, to save using filename') - else: - # Create a directory based on the date and subject of the message - d = self.parsedDate - if d is not None: - dirName = '{0:02d}-{1:02d}-{2:02d}_{3:02d}{4:02d}'.format(*d) - else: - dirName = 'UnknownDate' - - if self.subject is None: - subject = '[No subject]' - else: - subject = ''.join(i for i in self.subject if i not in r'\/:*?"<>|') - - dirName = dirName + ' ' + subject - - if customPath != None and customPath != '': - if customPath[-1] != '/' or customPath[-1] != '\\': - customPath += '/' - dirName = customPath + dirName try: - os.makedirs(dirName) - except Exception: - newDirName = addNumToDir(dirName) - if newDirName is not None: - dirName = newDirName - else: - raise Exception( - "Failed to create directory '%s'. Does it already exist?" % - dirName - ) - - oldDir = os.getcwdu() - try: - os.chdir(dirName) - - # Save the message body - fext = 'json' if toJson else 'text' - f = open('message.' + fext, 'w') - # From, to , cc, subject, date - - attachmentNames = [] - # Save the attachments - for attachment in self.attachments: - attachmentNames.append(attachment.save(ContentId, toJson)) - - if toJson: - - emailObj = {'from': xstr(self.sender), - 'to': xstr(self.to), - 'cc': xstr(self.cc), - 'subject': xstr(self.subject), - 'date': xstr(self.date), - 'attachments': attachmentNames, - 'body': decode_utf7(self.body)} - - f.write(json.dumps(emailObj, ensure_ascii=True)) + return self._sender + except AttributeError: + # Check header first + if self.headerInit(): + headerResult = self.header['from'] + if headerResult is not None: + self._sender = headerResult + return headerResult + logger.info('Header found, but "sender" is not included. Will be generated from other streams.') + # Extract from other fields + text = self._getStringStream('__substg1.0_0C1A') + email = self._getStringStream('__substg1.0_5D01') + # Will not give an email address sometimes. Seems to exclude the email address if YOU are the sender. + result = None + if text is None: + result = email else: - f.write('From: ' + xstr(self.sender) + self.__crlf) - f.write('To: ' + xstr(self.to) + self.__crlf) - f.write('CC: ' + xstr(self.cc) + self.__crlf) - f.write('Subject: ' + xstr(self.subject) + self.__crlf) - f.write('Date: ' + xstr(self.date) + self.__crlf) - f.write('-----------------' + self.__crlf + self.__crlf) - f.write(self.body) - - f.close() - - except Exception as e: - self.saveRaw() - raise - - finally: - # Return to previous directory - os.chdir(oldDir) - - def saveRaw(self): - # Create a 'raw' folder - oldDir = os.getcwdu() - try: - rawDir = 'raw' - os.makedirs(rawDir) - os.chdir(rawDir) - sysRawDir = os.getcwdu() - - # Loop through all the directories - for dir_ in self.listdir(): - sysdir = '/'.join(dir_) - code = dir_[-1][-8:] - if code in constants.PROPERTIES: - sysdir = sysdir + ' - ' + constants.PROPERTIES[code] - os.makedirs(sysdir) - os.chdir(sysdir) - - # Generate appropriate filename - if dir_[-1].endswith('001E'): - filename = 'contents.txt' - else: - filename = 'contents' - - # Save contents of directory - with open(filename, 'wb') as f: - f.write(self._getStream(dir_)) - - # Return to base directory - os.chdir(sysRawDir) + result = text + if email is not None: + result += ' <' + email + '>' - finally: - os.chdir(oldDir) + self._sender = result + return result - def dump(self): + @property + def subject(self): """ - Prints out a summary of the message + Returns the message subject, if it exists. """ - print('Message') - print('Subject:', self.subject) - print('Date:', self.date) - print('Body:') - print(self.body) + return self._ensureSet('_subject', '__substg1.0_0037') - def debug(self): - for dir_ in self.listDir(): - if dir_[-1].endswith('001E') or dir_[-1].endswith('001F'): - print('Directory: ' + str(dir_[:-1])) - print('Contents: {}'.format(self._getStream(dir_))) + @property + def stringEncoding(self): + try: + return self.__stringEncoding + except AttributeError: + # We need to calculate the encoding + # Let's first check if the encoding will be unicode: + if self.areStringsUnicode: + self.__stringEncoding = "utf-16-le" + return self.__stringEncoding + else: + # Well, it's not unicode. Now we have to figure out what it IS. + if not self.mainProperties.has_key('3FFD0003'): + raise Exception('Encoding property not found') + enc = self.mainProperties['3FFD0003'].value + # Now we just need to translate that value + # Now, this next line SHOULD work, but it is possible that it might not... + self.__stringEncoding = str(enc) + return self.__stringEncoding - def save_attachments(self, contentId=False, json=False, useFileName=False, raw=False, customPath=None): + @property + def to(self): """ - Saves only attachments in the same folder. + Returns the to field, if it exists. """ - for attachment in self.attachments: - attachment.save(contentId, json, useFileName, raw, customPath) + return self._genRecipient('to', 1) diff --git a/extract_msg/prop.py b/extract_msg/prop.py index bb994ee6..51bff970 100644 --- a/extract_msg/prop.py +++ b/extract_msg/prop.py @@ -171,17 +171,17 @@ def length(self): The length field of the variable length property. """ return self.__length - - @property - def reserved_flags(self): - """ - The reserved flags field of the variable length property. - """ - return self.__reserved - + @property def real_length(self): """ The ACTUAL length of the stream that this property corresponds to. """ return self.__realLength + + @property + def reserved_flags(self): + """ + The reserved flags field of the variable length property. + """ + return self.__reserved diff --git a/extract_msg/properties.py b/extract_msg/properties.py index 6770464b..7c4c48d3 100644 --- a/extract_msg/properties.py +++ b/extract_msg/properties.py @@ -54,6 +54,25 @@ def __init__(self, stream, type=None, skip=None): self.__props[a.name] = a self.__pl = len(self.__props) + def __contains__(self, key): + self.__props.__contains__(key) + + def __getitem__(self, key): + return self.__props.__getitem__(key) + + def __iter__(self): + return self.__props.__iter__() + + def __len__(self): + """ + Returns the number of properties. + """ + return self.__pl + + @property + def __repr__(self): + return self.__props.__repr__ + def get(self, name): """ Retrieve the property of :param name:. @@ -82,25 +101,6 @@ def keys(self): def values(self): return self.__props.values() - def __contains__(self, key): - self.__props.__contains__(key) - - def __getitem__(self, key): - return self.__props.__getitem__(key) - - def __iter__(self): - return self.__props.__iter__() - - def __len__(self): - """ - Returns the number of properties. - """ - return self.__pl - - @property - def __repr__(self): - return self.__props.__repr__ - items.__doc__ = dict.items.__doc__ keys.__doc__ = dict.keys.__doc__ values.__doc__ = dict.values.__doc__ diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 21d42eeb..b66c3d51 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -20,14 +20,11 @@ if sys.version_info[0] >= 3: # Python 3 stri = (str,) - get_input = input - def encode(inp): return inp - def properHex(inp): """ Taken (with permission) from https://github.com/TheElementalOfCreation/creatorUtils @@ -43,24 +40,19 @@ def properHex(inp): a = '0' + a return a - def windowsUnicode(string): return str(string, 'utf_16_le') if string is not None else None - def xstr(s): return '' if s is None else str(s) else: # Python 2 stri = (str, unicode) - get_input = raw_input - def encode(inp): return inp.encode('utf-8') if inp is not None else None - def properHex(inp): """ Taken (with permission) from https://github.com/TheElementalOfCreation/creatorUtils @@ -76,18 +68,15 @@ def properHex(inp): a = '0' + a return a - def windowsUnicode(string): return unicode(string, 'utf_16_le') if string is not None else None - def xstr(s): if isinstance(s, unicode): return s.encode('utf-8') else: return '' if s is None else str(s) - def addNumToDir(dirName): """ Attempt to create the directory with a '(n)' appended. @@ -101,7 +90,6 @@ def addNumToDir(dirName): pass return None - def divide(string, length): """ Taken (with permission) from https://github.com/TheElementalOfCreation/creatorUtils @@ -118,11 +106,9 @@ def divide(string, length): """ return [string[length * x:length * (x + 1)] for x in range(int(len(string) / length))] - def fromTimeStamp(stamp): return datetime.datetime.fromtimestamp(stamp, tzlocal.get_localzone()) - def get_command_args(args): """ Parse command-line arguments @@ -190,6 +176,14 @@ def get_command_args(args): options.msgs = file_tables return options +def getContFileDir(_file_): + """ + Takes in the path to a file and tries to return the containing folder. + """ + return '/'.join(_file_.replace('\\', '/').split('/')[:-1]) + +def get_full_class_name(inp): + return inp.__class__.__module__ + '.' + inp.__class__.__name__ def has_len(obj): """ @@ -201,14 +195,12 @@ def has_len(obj): except AttributeError: return False - def msgEpoch(inp): """ Taken (with permission) from https://github.com/TheElementalOfCreation/creatorUtils """ return (inp - 116444736000000000) / 10000000.0 - def parse_type(_type, stream): """ Converts the data in :param stream: to a @@ -282,14 +274,6 @@ def parse_type(_type, stream): pass return value - -def getContFileDir(_file_): - """ - Takes in the path to a file and tries to return the containing folder. - """ - return '/'.join(_file_.replace('\\', '/').split('/')[:-1]) - - def setup_logging(default_path=None, default_level=logging.WARN, logfile=None, enable_file_logging=False, env_key='EXTRACT_MSG_LOG_CFG'): """ @@ -366,7 +350,3 @@ def setup_logging(default_path=None, default_level=logging.WARN, logfile=None, e logging.getLogger().setLevel(default_level) return True - - -def get_full_class_name(inp): - return inp.__class__.__module__ + '.' + inp.__class__.__name__ diff --git a/extract_msg/validation.py b/extract_msg/validation.py index 6f9df46c..6e1ee5d1 100644 --- a/extract_msg/validation.py +++ b/extract_msg/validation.py @@ -6,28 +6,25 @@ from extract_msg.utils import get_full_class_name, has_len -def get_string_details(instance, stream): +def get_email_details(instance, stream): return { 'exists': instance.sExists(stream), 'not empty': False if not instance.sExists(stream) else len(instance._getStringStream(stream)) > 0, + 'valid email address': False if not instance.sExists(stream) else u'@' in instance._getStringStream(stream), } - def get_stream_details(instance, stream): return { 'exists': instance.Exists(stream), 'not empty': False if not instance.Exists(stream) else len(instance._getStream(stream)) > 0, } - -def get_email_details(instance, stream): +def get_string_details(instance, stream): return { 'exists': instance.sExists(stream), 'not empty': False if not instance.sExists(stream) else len(instance._getStringStream(stream)) > 0, - 'valid email address': False if not instance.sExists(stream) else u'@' in instance._getStringStream(stream), } - def string_FE(instance): temp = '001E' if instance.mainProperties.has_key('340D0003'): @@ -39,39 +36,6 @@ def string_FE(instance): return temp -def validate_msg(instance): - return { - '001F/001E': string_FE(instance), - 'header': get_string_details(instance, '__substg1.0_007D'), - 'body': get_string_details(instance, '__substg1.0_1000'), - 'html body': get_stream_details(instance, '__substg1.0_10130102'), - 'rtf body': get_stream_details(instance, '__substg1.0_10090102'), - 'date': instance.date, - 'attachments': {x: validate_attachment(y) for x, y in enumerate(instance.attachments)}, - 'recipients': {x: validate_recipient(y) for x, y in enumerate(instance.recipients)}, - } - - -def validate_attachment(instance): - temp = { - 'long filename': get_string_details(instance, '__substg1.0_3707'), - 'short filename': get_string_details(instance, '__substg1.0_3704'), - 'content id': get_string_details(instance, '__substg1.0_3712'), - 'type': instance.type, - } - if temp['type'] == 'msg': - temp['msg'] = validate_msg(instance.data) - return temp - - -def validate_recipient(instance): - return { - 'type': instance.type, - 'stream 3003': get_email_details(instance, '__substg1.0_3003'), - 'stream 39FE': get_email_details(instance, '__substg1.0_39FE'), - } - - def validate(msg): validation_dict = { 'input': { @@ -98,3 +62,33 @@ def validate(msg): validation_dict['message']['initializes'] = True validation_dict['message']['msg'] = validate_msg(msg_instance) return validation_dict + +def validate_attachment(instance): + temp = { + 'long filename': get_string_details(instance, '__substg1.0_3707'), + 'short filename': get_string_details(instance, '__substg1.0_3704'), + 'content id': get_string_details(instance, '__substg1.0_3712'), + 'type': instance.type, + } + if temp['type'] == 'msg': + temp['msg'] = validate_msg(instance.data) + return temp + +def validate_msg(instance): + return { + '001F/001E': string_FE(instance), + 'header': get_string_details(instance, '__substg1.0_007D'), + 'body': get_string_details(instance, '__substg1.0_1000'), + 'html body': get_stream_details(instance, '__substg1.0_10130102'), + 'rtf body': get_stream_details(instance, '__substg1.0_10090102'), + 'date': instance.date, + 'attachments': {x: validate_attachment(y) for x, y in enumerate(instance.attachments)}, + 'recipients': {x: validate_recipient(y) for x, y in enumerate(instance.recipients)}, + } + +def validate_recipient(instance): + return { + 'type': instance.type, + 'stream 3003': get_email_details(instance, '__substg1.0_3003'), + 'stream 39FE': get_email_details(instance, '__substg1.0_39FE'), + }