From 58b21579c44f760e25db922556b341990d0368bc Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Fri, 8 Mar 2024 08:13:49 -0800 Subject: [PATCH 1/7] Added stdin option (needs testing) --- CHANGELOG.md | 3 +++ extract_msg/utils.py | 15 +++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73b7a652..1b6a530a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +**v0.48.1** +* Added an option (`-s`, `--stdin`) to the command line to take an MSG file from stdin. This allows the user to pipe the msg data from another program directly instead of having to write a middleman that uses the extract-msg library directly or having to write the file to the disk first. + **v0.48.0** * Adjusted error handling for named properties to handle critical streams being missing and to allow suppression of those errors. * Adjusted error handling for named properties to allow silencing of errors caused by invalid references to the name stream. If `ErrorBehavior.NAMED_NAME_STREAM` is provided to the `MSGFile` instance, a warning will be logged and that entry will simply be dropped. diff --git a/extract_msg/utils.py b/extract_msg/utils.py index f6a496af..bc4e9465 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -63,6 +63,7 @@ import pathlib import shutil import struct +import sys import weakref import zipfile @@ -359,6 +360,7 @@ def getCommandArgs(args: Sequence[str]) -> argparse.Namespace: parser = argparse.ArgumentParser(description = constants.MAINDOC, prog = 'extract_msg') outFormat = parser.add_mutually_exclusive_group() inputFormat = parser.add_mutually_exclusive_group() + inputType = parser.add_mutually_exclusive_group(required = True) # --use-content-id, --cid parser.add_argument('--use-content-id', '--cid', dest='cid', action='store_true', help='Save attachments by their Content ID, if they have one. 
Useful when working with the HTML body.') @@ -368,7 +370,7 @@ def getCommandArgs(args: Sequence[str]) -> argparse.Namespace: # --file-logging parser.add_argument('--file-logging', dest='fileLogging', action='store_true', help='Enables file logging. Implies --verbose level 1.') - # --verbose + # -v, --verbose parser.add_argument('-v', '--verbose', dest='verbose', action='count', default=0, help='Turns on console logging. Specify more than once for higher verbosity.') # --log PATH @@ -455,12 +457,19 @@ def getCommandArgs(args: Sequence[str]) -> argparse.Namespace: # --progress parser.add_argument('--progress', dest='progress', action='store_true', help='Shows what file the program is currently working on during it\'s progress.') + # -s, --stdin + inputType.add_argument('-s', '--stdin', dest='stdin', action='store_true', + help='Read file from stdin (only works with one file at a time).') # [MSG files] - parser.add_argument('msgs', metavar='msg', nargs='+', + inputType.add_argument('msgs', metavar='msg', nargs='*', default=[], help='An MSG file to be parsed.') options = parser.parse_args(args) + if options.stdin: + # Read the MSG file from stdin and shove it into the msgs list. 
+ options.msgs.append(sys.stdin.buffer.read()) + if options.outName and options.noFolders: raise IncompatibleOptionsError('--out-name is not compatible with --no-folders.') @@ -502,6 +511,8 @@ def getCommandArgs(args: Sequence[str]) -> argparse.Namespace: if options.glob: if options.outName: raise IncompatibleOptionsError('--out-name is not supported when using wildcards.') + if options.stdin: + raise IncompatibleOptionsError('--stdin is not supported with using wildcards.') fileLists = [] for path in options.msgs: fileLists += glob.glob(path) From cfa89c1db32f7849acc8f606335083d2ed573869 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Fri, 8 Mar 2024 08:21:58 -0800 Subject: [PATCH 2/7] Changed main function to allow manual args --- extract_msg/__main__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/extract_msg/__main__.py b/extract_msg/__main__.py index 78055ff5..120ffb32 100644 --- a/extract_msg/__main__.py +++ b/extract_msg/__main__.py @@ -10,12 +10,13 @@ from extract_msg import __doc__, openMsg, utils from extract_msg.enums import ErrorBehavior +from typing import List -def main() -> None: +def main(argv : List[str]) -> None: # Setup logging to stdout, indicate running from cli CLI_LOGGING = 'extract_msg_cli' - args = utils.getCommandArgs(sys.argv[1:]) + args = utils.getCommandArgs(argv[1:]) # Determine where to save the files to. currentDir = os.getcwd() # Store this in case the path changes. 
@@ -111,4 +112,4 @@ def strSanitize(inp): _zip.close() if __name__ == '__main__': - main() + main(sys.argv) From 364a16ad964ceb6f05b49a0f93160c6ebeaea207 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Fri, 8 Mar 2024 08:25:45 -0800 Subject: [PATCH 3/7] Bump version stuff --- CHANGELOG.md | 1 + README.rst | 7 ++++--- extract_msg/__init__.py | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b6a530a..d3509689 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ **v0.48.1** * Added an option (`-s`, `--stdin`) to the command line to take an MSG file from stdin. This allows the user to pipe the msg data from another program directly instead of having to write a middleman that uses the extract-msg library directly or having to write the file to the disk first. +* Changed main function to allow for manual argument list to be passed to it. **v0.48.0** * Adjusted error handling for named properties to handle critical streams being missing and to allow suppression of those errors. diff --git a/README.rst b/README.rst index 529dcf85..e3ca5315 100644 --- a/README.rst +++ b/README.rst @@ -61,7 +61,7 @@ refer to the usage information provided from the program's help dialog: usage: extract_msg [-h] [--use-content-id] [--json] [--file-logging] [-v] [--log LOG] [--config CONFIGPATH] [--out OUTPATH] [--use-filename] [--dump-stdout] [--html] [--pdf] [--wk-path WKPATH] [--wk-options [WKOPTIONS ...]] [--prepared-html] [--charset CHARSET] [--raw] [--rtf] [--allow-fallback] [--skip-body-not-found] [--zip ZIP] [--save-header] [--attachments-only] [--skip-hidden] [--no-folders] [--skip-embedded] [--extract-embedded] - [--overwrite-existing] [--skip-not-implemented] [--out-name OUTNAME | --glob] [--ignore-rtfde] [--progress] + [--overwrite-existing] [--skip-not-implemented] [--out-name OUTNAME | --glob] [--ignore-rtfde] [--progress] [-s] msg [msg ...] 
extract_msg: Extracts emails and attachments saved in Microsoft Outlook's .msg files. https://github.com/TeamMsgExtractor/msg-extractor @@ -107,6 +107,7 @@ refer to the usage information provided from the program's help dialog: --glob, --wildcard Interpret all paths as having wildcards. Incompatible with --out-name. --ignore-rtfde Ignores all errors thrown from RTFDE when trying to save. Useful for allowing fallback to continue when an exception happens. --progress Shows what file the program is currently working on during it's progress. + -s, --stdin Read file from stdin (only works with one file at a time). **To use this in your own script**, start by using: @@ -259,8 +260,8 @@ your access to the newest major version of extract-msg. .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg :target: LICENSE.txt -.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.48.0-blue.svg - :target: https://pypi.org/project/extract-msg/0.48.0/ +.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.48.1-blue.svg + :target: https://pypi.org/project/extract-msg/0.48.1/ .. |PyPI2| image:: https://img.shields.io/badge/python-3.8+-brightgreen.svg :target: https://www.python.org/downloads/release/python-3810/ diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py index 6d5090f1..7e3a4026 100644 --- a/extract_msg/__init__.py +++ b/extract_msg/__init__.py @@ -27,8 +27,8 @@ # along with this program. If not, see . 
__author__ = 'Destiny Peterson & Matthew Walker' -__date__ = '2024-02-28' -__version__ = '0.48.0' +__date__ = '2024-03-08' +__version__ = '0.48.1' __all__ = [ # Modules: From 7764e75ca6c55fe9f263bdc54d8841b1d4aa805a Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Fri, 8 Mar 2024 09:05:53 -0800 Subject: [PATCH 4/7] Added modification and creation time for attachments --- CHANGELOG.md | 1 + extract_msg/attachments/attachment_base.py | 63 ++++++++++++++++------ 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d3509689..9bfa80d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ **v0.48.1** * Added an option (`-s`, `--stdin`) to the command line to take an MSG file from stdin. This allows the user to pipe the msg data from another program directly instead of having to write a middleman that uses the extract-msg library directly or having to write the file to the disk first. * Changed main function to allow for manual argument list to be passed to it. +* Added attributes at attachment base for creation and modification time. These can be accessed through `createdAt` ir `creationTime` and `lastModificationTime` or `modifiedAt`. **v0.48.0** * Adjusted error handling for named properties to handle critical streams being missing and to allow suppression of those errors. diff --git a/extract_msg/attachments/attachment_base.py b/extract_msg/attachments/attachment_base.py index 2a685132..f9276757 100644 --- a/extract_msg/attachments/attachment_base.py +++ b/extract_msg/attachments/attachment_base.py @@ -473,7 +473,7 @@ def save(self, **kwargs) -> SAVE_TYPE: the first item specifies what the second value will be. """ - @functools.cached_property + @cached_property def attachmentEncoding(self) -> Optional[bytes]: """ The encoding information about the attachment object. 
@@ -483,7 +483,7 @@ def attachmentEncoding(self) -> Optional[bytes]: """ return self.getStream('__substg1.0_37020102') - @functools.cached_property + @cached_property def additionalInformation(self) -> Optional[str]: """ The additional information about the attachment. @@ -495,7 +495,7 @@ def additionalInformation(self) -> Optional[str]: """ return self.getStringStream('__substg1.0_370F') - @functools.cached_property + @cached_property def cid(self) -> Optional[str]: """ Returns the Content ID of the attachment, if it exists. @@ -526,8 +526,25 @@ def clsid(self) -> str: @property def contentId(self) -> Optional[str]: + """ + Alias of :attr:`cid`. + """ return self.cid + @property + def createdAt(self) -> Optional[datetime.datetime]: + """ + Alias of :attr:`creationTime`. + """ + return self.creationTime + + @cached_property + def creationTime(self) -> Optional[datetime.datetime]: + """ + The time the attachment was created. + """ + return self.getPropertyVal('30070040') + @property @abc.abstractmethod def data(self) -> Optional[object]: @@ -537,7 +554,7 @@ def data(self) -> Optional[object]: Returns ``None`` if there is no data to save. """ - @functools.cached_property + @cached_property def dataType(self) -> Optional[Type[object]]: """ The class that the data type will use, if it can be retrieved. @@ -560,14 +577,14 @@ def dir(self) -> str: """ return self.__dir - @functools.cached_property + @cached_property def displayName(self) -> Optional[str]: """ Returns the display name of the folder. 
""" return self.getStringStream('__substg1.0_3001') - @functools.cached_property + @cached_property def exceptionReplaceTime(self) -> Optional[datetime.datetime]: """ The original date and time at which the instance in the recurrence @@ -577,48 +594,62 @@ def exceptionReplaceTime(self) -> Optional[datetime.datetime]: """ return self.getPropertyVal('7FF90040') - @functools.cached_property + @cached_property def extension(self) -> Optional[str]: """ The reported extension for the file. """ return self.getStringStream('__substg1.0_3703') - @functools.cached_property + @cached_property def hidden(self) -> bool: """ Indicates whether an Attachment object is hidden from the end user. """ return bool(self.getPropertyVal('7FFE000B')) - @functools.cached_property + @cached_property def isAttachmentContactPhoto(self) -> bool: """ Whether the attachment is a contact photo for a Contact object. """ return bool(self.getPropertyVal('7FFF000B')) - @functools.cached_property + @cached_property + def lastModificationTime(self) -> Optional[datetime.datetime]: + """ + The last time the attachment was modified. + """ + return self.getPropertyVal('30080040') + + @cached_property def longFilename(self) -> Optional[str]: """ Returns the long file name of the attachment, if it exists. """ return self.getStringStream('__substg1.0_3707') - @functools.cached_property + @cached_property def longPathname(self) -> Optional[str]: """ The fully qualified path and file name with extension. """ return self.getStringStream('__substg1.0_370D') - @functools.cached_property + @cached_property def mimetype(self) -> Optional[str]: """ The content-type mime header of the attachment, if specified. """ return tryGetMimetype(self, self.getStringStream('__substg1.0_370E')) + @property + def modifiedAt(self) -> Optional[datetime.datetime]: + """ + Alias of :attr:`lastModificationTime`. 
+ """ + return self.lastModificationTime + @property def msg(self) -> MSGFile: """ @@ -631,7 +662,7 @@ def msg(self) -> MSGFile: raise ReferenceError('The MSGFile for this Attachment instance has been garbage collected.') return msg - @functools.cached_property + @cached_property def name(self) -> Optional[str]: """ The best name available for the file. @@ -650,7 +681,7 @@ def namedProperties(self) -> NamedProperties: """ return self.__namedProperties - @functools.cached_property + @cached_property def payloadClass(self) -> Optional[str]: """ The class name of an object that can display the contents of the @@ -665,7 +696,7 @@ def props(self) -> PropertiesStore: """ return self.__props - @functools.cached_property + @cached_property def renderingPosition(self) -> Optional[int]: """ The offset, in rendered characters, to use when rendering the attachment @@ -676,7 +707,7 @@ def renderingPosition(self) -> Optional[int]: """ return self.getPropertyVal('370B0003') - @property + @cached_property def shortFilename(self) -> Optional[str]: """ The short file name of the attachment, if it exists. From bf6feaf2242251a2fb82191f8d64c5bced97a7bd Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Fri, 8 Mar 2024 09:07:04 -0800 Subject: [PATCH 5/7] Test updates --- CHANGELOG.md | 2 ++ extract_msg_tests/__init__.py | 4 +++- extract_msg_tests/cmd_line_tests.py | 34 +++++++++++++++++++++++++++ extract_msg_tests/ole_writer_tests.py | 8 ++++--- 4 files changed, 44 insertions(+), 4 deletions(-) create mode 100644 extract_msg_tests/cmd_line_tests.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bfa80d0..a54d7654 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ * Added an option (`-s`, `--stdin`) to the command line to take an MSG file from stdin. This allows the user to pipe the msg data from another program directly instead of having to write a middleman that uses the extract-msg library directly or having to write the file to the disk first. 
* Changed main function to allow for manual argument list to be passed to it. * Added attributes at attachment base for creation and modification time. These can be accessed through `createdAt` ir `creationTime` and `lastModificationTime` or `modifiedAt`. +* Changed OleWriter tests to output the name of the test file being done if an error occurs. +* Added tests for some command line stuff. **v0.48.0** * Adjusted error handling for named properties to handle critical streams being missing and to allow suppression of those errors. diff --git a/extract_msg_tests/__init__.py b/extract_msg_tests/__init__.py index 117e5f63..a31b4cc0 100644 --- a/extract_msg_tests/__init__.py +++ b/extract_msg_tests/__init__.py @@ -1,12 +1,14 @@ __all__ = [ 'AttachmentTests', + 'CommandLineTests', 'OleWriterEditingTests', 'OleWriterExportTests', 'PropTests', 'ValidationTests', ] -from .validation_tests import ValidationTests from .attachment_tests import AttachmentTests +from .cmd_line_tests import CommandLineTests from .ole_writer_tests import OleWriterEditingTests, OleWriterExportTests from .prop_tests import PropTests +from .validation_tests import ValidationTests diff --git a/extract_msg_tests/cmd_line_tests.py b/extract_msg_tests/cmd_line_tests.py new file mode 100644 index 00000000..a2f95375 --- /dev/null +++ b/extract_msg_tests/cmd_line_tests.py @@ -0,0 +1,34 @@ +__all__ = [ + 'CommandLineTests', +] + + +import pathlib +import subprocess +import sys +import unittest + +from .constants import TEST_FILE_DIR, USER_TEST_DIR + + +class CommandLineTests(unittest.TestCase): + def testStdin(self, testFileDir = TEST_FILE_DIR): + for path in testFileDir.glob('*.msg'): + # First, let's do the file on the disk. + process = subprocess.Popen([sys.executable, '-m', 'extract_msg', '--dump-stdout', str(path)], stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE) + # Wait for the process to return data. 
+ stdout1, stderr1 = process.communicate() + + # Now, do the same thing with stdin. + process = subprocess.Popen([sys.executable, '-m', 'extract_msg', '-s', '--dump-stdout'], stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE) + with open(path, 'rb') as f: + stdout2, stderr2 = process.communicate(f.read()) + + # Now, compare the two. + with self.subTest(path): + self.assertEqual(stdout1, stdout2) + self.assertEqual(stderr1, stderr2) + + @unittest.skipIf(USER_TEST_DIR is None, 'User test files not defined.') + def testUserStdin(self): + self.testStdin(USER_TEST_DIR) \ No newline at end of file diff --git a/extract_msg_tests/ole_writer_tests.py b/extract_msg_tests/ole_writer_tests.py index 55c2a2ec..610c2aa7 100644 --- a/extract_msg_tests/ole_writer_tests.py +++ b/extract_msg_tests/ole_writer_tests.py @@ -147,9 +147,11 @@ def testExportExamples(self, testFileDir = TEST_FILE_DIR): with open(exportResultFile, 'rb') as f: exportResult = f.read() - # We use two assertions to give better error messages. - self.assertCountEqual(exportResult, exportedBytes, 'Exported data is wrong size.') - self.assertEqual(exportedBytes, exportResult, 'Exported data is incorrect.') + # Use a subtest to print the file name. + with self.subTest(str(testFileDir / exportResultFile.name)): + # We use two assertions to give better error messages. 
+ self.assertCountEqual(exportResult, exportedBytes, 'Exported data is wrong size.') + self.assertEqual(exportedBytes, exportResult, 'Exported data is incorrect.') @unittest.skipIf(USER_TEST_DIR is None, 'User test files not defined.') @unittest.skipIf(USER_TEST_DIR is not None and not (USER_TEST_DIR / 'export-results').exists(), 'User export tests not defined.') From 89ad006015fe8bda5699d10c5e5c27107aa81bc7 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Fri, 8 Mar 2024 09:11:04 -0800 Subject: [PATCH 6/7] Fix typo --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a54d7654..901bf384 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ **v0.48.1** * Added an option (`-s`, `--stdin`) to the command line to take an MSG file from stdin. This allows the user to pipe the msg data from another program directly instead of having to write a middleman that uses the extract-msg library directly or having to write the file to the disk first. * Changed main function to allow for manual argument list to be passed to it. -* Added attributes at attachment base for creation and modification time. These can be accessed through `createdAt` ir `creationTime` and `lastModificationTime` or `modifiedAt`. +* Added attributes at attachment base for creation and modification time. These can be accessed through `createdAt` or `creationTime` and `lastModificationTime` or `modifiedAt`. * Changed OleWriter tests to output the name of the test file being done if an error occurs. * Added tests for some command line stuff. 
From af5fa5765536a984e759d49aa136e6469162ec07 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Fri, 8 Mar 2024 09:11:55 -0800 Subject: [PATCH 7/7] More typo fixes --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 901bf384..e513e208 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ **v0.48.1** -* Added an option (`-s`, `--stdin`) to the command line to take an MSG file from stdin. This allows the user to pipe the msg data from another program directly instead of having to write a middleman that uses the extract-msg library directly or having to write the file to the disk first. +* Added an option (`-s`, `--stdin`) to the command line to take an MSG file from stdin. This allows the user to pipe the MSG data from another program directly instead of having to write a middleman that uses the `extract-msg` library directly or having to write the file to the disk first. * Changed main function to allow for manual argument list to be passed to it. -* Added attributes at attachment base for creation and modification time. These can be accessed through `createdAt` or `creationTime` and `lastModificationTime` or `modifiedAt`. -* Changed OleWriter tests to output the name of the test file being done if an error occurs. +* Added attributes to `AttachmentBase` for creation and modification time. These can be accessed through `createdAt` or `creationTime` and `lastModificationTime` or `modifiedAt`. +* Changed `OleWriter` tests to output the name of the test file being done if an error occurs. * Added tests for some command line stuff. **v0.48.0**