diff --git a/CHANGELOG.md b/CHANGELOG.md index f08d66e2..b5dfb13a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ +**v0.28.1** +* [[TeamMsgExtractor #181](https://github.com/TeamMsgExtractor/msg-extractor/issues/181)] Fixed issue in `Attachment` that arose when moving some of the code to a base class. +* Fixed small error in `utils.parseType` that caused it to incorrectly compare expected and actual length. Fortunately, this had no actual effect aside from a warning. +* Added the `ebcdic` module to the requirements to add more supported encodings. + **v0.28.0** * [[TeamMsgExtractor #87](https://github.com/TeamMsgExtractor/msg-extractor/issues/87)] Added a new system to handle `NotImplementedError` and other exceptions. All msg classes now have an option called `attachmentErrorBehavior` that tells the class what to do if it has an error. The value should be one of three constants: `ATTACHMENT_ERROR_THROW`, `ATTACHMENT_ERROR_NOT_IMPLEMENTED`, or `ATTACHMENT_ERROR_BROKEN`. `ATTACHMENT_ERROR_THROW` tells the class to not catch and exceptions and just let the user handle them. `ATTACHMENT_ERROR_NOT_IMPLEMENTED` tells the class to catch `NotImplementedError` exceptions and put an instance of `UnsupportedAttachment` in place of a regular attachment. `ATTACHMENT_ERROR_BROKEN` tells the class to catch *all* exceptions and either replace the attachment with `UnsupportedAttachment` if it is a `NotImplementedError` or `BrokenAttachment` for all other exceptions. With both of those options, caught exceptions will be logged. -* In making the previous point work, much code from `Attachment` has been moved to a new class called `AttachmentBase`. Both `BrokenAttachment` and `UnsupportedAttachment` are subclasses of `AttachmentBase` meaning data can be extracted from their streams in the same way as a functioning attachment. +* In making the previous point work, much code from `Attachment` has been moved to a new class called `AttachmentBase`. 
Both `BrokenAttachment` and `UnsupportedAttachment` are subclasses of `AttachmentBase` meaning data can be extracted from their streams in the same way as a functioning attachment. * [[TeamMsgExtractor #162](https://github.com/TeamMsgExtractor/msg-extractor/issues/162)] Pretty sure I actually got it this time. The execution flag should be applied by pip now. * Fixed typos in some exceptions diff --git a/README.rst b/README.rst index 5d2954dd..4cbff210 100644 --- a/README.rst +++ b/README.rst @@ -180,8 +180,8 @@ Credits .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg :target: LICENSE.txt -.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.28.0-blue.svg - :target: https://pypi.org/project/extract-msg/0.28.0/ +.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.28.1-blue.svg + :target: https://pypi.org/project/extract-msg/0.28.1/ .. |PyPI1| image:: https://img.shields.io/badge/python-2.7+-brightgreen.svg :target: https://www.python.org/downloads/release/python-2715/ diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py index 581f29bc..60b3b292 100644 --- a/extract_msg/__init__.py +++ b/extract_msg/__init__.py @@ -27,8 +27,8 @@ # along with this program. If not, see . 
__author__ = 'The Elemental of Destruction & Matthew Walker' -__date__ = '2021-01-07' -__version__ = '0.28.0' +__date__ = '2021-01-12' +__version__ = '0.28.1' import logging diff --git a/extract_msg/attachment.py b/extract_msg/attachment.py index a8a9b870..44adc44e 100644 --- a/extract_msg/attachment.py +++ b/extract_msg/attachment.py @@ -41,7 +41,7 @@ def __init__(self, msg, dir_): self.__prefix = msg.prefixList + [dir_, '__substg1.0_3701000D'] self.__type = 'msg' self.__data = openMsg(self.msg.path, self.__prefix, self.__class__, overrideEncoding = msg.overrideEncoding, attachmentErrorBehavior = msg.attachmentErrorBehavior) - elif (self.__props['37050003'].value & 0x7) == 0x7: + elif (self.props['37050003'].value & 0x7) == 0x7: # TODO Handling for special attacment type 0x7 self.__type = 'web' raise NotImplementedError('Attachments of type afByWebReference are not currently supported.') diff --git a/extract_msg/constants.py b/extract_msg/constants.py index 78548a34..8cf41897 100644 --- a/extract_msg/constants.py +++ b/extract_msg/constants.py @@ -8,6 +8,8 @@ import struct import sys +import ebcdic + if sys.version_info[0] >= 3: BYTES = bytes STRING = str @@ -184,8 +186,7 @@ 865: 'IBM865', # OEM Nordic; Nordic (DOS) 866: 'cp866', # OEM Russian; Cyrillic (DOS) 869: 'ibm869', # OEM Modern Greek; Greek, Modern (DOS) - # UNSUPPORTED - 870: 'IBM870', # IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 + 870: 'cp870', # IBM870 # IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 # UNSUPPORTED 874: 'windows-874', # ANSI/OEM Thai (ISO 8859-11); Thai (Windows) 875: 'cp875', # IBM EBCDIC Greek Modern @@ -194,28 +195,17 @@ 949: 'ks_c_5601-1987', # ANSI/OEM Korean (Unified Hangul Code) 950: 'big5', # ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) 1026: 'IBM1026', # IBM EBCDIC Turkish (Latin 5) - # UNSUPPORTED - 1047: 'IBM01047', # IBM EBCDIC Latin 1/Open System - # UNSUPPORTED - 1140: 
'IBM01140', # IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) - # UNSUPPORTED - 1141: 'IBM01141', # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) - # UNSUPPORTED - 1142: 'IBM01142', # IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) - # UNSUPPORTED - 1143: 'IBM01143', # IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) - # UNSUPPORTED - 1144: 'IBM01144', # IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) - # UNSUPPORTED - 1145: 'IBM01145', # IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) - # UNSUPPORTED - 1146: 'IBM01146', # IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) - # UNSUPPORTED - 1147: 'IBM01147', # IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) - # UNSUPPORTED - 1148: 'IBM01148', # IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) - # UNSUPPORTED - 1149: 'IBM01149', # IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) + 1047: 'cp1047', # IBM EBCDIC Latin 1/Open System + 1140: 'cp1140', # IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) + 1141: 'cp1141', # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) + 1142: 'cp1142', # IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) + 1143: 'cp1143', # IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) + 1144: 'cp1144', # IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) + 1145: 'cp1145', # IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) + 1146: 'cp1146', # IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) + 1147: 'cp1147', # IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) + 1148: 'cp1148ms', # IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) + 1149: 'cp1149', # 
IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) 1200: 'utf-16-le', # Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications 1201: 'utf-16-be', # Unicode UTF-16, big endian byte order; available only to managed applications 1250: 'windows-1250', # ANSI Central European; Central European (Windows) @@ -285,32 +275,21 @@ # UNSUPPORTED 20269: 'x-cp20269', # ISO 6937 Non-Spacing Accent 20273: 'IBM273', # IBM EBCDIC Germany - # UNSUPPORTED - 20277: 'IBM277', # IBM EBCDIC Denmark-Norway - # UNSUPPORTED - 20278: 'IBM278', # IBM EBCDIC Finland-Sweden - # UNSUPPORTED - 20280: 'IBM280', # IBM EBCDIC Italy - # UNSUPPORTED - 20284: 'IBM284', # IBM EBCDIC Latin America-Spain - # UNSUPPORTED - 20285: 'IBM285', # IBM EBCDIC United Kingdom - # UNSUPPORTED - 20290: 'IBM290', # IBM EBCDIC Japanese Katakana Extended - # UNSUPPORTED - 20297: 'IBM297', # IBM EBCDIC France - # UNSUPPORTED - 20420: 'IBM420', # IBM EBCDIC Arabic + 20277: 'cp277', # IBM EBCDIC Denmark-Norway + 20278: 'cp278', # IBM EBCDIC Finland-Sweden + 20280: 'cp280', # IBM EBCDIC Italy + 20284: 'cp284', # IBM EBCDIC Latin America-Spain + 20285: 'cp285', # IBM EBCDIC United Kingdom + 20290: 'cp290', # IBM EBCDIC Japanese Katakana Extended + 20297: 'cp297', # IBM EBCDIC France + 20420: 'cp420', # IBM EBCDIC Arabic # UNSUPPORTED 20423: 'IBM423', # IBM EBCDIC Greek 20424: 'IBM424', # IBM EBCDIC Hebrew - # UNSUPPORTED - 20833: 'x-EBCDIC-KoreanExtended', # IBM EBCDIC Korean Extended - # UNSUPPORTED - 20838: 'IBM-Thai', # IBM EBCDIC Thai + 20833: 'cp833', # IBM EBCDIC Korean Extended + 20838: 'cp838', # IBM EBCDIC Thai 20866: 'koi8-r', # Russian (KOI8-R); Cyrillic (KOI8-R) - # UNSUPPORTED - 20871: 'IBM871', # IBM EBCDIC Icelandic + 20871: 'cp871', # IBM EBCDIC Icelandic # UNSUPPORTED 20880: 'IBM880', # IBM EBCDIC Cyrillic Russian # UNSUPPORTED @@ -322,7 +301,6 @@ 20936: 'x-cp20936', # Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) # UNSUPPORTED 
20949: 'x-cp20949', # Korean Wansung - # UNSUPPORTED 21025: 'cp1025', # IBM EBCDIC Cyrillic Serbian-Bulgarian # UNSUPPORTED 21027: '', # (deprecated) diff --git a/extract_msg/utils.py b/extract_msg/utils.py index a8e4466e..ef488180 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -456,7 +456,7 @@ def parseType(_type, stream, encoding, extras): return ret elif _type == 0x1102: ret = copy.deepcopy(extras) - lengths = tuple(constants.STUI32.unpack(stream[pos*8:(pos+1)*8]) for pos in range(len(stream) // 8)) + lengths = tuple(constants.STUI32.unpack(stream[pos*8:(pos+1)*8])[0] for pos in range(len(stream) // 8)) length_lengths = len(lengths) if length_lengths > length_extras: logger.warning('Error while parsing multiple type. Expected {} stream{}, got {}. Ignoring.'.format(length_lengths, 's' if length_lengths > 1 or length_lengths == 0 else '', length_extras)) diff --git a/requirements.txt b/requirements.txt index 172dd867..7d15bd6e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ imapclient==2.1.0 olefile>=0.46 tzlocal>=2.1 compressed_rtf>=1.0.6 +ebcdic>=1.1.1 diff --git a/setup.py b/setup.py index 11509b71..67355baf 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,6 @@ license='GPL', packages=[main_module], py_modules=[main_module], -# scripts=['scripts/extract_msg'], entry_points={'console_scripts': ['extract_msg = extract_msg.__main__:main',]}, include_package_data=True, install_requires=dependencies,