From 0e33c97fd918be84b8e23e9d1a20cedc23770cda Mon Sep 17 00:00:00 2001 From: Joos Kiener Date: Wed, 17 May 2023 17:11:59 +0200 Subject: [PATCH 1/4] fix wrong type for CDXPositioningType - is int8 so only 1 byte required --- pycdxml/cdxml_converter/chemdraw_types.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pycdxml/cdxml_converter/chemdraw_types.py b/pycdxml/cdxml_converter/chemdraw_types.py index 92cf366..ae45d41 100644 --- a/pycdxml/cdxml_converter/chemdraw_types.py +++ b/pycdxml/cdxml_converter/chemdraw_types.py @@ -1964,8 +1964,8 @@ def __init__(self, value: int): @staticmethod def from_bytes(property_bytes: bytes) -> 'CDXPositioningType': - if len(property_bytes) != 2: - raise ValueError("CDXPositioningType should consist of exactly 2 bytes.") + if len(property_bytes) != 1: + raise ValueError("CDXPositioningType should consist of exactly 1 bytes.") value = int.from_bytes(property_bytes, "little", signed=True) return CDXPositioningType(value) @@ -1974,7 +1974,7 @@ def from_string(value: str) -> 'CDXPositioningType': return CDXPositioningType[value] def to_bytes(self) -> bytes: - return self.positioning_type.to_bytes(2, byteorder='little', signed=True) + return self.positioning_type.to_bytes(1, byteorder='little', signed=True) def to_property_value(self) -> str: val = str(CDXPositioningType(self.positioning_type)) From 6fd2b9cdaf4ca92e1a5f285be34b9899fcfe5538 Mon Sep 17 00:00:00 2001 From: Joos Kiener Date: Thu, 18 May 2023 09:51:05 +0200 Subject: [PATCH 2/4] fixes around charsets in annotations and invalid control characters - the annotation Keyword and the Content are now by default both read as utf8 - in case of need remove control characters that are not allowed in XML - Close #32 --- pycdxml/cdxml_converter/chemdraw_objects.py | 14 ++++++++++++-- pycdxml/cdxml_converter/chemdraw_types.py | 7 ++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pycdxml/cdxml_converter/chemdraw_objects.py b/pycdxml/cdxml_converter/chemdraw_objects.py index caf702c..18aabe9 100644 --- a/pycdxml/cdxml_converter/chemdraw_objects.py +++ b/pycdxml/cdxml_converter/chemdraw_objects.py @@ -4,6 +4,7 @@ from pathlib import Path from lxml import etree as ET import logging +import re from ..utils.cdxml_io import etree_to_cdxml @@ -437,7 +438,7 @@ def _read_attributes(self, element: ET.Element): chemdraw_type = ChemDrawDocument.CDX_PROPERTIES[tag_id]["type"] logger.debug(f"Reading property {prop_name} of type {chemdraw_type}.") klass = globals()[chemdraw_type] - if prop_name == "UTF8Text": + if prop_name in ["UTF8Text", "Keyword", "Content"]: type_obj = klass.from_bytes(prop_bytes, charset="utf8") elif chemdraw_type == "CDXString": type_obj = klass.from_bytes(prop_bytes, fonttable=self.fonttable) @@ -494,7 +495,16 @@ def _read_attributes(self, element: ET.Element): # adds style tags to this t element containing styled text type_obj.to_element(element) else: - element.attrib[prop_name] = type_obj.to_property_value() + try: + element.attrib[prop_name] = type_obj.to_property_value() + except ValueError as e: + # https://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python + # This error is usually caused when a control character is found which is invalid in xml + # Since this is rare, we only replace it in case of need for performance reasons + logger.warning(f"{e}. Replacing invalid chars with ''.") + val = type_obj.to_property_value() + val = re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+', '', val) + element.attrib[prop_name] = val logger.debug('Successfully finished reading attributes.') # move back 2 positions, finished reading attributes diff --git a/pycdxml/cdxml_converter/chemdraw_types.py b/pycdxml/cdxml_converter/chemdraw_types.py index ae45d41..0bad553 100644 --- a/pycdxml/cdxml_converter/chemdraw_types.py +++ b/pycdxml/cdxml_converter/chemdraw_types.py @@ -94,7 +94,8 @@ def from_bytes(property_bytes: bytes, charset='iso-8859-1', fonttable=None) -> ' # get charset from first fontstyle try: - charset = CDXString.get_charset(fonttable, font_styles) + if fonttable is not None: + charset = CDXString.get_charset(fonttable, font_styles) text_length = len(property_bytes) - (CDXString.BYTES_PER_STYLE * style_runs) - 2 except pycdxml.cdxml_converter.chemdraw_objects.MissingFontException as ex: # to deal with issue #30 - no style runs and the uint16 defining number of style runs is completely omitted @@ -116,6 +117,10 @@ def from_bytes(property_bytes: bytes, charset='iso-8859-1', fonttable=None) -> ' logger.warning("Found unsupported charset. Retrying with 'utf8'.") stream.seek(stream.tell() - text_length) value = stream.read(text_length).decode('utf8') + except UnicodeDecodeError: + logger.warning("Found unsupported character. Retrying with 'utf8'.") + stream.seek(stream.tell() - text_length) + value = stream.read(text_length).decode('utf8') # Normalize to xml spec where all line breaks in attributes are represented by \n value = value.replace("\r", "\n") logger.debug(f"Read String '{value}' with {len(font_styles)} different styles.") From 34411564aceed50bafdaaa0a1226968fde354734 Mon Sep 17 00:00:00 2001 From: Joos Kiener Date: Sat, 16 Sep 2023 10:43:42 +0200 Subject: [PATCH 3/4] hacky fix for running test file directly --- tests/cdxml_converter_tests.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/cdxml_converter_tests.py b/tests/cdxml_converter_tests.py index 1bf3898..873eb1c 100644 --- a/tests/cdxml_converter_tests.py +++ b/tests/cdxml_converter_tests.py @@ -11,6 +11,9 @@ logger.addHandler(logging.StreamHandler()) logger.setLevel(logging.INFO) +cwd = os.getcwd() +if cwd.endswith("tests"): + os.chdir(Path(cwd).parent) class CdxmlConverterTest(unittest.TestCase): """ From d64e2cba43dba53d59aa130d2f573e047e60ef7d Mon Sep 17 00:00:00 2001 From: Joos Kiener Date: Sat, 16 Sep 2023 10:44:42 +0200 Subject: [PATCH 4/4] better error handling with invalid characters - fixes issue #34 --- pycdxml/cdxml_converter/chemdraw_types.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pycdxml/cdxml_converter/chemdraw_types.py b/pycdxml/cdxml_converter/chemdraw_types.py index 0bad553..b34ed6c 100644 --- a/pycdxml/cdxml_converter/chemdraw_types.py +++ b/pycdxml/cdxml_converter/chemdraw_types.py @@ -118,9 +118,14 @@ def from_bytes(property_bytes: bytes, charset='iso-8859-1', fonttable=None) -> ' stream.seek(stream.tell() - text_length) value = stream.read(text_length).decode('utf8') except UnicodeDecodeError: - logger.warning("Found unsupported character. Retrying with 'utf8'.") stream.seek(stream.tell() - text_length) - value = stream.read(text_length).decode('utf8') + if charset == 'utf8': + logger.warning("Found unsupported character for utf8. Retrying with errors=='replace'.") + else: + logger.warning(f"Found unsupported character for charset {charset}. " + f"Retrying with 'utf8' and errors=='replace'.") + value = stream.read(text_length).decode('utf8', errors="replace") + # Normalize to xml spec where all line breaks in attributes are represented by \n value = value.replace("\r", "\n") logger.debug(f"Read String '{value}' with {len(font_styles)} different styles.")