From 0e33c97fd918be84b8e23e9d1a20cedc23770cda Mon Sep 17 00:00:00 2001
From: Joos Kiener <joos.kiener@gmail.com>
Date: Wed, 17 May 2023 17:11:59 +0200
Subject: [PATCH 1/4] fix wrong type for CDXPositioningType - is int8 so only 1
 byte required

---
 pycdxml/cdxml_converter/chemdraw_types.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/pycdxml/cdxml_converter/chemdraw_types.py b/pycdxml/cdxml_converter/chemdraw_types.py
index 92cf366..ae45d41 100644
--- a/pycdxml/cdxml_converter/chemdraw_types.py
+++ b/pycdxml/cdxml_converter/chemdraw_types.py
@@ -1964,8 +1964,8 @@ def __init__(self, value: int):
 
     @staticmethod
     def from_bytes(property_bytes: bytes) -> 'CDXPositioningType':
-        if len(property_bytes) != 2:
-            raise ValueError("CDXPositioningType should consist of exactly 2 bytes.")
+        if len(property_bytes) != 1:
+            raise ValueError("CDXPositioningType should consist of exactly 1 bytes.")
         value = int.from_bytes(property_bytes, "little", signed=True)
         return CDXPositioningType(value)
 
@@ -1974,7 +1974,7 @@ def from_string(value: str) -> 'CDXPositioningType':
         return CDXPositioningType[value]
 
     def to_bytes(self) -> bytes:
-        return self.positioning_type.to_bytes(2, byteorder='little', signed=True)
+        return self.positioning_type.to_bytes(1, byteorder='little', signed=True)
 
     def to_property_value(self) -> str:
         val = str(CDXPositioningType(self.positioning_type))

From 6fd2b9cdaf4ca92e1a5f285be34b9899fcfe5538 Mon Sep 17 00:00:00 2001
From: Joos Kiener <joos.kiener@gmail.com>
Date: Thu, 18 May 2023 09:51:05 +0200
Subject: [PATCH 2/4] fixes around charsets in annotations and invalid control
 characters - the annotation Keyword and Content are now both read as utf8 by
 default - control characters that are not allowed in XML are removed when
 needed - Close #32

---
 pycdxml/cdxml_converter/chemdraw_objects.py | 14 ++++++++++++--
 pycdxml/cdxml_converter/chemdraw_types.py   |  7 ++++++-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/pycdxml/cdxml_converter/chemdraw_objects.py b/pycdxml/cdxml_converter/chemdraw_objects.py
index caf702c..18aabe9 100644
--- a/pycdxml/cdxml_converter/chemdraw_objects.py
+++ b/pycdxml/cdxml_converter/chemdraw_objects.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 from lxml import etree as ET
 import logging
+import re
 
 from ..utils.cdxml_io import etree_to_cdxml
 
@@ -437,7 +438,7 @@ def _read_attributes(self, element: ET.Element):
             chemdraw_type = ChemDrawDocument.CDX_PROPERTIES[tag_id]["type"]
             logger.debug(f"Reading property {prop_name} of type {chemdraw_type}.")
             klass = globals()[chemdraw_type]
-            if prop_name == "UTF8Text":
+            if prop_name in ["UTF8Text", "Keyword", "Content"]:
                 type_obj = klass.from_bytes(prop_bytes, charset="utf8")
             elif chemdraw_type == "CDXString":
                 type_obj = klass.from_bytes(prop_bytes, fonttable=self.fonttable)
@@ -494,7 +495,16 @@ def _read_attributes(self, element: ET.Element):
                 # adds style tags <s></s> to this t element containing styled text
                 type_obj.to_element(element)
             else:
-                element.attrib[prop_name] = type_obj.to_property_value()
+                try:
+                    element.attrib[prop_name] = type_obj.to_property_value()
+                except ValueError as e:
+                    # https://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python
+                    # This error is usually raised when the value contains a control character that is invalid in XML.
+                    # Since this is rare, invalid characters are only stripped after a failure, for performance reasons.
+                    logger.warning(f"{e}. Replacing invalid chars with ''.")
+                    val = type_obj.to_property_value()
+                    val = re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+', '', val)
+                    element.attrib[prop_name] = val
 
         logger.debug('Successfully finished reading attributes.')
         # move back 2 positions, finished reading attributes
diff --git a/pycdxml/cdxml_converter/chemdraw_types.py b/pycdxml/cdxml_converter/chemdraw_types.py
index ae45d41..0bad553 100644
--- a/pycdxml/cdxml_converter/chemdraw_types.py
+++ b/pycdxml/cdxml_converter/chemdraw_types.py
@@ -94,7 +94,8 @@ def from_bytes(property_bytes: bytes, charset='iso-8859-1', fonttable=None) -> '
 
         # get charset from first fontstyle
         try:
-            charset = CDXString.get_charset(fonttable, font_styles)
+            if fonttable is not None:
+                charset = CDXString.get_charset(fonttable, font_styles)
             text_length = len(property_bytes) - (CDXString.BYTES_PER_STYLE * style_runs) - 2
         except pycdxml.cdxml_converter.chemdraw_objects.MissingFontException as ex:
             # to deal with issue #30 - no style runs and the uint16 defining number of style runs is completely omitted
@@ -116,6 +117,10 @@ def from_bytes(property_bytes: bytes, charset='iso-8859-1', fonttable=None) -> '
             logger.warning("Found unsupported charset. Retrying with 'utf8'.")
             stream.seek(stream.tell() - text_length)
             value = stream.read(text_length).decode('utf8')
+        except UnicodeDecodeError:
+            logger.warning("Found unsupported character. Retrying with 'utf8'.")
+            stream.seek(stream.tell() - text_length)
+            value = stream.read(text_length).decode('utf8')
         # Normalize to xml spec where all line breaks in attributes are represented by \n
         value = value.replace("\r", "\n")
         logger.debug(f"Read String '{value}' with  {len(font_styles)} different styles.")

From 34411564aceed50bafdaaa0a1226968fde354734 Mon Sep 17 00:00:00 2001
From: Joos Kiener <joos.kiener@gmail.com>
Date: Sat, 16 Sep 2023 10:43:42 +0200
Subject: [PATCH 3/4] hacky fix for running test file directly

---
 tests/cdxml_converter_tests.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/cdxml_converter_tests.py b/tests/cdxml_converter_tests.py
index 1bf3898..873eb1c 100644
--- a/tests/cdxml_converter_tests.py
+++ b/tests/cdxml_converter_tests.py
@@ -11,6 +11,9 @@
 logger.addHandler(logging.StreamHandler())
 logger.setLevel(logging.INFO)
 
+cwd = os.getcwd()
+if cwd.endswith("tests"):
+    os.chdir(Path(cwd).parent)
 
 class CdxmlConverterTest(unittest.TestCase):
     """

From d64e2cba43dba53d59aa130d2f573e047e60ef7d Mon Sep 17 00:00:00 2001
From: Joos Kiener <joos.kiener@gmail.com>
Date: Sat, 16 Sep 2023 10:44:42 +0200
Subject: [PATCH 4/4] better error handling with invalid characters - fixes
 issue #34

---
 pycdxml/cdxml_converter/chemdraw_types.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pycdxml/cdxml_converter/chemdraw_types.py b/pycdxml/cdxml_converter/chemdraw_types.py
index 0bad553..b34ed6c 100644
--- a/pycdxml/cdxml_converter/chemdraw_types.py
+++ b/pycdxml/cdxml_converter/chemdraw_types.py
@@ -118,9 +118,14 @@ def from_bytes(property_bytes: bytes, charset='iso-8859-1', fonttable=None) -> '
             stream.seek(stream.tell() - text_length)
             value = stream.read(text_length).decode('utf8')
         except UnicodeDecodeError:
-            logger.warning("Found unsupported character. Retrying with 'utf8'.")
             stream.seek(stream.tell() - text_length)
-            value = stream.read(text_length).decode('utf8')
+            if charset == 'utf8':
+                logger.warning("Found unsupported character for utf8. Retrying with errors=='replace'.")
+            else:
+                logger.warning(f"Found unsupported character for charset {charset}. "
+                               f"Retrying with 'utf8' and errors=='replace'.")
+            value = stream.read(text_length).decode('utf8', errors="replace")
+
         # Normalize to xml spec where all line breaks in attributes are represented by \n
         value = value.replace("\r", "\n")
         logger.debug(f"Read String '{value}' with  {len(font_styles)} different styles.")