Rules on whitespace characters are stricter when parsing XHTML/XML than when parsing HTML

Sigil-Ebook · Dec 1, 2023 · feecb06
1 parent 8be6790
commit feecb06
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 10 deletions.
diff --git a/src/Resource_Files/plugin_launchers/python/opf_parser.py b/src/Resource_Files/plugin_launchers/python/opf_parser.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
 
-# Copyright (c) 2014-2020 Kevin B. Hendricks and Doug Massay
+# Copyright (c) 2014-2023 Kevin B. Hendricks and Doug Massay
 # Copyright (c) 2014      John Schember
 # All rights reserved.
 #
@@ -33,6 +33,8 @@
 from hrefutils import mime_group_map
 from collections import OrderedDict
 
+WHITESPACE_CHARS = (' ', '\n', '\r', '\t')
+
 SPECIAL_HANDLING_TAGS = OrderedDict([
     ('?xml', ('xmlheader', -1)),
     ('!--', ('comment', -3)),
@@ -352,13 +354,13 @@ def _parsetag(self, s):
         if ttype is None:
             # parse any attributes of begin or single tags
             while s.find('=', p) != -1 :
-                while p < n and s[p:p + 1] == ' ' : p += 1
+                while p < n and s[p:p + 1] in WHITESPACE_CHARS : p += 1
                 b = p
                 while p < n and s[p:p + 1] != '=' : p += 1
                 aname = s[b:p].lower()
-                aname = aname.rstrip(' ')
+                aname = aname.rstrip(' \n\r\t')
                 p += 1
-                while p < n and s[p:p + 1] == ' ' : p += 1
+                while p < n and s[p:p + 1] in WHITESPACE_CHARS: p += 1
                 if s[p:p + 1] in ('"', "'") :
                     qt = s[p:p + 1]
                     p = p + 1

diff --git a/src/Resource_Files/plugin_launchers/python/quickparser.py b/src/Resource_Files/plugin_launchers/python/quickparser.py
@@ -39,7 +39,7 @@
 
 SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment', 'cdata', 'pi']
 
-WHITESPACE_CHARS = (' ', '\n', '\r', '\f', '\t', '\v')
+WHITESPACE_CHARS = (' ', '\n', '\r', '\t')
 
 class QuickXHTMLParser(object):
 
@@ -116,7 +116,7 @@ def parsetag(self, s):
                 while p < n and s[p:p + 1] != '=' : p += 1
                 # attribute names can be mixed case and are in SVG
                 aname = s[b:p]
-                aname = aname.rstrip(' ')
+                aname = aname.rstrip(' \n\r\t')
                 p += 1
                 while p < n and s[p:p + 1] in WHITESPACE_CHARS : p += 1
                 if s[p:p + 1] in ('"', "'") :

diff --git a/src/Resource_Files/python3lib/opf_newparser.py b/src/Resource_Files/python3lib/opf_newparser.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
 
-# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
+# Copyright (c) 2014-2023 Kevin B. Hendricks, John Schember, and Doug Massay
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification,
@@ -51,6 +51,8 @@ def xmldecode(data):
     newdata = newdata.replace('&amp;', '&')
     return newdata
 
+WHITESPACE_CHARS = (' ', '\n', '\r', '\t')
+
 SPECIAL_HANDLING_TAGS = OrderedDict([
     ('?xml', ('xmlheader', -1)),
     ('!--',  ('comment', -3)),
@@ -237,13 +239,13 @@ def _parsetag(self, s):
         if ttype is None:
             # parse any attributes of begin or single tags
             while s.find('=',p) != -1 :
-                while p < n and s[p:p+1] == ' ' : p += 1
+                while p < n and s[p:p+1] in WHITESPACE_CHARS : p += 1
                 b = p
                 while p < n and s[p:p+1] != '=' : p += 1
                 aname = s[b:p].lower()
-                aname = aname.rstrip(' ')
+                aname = aname.rstrip(' \n\r\t')
                 p += 1
-                while p < n and s[p:p+1] == ' ' : p += 1
+                while p < n and s[p:p+1] in WHITESPACE_CHARS: p += 1
                 if s[p:p+1] in ('"', "'") :
                     qt = s[p:p+1]
                     p = p + 1