class FilterAll:
    """Run the xml.sax pass-through filter over the meta section files.

    Every ``*.xml`` file found in the directory
    ``maindir / Directories.meta / Directories.sections`` is parsed and then
    overwritten with the serialization produced by ``PassThroughFilterHandler``.
    """

    def __init__(self, maindir: str) -> None:
        """Store the main directory as a ``Path``.

        :param maindir: Directory that contains the meta directory.
        """
        self.maindir = Path(maindir)

    def run_filter(self) -> None:
        """Filter every xml file in the meta sections directory.

        Logs a message and returns without processing anything when the
        directory does not exist.
        """
        meta_dir = self.maindir / Directories.meta / Directories.sections
        if not meta_dir.is_dir():
            logging.info("Directory %s does not exist.", meta_dir)
            return
        # glob() yields entries in no guaranteed order; sort so that the
        # processing (and log) order is reproducible between runs.
        for filename in sorted(meta_dir.glob("*.xml")):
            logging.info("Processing file: %s", filename)
            self.filter_file(filename)

    def filter_file(self, filename: Path) -> None:
        """Parse ``filename`` and overwrite it with the handler's output.

        The document is parsed completely before the file is opened for
        writing, so an exception raised by the parser leaves the file as is.

        :param filename: The xml file to filter in place.
        """
        handler = PassThroughFilterHandler()
        parser = xml.sax.make_parser()
        parser.setContentHandler(handler)
        parser.parse(filename)
        with open(filename, "w", encoding='utf8') as f:
            f.write(handler.get_content())
# USAGE:
#
#   fodt-xml-sax-filter-meta --maindir=<maindir>
#
# DESCRIPTION:
#
#   Runs the xml.sax pass-through filter on all xml files in the
#   parts/meta/sections directory. The files in this directory are used by,
#   among others, the fodt-add-keyword script.
#   Each xml file is read by the xml.sax parser, and the content is then
#   written back to the same file, using xml.sax.saxutils.escape() to escape
#   the content.
#   This is useful for checking for inconsistencies between the XML content
#   written by LibreOffice and the content written by the xml.sax parser, and
#   for initially aligning the XML content with the format written by
#   LibreOffice.
#
@click.command()
@ClickOptions.maindir(required=False)
def xml_sax_filter_meta(maindir: str) -> None:
    """Filter all xml files in the meta dir."""
    logging.basicConfig(level=logging.INFO)
    meta_filter = FilterAll(maindir)
    meta_filter.run_filter()
class PassThroughFilterHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that writes the parsed document back out as text.

    The serialized document is accumulated in an in-memory buffer and can be
    retrieved with ``get_content()``. Tags and text are produced through the
    ``XMLHelper`` helpers (``header``, ``starttag``, ``endtag``, ``escape``).
    """

    def __init__(self) -> None:
        # Initialize the ContentHandler base state (the document locator).
        super().__init__()
        self.content = io.StringIO()
        # True while the most recent start tag has been written without its
        # terminating ">". This lets endElement() close an element that had
        # no content as "/>" instead of writing a separate end tag.
        self.start_tag_open = False

    def characters(self, content: str) -> None:
        """Write character data, first terminating a pending start tag."""
        if self.start_tag_open:
            # NOTE: characters() is only called if there is content between the
            #  start tag and the end tag. If there is no content, characters()
            #  is not called.
            self.content.write(">")
            self.start_tag_open = False
        self.content.write(XMLHelper.escape(content))

    def endElement(self, name: str) -> None:
        """Close the element ``name``.

        An element whose start tag is still pending (no characters or child
        elements were seen) is closed with ``/>``; otherwise a regular end tag
        is written.
        """
        if self.start_tag_open:
            self.content.write("/>")
            self.start_tag_open = False
        else:
            self.content.write(XMLHelper.endtag(name))

    def get_content(self) -> str:
        """Return everything written to the buffer so far."""
        return self.content.getvalue()

    def startDocument(self) -> None:
        """Write the XML header at the start of the document."""
        self.content.write(XMLHelper.header)

    def startElement(self, name: str, attrs: xml.sax.xmlreader.AttributesImpl) -> None:
        """Write the start tag for ``name`` and leave it pending.

        If the parent's start tag is still pending it is terminated first,
        since the parent now has a child element.
        """
        if self.start_tag_open:
            self.content.write(">")
        self.start_tag_open = True
        self.content.write(XMLHelper.starttag(name, attrs, close_tag=False))
- self.content.write(">") - self.start_tag_open = False - self.content.write(XMLHelper.escape(content)) - - def endElement(self, name: str): - if self.start_tag_open: - self.content.write("/>") - self.start_tag_open = False - else: - self.content.write(XMLHelper.endtag(name)) - - def get_content(self) -> str: - return self.content.getvalue() - - def startDocument(self): - self.content.write(XMLHelper.header) - - def startElement(self, name:str, attrs: xml.sax.xmlreader.AttributesImpl): - if self.start_tag_open: - self.content.write(">") - self.start_tag_open = True - self.content.write(XMLHelper.starttag(name, attrs, close_tag=False)) +from fodt.xml_handlers import PassThroughFilterHandler class FilterAll: @@ -57,7 +24,7 @@ def run_filter(self) -> None: def filter_file(self, filename: Path) -> None: parser = xml.sax.make_parser() - handler = ElementHandler() + handler = PassThroughFilterHandler() parser.setContentHandler(handler) parser.parse(filename) with open(filename, "w", encoding='utf8') as f: