Skip to content

Commit

Permalink
Script to insert keyword links
Browse files Browse the repository at this point in the history
  • Loading branch information
hakonhagland committed Nov 4, 2024
1 parent 6f22834 commit b4586fd
Show file tree
Hide file tree
Showing 2 changed files with 188 additions and 0 deletions.
1 change: 1 addition & 0 deletions scripts/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ fodt-fix-ignored-keywords = "fodt.fix_ignored:fix_ignored"
fodt-fix-footer-style = "fodt.fix_footer_style:fix_footer_style"
fodt-fix-letter-k-footer = "fodt.fix_letter_k_footer:fix_letter_k_footer"
fodt-gen-kw-uri-map = "fodt.keyword_linker:gen_kw_uri_map"
fodt-link-keywords = "fodt.keyword_linker2:link_keywords"
fodt-remove-bookmarks-from-master-styles = "fodt.remove_bookmarks:remove_bookmarks_from_master_styles"
fodt-remove-chapters = "fodt.splitter:remove_chapters"
fodt-remove-elements = "fodt.splitter:remove_elements"
Expand Down
187 changes: 187 additions & 0 deletions scripts/python/src/fodt/keyword_linker2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
import io
import logging
import re
import xml.sax
import xml.sax.handler
import xml.sax.xmlreader
import xml.sax.saxutils

from pathlib import Path

import click

from fodt.constants import ClickOptions, Directories, FileNames, FileExtensions
from fodt.exceptions import HandlerDoneException, ParsingException
from fodt import helpers
from fodt.xml_helpers import XMLHelper

class FileHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that re-serializes a document while linking keywords.

    Every SAX event is written back out to an in-memory buffer. Text found
    inside a ``text:p`` element (but not inside an existing ``text:a`` anchor
    within that paragraph) is scanned for keyword names from ``kw_uri_map``,
    and each hit is wrapped in a ``<text:a xlink:href="#...">`` element.

    :param keyword_name: Keyword the processed file documents; it is excluded
        from linking so that a document does not link to itself.
    :param kw_uri_map: Mapping from keyword name to the URI of its subdocument.
    """

    def __init__(self, keyword_name: str, kw_uri_map: dict[str, str]) -> None:
        super().__init__()
        self.keyword_name = keyword_name
        self.kw_uri_map = kw_uri_map
        self.in_section = False
        # For empty tags, we use a special trick to rewrite them with a shortened
        # end /> tag instead of the full end tag </tag>: the start tag is written
        # without its closing ">" and is only closed once we know what follows.
        self.start_tag_open = False
        self.in_p = False
        self.in_a = False
        self.content = io.StringIO()
        # Regex with alternation on all linkable keyword names
        self.regex = self.compile_regex()
        self.num_links_inserted = 0

    def compile_regex(self) -> re.Pattern:
        """Return a pattern matching any linkable keyword as a whole word.

        The keyword the current file documents is left out. If nothing is left
        to link, a pattern that can never match is returned; an empty
        alternation ``\\b()\\b`` would otherwise match the empty string at every
        word boundary and break the lookup in ``replace_match_function``.
        """
        candidates = [
            k for k in self.kw_uri_map if k and k != self.keyword_name
        ]
        if not candidates:
            return re.compile(r'(?!)')
        # Longest first, so that a keyword containing a non-word character is
        # not pre-empted by a shorter keyword that is a prefix of it.
        candidates.sort(key=len, reverse=True)
        return re.compile(
            r'\b(' + '|'.join(re.escape(k) for k in candidates) + r')\b'
        )

    def characters(self, content: str):
        """Write character data, inserting keyword links where allowed."""
        # NOTE: characters() is only called if there is content between the start
        # tag and the end tag. If there is no content, characters() is not called.
        if self.start_tag_open:
            self.content.write(">")
            self.start_tag_open = False
        # NOTE: We need to escape the content before we apply the regex pattern
        # because it may insert tags (<text:a ...>) that should not be escaped.
        content = XMLHelper.escape(content)
        # NOTE(review): a SAX parser may deliver one run of text in several
        # characters() calls; a keyword split across two calls is not linked.
        if self.in_p and not self.in_a:
            content = self.regex.sub(self.replace_match_function, content)
        self.content.write(content)

    def endDocument(self):
        """Nothing to finalize; the output is collected in ``self.content``."""
        pass

    def endElement(self, name: str):
        """Track paragraph/anchor state and write the matching end tag."""
        if name == "text:p":
            self.in_p = False
        elif self.in_p and name == "text:a":
            self.in_a = False
        if self.start_tag_open:
            # Nothing was written since the start tag: emit an empty element
            self.content.write("/>")
            self.start_tag_open = False
        else:
            self.content.write(XMLHelper.endtag(name))

    def get_content(self) -> str:
        """Return the serialized document produced so far."""
        return self.content.getvalue()

    def get_num_links_inserted(self) -> int:
        """Return the number of keyword links inserted so far."""
        return self.num_links_inserted

    def replace_match_function(self, match: re.Match) -> str:
        """Return the anchor element replacing one keyword match.

        Used as the replacement callback for ``self.regex.sub``; also counts
        the inserted link.
        """
        keyword = match.group(0)
        # The URI ends up inside a double-quoted attribute value, so "&", "<",
        # ">" and '"' must be escaped to keep the output well-formed.
        # NOTE(review): assumes the map stores raw (unescaped) URIs - confirm
        # against the script that generates the map file.
        uri = xml.sax.saxutils.escape(self.kw_uri_map[keyword], {'"': "&quot;"})
        self.num_links_inserted += 1
        return f'<text:a xlink:href="#{uri}">{keyword}</text:a>'

    # This callback is used for debugging, it can be used to print
    # line numbers in the XML file
    def setDocumentLocator(self, locator):
        self.locator = locator

    def startDocument(self):
        """Begin the output with the XML header provided by XMLHelper."""
        self.content.write(XMLHelper.header)

    def startElement(self, name: str, attrs: xml.sax.xmlreader.AttributesImpl):
        """Track paragraph/anchor state and write the (still open) start tag."""
        if self.start_tag_open:
            self.content.write(">")  # Close the start tag
            self.start_tag_open = False
        if name == "text:p":
            self.in_p = True
        elif self.in_p and name == "text:a":
            # We are already inside an anchor, and we should not insert a new
            # text:a tag here
            self.in_a = True
        # Leave the tag open until we know whether the element is empty
        self.start_tag_open = True
        self.content.write(XMLHelper.starttag(name, attrs, close_tag=False))


class InsertLinks():
    """Insert keyword links into every keyword document below ``kw_dir``.

    :param maindir: Main directory of the document parts (stored, not used by
        the methods of this class).
    :param kw_dir: Directory whose sub-directories hold the keyword documents.
    :param kw_uri_map: Mapping from keyword name to the URI of its subdocument.
    """

    def __init__(self, maindir: Path, kw_dir: Path, kw_uri_map: dict[str, str]) -> None:
        self.maindir = maindir
        self.kw_dir = kw_dir
        self.kw_uri_map = kw_uri_map

    def insert_links(self) -> None:
        """Process every document file found one level below ``kw_dir``.

        Plain files directly in ``kw_dir`` are skipped. Inside each
        sub-directory, files with the FODT extension are processed; the file
        name without the extension is taken as the keyword name.
        """
        for item in self.kw_dir.iterdir():
            if not item.is_dir():
                continue
            logging.info(f"Processing directory: {item}")
            for item2 in item.iterdir():
                if item2.suffix == f".{FileExtensions.fodt}":
                    keyword_name = item2.name.removesuffix(f".{FileExtensions.fodt}")
                    self.insert_links_in_file(item2, keyword_name)

    def insert_links_in_file(self, filename: Path, keyword_name: str) -> None:
        """Parse one file and rewrite it in place if any link was inserted.

        :param filename: Path of the document to process.
        :param keyword_name: Keyword documented by the file; never linked.
        """
        parser = xml.sax.make_parser()
        handler = FileHandler(keyword_name, self.kw_uri_map)
        parser.setContentHandler(handler)
        try:
            parser.parse(str(filename))
        except HandlerDoneException:
            # A handler may signal early termination with this exception;
            # whatever was collected up to that point is still used below.
            pass
        num_links_inserted = handler.get_num_links_inserted()
        if num_links_inserted > 0:
            # Only touch the file when something actually changed
            with open(filename, "w", encoding='utf8') as f:
                f.write(handler.get_content())
            logging.info(f"{filename.name}: Inserted {num_links_inserted} links.")
        else:
            logging.info(f"{filename.name}: No links inserted.")


def load_kw_uri_map(maindir: Path) -> dict[str, str]:
    """Read the keyword-to-URI mapping file located below ``maindir``.

    :param maindir: Main directory; the map is read from
        ``maindir / Directories.meta / FileNames.kw_uri_map``.
    :return: Dictionary mapping each keyword name to its URI.
    :raises FileNotFoundError: If the mapping file does not exist.
    :raises ParsingException: If a line does not match the expected format.
    """
    map_file = maindir / Directories.meta / FileNames.kw_uri_map
    if not map_file.exists():
        raise FileNotFoundError(f"File not found: {map_file}")
    # Each line has the format "<kw> <uri>": <kw> is the keyword name and holds
    # no whitespace, <uri> is the URI of the keyword subsection subdocument and
    # may itself contain whitespace. Whitespace separates the two fields.
    line_format = re.compile(r"(\S+)\s+(.+)")
    mapping = {}
    with open(map_file, "r", encoding='utf-8') as stream:
        for line in stream:
            parsed = line_format.match(line)
            if parsed is None:
                raise ParsingException(f"Could not parse line: {line}")
            keyword, uri = parsed.groups()
            mapping[keyword] = uri
    return mapping

# fodt-link-keywords
# ------------------
#
# SHELL USAGE:
#
# fodt-link-keywords --maindir=<main_dir>
#
# DESCRIPTION:
#
# Links all keyword names found inside <p> tags in the subsection documents to the
# corresponding keyword subsection subdocument.
# Uses the mapping file "meta/kw_uri_map.txt" generated by the script
# "fodt-gen-kw-uri-map".
#
# EXAMPLE:
#
# fodt-link-keywords
#
# Will use the default value: --maindir=../../parts
#
@click.command()
@ClickOptions.maindir()
def link_keywords(maindir: str|None) -> None:
    # Command-line entry point: load the keyword/URI map and insert links in
    # all subsection documents below the main directory.
    # (Documented with comments on purpose: click would show a docstring as
    # the command's --help text.)
    logging.basicConfig(level=logging.INFO)
    main_path = helpers.get_maindir(maindir)
    uri_by_keyword = load_kw_uri_map(main_path)
    subsections_dir = main_path / Directories.chapters / Directories.subsections
    linker = InsertLinks(main_path, subsections_dir, uri_by_keyword)
    linker.insert_links()

# Allow running this module directly as a script, in addition to the
# installed console entry point.
if __name__ == "__main__":
    link_keywords()

0 comments on commit b4586fd

Please sign in to comment.