Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: Updated DTD to support annotations #236

Merged
merged 7 commits into from
Aug 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,4 +122,8 @@ jobs:
# Warning: the diff line below is PowerShell syntax, not bash!
run: |
echo ćś | readalongs make-xml -l fra - - | findstr /v meta > cs.readalong
echo Output ====
cat cs.readalong
echo Reference ====
cat test/data/cs-ref.readalong
if (diff (cat cs.readalong) (cat test/data/cs-ref.readalong)) { throw "Output did not match reference" }
2 changes: 1 addition & 1 deletion docs/cli-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ The format of the generated XML is based on [TEI
Lite](https://tei-c.org/guidelines/customization/lite/) but is
considerably simplified. The DTD (document type definition) can be
found in the ReadAlong Studio source code under
`readalongs/static/read-along-1.1.dtd`.
`readalongs/static/read-along-1.2.dtd`.

(dna)=

Expand Down
2 changes: 1 addition & 1 deletion readalongs/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
VERSION = "1.1.0"

READALONG_FILE_FORMAT_VERSION = "1.1"
READALONG_FILE_FORMAT_VERSION = "1.2"
8 changes: 4 additions & 4 deletions readalongs/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def parse_and_make_xml(
"""Parse XML input and run tokenization and G2P.

Args:
xml_path (str): Path to input in ReadAlong XML format (see static/read-along-1.1.dtd)
xml_path (str): Path to input in ReadAlong XML format (see static/read-along-1.2.dtd)
config (dict): Optional; ReadAlong-Studio configuration to use
save_temps (str): Optional; Save temporary files, by default None
verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings
Expand Down Expand Up @@ -574,7 +574,7 @@ def align_audio(
"""Align an XML input file to an audio file.

Args:
xml_path (str): Path to input file in ReadAlong XML format (see static/read-along-1.1.dtd)
xml_path (str): Path to input file in ReadAlong XML format (see static/read-along-1.2.dtd)
audio_path (str): Path to audio input. Must be in a format supported by ffmpeg
unit (str): Optional; Element to create alignments for, by default 'w'
bare (boolean): Optional;
Expand Down Expand Up @@ -1192,7 +1192,7 @@ def convert_to_xhtml(tokenized_xml, title="Book"):


def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str:
"""Create input xml in ReadAlong XML format (see static/read-along-1.1.dtd)
"""Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd)
Uses the line sequence to infer paragraph and sentence structure from plain text:
Assumes a double blank line marks a page break, and a single blank line
marks a paragraph break.
Expand Down Expand Up @@ -1240,7 +1240,7 @@ def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) ->


def create_input_ras(**kwargs):
"""Create input xml in ReadAlong XML format (see static/read-along-1.1.dtd)
"""Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd)
Uses readlines to infer paragraph and sentence structure from plain text.
Assumes a double blank line marks a page break, and a single blank line
marks a paragraph break.
Expand Down
106 changes: 106 additions & 0 deletions readalongs/static/read-along-1.2.dtd
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
<!-- VERSION: 1.2 -->
<!ELEMENT read-along (meta|text|body|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST read-along
use-assets-folder CDATA #IMPLIED
href CDATA #IMPLIED
audio CDATA #IMPLIED
xml:lang CDATA #IMPLIED
language CDATA #IMPLIED
lang CDATA #IMPLIED
version CDATA #IMPLIED>

<!ELEMENT text (body|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST text
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
fallback-langs CDATA #IMPLIED
id CDATA #IMPLIED>

<!ELEMENT body (div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST body
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED>

<!ELEMENT anchor EMPTY>
<!ATTLIST anchor time CDATA #REQUIRED>

<!ELEMENT silence EMPTY>
<!ATTLIST silence dur CDATA #REQUIRED>

<!ELEMENT graphic EMPTY>
<!ATTLIST graphic
url CDATA #REQUIRED
id CDATA #IMPLIED>

<!ELEMENT div (#PCDATA|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST div
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
type CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!ELEMENT span (#PCDATA|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST span
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
type CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!ELEMENT p (#PCDATA|span|anchor|silence|s|w)*>
<!ATTLIST p
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!-- s: sentence-level element with mixed content (text, span, anchor, silence, w).
     annotation-id / sentence-id: in the sample documents that use them, an <s>
     carrying these attributes is a do-not-align annotation of another sentence
     (e.g. a translation). There, annotation-id matches one entry of the
     "annotations-ids" meta content and sentence-id repeats the id of the <s>
     being annotated.
     NOTE(review): both attributes are plain CDATA, so this DTD does not check
     that the referenced ids exist; presumably consumers resolve them (confirm). -->
<!ELEMENT s (#PCDATA|span|anchor|silence|w)*>
<!ATTLIST s
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED
annotation-id CDATA #IMPLIED
sentence-id CDATA #IMPLIED>

<!ELEMENT w (#PCDATA|span|syl)*>
<!ATTLIST w
xml:lang CDATA #IMPLIED
effective-g2p-lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
ARPABET CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!ELEMENT syl (#PCDATA|span)*>
<!ATTLIST syl
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
ARPABET CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!ELEMENT meta EMPTY>
<!ATTLIST meta name CDATA #REQUIRED
content CDATA #REQUIRED
id CDATA #IMPLIED>
2 changes: 1 addition & 1 deletion readalongs/web_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
# Call get_langs() when the server loads to load the languages into memory
LANGS = get_langs()
# Get the DTD
DTDPATH = os.path.join(os.path.dirname(__file__), "static", "read-along-1.1.dtd")
DTDPATH = os.path.join(os.path.dirname(__file__), "static", "read-along-1.2.dtd")
with open(DTDPATH) as dtdfh:
DTD = etree.DTD(dtdfh)

Expand Down
2 changes: 1 addition & 1 deletion test/data/cs-ref.readalong
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.1">
<read-along version="1.2">
<text xml:lang="fra" fallback-langs="und">
<body>
<div type="page">
Expand Down
34 changes: 34 additions & 0 deletions test/data/ej-fra-annotated.readalong
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.2">
<meta name="generator" content="human made" id="meta0" />
<meta name="annotations-ids" content="translation1, translation2" id="meta1" />
<meta name="annotations-labels" content="Algonquin, English" id="meta2" />
<meta name="annotations-labels-eng" content="Algonquin, English" id="meta3" />
<meta name="annotations-labels-fra" content="algonquin, anglais" id="meta4" />
<text xml:lang="fra" id="t0">
<body id="t0b0">
<div type="page" id="t0b0d0" class="two-column-layout-page">
<graphic url="avatar.png" id="t0b0d0graphic0" />
<p id="t0b0d0p0">
<s id="t0b0d0p0s0"><w id="t0b0d0p0s0w0" time="0.455" dur="1.165">Bonjour</w>.</s>
<s do-not-align="true" xml:lang="otw" id="t0b0d0p0s0an01" annotation-id="translation1" sentence-id="t0b0d0p0s0">
Kwei.</s>
<s do-not-align="true" xml:lang="eng" id="t0b0d0p0s0an02" annotation-id="translation2"
sentence-id="t0b0d0p0s0">
Hello.</s>
<s id="t0b0d0p0s1"><w id="t0b0d0p0s1w0" time="1.620" dur="0.070">Je</w> <w
id="t0b0d0p0s1w1" time="1.690" dur="0.070">m</w>'<w id="t0b0d0p0s1w2" time="1.760"
dur="0.240">appelle</w> <w id="t0b0d0p0s1w3" time="2.000" dur="1.705">Éric</w> <w
id="t0b0d0p0s1w4" time="3.705" dur="1.905">Joanis</w>.</s>
<s do-not-align="true" xml:lang="otw" id="t0b0d0p0s1an01" annotation-id="translation1"
sentence-id="t0b0d0p0s1">Éric
Joanis nindijinikàz.</s>
<s do-not-align="true" xml:lang="eng" id="t0b0d0p0s1an02" annotation-id="translation2"
sentence-id="t0b0d0p0s1">My
name is Éric Joanis.</s>
</p>
</div>

</body>
</text>
</read-along>
34 changes: 34 additions & 0 deletions test/data/ras-dtd-1.2.readalong
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.2">
<meta name="generator" content="human made" id="meta0" />
<meta name="annotations-ids" content="translation1, translation2" id="meta1" />
<meta name="annotations-labels" content="Algonquin, English" id="meta2" />
<meta name="annotations-labels-eng" content="Algonquin, English" id="meta3" />
<meta name="annotations-labels-fra" content="algonquin, anglais" id="meta4" />
<text xml:lang="fra" id="t0">
<body id="t0b0">
<div type="page" id="t0b0d0" class="two-column-layout-page">
<graphic url="avatar.png" id="t0b0d0graphic0" />
<p id="t0b0d0p0">
<s id="t0b0d0p0s0"><w id="t0b0d0p0s0w0" time="0.455" dur="1.165">Bonjour</w>.</s>
<s do-not-align="true" xml:lang="otw" id="t0b0d0p0s0an01" annotation-id="translation1" sentence-id="t0b0d0p0s0">
Kwei.</s>
<s do-not-align="true" xml:lang="eng" id="t0b0d0p0s0an02" annotation-id="translation2"
sentence-id="t0b0d0p0s0">
Hello.</s>
<s id="t0b0d0p0s1"><w id="t0b0d0p0s1w0" time="1.620" dur="0.070">Je</w> <w
id="t0b0d0p0s1w1" time="1.690" dur="0.070">m</w>'<w id="t0b0d0p0s1w2" time="1.760"
dur="0.240">appelle</w> <w id="t0b0d0p0s1w3" time="2.000" dur="1.705">Éric</w> <w
id="t0b0d0p0s1w4" time="3.705" dur="1.905">Joanis</w>.</s>
<s do-not-align="true" xml:lang="otw" id="t0b0d0p0s1an01" annotation-id="translation1"
sentence-id="t0b0d0p0s1">Éric
Joanis nindijinikàz.</s>
<s do-not-align="true" xml:lang="eng" id="t0b0d0p0s1an02" annotation-id="translation2"
sentence-id="t0b0d0p0s1">My
name is Éric Joanis.</s>
</p>
</div>

</body>
</text>
</read-along>
9 changes: 7 additions & 2 deletions test/test_dtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
from readalongs.text.util import load_xml

DTDPATH = os.path.join(
dirname(__file__), "..", "readalongs", "static", "read-along-1.1.dtd"
dirname(__file__), "..", "readalongs", "static", "read-along-1.2.dtd"
)

VALID_RAS = """
ej-fra-anchors2.readalong
ej-fra-anchors.readalong
ej-fra-annotated.readalong
ej-fra-converted.readalong
ej-fra-dna.readalong
ej-fra-package.readalong
Expand Down Expand Up @@ -70,7 +71,11 @@ def test_invalid_inputs(self):

def test_backwards_compatibility(self):
# the DTD needs to be backwards compatible as long as the major version does not change
versions = ["ras-dtd-1.0.readalong", "ras-dtd-1.1.readalong"]
versions = [
"ras-dtd-1.0.readalong",
"ras-dtd-1.1.readalong",
"ras-dtd-1.2.readalong",
]
for name in versions:
path = os.path.join(dirname(__file__), "data", name.strip())
# DTD is text, XML is binary... okay
Expand Down
Loading