Skip to content

Commit

Permalink
feat(omr search): create non-existent folios when indexing manuscript…
Browse files Browse the repository at this point in the history
… mei

When chants end on a folio where no other chants begin, the folio
does not exist in CantusDB but does have an MEI file. For example, see
folio A14r in Salzinnes. Here, we modify the index_manuscript_mei
command to create a folio in such cases (we check that it's a "real"
folio by making sure the MEI file follows a naming convention and then
create the folio if it doesn't exist). The user is alerted to this, and
they must manually add the image_uri to the folio (either through the admin
panel or the map folios process) and then reindex the MEI. This is detailed
in issue #891. This is convoluted, but given that we're going to change the
structure of the CU database soon so that it is more closely coupled with
CantusDB, we'll figure out a more permanent solution (solving #891) then.
  • Loading branch information
dchiller committed Aug 15, 2024
1 parent 833d334 commit 711822e
Show file tree
Hide file tree
Showing 3 changed files with 1,187 additions and 6 deletions.
29 changes: 24 additions & 5 deletions app/public/cantusdata/management/commands/index_manuscript_mei.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,30 @@
from typing import Any, Dict
from os import path, listdir
import re

from django.core.management.base import BaseCommand, CommandParser
from django.conf import settings
from solr.core import SolrConnection # type: ignore

from cantusdata.helpers.mei_processing.mei_tokenizer import MEITokenizer
from cantusdata.models.folio import Folio

from solr.core import SolrConnection # type: ignore

MEI4_DIR = path.join("/code", "production-mei-files")
FOLIO_NUMBER_REGEX = re.compile(r"[a-zA-Z]?\d+[a-z]?")


class Command(BaseCommand):
help = (
"This command indexes the contents of MEI files in Solr, using"
"the MEITokenizer class to extract n-grams from the MEI files."
"Files must be named in the format [some string]_[folio number].mei."
"Files must be named in the format [some string]_[folio number].mei,"
"where [folio number] is an optional single letter followed by "
"some number of digits followed by an optional"
"lowercase single letter. The command currently has a workaround for folios "
"that have MEI files but are NOT in CantusDB. See #891 for details "
"about how to handle this case -- the command will alert the user "
"when it encounters this case."
)

def add_arguments(self, parser: CommandParser) -> None:
Expand Down Expand Up @@ -74,18 +83,27 @@ def handle(self, *args: Any, **options: Any) -> None:
raise ValueError(f"No folios found for manuscript {manuscript_id}.")
manuscript_mei_path = path.join(options["mei_dir"], str(manuscript_id))
if not path.exists(manuscript_mei_path):
raise FileNotFoundError(f"--mei-dir path does not exist.")
raise FileNotFoundError("--mei-dir path does not exist.")
manuscript_mei_files = [
f for f in listdir(manuscript_mei_path) if f.endswith(".mei")
]
if len(manuscript_mei_files) == 0:
raise FileNotFoundError(f"No MEI files found in {manuscript_mei_path}.")
for mei_file in manuscript_mei_files:
folio_number: str = mei_file.split("_")[-1].split(".")[0]
if not folio_number in folio_map:
if not FOLIO_NUMBER_REGEX.match(folio_number):
raise ValueError(
f"Folio number {folio_number} in MEI file {mei_file} does not exist in the database."
f"MEI file {mei_file} does not match the expected format."
)
if not folio_number in folio_map:
self.stdout.write(
self.style.WARNING(
f"Folio number {folio_number} in MEI file "
f"{mei_file} did not exist in the database. Creating record. "
"See #891 for details on how to handle this case."
)
)
Folio.objects.create(manuscript_id=manuscript_id, number=folio_number)
tokenizer = MEITokenizer(
path.join(manuscript_mei_path, mei_file),
min_ngram=options["min_ngram"],
Expand All @@ -98,6 +116,7 @@ def handle(self, *args: Any, **options: Any) -> None:
doc["image_uri"] = folio_map.get(folio_number, "")
solr_conn.add_many(ngram_docs)
solr_conn.commit()
return None

def flush_manuscript_ngrams_from_index(
self, solr_conn: SolrConnection, manuscript_id: int
Expand Down
Loading

0 comments on commit 711822e

Please sign in to comment.