diff --git a/.gitmodules b/.gitmodules index 0b9b4e4f..5d0eff03 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "app/cantus-staticpages"] path = app/cantus-staticpages url = https://github.com/DDMAL/cantus-staticpages +[submodule "app/production-mei-files"] + path = app/production-mei-files + url = https://github.com/DDMAL/production_mei_files.git diff --git a/app/Dockerfile b/app/Dockerfile index 732cf3f5..5c65912a 100644 --- a/app/Dockerfile +++ b/app/Dockerfile @@ -10,6 +10,7 @@ RUN chmod u+x /code/install-packages.sh && \ FROM python:3.12.3 COPY app/django-config.sh /code/ COPY app/cantus-staticpages/ /code/cantus-staticpages/ +COPY app/production-mei-files/ /code/production-mei-files/ EXPOSE 8001 RUN chmod u+x /code/django-config.sh diff --git a/app/production-mei-files b/app/production-mei-files new file mode 160000 index 00000000..82589a8e --- /dev/null +++ b/app/production-mei-files @@ -0,0 +1 @@ +Subproject commit 82589a8e6adfc6f72e84ba7a68d0557bed7eb9f9 diff --git a/app/public/cantusdata/management/commands/import_mei_data.py b/app/public/cantusdata/management/commands/import_mei_data.py deleted file mode 100644 index a5809af9..00000000 --- a/app/public/cantusdata/management/commands/import_mei_data.py +++ /dev/null @@ -1,205 +0,0 @@ -import os -import subprocess -import csv -import time - -import progressbar - -from django.core.management.base import BaseCommand -from django.conf import settings - - -UPLOAD_POLL_WAIT_SECS = 0.25 -UPLOAD_PROGRESS_STEP = 5 - - -class Command(BaseCommand): - args = "mode manuscript" - - def handle(self, *args, **kwargs): - if args and args[0] and args[1]: - mode = args[0] - manuscript = args[1] - else: - raise Exception( - "Please provide arguments for processing" " mode and manuscript name." 
- ) - - # Make sure we're working with the right manuscript - if manuscript == "salzinnes": - self.stdout.write("Salzinnes manuscript selected.") - siglum = "cdn-hsmu-m2149l4" - id = 133 - mei_location = "data_dumps/mei/salz" - csv_location = "data_dumps/mei_csv/salzinnes.csv" - elif manuscript == "st_gallen_390": - self.stdout.write("St. Gallen 390 manuscript selected.") - siglum = "ch-sgs-390" - id = 127 - mei_location = "data_dumps/mei/csg-390" - csv_location = "data_dumps/mei_csv/csg-390.csv" - elif manuscript == "st_gallen_391": - self.stdout.write("St. Gallen 391 manuscript selected.") - siglum = "ch-sgs-391" - id = 128 - mei_location = "data_dumps/mei/csg-391" - csv_location = "data_dumps/mei_csv/csg-391.csv" - else: - raise Exception("Please provide manuscript name!") - - if mode == "mei_to_csv": - self.stdout.write("Dumping MEI to CSV.") - dump_to_csv(mei_location, siglum, id, csv_location) - self.stdout.write("MEI dumped to CSV.") - - elif mode == "mei_to_solr": - dump_to_csv(mei_location, siglum, id, csv_location) - upload_to_solr(csv_location) - - elif mode == "csv_to_solr": - upload_to_solr(csv_location) - - else: - raise Exception("Please provide mode!") - - -def dump_to_csv(mei_location, siglum, id, path): - """ - Dump the data to a CSV file. 
- - :param mei_location: - :param siglum: - :param path: - :return: - """ - # Maintain a stable heading order for Salzinnes-style CSV so that it's possible to run word-by-word - # diffs on the output - heading_order = { - h: i - for (i, h) in enumerate( - ( - "folio", - "image_uri", - "pnames", - "neumes", - "siglum_slug", - "intervals", - "id", - "semitones", - "contour", - "project", - "location", - "type", - ) - ) - } - - with open(path, "wb") as csv_file: - writer = None - - files, pages = convert_mei(mei_location, siglum, id) - - prog_widgets = [ - "Parsing: ", - progressbar.Percentage(), - " ", - progressbar.Bar(), - " ", - progressbar.ETA(), - ] - prog_bar = progressbar.ProgressBar(widgets=prog_widgets, maxval=len(files)) - prog_bar.start() - - for page_idx, (file_name, page) in enumerate(pages): - for row in page: - if writer is None: - # We can only initialize the header once we have the first row - - # FIXME(wabain): This assumes that the first row will contain all the - # fields we're interested in, but that's not necessarily the case. - # - # If the assumption breaks we'll get a ValueError: dict contains fields - # not in fieldname. - headings = list( - sorted( - list(row.keys()), - key=lambda h: heading_order.get(h, -1), - ) - ) - - writer = csv.DictWriter(csv_file, headings) - writer.writeheader() - - writer.writerow(row) - - prog_bar.update(page_idx) - - prog_bar.finish() - - -def convert_mei(mei_location, siglum, id): - return get_converter(siglum).convert(mei_location, siglum, id) - - -def get_converter(siglum): - from cantusdata.helpers.mei_conversion import ( - MEIConverter, - StGallenMEIConverter, - ) - - if siglum == "ch-sgs-390" or siglum == "ch-sgs-391": - return StGallenMEIConverter - - return MEIConverter - - -def upload_to_solr(filename): - """Commit a CSV file to Solr using a stream""" - - prog_widgets = [ - "Uploading... 
from typing import Any, Dict
from os import path, listdir

from django.core.management.base import BaseCommand, CommandParser
from django.conf import settings
from cantusdata.helpers.mei_processing.mei_tokenizer import MEITokenizer
from cantusdata.models.folio import Folio

from solr.core import SolrConnection  # type: ignore

MEI4_DIR = path.join("/code", "production-mei-files")


class Command(BaseCommand):
    """
    Index the n-grams extracted from a manuscript's MEI files in Solr.

    MEI files are read from ``<mei-dir>/<manuscript_id>/`` and must be named
    ``[some string]_[folio number].mei``; the folio number taken from the
    file name must match the ``number`` of a ``Folio`` of that manuscript.
    """

    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment that is followed by another one must end with a space.
    help = (
        "This command indexes the contents of MEI files in Solr, using "
        "the MEITokenizer class to extract n-grams from the MEI files. "
        "Files must be named in the format [some string]_[folio number].mei."
    )

    def add_arguments(self, parser: CommandParser) -> None:
        """Register the positional manuscript ID and the optional flags."""
        parser.add_argument(
            "manuscript_id",
            type=int,
            nargs=1,
            help=(
                "The ID of the manuscript whose MEI data should be indexed. "
                "Must have MEI files in a subdirectory of the --mei-dir argument "
                "named with this ID."
            ),
        )
        parser.add_argument(
            "--mei-dir",
            type=str,
            default=MEI4_DIR,
            help=(
                "The directory containing the MEI files to be indexed. "
                "Defaults to '/code/production-mei-files'."
            ),
        )
        parser.add_argument(
            "--min-ngram",
            type=int,
            default=1,
            help="The minimum n-gram length to index from the MEI files.",
        )
        parser.add_argument(
            "--max-ngram",
            type=int,
            default=5,
            help="The maximum n-gram length to index from the MEI files.",
        )
        parser.add_argument(
            "--flush-index",
            action="store_true",
            help=(
                "If this flag is set, the command will delete all existing OMR "
                "documents for the specified manuscript."
            ),
        )

    @staticmethod
    def _folio_number_from_filename(mei_file: str) -> str:
        """Return the text between the last "_" and the following "." of a file name."""
        return mei_file.split("_")[-1].split(".")[0]

    def handle(self, *args: Any, **options: Any) -> None:
        """
        Index (or, with ``--flush-index``, delete) a manuscript's n-grams.

        Raises:
            ValueError: if the manuscript has no folios in the database, or
                if an MEI file name yields a folio number that is not one of
                the manuscript's folios.
            FileNotFoundError: if ``<mei-dir>/<manuscript_id>`` is not an
                existing directory, or contains no ``.mei`` files.
        """
        solr_conn = SolrConnection(settings.SOLR_SERVER)
        manuscript_id = options["manuscript_id"][0]
        if options.get("flush_index"):
            # Flushing is a stand-alone operation: nothing is indexed afterwards.
            self.flush_manuscript_ngrams_from_index(solr_conn, manuscript_id)
            return None
        folio_map: Dict[str, str] = dict(
            Folio.objects.filter(manuscript_id=manuscript_id).values_list(
                "number", "image_uri"
            )
        )
        if not folio_map:
            raise ValueError(f"No folios found for manuscript {manuscript_id}.")
        manuscript_mei_path = path.join(options["mei_dir"], str(manuscript_id))
        # isdir (rather than exists) so that a stray regular file with the
        # manuscript's ID is reported here instead of failing inside listdir.
        if not path.isdir(manuscript_mei_path):
            raise FileNotFoundError(
                f"MEI directory {manuscript_mei_path} does not exist "
                "or is not a directory."
            )
        # Sorted so that files are processed in a deterministic order.
        manuscript_mei_files = sorted(
            f for f in listdir(manuscript_mei_path) if f.endswith(".mei")
        )
        if not manuscript_mei_files:
            raise FileNotFoundError(f"No MEI files found in {manuscript_mei_path}.")
        # Validate every file name before anything is sent to Solr, so that a
        # badly named file cannot leave a partially indexed manuscript behind.
        folio_number_by_file: Dict[str, str] = {}
        for mei_file in manuscript_mei_files:
            folio_number = self._folio_number_from_filename(mei_file)
            if folio_number not in folio_map:
                raise ValueError(
                    f"Folio number {folio_number} in MEI file {mei_file} does not exist in the database."
                )
            folio_number_by_file[mei_file] = folio_number
        for mei_file, folio_number in folio_number_by_file.items():
            tokenizer = MEITokenizer(
                path.join(manuscript_mei_path, mei_file),
                min_ngram=options["min_ngram"],
                max_ngram=options["max_ngram"],
            )
            ngram_docs = tokenizer.create_ngram_documents()
            # Attach the manuscript/folio context to every n-gram document.
            for doc in ngram_docs:
                doc["manuscript_id"] = manuscript_id
                doc["folio"] = folio_number
                doc["image_uri"] = folio_map.get(folio_number, "")
            solr_conn.add_many(ngram_docs)
        # A single commit once every file has been added.
        solr_conn.commit()

    def flush_manuscript_ngrams_from_index(
        self, solr_conn: SolrConnection, manuscript_id: int
    ) -> None:
        """
        Deletes all n-gram documents for a given manuscript from the Solr index.
        """
        solr_conn.delete_query(f"type:omr_ngram AND manuscript_id:{manuscript_id}")
        solr_conn.commit()
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + lo + + + + + + con + + + + + + + + + + + + ti + + + nens + + + + + + ven + + + + + + tris + + + + + + sub + + + + + + ar + + + + + + + + ca + + + + + + clau + + + + + + + + + + + + sus + + + + + + est + + + + + + Be + + + + + + + + + + a + + + ta + + + + + + + cae + + + + + + + + + li + + + nun + + + + + + + ti + + + + + + + o + + + + + + fe + + + + + + cun + + + + + + da + + + + + + san + + + + + + + cto + + + + + + + + + spi + + + + + + ri + + + + + + + tu + + + + + + de + + + + + + si + + + + + + de + + + + + + + ra + + + + + + + tus + + + + + + gen + + + + + + + + + + + + ti + + + bus + + + + + + cu + + + + + + jus + + + + + + per + + + + + + + + + al + + + + + + + + vum + + + + + + + + + + + fu + + + sus + + + + + + + est + + + + + + Laus + + + + + + ho + + + + + + + nor + + + + + + + vir + + + + + + tus + + + + + + + + + glo + + + + + + + + + + + ri + + + a + + + + + + de + + + + + + o + + + + + + pa + + + + + + tri + + + + + + + et + + + + + + fi + + + + + + + + + + li + + + + + + o + + + san + + + + + + cto + + + + + + si + + + + + + + mul + + + + + + + + + + pa + + + + + + ra + + + + + + + + + + + + cli + + + to + + + + + + in + + + + + + sem + + + + + + pi + + + + + + + + + ter + + + + + + + na + + + + + + + + + + + sae + + + cu + + + + + + + la + + + + + + a + + + + + + + + men + + + + + + + + + + Ec + + + + + + + + + + + ce + + + + + + + + + + + + + + ve + + + nit + + + + + + + + + + rex + + + + + + Oc + + + + + + + + cur + + + + + + ra + + + + + + + + + + mus + + + + + + + ob + + + + + + + + + + + vi + + + + + + + am + + + + + + sal + + + + + + + + + va + + + + + + + + + + + + to + + + ri + + + + + + nost + + + + + + ro + + + + + + Ve + + + + + + ni + + + + + + te + + + + + + + + Do + + + + + + mi + + + + + + + ne + + + + + + in + + + + + + vir + + + + + + tu + + + + + + te + + + + + + tu + + + + + + a + + + + + + + + + le + + + + + + ta + + + + + + + + + + bi + + + + + + tur + + + + + + rex + + + + + + E + + + + + + u + + + + + + o + + + + 
+ + u + + + + + + a + + + + + + e + + + + + + +
+
+
+ +
+
import os
import tempfile

from django.core.management import call_command
from django.test import TestCase
from django.conf import settings

from cantusdata.models import Manuscript, Folio
from cantusdata.test.core.helpers.mei_processing.test_mei_tokenizer import (
    calculate_expected_total_ngrams,
)

from solr.core import SolrConnection  # type: ignore


TEST_MEI_FILES_PATH = "cantusdata/test/core/helpers/mei_processing/test_mei_files"


class IndexManuscriptMeiTestCase(TestCase):
    """Tests indexing and flushing of MEI n-grams with the test fixtures."""

    solr_conn = SolrConnection(settings.SOLR_TEST_SERVER)

    @classmethod
    def setUpTestData(cls) -> None:
        # Create a manuscript
        manuscript = Manuscript.objects.create(id=123723)
        # Create two folios, matching the two fixture files (001r and 001v)
        Folio.objects.create(number="001r", manuscript=manuscript)
        Folio.objects.create(number="001v", manuscript=manuscript)

    def tearDown(self) -> None:
        # The count assertion below assumes no other n-gram documents are in
        # the index, so do not leave this test's documents behind.
        self.solr_conn.delete_query("type:omr_ngram")
        self.solr_conn.commit()

    def test_index_manuscript_mei(self) -> None:
        call_command(
            "index_manuscript_mei",
            "123723",
            "--min-ngram",
            "1",
            "--max-ngram",
            "5",
            "--mei-dir",
            TEST_MEI_FILES_PATH,
        )
        results = self.solr_conn.query("*:*", fq="type:omr_ngram")
        with self.subTest("Test total number of indexed documents"):
            total_exp_ngrams_001r = calculate_expected_total_ngrams(
                f"{TEST_MEI_FILES_PATH}/123723/cdn-hsmu-m2149l4_001r.mei", 1, 5
            )
            total_exp_ngrams_001v = calculate_expected_total_ngrams(
                f"{TEST_MEI_FILES_PATH}/123723/cdn-hsmu-m2149l4_001v.mei", 1, 5
            )
            self.assertEqual(
                results.numFound, total_exp_ngrams_001r + total_exp_ngrams_001v
            )

    def test_flush_option(self) -> None:
        call_command(
            "index_manuscript_mei",
            "123723",
            "--mei-dir",
            TEST_MEI_FILES_PATH,
        )
        with self.subTest("Check index is not empty before test"):
            results = self.solr_conn.query("*:*", fq="type:omr_ngram")
            self.assertGreater(len(results), 0)

        with self.subTest("Test flush option"):
            call_command("index_manuscript_mei", "123723", "--flush-index")
            results = self.solr_conn.query("*:*", fq="type:omr_ngram")
            self.assertEqual(len(results), 0)


class IndexManuscriptMeiExceptionsTestCase(TestCase):
    """Tests the error paths of the index_manuscript_mei command."""

    @classmethod
    def setUpTestData(cls) -> None:
        # Create a manuscript
        manuscript = Manuscript.objects.create(id=123723)
        # Create two folios
        Folio.objects.create(number="001r", manuscript=manuscript)
        Folio.objects.create(number="001v", manuscript=manuscript)

    def setUp(self) -> None:
        # Build the fixture directories under a temporary directory rather
        # than at the filesystem root; addCleanup removes it even when a
        # later step of setUp fails.
        tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(tmp_dir.cleanup)
        self.missing_mei_dir = os.path.join(tmp_dir.name, "non-existent-dir")
        # A manuscript subdirectory that exists but holds no MEI files, so the
        # command gets past the directory check and reaches the file check.
        self.empty_mei_dir = os.path.join(tmp_dir.name, "empty-mei-dir")
        os.makedirs(os.path.join(self.empty_mei_dir, "123723"))
        # A manuscript subdirectory holding one file whose name does not
        # encode a folio number known to the database.
        self.bad_names_mei_dir = os.path.join(tmp_dir.name, "test-mei-dir")
        bad_names_manuscript_dir = os.path.join(self.bad_names_mei_dir, "123723")
        os.makedirs(bad_names_manuscript_dir)
        with open(
            os.path.join(bad_names_manuscript_dir, "test.mei"), "w", encoding="utf-8"
        ):
            pass

    def test_command_exceptions(self) -> None:
        with self.subTest("Test manuscript_id with no folios"):
            with self.assertRaises(ValueError):
                call_command(
                    "index_manuscript_mei", "123724", "--mei-dir", TEST_MEI_FILES_PATH
                )
        with self.subTest("Test non-existent mei-dir"):
            with self.assertRaises(FileNotFoundError):
                call_command(
                    "index_manuscript_mei",
                    "123723",
                    "--mei-dir",
                    self.missing_mei_dir,
                )
        with self.subTest("Test empty mei-dir"):
            with self.assertRaises(FileNotFoundError):
                call_command(
                    "index_manuscript_mei", "123723", "--mei-dir", self.empty_mei_dir
                )
        with self.subTest("Test improperly named mei files"):
            with self.assertRaises(ValueError):
                call_command(
                    "index_manuscript_mei",
                    "123723",
                    "--mei-dir",
                    self.bad_names_mei_dir,
                )