recover missing files

malteos · Jul 18, 2024 · 6b38f9b · 6b38f9b
1 parent 8e1e5d3
commit 6b38f9b
Show file tree

Hide file tree

Showing 3 changed files with 123 additions and 0 deletions.
diff --git a/src/llm_datasets/datasets/ga/ga_bilingual_legistation.py b/src/llm_datasets/datasets/ga/ga_bilingual_legistation.py
@@ -0,0 +1,41 @@
+import logging
+
+from llm_datasets.datasets.base import (
+    MB,
+    Availability,
+    BaseDataset,
+    License,
+    QualityWarning,
+)
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+class GABilingualLegislationDataset(BaseDataset):
+    """Irish-language side of the Gaois English-Irish legislation corpus (TMX).
+
+    Quality warning: the corpus consists of individual sentences only, not
+    of full documents.
+    """
+
+    DATASET_ID = "ga_bilingual_legistation"
+    TITLE = "The Gaois bilingual corpus of English-Irish legislation (Irish legislation)"
+    HOMEPAGE = "https://portulanclarin.net/repository/browse/the-gaois-bilingual-corpus-of-english-irish-legislation-processed/daeac17c9e3511ea9b7f02420a000407b83de243dc0b469aab41084386c5b80f/"  # noqa
+    DESCRIPTION = "Bilingual corpus of English-Irish legislation provided by the Department of Justice."
+    LICENSE = License("Open Under - PSI", url="https://elrc-share.eu/terms/openUnderPSI.html")
+    LANGUAGES = ["ga"]
+
+    # NOTE(review): the attribute is spelled "AVAILIBILITY" -- confirm this is
+    # the exact name the base class reads before "correcting" it.
+    AVAILIBILITY = Availability.SIGNIN_DOWNLOAD
+    QUALITY_WARNINGS = [QualityWarning.SHORT_TEXT]
+    # No direct download URLs are listed for this dataset.
+    DOWNLOAD_URLS = []
+
+    USED_BY = ["gaBERT"]
+
+    # Size estimate in bytes; presumably half of a ~25 MB bilingual file -- TODO confirm.
+    BYTES = 0.5 * 25 * MB
+
+    def get_texts(self):
+        """Yield the source segment of every translation unit in the TMX file.
+
+        The dataset's single ``.tmx`` file is opened in binary mode and handed
+        to Translate Toolkit's ``tmxfile`` with ``"ga"`` and ``"en"`` as the
+        language arguments. Only ``unit.source`` (the Irish text) is yielded;
+        the English side (``unit.target``) is ignored.
+        """
+        # Imported lazily so the dependency is only needed when texts are read.
+        from translate.storage.tmx import tmxfile
+
+        tmx_path = self.get_dataset_file_paths(single_file=True, needed_suffix=".tmx")
+        with open(tmx_path, "rb") as tmx_handle:
+            corpus = tmxfile(tmx_handle, "ga", "en")
+
+        for unit in corpus.unit_iter():
+            yield unit.source
diff --git a/src/llm_datasets/datasets/lt/seimas_lt_en.py b/src/llm_datasets/datasets/lt/seimas_lt_en.py
@@ -0,0 +1,37 @@
+import zipfile
+
+from llm_datasets.datasets.base import BaseDataset, Genre, License
+
+
+class SeismasLTENDataset(BaseDataset):
+    """Lithuanian side of the English-Lithuanian parallel corpus from the Seimas website.
+
+    The raw data is a zip archive containing TMX file(s); only the Lithuanian
+    segment of each translation unit is used.
+    """
+
+    DATASET_ID = "seimas_lt_en"
+    TITLE = "Bilingual English-Lithuanian parallel corpus from Seimas of the Republic of Lithuania website"
+    HOMEPAGE = "https://live.european-language-grid.eu/catalogue/corpus/3009/download/"
+    DESCRIPTION = (
+        "Contents of http://www.lrs.lt were crawled, aligned on document and sentence level and converted into a"
+        " parallel corpus."
+    )
+    DOWNLOAD_URLS = [
+        "https://elrc-share.eu/repository/download/4486f8e4e72711e7b7d400155d0267060b3d0987d08b43fd9c065ce3f05f99f8"
+    ]
+    LANGUAGES = ["lt"]
+    GENRES = [Genre.GOVERNMENT]
+    LICENSE = License("Open under PSI", url="https://elrc-share.eu/terms/openUnderPSI.html")
+    # Size estimate in bytes (160 KiB).
+    BYTES = 160 * 1024
+
+    def get_texts(self):
+        """Yield the Lithuanian segment of every unit in each TMX member of the zip.
+
+        Archive members whose name does not end in ``.tmx`` are skipped
+        entirely. Previously the unit loop ran for every member, which raised
+        ``NameError`` when a non-TMX member came first and re-yielded the
+        previous TMX file's units for every later non-TMX member.
+        """
+        # Imported lazily so the dependency is only needed when texts are read.
+        from translate.storage.tmx import tmxfile
+
+        zip_fp = self.get_dataset_file_paths(needed_suffix=".zip", single_file=True)
+
+        with zipfile.ZipFile(zip_fp) as zf:
+            for fn in zf.namelist():
+                if not fn.endswith(".tmx"):
+                    # Only TMX members carry translation units.
+                    continue
+
+                with zf.open(fn) as member_f:
+                    tmx_file = tmxfile(member_f, "lt", "en")
+
+                for node in tmx_file.unit_iter():
+                    # Lithuanian text is in ``target``; English is in ``source``.
+                    yield node.target
diff --git a/src/llm_datasets/datasets/parquet_dataset.py b/src/llm_datasets/datasets/parquet_dataset.py
@@ -0,0 +1,45 @@
+from pathlib import Path
+from typing import Iterable
+
+from llm_datasets.datasets.base import BaseDataset
+
+
+class ParquetDataset(BaseDataset):
+    """Dataset whose texts are read back from its own parquet output files."""
+
+    # Output is not written as one single file.
+    SINGLE_OUTPUT_FILE = False
+
+    def get_texts(self):
+        """Yield texts from the unshuffled output when the output format is parquet.
+
+        Raises:
+            ValueError: if ``self.output_format`` is anything other than ``"parquet"``.
+        """
+        # NOTE(review): this message says the data is already in parquet
+        # format, yet it is raised when output_format is NOT "parquet" --
+        # confirm the intended wording.
+        if self.output_format != "parquet":
+            raise ValueError(
+                "Dataset is already processed and in parquet format; no need for text extraction! Call"
+                " `generate_texts_from_output()` instead."
+            )
+
+        yield from self.generate_texts_from_output(shuffled=False)
+
+    def extract_plaintext(self):
+        """Run the base-class extraction unless the output format is parquet.
+
+        Raises:
+            ValueError: if ``self.output_format`` is ``"parquet"``.
+        """
+        if self.output_format == "parquet":
+            raise ValueError(
+                "Dataset is already in parquet format; no text extraction needed! Call `generate_texts_from_output()`"
+                " instead."
+            )
+
+        # The base-class result is deliberately not returned, as before.
+        super().extract_plaintext()
+
+
+class ShuffledParquetDataset(ParquetDataset):
+    """The raw dataset files are already shuffled."""
+
+    def get_file_name_glob_pattern(self):
+        """Return the glob pattern for the output files; not implemented in this class."""
+        raise NotImplementedError()
+
+    def get_single_output_file_path(self, shuffled=False) -> str:
+        """Always return ``None``; this class reports chunked output files instead."""
+        return None
+
+    def has_chunked_output_files(self, **kwargs):
+        """Always report chunked output files, whatever keyword arguments are given."""
+        return True
+
+    def get_shuffled_output_file_path(self, unshuffled_output_file_path: str) -> str:
+        """Always raise ``ValueError``, because the dataset is already shuffled."""
+        raise ValueError("Dataset is a pre-shuffled dataset.")
+
+    def get_chunked_output_file_paths(self, shuffled=True) -> Iterable[str]:
+        """List the files in the output directory that match the glob pattern.
+
+        The result is a list of ``Path`` objects produced by ``Path.glob``.
+        """
+        # Resolve the directory first, then the pattern, as in the original expression.
+        output_dir = Path(self.get_output_dir(shuffled=shuffled))
+        pattern = self.get_file_name_glob_pattern()
+        return list(output_dir.glob(pattern))