Skip to content

Commit

Permalink
recover missing files
Browse files Browse the repository at this point in the history
  • Loading branch information
malteos committed Jul 18, 2024
1 parent 8e1e5d3 commit 6b38f9b
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 0 deletions.
41 changes: 41 additions & 0 deletions src/llm_datasets/datasets/ga/ga_bilingual_legistation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import logging

from llm_datasets.datasets.base import (
MB,
Availability,
BaseDataset,
License,
QualityWarning,
)

logger = logging.getLogger(__name__)


class GABilingualLegislationDataset(BaseDataset):
    """Gaois bilingual corpus of English-Irish legislation.

    Quality warning: the corpus consists of aligned sentences, not full
    documents, so the yielded texts are short (see QUALITY_WARNINGS).
    """

    # NOTE(review): "legistation" is a typo, but it is the public dataset ID —
    # keep it for backward compatibility with existing configs/outputs.
    DATASET_ID = "ga_bilingual_legistation"
    TITLE = "The Gaois bilingual corpus of English-Irish legislation (Irish legislation)"
    HOMEPAGE = "https://portulanclarin.net/repository/browse/the-gaois-bilingual-corpus-of-english-irish-legislation-processed/daeac17c9e3511ea9b7f02420a000407b83de243dc0b469aab41084386c5b80f/"  # noqa
    DESCRIPTION = "Bilingual corpus of English-Irish legislation provided by the Department of Justice."
    LICENSE = License("Open Under - PSI", url="https://elrc-share.eu/terms/openUnderPSI.html")
    LANGUAGES = ["ga"]

    # (sic) attribute name follows the BaseDataset convention — do not rename here.
    AVAILIBILITY = Availability.SIGNIN_DOWNLOAD
    QUALITY_WARNINGS = [QualityWarning.SHORT_TEXT]
    DOWNLOAD_URLS = []  # sign-in download only; no direct URLs available

    USED_BY = ["gaBERT"]

    BYTES = 0.5 * 25 * MB  # rough size estimate

    def get_texts(self):
        """Yield the Irish (ga) side of every translation unit in the TMX file."""
        from translate.storage.tmx import tmxfile

        with open(self.get_dataset_file_paths(single_file=True, needed_suffix=".tmx"), "rb") as fin:
            tmx_file = tmxfile(fin, "ga", "en")

        # The English side would be node.target; only Irish text is yielded.
        for node in tmx_file.unit_iter():
            yield node.source  # ga
37 changes: 37 additions & 0 deletions src/llm_datasets/datasets/lt/seimas_lt_en.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import zipfile

from llm_datasets.datasets.base import BaseDataset, Genre, License


class SeismasLTENDataset(BaseDataset):
    """Bilingual English-Lithuanian parallel corpus crawled from the Seimas website.

    NOTE(review): the class name spells "Seismas" while the dataset is
    "seimas_lt_en" — the name is part of the public interface, so it is
    kept unchanged here.
    """

    DATASET_ID = "seimas_lt_en"
    TITLE = "Bilingual English-Lithuanian parallel corpus from Seimas of the Republic of Lithuania website"
    HOMEPAGE = "https://live.european-language-grid.eu/catalogue/corpus/3009/download/"
    DESCRIPTION = (
        "Contents of http://www.lrs.lt were crawled, aligned on document and sentence level and converted into a"
        " parallel corpus."
    )
    DOWNLOAD_URLS = [
        "https://elrc-share.eu/repository/download/4486f8e4e72711e7b7d400155d0267060b3d0987d08b43fd9c065ce3f05f99f8"
    ]
    LANGUAGES = ["lt"]
    GENRES = [Genre.GOVERNMENT]
    LICENSE = License("Open under PSI", url="https://elrc-share.eu/terms/openUnderPSI.html")
    BYTES = 160 * 1024  # rough size estimate

    def get_texts(self):
        """Yield the Lithuanian (lt) side of every translation unit in the zipped TMX file(s)."""
        from translate.storage.tmx import tmxfile

        zip_fp = self.get_dataset_file_paths(needed_suffix=".zip", single_file=True)

        with zipfile.ZipFile(zip_fp) as zf:
            for fn in zf.namelist():
                if fn.endswith(".tmx"):
                    with zf.open(fn) as member_f:
                        tmx_file = tmxfile(member_f, "lt", "en")

                    # The English side would be node.source; only Lithuanian is yielded.
                    for node in tmx_file.unit_iter():
                        yield node.target  # lt
45 changes: 45 additions & 0 deletions src/llm_datasets/datasets/parquet_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from pathlib import Path
from typing import Iterable

from llm_datasets.datasets.base import BaseDataset


class ParquetDataset(BaseDataset):
    """Dataset whose raw files are already parquet: texts are read back from the
    processed output rather than extracted from a raw format."""

    SINGLE_OUTPUT_FILE = False

    def get_texts(self):
        """Yield texts by delegating to the parquet output reader.

        Raises:
            ValueError: if the configured output format is not parquet, since
                this dataset class can only serve texts from parquet output.
        """
        if self.output_format == "parquet":
            yield from self.generate_texts_from_output(shuffled=False)
        else:
            # BUGFIX(review): the original message wrongly claimed the dataset
            # "is already in parquet format" on this branch, which fires only
            # when the output format is NOT parquet.
            raise ValueError(
                "ParquetDataset can only serve texts from parquet output, but"
                f" the configured output format is {self.output_format!r}."
            )

    def extract_plaintext(self):
        """Run plaintext extraction unless the output is already parquet.

        Raises:
            ValueError: if the output format is parquet — no extraction is
                needed; call `generate_texts_from_output()` instead.
        """
        if self.output_format == "parquet":
            raise ValueError(
                "Dataset is already in parquet format; no text extraction needed! Call `generate_texts_from_output()`"
                " instead."
            )
        else:
            super().extract_plaintext()


class ShuffledParquetDataset(ParquetDataset):
    """The raw dataset files are already shuffled."""

    def get_file_name_glob_pattern(self):
        """Glob pattern matching this dataset's chunk files; must be provided by subclasses."""
        raise NotImplementedError()

    def get_single_output_file_path(self, shuffled=False) -> "str | None":
        # Chunked output only (SINGLE_OUTPUT_FILE = False) — there is never a
        # single output file, so always None.
        return None

    def has_chunked_output_files(self, **kwargs):
        """Output is always chunked for this dataset class."""
        return True

    def get_shuffled_output_file_path(self, unshuffled_output_file_path: str) -> str:
        # Shuffling is a no-op: the raw files are pre-shuffled, so asking for a
        # shuffled counterpart of an unshuffled file is an error.
        raise ValueError("Dataset is a pre-shuffled dataset.")

    def get_chunked_output_file_paths(self, shuffled=True) -> Iterable[str]:
        # NOTE(review): glob() yields Path objects, not str, despite the
        # annotation — presumably callers tolerate Path; confirm upstream.
        return list(Path(self.get_output_dir(shuffled=shuffled)).glob(self.get_file_name_glob_pattern()))

0 comments on commit 6b38f9b

Please sign in to comment.