-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
123 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import logging | ||
|
||
from llm_datasets.datasets.base import ( | ||
MB, | ||
Availability, | ||
BaseDataset, | ||
License, | ||
QualityWarning, | ||
) | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class GABilingualLegislationDataset(BaseDataset):
    """Irish ("ga") side of the Gaois bilingual English-Irish legislation corpus.

    Quality warning: the corpus provides aligned sentences only, not whole
    documents, so every yielded text is short.
    """

    # NOTE(review): "legistation" looks like a typo for "legislation"; it is left
    # unchanged because the ID is an external identifier other code may rely on.
    DATASET_ID = "ga_bilingual_legistation"
    TITLE = "The Gaois bilingual corpus of English-Irish legislation (Irish legislation)"
    HOMEPAGE = "https://portulanclarin.net/repository/browse/the-gaois-bilingual-corpus-of-english-irish-legislation-processed/daeac17c9e3511ea9b7f02420a000407b83de243dc0b469aab41084386c5b80f/"  # noqa
    DESCRIPTION = "Bilingual corpus of English-Irish legislation provided by the Department of Justice."
    LICENSE = License("Open Under - PSI", url="https://elrc-share.eu/terms/openUnderPSI.html")
    LANGUAGES = ["ga"]

    # NOTE(review): spelling "AVAILIBILITY" presumably matches the attribute name
    # expected by BaseDataset - confirm before renaming.
    AVAILIBILITY = Availability.SIGNIN_DOWNLOAD
    QUALITY_WARNINGS = [QualityWarning.SHORT_TEXT]
    # No direct download URLs are listed for this dataset.
    DOWNLOAD_URLS = []

    USED_BY = ["gaBERT"]

    # Presumably half of a ~25 MB bilingual file (only the Irish side is used) - TODO confirm.
    BYTES = 0.5 * 25 * MB

    def get_texts(self):
        """Yield the source-side text of every translation unit in the TMX file.

        The single ``.tmx`` file of the dataset is opened in binary mode and
        handed to the Translate Toolkit TMX reader; ``node.source`` is treated
        as the Irish text (the English text would be ``node.target``).
        """
        # Imported lazily so the optional dependency is only needed when used.
        from translate.storage.tmx import tmxfile

        tmx_path = self.get_dataset_file_paths(single_file=True, needed_suffix=".tmx")

        with open(tmx_path, "rb") as fin:
            tmx_file = tmxfile(fin, "ga", "en")

            # The unit index is not needed, so iterate the units directly.
            for node in tmx_file.unit_iter():
                yield node.source  # ga; en => node.target
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import zipfile | ||
|
||
from llm_datasets.datasets.base import BaseDataset, Genre, License | ||
|
||
|
||
class SeismasLTENDataset(BaseDataset):
    """Lithuanian ("lt") side of the English-Lithuanian Seimas parallel corpus.

    NOTE(review): the class name spells "Seismas" while DATASET_ID and TITLE
    spell "Seimas"; the name is left unchanged because callers may import it.
    """

    DATASET_ID = "seimas_lt_en"
    TITLE = "Bilingual English-Lithuanian parallel corpus from Seimas of the Republic of Lithuania website"
    HOMEPAGE = "https://live.european-language-grid.eu/catalogue/corpus/3009/download/"
    DESCRIPTION = (
        "Contents of http://www.lrs.lt were crawled, aligned on document and sentence level and converted into a"
        " parallel corpus."
    )
    DOWNLOAD_URLS = [
        "https://elrc-share.eu/repository/download/4486f8e4e72711e7b7d400155d0267060b3d0987d08b43fd9c065ce3f05f99f8"
    ]
    LANGUAGES = ["lt"]
    GENRES = [Genre.GOVERNMENT]
    LICENSE = License("Open under PSI", url="https://elrc-share.eu/terms/openUnderPSI.html")
    BYTES = 160 * 1024

    def get_texts(self):
        """Yield the target-side text of every unit in each TMX file of the archive.

        The single ``.zip`` file of the dataset is opened and every member whose
        name ends with ``.tmx`` is parsed with the Translate Toolkit TMX reader.
        ``node.target`` is treated as the Lithuanian text (the English text would
        be ``node.source``).
        """
        # Imported lazily so the optional dependency is only needed when used.
        from translate.storage.tmx import tmxfile

        zip_fp = self.get_dataset_file_paths(needed_suffix=".zip", single_file=True)

        with zipfile.ZipFile(zip_fp) as zf:
            for member_name in zf.namelist():
                # Only TMX members carry translation units; skip everything else.
                if not member_name.endswith(".tmx"):
                    continue

                with zf.open(member_name) as member_f:
                    # NOTE(review): the reader is constructed with "lt" as the source
                    # language, yet the Lithuanian text is read from `target` below -
                    # confirm against the actual segment order in the TMX file.
                    tmx_file = tmxfile(member_f, "lt", "en")

                    # The unit index is not needed, so iterate the units directly.
                    for node in tmx_file.unit_iter():
                        yield node.target  # lt; en => node.source
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
from pathlib import Path | ||
from typing import Iterable | ||
|
||
from llm_datasets.datasets.base import BaseDataset | ||
|
||
|
||
class ParquetDataset(BaseDataset):
    """Dataset whose texts are read back from its parquet output files."""

    SINGLE_OUTPUT_FILE = False

    def get_texts(self):
        """Yield texts from the unshuffled output when the output format is parquet.

        Raises:
            ValueError: if ``self.output_format`` is anything other than
                ``"parquet"`` (raised when the generator is first advanced).
        """
        if self.output_format != "parquet":
            # NOTE(review): this branch runs when the output format is NOT parquet,
            # yet the message says the dataset is already in parquet format -
            # confirm the intended wording.
            raise ValueError(
                "Dataset is already processed and in parquet format; no need for text extraction! Call"
                " `generate_texts_from_output()` instead."
            )

        yield from self.generate_texts_from_output(shuffled=False)

    def extract_plaintext(self):
        """Delegate to the base-class extraction unless the output format is parquet.

        Raises:
            ValueError: if ``self.output_format`` is ``"parquet"``.
        """
        if self.output_format == "parquet":
            raise ValueError(
                "Dataset is already in parquet format; no text extraction needed! Call `generate_texts_from_output()`"
                " instead."
            )

        super().extract_plaintext()
|
||
|
||
class ShuffledParquetDataset(ParquetDataset):
    """The raw dataset files are already shuffled."""

    def get_file_name_glob_pattern(self):
        """Return the glob pattern of the output files; not implemented in this class."""
        raise NotImplementedError()

    def get_single_output_file_path(self, shuffled=False) -> str:
        """Return ``None``: this class never reports a single output file path."""
        return None

    def has_chunked_output_files(self, **kwargs):
        """Always report that the output is chunked, whatever keyword arguments are given."""
        return True

    def get_shuffled_output_file_path(self, unshuffled_output_file_path: str) -> str:
        """Always raise, because the dataset is declared to be pre-shuffled.

        Raises:
            ValueError: unconditionally.
        """
        raise ValueError("Dataset is a pre-shuffled dataset.")

    def get_chunked_output_file_paths(self, shuffled=True) -> Iterable[str]:
        """List the files in the output directory that match the file-name glob pattern.

        The returned list holds the ``Path`` objects produced by ``Path.glob``.
        """
        output_dir = Path(self.get_output_dir(shuffled=shuffled))
        pattern = self.get_file_name_glob_pattern()
        matches = output_dir.glob(pattern)
        return list(matches)