Skip to content

Commit

Permalink
Merge branch 'lflage-ubertext_2'
Browse files Browse the repository at this point in the history
  • Loading branch information
malteos committed Jul 18, 2024
2 parents d247576 + 125c956 commit aa54c48
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 2 deletions.
10 changes: 8 additions & 2 deletions src/llm_datasets/datasets/dataset_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,13 @@
".ro.marcell_legislative_subcorpus_v2.MarcellLegislativeSubcorpusV2Dataset",
# uk
".uk.uk_laws.UKLawsDataset",
".uk.ubertext_2.UberText2",
]


def get_class_by_import_string(
import_string_or_cls: Union[str, object], relative_base_package: str = "llm_datasets.datasets"
import_string_or_cls: Union[str, object],
relative_base_package: str = "llm_datasets.datasets",
):
"""Import dataset class based on import string
Expand Down Expand Up @@ -253,7 +255,11 @@ def get_registered_dataset_ids(
]


def get_dataset_class_by_id(dataset_id, extra_dataset_registries: Optional[Union[str, List[str]]] = None, **kwargs):
def get_dataset_class_by_id(
dataset_id,
extra_dataset_registries: Optional[Union[str, List[str]]] = None,
**kwargs,
):
id_to_dataset_class = {
cls.DATASET_ID: cls for cls in get_registered_dataset_classes(extra_dataset_registries, **kwargs)
}
Expand Down
68 changes: 68 additions & 0 deletions src/llm_datasets/datasets/uk/ubertext_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import bz2
import logging
import os

from llm_datasets.datasets.base import Availability, BaseDataset

logger = logging.getLogger(__name__)


class UberText2(BaseDataset):
    """Dataset wrapper for the UberText 2.0 corpus of modern Ukrainian.

    The corpus is distributed as one bz2-compressed plain-text file per
    sub-corpus (court, fiction, news, social, wikipedia), listed in
    ``DOWNLOAD_URLS``. ``extract_txt_file`` merges the downloaded archives
    into a single text file inside the local dataset directory, and
    ``get_texts`` streams that merged file line by line.
    """

    DATASET_ID = "ubertext_2"
    TITLE = "UberText2.0"
    # DESCRIPTION =
    HOMEPAGE = "https://lang.org.ua/en/ubertext/"
    # NOTE(review): attribute name is kept exactly as spelled in the original;
    # presumably it matches the field declared on BaseDataset -- confirm
    # before renaming.
    AVAILIBILITY = Availability.DIRECT_DOWNLOAD.name
    DOWNLOAD_URLS = [
        "https://lang.org.ua/static/downloads/ubertext2.0/court/based/ubertext.court.filter_rus_gcld+short.text_only.txt.bz2",
        "https://lang.org.ua/static/downloads/ubertext2.0/fiction/based/ubertext.fiction.filter_rus_gcld+short.text_only.txt.bz2",
        "https://lang.org.ua/static/downloads/ubertext2.0/news/based/ubertext.news.filter_rus_gcld+short.text_only.txt.bz2",
        "https://lang.org.ua/static/downloads/ubertext2.0/social/based/ubertext.social.filter_rus_gcld+short.text_only.txt.bz2",
        "https://lang.org.ua/static/downloads/ubertext2.0/wikipedia/based/ubertext.wikipedia.filter_rus_gcld+short.text_only.txt.bz2",
    ]

    DOI = "10.18653/v1/2023.unlp-1.1"
    CITATION = """@inproceedings{chaplynskyi-2023-introducing,
title = "Introducing {U}ber{T}ext 2.0: A Corpus of Modern {U}krainian at Scale",
author = "Chaplynskyi, Dmytro",
booktitle = "Proceedings of the Second Ukrainian Natural Language Processing Workshop",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.unlp-1.1",
pages = "1--10",
}"""
    LANGUAGES = ["uk"]

    # Name of the merged plain-text file inside the local dataset directory.
    _MERGED_FILENAME = "ubertext.txt"
    # Number of decompressed bytes copied per read while extracting.
    _CHUNK_SIZE = 100 * 1024

    def extract_txt_file(self):
        """Decompress all downloaded archives into one merged text file.

        The archives are looked up in the local dataset directory under the
        basenames of ``DOWNLOAD_URLS`` (assumes the files were saved under
        their original names -- TODO confirm against the download step).

        The merged file is rewritten from scratch on every call, so running
        the extraction again does not duplicate the corpus.

        Raises:
            FileNotFoundError: if one of the expected archives is missing.
        """
        dataset_dir = self.get_local_dataset_dir()
        output_txt = os.path.join(dataset_dir, self._MERGED_FILENAME)
        # Derive the archive names from the URLs so both lists cannot drift.
        bz2_files = [os.path.basename(url) for url in self.DOWNLOAD_URLS]

        logger.info("Writing to %s", output_txt)
        with open(output_txt, "wb") as out_f:
            for bz2_file in bz2_files:
                logger.info("decompressing from: %s", bz2_file)
                with bz2.BZ2File(os.path.join(dataset_dir, bz2_file), "rb") as f:
                    # Copy in fixed-size chunks to keep memory usage flat.
                    for data in iter(lambda: f.read(self._CHUNK_SIZE), b""):
                        out_f.write(data)
                # Separate archives so the last line of one file is never
                # glued to the first line of the next.
                out_f.write(b"\n")

    def get_texts(self):
        """Yield the merged corpus line by line.

        Expects the merged text file written by ``extract_txt_file`` to exist
        in the local dataset directory.

        Yields:
            str: one line of the merged file, including its trailing newline.

        Raises:
            FileNotFoundError: if the merged file has not been created yet.
        """
        output_txt = os.path.join(self.get_local_dataset_dir(), self._MERGED_FILENAME)
        logger.info("read into local dateset")
        # The merged file holds the raw decompressed bytes plus single-byte
        # "\n" separators, so it cannot be UTF-32; the corpus text is
        # presumably UTF-8 (NOTE(review): confirm against the upstream files).
        with open(output_txt, "r", encoding="utf-8") as f:
            logger.info("read file")
            # Iterate lazily; readlines() would load the whole corpus at once.
            yield from f

0 comments on commit aa54c48

Please sign in to comment.