-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
76 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import bz2 | ||
import logging | ||
import os | ||
|
||
from llm_datasets.datasets.base import Availability, BaseDataset | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class UberText2(BaseDataset):
    """UberText 2.0, a corpus of modern Ukrainian (Chaplynskyi, 2023).

    The corpus is downloaded as one bz2-compressed plain-text archive per
    sub-corpus (court, fiction, news, social, wikipedia; see
    ``DOWNLOAD_URLS``). ``extract_txt_file`` decompresses those archives into
    a single ``ubertext.txt`` inside the local dataset directory, and
    ``get_texts`` streams that file line by line.
    """

    DATASET_ID = "ubertext_2"
    TITLE = "UberText2.0"
    # DESCRIPTION =
    HOMEPAGE = "https://lang.org.ua/en/ubertext/"
    # NOTE(review): the attribute spelling is kept unchanged; presumably it
    # matches the name expected by BaseDataset -- confirm before renaming.
    AVAILIBILITY = Availability.DIRECT_DOWNLOAD.name
    DOWNLOAD_URLS = [
        "https://lang.org.ua/static/downloads/ubertext2.0/court/based/ubertext.court.filter_rus_gcld+short.text_only.txt.bz2",
        "https://lang.org.ua/static/downloads/ubertext2.0/fiction/based/ubertext.fiction.filter_rus_gcld+short.text_only.txt.bz2",
        "https://lang.org.ua/static/downloads/ubertext2.0/news/based/ubertext.news.filter_rus_gcld+short.text_only.txt.bz2",
        "https://lang.org.ua/static/downloads/ubertext2.0/social/based/ubertext.social.filter_rus_gcld+short.text_only.txt.bz2",
        "https://lang.org.ua/static/downloads/ubertext2.0/wikipedia/based/ubertext.wikipedia.filter_rus_gcld+short.text_only.txt.bz2",
    ]

    DOI = "10.18653/v1/2023.unlp-1.1"
    CITATION = """@inproceedings{chaplynskyi-2023-introducing,
    title = "Introducing {U}ber{T}ext 2.0: A Corpus of Modern {U}krainian at Scale",
    author = "Chaplynskyi, Dmytro",
    booktitle = "Proceedings of the Second Ukrainian Natural Language Processing Workshop",
    month = may,
    year = "2023",
    address = "Dubrovnik, Croatia",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.unlp-1.1",
    pages = "1--10",
}"""
    LANGUAGES = ["uk"]

    def extract_txt_file(self):
        """Decompress all downloaded archives into a single ``ubertext.txt``.

        The archives are looked up in the local dataset directory under the
        file names taken from ``DOWNLOAD_URLS`` and are processed in that
        order. Each archive's decompressed bytes are copied verbatim and
        followed by one newline byte, so the last line of one archive can
        never run into the first line of the next.

        The output file is opened once and truncated, so calling this method
        again rebuilds the file instead of appending a duplicate copy.

        Raises:
            OSError: if an archive is missing or the output cannot be written.
        """
        dataset_dir = self.get_local_dataset_dir()
        output_txt = os.path.join(dataset_dir, "ubertext.txt")
        chunk_size = 100 * 1024  # decompress in 100 KiB pieces to bound memory use

        logger.info("Writing to %s", output_txt)
        with open(output_txt, "wb") as out_f:
            for url in self.DOWNLOAD_URLS:
                # The archive is stored under the last path segment of its URL.
                bz2_file = url.rsplit("/", 1)[-1]
                logger.info("decompressing from: %s", bz2_file)
                with bz2.BZ2File(os.path.join(dataset_dir, bz2_file), "rb") as f:
                    while chunk := f.read(chunk_size):
                        out_f.write(chunk)
                out_f.write(b"\n")

    def get_texts(self):
        """Yield the extracted corpus one line at a time.

        Reads the ``ubertext.txt`` produced by ``extract_txt_file``. Lines are
        yielded lazily and keep their trailing newline, so the whole corpus is
        never held in memory.

        Yields:
            str: the next line of the extracted corpus.

        Raises:
            OSError: if ``ubertext.txt`` has not been extracted yet.
        """
        output_txt = os.path.join(self.get_local_dataset_dir(), "ubertext.txt")
        logger.info("read into local dateset")
        # extract_txt_file copies the decompressed bytes unchanged; the upstream
        # dumps are assumed to be UTF-8 plain text (verify against the corpus).
        with open(output_txt, "r", encoding="utf-8") as f:
            logger.info("read file")
            yield from f