Skip to content

Commit

Permalink
ruff lint fix
Browse files (browse the repository at this point in the history)
  • Loading branch information
malteos committed Jul 18, 2024
1 parent ee00a32 commit 2656044
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 15 deletions.
3 changes: 0 additions & 3 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,6 @@
"python.testing.pytestArgs": [
"tests"
],
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
},
"python.formatting.provider": "none",
"python.testing.pytestEnabled": true,
}
15 changes: 3 additions & 12 deletions src/llm_datasets/datasets/multilingual/legal_mc4.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,10 @@

class LegalMC4BaseDataset(HFDataset):
SOURCE_ID = "legal_mc4"
DESCRIPTION = (
"MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages"
)
DESCRIPTION = "MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages"
HOMEPAGE = "https://huggingface.co/datasets/joelito/legal-mc4"
AVAILIBILITY = Availability.DIRECT_DOWNLOAD
WEB_CRAWLED = True
# DUMMY = True
LICENSE = License(
"AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common"
" Crawl terms of use in respect of the content contained in the dataset.",
@@ -72,12 +69,6 @@ def HF_DATASET_CONFIGS(self):

def get_legal_mc4_auto_classes():
"""Auto generate dataset classes with token count"""
lang_to_tokens = {
row.split("\t")[0]: int(row.split("\t")[1])
for row in RAW_LANG_TO_TOKENS.splitlines()
}
lang_to_tokens = {row.split("\t")[0]: int(row.split("\t")[1]) for row in RAW_LANG_TO_TOKENS.splitlines()}

return [
get_legal_mc4_auto_cls_by_language(lang, tokens)
for lang, tokens in lang_to_tokens.items()
]
return [get_legal_mc4_auto_cls_by_language(lang, tokens) for lang, tokens in lang_to_tokens.items()]

0 comments on commit 2656044

Please sign in to comment.