Skip to content

Commit

Permalink
added trust remote code
Browse files: browse the repository at this point in the history
  • Loading branch information
malteos committed Jul 18, 2024
1 parent 6b38f9b commit fadc6ed
Showing 1 changed file with 14 additions and 4 deletions.
18 changes: 14 additions & 4 deletions src/llm_datasets/datasets/multilingual/legal_mc4.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -29,7 +29,9 @@

class LegalMC4BaseDataset(HFDataset):
SOURCE_ID = "legal_mc4"
DESCRIPTION = "MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages"
DESCRIPTION = (
"MC4_Legal: A Corpus Covering the Legal Part of MC4 for European Languages"
)
HOMEPAGE = "https://huggingface.co/datasets/joelito/legal-mc4"
AVAILIBILITY = Availability.DIRECT_DOWNLOAD
WEB_CRAWLED = True
Expand All @@ -46,7 +48,9 @@ class LegalMC4BaseDataset(HFDataset):
HF_DATASET_ID = "joelito/legal-mc4"
HF_DATASET_SPLIT = "train"
HF_DATASET_CONFIGS = None # is set by language version

HF_KWARGS = dict(
trust_remote_code=True,
)
streaming = True
keep_columns = True
metadata_column_names = ["url", "timestamp"]
Expand All @@ -68,6 +72,12 @@ def HF_DATASET_CONFIGS(self):

def get_legal_mc4_auto_classes():
"""Auto generate dataset classes with token count"""
lang_to_tokens = {row.split("\t")[0]: int(row.split("\t")[1]) for row in RAW_LANG_TO_TOKENS.splitlines()}
lang_to_tokens = {
row.split("\t")[0]: int(row.split("\t")[1])
for row in RAW_LANG_TO_TOKENS.splitlines()
}

return [get_legal_mc4_auto_cls_by_language(lang, tokens) for lang, tokens in lang_to_tokens.items()]
return [
get_legal_mc4_auto_cls_by_language(lang, tokens)
for lang, tokens in lang_to_tokens.items()
]

0 comments on commit fadc6ed

Please sign in to comment.