Skip to content

Commit 42d41a7

Browse files
author
Damien Sileo
committed
new tasks
1 parent e57e9ae commit 42d41a7

File tree

3 files changed

+54
-29
lines changed

3 files changed

+54
-29
lines changed

src/tasksource/mtasks.py

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from .preprocess import cat, get, regen, constant, Classification, TokenClassification, MultipleChoice
1+
from .preprocess import cat, get,name, regen, constant, Classification, TokenClassification, MultipleChoice
22
from .metadata import udep_labels
33
from datasets import get_dataset_config_names, ClassLabel, Dataset, DatasetDict, concatenate_datasets, Sequence
44

@@ -19,47 +19,34 @@ def concatenate_configs(dataset):
1919

2020
americas_nli = Classification("premise","hypothesis","label",config_name="all_languages")
2121

22-
moritz_xnli = Classification("premise","hypothesis","label",
22+
moritz_xnli = Classification("premise","hypothesis",name("label",["entailment", "neutral","contradiction"]),
2323
pre_process=concatenate_configs, dataset_name="MoritzLaurer/multilingual-NLI-26lang-2mil7")
2424

2525
stsb_multi_mt = Classification("sentence1", "sentence2",
2626
lambda x: float(x["similarity_score"]/5),
2727
**all('stsb_multi_mt'))
2828

29-
pawsx = Classification("sentence1","sentence2","label", **all('paws-x'))
29+
pawsx = Classification("sentence1","sentence2",name('label',['not_paraphrase','paraphrase']), **all('paws-x'))
3030

3131
miam = Classification("Utterance",labels="Label", **all('miam'))
3232

3333
xstance = Classification("question", "comment", "label",
3434
**all("strombergnlp/x-stance"))
3535

36-
sentiment = Classification("text",labels="label",
37-
dataset_name="tyqiangz/multilingual-sentiments",config_name="all",
38-
pre_process=lambda ds:ds.filter(lambda x: "amazon_reviews" not in x['source'])
39-
)
4036

41-
emotion = Classification("text",labels="emotion",dataset_name="metaeval/universal-joy")
42-
43-
review_sentiment = Classification("review_body",labels="stars",
44-
dataset_name="amazon_reviews_multi",config_name="all_languages")
45-
46-
tweet_sentiment = Classification("text", labels="label",
47-
**all('cardiffnlp/tweet_sentiment_multilingual'))
48-
49-
offenseval = Classification(lambda x: str(x["text"]), labels="subtask_a",
37+
offenseval = Classification(lambda x: str(x["text"]), labels=name("subtask_a",['not offensive','offensive']),
38+
pre_process=lambda ds:ds.filter(lambda x: x['subtask_a'] in [0,1]),
5039
dataset_name='strombergnlp/offenseval_2020',
5140
config_name=["ar","da","gr","tr"])
5241

5342
offenseval_dravidian = Classification("text",labels="label",config_name=['kannada','malayalam','tamil'])
5443

55-
mlma_hate = Classification("tweet", labels="sentiment",
44+
mlma_hate = Classification("tweet", labels=lambda x:x["sentiment"].split('_'),
5645
dataset_name="nedjmaou/MLMA_hate_speech")
5746

58-
5947
qam = Classification("question","answer","label", dataset_name="xglue",config_name="qam")
6048

61-
x_sum_factuality = Classification("summary","generated_summary","label",
62-
dataset_name="ylacombe/xsum_factuality")
49+
#x_sum_factuality = Classification("summary","generated_summary","label", dataset_name="ylacombe/xsum_factuality")
6350

6451
x_fact = Classification('evidence','claim','label', dataset_name="metaeval/x-fact")
6552

@@ -73,8 +60,6 @@ def concatenate_configs(dataset):
7360
sentence2=cat(["target_word","context_2"], " : "),
7461
labels='label',dataset_name="pasinit/xlwic",config_name=['xlwic_de_de','xlwic_it_it','xlwic_fr_fr','xlwic_en_ko'])
7562

76-
77-
7863
#[ "spam", "fails_task", "lang_mismatch", "pii", "not_appropriate", "hate_speech", "sexual_content", "quality", "toxicity", "humor", "helpfulness", "creativity", "violence" ]
7964

8065
oasst1__quality = Classification("parent_text","text",labels="quality", dataset_name="tasksource/oasst1_dense_flat",
@@ -119,10 +104,30 @@ def udep_post_process(ds):
119104
oasst_rlhf = MultipleChoice("prompt",choices=['chosen','rejected'],labels=constant(0),
120105
dataset_name="tasksource/oasst1_pairwise_rlhf_reward")
121106

122-
#Classification(
107+
sentiment = Classification("text",labels="label", dataset_name="tyqiangz/multilingual-sentiments",config_name="all",
108+
pre_process=lambda ds:ds.filter(lambda x: "amazon_reviews" not in x['source']) )
109+
tweet_sentiment = Classification("text", labels="label", **all('cardiffnlp/tweet_sentiment_multilingual'))
110+
review_sentiment = Classification("review_body",labels="stars", dataset_name="amazon_reviews_multi",config_name="all_languages")
111+
emotion = Classification("text",labels="emotion",dataset_name="metaeval/universal-joy")
112+
# in mms
113+
114+
mms_sentiment = Classification("text",labels="label",dataset_name='Brand24/mms')
115+
116+
mapa_fine = TokenClassification("tokens","coarse_grained",dataset_name='joelito/mapa')
117+
mapa_corase = TokenClassification("tokens","fine_grained",dataset_name='joelito/mapa')
118+
119+
aces_ranking = MultipleChoice("source",choices=['good-translation','incorrect-translation'],labels=constant(0), dataset_name='nikitam/ACES')
120+
aces_phenomena = Classification('source','incorrect-translation','phenomena', dataset_name='nikitam/ACES')
121+
122+
amazon_intent = Classification("utt",labels="intent",**all('AmazonScience/massive'))
123123
# dataset_name='glue',config_name=['ocnli','afqmc'])
124124

125-
#
125+
tidy_as2=Classification("Question","Sentence","Label",dataset_name='tasksource/tydi-as2-balanced')
126+
127+
multiconer = TokenClassification("tokens","ner_tags_index", **all("MultiCoNER/multiconer_v2"))
128+
129+
mtop = Classification("question",labels="intent", dataset_name="tasksource/mtop")
130+
126131
#wino_x
127132
# clue, klue, indic_glue
128133
# SMS_Spam_Multilingual_Collection_Dataset

src/tasksource/recast.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
improper_labels = ['recast/recast_kg_relations','linguisticprobing',"lexglue/scotus","pragmeval/squinky","pragmeval/emobank",'pragmeval/persuasiveness']
77
improper_labels += ['glue/stsb', 'sick/relatedness', 'joci', 'utilitarianism', 'amazon_counterfactual/en', 'toxic_conversations', 'ethos/multilabel', 'lex_glue/eurlex', 'lex_glue/unfair_tos', 'app_reviews', 'humicroedit/subtask-1', 'stackoverflow-questions', 'go_emotions/simplified', 'google_wellformed_query', 'has_part', 'blog_authorship_corpus/age', 'promptCoherence', 'Sarcasm_News_Headline', 'auditor_review/demo-org--auditor_review', 'Dynasent_Disagreement', 'Politeness_Disagreement', 'SBIC_Disagreement', 'SChem_Disagreement', 'Dilemmas_Disagreement', 'sts-companion', 'acceptability-prediction', 'chaos-mnli-ambiguity', 'headline_cause/en_simple', 'oasst1_dense_flat', 'civil_comments']
8-
8+
improper_labels += ['stsb_multi_mt','MLMA_hate_speech']
99

1010
def render_options(options):
1111
options = [f'"{x}"' for x in options]
@@ -48,14 +48,14 @@ def shuffle_choices(x):
4848
x["labels"]=choices_texts.index(correct_choice)
4949
return x
5050

51-
def recast_dataset_classification_to_mc(dataset,N=4):
51+
def recast_dataset_classification_to_mc(dataset,sep="[SEP]",N=4):
5252

5353
def recast_split(d,N=N):
5454
labels = d.features['labels']
5555
df=d.to_pandas()
5656
df['inputs'] = df.sentence1
5757
if "sentence2" in df:
58-
df['inputs'] +="[SEP]" + df.sentence2
58+
df['inputs'] +=sep + df.sentence2
5959

6060
N=min(N, len(labels.names))
6161
df['choices']=df.apply(lambda x:negative_sample_options(labels.int2str(x['labels']), labels.names,N),axis=1)

src/tasksource/tasks.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from .preprocess import cat, get, regen, name, constant, Classification, TokenClassification, MultipleChoice
22
from .metadata import bigbench_discriminative_english, blimp_hard, imppres_presupposition, imppres_implicature, udep_en_configs, udep_en_labels
33
from datasets import get_dataset_config_names, Sequence, ClassLabel, Dataset, DatasetDict
4+
45
# variable name: dataset___config__task
56

67
###################### NLI/paraphrase ###############################
@@ -1008,8 +1009,27 @@ def _udep_post_process(ds):
10081009

10091010
monli = Classification("sentence1","sentence2","gold_label", dataset_name="tasksource/monli")
10101011

1011-
causality = Classification('input',labels=name('label',['not_entailment','entailment']),dataset_name='causalnlp/corr2cause')
1012+
causality = Classification('premise','hypothesis','relation', dataset_name='tasksource/corr2cause')
10121013

10131014
lsat = MultipleChoice(cat(['passage','question']), choices_list='references',labels='gold_index',dataset_name='lighteval/lsat_qa',config_name='all')
10141015

1015-
apt = Classification('text_a','text_b',name('labels',['not_paraphrase','paraprhase']),dataset_name='tasksource/apt')
1016+
apt = Classification('text_a','text_b',name('labels',['not_paraphrase','paraphrase']),dataset_name='tasksource/apt')
1017+
1018+
#xsum_factuality = Classification("summary",labels="is_factual")
1019+
1020+
financial_sentiment = Classification("text",labels="label",dataset_name="zeroshot/twitter-financial-news-sentiment")
1021+
1022+
def _icl_rand(x):
1023+
import random
1024+
return random.Random(x['sentence1'][:50]).randint(0,1) #deterministic label for each input
1025+
1026+
icl = Classification("inputs", lambda x: x['symbols'][_icl_rand(x)],
1027+
labels=lambda x: int(x['symbols'][_icl_rand(x)]==x['targets']),
1028+
dataset_name="tasksource/icl-symbol-tuning-instruct",
1029+
pre_process=lambda ds:ds.filter(lambda x:len(x['inputs'])<200*4), # 200 tokens of 4 char
1030+
post_process=lambda ds:ds.cast_column('labels',ClassLabel(names=['False','True']))
1031+
)
1032+
1033+
space_nli = Classification("premises","hypothesis","label",dataset_name="tasksource/SpaceNLI")
1034+
1035+
# hate_context

0 commit comments

Comments (0)