Skip to content

Commit b26a9bb

Browse files
committed
new tasks, preprocessing+postprocessing
1 parent 6b1c455 commit b26a9bb

File tree

3 files changed

+432
-396
lines changed

3 files changed

+432
-396
lines changed

src/tasksource/preprocess.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def __map_to_target(x,fn=lambda x:None, target=None):
3434
return x
3535

3636
def __call__(self,dataset, max_rows=None, max_rows_eval=None):
37+
dataset = self.pre_process(dataset)
3738
for k,v in zip(self.default_splits, self.splits):
3839
if v and k!=v:
3940
dataset[k]=dataset[v]
@@ -52,7 +53,7 @@ def __call__(self,dataset, max_rows=None, max_rows_eval=None):
5253
and type(v)==str and k!=v)})
5354
for k in self.to_dict().keys():
5455
v=getattr(self, k)
55-
if callable(v) and k!="post_process":
56+
if callable(v) and k not in {"post_process","pre_process"}:
5657
dataset=dataset.map(self.__map_to_target,
5758
fn_kwargs={'fn':v,'target':k})
5859

@@ -164,6 +165,7 @@ class SharedFields:
164165
splits:list=Preprocessing.default_splits
165166
dataset_name:str = None
166167
config_name:str = None
168+
pre_process: callable = lambda x:x
167169
post_process: callable = lambda x:x
168170

169171
@dataclass

src/tasksource/tasks.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,16 @@
77
###################### NLI/paraphrase ###############################
88

99

10+
anli__a1 = Classification('premise','hypothesis','label', splits=['train_r1','dev_r1','test_r1'])
11+
anli__a2 = Classification('premise','hypothesis','label', splits=['train_r2','dev_r2','test_r2'])
12+
anli__a3 = Classification('premise','hypothesis','label', splits=['train_r3','dev_r3','test_r3'])
13+
14+
1015
babi_nli = Classification("premise", "hypothesis", "label",
1116
dataset_name="metaeval/babi_nli",
1217
config_name=set(get_dataset_config_names("metaeval/babi_nli"))-{"agents-motivations"}
1318
) # agents-motivations task is not as clear-cut as the others
1419

15-
anli__a1 = Classification('premise','hypothesis','label', splits=['train_r1','dev_r1','test_r1'])
16-
anli__a2 = Classification('premise','hypothesis','label', splits=['train_r2','dev_r2','test_r2'])
17-
anli__a3 = Classification('premise','hypothesis','label', splits=['train_r3','dev_r3','test_r3'])
1820

1921
sick__label = Classification('sentence_A','sentence_B','label')
2022
sick__relatedness = Classification('sentence_A','sentence_B','relatedness_score')
@@ -308,7 +310,20 @@ def split_choices(s):
308310
######################## Classification (other) ########################
309311

310312
utilitarianism = Classification("comparison",labels="label",
311-
dataset_name="")
313+
dataset_name="metaeval/utilitarianism")
314+
315+
amazon_counterfactual = Classification(
316+
"text", labels="label",
317+
dataset_name="mteb/amazon_counterfactual",
318+
config_name="en")
319+
320+
insincere_questions = Classification(
321+
"text", labels="label",
322+
dataset_name="SetFit/insincere-questions")
323+
324+
toxic_conversations = Classification(
325+
"text", labels="label",
326+
dataset_name="SetFit/toxic_conversations")
312327

313328
turingbench = Classification("Generation",labels="label",
314329
dataset_name="turingbench/TuringBench",
@@ -378,6 +393,10 @@ def split_choices(s):
378393
"persuasiveness-eloquence", "persuasiveness-premisetype", "persuasiveness-relevance", "persuasiveness-specificity",
379394
"persuasiveness-strength", "sarcasm","stac"])
380395

396+
silicone = Classification("Uterance",labels="Label",
397+
config_name=['dyda_da', 'dyda_e', 'iemocap', 'maptask', 'meld_e', 'meld_s', 'oasis', 'sem'] # +['swda', 'mrda'] # in pragmeval
398+
)
399+
381400
#lex_glue___ecthr_a = Classification(sentence1="text", labels="labels") # too long
382401
#lex_glue___ecthr_b = Classification(sentence1="text", labels="labels") # too long
383402
lex_glue___eurlex = Classification(sentence1="text", labels="labels")
@@ -591,6 +610,8 @@ def split_choices(s):
591610
sarcasm_news = Classification("headline", labels="is_sarcastic",
592611
dataset_name="raquiba/Sarcasm_News_Headline")
593612

613+
sem_eval_2010_task_8 = Classification("sentence",labels="relation")
614+
594615
###END
595616
################### END OF SUPPORT ######################
596617

0 commit comments

Comments
 (0)