Skip to content

Commit 39af585

Browse files
committed
more flexible access to tasks, new tasks
1 parent 97c5996 commit 39af585

File tree

5 files changed

+370
-348
lines changed

5 files changed

+370
-348
lines changed

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ install_requires =
2323
pandas
2424
numpy
2525
scipy
26+
sorcery
2627

2728
[options.packages.find]
2829
where = src

src/tasksource/access.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from datasets import load_dataset
77
import funcy as fc
88
import os
9-
9+
from sorcery import dict_of
1010

1111
def parse_var_name(s):
1212
config_name,task_name = None,None
@@ -29,7 +29,6 @@ def pretty_name(x):
2929

3030
def list_tasks(tasks_path=f'{os.path.dirname(__file__)}/tasks.py'):
3131
task_order = open(tasks_path).readlines()
32-
task_order= task_order[:task_order.index('###END\n')]
3332
task_order = [x.split('=')[0].rstrip() for x in task_order if '=' in x]
3433
task_order = [x for x in task_order if x.isidentifier()]
3534
task_order = fc.flip(dict(enumerate(task_order)))
@@ -59,17 +58,22 @@ def list_tasks(tasks_path=f'{os.path.dirname(__file__)}/tasks.py'):
5958

6059
task_df = list_tasks()
6160

62-
def load_preprocessing(dataset_name, config_name=None, task_name=None):
63-
y = task_df
64-
y = y[y.dataset_name.map(lambda x:x==dataset_name)]
65-
y = y[y.config_name.map(lambda x:x==config_name)]
66-
y = y[y.task_name.map(lambda x:x==task_name)]
67-
return getattr(tasks,y.preprocessing_name.iloc[0])
68-
61+
def dict_to_query(d=dict(), **kwargs):
62+
d={**d,**kwargs}
63+
return '&'.join([f'`{k}`=="{v}"' for k,v in d.items()])
6964

65+
def load_preprocessing(tasks=tasks, **kwargs):
66+
y = task_df.query(dict_to_query(**kwargs)).iloc[0]
67+
preprocessing= getattr(tasks, y.preprocessing_name)
68+
for c in 'dataset_name','config_name':
69+
if not isinstance(getattr(preprocessing,c), str):
70+
setattr(preprocessing,c,getattr(y,c))
71+
return preprocessing
7072

71-
def load_task(dataset_name,config_name=None,task_name=None,
73+
def load_task(id=None, dataset_name=None,config_name=None,task_name=None,preprocessing_name=None,
7274
max_rows=None, max_rows_eval=None):
73-
dataset = load_dataset(dataset_name,config_name)
74-
preprocessing = load_preprocessing(dataset_name,config_name,task_name)
75+
query = dict_of(id, dataset_name, config_name, task_name,preprocessing_name)
76+
query = {k:v for k,v in query.items() if v}
77+
preprocessing = load_preprocessing(**query)
78+
dataset = load_dataset(preprocessing.dataset_name, preprocessing.config_name)
7579
return preprocessing(dataset,max_rows, max_rows_eval)

src/tasksource/preprocess.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ def __map_to_target(x,fn=lambda x:None, target=None):
3333
x[target]=fn(x)
3434
return x
3535

36+
def load(self):
37+
return self(datasets.load_dataset(self.dataset_name,self.config_name))
38+
3639
def __call__(self,dataset, max_rows=None, max_rows_eval=None):
3740
dataset = self.pre_process(dataset)
3841
for k,v in zip(self.default_splits, self.splits):
@@ -53,7 +56,7 @@ def __call__(self,dataset, max_rows=None, max_rows_eval=None):
5356
and type(v)==str and k!=v)})
5457
for k in self.to_dict().keys():
5558
v=getattr(self, k)
56-
if callable(v) and k not in {"post_process","pre_process"}:
59+
if callable(v) and k not in {"post_process","pre_process","load"}:
5760
dataset=dataset.map(self.__map_to_target,
5861
fn_kwargs={'fn':v,'target':k})
5962

src/tasksource/tasks.py

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,8 @@
1717
config_name=set(get_dataset_config_names("metaeval/babi_nli"))-{"agents-motivations"}
1818
) # agents-motivations task is not as clear-cut as the others
1919

20-
def ling_nli_postprocess(ds):
21-
return ds.cast_column('labels', ClassLabel(
22-
names=['entailment','neutral','contradiction']))
2320

24-
ling_nli = Classification("premise_original","hypothesis_original","label",
25-
dataset_name="metaeval/lingnli", post_process=ling_nli_postprocess
26-
)
21+
ling_nli = Classification("premise","hypothesis","label",dataset_name="metaeval/lingnli")
2722

2823

2924
sick__label = Classification('sentence_A','sentence_B','label')
@@ -124,23 +119,23 @@ def ling_nli_postprocess(ds):
124119
add_one_rte = Classification("premise","hypothesis","label",
125120
dataset_name="pietrolesci/add_one_rte",splits=["train","dev","test"])
126121

127-
def imppres_post_process(ds,prefix=''):
122+
def _imppres_post_process(ds,prefix=''):
128123
# imppres entailment definition is either purely semantic or purely pragmatic
129124
# because of that, we differentiate the labels from anli/mnli notation
130125
return ds.cast_column('labels', ClassLabel(
131126
names=[f'imppres{prefix}_entailment',f'imppres{prefix}_neutral',f'imppres{prefix}_contradiction']))
132127

133128
imppres__presupposition = imppres__prag = Classification("premise","hypothesis","gold_label",
134129
dataset_name="metaeval/imppres", config_name=imppres_presupposition,
135-
post_process=imppres_post_process)
130+
post_process=_imppres_post_process)
136131

137132
imppres__prag = Classification("premise","hypothesis","gold_label_prag",
138133
dataset_name="metaeval/imppres", config_name=imppres_implicature,
139-
post_process=lambda x: imppres_post_process(x,'_prag'))
134+
post_process=lambda x: _imppres_post_process(x,'_prag'))
140135

141136
imppres__log = Classification("premise","hypothesis","gold_label_log",
142137
dataset_name="metaeval/imppres", config_name=imppres_implicature,
143-
post_process=lambda x: imppres_post_process(x,'_log'))
138+
post_process=lambda x: _imppres_post_process(x,'_log'))
144139

145140

146141
glue__diagnostics = Classification("premise","hypothesis","label",
@@ -312,13 +307,13 @@ def imppres_post_process(ds,prefix=''):
312307

313308
swag=MultipleChoice(cat(["sent1","sent2"]),regen("ending[0-3]"),"label")
314309

315-
def split_choices(s):
310+
def _split_choices(s):
316311
import re
317312
return [x.rstrip(', ') for x in re.split(r'[a-e] \) (.*?)',s) if x.strip(', ')]
318313

319314
math_qa = MultipleChoice(
320315
'Problem',
321-
choices_list = lambda x: split_choices(x['options']),
316+
choices_list = lambda x: _split_choices(x['options']),
322317
labels = lambda x:'abcde'.index(x['correct'])
323318
)
324319

@@ -500,15 +495,14 @@ def split_choices(s):
500495

501496
metaeval_linguisticprobing = Classification("sentence", labels="label", dataset_name="metaeval/linguisticprobing",
502497
config_name=['subj_number',
503-
'word_content',
504498
'obj_number',
505499
'past_present',
506500
'sentence_length',
507501
'top_constituents',
508502
'tree_depth',
509503
'coordination_inversion',
510504
'odd_man_out',
511-
'bigram_shift']
505+
'bigram_shift']#+['word_content'] #too many labels
512506
)
513507

514508
metaeval_crowdflower = Classification("text", labels="label",
@@ -664,5 +658,22 @@ def split_choices(s):
664658
config_name='proto_qa'
665659
)
666660

667-
###END
668-
################### END OF SUPPORT ######################
661+
wiki_qa = Classification("question","answer","label")
662+
663+
cycic_classification = Classification("question",labels="correct_answer",
664+
dataset_name = "metaeval/cycic_classification")
665+
cycic_mc = MultipleChoice("question", choices=regen('answer\_option[0-4]'), labels="correct_answer",
666+
dataset_name = "metaeval/cycic_multiplechoice")
667+
668+
669+
def _preprocess_chatgpt_detection(ex):
670+
import random
671+
label=random.random()<=0.5
672+
ex['label']=label
673+
ex['answer']=[ex['human_answers'],ex['chatgpt_answers']][label]
674+
return ex
675+
676+
chatgpt_detection = Classification("question","answer","label",
677+
dataset_name = 'Hello-SimpleAI/HC3', config_name="all",
678+
pre_process=lambda dataset:dataset.map(_preprocess_chatgpt_detection)
679+
)

0 commit comments

Comments
 (0)