1
- from .preprocess import cat , get , regen , constant , Classification , TokenClassification , MultipleChoice
1
+ from .preprocess import cat , get ,name , regen , constant , Classification , TokenClassification , MultipleChoice
2
2
from .metadata import udep_labels
3
3
from datasets import get_dataset_config_names , ClassLabel , Dataset , DatasetDict , concatenate_datasets , Sequence
4
4
@@ -19,47 +19,34 @@ def concatenate_configs(dataset):
19
19
20
20
americas_nli = Classification ("premise" ,"hypothesis" ,"label" ,config_name = "all_languages" )
21
21
22
- moritz_xnli = Classification ("premise" ,"hypothesis" ,"label" ,
22
+ moritz_xnli = Classification ("premise" ,"hypothesis" ,name ( "label" ,[ "entailment" , "neutral" , "contradiction" ]) ,
23
23
pre_process = concatenate_configs , dataset_name = "MoritzLaurer/multilingual-NLI-26lang-2mil7" )
24
24
25
25
stsb_multi_mt = Classification ("sentence1" , "sentence2" ,
26
26
lambda x : float (x ["similarity_score" ]/ 5 ),  # label = similarity_score / 5 as a float; presumably rescales a 0-5 score to [0, 1] — TODO confirm
27
27
** all ('stsb_multi_mt' ))
28
28
29
- pawsx = Classification ("sentence1" ,"sentence2" ," label" , ** all ('paws-x' ))
29
+ pawsx = Classification ("sentence1" ,"sentence2" ,name ( ' label' ,[ 'not_paraphrase' , 'paraphrase' ]) , ** all ('paws-x' ))
30
30
31
31
miam = Classification ("Utterance" ,labels = "Label" , ** all ('miam' ))
32
32
33
33
xstance = Classification ("question" , "comment" , "label" ,
34
34
** all ("strombergnlp/x-stance" ))
35
35
36
- sentiment = Classification ("text" ,labels = "label" ,
37
- dataset_name = "tyqiangz/multilingual-sentiments" ,config_name = "all" ,
38
- pre_process = lambda ds :ds .filter (lambda x : "amazon_reviews" not in x ['source' ])
39
- )
40
36
41
- emotion = Classification ("text" ,labels = "emotion" ,dataset_name = "metaeval/universal-joy" )
42
-
43
- review_sentiment = Classification ("review_body" ,labels = "stars" ,
44
- dataset_name = "amazon_reviews_multi" ,config_name = "all_languages" )
45
-
46
- tweet_sentiment = Classification ("text" , labels = "label" ,
47
- ** all ('cardiffnlp/tweet_sentiment_multilingual' ))
48
-
49
- offenseval = Classification (lambda x : str (x ["text" ]), labels = "subtask_a" ,
37
+ offenseval = Classification (lambda x : str (x ["text" ]), labels = name ("subtask_a" ,['not offensive' ,'offensive' ]),  # text is cast to str; subtask_a is passed through name() with two class names
38
+ pre_process = lambda ds :ds .filter (lambda x : x ['subtask_a' ] in [0 ,1 ]),  # keep only rows whose subtask_a is 0 or 1
50
39
dataset_name = 'strombergnlp/offenseval_2020' ,
51
40
config_name = ["ar" ,"da" ,"gr" ,"tr" ])
52
41
53
42
offenseval_dravidian = Classification ("text" ,labels = "label" ,config_name = ['kannada' ,'malayalam' ,'tamil' ])
54
43
55
- mlma_hate = Classification ("tweet" , labels = "sentiment" ,
44
+ mlma_hate = Classification ("tweet" , labels = lambda x : x [ "sentiment" ]. split ( '_' ) ,  # labels is the "sentiment" value split on '_' (a list); presumably multi-label — TODO confirm
56
45
dataset_name = "nedjmaou/MLMA_hate_speech" )
57
46
58
-
59
47
qam = Classification ("question" ,"answer" ,"label" , dataset_name = "xglue" ,config_name = "qam" )
60
48
61
- x_sum_factuality = Classification ("summary" ,"generated_summary" ,"label" ,
62
- dataset_name = "ylacombe/xsum_factuality" )
49
+ #x_sum_factuality = Classification("summary","generated_summary","label", dataset_name="ylacombe/xsum_factuality")
63
50
64
51
x_fact = Classification ('evidence' ,'claim' ,'label' , dataset_name = "metaeval/x-fact" )
65
52
@@ -73,8 +60,6 @@ def concatenate_configs(dataset):
73
60
sentence2 = cat (["target_word" ,"context_2" ], " : " ),
74
61
labels = 'label' ,dataset_name = "pasinit/xlwic" ,config_name = ['xlwic_de_de' ,'xlwic_it_it' ,'xlwic_fr_fr' ,'xlwic_en_ko' ])
75
62
76
-
77
-
78
63
#[ "spam", "fails_task", "lang_mismatch", "pii", "not_appropriate", "hate_speech", "sexual_content", "quality", "toxicity", "humor", "helpfulness", "creativity", "violence" ]
79
64
80
65
oasst1__quality = Classification ("parent_text" ,"text" ,labels = "quality" , dataset_name = "tasksource/oasst1_dense_flat" ,
@@ -119,10 +104,30 @@ def udep_post_process(ds):
119
104
oasst_rlhf = MultipleChoice ("prompt" ,choices = ['chosen' ,'rejected' ],labels = constant (0 ),
120
105
dataset_name = "tasksource/oasst1_pairwise_rlhf_reward" )
121
106
122
- #Classification(
107
+ sentiment = Classification ("text" ,labels = "label" , dataset_name = "tyqiangz/multilingual-sentiments" ,config_name = "all" ,
108
+ pre_process = lambda ds :ds .filter (lambda x : "amazon_reviews" not in x ['source' ]) )
109
+ tweet_sentiment = Classification ("text" , labels = "label" , ** all ('cardiffnlp/tweet_sentiment_multilingual' ))
110
+ review_sentiment = Classification ("review_body" ,labels = "stars" , dataset_name = "amazon_reviews_multi" ,config_name = "all_languages" )
111
+ emotion = Classification ("text" ,labels = "emotion" ,dataset_name = "metaeval/universal-joy" )
112
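+ # NOTE(review): "in mms" — presumably the sentiment datasets above are already included in Brand24/mms; confirm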
113
+
114
+ mms_sentiment = Classification ("text" ,labels = "label" ,dataset_name = 'Brand24/mms' )
115
+
116
+ mapa_fine = TokenClassification ("tokens" ,"coarse_grained" ,dataset_name = 'joelito/mapa' )
117
+ mapa_corase = TokenClassification ("tokens" ,"fine_grained" ,dataset_name = 'joelito/mapa' )
118
+
119
+ aces_ranking = MultipleChoice ("source" ,choices = ['good-translation' ,'incorrect-translation' ],labels = constant (0 ), dataset_name = 'nikitam/ACES' )
120
+ aces_phenomena = Classification ('source' ,'incorrect-translation' ,'phenomena' , dataset_name = 'nikitam/ACES' )
121
+
122
+ amazon_intent = Classification ("utt" ,labels = "intent" ,** all ('AmazonScience/massive' ))
123
123
# TODO: unfinished stub — Classification(..., dataset_name='glue',config_name=['ocnli','afqmc'])
124
124
125
- #
125
+ tidy_as2 = Classification ("Question" ,"Sentence" ,"Label" ,dataset_name = 'tasksource/tydi-as2-balanced' )
126
+
127
+ multiconer = TokenClassification ("tokens" ,"ner_tags_index" , ** all ("MultiCoNER/multiconer_v2" ))
128
+
129
+ mtop = Classification ("question" ,labels = "intent" , dataset_name = "tasksource/mtop" )
130
+
126
131
#wino_x
127
132
# clue, klue, indic_glue
128
133
# SMS_Spam_Multilingual_Collection_Dataset
0 commit comments