30
30
required = False ,
31
31
help = "The subset you want to convert" )
32
32
33
# Optional flag limiting how many rows of the loaded split are used.
parser.add_argument(
    "--sample",
    nargs=1,
    type=int,
    required=False,
    help="Number of samples from the dataset",
)

args = parser.parse_args()

# Each name below is bound only when its flag was supplied; code that reads
# one of them must first check the matching ``args`` attribute.
if args.dataset:
    dataset_name = args.dataset[0]

if args.subset:
    subset_name = args.subset[0]

# ``nargs=1`` makes argparse store a one-element list, hence the ``[0]``.
if args.sample:
    sample_num = args.sample[0]
48
+
40
49
def get_dataset(dataset_name):
    """Load the "train" split of a dataset together with its prompt templates.

    When ``--sample`` was given on the command line, the rows are reduced to a
    random sample drawn without replacement, so no row is selected twice.

    Args:
        dataset_name: Name passed to ``load_dataset`` and ``DatasetTemplates``.

    Returns:
        A ``(dataset, dataset_prompts)`` tuple. ``dataset`` is the object
        returned by ``load_dataset``, or a list of the sampled rows when
        sampling is active.

    Raises:
        ValueError: If the requested sample size is negative.
    """
    dataset = load_dataset(dataset_name, split="train")
    if args.sample:
        # Never ask for more rows than the split holds.
        cap = min(sample_num, len(dataset))
        # random.choices draws WITH replacement and would yield duplicate
        # rows; sampling distinct indices guarantees each row appears at
        # most once. Sampling indices (rather than the dataset object) also
        # avoids random.sample's requirement that the population be a
        # Sequence.
        picked = random.sample(range(len(dataset)), cap)
        dataset = [dataset[i] for i in picked]
    # Load prompts for this dataset
    dataset_prompts = DatasetTemplates(dataset_name)
    return dataset, dataset_prompts
45
57
46
58
def get_subset(dataset_name, subset_name):
    """Load the "train" split of a dataset subset and its prompt templates.

    When ``--sample`` was given on the command line, the rows are reduced to a
    random sample drawn without replacement, so no row is selected twice.

    Args:
        dataset_name: Name of the dataset passed to ``load_dataset``.
        subset_name: Name of the subset (configuration) passed to
            ``load_dataset``.

    Returns:
        A ``(dataset, dataset_prompts)`` tuple. ``dataset`` is the object
        returned by ``load_dataset``, or a list of the sampled rows when
        sampling is active.

    Raises:
        ValueError: If the requested sample size is negative.
    """
    dataset = load_dataset(dataset_name, subset_name, split="train")
    if args.sample:
        # Never ask for more rows than the split holds.
        cap = min(sample_num, len(dataset))
        # random.choices draws WITH replacement and would yield duplicate
        # rows; sampling distinct indices guarantees each row appears at
        # most once. Sampling indices (rather than the dataset object) also
        # avoids random.sample's requirement that the population be a
        # Sequence.
        picked = random.sample(range(len(dataset)), cap)
        dataset = [dataset[i] for i in picked]
    # Load prompts for this dataset and subset
    dataset_prompts = DatasetTemplates(f"{dataset_name}/{subset_name}")
    return dataset, dataset_prompts
@@ -59,6 +74,8 @@ def create_task(dataset, dataset_name, dataset_prompts):
59
74
prompt = dataset_prompts [prompt_name ]
60
75
# Apply the prompt to the dataset
61
76
data = {}
77
+ data ["Prompt Name" ] = [prompt_name ]
78
+ data ["Prompt id" ] = [id ]
62
79
data ["Contributors" ] = []
63
80
data ["Source" ] = [dataset_name ]
64
81
data ["Categories" ] = []
@@ -71,7 +88,7 @@ def create_task(dataset, dataset_name, dataset_prompts):
71
88
data ["Positive Examples" ] = []
72
89
data ["Negative Examples" ] = []
73
90
data ["Instances" ] = []
74
- for i in range (min ( 6500 , len (dataset ) )):
91
+ for i in range (len (dataset )):
75
92
result = prompt .apply (dataset [i ])
76
93
if len (result )== 2 :
77
94
data ["Instances" ].append ({
@@ -120,4 +137,4 @@ def save_json(data, dataset_name, prompt_name):
120
137
dataset , dataset_prompts = get_dataset (dataset_name )
121
138
if args .subset :
122
139
dataset , dataset_prompts = get_subset (dataset_name , subset_name )
123
- create_task (dataset , dataset_name , dataset_prompts )
140
+ create_task (dataset , dataset_name , dataset_prompts )
0 commit comments