setup_kaggle_data.py
import io
import json
import pandas as pd
from subprocess import check_output, CalledProcessError, STDOUT
import shlex
import os
from tqdm import tqdm
import random  # 'score' below is a random placeholder; change this to the real value
import kaggle  # make sure the kaggle package is installed and configured

# to keep only the first 10 rows of each dataset CSV:
# for i in *.csv; do echo "$(head -10 $i)" > $i; done
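# A rough Python equivalent of the one-liner above (an illustrative sketch, not
# part of the pipeline; assumes it is run from inside the data directory and
# overwrites each CSV in place):
#   import glob
#   for path in glob.glob('*.csv'):
#       with open(path) as fh:
#           head = fh.readlines()[:10]
#       with open(path, 'w') as fh:
#           fh.writelines(head)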

def syscall(command):
    """
    params:
        command: string, ex. `"ls -l"`
    returns: output, success
    """
    command = shlex.split(command)
    try:
        output = check_output(command, stderr=STDOUT).decode()
        success = True
    except CalledProcessError as e:
        output = e.output.decode()
        success = False
    return output, success
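
# Example usage of syscall (illustrative only):
#   output, ok = syscall('ls -l')
#   if not ok:
#       print(output)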

def main():
    # dataset_csv.csv is expected to have at least 'ref' and 'size' columns,
    # e.g. as produced by `kaggle datasets list --csv`
    datasets = pd.read_csv('dataset_csv.csv')
    for _, row in tqdm(datasets.iterrows(), total=len(datasets)):
        dataset_name = row['ref']
        dataset_path = 'data/kaggle/' + dataset_name.replace('/', '.')

        # Download the dataset (only if its size is < 100MB)
        size = row['size']
        if 'GB' in size or ('MB' in size and float(size[:size.index('MB')]) > 100):
            print('Dataset too big:', dataset_name, size)
            continue
        download_cmd = f'kaggle datasets download {dataset_name} --path {dataset_path}/data/ --unzip -q'
        output, success = syscall(download_cmd)
        if not success:
            print('Error while downloading dataset:', dataset_name)
            print(output)

        # List the notebooks that use this dataset
        notebooks = []
        for i in range(1, 2):  # only the first results page for now
            list_cmd = f'kaggle kernels list --dataset {dataset_name} --page {i} ' \
                       '--page-size 30 --sort-by voteCount --language python --csv'
            output, success = syscall(list_cmd)
            if output == 'No kernels found':
                break
            df = pd.read_csv(io.StringIO(output))
            notebooks.append(df)
        if not notebooks:  # pd.concat raises on an empty list
            continue
        notebooks = pd.concat(notebooks, ignore_index=True).drop_duplicates('ref')

        # Download up to 20 kernels
        count = 0
        for _, nb_row in tqdm(notebooks.iterrows(), total=len(notebooks)):
            nb_name = nb_row['ref']
            dir_path = nb_name.replace('/', '-')
            file_name = nb_name.split('/')[1]
            nb_download_cmd = f'kaggle kernels pull {nb_name} --path ../CFGDemo/{dataset_path}/notebooks/{dir_path} --metadata'
            output, success = syscall(nb_download_cmd)
            if not success:
                print('Error while downloading kernel:', nb_name, '; Dataset:', dataset_name)
                continue
            count += 1
            # Convert the pulled notebook to a plain Python script
            if os.path.isfile(f'../CFGDemo/{dataset_path}/notebooks/{dir_path}/{file_name}.ipynb'):
                to_python_file_cmd = f'jupyter nbconvert ../CFGDemo/{dataset_path}/notebooks/{dir_path}/{file_name}.ipynb ' \
                                     f'--to script'
                output, success = syscall(to_python_file_cmd)
                if not success:
                    print('Error while transforming file:', nb_name, '; Dataset:', dataset_name)
                else:
                    os.remove(f'../CFGDemo/{dataset_path}/notebooks/{dir_path}/{file_name}.ipynb')
            # Write pipeline metadata next to the converted script
            if os.path.isdir(f'../CFGDemo/{dataset_path}/notebooks/{dir_path}'):
                elements = os.listdir(f'../CFGDemo/{dataset_path}/notebooks/{dir_path}')
                if f'{file_name}.py' in elements:
                    with open(f'../CFGDemo/{dataset_path}/notebooks/{dir_path}/pipeline_info.json', 'w') as f:
                        json.dump({
                            'url': f"https://www.kaggle.com/{nb_row['ref']}",
                            'title': nb_row['title'],
                            'author': nb_row['author'],
                            'votes': nb_row['totalVotes'],
                            'score': random.uniform(0.5, 1.0),  # random placeholder
                            'date': nb_row['lastRunTime']
                        }, f)
            if count >= 20:
                break

if __name__ == '__main__':
    main()
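
# How dataset_csv.csv might be produced beforehand (a sketch, assuming the
# standard Kaggle CLI; exact columns can differ across CLI versions):
#   kaggle datasets list --sort-by votes --csv > dataset_csv.csv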