setup_kaggle_data.py
import io
import json
import pandas as pd
from subprocess import check_output, CalledProcessError, STDOUT
import shlex
import os
from tqdm import tqdm
import random  # 'score' below is a random placeholder; change this to the real value
import kaggle  # make sure the kaggle package is installed and configured

# to keep only the first 10 rows of each dataset CSV:
# for i in *.csv; do echo "$(head -10 $i)" > $i; done
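# A rough Python equivalent of the one-liner above (an illustrative sketch, not
# part of the pipeline; assumes it is run from inside the data directory and
# overwrites each CSV in place):
#   import glob
#   for path in glob.glob('*.csv'):
#       with open(path) as fh:
#           head = fh.readlines()[:10]
#       with open(path, 'w') as fh:
#           fh.writelines(head)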

def syscall(command):
    """
    params:
        command: string, ex. `"ls -l"`
    returns: output, success
    """
    command = shlex.split(command)
    try:
        output = check_output(command, stderr=STDOUT).decode()
        success = True
    except CalledProcessError as e:
        output = e.output.decode()
        success = False
    return output, success
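
# Example usage of syscall (illustrative only):
#   output, ok = syscall('ls -l')
#   if not ok:
#       print(output)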

def main():
    # dataset_csv.csv is expected to have at least 'ref' and 'size' columns,
    # e.g. as produced by `kaggle datasets list --csv`
    datasets = pd.read_csv('dataset_csv.csv')
    for _, row in tqdm(datasets.iterrows(), total=len(datasets)):
        dataset_name = row['ref']
        dataset_path = 'data/kaggle/' + dataset_name.replace('/', '.')

        # Download the dataset (only if its size is < 100MB)
        size = row['size']
        if 'GB' in size or ('MB' in size and float(size[:size.index('MB')]) > 100):
            print('Dataset too big:', dataset_name, size)
            continue
        download_cmd = f'kaggle datasets download {dataset_name} --path {dataset_path}/data/ --unzip -q'
        output, success = syscall(download_cmd)
        if not success:
            print('Error while downloading dataset:', dataset_name)
            print(output)

        # List the notebooks that use this dataset
        notebooks = []
        for i in range(1, 2):  # only the first results page for now
            list_cmd = f'kaggle kernels list --dataset {dataset_name} --page {i} ' \
                       '--page-size 30 --sort-by voteCount --language python --csv'
            output, success = syscall(list_cmd)
            if output == 'No kernels found':
                break
            df = pd.read_csv(io.StringIO(output))
            notebooks.append(df)
        if not notebooks:  # pd.concat raises on an empty list
            continue
        notebooks = pd.concat(notebooks, ignore_index=True).drop_duplicates('ref')

        # Download up to 20 kernels
        count = 0
        for _, nb_row in tqdm(notebooks.iterrows(), total=len(notebooks)):
            nb_name = nb_row['ref']
            dir_path = nb_name.replace('/', '-')
            file_name = nb_name.split('/')[1]
            nb_download_cmd = f'kaggle kernels pull {nb_name} --path ../CFGDemo/{dataset_path}/notebooks/{dir_path} --metadata'
            output, success = syscall(nb_download_cmd)
            if not success:
                print('Error while downloading kernel:', nb_name, '; Dataset:', dataset_name)
                continue
            count += 1
            # Convert the pulled notebook to a plain Python script
            if os.path.isfile(f'../CFGDemo/{dataset_path}/notebooks/{dir_path}/{file_name}.ipynb'):
                to_python_file_cmd = f'jupyter nbconvert ../CFGDemo/{dataset_path}/notebooks/{dir_path}/{file_name}.ipynb ' \
                                     f'--to script'
                output, success = syscall(to_python_file_cmd)
                if not success:
                    print('Error while transforming file:', nb_name, '; Dataset:', dataset_name)
                else:
                    os.remove(f'../CFGDemo/{dataset_path}/notebooks/{dir_path}/{file_name}.ipynb')
            # Write pipeline metadata next to the converted script
            if os.path.isdir(f'../CFGDemo/{dataset_path}/notebooks/{dir_path}'):
                elements = os.listdir(f'../CFGDemo/{dataset_path}/notebooks/{dir_path}')
                if f'{file_name}.py' in elements:
                    with open(f'../CFGDemo/{dataset_path}/notebooks/{dir_path}/pipeline_info.json', 'w') as f:
                        json.dump({
                            'url': f"https://www.kaggle.com/{nb_row['ref']}",
                            'title': nb_row['title'],
                            'author': nb_row['author'],
                            'votes': nb_row['totalVotes'],
                            'score': random.uniform(0.5, 1.0),  # random placeholder
                            'date': nb_row['lastRunTime']
                        }, f)
            if count >= 20:
                break

if __name__ == '__main__':
    main()
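
# How dataset_csv.csv might be produced beforehand (a sketch, assuming the
# standard Kaggle CLI; exact columns can differ across CLI versions):
#   kaggle datasets list --sort-by votes --csv > dataset_csv.csv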