Preprocessing of specified datasets #29

name: Preprocessing of specified datasets

on:
  workflow_dispatch:
    inputs:
      datasets:
        description: 'Datasets to preprocess, separated by spaces. Example: 0001 0002 0044'
        required: true
        type: string
      do_standardize:
        description: 'Whether to standardize compounds'
        type: boolean
        default: true
      do_classyfire:
        description: 'Whether to compute ClassyFire classes'
        type: boolean
        default: true
      do_descriptors:
        description: 'Whether to compute descriptors'
        type: boolean
        default: true
      do_fingerprints:
        description: 'Whether to compute fingerprints'
        type: boolean
        default: true
      do_metadata:
        description: 'Whether to standardize metadata'
        type: boolean
        default: true
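
# A minimal example of triggering this workflow manually from the command line
# (a sketch, assuming the GitHub CLI `gh` is installed and authenticated for this
# repository; boolean inputs that are omitted fall back to their defaults above):
#   gh workflow run "Preprocessing of specified datasets" \
#     -f datasets="0001 0002 0044" \
#     -f do_descriptors=false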

jobs:
  preprocess:
    name: Preprocess raw data
    # NOTE: runs on Windows because computing descriptors currently has a bug on Linux
    runs-on: windows-2019
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} # needed for pulling R packages from GitHub
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          lfs: true
      - name: List all files for selected datasets
        shell: bash {0}
        run: |
          for f in ${{ inputs.datasets }}; do
            ls -lh raw_data/$f || true
            ls -lh processed_data/$f || true
          done
        continue-on-error: true
      - name: Python dependencies
        run: pip install -r scripts/Python/requirements.txt
      # needed for every task
      - name: Set java version
        run: echo ("JAVA_HOME=" + $Env:JAVA_HOME_13_X64) >> $env:GITHUB_ENV
      # needed for every task
      - name: Set RENV_PATHS_ROOT
        shell: bash
        run: |
          echo "RENV_PATHS_ROOT=${{ runner.temp }}/renv" >> $GITHUB_ENV
      # needed for every task
      - name: Setup R
        uses: r-lib/actions/setup-r@v2
      # needed for every task
      - name: Restore Renv package cache
        uses: actions/cache@v3
        with:
          path: ${{ env.RENV_PATHS_ROOT }}
          key: ${{ runner.os }}-renv-${{ hashFiles('**/renv.lock') }}
          restore-keys: |
            ${{ runner.os }}-renv-
      # needed for every task
      - name: Install and activate renv
        shell: Rscript {0}
        run: |
          install.packages("renv")
          renv::restore()
      # needed for every task
      - name: Standardize compounds
        run: Rscript scripts/R_ci/compounds_standardize.R ${{ inputs.datasets }}
        if: ${{ inputs.do_standardize }}
      - name: Compounds classyfire classes
        run: Rscript scripts/R_ci/compounds_classyfire.R ${{ inputs.datasets }}
        if: ${{ inputs.do_classyfire }}
      - name: Compounds descriptors
        run: Rscript scripts/R_ci/compounds_descriptors.R ${{ inputs.datasets }}
        if: ${{ inputs.do_descriptors }}
      - name: Compounds fingerprints
        run: Rscript scripts/R_ci/compounds_fingerprints.R ${{ inputs.datasets }}
        if: ${{ inputs.do_fingerprints }}
      - name: Metadata standardization
        run: Rscript scripts/R_ci/metadata_standardize.R ${{ inputs.datasets }}
        if: ${{ inputs.do_metadata }}
      - name: Generate dataset reports
        run: Rscript scripts/R_ci/compounds_overview.R ${{ inputs.datasets }}
      - name: Verify that required files are present
        run: Rscript scripts/R_ci/files_complete.R ${{ inputs.datasets }}
      - name: Update overview table of all datasets
        run: python3 scripts/Python/datasets_overview.py
        continue-on-error: true
      - name: Commit preprocessing
        run: |
          git config --global user.email '[email protected]'
          git config --global user.name 'Github Actions'
          # Use LFS storage of main repository: no push access to fork LFS storage
          # TODO: change once repository is moved
          git add processed_data raw_data
          git commit -m "Preprocessing ${{ inputs.datasets }}" -m "Tasks:
          - standardize compounds: ${{ inputs.do_standardize }}
          - compute classyfire classes: ${{ inputs.do_classyfire }}
          - compute descriptors: ${{ inputs.do_descriptors }}
          - compute fingerprints: ${{ inputs.do_fingerprints }}
          - standardize metadata: ${{ inputs.do_metadata }}"
          git lfs push origin HEAD # first push LFS, otherwise failure because of lfs.url
          git push origin HEAD
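
# The same dispatch can also be triggered through the REST API (a sketch; OWNER, REPO,
# WORKFLOW_FILE, and the branch name are placeholders, and a token with workflow
# permissions is assumed):
#   curl -X POST \
#     -H "Authorization: Bearer $GITHUB_TOKEN" \
#     -H "Accept: application/vnd.github+json" \
#     https://api.github.com/repos/OWNER/REPO/actions/workflows/WORKFLOW_FILE/dispatches \
#     -d '{"ref": "main", "inputs": {"datasets": "0001 0002 0044"}}'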