Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Timstof support #41

Open
wants to merge 42 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
06eb8ea
initial commit on the branch
jspaezp Jun 26, 2023
c4c1591
added params to the encyclopedia run
jspaezp Jun 27, 2023
6a19f13
(wip) addition of skyline
jspaezp Jul 21, 2023
96a8b3e
added branching to the skyline quant step
jspaezp Jul 21, 2023
0a76604
(wip) improvement on skyline job passing
jspaezp Jul 21, 2023
84dc60a
made blib conversion 2 step
jspaezp Jul 24, 2023
fa5bf4b
Update test.yml
jspaezp Jul 24, 2023
d9706ce
minor addition to tests cicd
jspaezp Jul 24, 2023
df8e1eb
updated docker image
jspaezp Jul 24, 2023
2df5d7a
moved skyline logic out of agg
jspaezp Jul 24, 2023
f0559fa
added skyline template asset
jspaezp Jul 24, 2023
f5b0d7f
handled no-merge use case
jspaezp Jul 24, 2023
a1768ba
added asset
jspaezp Jul 24, 2023
0659a9c
Update test.yml
jspaezp Jul 25, 2023
2b9ea7c
bumped ims conversion version
jspaezp Jul 25, 2023
5ff3335
Merge branch 'feature/timstof_data' of github.com:TalusBio/nf-encyclo…
jspaezp Jul 25, 2023
5f74e08
updated msstats tests for resolutoin
jspaezp Jul 25, 2023
e6f5832
fixed input piping to ims
jspaezp Jul 25, 2023
61c6b0e
changed log handling in skyline steps
jspaezp Jul 25, 2023
427ff63
changed log handling in skylines first step
jspaezp Jul 25, 2023
17c34a6
updated position in skyline adds
jspaezp Jul 25, 2023
f9baf4a
handled decompression
jspaezp Jul 25, 2023
777f65f
handled decompression on grouping stage
jspaezp Jul 25, 2023
9529b39
escaped bash variables
jspaezp Jul 25, 2023
ea29937
bugfix minor typo on file type
jspaezp Jul 25, 2023
88b9d49
bugfix minor typo on file type
jspaezp Jul 25, 2023
1241330
changed way to list files in bash
jspaezp Jul 26, 2023
ca9ecc7
fixed yet another bug in the bash side of the workflow
jspaezp Jul 26, 2023
585b568
changed val to path in skyline step
jspaezp Jul 26, 2023
4c7d79f
added debugging prints
jspaezp Jul 26, 2023
efd04ef
yet another fix
jspaezp Jul 26, 2023
829081b
added more debugging info and added handling of raw files
jspaezp Jul 26, 2023
5b76bfe
handled empty else clause
jspaezp Jul 26, 2023
362dadb
added a lot of extra params
jspaezp Jul 26, 2023
b5cd217
added unused parameter
jspaezp Jul 26, 2023
39bfc35
fixed name in a parameter
jspaezp Jul 26, 2023
f70505b
fixed name in a parameter
jspaezp Jul 26, 2023
698b31d
fixed name in a parameter
jspaezp Jul 26, 2023
091a088
moved parameters from termplate to import
jspaezp Jul 26, 2023
d30a1d0
added rt filter
jspaezp Jul 26, 2023
f433d2b
moved ims params to sky template creation
jspaezp Jul 28, 2023
ddae2d6
added mem logging and decoys
jspaezp Jul 28, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3

- name: Create latest tag
uses: Actions-R-Us/actions-tagger@latest
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ jobs:
- name: Run Unit Tests
run: |
REMOTE=/app/nf-encyclopedia
docker run -v $(pwd):${REMOTE} -w ${REMOTE} nf-encyclopedia:latest \
pytest tests/unit_tests
docker run -v $(pwd):${REMOTE} nf-encyclopedia:latest \
pytest ${REMOTE}/tests/unit_tests

- name: Run System Tests
run: |
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.nextflow
venv
.Rhistory
.idea/**
.gradle
Expand Down
69 changes: 40 additions & 29 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,66 +1,77 @@
FROM --platform=linux/amd64 mambaorg/micromamba:latest as micromamba
# First Stage of the build, gets the jar for encyclopedia
FROM --platform=linux/amd64 openjdk:8-jre
# FROM --platform=linux/amd64 ibmjava:11
# FROM --platform=linux/amd64 amazoncorretto:11-al2023-headless # Uses yum for package management.
FROM --platform=linux/amd64 nextflow/nextflow:23.04.2

ARG VERSION=2.12.30
ENV VERSION ${VERSION}
LABEL authors="[email protected]" \
description="Docker image for most of nf-encyclopedia"


# Install procps so that Nextflow can poll CPU usage and
# deep clean the apt cache to reduce image/layer size
# RUN apt-get install -y procps sqlite3 libgomp1 \
# && apt-get clean -y && rm -rf /var/lib/apt/lists/*
RUN yum install -y wget

RUN apt-get update && \
apt-get -y upgrade && \
apt-get -y install libgomp1 && \
apt-get clean

WORKDIR /code
# First Stage of the build, gets the jar for encyclopedia
RUN wget https://bitbucket.org/searleb/encyclopedia/downloads/encyclopedia-${VERSION}-executable.jar
WORKDIR /app

LABEL authors="[email protected]" \
description="Docker image for most of nf-encyclopedia"
# # Install nextflow
# RUN wget -qO- https://get.nextflow.io | bash
# RUN chmod +x nextflow
# RUN mv nextflow /usr/local/bin/.

# Install procps so that Nextflow can poll CPU usage and
# deep clean the apt cache to reduce image/layer size
RUN apt-get update \
&& apt-get install -y procps sqlite3 \
&& apt-get clean -y && rm -rf /var/lib/apt/lists/*
WORKDIR /app

# Setup micromamba:
ARG MAMBA_USER=mamba
ARG MAMBA_USER=root
ARG MAMBA_USER_ID=1000
ARG MAMBA_USER_GID=1000
ARG MAMBA_DOCKERFILE_ACTIVATE=1
ENV MAMBA_USER=$MAMBA_USER
ENV MAMBA_ROOT_PREFIX="/opt/conda"
ENV MAMBA_EXE="/bin/micromamba"
RUN mkdir -p ${MAMBA_ROOT_PREFIX}

COPY --from=micromamba "$MAMBA_EXE" "$MAMBA_EXE"
COPY --from=micromamba /usr/local/bin/_activate_current_env.sh /usr/local/bin/_activate_current_env.sh
COPY --from=micromamba /usr/local/bin/_dockerfile_shell.sh /usr/local/bin/_dockerfile_shell.sh
COPY --from=micromamba /usr/local/bin/_entrypoint.sh /usr/local/bin/_entrypoint.sh
COPY --from=micromamba /usr/local/bin/_entrypoint.sh /usr/local/bin/mamba_entrypoint.sh
COPY --from=micromamba /usr/local/bin/_activate_current_env.sh /usr/local/bin/_activate_current_env.sh
COPY --from=micromamba /usr/local/bin/_dockerfile_initialize_user_accounts.sh /usr/local/bin/_dockerfile_initialize_user_accounts.sh
COPY --from=micromamba /usr/local/bin/_dockerfile_setup_root_prefix.sh /usr/local/bin/_dockerfile_setup_root_prefix.sh

RUN /usr/local/bin/_dockerfile_initialize_user_accounts.sh && \
/usr/local/bin/_dockerfile_setup_root_prefix.sh
# No need to set up accounts if we will run as root ...
# RUN /usr/local/bin/_dockerfile_initialize_user_accounts.sh &&

RUN /usr/local/bin/_dockerfile_setup_root_prefix.sh

# Setup the environment
USER root
USER $MAMBA_USER
COPY environment.yml /tmp/environment.yml

# Instruct R processes to use these empty files instead of
# clashing with a local one
RUN touch .Rprofile .Renviron

# Create the environment
RUN micromamba install -y -n base -f /tmp/environment.yml && \
micromamba clean --all --yes
# Set the path. NextFlow seems to circumvent the conda environment
# We also need to set options for the JRE here.
ENV PATH="$MAMBA_ROOT_PREFIX/bin:$PATH:/bin" _JAVA_OPTIONS="-Djava.awt.headless=true" VERSION=$VERSION

# Setup the EncyclopeDIA executable:
RUN ln -s /code/encyclopedia-$VERSION-executable.jar /code/encyclopedia.jar

# Set the path. NextFlow seems to circumvent the conda environment
# We also need to set options for the JRE here.
ENV PATH="$MAMBA_ROOT_PREFIX/bin:$PATH" _JAVA_OPTIONS="-Djava.awt.headless=true" VERSION=$VERSION

# Create the entrypoint:
SHELL ["/usr/local/bin/_dockerfile_shell.sh"]
ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"]
CMD []
# SHELL ["/usr/local/bin/_dockerfile_shell.sh"]
# ENTRYPOINT ["/usr/local/bin/mamba_entrypoint.sh", "/usr/local/bin/entry.sh"]

# Create the environment
RUN micromamba install -y -n base -f /tmp/environment.yml && \
micromamba clean --all --yes

CMD ["/bin/bash"]

Binary file added assets/template.sky.zip
Binary file not shown.
12 changes: 6 additions & 6 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ dependencies:
- pytest
- numpy
- pandas
- bioconductor-msstats=4.2.0
- r-rlang
- r-dplyr
- r-tidyr
- r-magrittr
- r-stringr
- bioconda::bioconductor-msstats
- conda-forge::r-rlang
- conda-forge::r-dplyr
- conda-forge::r-tidyr
- r::r-magrittr
- r::r-stringr
52 changes: 50 additions & 2 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
nextflow.enable.dsl = 2

// Subworkflows
include { CONVERT_TO_MZML } from "./subworkflows/msconvert"
include { CONVERT_TO_MZML } from "./subworkflows/convert"
include {
BUILD_CHROMATOGRAM_LIBRARY;
PERFORM_QUANT;
Expand All @@ -12,6 +12,12 @@ include {

// Modules
include { MSSTATS } from "./modules/msstats"
include { ADD_IMS_INFO } from "./modules/ims"
include {
SKYLINE_ADD_LIB;
SKYLINE_IMPORT_DATA;
SKYLINE_MERGE_RESULTS
} from "./modules/skyline"


//
Expand Down Expand Up @@ -49,6 +55,7 @@ workflow {
input = file(params.input, checkIfExists: true)
fasta = file(params.fasta, checkIfExists: true)
dlib = file(params.dlib, checkIfExists: true)
skyline_empty_template = file("$baseDir/assets/template.sky.zip", checkIfExists: true)

// Optional contrasts arg:
if ( params.contrasts != null ) {
Expand All @@ -74,6 +81,16 @@ workflow {
error "No MS data files were provided. Nothing to do."
}

// Raw Mass Spec files (raw including .raw or .d/.tar)
// These files will be used later to quant using skyline.
// This also filters out files that are chromatogram libraries
ms_files.runs
| join(ms_files.meta)
| filter { !it[1] }
| map { it[0] }
| filter( ~/^.*((.raw)|(.d.tar))$/ )
| set { raw_quant_files }

// Convert raw files to gzipped mzML and group them by experiment.
// The chrlib and quant channels take the following form:
// [[file_ids], [mzml_gz_files], is_chrlib, group]
Expand Down Expand Up @@ -107,13 +124,42 @@ workflow {
PERFORM_QUANT(quant_files, dlib, fasta, params.aggregate)
| set { quant_results }

quant_results.local
| map { it[0] }
| set { groups }

// Add IMS info to the blib
ADD_IMS_INFO(groups, quant_results.blib)
| set { blib }

SKYLINE_ADD_LIB(skyline_empty_template, fasta, blib)
| set { skyline_template_zipfile }

// This will generate a skyd for every raw data file
SKYLINE_IMPORT_DATA(
skyline_template_zipfile.skyline_zipfile,
raw_quant_files,
)
| set { skyline_import_results }

SKYLINE_MERGE_RESULTS(
skyline_template_zipfile.skyline_zipfile,
skyline_import_results.skyd_file.collect(),
raw_quant_files.collect(),
)
| set { skyline_merge_results }

skyline_merge_results.final_skyline_zipfile.view()

// Perform an aggregate analysis on all files if needed:
if ( params.aggregate ) {
// Aggregate quantitative runs with EncyclopeDIA.
// The output has one channel:
// global -> [agg_name, peptides, proteins] or null
// global -> [agg_name, peptides_txt, proteins_txt] or null
// lib -> blib
PERFORM_AGGREGATE_QUANT(quant_results.local, dlib, fasta)
| set { enc_results }

} else {
quant_results | set{ enc_results }
}
Expand All @@ -122,6 +168,8 @@ workflow {
if ( params.msstats.enabled ) {
MSSTATS(enc_results.global, input, contrasts)
}

//
}


Expand Down
61 changes: 61 additions & 0 deletions modules/convert.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Convert a vendor raw MS file into a gzipped mzML using ProteoWizard's
// msconvert, run under wine (msconvert is a Windows binary). Peak picking
// is applied to all MS levels ("1-"); demultiplexing is added only when
// params.msconvert.demultiplex is set.
process MSCONVERT {
// Publish the converted file under the per-run output directory.
publishDir "${params.mzml_dir}/${outputDir}", failOnError: true
label 'process_low_constant'
// Conversion is retried on failure (transient wine/msconvert errors).
label 'error_retry'

input:
// file_id   - identifier carried through to downstream channels
// raw_input - the vendor raw file to convert
// outputDir - subdirectory of params.mzml_dir to publish into
tuple val(file_id), path(raw_input), val(outputDir)

output:
// Gzipped mzML named after the input file's stem.
tuple val(file_id), path("${raw_input.baseName}.mzML.gz")

script:
"""
wine msconvert \\
-v \\
--gzip \\
--mzML \\
--64 \\
--zlib \\
--filter "peakPicking true 1-" \\
${params.msconvert.demultiplex ? '--filter "demultiplex optimization=overlap_only"' : ''} \\
${raw_input}
"""

// Stub for -stub-run: create an empty placeholder with the expected name.
stub:
"""
touch ${raw_input.baseName}.mzML.gz
"""
}


// Convert a tarred Bruker timsTOF directory (*.d.tar) into a gzipped mzML
// using the tdf2mzml container. The archive is unpacked first, then the
// resulting *.d directory is converted and the mzML is renamed/compressed.
process TDF2MZML {
// Only the compressed mzML is published, not the unpacked .d contents.
publishDir "${params.mzml_dir}/${outputDir}", pattern: "*.mzML.gz", failOnError: true
// NOTE(review): pinned to :latest because no stable tag is known — consider
// pinning a digest for reproducibility.
container 'mfreitas/tdf2mzml:latest' // I don't know which stable tag to use...
label 'process_single'
label 'error_retry'

input:
// tdf_input - a .d.tar archive; outputDir - publish subdirectory
tuple val(file_id), path(tdf_input), val(outputDir)

output:
// file(tdf_input.baseName).baseName strips both extensions
// ("run.d.tar" -> "run"), so the output is "run.mzML.gz".
tuple val(file_id), path("${file(tdf_input.baseName).baseName}.mzML.gz")

script:
"""
echo "Unpacking..."
tar -xvf ${tdf_input}

echo "Converting..."
tdf2mzml.py -i *.d # --ms1_type "centroid"

echo "Compressing..."
mv *.mzml ${file(tdf_input.baseName).baseName}.mzML
gzip ${file(tdf_input.baseName).baseName}.mzML
"""

// Stub for -stub-run: placeholder file with the expected double-stripped name.
stub:
"""
touch ${file(tdf_input.baseName).baseName}.mzML.gz
"""
}
8 changes: 7 additions & 1 deletion modules/encyclopedia.nf
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def stem(suffix) {
process ENCYCLOPEDIA_SEARCH {
publishDir "${params.result_dir}/${group}/elib", pattern: '*.elib', failOnError: true
publishDir "${params.result_dir}/${group}/logs", pattern: '*.log', failOnError: true
label 'process_medium'
label 'process_high'

input:
tuple val(group), path(mzml_gz_file)
Expand Down Expand Up @@ -70,6 +70,7 @@ process ENCYCLOPEDIA_SEARCH {

process ENCYCLOPEDIA_AGGREGATE {
publishDir "${params.result_dir}/${group}/elib", pattern: '*.elib', failOnError: true
publishDir "${params.result_dir}/${group}/blib", pattern: '*.blib', failOnError: true
publishDir "${params.result_dir}/${group}/logs", pattern: '*.log', failOnError: true
publishDir "${params.result_dir}/${group}/results", pattern: '*.txt', failOnError: true
publishDir "${params.result_dir}/${group}/reports", pattern: '*.csv', failOnError: true
Expand All @@ -92,6 +93,7 @@ process ENCYCLOPEDIA_AGGREGATE {
tuple(
val(group),
path("${stem(output_suffix)}.elib"),
path("${stem(output_suffix)}.blib"),
path("${stem(output_suffix)}.global.log"),
path("${output_suffix}_detection_summary.csv"),
emit: "lib"
Expand Down Expand Up @@ -122,6 +124,9 @@ process ENCYCLOPEDIA_AGGREGATE {
-a ${align} \\
| tee ${stem(output_suffix)}.global.log

${execEncyclopedia(task.memory)} \\
-convert -libtoblib -i ${stem(output_suffix)}.elib

# Better file names:
if [ "${align}" = true ]; then
mv ${stem(output_suffix)}.elib.peptides.txt ${stem(output_suffix)}.peptides.txt
Expand All @@ -140,6 +145,7 @@ process ENCYCLOPEDIA_AGGREGATE {
stub:
"""
touch ${stem(output_suffix)}.elib
touch ${stem(output_suffix)}.blib

if [ "${align}" = true ]; then
touch ${stem(output_suffix)}.peptides.txt
Expand Down
24 changes: 24 additions & 0 deletions modules/ims.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

// Add ion-mobility (IMS) values to a BiblioSpec library (.blib) with
// flimsay, producing a *.ims.blib consumed by the Skyline steps.
process ADD_IMS_INFO {
// 'group' is only used here, to pick the publish directory for the library.
publishDir "${params.result_dir}/${group}/blib", pattern: '*.ims.blib', failOnError: true
label 'process_medium'
// flimsay ships in its own image rather than the main pipeline image.
container 'ghcr.io/talusbio/flimsay:v0.4.0'

input:
// group - experiment group name (publish path only)
// blib  - the input BiblioSpec library to annotate
val group
path blib

output:
// The IMS-annotated library (always written as blib.ims.blib).
path("*.ims.blib"), emit: blib

script:
"""
flimsay fill_blib ${blib} blib.ims.blib
"""

// Stub for -stub-run: emit an empty library with the expected name.
stub:
"""
echo "${blib}"
touch blib.ims.blib
"""
}
Loading
Loading