Made gene_id_file mandatory. Added instructions to create it manually in docs

Marcel-Mueck · web-flow · commit 46f0247ddf0f · 2025-03-10T16:42:12.000+01:00
diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py
@@ -1988,7 +1988,7 @@ def add_gene_ids(gene_id_file: str, annotations_path: str, out_file: str):
     """
     genes = pd.read_parquet(gene_id_file)
     genes[["gene_base", "feature"]] = genes["gene"].str.split(".", expand=True)
-    genes.drop(columns=["feature", "gene", "gene_name", "gene_type"], inplace=True)
+    genes=genes[['id','gene_base']]
     genes.rename(columns={"id": "gene_id"}, inplace=True)
     annotations = pd.read_parquet(annotations_path)
     len_anno = len(annotations)
diff --git a/docs/annotations.md b/docs/annotations.md
@@ -27,7 +27,10 @@ Download paths:
 - [PrimateAI](https://basespace.illumina.com/s/yYGFdGih1rXL) PrimateAI supplementary data/"PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz"
 - [AlphaMissense](https://storage.googleapis.com/dm_alphamissense/AlphaMissense_hg38.tsv.gz) 
 
-Also a reference GTF file containing transcript annotations is required, this can be downloaded from [here](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.annotation.gtf.gz)
+Further requirements:
+- A reference GTF file containing transcript annotations is required, this can be downloaded from [here](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.annotation.gtf.gz). 
+- A file containing all genes that deeprvat should consider, together with a unique integer id for each gene. This file may be created manually by the user, or generated automatically from the GTF file, in which case it contains all protein coding genes. See [here](#geneid) for more details.
+
 
 
 ## Configure the annotation pipeline
@@ -38,6 +41,7 @@ The config above would use the following directory structure:
 |--reference
 |   |-- fasta file
 |   |-- GTF file 
+|   |-- gene id file
 
 |-- preprocessing_workdir
 |   |-- norm
@@ -80,6 +84,7 @@ A GTF file as described in [requirements](#requirements) and the FASTA file used
 The output is stored in the `output_dir/annotations` folder and any temporary files in the `tmp` subfolder. All repositories used including VEP with its corresponding cache as well as plugins are stored in `repo_dir`.
 Data for VEP plugins and the CADD cache are stored in `annotation_data`. 
 
+(running)=
 ## Running the annotation pipeline on example data
 
 
@@ -140,6 +145,22 @@ af_mode : 'af_gnomadg'
 ```
 to the config file.
 
+(geneid)=
+## Gene id file
+As mentioned in the [requirements](#requirements) section, the pipeline expects a parquet file containing all genes that deeprvat should consider, together with a unique integer id for each gene.
+This file can be created automatically using a GTF file as input. The output is then a parquet file in the expected format containing all protein coding genes of the provided GTF file.
+To automatically create the gene id file, make sure the annotation environment (mentioned [here](#running)) is active and run
+```
+deeprvat_annotations create-gene-id-file deeprvat/example/annotations/reference/gencode.v44.annotation.gtf.gz deeprvat/example/annotations/reference/protein_coding_genes.parquet
+```  
+with `deeprvat/example/annotations/reference/gencode.v44.annotation.gtf.gz` pointing to any downloaded GTF file and `deeprvat/example/annotations/reference/protein_coding_genes.parquet` pointing to the desired output path, which has to be specified in the config file. 
+
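+Alternatively, when the user wants to select a specific set of genes to consider, the gene id file may be created manually. The file is expected to have two columns:
+- column `gene`: `str` identifier for each gene. `add_gene_ids` splits this value on `.` into a base id and a suffix, so it is expected to contain exactly one `.` (e.g. a versioned gene id such as `ENSG00000000003.15`)
+- column `id`: `int` unique id for each gene
+Each row represents a gene the user wants to include in the analysis.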
+
+
 ## References
 
 (reference-1-target)=
diff --git a/example/config/deeprvat_annotation_config.yaml b/example/config/deeprvat_annotation_config.yaml
@@ -25,6 +25,7 @@ kipoiveff_repo_dir : repo_dir/kipoi-veff2
 faatpipe_repo_dir : repo_dir/faatpipe
 vep_repo_dir : repo_dir/ensembl-vep
 preprocessing_workdir : ../preprocess/workdir
+gene_id_parquet: reference/protein_coding_genes.parquet
 additional_vep_plugin_cmds:
   cadd : CADD,annotation_data/cadd/whole_genome_SNVs.tsv.gz,annotation_data/cadd/gnomad.genomes.r3.0.indel.tsv.gz
   spliceAI : SpliceAI,snv=annotation_data/spliceAI/spliceai_scores.raw.snv.hg38.vcf.gz,indel=annotation_data/spliceAI/spliceai_scores.raw.indel.hg38.vcf.gz
diff --git a/example/config/deeprvat_annotation_config_minimal.yaml b/example/config/deeprvat_annotation_config_minimal.yaml
@@ -6,6 +6,7 @@
 fasta_dir : reference
 fasta_file_name : GRCh38.primary_assembly.genome.fa
 gtf_file_name : gencode.v44.annotation.gtf.gz
+gene_id_parquet: reference/protein_coding_genes.parquet
 
 source_variant_file_pattern :  chr{chr}test
 source_variant_file_type: 'bcf'
@@ -24,6 +25,7 @@ kipoiveff_repo_dir : repo_dir/kipoi-veff2
 faatpipe_repo_dir : repo_dir/faatpipe
 vep_repo_dir : repo_dir/ensembl-vep
 preprocessing_workdir : preprocessing_workdir
+
 include_absplice : False
 include_deepSEA : False
 vep_online: True
diff --git a/pipelines/annotations.snakefile b/pipelines/annotations.snakefile
@@ -44,7 +44,7 @@ genome_assembly = config.get("genome_assembly") or "GRCh38"
 fasta_dir = Path(config["fasta_dir"])
 fasta_file_name = config["fasta_file_name"]
 gtf_file = fasta_dir / config["gtf_file_name"]
-gene_id_file = config.get("gene_id_parquet")
+gene_id_file = config["gene_id_parquet"]
 
 deeprvat_parent_path = Path(config["deeprvat_repo_dir"])
 annotation_python_file = (
@@ -191,20 +191,6 @@ rule all:
         chckpt = anno_dir / 'chckpts' / 'select_rename_fill_columns.chckpt',
         annotations = anno_dir / 'annotations.parquet'
 
-if not gene_id_file:
-    gene_id_file = fasta_dir / "protein_coding_genes.parquet"
-
-    rule create_gene_id_file:
-        input:
-            gtf_file,
-        output:
-            gene_id_file,
-        resources:
-            mem_mb=lambda wildcards, attempt: 15_000 * (attempt + 1),
-        shell:
-            " ".join(
-                [f"deeprvat_annotations", "create-gene-id-file", "{input}", "{output}"]
-            )
 
 rule extract_with_header:
     input: