-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
update scripts/description on preparing reference files (#7)
1. scripts: ref/prep_ref_files.py 2. update params.known_sites to use PASS variants from pf crosses 1.0 vcf 3. update main Readme.md file to provide options to generate all ref file by users 4. update environment.yaml file to include requests package for downloading
- Loading branch information
Showing
7 changed files
with
195 additions
and
105 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,18 +10,26 @@ conda install -c bioconda nextflow | |
git clone [email protected]:gbinux/snp_call_nf.git | ||
cd snp_call_nf | ||
``` | ||
4. Link the reference folder. | ||
4. Link the reference files or prepare them by yourself | ||
|
||
On IGS server | ||
- Link the ref files on IGS server | ||
``` | ||
ln -s /local/projects-t3/toconnor_grp/bing.guo/ref/* ref/ | ||
``` | ||
|
||
On Rosalind, the reference file can be linked by running | ||
- Link the ref files on Rosalind, the reference file can be linked by running | ||
``` | ||
ln -s /local/data/Malaria/Projects/Takala-Harrison/Cambodia_Bing/ref/* ref/ | ||
``` | ||
|
||
- Prepare ref file by yourself: | ||
``` | ||
conda create env -f environment.yaml | ||
conda activate snp_call_nf | ||
cd ref | ||
python3 prep_ref_files.py | ||
``` | ||
|
||
5. Run the pipeline | ||
- Test it on HPC (local): `nextflow main.nf` | ||
- Test it on SGE server: `nextflow main.nf -profile sge` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,3 +9,4 @@ dependencies: | |
- bedtools=2.30.0 | ||
- samtools=1.13 | ||
- bcftools=1.13 | ||
- requests # for downloading files |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
import requests | ||
import ftplib | ||
from pathlib import Path | ||
from subprocess import run | ||
|
||
|
||
class URLS: | ||
genomes = [ | ||
{ | ||
"prefix": "host/hg38", | ||
"url": "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz", | ||
"local_fn": "host/hg38.fa.gz", | ||
"fasta_fn": "host/hg38.fasta", | ||
}, | ||
{ | ||
"prefix": "PlasmoDB-44_Pfalciparum3D7_Genome", | ||
"url": "https://plasmodb.org/common/downloads/release-44/Pfalciparum3D7/fasta/data/PlasmoDB-44_Pfalciparum3D7_Genome.fasta", | ||
"local_fn": "PlasmoDB-44_Pfalciparum3D7_Genome.fasta", | ||
"fasta_fn": "PlasmoDB-44_Pfalciparum3D7_Genome.fasta", | ||
}, | ||
] | ||
pf_crosses_v1 = { | ||
"server": "ngs.sanger.ac.uk", | ||
"remotedir": "/production/malaria/pf-crosses/1.0/", | ||
"vcf_files": "3d7_hb3.gatk.final.vcf.gz 7g8_gb4.gatk.final.vcf.gz hb3_dd2.gatk.final.vcf.gz".split(), | ||
"orig_files": [ | ||
"pf_crosses_v1/3d7_hb3.gatk.final.vcf.gz", | ||
"pf_crosses_v1/7g8_gb4.gatk.final.vcf.gz", | ||
"pf_crosses_v1/hb3_dd2.gatk.final.vcf.gz", | ||
], | ||
"filt_files": [ | ||
"pf_crosses_v1/3d7_hb3.gatk.final.pass.vcf.gz", | ||
"pf_crosses_v1/7g8_gb4.gatk.final.pass.vcf.gz", | ||
"pf_crosses_v1/hb3_dd2.gatk.final.pass.vcf.gz", | ||
], | ||
"known_variants_vcf": "pf_crosses_v1/known_variants.vcf", | ||
} | ||
|
||
|
||
def download_genome_fasta_files(): | ||
for genome in URLS.genomes: | ||
# local file | ||
url: str = genome["url"] | ||
local_fn: str = genome["local_fn"] | ||
target_fn: str = genome["fasta_fn"] | ||
|
||
Path(local_fn).parent.mkdir(parents=True, exist_ok=True) | ||
|
||
# skip if already exists | ||
if Path(target_fn).exists(): | ||
continue | ||
|
||
with requests.get(url, allow_redirects=True, stream=True) as response: | ||
response.raise_for_status() | ||
with open(local_fn, "wb") as f: | ||
for chunk in response.iter_content(chunk_size=8192): | ||
f.write(chunk) | ||
|
||
if local_fn.endswith(".gz"): | ||
res = run( | ||
f"""gzip -dc {local_fn} > {target_fn}; rm {local_fn}""", shell=True | ||
) | ||
assert res.returncode == 0 | ||
|
||
|
||
def index_fasta_files(): | ||
for genome in URLS.genomes: | ||
fa_prefix = genome["prefix"] | ||
fa_fn = genome["fasta_fn"] | ||
# bt2 indices | ||
if not Path(fa_prefix + ".1.bt2").exists(): | ||
print(Path(fa_prefix + ".1.bt2")) | ||
cmd = f"bowtie2-build --threads 20 {fa_prefix}.fasta {fa_prefix}" | ||
print(f"\t building bt2 indices for {fa_fn}") | ||
res = run(cmd, shell=True, capture_output=True) | ||
if res.returncode != 0: | ||
print(res.stderr) | ||
exit(-1) | ||
# fai index | ||
if not Path(fa_prefix + ".fasta.fai").exists(): | ||
cmd = f"samtools faidx {fa_prefix}.fasta" | ||
print(f"\t building fai index for {fa_fn}") | ||
res = run(cmd, shell=True, capture_output=True) | ||
if res.returncode != 0: | ||
print(res.stderr) | ||
exit(-1) | ||
# dict index | ||
if not Path(fa_prefix + ".dict").exists(): | ||
cmd = f""" | ||
gatk --java-options "-Xmx10G" CreateSequenceDictionary -R {fa_prefix}.fasta -O {fa_prefix}.dict | ||
""" | ||
print(f"\t building dict index for {fa_fn}") | ||
res = run(cmd, shell=True, capture_output=True) | ||
if res.returncode != 0: | ||
print(res.stderr) | ||
exit(-1) | ||
|
||
|
||
def download_pf_crosses_v1_vcfs(): | ||
# file names and folders | ||
remote_filename_lst = URLS.pf_crosses_v1["vcf_files"] | ||
local_filename_lst = URLS.pf_crosses_v1["orig_files"] | ||
for fn in local_filename_lst: | ||
Path(fn).parent.mkdir(parents=True, exist_ok=True) | ||
|
||
# DOWNLOAD VCF from pfcrosses1.0 | ||
ftp_server = ftplib.FTP(URLS.pf_crosses_v1["server"]) | ||
ftp_server.login() | ||
ftp_server.cwd(URLS.pf_crosses_v1["remotedir"]) | ||
for remote_filename, local_filename in zip(remote_filename_lst, local_filename_lst): | ||
if not Path(local_filename).exists(): | ||
with open(local_filename, "wb") as file: | ||
print("download " + remote_filename) | ||
ftp_server.retrbinary("RETR " + remote_filename, file.write) | ||
ftp_server.quit() | ||
|
||
|
||
def known_variants_from_pf_crosses_v1(): | ||
local_filename_lst = URLS.pf_crosses_v1["orig_files"] | ||
filt_filename_lst = URLS.pf_crosses_v1["filt_files"] | ||
|
||
# keep pass only | ||
for origf, passf in zip(local_filename_lst, filt_filename_lst): | ||
print("filter " + origf) | ||
res = run( | ||
f""" | ||
bcftools view -f PASS {origf} -Oz -o {passf} | ||
bcftools index {passf} | ||
""", | ||
shell=True, | ||
capture_output=True, | ||
text=True, | ||
) | ||
assert res.returncode == 0 | ||
|
||
# merge vcfs and generate .idx file | ||
print("merge filtered vcfs and generate.idx file") | ||
filt_filename_str = " ".join(filt_filename_lst) | ||
known_variants_vcf = URLS.pf_crosses_v1["known_variants_vcf"] | ||
cmd = ( | ||
f""" | ||
bcftools merge --force-samples {filt_filename_str} -Ov -o {known_variants_vcf} | ||
gatk IndexFeatureFile -I {known_variants_vcf} | ||
""", | ||
) | ||
res = run(cmd, shell=True, capture_output=True, text=True) | ||
if ( | ||
res.returncode != 0 | ||
or "error" in res.stderr.lower() | ||
or "error" in res.stdout.lower() | ||
): | ||
print(res.stderr) | ||
exit(-1) | ||
|
||
|
||
if __name__ == "__main__": | ||
|
||
print("downloading fasta files") | ||
download_genome_fasta_files() | ||
print("Done obtaining fasta files") | ||
|
||
print("indexing will take a long time ....") | ||
index_fasta_files() | ||
print("Done indexing") | ||
|
||
print("download pf crosses v1.0 vcf files") | ||
download_pf_crosses_v1_vcfs() | ||
print("Done download pf crosses 1.0") | ||
|
||
print("generating known variants vcf from pf crosses 1.0") | ||
known_variants_from_pf_crosses_v1() | ||
print("Done known variants vcf") |
3fc4855
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
close #7
and provide an alternative way to address the first task of #3