update scripts/description on preparing reference files (#7)
1. Add script ref/prep_ref_files.py for preparing the reference files
2. Update params.known_sites to use PASS variants from the Pf Crosses 1.0 VCF
3. Update the main Readme.md to give users the option to generate all ref files themselves
4. Update environment.yaml to include the requests package for downloading
bguo068 committed Dec 10, 2022
1 parent 35aa7d9 commit 3fc4855
Showing 7 changed files with 195 additions and 105 deletions.
14 changes: 11 additions & 3 deletions Readme.md
@@ -10,18 +10,26 @@ conda install -c bioconda nextflow
git clone git@github.com:gbinux/snp_call_nf.git
cd snp_call_nf
```
4. Link the reference folder.
4. Link the reference files or prepare them yourself

On IGS server
- Link the ref files on IGS server
```
ln -s /local/projects-t3/toconnor_grp/bing.guo/ref/* ref/
```

On Rosalind, the reference file can be linked by running
- Link the ref files on Rosalind by running
```
ln -s /local/data/Malaria/Projects/Takala-Harrison/Cambodia_Bing/ref/* ref/
```

- Prepare the ref files yourself (a quick check of the generated files is sketched after this diff):
```
conda env create -f environment.yaml
conda activate snp_call_nf
cd ref
python3 prep_ref_files.py
```

5. Run the pipeline
- Test it on HPC (local): `nextflow main.nf`
- Test it on SGE server: `nextflow main.nf -profile sge`
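Before launching the pipeline, it can be worth confirming that the linked or freshly generated reference files are actually in place. Below is a minimal sketch of such a check (not part of the repository); the paths are assumptions based on the defaults in nextflow.config and the outputs of ref/prep_ref_files.py, and the list is illustrative rather than exhaustive.

```
# check_ref.py - hypothetical helper, run from the repository root
from pathlib import Path

expected = [
    "ref/host/hg38.fasta",
    "ref/host/hg38.fasta.fai",
    "ref/host/hg38.dict",
    "ref/host/hg38.1.bt2",
    "ref/PlasmoDB-44_Pfalciparum3D7_Genome.fasta",
    "ref/PlasmoDB-44_Pfalciparum3D7_Genome.fasta.fai",
    "ref/PlasmoDB-44_Pfalciparum3D7_Genome.dict",
    "ref/pf_crosses_v1/known_variants.vcf",
]

missing = [p for p in expected if not Path(p).exists()]
if missing:
    print("missing reference files:")
    for p in missing:
        print("  " + p)
else:
    print("all expected reference files are present")
```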
1 change: 1 addition & 0 deletions environment.yaml
@@ -9,3 +9,4 @@ dependencies:
- bedtools=2.30.0
- samtools=1.13
- bcftools=1.13
- requests # for downloading files
3 changes: 2 additions & 1 deletion main.nf
@@ -5,6 +5,7 @@ nextflow.enable.dsl = 2
def print_parameters() {
println("""======================== PARAMETERS ==========================
fq_map:\t${file(params.fq_map)}
known_variants:\t${params.known_sites}
split:\t${params.split}
filtering:
\thard:\t ${params.hard}
@@ -384,7 +385,7 @@ workflow {
// prepare tmpdir for gatk
def tmpdir = file(params.gatk_tmpdir)
if (!tmpdir.exists()) {
file.mkdirs()
tmpdir.mkdirs()
}

// Prepare input chanel
10 changes: 9 additions & 1 deletion nextflow.config
@@ -8,7 +8,15 @@ params {
fasta = ["$projectDir/ref/host/hg38.fasta"]
fasta_prefix = ["$projectDir/ref/host/hg38"]
}
known_sites = ["$projectDir/ref/known_snps_unsorted.vcf"]

// The original known_snps_unsorted.vcf has 944,270 SNPs (from previous lab members).
// The updated file is pf_crosses_v1/known_variants.vcf, which has 66,121 variants (SNPs/indels).
// The latter is generated following the MalariaGEN Pf6 paper:
// MalariaGEN, et al. (2021). An open dataset of Plasmodium falciparum
// genome variation in 7,000 worldwide samples. Wellcome Open Res 6, 42.
// doi:10.12688/wellcomeopenres.16168.2.

known_sites = ["$projectDir/ref/pf_crosses_v1/known_variants.vcf"]

genome_intervals = [
"chromosomes": [ "Pf3D7_01_v3", "Pf3D7_02_v3", "Pf3D7_03_v3", "Pf3D7_04_v3", "Pf3D7_05_v3",
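The comment in the hunk above quotes record counts for both known-sites files; a quick sanity check on the new default can count its records and confirm that only PASS variants survived the merge. A minimal sketch, assuming the default path from nextflow.config and the bcftools pinned in environment.yaml (the counts are expectations, not guarantees):

```
# sanity check for the merged known-variants VCF (not part of the pipeline)
from subprocess import run

vcf = "ref/pf_crosses_v1/known_variants.vcf"

# total number of records; expected to be about 66,121
total = run(f"bcftools view -H {vcf} | wc -l",
            shell=True, capture_output=True, text=True)

# records whose FILTER is not PASS; expected to be 0 after PASS filtering
non_pass = run(f"bcftools view -H -e 'FILTER=\"PASS\"' {vcf} | wc -l",
               shell=True, capture_output=True, text=True)

print("total records:   ", total.stdout.strip())
print("non-PASS records:", non_pass.stdout.strip())
```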
51 changes: 0 additions & 51 deletions ref/Readme.md

This file was deleted.

49 changes: 0 additions & 49 deletions ref/prep_known_variant_vcf_pf.py

This file was deleted.

172 changes: 172 additions & 0 deletions ref/prep_ref_files.py
@@ -0,0 +1,172 @@
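"""Prepare reference files for the snp_call_nf pipeline.

Downloads the hg38 and P. falciparum 3D7 genome FASTAs, builds the
bowtie2/faidx/sequence-dictionary indices, fetches the Pf Crosses 1.0 VCFs,
and merges their PASS variants into pf_crosses_v1/known_variants.vcf.
"""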
import requests
import ftplib
from pathlib import Path
from subprocess import run


class URLS:
genomes = [
{
"prefix": "host/hg38",
"url": "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz",
"local_fn": "host/hg38.fa.gz",
"fasta_fn": "host/hg38.fasta",
},
{
"prefix": "PlasmoDB-44_Pfalciparum3D7_Genome",
"url": "https://plasmodb.org/common/downloads/release-44/Pfalciparum3D7/fasta/data/PlasmoDB-44_Pfalciparum3D7_Genome.fasta",
"local_fn": "PlasmoDB-44_Pfalciparum3D7_Genome.fasta",
"fasta_fn": "PlasmoDB-44_Pfalciparum3D7_Genome.fasta",
},
]
pf_crosses_v1 = {
"server": "ngs.sanger.ac.uk",
"remotedir": "/production/malaria/pf-crosses/1.0/",
"vcf_files": "3d7_hb3.gatk.final.vcf.gz 7g8_gb4.gatk.final.vcf.gz hb3_dd2.gatk.final.vcf.gz".split(),
"orig_files": [
"pf_crosses_v1/3d7_hb3.gatk.final.vcf.gz",
"pf_crosses_v1/7g8_gb4.gatk.final.vcf.gz",
"pf_crosses_v1/hb3_dd2.gatk.final.vcf.gz",
],
"filt_files": [
"pf_crosses_v1/3d7_hb3.gatk.final.pass.vcf.gz",
"pf_crosses_v1/7g8_gb4.gatk.final.pass.vcf.gz",
"pf_crosses_v1/hb3_dd2.gatk.final.pass.vcf.gz",
],
"known_variants_vcf": "pf_crosses_v1/known_variants.vcf",
}


def download_genome_fasta_files():
for genome in URLS.genomes:
# local file
url: str = genome["url"]
local_fn: str = genome["local_fn"]
target_fn: str = genome["fasta_fn"]

Path(local_fn).parent.mkdir(parents=True, exist_ok=True)

# skip if already exists
if Path(target_fn).exists():
continue

with requests.get(url, allow_redirects=True, stream=True) as response:
response.raise_for_status()
with open(local_fn, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)

if local_fn.endswith(".gz"):
res = run(
f"""gzip -dc {local_fn} > {target_fn}; rm {local_fn}""", shell=True
)
assert res.returncode == 0


def index_fasta_files():
for genome in URLS.genomes:
fa_prefix = genome["prefix"]
fa_fn = genome["fasta_fn"]
# bt2 indices
if not Path(fa_prefix + ".1.bt2").exists():
print(Path(fa_prefix + ".1.bt2"))
cmd = f"bowtie2-build --threads 20 {fa_prefix}.fasta {fa_prefix}"
print(f"\t building bt2 indices for {fa_fn}")
res = run(cmd, shell=True, capture_output=True)
if res.returncode != 0:
print(res.stderr)
exit(-1)
# fai index
if not Path(fa_prefix + ".fasta.fai").exists():
cmd = f"samtools faidx {fa_prefix}.fasta"
print(f"\t building fai index for {fa_fn}")
res = run(cmd, shell=True, capture_output=True)
if res.returncode != 0:
print(res.stderr)
exit(-1)
# dict index
if not Path(fa_prefix + ".dict").exists():
cmd = f"""
gatk --java-options "-Xmx10G" CreateSequenceDictionary -R {fa_prefix}.fasta -O {fa_prefix}.dict
"""
print(f"\t building dict index for {fa_fn}")
res = run(cmd, shell=True, capture_output=True)
if res.returncode != 0:
print(res.stderr)
exit(-1)


def download_pf_crosses_v1_vcfs():
# file names and folders
remote_filename_lst = URLS.pf_crosses_v1["vcf_files"]
local_filename_lst = URLS.pf_crosses_v1["orig_files"]
for fn in local_filename_lst:
Path(fn).parent.mkdir(parents=True, exist_ok=True)

# download VCFs from Pf Crosses 1.0
ftp_server = ftplib.FTP(URLS.pf_crosses_v1["server"])
ftp_server.login()
ftp_server.cwd(URLS.pf_crosses_v1["remotedir"])
for remote_filename, local_filename in zip(remote_filename_lst, local_filename_lst):
if not Path(local_filename).exists():
with open(local_filename, "wb") as file:
print("download " + remote_filename)
ftp_server.retrbinary("RETR " + remote_filename, file.write)
ftp_server.quit()


def known_variants_from_pf_crosses_v1():
local_filename_lst = URLS.pf_crosses_v1["orig_files"]
filt_filename_lst = URLS.pf_crosses_v1["filt_files"]

# keep pass only
for origf, passf in zip(local_filename_lst, filt_filename_lst):
print("filter " + origf)
res = run(
f"""
bcftools view -f PASS {origf} -Oz -o {passf}
bcftools index {passf}
""",
shell=True,
capture_output=True,
text=True,
)
assert res.returncode == 0

# merge vcfs and generate .idx file
print("merge filtered vcfs and generate.idx file")
filt_filename_str = " ".join(filt_filename_lst)
known_variants_vcf = URLS.pf_crosses_v1["known_variants_vcf"]
cmd = f"""
bcftools merge --force-samples {filt_filename_str} -Ov -o {known_variants_vcf}
gatk IndexFeatureFile -I {known_variants_vcf}
"""
res = run(cmd, shell=True, capture_output=True, text=True)
if (
res.returncode != 0
or "error" in res.stderr.lower()
or "error" in res.stdout.lower()
):
print(res.stderr)
exit(-1)


if __name__ == "__main__":

print("downloading fasta files")
download_genome_fasta_files()
print("Done obtaining fasta files")

print("indexing will take a long time ....")
index_fasta_files()
print("Done indexing")

print("download pf crosses v1.0 vcf files")
download_pf_crosses_v1_vcfs()
print("Done download pf crosses 1.0")

print("generating known variants vcf from pf crosses 1.0")
known_variants_from_pf_crosses_v1()
print("Done known variants vcf")

1 comment on commit 3fc4855

@bguo068
close #7
and provide an alternative way to address the first task of #3
