forked from myoungblom/RGAPepPipe_MAY
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpilonVCFtoFasta.py
executable file
·71 lines (63 loc) · 2.61 KB
/
pilonVCFtoFasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python3
import sys
import os
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
#####################
# This script takes a variable number of input vcf files. Each vcf file must contain
# snp information about only one strain. At least one input file must be supplied. The
# name of an output file must also be supplied, and output is in snpt able format. The
# output contains the combined information about all strains.
# NOTE : position for indels is the locus before the insertion or deletion
#############
# check for correct number of inputs
if len(sys.argv) < 2 :
print("Usage: VCFToFasta.py <input vcf 1> ... <input vcf n>")
sys.exit(0)
def read_vcf(inFile):
"""Create strings corresponding to chromosome"""
with open(inFile, 'r') as vcf:
for line in vcf:
if ("contig" in line and "length" in line):
line = line.strip()
length = line.split("=")[-1].strip(">")
print(length)
refID = line.split(",")[0].split("=")[-1]
chromosome = ["?"] * int(length)
#elif ("CHROM" in line and "POS" in line and "ID" in line):
# line = line.strip()
# tokens = line.split()
# strain = tokens[9]
elif line[0] != "#":
line = line.strip()
CHROM, POS, ID, REF, ALT, QUAL, FILTER = line.split()[0:7]
FILTERS = FILTER.split(";")
if len(REF) == 1 and len(ALT) == 1:
if "Amb" in FILTERS:
ALLELE = "?"
elif "PASS" in FILTERS:
if ALT == ".":
ALLELE = REF
else:
ALLELE = ALT
elif "Del" in FILTERS:
ALLELE = "-"
else:
if FILTER == "LowCov":
ALLELE = "?"
else:
print(FILTER)
index = int(POS) - 1
chromosome[index] = ALLELE
return(chromosome, refID)
def write_fasta(chromosome, RGID, refID):
"""Writes a RGA fasta alignment for each vcf"""
outFile = RGID + "_RGA_pilon.fasta"
Sample = "pilon_" + RGID
record = SeqRecord(Seq("".join(chromosome)), id=Sample, description = "RGA_to_" + refID)
SeqIO.write(record, outFile, "fasta")
for n in sys.argv[1:(len(sys.argv))] :
RGID = os.path.basename(n)[0:-4]
chromosome, refID = read_vcf(n)
write_fasta(chromosome, RGID, refID)