# Snakefile
import os
containerized: "docker://albihhu/pancov:latest"
configfile: "config.yaml"
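# Sketch of the keys this Snakefile reads from config.yaml (key names are taken from the
# lookups below; the example values are assumptions, not the project's actual defaults):
#
#   ref_genome: "reference.fasta"     # reference FASTA under data/input/
#   vcf_suffix: "vcf"                 # suffix of the per-barcode medaka VCF files
#   useSubsetOfRuns: false            # if true, only the runs listed under 'runs' are used
#   runs: []
#   useSubsetOfBarcodes: false        # if true, 'barcodes' maps each run to its barcodes
#   barcodes: {}
#   performQualityControl: true       # emit per-sample QC .html reports
#   VarAnnotSnpEff: true              # annotate variants with SnpEff
#   evaluate: false                   # enable the evaluation-only section below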
# Parse the SARS-CoV-2 reference genome FASTA and extract its sequence name and length
with open('data/input/'+config['ref_genome'],'r') as infile:
    raw = infile.read().splitlines()
    header = raw[0]
    bases = ''.join(raw[1:])
config['ref_genome_chr'] = header[1:]
config['ref_genome_length'] = len(bases)
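# Example (hypothetical header): a FASTA starting with '>MN908947.3' yields
# config['ref_genome_chr'] == 'MN908947.3'; ref_genome_length counts all remaining bases.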
# Detect the runs provided as input; alternatively, a user-defined subset of runs can be used (toggled via useSubsetOfRuns in config.yaml)
runs = config['runs'] if config['useSubsetOfRuns'] else glob_wildcards('data/input/{run}').run
# For each run, detect the barcodes (individual samples) it contains
barcodes = {}
# A user-defined subset can be used here as well (toggled via useSubsetOfBarcodes in config.yaml)
if config['useSubsetOfBarcodes']:
    barcodes = config['barcodes']
    print("Using the following subset of barcodes: {}".format(barcodes))
else:
    for run in runs:
        # The medaka output is used here to detect the existing barcodes; this choice is arbitrary, any other per-barcode file would work
        barcodes[run] = glob_wildcards('data/input/'+run+'/barcode{barcode}.medaka.'+config['vcf_suffix']).barcode
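# Input layout implied by the globs above (run/barcode names are hypothetical, and 'vcf' is
# assumed as vcf_suffix): a file data/input/run01/barcode01.medaka.vcf yields run 'run01'
# with barcode '01'.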
####
#EVALUATION ONLY / NOT REQUIRED FOR PRODUCTIVE USE
####
if config['evaluate']:
    # Map (run, barcode) pairs to GISAID IDs and vice versa
    gisaidMapping = {}
    gisaidMappingInverse = {}
    with open('data/input/mappingRunsGisaid.csv', 'r') as infile:
        lines = infile.read().splitlines()
    for l in lines:
        data = l.split()
        run = data[0]
        barcode = data[1]
        if len(data) != 2: # a third column holds the GISAID ID for this sample
            gisaidID = data[2]
            if run not in gisaidMapping:
                gisaidMapping[run] = {}
            gisaidMapping[run][barcode] = gisaidID
            gisaidMappingInverse[gisaidID] = run+'_'+barcode
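    # Example line in mappingRunsGisaid.csv (whitespace-separated despite the .csv extension;
    # the run, barcode and GISAID ID below are hypothetical):
    #   run01 01 EPI_ISL_1234567
    # Lines with only two columns (no GISAID ID) are skipped by the length check above.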
    # Returns the GISAID ID mapped to (run, barcode), or None if there is no mapping
    def getGisaidFile(run,barcode):
        if run in gisaidMapping:
            if barcode in gisaidMapping[run]:
                return gisaidMapping[run][barcode]
        return None
    # Drop mapping entries for which no GISAID sequence file is present on disk
    for run in runs:
        for barcode in barcodes[run]:
            gisaidid = getGisaidFile(run,barcode)
            if gisaidid:
                gisaidpath = gisaidid + ".fasta"
                #print(gisaidpath)
                if os.path.isfile(os.path.join('data/input/gisaidseqs',gisaidpath)):
                    continue
                else:
                    #print('No sequence for gisaid id: {} (run: {} bc: {})'.format(gisaidpath,run,barcode))
                    del gisaidMapping[run][barcode]
            else:
                #print('No mapping for run: {} bc: {}'.format(run,barcode))
                pass
    illuminaMapping = {}
    for run in runs:
        illuminaMapping[run] = set()
        for barcode in barcodes[run]:
            if os.path.isdir(os.path.join('data/input/illumina',(run+'_'+barcode))):
                illuminaMapping[run].add(barcode)
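    # Matching Illumina data, when present, is expected one directory per sample under
    # data/input/illumina/<run>_<barcode>/ (e.g. data/input/illumina/run01_01/, names hypothetical);
    # only samples with such a directory enter the Illumina-based comparison in getInput() below.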
####
# This function assembles all required output files that serve as targets for the Snakemake pipeline
def getInput(wildcards):
    inputList = []
    for run in runs:
        # Mandatory: for each sample we curate a consensus sequence
        inputList += expand('data/output/consensus/'+run+'/{barcode}/consensus.fasta', barcode=barcodes[run])
        # We also create an IGV session file for interactive data exploration
        inputList += expand('data/output/IgvSessions/' + run + '/{barcode}.igv.xml',barcode=barcodes[run])
        # Optional: for each sample, perform quality control on the reads and output a human-readable .html report
        if config['performQualityControl']:
            inputList += expand('data/output/softClippedSeqs/'+run+'/{barcode}.html',barcode=barcodes[run])
        # Optional: annotate the detected variants of each sample using SnpEff
        if config['VarAnnotSnpEff']:
            inputList += expand('data/auxiliary/pangenome_vc/'+run+'/{barcode}/filter.annoted.vcf', barcode=barcodes[run])
        if config['evaluate']:
            # VCF-based concordance reports
            inputList += ['data/output/evaluation/vcfbased/concordance_pancov.html']
            inputList += ['data/output/evaluation/vcfbased/concordance_medaka.html']
            inputList += ['data/output/evaluation/vcfbased/concordance_nanopolish.html']
            for barcode in barcodes[run]:
                if not (run in illuminaMapping and barcode in illuminaMapping[run]):
                    #print('skipping run {} barcode {} for illumina comparison as we have no seq yet ...'.format(run,barcode))
                    continue
                inputList += [
                    'data/auxiliary/illuminaVarCalls/'+run+'_'+barcode+'/ivar.vcf.tsv'
                ]
    return inputList
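# For illustration, with a hypothetical run 'run01' containing barcodes ['01', '02'], the
# mandatory targets above expand to:
#   data/output/consensus/run01/01/consensus.fasta
#   data/output/consensus/run01/02/consensus.fasta
#   data/output/IgvSessions/run01/01.igv.xml
#   data/output/IgvSessions/run01/02.igv.xml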
# Default rule that aggregates all targets when none are specified on the command line; see getInput() above for the files being created
rule all:
    input:
        getInput
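# Typical invocation (a sketch; exact flags depend on the local Snakemake setup, e.g.
# container support is needed for the containerized directive above):
#   snakemake --cores 16 --use-singularity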
include: 'rules/shared.snk'
include: 'rules/readAnalysis.snk'
include: 'rules/variantAnalysis.snk'
include: 'rules/consensus.snk'
include: 'rules/pangenome.snk'
include: 'rules/pangenome_variant_call.snk'
include: 'rules/monogenomic.snk'
if config['evaluate']:
    ### EVALUATION ONLY
    include: 'rules/pangenome_eval.snk'
    include: 'rules/illumina.snk'