-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparamsweep.snk
113 lines (90 loc) · 3.81 KB
/
paramsweep.snk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
#Load Config File
configfile: "paramsweep.yaml"
runs = config['runs']
#For each run we detect the barcodes it contains (individual samples)
barcodes = {}
#It is possible to use a user defined subset here as well
if config['useSubsetOfBarcodes']:
barcodes = config['barcodes']
print("Using the following subset of barcodes: {}".format(barcodes))
else:
for run in runs:
#We use the medaka output here to determine the existing barcodes, this is arbritrary, any other file could be chosen for this purpose
barcodes[run] = glob_wildcards('data/input/'+run+'/barcode{barcode}.medaka.'+config['vcf_suffix']).barcode
### REMOVE LATER (THIS SECTION IS ONLY USED FOR EVALUATION OF THE PIPELINE AND IRRELEVANT FOR PRODUCTIVE USE)
gisaidMapping = {}
gisaidMappingInverse = {}
with open('data/input/mappingRunsGisaid.csv', 'r') as infile:
lines = infile.read().splitlines()
for l in lines:
data = l.split()
run = data[0]
barcode = data[1]
if len(data) != 2: #entry in the table
gisaidID = data[2]
if not run in gisaidMapping:
gisaidMapping[run] = {}
gisaidMapping[run][barcode] = gisaidID
gisaidMappingInverse[gisaidID] = run+'_'+barcode
def getGisaidFile(run,barcode):
if run in gisaidMapping:
if barcode in gisaidMapping[run]:
return gisaidMapping[run][barcode]
return None
for run in runs:
for barcode in barcodes[run]:
gisaidid = getGisaidFile(run,barcode)
if gisaidid:
gisaidpath = gisaidid + ".fasta"
#print(gisaidpath)
if os.path.isfile(os.path.join('data/input/gisaidseqs',gisaidpath)):
continue
else:
#print('No sequence for gisaid id: {} (run: {} bc: {})'.format(gisaidpath,run,barcode))
del gisaidMapping[run][barcode]
else:
#print('No mapping for run: {} bc: {}'.format(run,barcode))
pass
illuminaMapping = {}
for run in runs:
illuminaMapping[run] = set()
for barcode in barcodes[run]:
if os.path.isdir(os.path.join('data/input/illumina',(run+'_'+barcode))):
illuminaMapping[run].add(barcode)
### REMOVE LATER END
# This function assembles all the required output files that serve as a target for the snakemake pipeline
def getInput(wildcards):
inputList = []
for run in runs:
if config['VarAnnotSnpEff']:
inputList += expand('data/auxiliary/pangenome_vc/'+run+'/{barcode}/filter.annoted.vcf', barcode=barcodes[run])
#VCF Based
inputList += ['data/output/evaluation/vcfbased/comparison_pancov.tsv']
inputList += ['data/output/evaluation/vcfbased/comparison_medaka.tsv']
inputList += ['data/output/evaluation/vcfbased/comparison_nanopolish.tsv']
for barcode in barcodes[run]:
if run in illuminaMapping and barcode in illuminaMapping[run]:
pass
else:
#print('skipping run {} barcode {} for illumina comparison as we have no seq yet ...'.format(run,barcode))
continue
inputList += [
'data/auxiliary/illuminaVarCalls/'+run+'_'+barcode+'/ivar.vcf.tsv'
]
return inputList
#Main rule that aggregates all the targets and is used when they are not specified, see function above for output files that are being created
rule all:
input:
getInput
include: 'rules/shared.snk'
include: 'rules/errorCorrection.snk'
include: 'rules/readAnalysis.snk'
include: 'rules/variantAnalysis.snk'
include: 'rules/consensus.snk'
include: 'rules/pangenome.snk'
include: 'rules/discovery.snk'
include: 'rules/pangenome_variant_call.snk'
### REMOVE LATER (THE INCLUDES BELOW ARE ONLY FOR EVALUATION)
include: 'rules/pangenome_eval.snk'
include: 'rules/illumina.snk'