-
Notifications
You must be signed in to change notification settings - Fork 0
/
SampleSheet.py
188 lines (158 loc) · 9.34 KB
/
SampleSheet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import pandas
import re
import os
from copy import deepcopy
from scripts.cellranger_indexes import *
"""
Reads an IGO LIMS generated sample sheet .csv and splits the sample sheet if necessary to generate sample sheets ready for
Illumina DRAGEN demuxes with the correct options set for 10X & DLP samples.
"""
class SampleSheet:
    """
    In-memory sample sheet made of a [Header] data frame and a [Data] data frame.

    The constructor is overloaded through *args and should either have 1 argument
    with the path to a sample sheet (which is then read from disk) or three
    arguments: the [Header] data frame, the [Data] data frame and the path.
    """

    def __init__(self, *args):
        """
        Build the sample sheet and derive the lookup attributes from its data.

        Attributes set here: path, df_ss_header, df_ss_data, project_dict,
        sample_dict, project_set, recipe_set, barcode_list, barcode_list_10X
        and read_lengths.
        """
        if len(args) == 1:  # this is the path to the sample sheet
            self.path = args[0]
            self.read_csv(self.path)
        else:
            self.df_ss_header = args[0]
            self.df_ss_data = args[1]
            self.path = args[2]

        # dictionary of project->recipe - NOTE, not accurate for MICHELLE_0485 where 08822_PC has HumanWholeGenome & RNASeq_RiboDeplete
        self.project_dict = pandas.Series(
            self.df_ss_data['Sample_Well'].values,
            index=self.df_ss_data['Sample_Project'],
        ).to_dict()
        # dictionary of sample id->recipe
        self.sample_dict = pandas.Series(
            self.df_ss_data['Sample_Well'].values,
            index=self.df_ss_data['Sample_ID'],
        ).to_dict()
        # Sample_Project column has the projects, convert it to a set
        self.project_set = set(self.df_ss_data['Sample_Project'].tolist())
        # set of all recipes in the sample sheet - Sample_Well column has the recipe
        self.recipe_set = set(self.df_ss_data['Sample_Well'].tolist())

        # for dual barcode sample sheets concat "index" and "index2" columns
        index_list = self.df_ss_data['index'].tolist()
        if 'index2' in self.df_ss_data.columns:  # check if this is a dual-index run
            index2_list = self.df_ss_data['index2'].tolist()
            self.barcode_list = [a + b for a, b in zip(index_list, index2_list)]
        else:
            self.barcode_list = index_list

        # "SI*" was a glob, not a regex: as a regex it matched anything starting
        # with "S". Match the literal "SI" prefix instead.
        barcode_10X = re.compile(r"SI")
        # list of all special "SI-*" 10X barcodes
        self.barcode_list_10X = list(filter(barcode_10X.match, self.barcode_list))

        # list of index read lengths such as [151,151] for a PE run
        # NOTE(review): the read lengths are taken from fixed header rows 9 and 10 -
        # confirm every LIMS sample sheet uses that layout
        self.read_lengths = []
        index_read1 = int(self.df_ss_header.iat[9, 0])
        self.read_lengths.append(index_read1)
        read2_cell = self.df_ss_header.iat[10, 0]
        # an empty cell is read by pandas as NaN, meaning there is no second read
        if not pandas.isna(read2_cell):
            index_read2 = int(read2_cell)
            self.read_lengths.append(index_read2)

    def read_csv(self, path_to_samplesheet):
        """
        Read the sample sheet at path_to_samplesheet into df_ss_header and df_ss_data.

        The file is scanned for the first line containing "[Data]"; the rows before
        and including it become the header data frame, the rows after it the data
        data frame.

        Raises Exception if the "[Data]" scan stops on or before the first line.
        """
        # find row of sample sheet which has the [Data] section
        line_number = 0
        with open(path_to_samplesheet, 'r') as read_obj:
            for line in read_obj:
                line_number += 1
                if "[Data]" in line:
                    break
        if line_number <= 1:
            raise Exception("Sample sheet is blank")
        print("[Data] section of sample sheet detected on line: {}".format(line_number))
        # the first file line is consumed as column names, so nrows=line_number-1
        # reads up to and including the [Data] line
        self.df_ss_header = pandas.read_csv(path_to_samplesheet, nrows=line_number-1)
        # skip the header and read only data rows in this dataframe - the [Data] section
        self.df_ss_data = pandas.read_csv(path_to_samplesheet, skiprows=line_number)

    def write_csv(self):
        """
        Write the sample sheet to self.path, overwriting any existing file.

        A fixed "[Header]" line is written first, then the header rows without
        column names, then the data rows with their column names.
        """
        print("Saving sample sheet to " + self.path)
        # pandas dataframe has blank column headers, make them write correctly to the .csv
        with open(self.path, "w") as csv_file:
            csv_file.write("[Header],,,,,,,,,\n")
        self.df_ss_header.to_csv(self.path, mode='a', index=False, header=False)
        self.df_ss_data.to_csv(self.path, mode='a', index=False)

    def remove_lane_information(self):
        """
        Creates a new [Data] section without 'Lane' information for each sample.

        The 'Lane' column is dropped when present and the rows that become
        identical are collapsed. df_ss_data is modified in place.
        """
        # DRAGEN "--no-lane-splitting" requires a sample sheet without lane information
        print("Removing sample sheet lane information.")
        # rows that differ only by lane can only be de-duplicated once the
        # lane column itself is gone
        if "Lane" in self.df_ss_data.columns:
            self.df_ss_data.drop(columns=["Lane"], inplace=True)
        self.df_ss_data.drop_duplicates(inplace=True)

    def split_sample_sheet(self):
        """
        Returns a list of sample sheets from splitting the original or the original sample sheet
        if no splitting of samples for demux is necessary.

        10X
        if barcodes start with 'SI' like 'SI-NA-C7' take just the barcodes starting with 'SI' and remove index2 called "_10X"
        DLP
        if sample sheet recipes have mixed DLP and other all DLP need to go on a separate sample sheet named "_DLP"

        When a split happens the list is [reference copy of the unsplit sheet
        ("_REFERENCE.csv"), this sheet with the split-off rows removed, then the
        "_DLP" and/or "_10X" sheets]. Without a split the list holds a single sheet.
        This object's header and data frames may be modified by the call.
        """
        # if 10x DRAGEN demux add to header CreateFastqForIndexReads,1,,,,,,,
        if any("SC_Chromium" in s for s in self.recipe_set):
            print("Adding CreateFastqForIndexReads,1 to sample sheet header since 10X samples are present")
            # overwrite the last header row and append a new [Data] row after it
            self.df_ss_header.loc[len(self.df_ss_header.index)-1] = ["CreateFastqForIndexReads",1,"","","","","","",""]
            self.df_ss_header.loc[len(self.df_ss_header.index)] = ["[Data]","","","","","","","",""]

        ss_copy = deepcopy(self)
        # result list starts with the original sample sheet, this assumes we will split
        # and the first entry will be the reference demux sample sheet
        split_ss_list = [ss_copy, self]
        was_split = False
        base_path = os.path.splitext(self.path)[0]

        if "SC_DLP" in self.recipe_set and len(self.recipe_set) > 1:
            print("Copying all DLP samples to a new sample sheet")
            is_dlp = self.df_ss_data["Sample_Well"].str.match("SC_DLP")
            # copy all DLP rows to a new sample sheet
            dlp_data = self.df_ss_data[is_dlp == True].copy()
            # and remove DLP samples from the main sample sheet
            self.df_ss_data = self.df_ss_data[is_dlp == False].copy()
            # rename DLP sample sheet w/"_DLP.csv"
            dlp_path = base_path + '_DLP.csv'
            header_copy = self.df_ss_header.copy(deep=True)
            dlp_ss = SampleSheet(header_copy, dlp_data, dlp_path)
            split_ss_list.append(dlp_ss)
            was_split = True

        # check if sample sheet has 'SI-*' barcodes and normal barcodes
        if len(self.barcode_list_10X) > 0 and len(self.barcode_list) != len(self.barcode_list_10X):
            print("Copying all 10X SI- barcodes to new sheet and remove index2 column")
            print("Non-DRAGEN demux, must have Sample_ID column with Sample_ prefix")
            # NOTE(review): the 10X rows are selected on the "index2" column, so this
            # branch needs a dual-index sheet - confirm that is intended
            is_tenx = self.df_ss_data["index2"].str.match('^SI-.*')
            tenx_data = self.df_ss_data[is_tenx == True].copy()
            rest_data = self.df_ss_data[is_tenx == False].copy()
            self.df_ss_data = rest_data
            tenx_path = base_path + '_10X.csv'
            # if ATAC because read length is 51,50 () for example DIANA_427 must use cellranger-ATAC mkfastq
            tenx_ss = SampleSheet(self.df_ss_header, tenx_data, tenx_path)
            # convert SI barcodes to their real barcodes; a deep copy is passed
            # because the conversion edits the header it is given
            tenx_ss_real_barcodes = convert_SI_barcodes(deepcopy(tenx_ss))
            split_ss_list.append(tenx_ss_real_barcodes)
            was_split = True

        if was_split:
            # Rename the original sample sheet
            split_ss_list[0].path = base_path + '_REFERENCE.csv'
            split_ss_list[1].path = base_path + '.csv'
        else:  # if we did not need to split the sample sheet just return it
            split_ss_list = [ss_copy]

        # if the sample sheet is all 'SI-*' 10x barcodes convert them to real barcodes
        if len(self.barcode_list) == len(self.barcode_list_10X):
            print("Converting all 10X SI- barcodes to real barcodes")
            tenx_real_barcodes = convert_SI_barcodes(self)
            split_ss_list = [tenx_real_barcodes]

        return split_ss_list
def convert_SI_barcodes(samplesheet):
    """
    Convert SI barcodes from a sample sheet to the 10X quad barcodes from cellranger_indexes.py.

    Every data row is repeated once per real barcode of its "SI-" barcode set, with
    the "index" column holding the real barcode; the "index2" column is removed.
    The header of the given sample sheet is edited in place (BarcodeMismatchesIndex1
    set to zero and an OverrideCycles entry written).

    samplesheet: sample sheet whose "I7_Index_ID" column holds "SI-" barcode names.
    Returns a new SampleSheet with the same header object and path and the expanded data.
    Raises KeyError if a barcode name has no matching module-level barcode list.
    """
    print("Converting 10X samplesheet with SI barcodes to their real barcodes")
    source_data = samplesheet.df_ss_data
    # create new data frame for special sample sheet for the quad barcodes
    quad_ss_data = pandas.DataFrame(columns=source_data.columns.values)
    # row_position will make sure we will skip down to the correct rows when creating the new sample sheet rows
    row_position = 0
    for x in range(len(source_data)):
        source_row = source_data.iloc[x]
        # get the quad from the imported variables; names there use "_" instead of "-"
        si_barcode = source_row["I7_Index_ID"].replace("-", "_")
        try:
            quad_list = globals()[si_barcode]  # lookup "SI-" barcode in the global variable list
        except KeyError:
            raise KeyError("Unknown 10X barcode: {}".format(si_barcode)) from None
        # loop thru the quad set of barcodes and use these to replace the SI barcodes
        for real_barcode in quad_list:
            # set the barcode on a copy of the row before inserting it; assigning via
            # quad_ss_data["index"].loc[...] is chained assignment and is not
            # guaranteed to write through to the data frame
            new_row = source_row.copy()
            new_row["index"] = real_barcode
            quad_ss_data.loc[row_position] = new_row
            row_position += 1
    # single-index sheets have no index2 column, so tolerate its absence
    quad_ss_data = quad_ss_data.drop(columns=['index2'], errors='ignore')
    # make sure BarcodeMismatchesIndex1 is set to zero to prevent collisions with other samples in a specific lane
    # drop the row for BarcodeMismatchesIndex2, add the OverrideCycles and its mask to samplesheet.df_ss_header
    # NOTE(review): assumes header rows 13 and 14 are the two BarcodeMismatches rows - confirm against the LIMS layout
    samplesheet.df_ss_header.loc[13, "Unnamed: 1"] = 0
    samplesheet.df_ss_header.loc[14, "[Header]"] = "OverrideCycles"
    samplesheet.df_ss_header.loc[14, "Unnamed: 1"] = "Y29;I8N2;N10;Y89"
    return SampleSheet(samplesheet.df_ss_header, quad_ss_data, samplesheet.path)