diff --git a/01_extract_wells.py b/01_extract_wells.py index 17d1caf..02ae480 100644 --- a/01_extract_wells.py +++ b/01_extract_wells.py @@ -53,26 +53,26 @@ # parse fasta file line by line for line in seq: if set(line) <= DNA: - read_number+=1 + read_number+=1 # print out statistics of progress - print '\rAssigned', assigned, '/', read_number, 'TCR: ', tcr_count, 'BCR: ', bcr_count, + print('\rAssigned', assigned, '/', read_number, 'TCR: ', tcr_count, 'BCR: ', bcr_count) line=line.replace('\n','') - # T cell receptor read found - if line[2:7] in TCR_plate_barcodes and line[9:14] in TCR_row_barcodes and line[len(line)-7:-2] in TCR_column_barcodes and \ - not 'N' in line and not 'AAAAAAAAAAAA' in line and not 'GGGGGGGGGGGG' in line and not 'CCCCCCCCCCCC' in line and not 'TTTTTTTTTTTT' in line: - location=str(TCR_plate_barcodes.index(line[2:7])+1) + str(chr(ord('A')+TCR_row_barcodes.index(line[9:14]))) + str(TCR_column_barcodes.index(line[len(line)-7:-2])+1) - if location in t_ass_seqs: - t_ass_seqs[location].append(revcompl(line[14:].replace('\n',''))) - else: - t_ass_seqs[location]=[revcompl(line[14:].replace('\n', ''))] - tcr_count+=1 - assigned+=1 - # sequence not assigned + # T cell receptor read found + if line[2:7] in TCR_plate_barcodes and line[9:14] in TCR_row_barcodes and line[len(line)-7:-2] in TCR_column_barcodes and \ + not 'N' in line and not 'AAAAAAAAAAAA' in line and not 'GGGGGGGGGGGG' in line and not 'CCCCCCCCCCCC' in line and not 'TTTTTTTTTTTT' in line: + location=str(TCR_plate_barcodes.index(line[2:7])+1) + str(chr(ord('A')+TCR_row_barcodes.index(line[9:14]))) + str(TCR_column_barcodes.index(line[len(line)-7:-2])+1) + if location in t_ass_seqs: + t_ass_seqs[location].append(revcompl(line[14:].replace('\n',''))) else: - unassigned+=1 + t_ass_seqs[location]=[revcompl(line[14:].replace('\n', ''))] + tcr_count+=1 + assigned+=1 + # sequence not assigned + else: + unassigned+=1 # statistics of assigned and unassigned sequences -print 'Unassigned: ', unassigned, ' Assigned: ', assigned +print('Unassigned: ', unassigned, ' Assigned: ', assigned) seq.close() # write sequences to files diff --git a/02_generate_IMGT_input_cytokine_output.py b/02_generate_IMGT_input_cytokine_output.py index d579362..e72a7bb 100644 --- a/02_generate_IMGT_input_cytokine_output.py +++ b/02_generate_IMGT_input_cytokine_output.py @@ -21,39 +21,39 @@ # sub-thread for processing one file def parse_file(filename, blast_cytokines): - # if option for blasting each read to identify cytokine reads is set - # generate temporary file without cytokine reads and proceed with it - if blast_cytokines: - tmp_file, cytokine_list = CdrExtraction.FileWithoutCytokines(filename) - possible_TCR_list, empty_cytokine_list = CdrExtraction.ParseWell(tmp_file) - os.unlink(tmp_file) - else: - possible_TCR_list, cytokine_list = CdrExtraction.ParseWell(filename) - + # if option for blasting each read to identify cytokine reads is set + # generate temporary file without cytokine reads and proceed with it + if blast_cytokines: + tmp_file, cytokine_list = CdrExtraction.FileWithoutCytokines(filename) + possible_TCR_list, empty_cytokine_list = CdrExtraction.ParseWell(tmp_file) + os.unlink(tmp_file) + else: + possible_TCR_list, cytokine_list = CdrExtraction.ParseWell(filename) out_imgt.write(CdrExtraction.HighV_QuestInput(filename.split('.')[0], possible_TCR_list)) - out_cytokine.write(CdrExtraction.CytokineOutput(filename.split('.')[0], cytokine_list)) + + out_cytokine.write(CdrExtraction.CytokineOutput(filename.split('.')[0], cytokine_list)) # main thread if __name__ == '__main__': - # parsing arguments - parser = argparse.ArgumentParser(description='Process files containing sequencing reads.') - parser.add_argument('--imgt_input', required=True, help='File that will contain input for IMGT High/V-Quest') - parser.add_argument('--cytokine_output', required=True, help='File that will contain output of cytokine reads') - parser.add_argument('-b','--blast_cytokines', help='Blast each read to identify cytokine reads', action='store_true') - args = parser.parse_args() - - # files to be written - out_imgt = open(args.imgt_input, 'w',0) - out_cytokine = open(args.cytokine_output, 'w',0) - out_cytokine.write ('Well\t' + '\t'.join(CdrExtractionOptions.CYTOKINE_LIST.keys()) + '\n') - - # starting sub-threads - pool = multiprocessing.Pool() - for filename in sorted(glob.glob('*.fasta')): - pool.apply_async(parse_file, args=(filename, args.blast_cytokines)) - - # clean up - pool.close() - pool.join() - out_imgt.close() - out_cytokine.close() + # parsing arguments + parser = argparse.ArgumentParser(description='Process files containing sequencing reads.') + parser.add_argument('--imgt_input', required=True, help='File that will contain input for IMGT High/V-Quest') + parser.add_argument('--cytokine_output', required=True, help='File that will contain output of cytokine reads') + parser.add_argument('-b','--blast_cytokines', help='Blast each read to identify cytokine reads', action='store_true') + args = parser.parse_args() + + # files to be written + out_imgt = open(args.imgt_input, 'w') + out_cytokine = open(args.cytokine_output, 'w') + out_cytokine.write ('Well\t' + '\t'.join(CdrExtractionOptions.CYTOKINE_LIST.keys()) + '\n') + + # starting sub-threads + pool = multiprocessing.Pool() + for filename in sorted(glob.glob('*.fasta')): + pool.apply_async(parse_file, args=(filename, args.blast_cytokines)) + + # clean up + pool.close() + pool.join() + out_imgt.close() + out_cytokine.close() diff --git a/CdrExtraction.py b/CdrExtraction.py index 03f91a1..6ebb818 100644 --- a/CdrExtraction.py +++ b/CdrExtraction.py @@ -13,7 +13,7 @@ from Bio.Blast.Applications import NcbiblastnCommandline from Bio.Blast import NCBIXML from Bio.Seq import Seq -from cStringIO import StringIO +from io import StringIO # extract cytokine from read def CytokineExtraction(SEQ, DB): @@ -42,27 +42,27 @@ def CytokineOutput(wellname, cytokine_list): # generate file without cytokine reads and return temporary file plus cytokine list def FileWithoutCytokines(filename): - # generate temporary file - tmp_file = tempfile.NamedTemporaryFile(delete=False) - cytokine_list = copy.deepcopy(CdrExtractionOptions.CYTOKINE_LIST) - - # read in all reads from sequence file - sequences = ConsensusClusters.ReadSequences(filename) - - # go through all reads - for s in sequences: - # check if read contains cytokine and count it - cytokine = CytokineExtraction(sequences[s], CdrExtractionOptions.PATH_TO_CYTOKINE_DB) - if cytokine != '': - cytokine_list[cytokine]+=1 - # if it does not contain cytokine, write to file - else: - tmp_file.write('>' + s + '\n' + sequences[s] + '\n') + # generate temporary file + tmp_file = tempfile.NamedTemporaryFile(delete=False) + cytokine_list = copy.deepcopy(CdrExtractionOptions.CYTOKINE_LIST) + + # read in all reads from sequence file + sequences = ConsensusClusters.ReadSequences(filename) - # close temporary file - tmp_file.close() - - return tmp_file.name, cytokine_list + # go through all reads + for s in sequences: + # check if read contains cytokine and count it + cytokine = CytokineExtraction(sequences[s], CdrExtractionOptions.PATH_TO_CYTOKINE_DB) + if cytokine != '': + cytokine_list[cytokine]+=1 + # if it does not contain cytokine, write to file + else: + tmp_file.write('>' + s + '\n' + sequences[s] + '\n') + + # close temporary file + tmp_file.close() + + return tmp_file.name, cytokine_list # generate input for IMGT HighV-Quest # >wellname:index:number of reads