From c645872a46a833833310dab4ab6d604f03a2f6eb Mon Sep 17 00:00:00 2001 From: Emma Bishop Date: Tue, 16 Aug 2022 14:05:23 -0700 Subject: [PATCH 1/5] Use Python3 print syntax --- 01_extract_wells.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/01_extract_wells.py b/01_extract_wells.py index 17d1caf..19fb4fa 100644 --- a/01_extract_wells.py +++ b/01_extract_wells.py @@ -55,8 +55,8 @@ if set(line) <= DNA: read_number+=1 # print out statistics of progress - print '\rAssigned', assigned, '/', read_number, 'TCR: ', tcr_count, 'BCR: ', bcr_count, - line=line.replace('\n','') + print('\rAssigned', assigned, '/', read_number, 'TCR: ', tcr_count, 'BCR: ', bcr_count, + line=line.replace('\n','')) # T cell receptor read found if line[2:7] in TCR_plate_barcodes and line[9:14] in TCR_row_barcodes and line[len(line)-7:-2] in TCR_column_barcodes and \ not 'N' in line and not 'AAAAAAAAAAAA' in line and not 'GGGGGGGGGGGG' in line and not 'CCCCCCCCCCCC' in line and not 'TTTTTTTTTTTT' in line: @@ -72,7 +72,7 @@ unassigned+=1 # statistics of assigned and unassigned sequences -print 'Unassigned: ', unassigned, ' Assigned: ', assigned +print('Unassigned: ', unassigned, ' Assigned: ', assigned) seq.close() # write sequences to files From fc4731c6388532b547cf810c249999dc551d7c11 Mon Sep 17 00:00:00 2001 From: Emma Bishop Date: Tue, 16 Aug 2022 14:06:06 -0700 Subject: [PATCH 2/5] Fix indentation Fixes errors from improper indentation (such as after if statements) and inconsistent use of tabs and spaces (I used spaces). 
--- 01_extract_wells.py | 24 +++++----- 02_generate_IMGT_input_cytokine_output.py | 58 +++++++++++------------ CdrExtraction.py | 40 ++++++++-------- 3 files changed, 61 insertions(+), 61 deletions(-) diff --git a/01_extract_wells.py b/01_extract_wells.py index 19fb4fa..03f0fcf 100644 --- a/01_extract_wells.py +++ b/01_extract_wells.py @@ -53,23 +53,23 @@ # parse fasta file line by line for line in seq: if set(line) <= DNA: - read_number+=1 + read_number+=1 # print out statistics of progress print('\rAssigned', assigned, '/', read_number, 'TCR: ', tcr_count, 'BCR: ', bcr_count, line=line.replace('\n','')) # T cell receptor read found - if line[2:7] in TCR_plate_barcodes and line[9:14] in TCR_row_barcodes and line[len(line)-7:-2] in TCR_column_barcodes and \ - not 'N' in line and not 'AAAAAAAAAAAA' in line and not 'GGGGGGGGGGGG' in line and not 'CCCCCCCCCCCC' in line and not 'TTTTTTTTTTTT' in line: - location=str(TCR_plate_barcodes.index(line[2:7])+1) + str(chr(ord('A')+TCR_row_barcodes.index(line[9:14]))) + str(TCR_column_barcodes.index(line[len(line)-7:-2])+1) - if location in t_ass_seqs: - t_ass_seqs[location].append(revcompl(line[14:].replace('\n',''))) - else: - t_ass_seqs[location]=[revcompl(line[14:].replace('\n', ''))] - tcr_count+=1 - assigned+=1 - # sequence not assigned + if line[2:7] in TCR_plate_barcodes and line[9:14] in TCR_row_barcodes and line[len(line)-7:-2] in TCR_column_barcodes and \ + not 'N' in line and not 'AAAAAAAAAAAA' in line and not 'GGGGGGGGGGGG' in line and not 'CCCCCCCCCCCC' in line and not 'TTTTTTTTTTTT' in line: + location=str(TCR_plate_barcodes.index(line[2:7])+1) + str(chr(ord('A')+TCR_row_barcodes.index(line[9:14]))) + str(TCR_column_barcodes.index(line[len(line)-7:-2])+1) + if location in t_ass_seqs: + t_ass_seqs[location].append(revcompl(line[14:].replace('\n',''))) else: - unassigned+=1 + t_ass_seqs[location]=[revcompl(line[14:].replace('\n', ''))] + tcr_count+=1 + assigned+=1 + # sequence not assigned + else: + unassigned+=1 # 
statistics of assigned and unassigned sequences print('Unassigned: ', unassigned, ' Assigned: ', assigned) diff --git a/02_generate_IMGT_input_cytokine_output.py b/02_generate_IMGT_input_cytokine_output.py index d579362..0f7daa8 100644 --- a/02_generate_IMGT_input_cytokine_output.py +++ b/02_generate_IMGT_input_cytokine_output.py @@ -21,39 +21,39 @@ # sub-thread for processing one file def parse_file(filename, blast_cytokines): - # if option for blasting each read to identify cytokine reads is set - # generate temporary file without cytokine reads and proceed with it - if blast_cytokines: - tmp_file, cytokine_list = CdrExtraction.FileWithoutCytokines(filename) - possible_TCR_list, empty_cytokine_list = CdrExtraction.ParseWell(tmp_file) - os.unlink(tmp_file) - else: - possible_TCR_list, cytokine_list = CdrExtraction.ParseWell(filename) + # if option for blasting each read to identify cytokine reads is set + # generate temporary file without cytokine reads and proceed with it + if blast_cytokines: + tmp_file, cytokine_list = CdrExtraction.FileWithoutCytokines(filename) + possible_TCR_list, empty_cytokine_list = CdrExtraction.ParseWell(tmp_file) + os.unlink(tmp_file) + else: + possible_TCR_list, cytokine_list = CdrExtraction.ParseWell(filename) out_imgt.write(CdrExtraction.HighV_QuestInput(filename.split('.')[0], possible_TCR_list)) - out_cytokine.write(CdrExtraction.CytokineOutput(filename.split('.')[0], cytokine_list)) + out_cytokine.write(CdrExtraction.CytokineOutput(filename.split('.')[0], cytokine_list)) # main thread if __name__ == '__main__': - # parsing arguments - parser = argparse.ArgumentParser(description='Process files containing sequencing reads.') - parser.add_argument('--imgt_input', required=True, help='File that will contain input for IMGT High/V-Quest') - parser.add_argument('--cytokine_output', required=True, help='File that will contain output of cytokine reads') - parser.add_argument('-b','--blast_cytokines', help='Blast each read to identify 
cytokine reads', action='store_true') - args = parser.parse_args() - - # files to be written - out_imgt = open(args.imgt_input, 'w',0) - out_cytokine = open(args.cytokine_output, 'w',0) - out_cytokine.write ('Well\t' + '\t'.join(CdrExtractionOptions.CYTOKINE_LIST.keys()) + '\n') - - # starting sub-threads - pool = multiprocessing.Pool() - for filename in sorted(glob.glob('*.fasta')): + # parsing arguments + parser = argparse.ArgumentParser(description='Process files containing sequencing reads.') + parser.add_argument('--imgt_input', required=True, help='File that will contain input for IMGT High/V-Quest') + parser.add_argument('--cytokine_output', required=True, help='File that will contain output of cytokine reads') + parser.add_argument('-b','--blast_cytokines', help='Blast each read to identify cytokine reads', action='store_true') + args = parser.parse_args() + + # files to be written + out_imgt = open(args.imgt_input, 'w',0) + out_cytokine = open(args.cytokine_output, 'w',0) + out_cytokine.write ('Well\t' + '\t'.join(CdrExtractionOptions.CYTOKINE_LIST.keys()) + '\n') + + # starting sub-threads + pool = multiprocessing.Pool() + for filename in sorted(glob.glob('*.fasta')): pool.apply_async(parse_file, args=(filename, args.blast_cytokines)) - # clean up - pool.close() - pool.join() - out_imgt.close() - out_cytokine.close() + # clean up + pool.close() + pool.join() + out_imgt.close() + out_cytokine.close() diff --git a/CdrExtraction.py b/CdrExtraction.py index 03f91a1..9f9c619 100644 --- a/CdrExtraction.py +++ b/CdrExtraction.py @@ -42,27 +42,27 @@ def CytokineOutput(wellname, cytokine_list): # generate file without cytokine reads and return temporary file plus cytokine list def FileWithoutCytokines(filename): - # generate temporary file - tmp_file = tempfile.NamedTemporaryFile(delete=False) - cytokine_list = copy.deepcopy(CdrExtractionOptions.CYTOKINE_LIST) - - # read in all reads from sequence file - sequences = ConsensusClusters.ReadSequences(filename) - - # 
go through all reads - for s in sequences: - # check if read contains cytokine and count it - cytokine = CytokineExtraction(sequences[s], CdrExtractionOptions.PATH_TO_CYTOKINE_DB) - if cytokine != '': - cytokine_list[cytokine]+=1 - # if it does not contain cytokine, write to file - else: - tmp_file.write('>' + s + '\n' + sequences[s] + '\n') + # generate temporary file + tmp_file = tempfile.NamedTemporaryFile(delete=False) + cytokine_list = copy.deepcopy(CdrExtractionOptions.CYTOKINE_LIST) + + # read in all reads from sequence file + sequences = ConsensusClusters.ReadSequences(filename) - # close temporary file - tmp_file.close() - - return tmp_file.name, cytokine_list + # go through all reads + for s in sequences: + # check if read contains cytokine and count it + cytokine = CytokineExtraction(sequences[s], CdrExtractionOptions.PATH_TO_CYTOKINE_DB) + if cytokine != '': + cytokine_list[cytokine]+=1 + # if it does not contain cytokine, write to file + else: + tmp_file.write('>' + s + '\n' + sequences[s] + '\n') + + # close temporary file + tmp_file.close() + + return tmp_file.name, cytokine_list # generate input for IMGT HighV-Quest # >wellname:index:number of reads From af01615d3b7888bdb0db6d75bc181b8454f276d4 Mon Sep 17 00:00:00 2001 From: Emma Bishop Date: Wed, 17 Aug 2022 14:56:39 -0700 Subject: [PATCH 3/5] Fix print statement I accidentally included the following line in the print statement when adding parentheses. 
--- 01_extract_wells.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/01_extract_wells.py b/01_extract_wells.py index 03f0fcf..89862f5 100644 --- a/01_extract_wells.py +++ b/01_extract_wells.py @@ -55,8 +55,8 @@ if set(line) <= DNA: read_number+=1 # print out statistics of progress - print('\rAssigned', assigned, '/', read_number, 'TCR: ', tcr_count, 'BCR: ', bcr_count, - line=line.replace('\n','')) + print('\rAssigned', assigned, '/', read_number, 'TCR: ', tcr_count, 'BCR: ', bcr_count) + line=line.replace('\n','') # T cell receptor read found if line[2:7] in TCR_plate_barcodes and line[9:14] in TCR_row_barcodes and line[len(line)-7:-2] in TCR_column_barcodes and \ not 'N' in line and not 'AAAAAAAAAAAA' in line and not 'GGGGGGGGGGGG' in line and not 'CCCCCCCCCCCC' in line and not 'TTTTTTTTTTTT' in line: From 77b7ab84d6e88b829da023ffc8fe265aea1dbd4e Mon Sep 17 00:00:00 2001 From: Emma Bishop Date: Wed, 17 Aug 2022 14:58:57 -0700 Subject: [PATCH 4/5] Tweak spacing a little more --- 01_extract_wells.py | 2 +- 02_generate_IMGT_input_cytokine_output.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/01_extract_wells.py b/01_extract_wells.py index 89862f5..02ae480 100644 --- a/01_extract_wells.py +++ b/01_extract_wells.py @@ -57,7 +57,7 @@ # print out statistics of progress print('\rAssigned', assigned, '/', read_number, 'TCR: ', tcr_count, 'BCR: ', bcr_count) line=line.replace('\n','') - # T cell receptor read found + # T cell receptor read found if line[2:7] in TCR_plate_barcodes and line[9:14] in TCR_row_barcodes and line[len(line)-7:-2] in TCR_column_barcodes and \ not 'N' in line and not 'AAAAAAAAAAAA' in line and not 'GGGGGGGGGGGG' in line and not 'CCCCCCCCCCCC' in line and not 'TTTTTTTTTTTT' in line: location=str(TCR_plate_barcodes.index(line[2:7])+1) + str(chr(ord('A')+TCR_row_barcodes.index(line[9:14]))) + str(TCR_column_barcodes.index(line[len(line)-7:-2])+1) diff --git a/02_generate_IMGT_input_cytokine_output.py 
b/02_generate_IMGT_input_cytokine_output.py index 0f7daa8..9d1cbdb 100644 --- a/02_generate_IMGT_input_cytokine_output.py +++ b/02_generate_IMGT_input_cytokine_output.py @@ -29,8 +29,8 @@ def parse_file(filename, blast_cytokines): os.unlink(tmp_file) else: possible_TCR_list, cytokine_list = CdrExtraction.ParseWell(filename) - out_imgt.write(CdrExtraction.HighV_QuestInput(filename.split('.')[0], possible_TCR_list)) + out_cytokine.write(CdrExtraction.CytokineOutput(filename.split('.')[0], cytokine_list)) # main thread @@ -50,7 +50,7 @@ def parse_file(filename, blast_cytokines): # starting sub-threads pool = multiprocessing.Pool() for filename in sorted(glob.glob('*.fasta')): - pool.apply_async(parse_file, args=(filename, args.blast_cytokines)) + pool.apply_async(parse_file, args=(filename, args.blast_cytokines)) # clean up pool.close() From 29d5c6d0deccc31c92ad3902f04659f48a8db90a Mon Sep 17 00:00:00 2001 From: Emma Bishop Date: Wed, 17 Aug 2022 17:12:16 -0700 Subject: [PATCH 5/5] Fix StringIO import for Python3 and buffering Received an error when buffer was set to 0/off. 
--- 02_generate_IMGT_input_cytokine_output.py | 4 ++-- CdrExtraction.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/02_generate_IMGT_input_cytokine_output.py b/02_generate_IMGT_input_cytokine_output.py index 9d1cbdb..e72a7bb 100644 --- a/02_generate_IMGT_input_cytokine_output.py +++ b/02_generate_IMGT_input_cytokine_output.py @@ -43,8 +43,8 @@ def parse_file(filename, blast_cytokines): args = parser.parse_args() # files to be written - out_imgt = open(args.imgt_input, 'w',0) - out_cytokine = open(args.cytokine_output, 'w',0) + out_imgt = open(args.imgt_input, 'w') + out_cytokine = open(args.cytokine_output, 'w') out_cytokine.write ('Well\t' + '\t'.join(CdrExtractionOptions.CYTOKINE_LIST.keys()) + '\n') # starting sub-threads diff --git a/CdrExtraction.py b/CdrExtraction.py index 9f9c619..6ebb818 100644 --- a/CdrExtraction.py +++ b/CdrExtraction.py @@ -13,7 +13,7 @@ from Bio.Blast.Applications import NcbiblastnCommandline from Bio.Blast import NCBIXML from Bio.Seq import Seq -from cStringIO import StringIO +from io import StringIO # extract cytokine from read def CytokineExtraction(SEQ, DB):