Skip to content

Commit 63faccb

Browse files
fixes issue encountered during testing.
1 parent 2d66c98 commit 63faccb

File tree

2 files changed

+82
-3
lines changed

2 files changed

+82
-3
lines changed

qp_klp/Workflows.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -584,6 +584,40 @@ def get_samples_in_qiita(cls, qclient, qiita_id):
584584

585585
return (samples, tids)
586586

587+
@classmethod
def _determine_orientation(cls, file_name):
    """Return the read orientation encoded in a fastq file name.

    The orientation is one of 'R1', 'R2', 'I1' or 'I2' (aka forward,
    reverse, and indexed reads). A token is only recognized when it is
    wrapped in a matching pair of delimiters, either underscores or
    dots, e.g. '_R1_' or '.I2.'.

    Users can and will embed any or all of the four tokens elsewhere in
    their file names, e.g.:
        ABC_7_04_1776_R1_SRE_S3_L007_R2_001.trimmed.fastq.gz
    so the token that begins at the right-most position in the name is
    the one reported.

    :param file_name: The name of the file to inspect.
    :return: 'R1', 'R2', 'I1' or 'I2', or None if no delimited token is
        present in file_name.
    """
    tokens = ('R1', 'R2', 'I1', 'I2')
    delimiters = ('_', '.')

    # str.rfind() searches from the end of the string and returns the
    # index at which the right-most occurrence of the substring begins,
    # or -1 when the substring is absent. Pairing each index with its
    # token lets the largest tuple identify the right-most token.
    hits = [(file_name.rfind(f"{delim}{token}{delim}"), token)
            for token in tokens
            for delim in delimiters]

    best_pos, best_token = max(hits)

    # a best position of -1 means none of the variations were found.
    if best_pos == -1:
        return None

    return best_token
620+
587621
def _get_postqc_fastq_files(self, out_dir, project):
588622
af = None
589623
sub_folders = ['amplicon', 'filtered_sequences', 'trimmed_sequences']
@@ -599,11 +633,13 @@ def _get_postqc_fastq_files(self, out_dir, project):
599633
'raw_reverse_seqs': []}
600634

601635
for fastq_file in af:
602-
if '_I1_' in fastq_file or '_I2_' in fastq_file:
636+
_, file_name = split(fastq_file)
637+
orientation = self._determine_orientation(file_name)
638+
if orientation in ['I1', 'I2']:
603639
files['raw_barcodes'].append(fastq_file)
604-
elif '_R1_' in fastq_file:
640+
elif orientation == 'R1':
605641
files['raw_forward_seqs'].append(fastq_file)
606-
elif '_R2_' in fastq_file:
642+
elif orientation == 'R2':
607643
files['raw_reverse_seqs'].append(fastq_file)
608644
else:
609645
raise ValueError(f"Unrecognized file: {fastq_file}")

qp_klp/tests/test_workflows.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from os import environ, remove, getcwd
1313
import re
1414
from qp_klp.WorkflowFactory import WorkflowFactory
15+
from qp_klp.Workflows import Workflow
1516
from metapool import load_sample_sheet
1617
from collections import defaultdict
1718
from random import randint
@@ -890,3 +891,45 @@ def open_job_script(script_path):
890891
exp = open_job_script("qp_klp/tests/data/tellread_test.sbatch")
891892

892893
self.assertEqual(obs, exp)
894+
895+
def test_foo(self):
    """Exercise Workflow._determine_orientation() on tricky file names.

    Each entry pairs a file name with the orientation expected from its
    right-most delimited token ('_R1_', '.I2.', ...), including names
    that carry extra orientation-like tokens earlier in the string.
    Every case runs in its own subTest so that one failing file name
    does not mask the results of the others.
    """
    test_names = [
        # single additional occurrence: R1
        ("ABC_7_04_1776_R1_SRE_S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_R1_SRE_S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_R1_SRE_S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_R1_SRE_S3_L007_I2_001.trimmed.fastq.gz", "I2"),

        # test w/dots.
        ("ABC_7_04_1776.R1.SRE_S3_L007.R1.001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776.R1.SRE_S3_L007.R2.001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776.R1.SRE_S3_L007.I1.001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776.R1.SRE_S3_L007.I2.001.trimmed.fastq.gz", "I2"),

        # single additional occurrence: R2
        ("ABC_7_04_1776_R2_SRE_S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_R2_SRE_S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_R2_SRE_S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_R2_SRE_S3_L007_I2_001.trimmed.fastq.gz", "I2"),

        # single additional occurrence: In
        ("ABC_7_04_1776_I2_SRE_S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_I1_SRE_S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_I2_SRE_S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_I1_SRE_S3_L007_I2_001.trimmed.fastq.gz", "I2"),

        # no additional occurrences
        ("ABC_7_04_1776_SRE_S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_SRE_S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_SRE_S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_SRE_S3_L007_I2_001.trimmed.fastq.gz", "I2"),

        # two additional occurrences
        ("ABC_7_04_1776_I2_SRE.R1.S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_I1_SRE.R1.S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_I2_SRE.R1.S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_I1_SRE.R1.S3_L007_I2_001.trimmed.fastq.gz", "I2"),
    ]

    for file_name, exp in test_names:
        with self.subTest(file_name=file_name):
            self.assertEqual(Workflow._determine_orientation(file_name),
                             exp)

    # names lacking a delimited orientation token must yield None so
    # that callers can reject the file as unrecognized.
    unmatched_names = [
        # no orientation token at all
        "ABC_7_04_1776_SRE_S3_L007_001.trimmed.fastq.gz",
        # token present but not preceded by a delimiter
        "ABCR1_001.trimmed.fastq.gz",
        # empty name
        "",
    ]

    for file_name in unmatched_names:
        with self.subTest(file_name=file_name):
            self.assertIsNone(Workflow._determine_orientation(file_name))

0 commit comments

Comments
 (0)