-
Notifications
You must be signed in to change notification settings - Fork 4
add subsample_reads #138
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add subsample_reads #138
Changes from 8 commits
856e9b4
ab54d4a
6c6ebd6
19e53a3
8be711a
cb6848b
a5b6ae8
c2ee439
98c294f
13ffd2b
94e885d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,11 +4,14 @@ | |
| from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob | ||
| from sequence_processing_pipeline.PipelineError import PipelineError | ||
| from sequence_processing_pipeline.util import determine_orientation | ||
| from os.path import join, split | ||
| from os.path import join, split, basename, dirname | ||
| from re import match | ||
| from os import makedirs, rename, walk | ||
| from metapool import load_sample_sheet | ||
| from metapool.sample_sheet import PROTOCOL_NAME_ILLUMINA, PROTOCOL_NAME_TELLSEQ | ||
| import pandas as pd | ||
| from glob import glob | ||
| from qiita_client.util import system_call | ||
|
|
||
|
|
||
| PROTOCOL_NAME_NONE = "None" | ||
|
|
@@ -22,6 +25,48 @@ class Protocol(): | |
| initialization. | ||
| """ | ||
| protocol_type = PROTOCOL_NAME_NONE | ||
| # this value was selected by looking at all the successful NuQC/SPP jobs, | ||
| # the maximum number of sequences was: 712,497,596 | ||
| MAX_READS = 720000000 | ||
|
|
||
def subsample_reads(self):
    """Subsample every fastq of a sample that exceeds ``self.MAX_READS``.

    Amplicon runs are skipped entirely. For every other assay type the
    per-sample read counts are loaded from ``self.reports_path`` and each
    sample whose count is greater than ``self.MAX_READS`` has all of its
    ``*.fastq.gz`` files (found one directory level below
    ``self.raw_fastq_files_path``) processed as follows:

    1. the original file is renamed from ``*.fastq.gz`` to
       ``*.subsampled.gz`` in the same directory;
    2. ``seqtk sample`` (fixed seed 42) draws ``self.MAX_READS`` reads
       from the renamed file and the gzipped result is written back to
       the original ``*.fastq.gz`` path;
    3. a message describing the subsampling is appended to
       ``self.assay_warnings``.

    Raises
    ------
    ValueError
        If the report has neither a ``raw_reads_r1r2`` nor a ``# Reads``
        column, if the rename fails, or if the seqtk/gzip command returns
        a non-zero status or writes anything to stderr.
    """
    if self.assay_type == 'Amplicon':
        return

    # function-scope import so this method does not rely on the module
    # import block being changed
    from shlex import quote

    df = pd.read_csv(self.reports_path)
    if 'raw_reads_r1r2' in df.columns:
        # this is a TellSeq run: SeqCounts.csv
        read_col = 'raw_reads_r1r2'
        index_col = 'Sample_ID'
    elif '# Reads' in df.columns:
        # this is an Illumina run: Demultiplex_Stats.csv
        read_col = '# Reads'
        index_col = 'SampleID'
    else:
        raise ValueError(
            'Not sure how to check for seq counts to subsample, '
            'please let an admin know.')

    # only the samples above the limit need any work; iterating an empty
    # frame is a no-op so no extra emptiness check is required
    over_limit = df[df[read_col] > self.MAX_READS]
    for _, row in over_limit.iterrows():
        sn = row[index_col]
        # NOTE(review): the pattern matches on the sample-name prefix, so
        # a sample named 'S1' would presumably also pick up the files of
        # 'S10' -- confirm the file naming makes this impossible.
        files = glob(f'{self.raw_fastq_files_path}/*/{sn}*.fastq.gz')
        for f in files:
            dn = dirname(f)
            bn = basename(f)
            nbn = join(dn, bn.replace('fastq.gz', 'subsampled.gz'))

            # keep the full-size file next to the subsampled one; a plain
            # rename needs no shell and therefore no quoting
            try:
                rename(f, nbn)
            except OSError as e:
                raise ValueError(
                    f'Error during mv: mv {f} {nbn}. {e}') from e

            # paths are shell-quoted because the pipeline is handed to a
            # shell as a single string.
            # NOTE(review): the return value of a pipeline is normally
            # that of its last command (gzip); a seqtk failure is
            # presumably only caught through the stderr check -- confirm.
            cmd = (f'seqtk sample -s 42 {quote(nbn)} {self.MAX_READS} '
                   f'| gzip > {quote(f)}')
            _, se, rv = system_call(cmd)
            if rv != 0 or se:
                raise ValueError(
                    f'Error during seqtk sample: {cmd}. {se}')

            self.assay_warnings.append(
                f'{sn} ({bn}) had {row[read_col]} sequences, '
                f'subsampling to {self.MAX_READS}')
|
|
||
|
|
||
| class Illumina(Protocol): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this mean that we will now ALWAYS subsample?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, but really we will always check if subsample is needed and only run it when necessary.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, we will always subsample every fastq that has more than the max number of reads, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Correct.