|
1 | | -from sequence_processing_pipeline.ConvertJob import ConvertJob |
| 1 | +from sequence_processing_pipeline.ConvertJob import ( |
| 2 | + ConvertJob, ConvertPacBioBam2FastqJob) |
2 | 3 | from sequence_processing_pipeline.TellReadJob import TellReadJob |
3 | 4 | from sequence_processing_pipeline.SeqCountsJob import SeqCountsJob |
4 | 5 | from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob |
5 | 6 | from sequence_processing_pipeline.PipelineError import PipelineError |
6 | 7 | from sequence_processing_pipeline.util import determine_orientation |
7 | | -from os.path import join, split, basename, dirname |
8 | 8 | from re import match |
9 | 9 | from os import makedirs, rename, walk |
| 10 | +from os.path import join, split, basename, dirname, exists |
10 | 11 | from metapool import load_sample_sheet |
11 | | -from metapool.sample_sheet import PROTOCOL_NAME_ILLUMINA, PROTOCOL_NAME_TELLSEQ |
| 12 | +from metapool.sample_sheet import ( |
| 13 | + PROTOCOL_NAME_ILLUMINA, PROTOCOL_NAME_TELLSEQ, |
| 14 | + PROTOCOL_NAME_PACBIO_SMRT) |
12 | 15 | import pandas as pd |
13 | 16 | from glob import glob |
14 | 17 | from qiita_client.util import system_call |
@@ -79,6 +82,26 @@ def subsample_reads(self): |
79 | 82 |
|
80 | 83 | class Illumina(Protocol): |
81 | 84 | protocol_type = PROTOCOL_NAME_ILLUMINA |
| 85 | + # required files for successful operation for Illumina (making the default |
| 86 | + # here) both RTAComplete.txt and RunInfo.xml should reside in the root of |
| 87 | + # the run directory. |
| 88 | + required_files = ['RTAComplete.txt', 'RunInfo.xml'] |
| 89 | + read_length = 'short' |
| 90 | + |
| 91 | + def __init__(self) -> None: |
| 92 | + super().__init__() |
| 93 | + |
| 94 | + for some_file in self.required_files: |
| 95 | + if not exists(join(self.run_dir, some_file)): |
| 96 | + raise PipelineError(f"required file '{some_file}' is not " |
| 97 | + f"present in {self.run_dir}.") |
| 98 | + |
| 99 | + # verify that RunInfo.xml file is readable. |
| 100 | + try: |
| 101 | + fp = open(join(self.run_dir, 'RunInfo.xml')) |
| 102 | + fp.close() |
| 103 | + except PermissionError: |
| 104 | + raise PipelineError('RunInfo.xml is present, but not readable') |
82 | 105 |
|
83 | 106 | def convert_raw_to_fastq(self): |
84 | 107 | def get_config(command): |
@@ -156,6 +179,7 @@ def generate_sequence_counts(self): |
156 | 179 |
|
157 | 180 | class TellSeq(Protocol): |
158 | 181 | protocol_type = PROTOCOL_NAME_TELLSEQ |
| 182 | + read_length = 'short' |
159 | 183 |
|
160 | 184 | def convert_raw_to_fastq(self): |
161 | 185 | config = self.pipeline.get_software_configuration('tell-seq') |
@@ -369,3 +393,75 @@ def _post_process_file(self, fastq_file, mapping, lane): |
369 | 393 | rename(fastq_file, final_path) |
370 | 394 |
|
371 | 395 | return final_path |
| 396 | + |
| 397 | + |
class PacBio(Protocol):
    """Protocol implementation for PacBio SMRT long-read runs."""

    protocol_type = PROTOCOL_NAME_PACBIO_SMRT
    # long-read platform, unlike the short-read Illumina/TellSeq protocols.
    read_length = 'long'

    def convert_raw_to_fastq(self):
        """Convert raw PacBio BAM files into fastq files.

        Launches a ConvertPacBioBam2FastqJob (unless a prior run already
        completed it) and audits the output.

        Returns
        -------
        list
            Sample IDs that failed to convert.
        """
        sw_config = self.pipeline.get_software_configuration('pacbio_convert')

        job = ConvertPacBioBam2FastqJob(
            self.pipeline.run_dir,
            self.pipeline.output_path,
            self.pipeline.input_file_path,
            sw_config['queue'],
            sw_config['nodes'],
            sw_config['nprocs'],
            sw_config['wallclock_time_in_minutes'],
            sw_config['per_process_memory_limit'],
            sw_config['executable_path'],
            sw_config['modules_to_load'],
            self.master_qiita_job_id)

        self.raw_fastq_files_path = join(self.pipeline.output_path,
                                         'ConvertJob')

        # When a previous pipeline run already finished ConvertJob, skip
        # the expensive conversion; the member variables set above are
        # still needed either way.
        if 'ConvertJob' not in self.skip_steps:
            job.run(callback=self.job_callback)

        # Determine which samples did not convert cleanly; record them in
        # the failed-samples report (when one exists) and hand the list
        # back to the caller.
        failed = job.audit(self.pipeline.get_sample_ids())
        if hasattr(self, 'fsr'):
            # NB 16S does not require a failed samples report and
            # it is not performed by SPP.
            self.fsr.write(failed, job.__class__.__name__)

        return failed

    def generate_sequence_counts(self):
        """Collate per-sample read counts into a single SeqCounts.csv.

        Other protocols must count sequences here; for PacBio the counting
        already happened during conversion, so this only merges the
        per-sample ``*.counts.txt`` files found next to each fastq.

        Raises
        ------
        ValueError
            If any fastq file lacks a matching counts file.
        """
        count_suffix = f'_S000_L00{self.lane_number}_R1_001.counts.txt'
        rows, absent = [], []

        for fastq_fp in glob(f'{self.raw_fastq_files_path}/*/*.fastq.gz'):
            counts_fp = fastq_fp.replace('.fastq.gz', '.counts.txt')
            # strip the Illumina-style filename suffix to recover the
            # sample name.
            sample = basename(counts_fp).replace(count_suffix, '')
            if not exists(counts_fp):
                absent.append(sample)
                continue
            with open(counts_fp, 'r') as fh:
                n_reads = fh.read().strip()
            rows.append({'Sample_ID': sample,
                         'raw_reads_r1r2': n_reads,
                         'Lane': self.lane_number})

        if absent:
            raise ValueError(f'Missing count files: {absent}')

        df = pd.DataFrame(rows)
        self.reports_path = join(self.pipeline.output_path,
                                 'ConvertJob',
                                 'SeqCounts.csv')
        df.to_csv(self.reports_path, index=False)

    def integrate_results(self):
        # No integration step is needed for PacBio runs.
        pass
0 commit comments