diff --git a/scripts/back_populate_library_type.py b/scripts/back_populate_library_type.py index 5754ed6d2..9458347a8 100644 --- a/scripts/back_populate_library_type.py +++ b/scripts/back_populate_library_type.py @@ -44,8 +44,8 @@ # VCGS fastq # This is the current name format vcgs_fastq_regex = ( - r'(?P\d{6})_(?P[A-Z\d]+)_(?P\d{4})_' - r'(?P[A-Z]{2}\d+)_(?P[\w\d-]+)_(?P[A-Z\d-]+)_' + r'(?P\d{6,8})_(?P[A-Z\d]+)_(?P\d{4})_' + r'(?P[A-Z]{2}\d+)_(?P[\w\d-]+)_(?P[A-Z\d-]+\D?)_' r'(?P[\w\d]+)_(?PL\d+)_(?PR[12])\.fastq\.gz' ) # Pre mid 2018 the library id was not included: @@ -115,6 +115,12 @@ def check_assay_meta_fields(assays: list[dict], update_sequencing_groups: bool): 'design_description' ) + # Some fastq files are not named in the standard format, but contain the word 'TWIST' + # which we can use to label them with the generic 'TWIST' library type from VCGS + elif 'TWIST' in fastq_filename: + assay_meta_fields_to_update['facility'] = 'vcgs' + assay_meta_fields_to_update['library_type'] = 'TWIST' + else: logging.warning( f'No file name match found for assay {assay_id}. Skipping {fastq_filename}.'