diff --git a/api/routes/billing.py b/api/routes/billing.py
index be1fb46ff..7d93a599c 100644
--- a/api/routes/billing.py
+++ b/api/routes/billing.py
@@ -7,11 +7,11 @@
 from api.settings import BILLING_CACHE_RESPONSE_TTL, BQ_AGGREG_VIEW
 from api.utils.db import BqConnection, get_author
 from db.python.layers.billing import BillingLayer
+from models.enums import BillingSource
 from models.models import (
     BillingColumn,
     BillingCostBudgetRecord,
     BillingHailBatchCostRecord,
-    BillingSource,
     BillingTotalCostQueryModel,
     BillingTotalCostRecord,
 )
diff --git a/api/server.py b/api/server.py
index 1e7a482dd..652aa4322 100644
--- a/api/server.py
+++ b/api/server.py
@@ -140,11 +140,11 @@ async def exception_handler(request: Request, e: Exception):
     cors_middleware = middlewares[0]
     request_origin = request.headers.get('origin', '')
 
-    if cors_middleware and '*' in cors_middleware.options['allow_origins']:  # type: ignore[attr-defined]
+    if cors_middleware and '*' in cors_middleware.options['allow_origins']:  # type: ignore
         response.headers['Access-Control-Allow-Origin'] = '*'
     elif (
         cors_middleware
-        and request_origin in cors_middleware.options['allow_origins']  # type: ignore[attr-defined]
+        and request_origin in cors_middleware.options['allow_origins']  # type: ignore
     ):
         response.headers['Access-Control-Allow-Origin'] = request_origin
diff --git a/db/python/layers/billing.py b/db/python/layers/billing.py
index 8f991c728..737be6857 100644
--- a/db/python/layers/billing.py
+++ b/db/python/layers/billing.py
@@ -4,13 +4,11 @@
 from db.python.tables.bq.billing_daily_extended import BillingDailyExtendedTable
 from db.python.tables.bq.billing_gcp_daily import BillingGcpDailyTable
 from db.python.tables.bq.billing_raw import BillingRawTable
+from models.enums import BillingSource, BillingTimeColumn, BillingTimePeriods
 from models.models import (
     BillingColumn,
     BillingCostBudgetRecord,
     BillingHailBatchCostRecord,
-    BillingSource,
-    BillingTimeColumn,
-    BillingTimePeriods,
     BillingTotalCostQueryModel,
 )
diff --git a/db/python/tables/bq/billing_base.py b/db/python/tables/bq/billing_base.py
index 12e56f02b..335603c3b 100644
--- a/db/python/tables/bq/billing_base.py
+++ b/db/python/tables/bq/billing_base.py
@@ -12,10 +12,11 @@
 from db.python.tables.bq.billing_filter import BillingFilter
 from db.python.tables.bq.function_bq_filter import FunctionBQFilter
 from db.python.tables.bq.generic_bq_filter import GenericBQFilter
+from models.enums import BillingTimeColumn, BillingTimePeriods
 from models.models import (
     BillingColumn,
     BillingCostBudgetRecord,
-    BillingTimePeriods,
+    BillingCostDetailsRecord,
     BillingTotalCostQueryModel,
 )
@@ -29,46 +30,60 @@
     'TimeGroupingDetails', ['field', 'formula', 'separator']
 )
 
+# constants to abbreviate (S)torage and (C)ompute
+STORAGE = 'S'
+COMPUTE = 'C'
+
 
 def abbrev_cost_category(cost_category: str) -> str:
     """abbreviate cost category"""
-    return 'S' if cost_category == 'Cloud Storage' else 'C'
+    return STORAGE if cost_category == 'Cloud Storage' else COMPUTE
 
 
 def prepare_time_periods(
     query: BillingTotalCostQueryModel,
 ) -> TimeGroupingDetails:
     """Prepare Time periods grouping and parsing formulas"""
-    time_column = query.time_column or 'day'
-    result = TimeGroupingDetails('', '', '')
+    time_column: BillingTimeColumn = query.time_column or BillingTimeColumn.DAY
 
     # Based on specified time period, add the corresponding column
     if query.time_periods == BillingTimePeriods.DAY:
-        result = TimeGroupingDetails(
-            field=f'FORMAT_DATE("%Y-%m-%d", {time_column}) as day',
+        return TimeGroupingDetails(
+            field=f'FORMAT_DATE("%Y-%m-%d", {time_column.value}) as day',
             formula='PARSE_DATE("%Y-%m-%d", day) as day',
             separator=',',
         )
-    elif query.time_periods == BillingTimePeriods.WEEK:
-        result = TimeGroupingDetails(
-            field=f'FORMAT_DATE("%Y%W", {time_column}) as day',
+
+    if query.time_periods == BillingTimePeriods.WEEK:
+        return TimeGroupingDetails(
+            field=f'FORMAT_DATE("%Y%W", {time_column.value}) as day',
             formula='PARSE_DATE("%Y%W", day) as day',
             separator=',',
         )
-    elif query.time_periods == BillingTimePeriods.MONTH:
-        result = TimeGroupingDetails(
-            field=f'FORMAT_DATE("%Y%m", {time_column}) as day',
+
+    if query.time_periods == BillingTimePeriods.MONTH:
+        return TimeGroupingDetails(
+            field=f'FORMAT_DATE("%Y%m", {time_column.value}) as day',
             formula='PARSE_DATE("%Y%m", day) as day',
             separator=',',
         )
-    elif query.time_periods == BillingTimePeriods.INVOICE_MONTH:
-        result = TimeGroupingDetails(
+
+    if query.time_periods == BillingTimePeriods.INVOICE_MONTH:
+        return TimeGroupingDetails(
             field='invoice_month as day',
             formula='PARSE_DATE("%Y%m", day) as day',
             separator=',',
         )
 
-    return result
+    return TimeGroupingDetails('', '', '')
+
+
+def time_optimisation_parameter() -> bigquery.ScalarQueryParameter:
+    """
+    BQ tables and views are partitioned by day; to avoid full scans,
+    we need to limit the amount of data scanned
+    """
+    return bigquery.ScalarQueryParameter('days', 'INT64', -int(BQ_DAYS_BACK_OPTIMAL))
 
 
 class BillingBaseTable(BqDbBase):
@@ -165,7 +180,7 @@ async def _budgets_by_gcp_project(
         WITH t AS (
             SELECT gcp_project, MAX(created_at) as last_created_at
             FROM `{BQ_BUDGET_VIEW}`
-            GROUP BY 1
+            GROUP BY gcp_project
         )
         SELECT t.gcp_project, d.budget
         FROM t inner join `{BQ_BUDGET_VIEW}` d
@@ -193,7 +208,7 @@ async def _last_loaded_day(self):
         """
 
         query_parameters = [
-            bigquery.ScalarQueryParameter('days', 'INT64', -int(BQ_DAYS_BACK_OPTIMAL)),
+            time_optimisation_parameter(),
         ]
         query_job_result = self._execute_query(_query, query_parameters)
@@ -328,38 +343,36 @@ async def _append_total_running_cost(
         all_details = []
         for cat, mth_cost in total_monthly_category.items():
             all_details.append(
-                {
-                    'cost_group': abbrev_cost_category(cat),
-                    'cost_category': cat,
-                    'daily_cost': total_daily_category[cat]
-                    if is_current_month
-                    else None,
-                    'monthly_cost': mth_cost,
-                }
+                BillingCostDetailsRecord(
+                    cost_group=abbrev_cost_category(cat),
+                    cost_category=cat,
+                    daily_cost=total_daily_category[cat] if is_current_month else None,
+                    monthly_cost=mth_cost,
+                )
             )
 
         # add total row: compute + storage
         results.append(
-            BillingCostBudgetRecord.from_json(
-                {
-                    'field': f'{BillingColumn.generate_all_title(field)}',
-                    'total_monthly': (
-                        total_monthly['C']['ALL'] + total_monthly['S']['ALL']
-                    ),
-                    'total_daily': (total_daily['C']['ALL'] + total_daily['S']['ALL'])
-                    if is_current_month
-                    else None,
-                    'compute_monthly': total_monthly['C']['ALL'],
-                    'compute_daily': (total_daily['C']['ALL'])
-                    if is_current_month
-                    else None,
-                    'storage_monthly': total_monthly['S']['ALL'],
-                    'storage_daily': (total_daily['S']['ALL'])
-                    if is_current_month
-                    else None,
-                    'details': all_details,
-                    'last_loaded_day': last_loaded_day,
-                }
+            BillingCostBudgetRecord(
+                field=f'{BillingColumn.generate_all_title(field)}',
+                total_monthly=(
+                    total_monthly[COMPUTE]['ALL'] + total_monthly[STORAGE]['ALL']
+                ),
+                total_daily=(total_daily[COMPUTE]['ALL'] + total_daily[STORAGE]['ALL'])
+                if is_current_month
+                else None,
+                compute_monthly=total_monthly[COMPUTE]['ALL'],
+                compute_daily=(total_daily[COMPUTE]['ALL'])
+                if is_current_month
+                else None,
+                storage_monthly=total_monthly[STORAGE]['ALL'],
+                storage_daily=(total_daily[STORAGE]['ALL'])
+                if is_current_month
+                else None,
+                details=all_details,
+                budget_spent=None,
+                budget=None,
+                last_loaded_day=last_loaded_day,
             )
         )
@@ -385,13 +398,17 @@ async def _append_running_cost_records(
 
         # add rows by field
         for key, details in field_details.items():
-            compute_daily = total_daily['C'][key] if key in total_daily['C'] else 0
-            storage_daily = total_daily['S'][key] if key in total_daily['S'] else 0
+            compute_daily = (
+                total_daily[COMPUTE][key] if key in total_daily[COMPUTE] else 0
+            )
+            storage_daily = (
+                total_daily[STORAGE][key] if key in total_daily[STORAGE] else 0
+            )
             compute_monthly = (
-                total_monthly['C'][key] if key in total_monthly['C'] else 0
+                total_monthly[COMPUTE][key] if key in total_monthly[COMPUTE] else 0
             )
             storage_monthly = (
-                total_monthly['S'][key] if key in total_monthly['S'] else 0
+                total_monthly[STORAGE][key] if key in total_monthly[STORAGE] else 0
             )
             monthly = compute_monthly + storage_monthly
             budget_monthly = budgets_per_gcp_project.get(key)
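The new `time_optimisation_parameter()` helper above centralises the `@days` parameter that every partition-pruning query previously built inline. A minimal sketch of how a negative `@days` offset typically pairs with a predicate on the partitioning column; the table name, the `day` predicate, and the `BQ_DAYS_BACK_OPTIMAL` stand-in are illustrative assumptions, not code from this PR:

```python
from google.cloud import bigquery

BQ_DAYS_BACK_OPTIMAL = 30  # assumed stand-in for the api.settings value


def time_optimisation_parameter() -> bigquery.ScalarQueryParameter:
    """Negative day offset, so queries only touch recent partitions."""
    return bigquery.ScalarQueryParameter('days', 'INT64', -int(BQ_DAYS_BACK_OPTIMAL))


def get_recent_topics(client: bigquery.Client, table_name: str) -> list[str]:
    # @days is negative, so TIMESTAMP_ADD moves the cutoff back in time;
    # the predicate on the day partition column lets BigQuery skip old partitions
    query = f"""
        SELECT DISTINCT topic
        FROM `{table_name}`
        WHERE day > TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL @days DAY)
        ORDER BY topic ASC
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[time_optimisation_parameter()]
    )
    return [row.topic for row in client.query(query, job_config=job_config)]
```

Keeping the offset in one helper means every caller scans the same window, and a single setting controls the trade-off between scan cost and history depth.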
diff --git a/db/python/tables/bq/billing_daily.py b/db/python/tables/bq/billing_daily.py
index e7cfd2148..14f21cef0 100644
--- a/db/python/tables/bq/billing_daily.py
+++ b/db/python/tables/bq/billing_daily.py
@@ -1,7 +1,10 @@
 from google.cloud import bigquery
 
-from api.settings import BQ_AGGREG_VIEW, BQ_DAYS_BACK_OPTIMAL
-from db.python.tables.bq.billing_base import BillingBaseTable
+from api.settings import BQ_AGGREG_VIEW
+from db.python.tables.bq.billing_base import (
+    BillingBaseTable,
+    time_optimisation_parameter,
+)
 
 
 class BillingDailyTable(BillingBaseTable):
@@ -31,7 +34,7 @@ async def get_topics(self):
         """
 
         query_parameters = [
-            bigquery.ScalarQueryParameter('days', 'INT64', -int(BQ_DAYS_BACK_OPTIMAL)),
+            time_optimisation_parameter(),
         ]
         query_job_result = self._execute_query(_query, query_parameters)
 
@@ -42,12 +45,13 @@ async def get_topics(self):
         return []
 
     async def get_invoice_months(self):
-        """Get all invoice months in database"""
+        """Get all invoice months in the database
+        Aggregated views contain an invoice_month field
+        """
 
         _query = f"""
-        SELECT DISTINCT FORMAT_DATE("%Y%m", day) as invoice_month
+        SELECT DISTINCT invoice_month
         FROM `{self.table_name}`
-        WHERE EXTRACT(day from day) = 1
         ORDER BY invoice_month DESC;
         """
 
@@ -76,7 +80,7 @@ async def get_cost_categories(self):
         """
 
         query_parameters = [
-            bigquery.ScalarQueryParameter('days', 'INT64', -int(BQ_DAYS_BACK_OPTIMAL)),
+            time_optimisation_parameter(),
         ]
         query_job_result = self._execute_query(_query, query_parameters)
 
@@ -114,7 +118,7 @@ async def get_skus(
         _query += ' OFFSET @offset_val'
 
         query_parameters = [
-            bigquery.ScalarQueryParameter('days', 'INT64', -int(BQ_DAYS_BACK_OPTIMAL)),
+            time_optimisation_parameter(),
             bigquery.ScalarQueryParameter('limit_val', 'INT64', limit),
             bigquery.ScalarQueryParameter('offset_val', 'INT64', offset),
         ]
diff --git a/db/python/tables/bq/billing_daily_extended.py b/db/python/tables/bq/billing_daily_extended.py
index e65dd2787..009144911 100644
--- a/db/python/tables/bq/billing_daily_extended.py
+++ b/db/python/tables/bq/billing_daily_extended.py
@@ -1,7 +1,8 @@
-from google.cloud import bigquery
-
-from api.settings import BQ_AGGREG_EXT_VIEW, BQ_DAYS_BACK_OPTIMAL
-from db.python.tables.bq.billing_base import BillingBaseTable
+from api.settings import BQ_AGGREG_EXT_VIEW
+from db.python.tables.bq.billing_base import (
+    BillingBaseTable,
+    time_optimisation_parameter,
+)
 from models.models import BillingColumn
 
 
@@ -39,7 +40,7 @@ async def get_extended_values(self, field: str):
         """
 
         query_parameters = [
-            bigquery.ScalarQueryParameter('days', 'INT64', -int(BQ_DAYS_BACK_OPTIMAL)),
+            time_optimisation_parameter(),
         ]
         query_job_result = self._execute_query(_query, query_parameters)
diff --git a/db/python/tables/bq/billing_gcp_daily.py b/db/python/tables/bq/billing_gcp_daily.py
index 1588ab65f..b765547c3 100644
--- a/db/python/tables/bq/billing_gcp_daily.py
+++ b/db/python/tables/bq/billing_gcp_daily.py
@@ -2,8 +2,11 @@
 
 from google.cloud import bigquery
 
-from api.settings import BQ_DAYS_BACK_OPTIMAL, BQ_GCP_BILLING_VIEW
-from db.python.tables.bq.billing_base import BillingBaseTable
+from api.settings import BQ_GCP_BILLING_VIEW
+from db.python.tables.bq.billing_base import (
+    BillingBaseTable,
+    time_optimisation_parameter,
+)
 from db.python.tables.bq.billing_filter import BillingFilter
 from db.python.tables.bq.generic_bq_filter import GenericBQFilter
 from models.models import BillingTotalCostQueryModel
@@ -39,6 +42,15 @@ def _query_to_partitioned_filter(
             if query.end_date
             else None,
         )
+        # add day filter after partition filter is applied
+        billing_filter.day = GenericBQFilter[datetime](
+            gte=datetime.strptime(query.start_date, '%Y-%m-%d')
+            if query.start_date
+            else None,
+            lte=datetime.strptime(query.end_date, '%Y-%m-%d')
+            if query.end_date
+            else None,
+        )
         return billing_filter
 
     async def _last_loaded_day(self):
@@ -56,7 +68,7 @@ async def _last_loaded_day(self):
         """
 
         query_parameters = [
-            bigquery.ScalarQueryParameter('days', 'INT64', -int(BQ_DAYS_BACK_OPTIMAL)),
+            time_optimisation_parameter(),
         ]
         query_job_result = self._execute_query(_query, query_parameters)
 
@@ -121,7 +133,7 @@ async def get_gcp_projects(self):
         """
 
         query_parameters = [
-            bigquery.ScalarQueryParameter('days', 'INT64', -int(BQ_DAYS_BACK_OPTIMAL)),
+            time_optimisation_parameter(),
         ]
         query_job_result = self._execute_query(_query, query_parameters)
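The `_query_to_partitioned_filter` change above sets a `day` filter in addition to the `part_time` partition filter. A rough, hypothetical sketch of the WHERE-clause shape this produces — the first predicate lets BigQuery prune whole partitions, the second bounds rows inside the partitions that survive (column and parameter names are illustrative; the real SQL is assembled by the `BillingFilter`/`GenericBQFilter` machinery):

```python
from datetime import datetime


def partitioned_where(start_date: str | None, end_date: str | None) -> tuple[str, dict]:
    """Sketch: combine a partition-pruning predicate with a row-level one."""
    clauses: list[str] = []
    params: dict[str, datetime] = {}
    if start_date:
        params['start_day'] = datetime.strptime(start_date, '%Y-%m-%d')
        # part_time prunes whole partitions; day then bounds rows inside them
        clauses.append('part_time >= @start_day AND day >= @start_day')
    if end_date:
        params['end_day'] = datetime.strptime(end_date, '%Y-%m-%d')
        clauses.append('part_time <= @end_day AND day <= @end_day')
    return ' AND '.join(clauses), params
```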
diff --git a/db/python/tables/bq/function_bq_filter.py b/db/python/tables/bq/function_bq_filter.py
index 07e89e1eb..f18f60211 100644
--- a/db/python/tables/bq/function_bq_filter.py
+++ b/db/python/tables/bq/function_bq_filter.py
@@ -97,7 +97,7 @@ def _sql_cond_prep(key: str, value: Any) -> str:
     def _sql_value_prep(key: str, value: Any) -> bigquery.ScalarQueryParameter:
         """ """
         if isinstance(value, Enum):
-            return bigquery.ScalarQueryParameter(key, 'STRING', value.value)
+            return FunctionBQFilter._sql_value_prep(key, value.value)
         if isinstance(value, int):
             return bigquery.ScalarQueryParameter(key, 'INT64', value)
         if isinstance(value, float):
diff --git a/db/python/tables/bq/generic_bq_filter.py b/db/python/tables/bq/generic_bq_filter.py
index 8aeabd729..b0bfba973 100644
--- a/db/python/tables/bq/generic_bq_filter.py
+++ b/db/python/tables/bq/generic_bq_filter.py
@@ -38,17 +38,13 @@ def to_sql(
                 values[k] = self._sql_value_prep(k, self.in_[0])
             else:
                 k = self.generate_field_name(_column_name + '_in')
-                conditionals.append(
-                    f'{column} IN UNNEST({self._sql_cond_prep(k, self.in_)})'
-                )
+                conditionals.append(f'{column} IN ({self._sql_cond_prep(k, self.in_)})')
                 values[k] = self._sql_value_prep(k, self.in_)
         if self.nin is not None:
             if not isinstance(self.nin, list):
                 raise ValueError('NIN filter must be a list')
             k = self.generate_field_name(column + '_nin')
-            conditionals.append(
-                f'{column} NOT IN UNNEST({self._sql_cond_prep(k, self.nin)})'
-            )
+            conditionals.append(f'{column} NOT IN ({self._sql_cond_prep(k, self.nin)})')
             values[k] = self._sql_value_prep(k, self.nin)
         if self.gt is not None:
             k = self.generate_field_name(column + '_gt')
@@ -87,16 +83,11 @@ def _sql_value_prep(key, value):
         Overrides the default _sql_value_prep to handle BQ parameters
         """
         if isinstance(value, list):
-            if value and isinstance(value[0], int):
-                return bigquery.ArrayQueryParameter(key, 'INT64', value)
-            if value and isinstance(value[0], float):
-                return bigquery.ArrayQueryParameter(key, 'FLOAT64', value)
-
-            # otherwise all list records as string
-            return bigquery.ArrayQueryParameter(key, 'STRING', [str(v) for v in value])
-
+            return bigquery.ArrayQueryParameter(
+                key, 'STRING', ','.join([str(v) for v in value])
+            )
         if isinstance(value, Enum):
-            return bigquery.ScalarQueryParameter(key, 'STRING', value.value)
+            return GenericBQFilter._sql_value_prep(key, value.value)
         if isinstance(value, int):
             return bigquery.ScalarQueryParameter(key, 'INT64', value)
         if isinstance(value, float):
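Both `_sql_value_prep` overrides above now unwrap `Enum` values by recursing on `value.value`, so an enum whose payload is an int becomes an `INT64` parameter instead of always being stringified. A self-contained sketch of that dispatch (a module-level function for brevity; in the PR these are static methods on the filter classes):

```python
from enum import Enum
from typing import Any

from google.cloud import bigquery


def sql_value_prep(key: str, value: Any) -> bigquery.ScalarQueryParameter:
    """Map a Python value to a typed BigQuery scalar parameter."""
    if isinstance(value, Enum):
        # unwrap first, so e.g. an IntEnum member lands in the INT64 branch
        return sql_value_prep(key, value.value)
    if isinstance(value, int):
        return bigquery.ScalarQueryParameter(key, 'INT64', value)
    if isinstance(value, float):
        return bigquery.ScalarQueryParameter(key, 'FLOAT64', value)
    # default: treat everything else as a string
    return bigquery.ScalarQueryParameter(key, 'STRING', str(value))
```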
diff --git a/db/python/tables/bq/generic_bq_filter_model.py b/db/python/tables/bq/generic_bq_filter_model.py
index 3de5051af..c2736cc3a 100644
--- a/db/python/tables/bq/generic_bq_filter_model.py
+++ b/db/python/tables/bq/generic_bq_filter_model.py
@@ -19,6 +19,8 @@ def prepare_bq_query_from_dict_field(
             raise ValueError(f'Filter {field_name} must be a GenericFilter')
         if '"' in key:
             raise ValueError('Meta key contains " character, which is not allowed')
+        if "'" in key:
+            raise ValueError("Meta key contains ' character, which is not allowed")
         fconditionals, fvalues = value.to_sql(
             f"JSON_EXTRACT({column_name}, '$.{key}')",
             column_name=f'{column_name}_{key}',
diff --git a/models/enums/__init__.py b/models/enums/__init__.py
index 14bcb9d5a..14f047786 100644
--- a/models/enums/__init__.py
+++ b/models/enums/__init__.py
@@ -1,3 +1,4 @@
 from models.enums.analysis import AnalysisStatus
+from models.enums.billing import BillingSource, BillingTimeColumn, BillingTimePeriods
 from models.enums.search import SearchResponseType
 from models.enums.web import MetaSearchEntityPrefix
diff --git a/models/models/__init__.py b/models/models/__init__.py
index d3b836e9a..ce5868cb1 100644
--- a/models/models/__init__.py
+++ b/models/models/__init__.py
@@ -17,9 +17,6 @@
     BillingCostDetailsRecord,
     BillingHailBatchCostRecord,
     BillingInternal,
-    BillingSource,
-    BillingTimeColumn,
-    BillingTimePeriods,
     BillingTotalCostQueryModel,
     BillingTotalCostRecord,
 )
diff --git a/models/models/billing.py b/models/models/billing.py
index 05d062519..9587ed44c 100644
--- a/models/models/billing.py
+++ b/models/models/billing.py
@@ -135,59 +135,59 @@ def str_to_enum(cls, value: str) -> 'BillingColumn':
     def raw_cols(cls) -> list[str]:
         """Return list of raw column names"""
         return [
-            'id',
-            'topic',
-            'service',
-            'sku',
-            'usage_start_time',
-            'usage_end_time',
-            'project',
-            'labels',
-            'system_labels',
-            'location',
-            'export_time',
-            'cost',
-            'currency',
-            'currency_conversion_rate',
-            'usage',
-            'credits',
-            'invoice',
-            'cost_type',
-            'adjustment_info',
+            BillingColumn.ID.value,
+            BillingColumn.TOPIC.value,
+            BillingColumn.SERVICE.value,
+            BillingColumn.SKU.value,
+            BillingColumn.USAGE_START_TIME.value,
+            BillingColumn.USAGE_END_TIME.value,
+            BillingColumn.PROJECT.value,
+            BillingColumn.LABELS.value,
+            BillingColumn.SYSTEM_LABELS.value,
+            BillingColumn.LOCATION.value,
+            BillingColumn.EXPORT_TIME.value,
+            BillingColumn.COST.value,
+            BillingColumn.CURRENCY.value,
+            BillingColumn.CURRENCY_CONVERSION_RATE.value,
+            BillingColumn.USAGE.value,
+            BillingColumn.CREDITS.value,
+            BillingColumn.INVOICE.value,
+            BillingColumn.COST_TYPE.value,
+            BillingColumn.ADJUSTMENT_INFO.value,
         ]
 
     @classmethod
     def standard_cols(cls) -> list[str]:
         """Return list of standard column names"""
         return [
-            'topic',
-            'gcp_project',
-            'sku',
-            'currency',
-            'cost',
-            'labels',
-            'day',
-            'cost_category',
-            'ar_guid',
-            'invoice_month',
+            BillingColumn.TOPIC.value,
+            BillingColumn.GCP_PROJECT.value,
+            BillingColumn.SKU.value,
+            BillingColumn.CURRENCY.value,
+            BillingColumn.COST.value,
+            BillingColumn.LABELS.value,
+            BillingColumn.DAY.value,
+            BillingColumn.COST_CATEGORY.value,
+            BillingColumn.AR_GUID.value,
+            BillingColumn.INVOICE_MONTH.value,
         ]
 
     @classmethod
     def extended_cols(cls) -> list[str]:
         """Return list of extended column names"""
         return [
-            'dataset',
-            'batch_id',
-            'sequencing_type',
-            'stage',
-            'sequencing_group',
-            'ar_guid',
-            'compute_category',
-            'cromwell_sub_workflow_name',
-            'cromwell_workflow_id',
-            'goog_pipelines_worker',
-            'wdl_task_name',
-            'namespace',
+            BillingColumn.DATASET.value,
+            BillingColumn.BATCH_ID.value,
+            BillingColumn.SEQUENCING_TYPE.value,
+            BillingColumn.STAGE.value,
+            BillingColumn.SEQUENCING_GROUP.value,
+            BillingColumn.AR_GUID.value,
+            BillingColumn.COMPUTE_CATEGORY.value,
+            BillingColumn.CROMWELL_SUB_WORKFLOW_NAME.value,
+            BillingColumn.CROMWELL_WORKFLOW_ID.value,
+            BillingColumn.GOOG_PIPELINES_WORKER.value,
+            BillingColumn.WDL_TASK_NAME.value,
+            BillingColumn.NAMESPACE.value,
         ]
 
     @staticmethod
diff --git a/scripts/create_md5s.py b/scripts/create_md5s.py
index 488ec8297..f99f64906 100644
--- a/scripts/create_md5s.py
+++ b/scripts/create_md5s.py
@@ -12,24 +12,25 @@ def create_md5s_for_files_in_directory(skip_filetypes: tuple[str, str], force_re
     if not gs_dir.startswith('gs://'):
         raise ValueError(f'Expected GS directory, got: {gs_dir}')
 
-    billing_project = get_config()['hail']['billing_project']
+    billing_project = get_config()['workflow']['gcp_billing_project']
    driver_image = get_config()['workflow']['driver_image']
 
     bucket_name, *components = gs_dir[5:].split('/')
 
     client = storage.Client()
-    blobs = client.list_blobs(bucket_name, prefix='/'.join(components))
+    bucket = client.bucket(bucket_name, user_project=billing_project)
+    blobs = bucket.list_blobs(prefix='/'.join(components))
     files: set[str] = {f'gs://{bucket_name}/{blob.name}' for blob in blobs}
-    for obj in files:
-        if obj.endswith('.md5') or obj.endswith(skip_filetypes):
+    for filepath in files:
+        if filepath.endswith('.md5') or filepath.endswith(skip_filetypes):
             continue
-        if f'{obj}.md5' in files and not force_recreate:
-            print(f'{obj}.md5 already exists, skipping')
+        if f'{filepath}.md5' in files and not force_recreate:
+            print(f'{filepath}.md5 already exists, skipping')
             continue
 
-        print('Creating md5 for', obj)
-        job = b.new_job(f'Create {os.path.basename(obj)}.md5')
-        create_md5(job, obj, billing_project, driver_image)
+        print('Creating md5 for', filepath)
+        job = b.new_job(f'Create {os.path.basename(filepath)}.md5')
+        create_md5(job, filepath, billing_project, driver_image)
 
     b.run(wait=False)
 
@@ -46,7 +47,7 @@ def create_md5(job, file, billing_project, driver_image):
         f"""\
         set -euxo pipefail
         gcloud -q auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS
-        gsutil cat {file} | md5sum | cut -d " " -f1 > /tmp/uploaded.md5
+        gsutil -u {billing_project} cat {file} | md5sum | cut -d " " -f1 > /tmp/uploaded.md5
         gsutil -u {billing_project} cp /tmp/uploaded.md5 {md5}
         """
     )
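The `create_md5s.py` changes switch both the blob listing and the `gsutil` commands to requester-pays access, so the nominated project is billed rather than the bucket owner. A minimal sketch of the listing side using only documented `google-cloud-storage` calls (bucket, prefix, and project names are placeholders):

```python
from google.cloud import storage


def list_gs_paths(bucket_name: str, prefix: str, billing_project: str) -> set[str]:
    """List objects in a requester-pays bucket, billing the given project."""
    client = storage.Client()
    # user_project attaches the billing project to every request on this bucket
    bucket = client.bucket(bucket_name, user_project=billing_project)
    return {f'gs://{bucket_name}/{blob.name}' for blob in bucket.list_blobs(prefix=prefix)}
```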
diff --git a/scripts/create_test_subset.py b/scripts/create_test_subset.py
index cc1adf52b..14f8ccd00 100755
--- a/scripts/create_test_subset.py
+++ b/scripts/create_test_subset.py
@@ -276,7 +276,7 @@ def transfer_samples_sgs_assays(
             type=sample_type or None,
             meta=(copy_files_in_dict(s['meta'], project) or {}),
             participant_id=existing_pid,
-            sequencing_groups=upsert_sequencing_groups(s, existing_data),
+            sequencing_groups=upsert_sequencing_groups(s, existing_data, project),
             id=existing_sid,
         )
 
@@ -289,7 +289,7 @@ def transfer_samples_sgs_assays(
 
 
 def upsert_sequencing_groups(
-    sample: dict, existing_data: dict
+    sample: dict, existing_data: dict, project: str
 ) -> list[SequencingGroupUpsert]:
     """Create SG Upsert Objects for a sample"""
     sgs_to_upsert: list[SequencingGroupUpsert] = []
@@ -306,7 +306,7 @@ def upsert_sequencing_groups(
             technology=sg.get('technology'),
             type=sg.get('type'),
             assays=upsert_assays(
-                sg, existing_sgid, existing_data, sample.get('externalId')
+                sg, existing_sgid, existing_data, sample.get('externalId'), project
             ),
         )
         sgs_to_upsert.append(sg_upsert)
@@ -315,7 +315,11 @@ def upsert_sequencing_groups(
 
 
 def upsert_assays(
-    sg: dict, existing_sgid: str | None, existing_data: dict, sample_external_id
+    sg: dict,
+    existing_sgid: str | None,
+    existing_data: dict,
+    sample_external_id,
+    project: str,
 ) -> list[AssayUpsert]:
     """Create Assay Upsert Objects for a sequencing group"""
     print(sg)
@@ -325,17 +329,14 @@ def upsert_assays(
         # Check if assay exists
         if existing_sgid:
             _existing_assay = get_existing_assay(
-                existing_data,
-                sample_external_id,
-                existing_sgid,
-                assay
+                existing_data, sample_external_id, existing_sgid, assay
             )
         existing_assay_id = _existing_assay.get('id') if _existing_assay else None
         assay_upsert = AssayUpsert(
             type=assay.get('type'),
             id=existing_assay_id,
             external_ids=assay.get('externalIds') or {},
-            meta=assay.get('meta'),
+            meta=copy_files_in_dict(assay.get('meta'), project),
         )
 
         assays_to_upsert.append(assay_upsert)
diff --git a/scripts/parse_existing_cohort.py b/scripts/parse_existing_cohort.py
index 34d64f370..2603a8d60 100644
--- a/scripts/parse_existing_cohort.py
+++ b/scripts/parse_existing_cohort.py
@@ -111,6 +111,7 @@ def __init__(
         batch_number,
         include_participant_column,
         allow_missing_files,
+        sequencing_type,
     ):
         if include_participant_column:
             participant_column = Columns.PARTICIPANT_COLUMN
@@ -131,6 +132,7 @@ def __init__(
             assay_meta_map=Columns.sequence_meta_map(),
             batch_number=batch_number,
             allow_extra_files_in_search_path=True,
+            default_sequencing_type=sequencing_type,
         )
 
     def _get_dict_reader(self, file_pointer, delimiter: str):
@@ -210,6 +212,11 @@ def get_existing_external_sequence_ids(self, participant_map: dict[str, dict]):
     '--project',
     help='The metamist project to import manifest into',
 )
+@click.option(
+    '--sequencing-type',
+    type=click.Choice(['genome', 'exome']),
+    help='Sequencing type: genome or exome',
+)
 @click.option('--search-location', 'search_locations', multiple=True)
 @click.option(
     '--confirm', is_flag=True, help='Confirm with user input before updating server'
 )
@@ -236,6 +243,7 @@ async def main(
     dry_run=False,
     include_participant_column=False,
     allow_missing_files=False,
+    sequencing_type: str = 'genome',
 ):
     """Run script from CLI arguments"""
 
@@ -245,6 +253,7 @@ async def main(
         batch_number=batch_number,
         include_participant_column=include_participant_column,
         allow_missing_files=allow_missing_files,
+        sequencing_type=sequencing_type,
     )
 
     for manifest_path in manifests:
diff --git a/test/test_parse_existing_cohort.py b/test/test_parse_existing_cohort.py
index d8e755bf7..8fb169803 100644
--- a/test/test_parse_existing_cohort.py
+++ b/test/test_parse_existing_cohort.py
@@ -45,6 +45,7 @@ async def test_single_row(
             search_locations=[],
             project=self.project_name,
             allow_missing_files=False,
+            sequencing_type='genome',
         )
 
         parser.filename_map = {
@@ -116,6 +117,7 @@ async def test_no_header(self):
             search_locations=[],
             project=self.project_name,
             allow_missing_files=False,
+            sequencing_type='genome',
         )
 
         parser.filename_map = {
@@ -217,6 +219,7 @@ async def test_existing_row(
             search_locations=[],
             project=self.project_name,
             allow_missing_files=False,
+            sequencing_type='genome',
         )
 
         parser.filename_map = {
@@ -248,6 +251,7 @@ async def test_get_read_filenames_no_reads_fail(self):
             search_locations=[],
             project=self.project_name,
             allow_missing_files=False,
+            sequencing_type='genome',
         )
 
         parser.filename_map = {}
@@ -268,6 +272,7 @@ async def test_get_read_filenames_no_reads_pass(self):
             search_locations=[],
             project=self.project_name,
             allow_missing_files=True,
+            sequencing_type='genome',
         )
 
         parser.filename_map = {}
@@ -280,3 +285,117 @@ async def test_get_read_filenames_no_reads_pass(self):
 
         self.assertIn('No read files found for ', cm.output[0])
         self.assertEqual(len(read_filenames), 0)
+
+    @run_as_sync
+    async def test_genome_sequencing_type(self):
+        """Test that the sequencing type is set correctly when the --sequencing-type flag is set to 'genome'"""
+
+        # Test with 'genome'
+        parser = ExistingCohortParser(
+            include_participant_column=False,
+            batch_number='M01',
+            search_locations=[],
+            project=self.project_name,
+            allow_missing_files=True,
+            sequencing_type='genome',
+        )
+        self.assertEqual(parser.default_sequencing_type, 'genome')
+
+    @run_as_sync
+    async def test_exome_sequencing_type(self):
+        """Test that the sequencing type is set correctly when the --sequencing-type flag is set to 'exome'"""
+
+        # Test with 'exome'
+        parser = ExistingCohortParser(
+            include_participant_column=False,
+            batch_number='M01',
+            search_locations=[],
+            project=self.project_name,
+            allow_missing_files=True,
+            sequencing_type='exome',
+        )
+        self.assertEqual(parser.default_sequencing_type, 'exome')
+
+    @run_as_sync
+    @patch('metamist.parser.generic_parser.query_async')
+    @patch('metamist.parser.cloudhelper.CloudHelper.datetime_added')
+    @patch('metamist.parser.cloudhelper.CloudHelper.file_exists')
+    @patch('metamist.parser.cloudhelper.CloudHelper.file_size')
+    async def test_sequencing_type_in_assay_meta(
+        self,
+        mock_filesize,
+        mock_fileexists,
+        mock_datetime_added,
+        mock_graphql_query,
+    ):
+        """Test that the sequencing type is recorded in the assay meta when the --sequencing-type flag is set to 'genome' or 'exome'"""
+
+        mock_graphql_query.side_effect = self.run_graphql_query_async
+
+        mock_filesize.return_value = 111
+        mock_fileexists.return_value = False
+        mock_datetime_added.return_value = datetime.fromisoformat('2022-02-02T22:22:22')
+
+        rows = [
+            'HEADER',
+            '""',
+            'Application\tExternal ID\tSample Concentration (ng/ul)\tVolume (uL)\tSex\tSample/Name\tReference Genome\tParticipant ID\t',
+            'App\tEXTID1234\t100\t100\tFemale\t220405_FLUIDX1234\thg38\tPID123',
+        ]
+
+        for sequencing_type in ['genome', 'exome']:
+            with self.subTest(sequencing_type=sequencing_type):
+                parser = ExistingCohortParser(
+                    include_participant_column=False,
+                    batch_number='M01',
+                    search_locations=[],
+                    project=self.project_name,
+                    allow_missing_files=False,
+                    sequencing_type=sequencing_type,
+                )
+
+                parser.filename_map = {
+                    'HG3F_2_220405_FLUIDX1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq': '/path/to/HG3F_2_220405_FLUIDX1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq',
+                    'HG3F_2_220405_FLUIDX1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq': '/path/to/HG3F_2_220405_FLUIDX1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq',
+                }
+
+                file_contents = '\n'.join(rows)
+                participants: list[ParsedParticipant]
+                _, participants = await parser.parse_manifest(
+                    StringIO(file_contents), delimiter='\t', dry_run=True
+                )
+
+                sample_to_add = participants[0].samples[0]
+                expected_sequence_dict = {
+                    'reference_genome': 'hg38',
+                    'platform': 'App',
+                    'concentration': '100',
+                    'volume': '100',
+                    'fluid_x_tube_id': '220405_FLUIDX1234',
+                    'reads_type': 'fastq',
+                    'reads': [
+                        {
+                            'location': '/path/to/HG3F_2_220405_FLUIDX1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq',
+                            'basename': 'HG3F_2_220405_FLUIDX1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq',
+                            'class': 'File',
+                            'checksum': None,
+                            'size': 111,
+                            'datetime_added': '2022-02-02T22:22:22',
+                        },
+                        {
+                            'location': '/path/to/HG3F_2_220405_FLUIDX1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq',
+                            'basename': 'HG3F_2_220405_FLUIDX1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq',
+                            'class': 'File',
+                            'checksum': None,
+                            'size': 111,
+                            'datetime_added': '2022-02-02T22:22:22',
+                        },
+                    ],
+                    'sequencing_platform': 'illumina',
+                    'sequencing_technology': 'short-read',
+                    'sequencing_type': f'{sequencing_type}',
+                    'batch': 'M01',
+                }
+                assay = sample_to_add.sequencing_groups[0].assays[0]
+                self.maxDiff = None
+                self.assertDictEqual(expected_sequence_dict, assay.meta)
+        return
diff --git a/web/src/pages/billing/BillingInvoiceMonthCost.tsx b/web/src/pages/billing/BillingInvoiceMonthCost.tsx
index 3dcfc9e48..e06324670 100644
--- a/web/src/pages/billing/BillingInvoiceMonthCost.tsx
+++ b/web/src/pages/billing/BillingInvoiceMonthCost.tsx
@@ -169,7 +169,7 @@ const BillingCurrentCost = () => {
         const year = invoiceMonth.substring(0, 4)
         const month = invoiceMonth.substring(4, 6)
         let nextYear = year
-        let nextMonth = (parseInt(month, 10) + 1).toString()
+        let nextMonth = (parseInt(month, 10) + 1).toString().padStart(2, '0')
         if (month === '12') {
             nextYear = (parseInt(year, 10) + 1).toString()
             nextMonth = '01'
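The `padStart` fix above matters because concatenating an unpadded month drops the leading zero: incrementing `'202404'` would otherwise produce `'20245'` rather than `'202405'`. The same rollover logic, sketched in Python (a hypothetical helper, not code from this repo, shown in Python for consistency with the rest of the PR):

```python
def next_invoice_month(invoice_month: str) -> str:
    """Roll a YYYYMM invoice month forward by one month."""
    year, month = int(invoice_month[:4]), int(invoice_month[4:6])
    if month == 12:
        return f'{year + 1}01'
    # zero-pad, otherwise months 1-9 yield a 5-character string
    return f'{year}{month + 1:02d}'


assert next_invoice_month('202404') == '202405'
assert next_invoice_month('202312') == '202401'
```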
diff --git a/web/src/pages/billing/components/HailBatchGrid.tsx b/web/src/pages/billing/components/HailBatchGrid.tsx
index 4596a382f..d3e8e199c 100644
--- a/web/src/pages/billing/components/HailBatchGrid.tsx
+++ b/web/src/pages/billing/components/HailBatchGrid.tsx
@@ -398,6 +398,7 @@ const HailBatchGrid: React.FunctionComponent<{
                 {combinedData
                     .sort((a, b) => {
+                        // sort by 'batch_id' first, then by 'job_id', both ascending
                         if (a.batch_id < b.batch_id) {
                             return -1
                         }
diff --git a/web/src/shared/components/Graphs/DonutChart.tsx b/web/src/shared/components/Graphs/DonutChart.tsx
index 9f179098c..02f46292b 100644
--- a/web/src/shared/components/Graphs/DonutChart.tsx
+++ b/web/src/shared/components/Graphs/DonutChart.tsx
@@ -30,13 +30,13 @@ function calcTranslate(data: IDonutChartPreparadData, move = 4) {
 }
 
 export const DonutChart: React.FC = ({ data, maxSlices, colors, isLoading }) => {
-    // if (isLoading) {
-    //     return (
-    //         <div>
-    //             <LoadingDucks />
-    //         </div>
-    //     )
-    // }
+    if (isLoading) {
+        return (
+            <div>
+                <LoadingDucks />
+            </div>
+        )
+    }
 
     if (!data || data.length === 0) {
         return <>No Data</>
@@ -79,14 +79,15 @@ export const DonutChart: React.FC = ({ data, maxSlices, colors
     const margin = 15
     const radius = Math.min(width, height) / 2 - margin
 
-    // keep order of the slices
+    // keep the slices in the order they were passed in: declare a custom sort function,
+    // since the default pie() sort reorders slices by value
     const pieFnc = pie()
         .value((d) => d.value)
         .sort((a) => {
             if (typeof a === 'object' && a.type === 'inc') {
                 return 1
             }
-            return -1
+            return 0 // works on both Safari and Firefox; any other value will break one of them
         })
     const data_ready = pieFnc(data)
     const innerRadius = radius / 1.75 // inner radius of pie, in pixels (non-zero for donut)
diff --git a/web/src/shared/components/Graphs/StackedBarChart.tsx b/web/src/shared/components/Graphs/StackedBarChart.tsx
index f91955e18..00d8d79b0 100644
--- a/web/src/shared/components/Graphs/StackedBarChart.tsx
+++ b/web/src/shared/components/Graphs/StackedBarChart.tsx
@@ -25,6 +25,14 @@ function alignToStartOfMonth(date: Date): Date {
     return new Date(Date.UTC(year, month, 1))
 }
 
+/**
+ * Creates an array of three new dates, each incremented by a specified number of days from the given last date.
+ * If the difference in days is greater than 28, the dates are aligned to the start of their respective months.
+ *
+ * @param lastDate - The last date from which the new dates will be calculated.
+ * @param differenceInDays - The number of days to increment for each new date.
+ * @returns An array of three new Date objects.
+ */
 function createNewDates(lastDate: Date, differenceInDays: number): Date[] {
     const newDates: Date[] = []
     for (let i = 1; i <= 3; i++) {