From 3feaffff964141390d0c04d6c3c0cf25ba72fc0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karatu=C4=9F=20Ozan=20Bircan?= Date: Thu, 12 Dec 2024 17:03:54 +0000 Subject: [PATCH 1/2] fix: Generate yamls from old yamls --- sumstats_service/resources/api_utils.py | 188 +++++++++++++++++++++++- 1 file changed, 184 insertions(+), 4 deletions(-) diff --git a/sumstats_service/resources/api_utils.py b/sumstats_service/resources/api_utils.py index 2ef0e6c..7d790c6 100644 --- a/sumstats_service/resources/api_utils.py +++ b/sumstats_service/resources/api_utils.py @@ -1,3 +1,4 @@ +import csv import ftplib import glob import hashlib @@ -6,9 +7,9 @@ import os import subprocess import urllib -from datetime import date +from datetime import date, datetime from pathlib import Path -from typing import Union +from typing import Optional, Union from urllib.parse import unquote from flask import url_for @@ -456,7 +457,17 @@ def generate_yaml_hm(accession_id, is_harmonised_included): # ) logger.info(f"For hm {accession_id=} - {metadata_from_gwas_cat=}") - metadata_from_gwas_cat["date_metadata_last_modified"] = date.today() + + latest_update = find_latest_metadata_update( + os.path.join(config.FTP_STAGING_PATH, "gcst_harmo_metadatalastupdated.csv"), + accession_id, + "harmonised", + ) + logger.info(f"Latest update was at {latest_update}") + metadata_from_gwas_cat["date_metadata_last_modified"] = ( + latest_update if latest_update else date.today() + ) + metadata_from_gwas_cat["file_type"] = get_file_type_from_mongo(accession_id) metadata_from_gwas_cat["data_file_name"] = "" @@ -523,6 +534,18 @@ def generate_yaml_hm(accession_id, is_harmonised_included): filenames_to_md5_values[metadata_filename_hm] = compute_md5_local(out_file_hm) logger.info(f"For HM {accession_id=} - {filenames_to_md5_values=}") + md5sum_new_yaml = filenames_to_md5_values.get(metadata_filename_hm) + md5sums = find_latest_yamlmd5sums( + os.path.join(config.FTP_STAGING_PATH, "gcst_harmo_yamlmd5sum.csv"), + accession_id, + "harmonised", + ) + + if md5sum_new_yaml not in md5sums: + metadata_from_gwas_cat["date_metadata_last_modified"] = date.today() + metadata_client_hm.update_metadata(metadata_from_gwas_cat) + metadata_client_hm.to_file() + write_md5_for_files(filenames_to_md5_values, os.path.join(hm_dir, "md5sum.txt")) logger.info(f"Metadata yaml file creation is successful for HM {accession_id=}.") @@ -544,7 +567,16 @@ def generate_yaml_non_hm(accession_id, is_harmonised_included): # ) logger.info(f"For non-hm {accession_id=} - {metadata_from_gwas_cat=}") - metadata_from_gwas_cat["date_metadata_last_modified"] = date.today() + + latest_update = find_latest_metadata_update( + os.path.join(config.FTP_STAGING_PATH, "gcst_harmo_metadatalastupdated.csv"), + accession_id, + "not_harmonised", + ) + logger.info(f"Latest update was at {latest_update}") + metadata_from_gwas_cat["date_metadata_last_modified"] = ( + latest_update if latest_update else date.today() + ) metadata_from_gwas_cat["file_type"] = get_file_type_from_mongo(accession_id) metadata_from_gwas_cat["is_harmonised"] = False @@ -619,6 +651,18 @@ def generate_yaml_non_hm(accession_id, is_harmonised_included): filenames_to_md5_values[metadata_filename] = compute_md5_local(out_file) logger.info(f"For non-hm {accession_id=} - {filenames_to_md5_values=}") + md5sum_new_yaml = filenames_to_md5_values.get(metadata_filename_hm) + md5sums = find_latest_yamlmd5sums( + os.path.join(config.FTP_STAGING_PATH, "gcst_harmo_yamlmd5sum.csv"), + accession_id, + "not_harmonised", + ) + + if md5sum_new_yaml not in md5sums: + metadata_from_gwas_cat["date_metadata_last_modified"] = date.today() + metadata_client_hm.update_metadata(metadata_from_gwas_cat) + metadata_client_hm.to_file() + write_md5_for_files(filenames_to_md5_values, os.path.join(out_dir, "md5sum.txt")) logger.info( @@ -824,6 +868,46 @@ def get_md5_for_accession( return {} +def get_md5_for_yaml( + md5_checksums: dict, + accession_id: str, + is_harmonised=False, +) -> dict: + """ + Return the key (filename) and value (MD5 checksum) from md5_checksums + if there's a key that equals to accession_id.tsv.gz-meta.yaml + or accession_id.tsv-meta.yaml. + + Parameters: + - md5_checksums: Dictionary with filenames as keys and their MD5 checksums as + values. + - accession_id: The accession ID to look for, with .tsv or .tsv.gz extensions. + + Returns: + - A dictionary with the matching filename and its MD5 checksum. Empty if no + match is found. + """ + possible_keys = ( + [f"{accession_id}.tsv-meta.yaml", f"{accession_id}.tsv.gz-meta.yaml"] + if not is_harmonised + else [f"{accession_id}.h.tsv-meta.yaml", f"{accession_id}.h.tsv.gz-meta.yaml"] + ) + + # Check for exact matches first + for key in possible_keys: + if key in md5_checksums: + return {key: md5_checksums[key]} + + # Check for partial matches if no exact match is found + # i.e., files are named _.* + # e.g. http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90308001-GCST90309000/GCST90308682/ # noqa:E501 + for key in md5_checksums: + if "yaml" in key or "yml" in key: + return {key: md5_checksums[key]} + + return {} + + def construct_get_payload_response(callback_id): response = None payload = pl.Payload(callback_id=callback_id) @@ -884,3 +968,99 @@ def create_study_report(study): def val_from_dict(key, dict, default=None): return dict[key] if key in dict else default + + +def find_latest_metadata_update( + file_path: str, gcst_id: str, harmonised_status: str +) -> Optional[str]: + """ + Finds the latest metadata last update date for a given GCST ID + and harmonised status. + + Args: + file_path (str): Path to the CSV file. + gcst_id (str): The GCST identifier to search for. + harmonised_status (str): The harmonised status to filter by + (e.g., "harmonised" or "not_harmonised"). + + Returns: + Optional[str]: The latest update date in "YYYY-MM-DD" format if found, + else None. + """ + latest_date = None + date_format = "%Y-%m-%d" + + try: + with open(file_path, mode="r", newline="", encoding="utf-8") as csvfile: + reader = csv.reader(csvfile) + for row_number, row in enumerate(reader, start=1): + if len(row) != 3: + print(f"Skipping malformed row {row_number}: {row}") + continue + + current_gcst, current_status, date_str = [item.strip() for item in row] + + if ( + current_gcst == gcst_id + and current_status.lower() == harmonised_status.lower() + ): + try: + current_date = datetime.strptime(date_str, date_format) + if latest_date is None or current_date > latest_date: + latest_date = current_date + except ValueError: + print(f"Invalid date format on row {row_number}: {date_str}") + continue + + if latest_date: + return latest_date.strftime(date_format) + else: + print("No matching records found.") + return None + + except FileNotFoundError: + print(f"File not found: {file_path}") + return None + except Exception as e: + print(f"An error occurred: {e}") + return None + + +def find_latest_yamlmd5sums( + file_path: str, + gcst_id: str, + harmonised_status: str, + # search_file_name: str +) -> Optional[list]: + mdsums = [] + + try: + with open(file_path, mode="r", newline="", encoding="utf-8") as csvfile: + reader = csv.reader(csvfile) + for row_number, row in enumerate(reader, start=1): + if len(row) != 4: + print(f"Skipping malformed row {row_number}: {row}") + continue + + current_gcst, current_status, file_name, md5sum = [ + item.strip() for item in row + ] + + if ( + current_gcst == gcst_id + and current_status.lower() == harmonised_status.lower() + ): + mdsums.append(md5sum) + + if mdsums: + return mdsums + else: + print("No matching records found.") + return None + + except FileNotFoundError: + print(f"File not found: {file_path}") + return None + except Exception as e: + print(f"An error occurred: {e}") + return None From 5d4f085415631c2562c460951509449bea9de7f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karatu=C4=9F=20Ozan=20Bircan?= Date: Thu, 12 Dec 2024 17:26:30 +0000 Subject: [PATCH 2/2] fix: Add logs for debugging --- sumstats_service/resources/api_utils.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/sumstats_service/resources/api_utils.py b/sumstats_service/resources/api_utils.py index 7d790c6..07cd7d0 100644 --- a/sumstats_service/resources/api_utils.py +++ b/sumstats_service/resources/api_utils.py @@ -463,7 +463,7 @@ def generate_yaml_hm(accession_id, is_harmonised_included): accession_id, "harmonised", ) - logger.info(f"Latest update was at {latest_update}") + logger.info(f"For hm {accession_id=} - latest update was at {latest_update}") metadata_from_gwas_cat["date_metadata_last_modified"] = ( latest_update if latest_update else date.today() ) @@ -535,13 +535,18 @@ def generate_yaml_hm(accession_id, is_harmonised_included): logger.info(f"For HM {accession_id=} - {filenames_to_md5_values=}") md5sum_new_yaml = filenames_to_md5_values.get(metadata_filename_hm) + logger.info(f"For HM {accession_id=} - new md5sum is {md5sum_new_yaml}") + md5sums = find_latest_yamlmd5sums( os.path.join(config.FTP_STAGING_PATH, "gcst_harmo_yamlmd5sum.csv"), accession_id, "harmonised", ) + logger.info(f"For HM {accession_id=} - old md5sums are {md5sums}") + if md5sum_new_yaml not in md5sums: + logger.info(f"For HM {accession_id=} - Use today's date") metadata_from_gwas_cat["date_metadata_last_modified"] = date.today() metadata_client_hm.update_metadata(metadata_from_gwas_cat) metadata_client_hm.to_file() @@ -573,7 +578,7 @@ def generate_yaml_non_hm(accession_id, is_harmonised_included): accession_id, "not_harmonised", ) - logger.info(f"Latest update was at {latest_update}") + logger.info(f"For non-hm {accession_id=} - latest update was at {latest_update}") metadata_from_gwas_cat["date_metadata_last_modified"] = ( latest_update if latest_update else date.today() ) @@ -651,17 +656,21 @@ def generate_yaml_non_hm(accession_id, is_harmonised_included): filenames_to_md5_values[metadata_filename] = compute_md5_local(out_file) logger.info(f"For non-hm {accession_id=} - {filenames_to_md5_values=}") - md5sum_new_yaml = filenames_to_md5_values.get(metadata_filename_hm) + md5sum_new_yaml = filenames_to_md5_values.get(metadata_filename) + logger.info(f"For non-hm {accession_id=} - new md5sum is {md5sum_new_yaml}") + md5sums = find_latest_yamlmd5sums( os.path.join(config.FTP_STAGING_PATH, "gcst_harmo_yamlmd5sum.csv"), accession_id, "not_harmonised", ) + logger.info(f"For non-hm {accession_id=} - md5sums are {md5sums}") if md5sum_new_yaml not in md5sums: + logger.info(f"For non-hm {accession_id=} - Use today's date") metadata_from_gwas_cat["date_metadata_last_modified"] = date.today() - metadata_client_hm.update_metadata(metadata_from_gwas_cat) - metadata_client_hm.to_file() + metadata_client.update_metadata(metadata_from_gwas_cat) + metadata_client.to_file() write_md5_for_files(filenames_to_md5_values, os.path.join(out_dir, "md5sum.txt"))