fix: Generate yamls from old yamls #365

Open: wants to merge 2 commits into base: dev
197 changes: 193 additions & 4 deletions sumstats_service/resources/api_utils.py
@@ -1,3 +1,4 @@
import csv
import ftplib
import glob
import hashlib
@@ -6,9 +7,9 @@
import os
import subprocess
import urllib
from datetime import date
from datetime import date, datetime
from pathlib import Path
from typing import Union
from typing import Optional, Union
from urllib.parse import unquote

from flask import url_for
@@ -456,7 +457,17 @@ def generate_yaml_hm(accession_id, is_harmonised_included):
#
)
logger.info(f"For hm {accession_id=} - {metadata_from_gwas_cat=}")
metadata_from_gwas_cat["date_metadata_last_modified"] = date.today()

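# Reuse the last-modified date recorded in the staging CSV where one exists;
# fall back to today's date only when no record is found for this accession.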
latest_update = find_latest_metadata_update(
os.path.join(config.FTP_STAGING_PATH, "gcst_harmo_metadatalastupdated.csv"),
accession_id,
"harmonised",
)
logger.info(f"For hm {accession_id=} - latest update was at {latest_update}")
metadata_from_gwas_cat["date_metadata_last_modified"] = (
latest_update if latest_update else date.today()
)

metadata_from_gwas_cat["file_type"] = get_file_type_from_mongo(accession_id)

metadata_from_gwas_cat["data_file_name"] = ""
@@ -523,6 +534,23 @@ def generate_yaml_hm(accession_id, is_harmonised_included):
filenames_to_md5_values[metadata_filename_hm] = compute_md5_local(out_file_hm)
logger.info(f"For HM {accession_id=} - {filenames_to_md5_values=}")

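# Compare the md5sum of the freshly generated yaml against the md5sums
# previously recorded for this accession; if it is not among them, the
# metadata content has changed, so stamp today's date and rewrite the yaml.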
md5sum_new_yaml = filenames_to_md5_values.get(metadata_filename_hm)
logger.info(f"For HM {accession_id=} - new md5sum is {md5sum_new_yaml}")

md5sums = find_latest_yamlmd5sums(
os.path.join(config.FTP_STAGING_PATH, "gcst_harmo_yamlmd5sum.csv"),
accession_id,
"harmonised",
)

logger.info(f"For HM {accession_id=} - old md5sums are {md5sums}")

# md5sums can be None when no previous records exist; treat that as changed
if not md5sums or md5sum_new_yaml not in md5sums:
logger.info(f"For HM {accession_id=} - Use today's date")
metadata_from_gwas_cat["date_metadata_last_modified"] = date.today()
metadata_client_hm.update_metadata(metadata_from_gwas_cat)
metadata_client_hm.to_file()

write_md5_for_files(filenames_to_md5_values, os.path.join(hm_dir, "md5sum.txt"))
logger.info(f"Metadata yaml file creation is successful for HM {accession_id=}.")

@@ -544,7 +572,16 @@ def generate_yaml_non_hm(accession_id, is_harmonised_included):
#
)
logger.info(f"For non-hm {accession_id=} - {metadata_from_gwas_cat=}")
metadata_from_gwas_cat["date_metadata_last_modified"] = date.today()

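# As above for the harmonised yaml: prefer the recorded last-modified date,
# falling back to today's date when no record exists.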
latest_update = find_latest_metadata_update(
os.path.join(config.FTP_STAGING_PATH, "gcst_harmo_metadatalastupdated.csv"),
accession_id,
"not_harmonised",
)
logger.info(f"For non-hm {accession_id=} - latest update was at {latest_update}")
metadata_from_gwas_cat["date_metadata_last_modified"] = (
latest_update if latest_update else date.today()
)
metadata_from_gwas_cat["file_type"] = get_file_type_from_mongo(accession_id)

metadata_from_gwas_cat["is_harmonised"] = False
@@ -619,6 +656,22 @@ def generate_yaml_non_hm(accession_id, is_harmonised_included):
filenames_to_md5_values[metadata_filename] = compute_md5_local(out_file)
logger.info(f"For non-hm {accession_id=} - {filenames_to_md5_values=}")

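# Same check as for the harmonised yaml: bump date_metadata_last_modified to
# today and rewrite the yaml only when its new md5sum differs from every
# previously recorded one.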
md5sum_new_yaml = filenames_to_md5_values.get(metadata_filename)
logger.info(f"For non-hm {accession_id=} - new md5sum is {md5sum_new_yaml}")

md5sums = find_latest_yamlmd5sums(
os.path.join(config.FTP_STAGING_PATH, "gcst_harmo_yamlmd5sum.csv"),
accession_id,
"not_harmonised",
)
logger.info(f"For non-hm {accession_id=} - md5sums are {md5sums}")

# md5sums can be None when no previous records exist; treat that as changed
if not md5sums or md5sum_new_yaml not in md5sums:
logger.info(f"For non-hm {accession_id=} - Use today's date")
metadata_from_gwas_cat["date_metadata_last_modified"] = date.today()
metadata_client.update_metadata(metadata_from_gwas_cat)
metadata_client.to_file()

write_md5_for_files(filenames_to_md5_values, os.path.join(out_dir, "md5sum.txt"))

logger.info(
@@ -824,6 +877,46 @@ def get_md5_for_accession(
return {}


def get_md5_for_yaml(
md5_checksums: dict,
accession_id: str,
is_harmonised=False,
) -> dict:
"""
Return the key (filename) and value (MD5 checksum) from md5_checksums
if a key equals accession_id.tsv-meta.yaml or accession_id.tsv.gz-meta.yaml
(or the .h.tsv / .h.tsv.gz variants when is_harmonised is True). If there is
no exact match, fall back to the first key containing "yaml" or "yml".

Parameters:
- md5_checksums: Dictionary with filenames as keys and their MD5 checksums as
values.
- accession_id: The accession ID to look for, with .tsv or .tsv.gz extensions.
- is_harmonised: Whether to match the harmonised (.h.tsv) filenames.

Returns:
- A dictionary with the matching filename and its MD5 checksum. Empty if no
match is found.
"""
possible_keys = (
[f"{accession_id}.tsv-meta.yaml", f"{accession_id}.tsv.gz-meta.yaml"]
if not is_harmonised
else [f"{accession_id}.h.tsv-meta.yaml", f"{accession_id}.h.tsv.gz-meta.yaml"]
)

# Check for exact matches first
for key in possible_keys:
if key in md5_checksums:
return {key: md5_checksums[key]}

# Check for partial matches if no exact match is found
# i.e., files are named <GCST ID>_<build number>.*
# e.g. http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90308001-GCST90309000/GCST90308682/ # noqa:E501
for key in md5_checksums:
if "yaml" in key or "yml" in key:
return {key: md5_checksums[key]}

return {}

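# Illustration only (not part of this PR): how get_md5_for_yaml resolves the
# yaml entry, using an invented accession ID and made-up md5 values.
def _example_get_md5_for_yaml():
    checksums = {
        "GCST90000001.h.tsv.gz": "1a2b3c",
        "GCST90000001.h.tsv.gz-meta.yaml": "4d5e6f",
    }
    # Exact match on the harmonised yaml filename:
    assert get_md5_for_yaml(checksums, "GCST90000001", is_harmonised=True) == {
        "GCST90000001.h.tsv.gz-meta.yaml": "4d5e6f"
    }
    # No exact non-harmonised match, so the first key containing "yaml"/"yml"
    # is returned as a fallback:
    assert get_md5_for_yaml(checksums, "GCST90000001", is_harmonised=False) == {
        "GCST90000001.h.tsv.gz-meta.yaml": "4d5e6f"
    }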

def construct_get_payload_response(callback_id):
response = None
payload = pl.Payload(callback_id=callback_id)
@@ -884,3 +977,99 @@ def create_study_report(study):

def val_from_dict(key, dict, default=None):
return dict[key] if key in dict else default


def find_latest_metadata_update(
file_path: str, gcst_id: str, harmonised_status: str
) -> Optional[str]:
"""
Finds the most recent metadata last-updated date for a given GCST ID
and harmonised status.

Args:
file_path (str): Path to the CSV file.
gcst_id (str): The GCST identifier to search for.
harmonised_status (str): The harmonised status to filter by
(e.g., "harmonised" or "not_harmonised").

Returns:
Optional[str]: The latest update date in "YYYY-MM-DD" format if found,
else None.
"""
latest_date = None
date_format = "%Y-%m-%d"

try:
with open(file_path, mode="r", newline="", encoding="utf-8") as csvfile:
reader = csv.reader(csvfile)
for row_number, row in enumerate(reader, start=1):
if len(row) != 3:
print(f"Skipping malformed row {row_number}: {row}")
continue

current_gcst, current_status, date_str = [item.strip() for item in row]

if (
current_gcst == gcst_id
and current_status.lower() == harmonised_status.lower()
):
try:
current_date = datetime.strptime(date_str, date_format)
if latest_date is None or current_date > latest_date:
latest_date = current_date
except ValueError:
print(f"Invalid date format on row {row_number}: {date_str}")
continue

if latest_date:
return latest_date.strftime(date_format)
else:
print("No matching records found.")
return None

except FileNotFoundError:
print(f"File not found: {file_path}")
return None
except Exception as e:
print(f"An error occurred: {e}")
return None

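# Illustration only (not part of this PR): the three-column row layout that
# find_latest_metadata_update expects (gcst_id, harmonised_status, YYYY-MM-DD),
# exercised with invented GCST IDs and dates written to a temporary file.
def _example_find_latest_metadata_update():
    import tempfile

    rows = (
        "GCST90000001,harmonised,2023-05-17\n"
        "GCST90000001,harmonised,2024-01-09\n"
        "GCST90000002,not_harmonised,2022-11-02\n"
    )
    with tempfile.NamedTemporaryFile("w", suffix=".csv", delete=False) as tmp:
        tmp.write(rows)

    result = find_latest_metadata_update(tmp.name, "GCST90000001", "harmonised")
    assert result == "2024-01-09"  # the most recent matching date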

def find_latest_yamlmd5sums(
file_path: str,
gcst_id: str,
harmonised_status: str,
# search_file_name: str
) -> Optional[list]:
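"""
Collects all metadata yaml md5sums previously recorded in the CSV file at
file_path for the given GCST ID and harmonised status. Returns the list of
md5sums, or None if no matching records are found.
"""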
mdsums = []

try:
with open(file_path, mode="r", newline="", encoding="utf-8") as csvfile:
reader = csv.reader(csvfile)
for row_number, row in enumerate(reader, start=1):
if len(row) != 4:
print(f"Skipping malformed row {row_number}: {row}")
continue

current_gcst, current_status, file_name, md5sum = [
item.strip() for item in row
]

if (
current_gcst == gcst_id
and current_status.lower() == harmonised_status.lower()
):
mdsums.append(md5sum)

if mdsums:
return mdsums
else:
print("No matching records found.")
return None

except FileNotFoundError:
print(f"File not found: {file_path}")
return None
except Exception as e:
print(f"An error occurred: {e}")
return None