From e2226cd4fdf2203693d1bf3a04af1fdc3c82de5d Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 19 May 2023 14:47:26 -0400 Subject: [PATCH 01/18] add docker_usage_sum.py --- scripts/docker/docker_usage_sum.py | 216 +++++++++++++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 scripts/docker/docker_usage_sum.py diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py new file mode 100644 index 000000000..c335e0c02 --- /dev/null +++ b/scripts/docker/docker_usage_sum.py @@ -0,0 +1,216 @@ +import os +import re +import urllib.request +from urllib.error import HTTPError, URLError +import json + + +# A script to collect which dockers are in use and which latest dockers are available + +def main(): + dir = os.path.abspath(os.path.dirname(__file__)) + # os.chdir(dir) + + print("COLLECTING DOCKERS IN USE...") + # os.chdir("../../wdl") + wdls_dir = os.path.join(dir, "../../wdl") + if os.path.exists("dockers.in_use.tsv"): + os.remove("dockers.in_use.tsv") + + wdl_files = get_wdl_files(dir_to_wdls=wdls_dir) + global_docker_info = [] + + for wdl_path in wdl_files: + + wdl_name = wdl_path + + with open(wdl_path, "r") as file: + content = file.read() + pattern = re.compile(r'.*docker.*"') + if pattern.search(content): + matched_lines = [] + file.seek(0) + lines = file.readlines() + + for line_number, line in enumerate(lines, start=1): + if pattern.search(line): + matched_lines.append((line_number, line.strip())) + + docker_info: list[str] = get_docker_info_from_string( + wdl_lines=matched_lines, wdl_name=wdl_name + ) + + sorted_info: list = sorted(docker_info, reverse=False) + + global_docker_info.append(sorted_info) + + with open("dockers.in_use.tsv", "a") as tsv_file: + tsv_file.write(f"name\tused_tag\tlatest_tag\tline\twdl\n") + for line in sorted(global_docker_info): + tsv_file.write("\n".join(line) + "\n") + + print("DONE. 
PLEASE CHECKOUT TSV FILE: dockers.in_use.tsv") + # os.chdir(dir) + # os.rename("../../wdl/dockers.in_use.tsv", "dockers.in_use.tsv") + + +def get_wdl_files(dir_to_wdls: str) -> list: + """ + Returns a list of wdl files + @return: + """ + wdl_files = [] + for root, _, files in os.walk(dir_to_wdls): + for filename in files: + if filename.endswith(".wdl"): + wdl_path = os.path.join(root, filename) + wdl_files.append(wdl_path) + + return wdl_files + + +def get_docker_info_from_string(wdl_lines: [tuple], wdl_name: str) -> list: + """ + Returns a list of docker info + @param wdl_name: + @param wdl_lines: (line_number, line_content) + @return: + """ + docker_detail = [] + + for line_num, line_content in wdl_lines: + docker_names = re.findall(r'docker.*"(\S*?)"', line_content) + if docker_names: + docker_name = docker_names[0] + used_tag = os.path.basename(docker_name).split(":")[1] + docker_path = docker_name.split(":")[0] + latest_tag = get_latest_local_docker_tag(docker_path) + latest_tag = get_latest_remote_docker_tag( + docker_path) if latest_tag == "NA" else latest_tag + docker_detail.append( + f"{docker_path}\t{used_tag}\t{latest_tag}\t{line_num}\t{wdl_name}") + else: + pass + + return docker_detail + + +def get_latest_remote_docker_tag(docker_path: str) -> str: + """ + Returns the latest tag of a docker + @param docker_path: + @return: + """ + if "gcr" in docker_path: + latest_tag = get_latest_tag_from_gcr(docker_path) + elif "quay.io" in docker_path: + latest_tag = get_latest_tag_from_quayio(docker_path) + else: + latest_tag = get_latest_tag_from_duckerhub(docker_path) + return latest_tag + + +def get_latest_tag_from_duckerhub(docker_path: str) -> str: + image_name = docker_path + registry_url = f"https://registry.hub.docker.com/v2/repositories/{image_name}/tags/?page_size=1&ordering=last_updated" + try: + with urllib.request.urlopen(registry_url) as response: + data = response.read().decode("utf-8") + json_data = json.loads(data) + tags = json_data.get("results") 
+ if tags: + latest_tag = tags[0].get("name") + return latest_tag + else: + return "NA" + except urllib.error.HTTPError as e: + # print(f"Error: {e.code} - {e.reason}") + pass + except urllib.error.URLError as e: + # print(f"Error: Failed to reach the server - {e.reason}") + pass + + +def get_latest_tag_from_gcr(docker_path: str) -> str: + # Split the image string into project ID and image name + parts = docker_path.split("/") + gcr_repo = parts[0] + project_id = parts[1] + image_name = "/".join(parts[2:]) + # Construct the URL for retrieving tags + registry_url = f"https://{gcr_repo}/v2/{project_id}/{image_name}/tags/list" + + try: + # Send the GET request to the Container Registry API + with urllib.request.urlopen(registry_url) as response: + data = response.read().decode("utf-8") + json_data = json.loads(data) + tags = json_data.get("tags") + if tags: + latest_tag = max(tags) + return latest_tag + else: + return "NA" + except urllib.error.HTTPError as e: + # print(f"Error: {e.code} - {e.reason}") + pass + except urllib.error.URLError as e: + # print(f"Error: Failed to reach the server - {e.reason}") + pass + + +def get_latest_tag_from_quayio(docker_path: str) -> str: + # Split the image string into project ID and image name + parts = docker_path.split("/") + quayio_repo = parts[0] + project_id = parts[1] + image_name = "/".join(parts[2:]) + # Construct the URL for retrieving tags + registry_url = f"https://{quayio_repo}/v2/{project_id}/{image_name}/tags/list" + + try: + # Send the GET request to the Container Registry API + with urllib.request.urlopen(registry_url) as response: + data = response.read().decode("utf-8") + json_data = json.loads(data) + tags = json_data.get("tags") + if tags: + latest_tag = max(tags) + return latest_tag + else: + return "NA" + except urllib.error.HTTPError as e: + # print(f"Error: {e.code} - {e.reason}") + pass + except urllib.error.URLError as e: + # print(f"Error: Failed to reach the server - {e.reason}") + pass + + +def 
get_latest_local_docker_tag(docker_path: str) -> str: + """ + Returns the latest tag of a docker + @param docker_path: + @return: + """ + docker_name = os.path.basename(docker_path) + docker_dir = "../docker" + latest_tag = "NA" + + for docker_im_dir in os.listdir(docker_dir): + if docker_im_dir == docker_name: + docker_dir_path = os.path.join(docker_dir, docker_im_dir) + for makefile in os.listdir(docker_dir_path): + if not makefile.endswith("Makefile"): + continue + + with open(os.path.join(docker_dir_path, makefile)) as f: + for makefile_line in f: + if "VERSION =" in makefile_line: + latest_tag = makefile_line.split("=")[1].strip() + + return latest_tag + + +if __name__ == "__main__": + main() From b9165b7c071afad85dea49ac101e13d857e29ef1 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 19 May 2023 17:54:59 -0400 Subject: [PATCH 02/18] updates made to docker_usage_sum.py --- scripts/docker/docker_usage_sum.py | 151 +++++++++++++++++++++++------ 1 file changed, 124 insertions(+), 27 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index c335e0c02..e9ce2842b 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -1,26 +1,36 @@ import os import re +import subprocess import urllib.request from urllib.error import HTTPError, URLError import json # A script to collect which dockers are in use and which latest dockers are available +# Usage: python3 docker_usage_sum.py +# Output: dockers.in_use.tsv +# Note: This script is not perfect. It will not be able to detect dockers that are +# imported from other wdl files. It will only detect dockers that are +# explicitly defined in the wdl file. +# The script assumes it is executed from the scripts/docker directory, and the +# wdl files are in ../../wdl directory. 
def main(): - dir = os.path.abspath(os.path.dirname(__file__)) - # os.chdir(dir) + current_dir = os.path.abspath(os.path.dirname(__file__)) print("COLLECTING DOCKERS IN USE...") - # os.chdir("../../wdl") - wdls_dir = os.path.join(dir, "../../wdl") - if os.path.exists("dockers.in_use.tsv"): - os.remove("dockers.in_use.tsv") + wdls_dir = os.path.abspath(os.path.join(current_dir, "../../wdl")) + sum_tsv_file = os.path.join(current_dir, "dockers.in_use.tsv") + + if os.path.exists(sum_tsv_file): + os.remove(sum_tsv_file) wdl_files = get_wdl_files(dir_to_wdls=wdls_dir) global_docker_info = [] - for wdl_path in wdl_files: + total_files = len(wdl_files) # Used for Progression calculation + + for index, wdl_path in enumerate(wdl_files, start=1): wdl_name = wdl_path @@ -37,21 +47,26 @@ def main(): matched_lines.append((line_number, line.strip())) docker_info: list[str] = get_docker_info_from_string( - wdl_lines=matched_lines, wdl_name=wdl_name + wdl_lines=matched_lines, wdl_path=wdl_name ) sorted_info: list = sorted(docker_info, reverse=False) global_docker_info.append(sorted_info) - with open("dockers.in_use.tsv", "a") as tsv_file: - tsv_file.write(f"name\tused_tag\tlatest_tag\tline\twdl\n") + # Progression + # Calculate the percentage completion + progress = (index + 1) / total_files * 100 + + # Clear the previous line and print the progress + print(f"Progress: {progress:.2f}%\r", end="") + + with open(sum_tsv_file, "a") as tsv_file: + tsv_file.write(f"DOCKER_NAME\tUSED_TAG\tLATEST_TAG\tFILE_LINE\tWDL_PATH\n") for line in sorted(global_docker_info): tsv_file.write("\n".join(line) + "\n") - print("DONE. PLEASE CHECKOUT TSV FILE: dockers.in_use.tsv") - # os.chdir(dir) - # os.rename("../../wdl/dockers.in_use.tsv", "dockers.in_use.tsv") + print(f"DONE. 
PLEASE CHECKOUT TSV FILE: {sum_tsv_file}") def get_wdl_files(dir_to_wdls: str) -> list: @@ -69,15 +84,17 @@ def get_wdl_files(dir_to_wdls: str) -> list: return wdl_files -def get_docker_info_from_string(wdl_lines: [tuple], wdl_name: str) -> list: +def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: """ Returns a list of docker info - @param wdl_name: + @param wdl_path: @param wdl_lines: (line_number, line_content) @return: """ docker_detail = [] + wdl_path_sum = wdl_path[wdl_path.find("/wdl/"):] + for line_num, line_content in wdl_lines: docker_names = re.findall(r'docker.*"(\S*?)"', line_content) if docker_names: @@ -88,7 +105,7 @@ def get_docker_info_from_string(wdl_lines: [tuple], wdl_name: str) -> list: latest_tag = get_latest_remote_docker_tag( docker_path) if latest_tag == "NA" else latest_tag docker_detail.append( - f"{docker_path}\t{used_tag}\t{latest_tag}\t{line_num}\t{wdl_name}") + f"{docker_path}\t{used_tag}\t{latest_tag}\t{line_num}\t{wdl_path_sum}") else: pass @@ -101,16 +118,25 @@ def get_latest_remote_docker_tag(docker_path: str) -> str: @param docker_path: @return: """ - if "gcr" in docker_path: + if "gcr" in docker_path or "ghcr" in docker_path: latest_tag = get_latest_tag_from_gcr(docker_path) + if latest_tag == "NA" or latest_tag == "None": + latest_tag = get_gcr_tag_with_gcloud(docker_path) elif "quay.io" in docker_path: - latest_tag = get_latest_tag_from_quayio(docker_path) + latest_tag = get_latest_tag_from_quay(docker_path) else: - latest_tag = get_latest_tag_from_duckerhub(docker_path) + latest_tag = get_latest_tag_from_dockerhub(docker_path) return latest_tag -def get_latest_tag_from_duckerhub(docker_path: str) -> str: +def get_latest_tag_from_dockerhub(docker_path: str) -> str: + + """ + Returns the latest tag of a docker from dockerhub using the dockerhub API + @param docker_path: + @return: + """ + image_name = docker_path registry_url = 
f"https://registry.hub.docker.com/v2/repositories/{image_name}/tags/?page_size=1&ordering=last_updated" try: @@ -132,13 +158,20 @@ def get_latest_tag_from_duckerhub(docker_path: str) -> str: def get_latest_tag_from_gcr(docker_path: str) -> str: + + """ + Returns the latest tag of a docker from GCR using the Container Registry API + @param docker_path: + @return: + """ + # Split the image string into project ID and image name parts = docker_path.split("/") gcr_repo = parts[0] project_id = parts[1] image_name = "/".join(parts[2:]) # Construct the URL for retrieving tags - registry_url = f"https://{gcr_repo}/v2/{project_id}/{image_name}/tags/list" + registry_url = f"https://{gcr_repo}/v2/{project_id}/{image_name}/tags/list?page_size=1&ordering=last_updated" try: # Send the GET request to the Container Registry API @@ -146,9 +179,12 @@ def get_latest_tag_from_gcr(docker_path: str) -> str: data = response.read().decode("utf-8") json_data = json.loads(data) tags = json_data.get("tags") - if tags: - latest_tag = max(tags) - return latest_tag + + tags_str_removed = [item for item in tags if any(char.isdigit() for char in item)] + if tags_str_removed: + if tags_str_removed is not None: + latest_tag = max(tags_str_removed) + return latest_tag else: return "NA" except urllib.error.HTTPError as e: @@ -159,7 +195,66 @@ def get_latest_tag_from_gcr(docker_path: str) -> str: pass -def get_latest_tag_from_quayio(docker_path: str) -> str: +def get_gcr_tag_with_gcloud(docker_path: str) -> str or None: + + """ + Returns the latest tag of a docker using gcloud + @param docker_path: + @return: + """ + + # Split the image string into project ID and image name + + if is_gcloud_installed(): + + command = [ + "gcloud", + "container", + "images", + "list-tags", + docker_path, + "--format=get(tags)", + "--limit=1", + "--sort-by=~timestamp.datetime", + "--filter=tags:*", + ] + + process = subprocess.run(command, capture_output=True, text=True) + if process.returncode == 0: + output = 
process.stdout.strip() + if output: + latest_tag = output.splitlines()[0] + return latest_tag + + # Error handling + error_message = process.stderr.strip() if process.stderr else process.stdout.strip() + #print(f"Error: {error_message}") + return None + else: + return None + + +def is_gcloud_installed() -> bool: + """ + Checks if gcloud is installed + @return: + """ + + command = ["gcloud", "--version"] + + try: + subprocess.run(command, check=True, capture_output=True) + return True + except subprocess.CalledProcessError: + return False + + +def get_latest_tag_from_quay(docker_path: str) -> str: + """ + Returns the latest tag of a docker from quay.io + @param docker_path: + @return: + """ # Split the image string into project ID and image name parts = docker_path.split("/") quayio_repo = parts[0] @@ -174,8 +269,10 @@ def get_latest_tag_from_quayio(docker_path: str) -> str: data = response.read().decode("utf-8") json_data = json.loads(data) tags = json_data.get("tags") - if tags: - latest_tag = max(tags) + + tags_str_removed = [item for item in tags if any(char.isdigit() for char in item)] + if tags_str_removed: + latest_tag = max(tags_str_removed) return latest_tag else: return "NA" From 387fe74d6a7da1de9356841d7728aa729128670a Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 19 May 2023 17:56:09 -0400 Subject: [PATCH 03/18] moved collect_docker_in_system.sh to scripts/docker folder --- scripts/{ => docker}/collect_docker_in_system.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/{ => docker}/collect_docker_in_system.sh (100%) diff --git a/scripts/collect_docker_in_system.sh b/scripts/docker/collect_docker_in_system.sh similarity index 100% rename from scripts/collect_docker_in_system.sh rename to scripts/docker/collect_docker_in_system.sh From a9e24fcfd02c17e7e6406a8965a98a30679d4750 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 19 May 2023 18:04:47 -0400 Subject: [PATCH 04/18] adding notes to docker_usage_sum.py --- 
scripts/docker/docker_usage_sum.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index e9ce2842b..edc7c1272 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -9,11 +9,15 @@ # A script to collect which dockers are in use and which latest dockers are available # Usage: python3 docker_usage_sum.py # Output: dockers.in_use.tsv -# Note: This script is not perfect. It will not be able to detect dockers that are +# Note: - This script is not perfect. It will not be able to detect dockers that are # imported from other wdl files. It will only detect dockers that are # explicitly defined in the wdl file. -# The script assumes it is executed from the scripts/docker directory, and the +# - The script assumes it is executed from the scripts/docker directory, and the # wdl files are in ../../wdl directory. +# - The script will retrieve the "latest" tag by date, so if an unofficial tag was +# created, after the official tag was created, the script will retrieve the +# unofficial tag as the latest tag. (It tries to avoid this by filtering out +# tags with no digits). 
def main(): current_dir = os.path.abspath(os.path.dirname(__file__)) From d027cf0cd5d71273f6e0a991c2a187750fd22876 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 22 Aug 2023 11:09:45 -0400 Subject: [PATCH 05/18] Refactoring, switched tsv columns, logic to look for latest tag for gcr --- scripts/docker/docker_usage_sum.py | 130 ++++++++++++++++++----------- 1 file changed, 79 insertions(+), 51 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index edc7c1272..073b0eec0 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -4,6 +4,9 @@ import urllib.request from urllib.error import HTTPError, URLError import json +import logging + +logging.basicConfig(level=logging.INFO) # A script to collect which dockers are in use and which latest dockers are available @@ -19,17 +22,19 @@ # unofficial tag as the latest tag. (It tries to avoid this by filtering out # tags with no digits). +# TODO: Future suggestion: have the results be generated for main branch for each merge + def main(): current_dir = os.path.abspath(os.path.dirname(__file__)) print("COLLECTING DOCKERS IN USE...") - wdls_dir = os.path.abspath(os.path.join(current_dir, "../../wdl")) - sum_tsv_file = os.path.join(current_dir, "dockers.in_use.tsv") + WDLS_DIR = os.path.abspath(os.path.join(current_dir, "../../wdl")) + OUT_SUMMARY_TSV = os.path.join(current_dir, "dockers.in_use.tsv") - if os.path.exists(sum_tsv_file): - os.remove(sum_tsv_file) + if os.path.exists(OUT_SUMMARY_TSV): + os.remove(OUT_SUMMARY_TSV) - wdl_files = get_wdl_files(dir_to_wdls=wdls_dir) + wdl_files = get_wdl_files(dir_to_wdls=WDLS_DIR) global_docker_info = [] total_files = len(wdl_files) # Used for Progression calculation @@ -38,13 +43,13 @@ def main(): wdl_name = wdl_path - with open(wdl_path, "r") as file: - content = file.read() - pattern = re.compile(r'.*docker.*"') - if pattern.search(content): + with open(wdl_path, "r") as file_content: + content = 
file_content.read() + pattern = re.compile(r'.*docker:.*"') + if pattern.search(content): # If wdl file contains "docker:" matched_lines = [] - file.seek(0) - lines = file.readlines() + file_content.seek(0) + lines = file_content.readlines() for line_number, line in enumerate(lines, start=1): if pattern.search(line): @@ -57,20 +62,23 @@ def main(): sorted_info: list = sorted(docker_info, reverse=False) global_docker_info.append(sorted_info) + else: + pass # Progression # Calculate the percentage completion - progress = (index + 1) / total_files * 100 + progress: float = (index + 1) / total_files * 100 # Clear the previous line and print the progress print(f"Progress: {progress:.2f}%\r", end="") + with open(OUT_SUMMARY_TSV, "a") as tsv_file: + # Add header + tsv_file.write(f"DOCKER_NAME\tLATEST_TAG\tUSED_TAG\tFILE_LINE\tWDL_PATH") + # Add content + for docker_info_line in sorted(global_docker_info): + tsv_file.write("\n".join(docker_info_line) + "\n") - with open(sum_tsv_file, "a") as tsv_file: - tsv_file.write(f"DOCKER_NAME\tUSED_TAG\tLATEST_TAG\tFILE_LINE\tWDL_PATH\n") - for line in sorted(global_docker_info): - tsv_file.write("\n".join(line) + "\n") - - print(f"DONE. PLEASE CHECKOUT TSV FILE: {sum_tsv_file}") + print(f"DONE. PLEASE CHECKOUT TSV FILE: {OUT_SUMMARY_TSV}") def get_wdl_files(dir_to_wdls: str) -> list: @@ -91,25 +99,27 @@ def get_wdl_files(dir_to_wdls: str) -> list: def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: """ Returns a list of docker info - @param wdl_path: + @param wdl_path: path to wdl file @param wdl_lines: (line_number, line_content) - @return: + @return: list of docker info e.g. 
[" docker_name\tlatest_tag\tused_tag\tline_num\twdl_path, ..."] """ docker_detail = [] + # Get the path after /wdl/ for better readability wdl_path_sum = wdl_path[wdl_path.find("/wdl/"):] for line_num, line_content in wdl_lines: - docker_names = re.findall(r'docker.*"(\S*?)"', line_content) + docker_names = re.findall(r'docker:\s*"(\S*?)"', line_content) # if docker_names: - docker_name = docker_names[0] - used_tag = os.path.basename(docker_name).split(":")[1] - docker_path = docker_name.split(":")[0] - latest_tag = get_latest_local_docker_tag(docker_path) + docker_name_and_version = docker_names[0] + used_tag = os.path.basename(docker_name_and_version).split(":")[1] + docker_name = docker_name_and_version.split(":")[0] + latest_tag = get_latest_local_docker_tag(docker_name) + # If the latest tag is not found locally, try to get it from remote latest_tag = get_latest_remote_docker_tag( - docker_path) if latest_tag == "NA" else latest_tag + docker_name) if latest_tag == "NA" else latest_tag docker_detail.append( - f"{docker_path}\t{used_tag}\t{latest_tag}\t{line_num}\t{wdl_path_sum}") + f"{docker_name}\t{latest_tag}\t{used_tag}\t{line_num}\t{wdl_path_sum}") else: pass @@ -118,13 +128,13 @@ def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: def get_latest_remote_docker_tag(docker_path: str) -> str: """ - Returns the latest tag of a docker + Returns the latest tag of a docker from gcr, quay or dockerhub @param docker_path: @return: """ if "gcr" in docker_path or "ghcr" in docker_path: latest_tag = get_latest_tag_from_gcr(docker_path) - if latest_tag == "NA" or latest_tag == "None": + if latest_tag == "NA" or latest_tag == "None" or latest_tag == "latest" or latest_tag is None: latest_tag = get_gcr_tag_with_gcloud(docker_path) elif "quay.io" in docker_path: latest_tag = get_latest_tag_from_quay(docker_path) @@ -170,12 +180,15 @@ def get_latest_tag_from_gcr(docker_path: str) -> str: """ # Split the image string into project ID and image name + # 
us.gcr.io/broad-dsp-lrma/lr-transcript_utils:latest parts = docker_path.split("/") - gcr_repo = parts[0] - project_id = parts[1] - image_name = "/".join(parts[2:]) + gcr_repo = parts[0] # Example: us.gcr.io + project_id = parts[1] # Example: broad-dsp-lrma + image_name = "/".join(parts[2:]) # Example: lr-transcript_utils + # Construct the URL for retrieving tags - registry_url = f"https://{gcr_repo}/v2/{project_id}/{image_name}/tags/list?page_size=1&ordering=last_updated" + # https://cloud.google.com/artifact-registry/docs/reference/docker-api + registry_url = f"https://{gcr_repo}/v2/{project_id}/{image_name}/tags/list" try: # Send the GET request to the Container Registry API @@ -183,7 +196,19 @@ def get_latest_tag_from_gcr(docker_path: str) -> str: data = response.read().decode("utf-8") json_data = json.loads(data) tags = json_data.get("tags") - + manifest = json_data.get("manifest") + + # The manifest is a list of dicts for each version of an image, each dict + # has a key called "tag", which is a list of tags for that version. + # The image version having "latest" as part of its tag is what we want. + for sha_key in manifest: + sha_key_tags = manifest[sha_key].get("tag") + if "latest" in sha_key_tags: + latest_tag = sha_key_tags[0] + return latest_tag if latest_tag else "NA" + + # If the image doesn't have a "latest" tag, return the tag with the + # highest version number. 
tags_str_removed = [item for item in tags if any(char.isdigit() for char in item)] if tags_str_removed: if tags_str_removed is not None: @@ -217,25 +242,28 @@ def get_gcr_tag_with_gcloud(docker_path: str) -> str or None: "images", "list-tags", docker_path, - "--format=get(tags)", + "--format=json", "--limit=1", "--sort-by=~timestamp.datetime", - "--filter=tags:*", + "--filter=tags:latest", ] - process = subprocess.run(command, capture_output=True, text=True) - if process.returncode == 0: - output = process.stdout.strip() - if output: - latest_tag = output.splitlines()[0] - return latest_tag + gc_container_results = subprocess.run(command, capture_output=True, text=True) + if gc_container_results and gc_container_results.returncode == 0: + gc_container_results_json = json.loads(gc_container_results.stdout) + try : + latest_tag = gc_container_results_json[0].get("tags")[0] + return latest_tag if latest_tag is not None else "NA" + except IndexError: - # Error handling - error_message = process.stderr.strip() if process.stderr else process.stdout.strip() - #print(f"Error: {error_message}") - return None + logging.warning(f"Gcloud Container obtain empty tag for : {gc_container_results_json} - {docker_path}") + return "NA" + else: + # Error handling + error_message = gc_container_results.stderr.strip() if gc_container_results.stderr else gc_container_results.stdout.strip() + #print(f"Error: {error_message}") else: - return None + return "NA" def is_gcloud_installed() -> bool: @@ -288,13 +316,13 @@ def get_latest_tag_from_quay(docker_path: str) -> str: pass -def get_latest_local_docker_tag(docker_path: str) -> str: +def get_latest_local_docker_tag(docker_name: str) -> str: """ - Returns the latest tag of a docker - @param docker_path: + Returns the latest tag of a docker from the local docker directory + @param docker_name: name of the docker e.g. 
"gatk" @return: """ - docker_name = os.path.basename(docker_path) + docker_name = os.path.basename(docker_name) docker_dir = "../docker" latest_tag = "NA" From 4995a573c8af64633245e885f3d8edbc5738b76c Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 22 Aug 2023 15:55:32 -0400 Subject: [PATCH 06/18] Refactoring, divide up some functions --- scripts/docker/docker_usage_sum.py | 104 +++++++++++++++++------------ 1 file changed, 62 insertions(+), 42 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index 073b0eec0..21a9b4038 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -109,23 +109,39 @@ def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: wdl_path_sum = wdl_path[wdl_path.find("/wdl/"):] for line_num, line_content in wdl_lines: - docker_names = re.findall(r'docker:\s*"(\S*?)"', line_content) # - if docker_names: - docker_name_and_version = docker_names[0] + docker_image = re.findall(r'docker:\s*"(\S*?)"', line_content) + if docker_image: + docker_name_and_version = docker_image[0] used_tag = os.path.basename(docker_name_and_version).split(":")[1] docker_name = docker_name_and_version.split(":")[0] - latest_tag = get_latest_local_docker_tag(docker_name) + + # Get latest tag from list of docker details if it was already retrieved + latest_tag = get_tag_from_docker_details(docker_detail=docker_detail, docker_name=docker_name) + # Get latest tag from local docker if it was not retrieved from list of docker details + latest_tag = get_latest_local_docker_tag(docker_name) if latest_tag == "NA" else latest_tag # If the latest tag is not found locally, try to get it from remote - latest_tag = get_latest_remote_docker_tag( - docker_name) if latest_tag == "NA" else latest_tag - docker_detail.append( - f"{docker_name}\t{latest_tag}\t{used_tag}\t{line_num}\t{wdl_path_sum}") + latest_tag = get_latest_remote_docker_tag(docker_name) if latest_tag == "NA" else 
latest_tag + docker_detail.append(f"{docker_name}\t{latest_tag}\t{used_tag}\t{line_num}\t{wdl_path_sum}") else: pass return docker_detail +def get_tag_from_docker_details(docker_detail: list, docker_name: str) -> str: + """ + Returns the latest tag of a docker from a list of docker details + @param docker_detail: list of docker details e.g. ["docker_name\tlatest_tag\tused_tag\tline_num\twdl_path, ..."] + @param docker_name: docker name + @return: latest tag + """ + latest_tag = "NA" + for docker_info in docker_detail: + if docker_name in docker_info: + latest_tag = docker_info.split("\t")[1] + break + return latest_tag + def get_latest_remote_docker_tag(docker_path: str) -> str: """ Returns the latest tag of a docker from gcr, quay or dockerhub @@ -134,7 +150,7 @@ def get_latest_remote_docker_tag(docker_path: str) -> str: """ if "gcr" in docker_path or "ghcr" in docker_path: latest_tag = get_latest_tag_from_gcr(docker_path) - if latest_tag == "NA" or latest_tag == "None" or latest_tag == "latest" or latest_tag is None: + if latest_tag in ["NA", "None", "latest"] or latest_tag is None: latest_tag = get_gcr_tag_with_gcloud(docker_path) elif "quay.io" in docker_path: latest_tag = get_latest_tag_from_quay(docker_path) @@ -193,29 +209,7 @@ def get_latest_tag_from_gcr(docker_path: str) -> str: try: # Send the GET request to the Container Registry API with urllib.request.urlopen(registry_url) as response: - data = response.read().decode("utf-8") - json_data = json.loads(data) - tags = json_data.get("tags") - manifest = json_data.get("manifest") - - # The manifest is a list of dicts for each version of an image, each dict - # has a key called "tag", which is a list of tags for that version. - # The image version having "latest" as part of its tag is what we want. 
- for sha_key in manifest: - sha_key_tags = manifest[sha_key].get("tag") - if "latest" in sha_key_tags: - latest_tag = sha_key_tags[0] - return latest_tag if latest_tag else "NA" - - # If the image doesn't have a "latest" tag, return the tag with the - # highest version number. - tags_str_removed = [item for item in tags if any(char.isdigit() for char in item)] - if tags_str_removed: - if tags_str_removed is not None: - latest_tag = max(tags_str_removed) - return latest_tag - else: - return "NA" + return extract_latest_tag_from_registry_response(response) except urllib.error.HTTPError as e: # print(f"Error: {e.code} - {e.reason}") pass @@ -223,6 +217,41 @@ def get_latest_tag_from_gcr(docker_path: str) -> str: # print(f"Error: Failed to reach the server - {e.reason}") pass +def extract_latest_tag_from_registry_response(response) -> str: + """ + Extracts the latest tag from the response of a registry API call + @param response: + @return: + """ + + response_content = response.read().decode("utf-8") + latest_tag = "NA" + + if response_content: + json_data = json.loads(response_content) + tags = json_data.get("tags") + manifest = json_data.get("manifest") + + # The manifest is a dict of dicts for each version of an image, each dict + # has a key called "tag", which is a list of tags for that version. + # The image version having "latest" as part of its tag is returned. + if manifest: + for sha_key in manifest: + sha_key_tags = manifest[sha_key].get("tag") + if "latest" in sha_key_tags: + latest_tag = sha_key_tags[0] + return latest_tag + + # If the image doesn't have a "latest" tag, return the tag with the + # highest version number. 
+ tags_str_removed = [item for item in tags if any(char.isdigit() for char in item)] + if tags_str_removed: + latest_tag = max(tags_str_removed) + return latest_tag + else: + return latest_tag # If no numerical version tags are found, return NA + else: + return latest_tag # If manifest is empty, return NA def get_gcr_tag_with_gcloud(docker_path: str) -> str or None: @@ -298,16 +327,7 @@ def get_latest_tag_from_quay(docker_path: str) -> str: try: # Send the GET request to the Container Registry API with urllib.request.urlopen(registry_url) as response: - data = response.read().decode("utf-8") - json_data = json.loads(data) - tags = json_data.get("tags") - - tags_str_removed = [item for item in tags if any(char.isdigit() for char in item)] - if tags_str_removed: - latest_tag = max(tags_str_removed) - return latest_tag - else: - return "NA" + return extract_latest_tag_from_registry_response(response) except urllib.error.HTTPError as e: # print(f"Error: {e.code} - {e.reason}") pass From e0d745b8ba94bde7bcdc7cde366d77801d453b09 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 23 Aug 2023 09:46:39 -0400 Subject: [PATCH 07/18] Note about having gcloud installed --- scripts/docker/docker_usage_sum.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index 21a9b4038..24088cc20 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -21,6 +21,8 @@ # created, after the official tag was created, the script will retrieve the # unofficial tag as the latest tag. (It tries to avoid this by filtering out # tags with no digits). +# - The script occasionally uses gcloud to retrieve the latest tag. Its suggested +# to have gcloud installed. 
# TODO: Future suggestion: have the results be generated for main branch for each merge From b4eb61c78210a840842d80e0fe03990e83d45e07 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 25 Oct 2023 15:27:44 -0400 Subject: [PATCH 08/18] rm collect_docker_in_system.sh --- scripts/docker/collect_docker_in_system.sh | 40 ---------------------- 1 file changed, 40 deletions(-) delete mode 100644 scripts/docker/collect_docker_in_system.sh diff --git a/scripts/docker/collect_docker_in_system.sh b/scripts/docker/collect_docker_in_system.sh deleted file mode 100644 index c390eb5eb..000000000 --- a/scripts/docker/collect_docker_in_system.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -set -eu - -################################################################################ -# A script to collect which dockers are in use and which latest dockers available -################################################################################ - -dir=$(cd -P -- "$(dirname -- "$0")" && pwd -P) -cd "${dir}" - - -echo "COLLECTING DOCKERS IN USE..." -cd ../wdl -rm -f dockers.in_use.tsv -for wdl in $(find . -name "*.wdl"| sed "s|^\./||") ; do - if ! grep -qE 'docker:\s+\"' "${wdl}"; then continue; fi; - grep -nE 'docker:\s+\"' "${wdl}" > tmp.0.txt - awk -F ':' '{print $3"\t"$4"\t"$1}' tmp.0.txt | sed -e 's/^[[:space:]]*//' | sed "s/\"//g" | awk -F '/' '{print $NF}' | sort > tmp.1.txt - sed -e "s%$%\t$wdl%" tmp.1.txt >> dockers.in_use.tsv - rm tmp.*.txt -done -echo -e "name\ttag\tline\twdl\n$(sort dockers.in_use.tsv)" > dockers.in_use.sorted.tsv -rm dockers.in_use.tsv - -echo "COLLECTING LATEST DOCKERS AVAILABLE..." -cd ../docker -rm -f dockers.latest.tsv -for makefile in $(find . 
-mindepth 2 -name "Makefile" | sed "s|^\./||") ; do - name=$(grep -m 1 -F 'TAG1' "${makefile}" | awk -F '/' '{print $NF}' | awk -F ':' '{print $1}') - tag=$(head -n 1 "${makefile}" | awk -F '=' '{print $NF}' | sed 's% %%g' | awk -F '#' '{print $1}') - echo -e "${name}\t${tag}" >> dockers.latest.tsv -done -sort dockers.latest.tsv > dockers.latest.sorted.tsv -rm dockers.latest.tsv - -echo "DONE. PLEASE CHECKOUT TWO TSV FILES: [dockers.in_use.sorted.tsv, dockers.latest.sorted.tsv]" -cd "${dir}" -mv ../wdl/dockers.in_use.sorted.tsv . -mv ../docker/dockers.latest.sorted.tsv . From 0a9bdac4bae8971347b9838e5354dcd1afbc8465 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 25 Oct 2023 16:02:49 -0400 Subject: [PATCH 09/18] Added help message --- scripts/docker/docker_usage_sum.py | 73 ++++++++++++++++++------------ 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index 24088cc20..b65b349a3 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -1,3 +1,4 @@ +import argparse import os import re import subprocess @@ -9,24 +10,32 @@ logging.basicConfig(level=logging.INFO) -# A script to collect which dockers are in use and which latest dockers are available -# Usage: python3 docker_usage_sum.py -# Output: dockers.in_use.tsv -# Note: - This script is not perfect. It will not be able to detect dockers that are -# imported from other wdl files. It will only detect dockers that are -# explicitly defined in the wdl file. -# - The script assumes it is executed from the scripts/docker directory, and the -# wdl files are in ../../wdl directory. -# - The script will retrieve the "latest" tag by date, so if an unofficial tag was -# created, after the official tag was created, the script will retrieve the -# unofficial tag as the latest tag. (It tries to avoid this by filtering out -# tags with no digits). 
-# - The script occasionally uses gcloud to retrieve the latest tag. Its suggested
-# to have gcloud installed.
-
 # TODO: Future suggestion: have the results be generated for main branch for each merge
 
 def main():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description='''
+Collects docker usage summary from wdl files.
+
+Output:
+    dockers.in_use.tsv
+Notes:
+    - This script is not perfect. It will not be able to detect dockers that are
+      imported from other wdl files. It will only detect dockers that are
+      explicitly defined in the wdl file.
+    - The script assumes it is executed from the scripts/docker directory, and the
+      wdl files are in ../../wdl directory.
+    - The script will retrieve the "latest" tag by date, so if an unofficial tag was
+      created, after the official tag was created, the script will retrieve the
+      unofficial tag as the latest tag. (It tries to avoid this by filtering out
+      tags with no digits).
+    - The script occasionally uses gcloud to retrieve the latest tag. It's suggested
+      to have gcloud installed.
+ ''', + ) + parser.parse_args() + current_dir = os.path.abspath(os.path.dirname(__file__)) print("COLLECTING DOCKERS IN USE...") @@ -48,7 +57,7 @@ def main(): with open(wdl_path, "r") as file_content: content = file_content.read() pattern = re.compile(r'.*docker:.*"') - if pattern.search(content): # If wdl file contains "docker:" + if pattern.search(content): # If wdl file contains "docker:" matched_lines = [] file_content.seek(0) lines = file_content.readlines() @@ -118,12 +127,16 @@ def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: docker_name = docker_name_and_version.split(":")[0] # Get latest tag from list of docker details if it was already retrieved - latest_tag = get_tag_from_docker_details(docker_detail=docker_detail, docker_name=docker_name) + latest_tag = get_tag_from_docker_details(docker_detail=docker_detail, + docker_name=docker_name) # Get latest tag from local docker if it was not retrieved from list of docker details - latest_tag = get_latest_local_docker_tag(docker_name) if latest_tag == "NA" else latest_tag + latest_tag = get_latest_local_docker_tag( + docker_name) if latest_tag == "NA" else latest_tag # If the latest tag is not found locally, try to get it from remote - latest_tag = get_latest_remote_docker_tag(docker_name) if latest_tag == "NA" else latest_tag - docker_detail.append(f"{docker_name}\t{latest_tag}\t{used_tag}\t{line_num}\t{wdl_path_sum}") + latest_tag = get_latest_remote_docker_tag( + docker_name) if latest_tag == "NA" else latest_tag + docker_detail.append( + f"{docker_name}\t{latest_tag}\t{used_tag}\t{line_num}\t{wdl_path_sum}") else: pass @@ -144,6 +157,7 @@ def get_tag_from_docker_details(docker_detail: list, docker_name: str) -> str: break return latest_tag + def get_latest_remote_docker_tag(docker_path: str) -> str: """ Returns the latest tag of a docker from gcr, quay or dockerhub @@ -162,7 +176,6 @@ def get_latest_remote_docker_tag(docker_path: str) -> str: def 
get_latest_tag_from_dockerhub(docker_path: str) -> str: - """ Returns the latest tag of a docker from dockerhub using the dockerhub API @param docker_path: @@ -190,7 +203,6 @@ def get_latest_tag_from_dockerhub(docker_path: str) -> str: def get_latest_tag_from_gcr(docker_path: str) -> str: - """ Returns the latest tag of a docker from GCR using the Container Registry API @param docker_path: @@ -219,6 +231,7 @@ def get_latest_tag_from_gcr(docker_path: str) -> str: # print(f"Error: Failed to reach the server - {e.reason}") pass + def extract_latest_tag_from_registry_response(response) -> str: """ Extracts the latest tag from the response of a registry API call @@ -246,17 +259,18 @@ def extract_latest_tag_from_registry_response(response) -> str: # If the image doesn't have a "latest" tag, return the tag with the # highest version number. - tags_str_removed = [item for item in tags if any(char.isdigit() for char in item)] + tags_str_removed = [item for item in tags if + any(char.isdigit() for char in item)] if tags_str_removed: latest_tag = max(tags_str_removed) return latest_tag else: - return latest_tag # If no numerical version tags are found, return NA + return latest_tag # If no numerical version tags are found, return NA else: - return latest_tag # If manifest is empty, return NA + return latest_tag # If manifest is empty, return NA -def get_gcr_tag_with_gcloud(docker_path: str) -> str or None: +def get_gcr_tag_with_gcloud(docker_path: str) -> str or None: """ Returns the latest tag of a docker using gcloud @param docker_path: @@ -282,17 +296,18 @@ def get_gcr_tag_with_gcloud(docker_path: str) -> str or None: gc_container_results = subprocess.run(command, capture_output=True, text=True) if gc_container_results and gc_container_results.returncode == 0: gc_container_results_json = json.loads(gc_container_results.stdout) - try : + try: latest_tag = gc_container_results_json[0].get("tags")[0] return latest_tag if latest_tag is not None else "NA" except IndexError: - 
logging.warning(f"Gcloud Container obtain empty tag for : {gc_container_results_json} - {docker_path}") + logging.warning( + f"Gcloud Container obtain empty tag for : {gc_container_results_json} - {docker_path}") return "NA" else: # Error handling error_message = gc_container_results.stderr.strip() if gc_container_results.stderr else gc_container_results.stdout.strip() - #print(f"Error: {error_message}") + # print(f"Error: {error_message}") else: return "NA" From 76a86eae4bf62c4052381a10430ee42c9e53180b Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 25 Oct 2023 16:09:25 -0400 Subject: [PATCH 10/18] make backup of old tsv instead of deleting it --- scripts/docker/docker_usage_sum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index b65b349a3..6266be2b0 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -43,7 +43,7 @@ def main(): OUT_SUMMARY_TSV = os.path.join(current_dir, "dockers.in_use.tsv") if os.path.exists(OUT_SUMMARY_TSV): - os.remove(OUT_SUMMARY_TSV) + os.rename(OUT_SUMMARY_TSV, OUT_SUMMARY_TSV + ".bak") wdl_files = get_wdl_files(dir_to_wdls=WDLS_DIR) global_docker_info = [] From c94e6801cc45ac684565d5e66aa238608b974611 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 25 Oct 2023 16:45:58 -0400 Subject: [PATCH 11/18] updated regex pattern --- scripts/docker/docker_usage_sum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index 6266be2b0..4f380ff78 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -56,7 +56,7 @@ def main(): with open(wdl_path, "r") as file_content: content = file_content.read() - pattern = re.compile(r'.*docker:.*"') + pattern = re.compile(r'\s*docker:\s*"') if pattern.search(content): # If wdl file contains "docker:" matched_lines = [] file_content.seek(0) From 
17043ef94b15110d861d90a3971b9c36bff065f2 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 25 Oct 2023 17:03:24 -0400 Subject: [PATCH 12/18] changed way of writing to tsv file --- scripts/docker/docker_usage_sum.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index 4f380ff78..6f3022167 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -82,12 +82,13 @@ def main(): # Clear the previous line and print the progress print(f"Progress: {progress:.2f}%\r", end="") - with open(OUT_SUMMARY_TSV, "a") as tsv_file: + with open(OUT_SUMMARY_TSV, "w") as tsv_file: # Add header - tsv_file.write(f"DOCKER_NAME\tLATEST_TAG\tUSED_TAG\tFILE_LINE\tWDL_PATH") + tsv_file.write("DOCKER_NAME\tLATEST_TAG\tUSED_TAG\tFILE_LINE\tWDL_PATH") # Add content for docker_info_line in sorted(global_docker_info): - tsv_file.write("\n".join(docker_info_line) + "\n") + delimiter = "\n" + tsv_file.write(delimiter.join(docker_info_line) + "\n") print(f"DONE. 
PLEASE CHECKOUT TSV FILE: {OUT_SUMMARY_TSV}") @@ -137,8 +138,6 @@ def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: docker_name) if latest_tag == "NA" else latest_tag docker_detail.append( f"{docker_name}\t{latest_tag}\t{used_tag}\t{line_num}\t{wdl_path_sum}") - else: - pass return docker_detail From 77dc9ce8084f42e7fafde82cafccf668fe7b712d Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 26 Oct 2023 14:11:04 -0400 Subject: [PATCH 13/18] Added py script to create markdown file from docker usage tsv --- scripts/docker/docker_usage_sum.py | 2 +- scripts/git_page/docker_usage_md.py | 87 +++++++++++++++++++++++++++++ scripts/git_page/utility.py | 34 +++++++++++ 3 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 scripts/git_page/docker_usage_md.py diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index 6f3022167..e44b4db3a 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -40,7 +40,7 @@ def main(): print("COLLECTING DOCKERS IN USE...") WDLS_DIR = os.path.abspath(os.path.join(current_dir, "../../wdl")) - OUT_SUMMARY_TSV = os.path.join(current_dir, "dockers.in_use.tsv") + OUT_SUMMARY_TSV = os.path.join(current_dir, "dockers_in_use.tsv") if os.path.exists(OUT_SUMMARY_TSV): os.rename(OUT_SUMMARY_TSV, OUT_SUMMARY_TSV + ".bak") diff --git a/scripts/git_page/docker_usage_md.py b/scripts/git_page/docker_usage_md.py new file mode 100644 index 000000000..e455df5a0 --- /dev/null +++ b/scripts/git_page/docker_usage_md.py @@ -0,0 +1,87 @@ +import argparse +import os +import logging +import utility as util + +# Get the current working directory +cwd = os.getcwd() + +# Get the path to the scripts directory +scripts_dir = os.path.join(cwd, "scripts") + +# Get the path to the git_page directory +git_page_dir = os.path.join(scripts_dir, "git_page") + +Logger = logging.getLogger(__name__) +logging.basicConfig() + + +def main(): + # read command-line arguments + parser 
= argparse.ArgumentParser( + description="Generate docker_usage.md file by running the docker_usage_sum.py and using the output to generate the markdown file." + ) + parser.add_argument("--output_path", help="Path to the out markdown file") + parser.add_argument("--debug", action="store_true", help="verbose logging") + + args = parser.parse_args() + util.set_logging_level(args) + + current_dir = os.path.abspath(os.path.dirname(__file__)) + parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir)) + docker_script_dir = os.path.join(parent_dir, "docker") + OUT_FILE_BASENAME = "Dockers_In_Use" + OUT_SUMMARY_TSV = os.path.join(docker_script_dir, OUT_FILE_BASENAME + ".tsv") + + if args.output_path: + markdown_file = os.path.join(args.output_path, OUT_FILE_BASENAME + ".md") + else: + markdown_file = os.path.join(current_dir, OUT_FILE_BASENAME + ".md") + + resolved_markdown_file = os.path.abspath(markdown_file) + + Logger.debug(f"markdown_file: {resolved_markdown_file}") + + # Run docker_usage_sum.py + docker_usage_sum_cmd = ["python3", + os.path.join(docker_script_dir, "docker_usage_sum.py")] + util.run_command(docker_usage_sum_cmd) + + markdown_table = tsv_to_markdown(tsv_file=OUT_SUMMARY_TSV) + write_docker_usage_to_markdown( + resolved_markdown_file=resolved_markdown_file, markdown_table=markdown_table + ) + + +def tsv_to_markdown(tsv_file: str) -> str: + """ + Converts a tsv file to markdown table format + @param tsv_file: + @return: + """ + with open(tsv_file, "r") as f: + lines = f.readlines() + + markdown_table = "|".join(lines[0].split("\t")) + markdown_table += "|".join(["---"] * len(lines[0].split("\t"))) + markdown_table += "\n" + for line in lines[1:]: + markdown_table += "|".join(line.split("\t")) + + return markdown_table + + +def write_docker_usage_to_markdown(resolved_markdown_file, markdown_table): + with open(resolved_markdown_file, "w") as md_file: + md_file.write("# Docker Usage\n\n") + md_file.write("The following table lists the docker 
images used in the " + "workflows in this repository.\n\n") + md_file.write("The table is generated by running the [docker_usage_sum.py](" + "../docker/docker_usage_sum.py) script.\n\n") + md_file.write("The script is run by the [generate_docker_usage_md.py](" + "../git_page/generate_docker_usage_md.py) script.\n\n") + md_file.write(markdown_table) + + +if __name__ == "__main__": + main() diff --git a/scripts/git_page/utility.py b/scripts/git_page/utility.py index 82061afb1..d92b4fe12 100644 --- a/scripts/git_page/utility.py +++ b/scripts/git_page/utility.py @@ -1,5 +1,6 @@ import glob import logging +import subprocess from pathlib import Path, PurePosixPath, PurePath Logger = logging.getLogger(__name__) @@ -62,3 +63,36 @@ def get_all_files_with_extension(directory: Path, ext: str) -> list: Logger.debug(f'Getting all files with extension {ext} in {directory}...') return glob.glob(f'{directory}/**/*.{ext}', recursive=True) + + +def run_command(command: list, log_output=True, ) -> None: + """ + Run a shell command and wait for it to complete. + + :param command: A list representing the shell command to execute. + :param log_output: Whether to log the command's output (default is True). + :return: None + + This function runs the specified shell command and waits for it to complete. + It logs the command before execution and raises an exception if the command fails. 
+ """ + cmd_str = ' '.join(command) # Convert the command list to a string for logging + logging.debug(f'Running command: {cmd_str}...') + + try: + result = subprocess.run(command, check=True, shell=False, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + text=True, encoding='utf-8') + + if log_output: + if result.stdout: + logging.debug(f'Command output (stdout):\n{result.stdout}') + if result.stderr: + logging.debug(f'Command output (stderr):\n{result.stderr}') + except subprocess.CalledProcessError as e: + logging.error(f'Command failed with error: {e}') + raise + except Exception as e: + logging.error(f'An unexpected error occurred: {e}') + raise + From b9fed9c02caa61c83a8e12f1de762f648bccdd78 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 26 Oct 2023 14:26:25 -0400 Subject: [PATCH 14/18] edited docker_usage_sum.py so that it can be executed from any dir --- scripts/docker/docker_usage_sum.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index e44b4db3a..557ac0f47 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -37,6 +37,7 @@ def main(): parser.parse_args() current_dir = os.path.abspath(os.path.dirname(__file__)) + repo_dir = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir)) print("COLLECTING DOCKERS IN USE...") WDLS_DIR = os.path.abspath(os.path.join(current_dir, "../../wdl")) @@ -67,7 +68,7 @@ def main(): matched_lines.append((line_number, line.strip())) docker_info: list[str] = get_docker_info_from_string( - wdl_lines=matched_lines, wdl_path=wdl_name + wdl_lines=matched_lines, wdl_path=wdl_name, repo_dir=repo_dir ) sorted_info: list = sorted(docker_info, reverse=False) @@ -108,7 +109,7 @@ def get_wdl_files(dir_to_wdls: str) -> list: return wdl_files -def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: +def get_docker_info_from_string(wdl_lines: [tuple], 
wdl_path: str, repo_dir: str) -> list: """ Returns a list of docker info @param wdl_path: path to wdl file @@ -128,11 +129,14 @@ def get_docker_info_from_string(wdl_lines: [tuple], wdl_path: str) -> list: docker_name = docker_name_and_version.split(":")[0] # Get latest tag from list of docker details if it was already retrieved - latest_tag = get_tag_from_docker_details(docker_detail=docker_detail, - docker_name=docker_name) + latest_tag = get_tag_from_docker_details( + docker_detail=docker_detail, docker_name=docker_name + ) # Get latest tag from local docker if it was not retrieved from list of docker details latest_tag = get_latest_local_docker_tag( - docker_name) if latest_tag == "NA" else latest_tag + docker_name=docker_name, repo_dir=repo_dir + ) if latest_tag == "NA" else latest_tag + # If the latest tag is not found locally, try to get it from remote latest_tag = get_latest_remote_docker_tag( docker_name) if latest_tag == "NA" else latest_tag @@ -352,14 +356,14 @@ def get_latest_tag_from_quay(docker_path: str) -> str: pass -def get_latest_local_docker_tag(docker_name: str) -> str: +def get_latest_local_docker_tag(docker_name: str, repo_dir: str) -> str: """ Returns the latest tag of a docker from the local docker directory @param docker_name: name of the docker e.g. "gatk" @return: """ docker_name = os.path.basename(docker_name) - docker_dir = "../docker" + docker_dir = os.path.join(repo_dir, "docker") latest_tag = "NA" for docker_im_dir in os.listdir(docker_dir): From fd1db31ce11baf94faf7d3d3ba946e257a0da63a Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 26 Oct 2023 14:27:25 -0400 Subject: [PATCH 15/18] renamed docker_usage_md.py to generate_docker_usage_md.py. 
added generate_docker_usage_md.py to repo site by way of adding it to git_page.yml --- .github/workflows/git_page.yml | 2 ++ .../{docker_usage_md.py => generate_docker_usage_md.py} | 0 2 files changed, 2 insertions(+) rename scripts/git_page/{docker_usage_md.py => generate_docker_usage_md.py} (100%) diff --git a/.github/workflows/git_page.yml b/.github/workflows/git_page.yml index 89558586e..6737883a5 100644 --- a/.github/workflows/git_page.yml +++ b/.github/workflows/git_page.yml @@ -45,6 +45,8 @@ jobs: python3 ./scripts/git_page/add_dot_link_to_md.py --md_dir ./docs/workflows --dot_dir ./docs/dot + python ./scripts/git_page/generate_docker_usage_md.py --output_path ./docs/development_guide + mkdocs build - name: Deploy uses: JamesIves/github-pages-deploy-action@releases/v4 diff --git a/scripts/git_page/docker_usage_md.py b/scripts/git_page/generate_docker_usage_md.py similarity index 100% rename from scripts/git_page/docker_usage_md.py rename to scripts/git_page/generate_docker_usage_md.py From 5a2c250fb6a494cf466afff3abc22e9c43d7f67a Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 26 Oct 2023 14:38:13 -0400 Subject: [PATCH 16/18] edited markdown title, removed links in header because doesn't work if output file directed to a different location from current dir --- scripts/git_page/generate_docker_usage_md.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/git_page/generate_docker_usage_md.py b/scripts/git_page/generate_docker_usage_md.py index e455df5a0..1a78c4338 100644 --- a/scripts/git_page/generate_docker_usage_md.py +++ b/scripts/git_page/generate_docker_usage_md.py @@ -73,13 +73,13 @@ def tsv_to_markdown(tsv_file: str) -> str: def write_docker_usage_to_markdown(resolved_markdown_file, markdown_table): with open(resolved_markdown_file, "w") as md_file: - md_file.write("# Docker Usage\n\n") + md_file.write("# Docker Usage Summary\n\n") md_file.write("The following table lists the docker images used in the " "workflows 
in this repository.\n\n") - md_file.write("The table is generated by running the [docker_usage_sum.py](" - "../docker/docker_usage_sum.py) script.\n\n") - md_file.write("The script is run by the [generate_docker_usage_md.py](" - "../git_page/generate_docker_usage_md.py) script.\n\n") + md_file.write("The table is generated by running the docker_usage_sum.py" + " script.\n\n") + md_file.write("The script is run by the ../git_page/generate_docker_usage_md.py" + " script.\n\n") md_file.write(markdown_table) From aa50c394b26a4629ec1466a8da39c587c3be51cf Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 1 Nov 2023 16:34:06 -0400 Subject: [PATCH 17/18] refactored main function --- scripts/docker/docker_usage_sum.py | 108 ++++++++++++++++++----------- 1 file changed, 68 insertions(+), 40 deletions(-) diff --git a/scripts/docker/docker_usage_sum.py b/scripts/docker/docker_usage_sum.py index 557ac0f47..446c0f85d 100644 --- a/scripts/docker/docker_usage_sum.py +++ b/scripts/docker/docker_usage_sum.py @@ -47,49 +47,19 @@ def main(): os.rename(OUT_SUMMARY_TSV, OUT_SUMMARY_TSV + ".bak") wdl_files = get_wdl_files(dir_to_wdls=WDLS_DIR) - global_docker_info = [] - - total_files = len(wdl_files) # Used for Progression calculation - - for index, wdl_path in enumerate(wdl_files, start=1): - - wdl_name = wdl_path - - with open(wdl_path, "r") as file_content: - content = file_content.read() - pattern = re.compile(r'\s*docker:\s*"') - if pattern.search(content): # If wdl file contains "docker:" - matched_lines = [] - file_content.seek(0) - lines = file_content.readlines() - - for line_number, line in enumerate(lines, start=1): - if pattern.search(line): - matched_lines.append((line_number, line.strip())) - - docker_info: list[str] = get_docker_info_from_string( - wdl_lines=matched_lines, wdl_path=wdl_name, repo_dir=repo_dir - ) - - sorted_info: list = sorted(docker_info, reverse=False) - global_docker_info.append(sorted_info) - else: - pass + pattern = re.compile(r'^\s*docker:\s*"') + 
global_docker_info = process_wdl_files( + wdl_files=wdl_files, pattern=pattern, repo_dir=repo_dir + ) - # Progression - # Calculate the percentage completion - progress: float = (index + 1) / total_files * 100 + # Remove empty elements in list + non_empty_docker_info = [x for x in global_docker_info if x] - # Clear the previous line and print the progress - print(f"Progress: {progress:.2f}%\r", end="") - with open(OUT_SUMMARY_TSV, "w") as tsv_file: - # Add header - tsv_file.write("DOCKER_NAME\tLATEST_TAG\tUSED_TAG\tFILE_LINE\tWDL_PATH") - # Add content - for docker_info_line in sorted(global_docker_info): - delimiter = "\n" - tsv_file.write(delimiter.join(docker_info_line) + "\n") + write_docker_info_to_tsv( + output_summary_tsv=OUT_SUMMARY_TSV, + docker_info=non_empty_docker_info + ) print(f"DONE. PLEASE CHECKOUT TSV FILE: {OUT_SUMMARY_TSV}") @@ -381,5 +351,63 @@ def get_latest_local_docker_tag(docker_name: str, repo_dir: str) -> str: return latest_tag +def process_wdl_files(wdl_files: list[str], pattern, repo_dir: str): + """ + Returns a list of docker info + @param wdl_files: list of wdl files + @param pattern: pattern to search for + @param repo_dir: directory of the repo + @return: + """ + global_docker_info = [] + total_files = len(wdl_files) + + for index, wdl_path in enumerate(wdl_files, start=1): + wdl_name = wdl_path + matched_lines = [] + + with open(wdl_path, "r") as file_content: + lines = file_content.readlines() + for line_number, line in enumerate(lines, start=1): + if pattern.search(line): + matched_lines.append((line_number, line.strip())) + + docker_info: list[str] = get_docker_info_from_string( + wdl_lines=matched_lines, wdl_path=wdl_name, repo_dir=repo_dir + ) + + sorted_info: list = sorted(docker_info, reverse=False) + global_docker_info.append(sorted_info) + + # Visual progression to show percentage of files processed + progress: float = (index + 1) / total_files * 100 + + # Clear the previous line and print the progress + print(f"Progress: 
{progress:.2f}%\r", end="") + + return global_docker_info + + +def write_docker_info_to_tsv( + output_summary_tsv: str, docker_info: list[list] +): + """ + Writes docker info to tsv file + + @param output_summary_tsv: + @param docker_info: + @return: + """ + + with open(output_summary_tsv, "w") as tsv_file: + # Add header + tsv_file.write("DOCKER_NAME\tLATEST_TAG\tUSED_TAG\tFILE_LINE\tWDL_PATH\n") + # Add content + for docker_info_line in sorted(docker_info): + delimiter = "\n" + tsv_file.write(delimiter.join(docker_info_line) + "\n") + + + if __name__ == "__main__": main() From 85621db42fd06b639c6f0fbb4391165fe9c5c834 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 1 Nov 2023 16:34:22 -0400 Subject: [PATCH 18/18] fix docker line --- wdl/tasks/Utility/Utils.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/wdl/tasks/Utility/Utils.wdl b/wdl/tasks/Utility/Utils.wdl index 754eda331..62ee671e2 100644 --- a/wdl/tasks/Utility/Utils.wdl +++ b/wdl/tasks/Utility/Utils.wdl @@ -2258,7 +2258,9 @@ task StopWorkflow { command <<< echo -e "Workflow explicitly stopped because \n ~{reason}." && exit 1 >>> - runtime {docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest"} + runtime { + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } } task InferSampleName {